From 4a99d912093f27f3f99a91c9a2444ba1e0424925 Mon Sep 17 00:00:00 2001 From: iliana etaoin Date: Fri, 26 Sep 2025 12:50:44 -0700 Subject: [PATCH 01/19] add `time_pruned` to the tuf_repo table --- nexus/db-model/src/schema_versions.rs | 3 ++- nexus/db-model/src/tuf_repo.rs | 2 ++ nexus/db-schema/src/schema.rs | 1 + schema/crdb/dbinit.sql | 5 ++++- schema/crdb/tuf-pruned/up01.sql | 2 ++ 5 files changed, 11 insertions(+), 2 deletions(-) create mode 100644 schema/crdb/tuf-pruned/up01.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 7b5281a7a60..a20ab1d4600 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(193, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(194, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(194, "tuf-pruned"), KnownVersion::new(193, "nexus-lockstep-port"), KnownVersion::new(192, "blueprint-source"), KnownVersion::new(191, "debug-log-blueprint-planner"), diff --git a/nexus/db-model/src/tuf_repo.rs b/nexus/db-model/src/tuf_repo.rs index 3b4a7c5bae4..fd54f95ccca 100644 --- a/nexus/db-model/src/tuf_repo.rs +++ b/nexus/db-model/src/tuf_repo.rs @@ -87,6 +87,7 @@ impl TufRepoDescription { pub struct TufRepo { pub id: DbTypedUuid, pub time_created: DateTime, + pub time_pruned: Option>, // XXX: We're overloading ArtifactHash here to also mean the hash of the // repository zip itself. pub sha256: ArtifactHash, @@ -108,6 +109,7 @@ impl TufRepo { Self { id: TypedUuid::new_v4().into(), time_created: Utc::now(), + time_pruned: None, sha256, targets_role_version: targets_role_version as i64, valid_until, diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index 777c4cb2dbb..ad191dfe067 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -1417,6 +1417,7 @@ table! { valid_until -> Timestamptz, system_version -> Text, file_name -> Text, + time_pruned -> Nullable, } } diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 5126e39944b..e38868033d6 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2548,6 +2548,9 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo ( -- Filename provided by the user. file_name TEXT NOT NULL, + -- Set when the repository's artifacts can be deleted from replication. 
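+    -- A NULL value here means the repository has not been marked for pruning.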
+ time_pruned TIMESTAMPTZ, + CONSTRAINT unique_checksum UNIQUE (sha256), CONSTRAINT unique_system_version UNIQUE (system_version) ); @@ -6688,7 +6691,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '193.0.0', NULL) + (TRUE, NOW(), NOW(), '194.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/tuf-pruned/up01.sql b/schema/crdb/tuf-pruned/up01.sql new file mode 100644 index 00000000000..d6756973f3c --- /dev/null +++ b/schema/crdb/tuf-pruned/up01.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.public.tuf_repo + ADD COLUMN IF NOT EXISTS time_pruned TIMESTAMPTZ; From fb842cf67a63556ebcb0197f3a078f13fc86105f Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 26 Sep 2025 13:51:59 -0700 Subject: [PATCH 02/19] WIP: boilerplate for adding a new background task --- nexus-config/src/nexus_config.rs | 33 ++++++++++ nexus/examples/config-second.toml | 3 + nexus/examples/config.toml | 3 + nexus/src/app/background/init.rs | 15 +++++ nexus/src/app/background/tasks/mod.rs | 1 + .../app/background/tasks/tuf_repo_pruner.rs | 66 +++++++++++++++++++ nexus/tests/config.test.toml | 3 + smf/nexus/multi-sled/config-partial.toml | 3 + smf/nexus/single-sled/config-partial.toml | 3 + 9 files changed, 130 insertions(+) create mode 100644 nexus/src/app/background/tasks/tuf_repo_pruner.rs diff --git a/nexus-config/src/nexus_config.rs b/nexus-config/src/nexus_config.rs index a91d98dcaa8..a352cbe351d 100644 --- a/nexus-config/src/nexus_config.rs +++ b/nexus-config/src/nexus_config.rs @@ -430,6 +430,8 @@ pub struct BackgroundTaskConfig { RegionSnapshotReplacementFinishConfig, /// configuration for TUF artifact replication task pub tuf_artifact_replication: TufArtifactReplicationConfig, + /// configuration for TUF repo pruner task + pub tuf_repo_pruner: TufRepoPrunerConfig, /// configuration for read-only region replacement start task pub read_only_region_replacement_start: ReadOnlyRegionReplacementStartConfig, @@ -765,6 +767,26 @@ pub struct TufArtifactReplicationConfig { pub min_sled_replication: usize, } +#[serde_as] +#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] +pub struct TufRepoPrunerConfig { + /// period (in seconds) for periodic activations of this background task + #[serde_as(as = "DurationSeconds")] + pub period_secs: Duration, + + /// number of extra recent target releases to keep + /// + /// The system always keeps two: the current release and the previous one. + /// This number is in addition to that. + pub nkeep_extra_target_releases: u8, + + /// number of extra recently uploaded repos to keep + /// + /// The system always keeps one, assuming that the operator may be about to + /// update to it. This number is in addition to that. 
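+    /// (For example, setting this to 1 means the two most recently uploaded
+    /// repos are kept, beyond any kept as recent target releases.)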
+ pub nkeep_extra_newly_uploaded: u8, +} + #[serde_as] #[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)] pub struct ReadOnlyRegionReplacementStartConfig { @@ -1119,6 +1141,9 @@ mod test { region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 tuf_artifact_replication.min_sled_replication = 3 + tuf_repo_pruner.period_secs = 299 + tuf_repo_pruner.nkeep_extra_target_releases = 51 + tuf_repo_pruner.nkeep_extra_newly_uploaded = 52 read_only_region_replacement_start.period_secs = 30 alert_dispatcher.period_secs = 42 webhook_deliverator.period_secs = 43 @@ -1342,6 +1367,11 @@ mod test { period_secs: Duration::from_secs(300), min_sled_replication: 3, }, + tuf_repo_pruner: TufRepoPrunerConfig { + period_secs: Duration::from_secs(299), + nkeep_extra_target_releases: 51, + nkeep_extra_newly_uploaded: 52, + }, read_only_region_replacement_start: ReadOnlyRegionReplacementStartConfig { period_secs: Duration::from_secs(30), @@ -1449,6 +1479,9 @@ mod test { region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 tuf_artifact_replication.min_sled_replication = 3 + tuf_repo_pruner.period_secs = 299 + tuf_repo_pruner.nkeep_extra_target_releases = 51 + tuf_repo_pruner.nkeep_extra_newly_uploaded = 52 read_only_region_replacement_start.period_secs = 30 alert_dispatcher.period_secs = 42 webhook_deliverator.period_secs = 43 diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 3bf8b526ad7..64c9a55c08a 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -152,6 +152,9 @@ region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 tuf_artifact_replication.min_sled_replication = 1 +tuf_repo_pruner.period_secs = 300 +tuf_repo_pruner.nkeep_extra_target_releases = 1 +tuf_repo_pruner.nkeep_extra_newly_uploaded = 1 # In general, the webhook dispatcher will be activated when events are queued, # so we don't need to periodically activate it *that* frequently. alert_dispatcher.period_secs = 60 diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index 80fa495baad..d3888d3b6ad 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -136,6 +136,9 @@ region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 tuf_artifact_replication.min_sled_replication = 1 +tuf_repo_pruner.period_secs = 300 +tuf_repo_pruner.nkeep_extra_target_releases = 1 +tuf_repo_pruner.nkeep_extra_newly_uploaded = 1 # In general, the webhook dispatcher will be activated when events are queued, # so we don't need to periodically activate it *that* frequently. 
alert_dispatcher.period_secs = 60 diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index 14283341354..fe4712a18f4 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -126,6 +126,7 @@ use super::tasks::support_bundle_collector; use super::tasks::sync_service_zone_nat::ServiceZoneNatTracker; use super::tasks::sync_switch_configuration::SwitchPortSettingsManager; use super::tasks::tuf_artifact_replication; +use super::tasks::tuf_repo_pruner; use super::tasks::v2p_mappings::V2PManager; use super::tasks::vpc_routes; use super::tasks::webhook_deliverator; @@ -230,6 +231,7 @@ impl BackgroundTasksInitializer { task_region_snapshot_replacement_step: Activator::new(), task_region_snapshot_replacement_finish: Activator::new(), task_tuf_artifact_replication: Activator::new(), + task_tuf_repo_pruner: Activator::new(), task_read_only_region_replacement_start: Activator::new(), task_alert_dispatcher: Activator::new(), task_webhook_deliverator: Activator::new(), @@ -307,6 +309,7 @@ impl BackgroundTasksInitializer { task_region_snapshot_replacement_step, task_region_snapshot_replacement_finish, task_tuf_artifact_replication, + task_tuf_repo_pruner, task_read_only_region_replacement_start, task_alert_dispatcher, task_webhook_deliverator, @@ -917,6 +920,18 @@ impl BackgroundTasksInitializer { activator: task_tuf_artifact_replication, }); + driver.register(TaskDefinition { + name: "tuf_repo_pruner", + description: "determine which TUF repos' artifacts can be pruned", + period: config.tuf_repo_pruner.period_secs, + task_impl: Box::new(tuf_repo_pruner::TufRepoPruner::new( + datastore.clone(), + )), + opctx: opctx.child(BTreeMap::new()), + watchers: vec![], + activator: task_tuf_repo_pruner, + }); + driver.register(TaskDefinition { name: "read_only_region_replacement_start", description: diff --git a/nexus/src/app/background/tasks/mod.rs b/nexus/src/app/background/tasks/mod.rs index 993789a6296..ac4d4ccc875 100644 --- a/nexus/src/app/background/tasks/mod.rs +++ b/nexus/src/app/background/tasks/mod.rs @@ -42,6 +42,7 @@ pub mod support_bundle_collector; pub mod sync_service_zone_nat; pub mod sync_switch_configuration; pub mod tuf_artifact_replication; +pub mod tuf_repo_pruner; pub mod v2p_mappings; pub mod vpc_routes; pub mod webhook_deliverator; diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs new file mode 100644 index 00000000000..ae2109de405 --- /dev/null +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -0,0 +1,66 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! 
Background task for determining when to prune artifacts from TUF repos + +use super::reconfigurator_config::ReconfiguratorConfigLoaderState; +use crate::app::background::BackgroundTask; +use chrono::Utc; +use futures::future::BoxFuture; +use nexus_auth::authz; +use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::DataStore; +use nexus_reconfigurator_planning::planner::Planner; +use nexus_reconfigurator_planning::planner::PlannerRng; +use nexus_reconfigurator_preparation::PlanningInputFromDb; +use nexus_types::deployment::BlueprintSource; +use nexus_types::deployment::PlanningReport; +use nexus_types::deployment::{Blueprint, BlueprintTarget}; +use nexus_types::internal_api::background::BlueprintPlannerStatus; +use omicron_common::api::external::LookupType; +use omicron_uuid_kinds::CollectionUuid; +use omicron_uuid_kinds::GenericUuid as _; +use serde_json::json; +use slog_error_chain::InlineErrorChain; +use std::sync::Arc; +use tokio::sync::watch::{self, Receiver, Sender}; + +// XXX-dap +#[derive(Serialize, JsonSchema)] +struct TufRepoPrunerStatus; + +/// Background task that marks TUF repos for pruning +pub struct TufRepoPruner { + datastore: Arc, +} + +impl TufRepoPruner { + pub fn new(datastore: Arc) -> Self { + Self { datastore } + } + + fn tuf_repos_prune(&self, opctx: &OpContext) -> TufRepoPrunerStatus { + todo!(); + } +} + +impl BackgroundTask for TufRepoPruner { + fn activate<'a>( + &'a mut self, + opctx: &'a OpContext, + ) -> BoxFuture<'a, serde_json::Value> { + Box::pin(async move { + let status = self.tuf_repos_prune(opctx).await; + match serde_json::to_value(status) { + Ok(val) => val, + Err(err) => json!({ + "error": format!( + "could not serialize task status: {}", + InlineErrorChain::new(&err) + ), + }), + } + }) + } +} diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 56d174ce451..0576e85181b 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -169,6 +169,9 @@ region_snapshot_replacement_finish.period_secs = 999999 tuf_artifact_replication.period_secs = 3600 # Update integration tests are started with 4 sled agents. tuf_artifact_replication.min_sled_replication = 3 +tuf_repo_pruner.period_secs = 300 +tuf_repo_pruner.nkeep_extra_target_releases = 1 +tuf_repo_pruner.nkeep_extra_newly_uploaded = 1 # In general, the webhook dispatcher will be activated when events are queued, # so we don't need to periodically activate it *that* frequently. alert_dispatcher.period_secs = 60 diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 5548c926122..2f4c0bede44 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -76,6 +76,9 @@ region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 tuf_artifact_replication.min_sled_replication = 3 +tuf_repo_pruner.period_secs = 300 +tuf_repo_pruner.nkeep_extra_target_releases = 1 +tuf_repo_pruner.nkeep_extra_newly_uploaded = 1 read_only_region_replacement_start.period_secs = 30 # In general, the webhook dispatcher will be activated when events are queued, # so we don't need to periodically activate it *that* frequently. 
diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index 005a4f83dbb..d5b3f98a94e 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -76,6 +76,9 @@ region_snapshot_replacement_step.period_secs = 30 region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 tuf_artifact_replication.min_sled_replication = 1 +tuf_repo_pruner.period_secs = 300 +tuf_repo_pruner.nkeep_extra_target_releases = 1 +tuf_repo_pruner.nkeep_extra_newly_uploaded = 1 read_only_region_replacement_start.period_secs = 30 # In general, the webhook dispatcher will be activated when events are queued, # so we don't need to periodically activate it *that* frequently. From f660f7228883103bc1bdc4272ac7895947a3b0fe Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 26 Sep 2025 15:56:42 -0700 Subject: [PATCH 03/19] WIP: flesh out tuf_repo_pruner background task --- nexus/background-task-interface/src/init.rs | 1 + .../src/db/datastore/target_release.rs | 63 ++++++ nexus/db-queries/src/db/datastore/update.rs | 43 +++- nexus/src/app/background/init.rs | 1 + .../app/background/tasks/tuf_repo_pruner.rs | 193 +++++++++++++++--- nexus/types/src/internal_api/background.rs | 31 +++ 6 files changed, 300 insertions(+), 32 deletions(-) diff --git a/nexus/background-task-interface/src/init.rs b/nexus/background-task-interface/src/init.rs index 90816d365d6..f0bf7766d15 100644 --- a/nexus/background-task-interface/src/init.rs +++ b/nexus/background-task-interface/src/init.rs @@ -44,6 +44,7 @@ pub struct BackgroundTasks { pub task_region_snapshot_replacement_step: Activator, pub task_region_snapshot_replacement_finish: Activator, pub task_tuf_artifact_replication: Activator, + pub task_tuf_repo_pruner: Activator, pub task_read_only_region_replacement_start: Activator, pub task_alert_dispatcher: Activator, pub task_webhook_deliverator: Activator, diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index 0a02eab1f17..f6aa8289534 100644 --- a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -17,6 +17,8 @@ use nexus_db_errors::{ErrorHandler, public_error_from_diesel}; use nexus_db_schema::schema::target_release::dsl; use nexus_types::external_api::views; use omicron_common::api::external::{CreateResult, Error, LookupResult}; +use omicron_uuid_kinds::TufRepoUuid; +use std::collections::BTreeSet; impl DataStore { /// Fetch the current target release, i.e., the row with the largest @@ -127,6 +129,67 @@ impl DataStore { }; Ok(target_release.into_external(release_source)) } + + /// Lists the most recent N distinct target releases + pub async fn target_release_fetch_recent_distinct( + &self, + opctx: &OpContext, + count: u8, + ) -> Result, Error> { + opctx + .authorize(authz::Action::Read, &authz::TARGET_RELEASE_CONFIG) + .await?; + + // In almost all cases, `count` = 2 and we only need to look back two + // rows to find the most recent target releases. But we do allow this + // to be configurable, and there are cases where the same release can + // appear in sequential `target_release` rows (e.g., after a MUPdate). + // + // We want to avoid a loop so that this function can be used in contexts + // that don't want to take an arbitrary amount of time. + // + // The solution: `count` is a `u8`, so it can be at most 255. 
We'll + // multiply this by 4 to account for an unbelievable number of MUPdates. + // That's still small enough to do in one go. If we're wrong and can't + // find enough distinct releases, we'll return an error. (This seems + // extremely unlikely.) + let limit = 4 * u16::from(count); + let conn = self.pool_connection_authorized(opctx).await?; + let rows = dsl::target_release + .select(TargetRelease::as_select()) + .order_by(dsl::generation.desc()) + .limit(i64::from(limit)) + .load_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + let nfound = rows.len(); + let mut rv = BTreeSet::new(); + for target_release in rows { + if let Some(repo_id) = target_release.tuf_repo_id { + rv.insert(repo_id.into()); + if rv.len() >= usize::from(u8::from(count)) { + return Ok(rv); + } + } + } + + // We ran out of rows before finding enough distinct target releases. + // If we got `limit` rows, there may have been more that we didn't + // search. This is the case we called "extremely unlikely" above. + // Return an error in that case. + if nfound == usize::from(limit) { + return Err(Error::internal_error(&format!( + "looking for {} distinct releases in the most recent {} \ + target_release rows, but found only {} before giving up", + count, + limit, + rv.len(), + ))); + } + + // Otherwise, that's it: we found all the releases that were there. + Ok(rv) + } } #[cfg(test)] diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index dabe70f5100..71cd42da649 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -23,11 +23,11 @@ use nexus_db_model::{ }; use omicron_common::api::external::{ self, CreateResult, DataPageParams, DeleteResult, Generation, - ListResultVec, LookupResult, LookupType, ResourceType, TufRepoInsertStatus, + ListResultVec, LookupResult, LookupType, ResourceType, TufRepoInsertStatus, UpdateResult, }; -use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::TufRepoKind; use omicron_uuid_kinds::TypedUuid; +use omicron_uuid_kinds::{GenericUuid, TufRepoUuid}; use swrite::{SWrite, swrite}; use tufaceous_artifact::ArtifactVersion; use uuid::Uuid; @@ -195,6 +195,45 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Pages through the list of all not-yet-pruned TUF repos in the system + pub async fn tuf_list_repos_unpruned( + &self, + opctx: &OpContext, + pagparams: &DataPageParams<'_, Uuid>, + ) -> ListResultVec { + opctx.authorize(authz::Action::Read, &authz::FLEET).await?; + + use nexus_db_schema::schema::tuf_repo::dsl; + + paginated(dsl::tuf_repo, dsl::id, pagparams) + .filter(dsl::time_pruned.is_null()) + .select(TufRepo::as_select()) + .load_async(&*self.pool_connection_authorized(opctx).await?) 
+ .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + /// Marks the given TUF repo as eligible for pruning + pub async fn tuf_repo_mark_pruned( + &self, + opctx: &OpContext, + tuf_repo_id: TufRepoUuid, + ) -> UpdateResult<()> { + use nexus_db_schema::schema::tuf_repo::dsl; + + opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; + + let conn = self.pool_connection_authorized(opctx).await?; + diesel::update(dsl::tuf_repo) + .filter(dsl::id.eq(to_db_typed_uuid(tuf_repo_id))) + .filter(dsl::time_pruned.is_null()) + .set(dsl::time_pruned.eq(chrono::Utc::now())) + .execute_async(&*conn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .map(|_| ()) + } + /// Returns the current TUF repo generation number. pub async fn tuf_get_generation( &self, diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index fe4712a18f4..a11a65616d1 100644 --- a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -926,6 +926,7 @@ impl BackgroundTasksInitializer { period: config.tuf_repo_pruner.period_secs, task_impl: Box::new(tuf_repo_pruner::TufRepoPruner::new( datastore.clone(), + config.tuf_repo_pruner.clone() )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs index ae2109de405..7b4781a0d3e 100644 --- a/nexus/src/app/background/tasks/tuf_repo_pruner.rs +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -4,44 +4,49 @@ //! Background task for determining when to prune artifacts from TUF repos -use super::reconfigurator_config::ReconfiguratorConfigLoaderState; use crate::app::background::BackgroundTask; -use chrono::Utc; +use anyhow::Context; use futures::future::BoxFuture; -use nexus_auth::authz; +use iddqd::IdOrdMap; +use nexus_config::TufRepoPrunerConfig; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_reconfigurator_planning::planner::Planner; -use nexus_reconfigurator_planning::planner::PlannerRng; -use nexus_reconfigurator_preparation::PlanningInputFromDb; -use nexus_types::deployment::BlueprintSource; -use nexus_types::deployment::PlanningReport; -use nexus_types::deployment::{Blueprint, BlueprintTarget}; -use nexus_types::internal_api::background::BlueprintPlannerStatus; -use omicron_common::api::external::LookupType; -use omicron_uuid_kinds::CollectionUuid; +use nexus_db_queries::db::datastore::SQL_BATCH_SIZE; +use nexus_db_queries::db::pagination::Paginator; +use nexus_types::internal_api::background::TufRepoInfo; +use nexus_types::internal_api::background::TufRepoPrunerStatus; use omicron_uuid_kinds::GenericUuid as _; +use omicron_uuid_kinds::TufRepoUuid; use serde_json::json; use slog_error_chain::InlineErrorChain; +use std::collections::BTreeSet; +use std::num::NonZeroU32; use std::sync::Arc; -use tokio::sync::watch::{self, Receiver, Sender}; -// XXX-dap -#[derive(Serialize, JsonSchema)] -struct TufRepoPrunerStatus; +/// number of recent distinct target releases that we always keep, regardless of +/// configuration +/// +/// This is intended to include (1) the current target release, and (2) the +/// previous target release. We shouldn't need more because you can't generally +/// start another update until one has finished. 
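+///
+/// This is added to the configured `nkeep_extra_target_releases` to determine
+/// how many recent target releases the task keeps.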
+const NKEEP_RECENT_TARGET_RELEASES_ALWAYS: u8 = 2; + +/// number of distinct newly-uploaded releases that we always keep, regardless +/// of configuration +/// +/// This is intended to cover a release that the operator has just uploaded in +/// order to update to it. +const NKEEP_RECENT_UPLOADS_ALWAYS: u8 = 1; /// Background task that marks TUF repos for pruning pub struct TufRepoPruner { datastore: Arc, + config: TufRepoPrunerConfig, } impl TufRepoPruner { - pub fn new(datastore: Arc) -> Self { - Self { datastore } - } - - fn tuf_repos_prune(&self, opctx: &OpContext) -> TufRepoPrunerStatus { - todo!(); + pub fn new(datastore: Arc, config: TufRepoPrunerConfig) -> Self { + Self { datastore, config } } } @@ -51,16 +56,144 @@ impl BackgroundTask for TufRepoPruner { opctx: &'a OpContext, ) -> BoxFuture<'a, serde_json::Value> { Box::pin(async move { - let status = self.tuf_repos_prune(opctx).await; - match serde_json::to_value(status) { - Ok(val) => val, - Err(err) => json!({ - "error": format!( - "could not serialize task status: {}", - InlineErrorChain::new(&err) - ), + match tuf_repos_prune(opctx, &self.datastore, &self.config).await { + Ok(status) => match serde_json::to_value(status) { + Ok(val) => val, + Err(err) => json!({ + "error": format!( + "could not serialize task status: {}", + InlineErrorChain::new(&err) + ), + }), + }, + Err(error) => json!({ + "error": error.to_string(), }), } }) } } + +async fn tuf_repos_prune( + opctx: &OpContext, + datastore: &DataStore, + config: &TufRepoPrunerConfig, +) -> Result { + let nkeep_recent_releases = NKEEP_RECENT_TARGET_RELEASES_ALWAYS + + config.nkeep_extra_target_releases; + let nkeep_recent_uploads = + NKEEP_RECENT_UPLOADS_ALWAYS + config.nkeep_extra_newly_uploaded; + + let all_tuf_repos = fetch_all_tuf_repos(opctx, datastore, SQL_BATCH_SIZE) + .await + .context("fetching all TUF repos")?; + let recent_releases = datastore + .target_release_fetch_recent_distinct(opctx, nkeep_recent_releases) + .await + .context("listing recent target releases")?; + + // After this point, errors are not fatal. They just generate warnings. + let mut status = TufRepoPrunerStatus { + repos_keep: IdOrdMap::new(), + repos_prune: IdOrdMap::new(), + warnings: Vec::new(), + nkeep_recent_releases, + nkeep_recent_uploads, + }; + decide_prune(&all_tuf_repos, &recent_releases, &mut status); + for tuf_repo in &status.repos_prune { + if let Err(error) = + datastore.tuf_repo_mark_pruned(opctx, tuf_repo.id).await + { + status.warnings.push(format!( + "failed to prune {}: {}", + tuf_repo.id, + InlineErrorChain::new(&error), + )); + }; + } + + Ok(status) +} + +async fn fetch_all_tuf_repos( + opctx: &OpContext, + datastore: &DataStore, + batch_size: NonZeroU32, +) -> Result, anyhow::Error> { + let mut paginator = + Paginator::new(batch_size, dropshot::PaginationOrder::Ascending); + let mut rv = IdOrdMap::new(); + while let Some(p) = paginator.next() { + let batch = datastore + .tuf_list_repos_unpruned(opctx, &p.current_pagparams()) + .await + .context("fetching page of TUF repos")?; + paginator = p.found_batch(&batch, &|a| a.id.into_untyped_uuid()); + for repo in batch { + // It should be impossible to have duplicates here and there's + // nothing to do about it if we find them. 
+ rv.insert_overwrite(TufRepoInfo { + id: repo.id(), + system_version: repo.system_version.into(), + time_created: repo.time_created, + }); + } + } + + Ok(rv) +} + +/// Given the complete list of TUF repos and a set of recent releases that we +/// definitely want to keep, decide what to prune and what to keep. +fn decide_prune( + all_repos: &IdOrdMap, + releases_to_keep: &BTreeSet, + status: &mut TufRepoPrunerStatus, +) { + // Record that we're keeping all of the `releases_to_keep`. + for repo_id in releases_to_keep { + match all_repos.get(repo_id) { + Some(repo_info) => { + status.repos_keep.insert_overwrite(repo_info.clone()); + } + None => { + // This is unusual: there's a recent target release with no + // associated entry in the `tuf_repo` table. This is + // conceivable if the repo has just been uploaded and made the + // target release, though still quite surprising since that + // all would have had to happen between our two database queries + // above. Still, it's not a problem. We'll note it here, but + // otherwise move on. (We will not wind up pruning this.) + status.warnings.push(format!( + "wanting to keep recent target release repo {repo_id}, \ + but did not find it in the tuf_repo table", + )); + } + } + } + + // Keep as many other uploaded repos as makes sense. + let mut nleft = status.nkeep_recent_uploads; + let mut candidate_repos_by_upload_time = + all_repos.into_iter().collect::>(); + candidate_repos_by_upload_time.sort_by_key(|k| k.time_created); + let mut candidates = candidate_repos_by_upload_time.into_iter(); + while nleft > 0 { + let Some(next) = candidates.next() else { + break; + }; + + // Only decrement `nleft` if we weren't already keeping this one. + if let Ok(_) = status.repos_keep.insert_unique(next.clone()) { + nleft -= 1; + } + } + + // Everything else will be pruned. 
+ for repo in all_repos { + if !status.repos_keep.contains_key(&repo.id) { + status.repos_prune.insert_overwrite(repo.clone()); + } + } +} diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index f3ea81bef3e..a30fbddeb1b 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -6,6 +6,9 @@ use crate::deployment::PlanningReport; use crate::external_api::views; use chrono::DateTime; use chrono::Utc; +use iddqd::IdOrdItem; +use iddqd::IdOrdMap; +use iddqd::id_upcast; use omicron_common::api::external::Generation; use omicron_uuid_kinds::AlertReceiverUuid; use omicron_uuid_kinds::AlertUuid; @@ -13,7 +16,9 @@ use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::CollectionUuid; use omicron_uuid_kinds::SledUuid; use omicron_uuid_kinds::SupportBundleUuid; +use omicron_uuid_kinds::TufRepoUuid; use omicron_uuid_kinds::WebhookDeliveryUuid; +use semver::Version; use serde::Deserialize; use serde::Serialize; use std::collections::BTreeMap; @@ -379,6 +384,32 @@ pub enum TufArtifactReplicationOperation { Copy { hash: ArtifactHash, source_sled: SledUuid }, } +#[derive(Debug, Serialize)] +pub struct TufRepoPrunerStatus { + pub repos_keep: IdOrdMap, + pub repos_prune: IdOrdMap, + pub warnings: Vec, + pub nkeep_recent_releases: u8, + pub nkeep_recent_uploads: u8, +} + +#[derive(Clone, Debug, Serialize)] +pub struct TufRepoInfo { + pub id: TufRepoUuid, + pub system_version: Version, + pub time_created: DateTime, +} + +impl IdOrdItem for TufRepoInfo { + type Key<'a> = &'a TufRepoUuid; + + fn key(&self) -> Self::Key<'_> { + &self.id + } + + id_upcast!(); +} + /// The status of an `blueprint_rendezvous` background task activation. #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub struct BlueprintRendezvousStatus { From 693d70dadfc216eb7780138cda5186a01515faf6 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Fri, 26 Sep 2025 17:15:07 -0700 Subject: [PATCH 04/19] fix clippy, rustfmt --- nexus/db-queries/src/db/datastore/target_release.rs | 2 +- nexus/db-queries/src/db/datastore/update.rs | 3 ++- nexus/src/app/background/init.rs | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index f6aa8289534..d9e2aa16433 100644 --- a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -167,7 +167,7 @@ impl DataStore { for target_release in rows { if let Some(repo_id) = target_release.tuf_repo_id { rv.insert(repo_id.into()); - if rv.len() >= usize::from(u8::from(count)) { + if rv.len() >= usize::from(count) { return Ok(rv); } } diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index 71cd42da649..c17c9364376 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -23,7 +23,8 @@ use nexus_db_model::{ }; use omicron_common::api::external::{ self, CreateResult, DataPageParams, DeleteResult, Generation, - ListResultVec, LookupResult, LookupType, ResourceType, TufRepoInsertStatus, UpdateResult, + ListResultVec, LookupResult, LookupType, ResourceType, TufRepoInsertStatus, + UpdateResult, }; use omicron_uuid_kinds::TufRepoKind; use omicron_uuid_kinds::TypedUuid; diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs index a11a65616d1..10fe0e74944 100644 --- 
a/nexus/src/app/background/init.rs +++ b/nexus/src/app/background/init.rs @@ -926,7 +926,7 @@ impl BackgroundTasksInitializer { period: config.tuf_repo_pruner.period_secs, task_impl: Box::new(tuf_repo_pruner::TufRepoPruner::new( datastore.clone(), - config.tuf_repo_pruner.clone() + config.tuf_repo_pruner.clone(), )), opctx: opctx.child(BTreeMap::new()), watchers: vec![], From 6a3514071d09c87f5fbe1e03cb2a129ef834b6d6 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Sun, 28 Sep 2025 20:41:24 -0700 Subject: [PATCH 05/19] take most recently uploaded, not least --- nexus/src/app/background/tasks/tuf_repo_pruner.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs index 7b4781a0d3e..fb42a4f45c9 100644 --- a/nexus/src/app/background/tasks/tuf_repo_pruner.rs +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -173,11 +173,13 @@ fn decide_prune( } } - // Keep as many other uploaded repos as makes sense. + // Keep as many other uploaded repos as makes sense, preferring the most + // recently uploaded. let mut nleft = status.nkeep_recent_uploads; let mut candidate_repos_by_upload_time = all_repos.into_iter().collect::>(); candidate_repos_by_upload_time.sort_by_key(|k| k.time_created); + candidate_repos_by_upload_time.reverse(); let mut candidates = candidate_repos_by_upload_time.into_iter(); while nleft > 0 { let Some(next) = candidates.next() else { From 0c14313d7ac87632810a38d4cbb13110346d0f56 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 09:45:07 -0700 Subject: [PATCH 06/19] WIP test --- .../app/background/tasks/tuf_repo_pruner.rs | 127 ++++++++++++++++++ 1 file changed, 127 insertions(+) diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs index fb42a4f45c9..cbe3d3b24d6 100644 --- a/nexus/src/app/background/tasks/tuf_repo_pruner.rs +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -199,3 +199,130 @@ fn decide_prune( } } } + +#[cfg(test)] +mod test { + use super::decide_prune; + use iddqd::IdOrdMap; + use nexus_types::internal_api::background::TufRepoInfo; + use nexus_types::internal_api::background::TufRepoPrunerStatus; + use omicron_uuid_kinds::TufRepoUuid; + use std::collections::BTreeSet; + + fn make_test_repos() -> [TufRepoInfo; 6] { + [ + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T01:00:00Z".parse().unwrap(), + system_version: "1.0.0".parse().unwrap(), + }, + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T02:00:00Z".parse().unwrap(), + system_version: "2.0.0".parse().unwrap(), + }, + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T03:00:00Z".parse().unwrap(), + system_version: "3.0.0".parse().unwrap(), + }, + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T04:00:00Z".parse().unwrap(), + system_version: "4.0.0".parse().unwrap(), + }, + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T05:00:00Z".parse().unwrap(), + system_version: "4.1.0".parse().unwrap(), + }, + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T06:00:00Z".parse().unwrap(), + system_version: "4.2.0".parse().unwrap(), + }, + ] + } + + #[test] + fn test_decide_prune() { + let all_repos = make_test_repos(); + let [r1, r2, r3, r4, r5, r6] = &all_repos; + let all_repos: IdOrdMap<_> = all_repos.clone().into_iter().collect(); + + // Trivial case + let mut status = 
TufRepoPrunerStatus { + repos_keep: IdOrdMap::new(), + repos_prune: IdOrdMap::new(), + warnings: Vec::new(), + nkeep_recent_releases: 2, + nkeep_recent_uploads: 1, + }; + decide_prune(&IdOrdMap::new(), &BTreeSet::new(), &mut status); + assert!(status.warnings.is_empty()); + assert!(status.repos_keep.is_empty()); + assert!(status.repos_prune.is_empty()); + + // Simple case: have some target releases to keep, no uploads. + let releases_to_keep: BTreeSet<_> = + [r3.id, r4.id].into_iter().collect(); + let mut status = TufRepoPrunerStatus { + repos_keep: IdOrdMap::new(), + repos_prune: IdOrdMap::new(), + warnings: Vec::new(), + nkeep_recent_releases: u8::try_from(releases_to_keep.len()) + .unwrap(), + nkeep_recent_uploads: 0, + }; + decide_prune(&all_repos, &releases_to_keep, &mut status); + assert!(status.warnings.is_empty()); + assert_eq!(status.repos_keep.len(), 2); + assert!(status.repos_keep.contains_key(&r3.id)); + assert!(status.repos_keep.contains_key(&r4.id)); + assert_eq!( + status.repos_prune.len(), + all_repos.len() - releases_to_keep.len() + ); + assert!(status.repos_keep.contains_key(&r1.id)); + assert!(status.repos_keep.contains_key(&r2.id)); + assert!(status.repos_keep.contains_key(&r5.id)); + assert!(status.repos_keep.contains_key(&r6.id)); + + // Simple case: have no target releases to keep, but some uploads. + let releases_to_keep = BTreeSet::new(); + let mut status = TufRepoPrunerStatus { + repos_keep: IdOrdMap::new(), + repos_prune: IdOrdMap::new(), + warnings: Vec::new(), + nkeep_recent_releases: u8::try_from(releases_to_keep.len()) + .unwrap(), + nkeep_recent_uploads: 0, + }; + decide_prune(&all_repos, &releases_to_keep, &mut status); + assert!(status.warnings.is_empty()); + assert_eq!(status.repos_keep.len(), 2); + assert!(status.repos_keep.contains_key(&r3.id)); + assert!(status.repos_keep.contains_key(&r4.id)); + assert_eq!( + status.repos_prune.len(), + all_repos.len() - releases_to_keep.len() + ); + assert!(status.repos_keep.contains_key(&r1.id)); + assert!(status.repos_keep.contains_key(&r2.id)); + assert!(status.repos_keep.contains_key(&r5.id)); + assert!(status.repos_keep.contains_key(&r6.id)); + + // Case 1: a common case + // XXX-dap + // let releases_to_keep: BTreeSet<_> = + // [r3.id, r4.id].into_iter().collect(); + // let mut status = TufRepoPrunerStatus { + // repos_keep: IdOrdMap::new(), + // repos_prune: IdOrdMap::new(), + // warnings: Vec::new(), + // nkeep_recent_releases: u8::try_from(releases_to_keep.len()) + // .unwrap(), + // nkeep_recent_uploads: 1, + // }; + } +} From bf8720947ea8effefb0d63bc4b262d53bd9c13f7 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 11:09:42 -0700 Subject: [PATCH 07/19] marking repos pruned must be conditional on generation numbers not having changed --- .../src/db/datastore/target_release.rs | 70 ++++++++-- nexus/db-queries/src/db/datastore/update.rs | 129 ++++++++++++++++-- .../app/background/tasks/tuf_repo_pruner.rs | 25 +++- 3 files changed, 196 insertions(+), 28 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index d9e2aa16433..3e73c2fe462 100644 --- a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -16,7 +16,9 @@ use diesel::prelude::*; use nexus_db_errors::{ErrorHandler, public_error_from_diesel}; use nexus_db_schema::schema::target_release::dsl; use nexus_types::external_api::views; -use omicron_common::api::external::{CreateResult, Error, 
LookupResult}; +use omicron_common::api::external::{ + CreateResult, Error, InternalContext, LookupResult, +}; use omicron_uuid_kinds::TufRepoUuid; use std::collections::BTreeSet; @@ -135,11 +137,26 @@ impl DataStore { &self, opctx: &OpContext, count: u8, - ) -> Result, Error> { + ) -> Result { opctx .authorize(authz::Action::Read, &authz::TARGET_RELEASE_CONFIG) .await?; + // Fetch the current TUF repo generation. This is used by some + // callers so that they can perform transactions later conditional + // on none of this having changed. + // + // As long as we fetch this before fetching the target release rows + // below, and the caller makes any changes they want conditional on this + // generation (which they need to do anyway for correctness), then we + // don't need to use a transaction in this function. + let tuf_generation = self + .tuf_get_generation(opctx) + .await + .internal_context("fetching TUF repo generation")?; + + // Fetch recent rows from `target_release`. + // // In almost all cases, `count` = 2 and we only need to look back two // rows to find the most recent target releases. But we do allow this // to be configurable, and there are cases where the same release can @@ -161,14 +178,28 @@ impl DataStore { .limit(i64::from(limit)) .load_async(&*conn) .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))?; + .map_err(|err| { + public_error_from_diesel(err, ErrorHandler::Server) + })?; + if rows.is_empty() { + return Err(Error::internal_error( + "unexpectedly found no rows in `target_release` table", + )); + } + + let target_release_generation = rows[0].generation.0; let nfound = rows.len(); - let mut rv = BTreeSet::new(); + let mut releases = BTreeSet::new(); for target_release in rows { if let Some(repo_id) = target_release.tuf_repo_id { - rv.insert(repo_id.into()); - if rv.len() >= usize::from(count) { - return Ok(rv); + releases.insert(repo_id.into()); + if releases.len() >= usize::from(count) { + return Ok(RecentTargetReleases { + releases, + count, + tuf_generation, + target_release_generation, + }); } } } @@ -183,15 +214,36 @@ impl DataStore { target_release rows, but found only {} before giving up", count, limit, - rv.len(), + releases.len(), ))); } // Otherwise, that's it: we found all the releases that were there. 
- Ok(rv) + Ok(RecentTargetReleases { + releases, + count, + tuf_generation, + target_release_generation, + }) } } +/// Represents information fetched about recent target releases +pub struct RecentTargetReleases { + /// distinct target releases found + pub releases: BTreeSet, + /// how many releases we tried to find + pub(crate) count: u8, + /// tuf_generation value when we fetched these releases + /// (used to notice if new repos were added or removed while pruning) + pub(crate) tuf_generation: omicron_common::api::external::Generation, + /// latest target_release generation when we fetched these releases + /// (used to notice if a new target release has been set that could + /// invalidate this information) + pub(crate) target_release_generation: + omicron_common::api::external::Generation, +} + #[cfg(test)] mod test { use super::*; diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index c17c9364376..c52e2dc6b32 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -9,23 +9,25 @@ use std::collections::HashMap; use super::DataStore; use crate::authz; use crate::context::OpContext; +use crate::db::datastore::target_release::RecentTargetReleases; use crate::db::model::SemverVersion; use crate::db::pagination::paginated; -use async_bb8_diesel::AsyncRunQueryDsl; +use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; use diesel::prelude::*; use diesel::result::Error as DieselError; -use nexus_db_errors::OptionalError; use nexus_db_errors::{ErrorHandler, public_error_from_diesel}; +use nexus_db_errors::{OptionalError, TransactionError}; use nexus_db_lookup::DbConnection; use nexus_db_model::{ - ArtifactHash, TufArtifact, TufRepo, TufRepoDescription, TufTrustRoot, - to_db_typed_uuid, + ArtifactHash, TargetRelease, TufArtifact, TufRepo, TufRepoDescription, + TufTrustRoot, to_db_typed_uuid, }; use omicron_common::api::external::{ self, CreateResult, DataPageParams, DeleteResult, Generation, ListResultVec, LookupResult, LookupType, ResourceType, TufRepoInsertStatus, UpdateResult, }; +use omicron_common::api::external::{Error, InternalContext}; use omicron_uuid_kinds::TufRepoKind; use omicron_uuid_kinds::TypedUuid; use omicron_uuid_kinds::{GenericUuid, TufRepoUuid}; @@ -218,21 +220,120 @@ impl DataStore { pub async fn tuf_repo_mark_pruned( &self, opctx: &OpContext, + recent_releases: &RecentTargetReleases, tuf_repo_id: TufRepoUuid, ) -> UpdateResult<()> { - use nexus_db_schema::schema::tuf_repo::dsl; + // Double-check that the caller's done their diligence. + // + // These are not the primary way that we check these conditions. + // They're a belt-and-suspenders check, since we have this information + // available. 
+ if recent_releases.count < 2 { + return Err(Error::internal_error( + "must have fetched at least two recent releases to properly \ + validate that an important release is not being pruned", + )); + } + if recent_releases.releases.contains(&tuf_repo_id) { + return Err(Error::internal_error( + "attempting to prune a current or recent target release", + )); + } opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; - let conn = self.pool_connection_authorized(opctx).await?; - diesel::update(dsl::tuf_repo) - .filter(dsl::id.eq(to_db_typed_uuid(tuf_repo_id))) - .filter(dsl::time_pruned.is_null()) - .set(dsl::time_pruned.eq(chrono::Utc::now())) - .execute_async(&*conn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) - .map(|_| ()) + conn.transaction_async(|txn| async move { + // If the target release generation has changed, bail out. This + // means someone has changed the target release, which means they + // may have set it to the repo we're trying to prune. (This check + // could be more fine-grained, but this is adequate for now) + let target_release_generation_now = { + use nexus_db_schema::schema::target_release::dsl; + dsl::target_release + .select(TargetRelease::as_select()) + .order_by(dsl::generation.desc()) + .limit(1) + .first_async(&txn) + .await + .map_err(|err| { + public_error_from_diesel(err, ErrorHandler::Server) + }) + .internal_context( + "fetching latest target_release generation", + )? + .generation + .0 + }; + if target_release_generation_now + != recent_releases.target_release_generation + { + return Err(TransactionError::CustomError(Error::conflict( + format!( + "bailing out to avoid risk of marking current target \ + release pruned: target release has changed since \ + check (currently {}, was {})", + target_release_generation_now, + recent_releases.target_release_generation + ), + ))); + } + + // If the TUF repo generation has changed, bail out. Someone else + // is adding or pruning repos. Force the caller to re-evaluate. + // This is probably more conservative than necessary, but ensures + // that two Nexus instances don't concurrently decide to prune a lot + // more than either of them would on their own because they chose + // different repos to keep. + let tuf_generation_now = + get_generation(&txn).await.map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + })?; + if tuf_generation_now != recent_releases.tuf_generation { + return Err(TransactionError::CustomError(Error::conflict( + format!( + "bailing out to avoid risk of pruning too much: \ + tuf repo generation has changed since check \ + (currently {}, was {})", + tuf_generation_now, recent_releases.tuf_generation, + ), + ))); + } + + // Try to mark the repo pruned. + use nexus_db_schema::schema::tuf_repo::dsl; + let count = diesel::update(dsl::tuf_repo) + .filter(dsl::id.eq(to_db_typed_uuid(tuf_repo_id))) + .filter(dsl::time_pruned.is_null()) + .set(dsl::time_pruned.eq(chrono::Utc::now())) + .execute_async(&txn) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + .internal_context("marking TUF repo pruned")?; + + // If we made any changes, bump the TUF repo generation. This is + // necessary because that generation covers the set of TUF repos + // whose artifacts should be replicated. 
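+            // (If no rows were updated, the repo was already marked pruned or
+            // does not exist, so there is nothing to bump.)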
+ if count > 0 { + put_generation( + &txn, + tuf_generation_now.into(), + tuf_generation_now.next().into(), + ) + .await + .map_err(|error| { + public_error_from_diesel(error, ErrorHandler::Server) + })?; + } + + Ok(()) + }) + .await + .map_err(|error: TransactionError| match error { + TransactionError::CustomError(error) => error, + TransactionError::Database(error) => { + public_error_from_diesel(error, ErrorHandler::Server) + } + }) } /// Returns the current TUF repo generation number. diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs index cbe3d3b24d6..e682de58443 100644 --- a/nexus/src/app/background/tasks/tuf_repo_pruner.rs +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -100,19 +100,34 @@ async fn tuf_repos_prune( nkeep_recent_releases, nkeep_recent_uploads, }; - decide_prune(&all_tuf_repos, &recent_releases, &mut status); - for tuf_repo in &status.repos_prune { - if let Err(error) = - datastore.tuf_repo_mark_pruned(opctx, tuf_repo.id).await + decide_prune(&all_tuf_repos, &recent_releases.releases, &mut status); + + // We're only going to prune one release at a time. + // + // We could do more, but that would require making `tuf_repo_mark_pruned()` + // accept a set, and then we'd want to bound that set, and in reality we + // only ever expect to get one at a time here. + let mut to_prune = status.repos_prune.iter(); + if let Some(repo_to_prune) = to_prune.next() { + let prune_id = repo_to_prune.id; + if let Err(error) = datastore + .tuf_repo_mark_pruned(opctx, &recent_releases, prune_id) + .await { status.warnings.push(format!( "failed to prune {}: {}", - tuf_repo.id, + prune_id, InlineErrorChain::new(&error), )); }; } + for repo in to_prune { + status + .warnings + .push(format!("skipping prune {}: only pruning one", repo.id)); + } + Ok(status) } From 9452cb96c2050a462efe9998cb06945432a3e617 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 11:14:30 -0700 Subject: [PATCH 08/19] nit / cleanup --- nexus/src/app/background/tasks/tuf_repo_pruner.rs | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs index e682de58443..36167561814 100644 --- a/nexus/src/app/background/tasks/tuf_repo_pruner.rs +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -84,13 +84,21 @@ async fn tuf_repos_prune( let nkeep_recent_uploads = NKEEP_RECENT_UPLOADS_ALWAYS + config.nkeep_extra_newly_uploaded; - let all_tuf_repos = fetch_all_tuf_repos(opctx, datastore, SQL_BATCH_SIZE) - .await - .context("fetching all TUF repos")?; + // It's important that we fetch recent releases first because the returned + // structure contains a generation number that will be checked later in a + // transaction to avoid making any changes if the underlying state has + // changed. + // + // If we instead fetched this after listing TUF repos, there'd be a + // possibility that things changed between these steps and we pruned + // something anyway. let recent_releases = datastore .target_release_fetch_recent_distinct(opctx, nkeep_recent_releases) .await .context("listing recent target releases")?; + let all_tuf_repos = fetch_all_tuf_repos(opctx, datastore, SQL_BATCH_SIZE) + .await + .context("fetching all TUF repos")?; // After this point, errors are not fatal. They just generate warnings. 
let mut status = TufRepoPrunerStatus { From cf1e2599e789184580925b192101f362706b4ec7 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 12:02:13 -0700 Subject: [PATCH 09/19] clean up these APIs --- .../src/db/datastore/target_release.rs | 28 +- nexus/db-queries/src/db/datastore/update.rs | 54 ++- .../app/background/tasks/tuf_repo_pruner.rs | 423 +++++++++--------- nexus/types/src/internal_api/background.rs | 18 +- 4 files changed, 279 insertions(+), 244 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index 3e73c2fe462..0d14f553687 100644 --- a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -16,9 +16,7 @@ use diesel::prelude::*; use nexus_db_errors::{ErrorHandler, public_error_from_diesel}; use nexus_db_schema::schema::target_release::dsl; use nexus_types::external_api::views; -use omicron_common::api::external::{ - CreateResult, Error, InternalContext, LookupResult, -}; +use omicron_common::api::external::{CreateResult, Error, LookupResult}; use omicron_uuid_kinds::TufRepoUuid; use std::collections::BTreeSet; @@ -142,19 +140,6 @@ impl DataStore { .authorize(authz::Action::Read, &authz::TARGET_RELEASE_CONFIG) .await?; - // Fetch the current TUF repo generation. This is used by some - // callers so that they can perform transactions later conditional - // on none of this having changed. - // - // As long as we fetch this before fetching the target release rows - // below, and the caller makes any changes they want conditional on this - // generation (which they need to do anyway for correctness), then we - // don't need to use a transaction in this function. - let tuf_generation = self - .tuf_get_generation(opctx) - .await - .internal_context("fetching TUF repo generation")?; - // Fetch recent rows from `target_release`. // // In almost all cases, `count` = 2 and we only need to look back two @@ -197,7 +182,6 @@ impl DataStore { return Ok(RecentTargetReleases { releases, count, - tuf_generation, target_release_generation, }); } @@ -219,12 +203,7 @@ impl DataStore { } // Otherwise, that's it: we found all the releases that were there. 
- Ok(RecentTargetReleases { - releases, - count, - tuf_generation, - target_release_generation, - }) + Ok(RecentTargetReleases { releases, count, target_release_generation }) } } @@ -234,9 +213,6 @@ pub struct RecentTargetReleases { pub releases: BTreeSet, /// how many releases we tried to find pub(crate) count: u8, - /// tuf_generation value when we fetched these releases - /// (used to notice if new repos were added or removed while pruning) - pub(crate) tuf_generation: omicron_common::api::external::Generation, /// latest target_release generation when we fetched these releases /// (used to notice if a new target release has been set that could /// invalidate this information) diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index c52e2dc6b32..2a928bc1d1c 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -9,8 +9,10 @@ use std::collections::HashMap; use super::DataStore; use crate::authz; use crate::context::OpContext; +use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::datastore::target_release::RecentTargetReleases; use crate::db::model::SemverVersion; +use crate::db::pagination::Paginator; use crate::db::pagination::paginated; use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; use diesel::prelude::*; @@ -216,10 +218,58 @@ impl DataStore { .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) } + /// Lists all unpruned TUF repos, making as many queries as needed to get + /// them all + /// + /// This should not be used from contexts that shouldn't make lots of + /// database queries (e.g., API endpoints). + /// + /// Since this involves pagination, this is not a consistent snapshot. + /// Consider using `tuf_get_generation()` before calling this function and + /// then making any subsequent queries conditional on the generation not + /// having changed. + pub async fn tuf_list_repos_unpruned_batched( + &self, + opctx: &OpContext, + ) -> ListResultVec { + opctx.check_complex_operations_allowed()?; + let mut paginator = Paginator::new( + SQL_BATCH_SIZE, + dropshot::PaginationOrder::Ascending, + ); + let mut rv = Vec::new(); + while let Some(p) = paginator.next() { + let batch = self + .tuf_list_repos_unpruned(opctx, &p.current_pagparams()) + .await + .internal_context("fetching page of TUF repos")?; + paginator = p.found_batch(&batch, &|a| a.id.into_untyped_uuid()); + rv.extend(batch); + } + Ok(rv) + } + /// Marks the given TUF repo as eligible for pruning + /// + /// Callers are expected to verify that it's safe to prune this TUF repo. + /// + /// `recent_releases` comes from `target_release_fetch_recent_distinct()`. + /// As part of verifying that it's safe to prune this TUF repo, callers are + /// expected to check that it's not the current or immediately previous + /// target release. + /// + /// This transaction will be conditional on: + /// + /// - the current TUF generation matching `initial_tuf_generation` + /// (i.e., we will not prune a release if some other query has added or + /// pruned a release since the caller fetched this generation) + /// - the current target release generation matching what it was when + /// `recent_releases` was fetched (because this would invalidate the check + /// mentioned above that we're not pruning the target release). 
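+    ///
+    /// If either check fails, this returns a `Conflict` error and makes no
+    /// changes.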
pub async fn tuf_repo_mark_pruned( &self, opctx: &OpContext, + initial_tuf_generation: Generation, recent_releases: &RecentTargetReleases, tuf_repo_id: TufRepoUuid, ) -> UpdateResult<()> { @@ -288,13 +338,13 @@ impl DataStore { get_generation(&txn).await.map_err(|e| { public_error_from_diesel(e, ErrorHandler::Server) })?; - if tuf_generation_now != recent_releases.tuf_generation { + if tuf_generation_now != initial_tuf_generation { return Err(TransactionError::CustomError(Error::conflict( format!( "bailing out to avoid risk of pruning too much: \ tuf repo generation has changed since check \ (currently {}, was {})", - tuf_generation_now, recent_releases.tuf_generation, + tuf_generation_now, initial_tuf_generation, ), ))); } diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs index 36167561814..91ff3a83de5 100644 --- a/nexus/src/app/background/tasks/tuf_repo_pruner.rs +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -11,16 +11,12 @@ use iddqd::IdOrdMap; use nexus_config::TufRepoPrunerConfig; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; -use nexus_db_queries::db::datastore::SQL_BATCH_SIZE; -use nexus_db_queries::db::pagination::Paginator; use nexus_types::internal_api::background::TufRepoInfo; use nexus_types::internal_api::background::TufRepoPrunerStatus; -use omicron_uuid_kinds::GenericUuid as _; use omicron_uuid_kinds::TufRepoUuid; use serde_json::json; use slog_error_chain::InlineErrorChain; use std::collections::BTreeSet; -use std::num::NonZeroU32; use std::sync::Arc; /// number of recent distinct target releases that we always keep, regardless of @@ -79,106 +75,95 @@ async fn tuf_repos_prune( datastore: &DataStore, config: &TufRepoPrunerConfig, ) -> Result { + // Compute configuration. let nkeep_recent_releases = NKEEP_RECENT_TARGET_RELEASES_ALWAYS + config.nkeep_extra_target_releases; let nkeep_recent_uploads = NKEEP_RECENT_UPLOADS_ALWAYS + config.nkeep_extra_newly_uploaded; - // It's important that we fetch recent releases first because the returned - // structure contains a generation number that will be checked later in a - // transaction to avoid making any changes if the underlying state has - // changed. - // - // If we instead fetched this after listing TUF repos, there'd be a - // possibility that things changed between these steps and we pruned - // something anyway. + // Fetch the state we need to make a decision. + let tuf_generation = datastore + .tuf_get_generation(opctx) + .await + .context("fetching current TUF generation")?; + let all_tuf_repos: IdOrdMap<_> = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .context("listing unpruned TUF repos")? + .into_iter() + .map(|repo| TufRepoInfo { + id: repo.id(), + system_version: repo.system_version.into(), + time_created: repo.time_created, + }) + .collect(); let recent_releases = datastore .target_release_fetch_recent_distinct(opctx, nkeep_recent_releases) .await .context("listing recent target releases")?; - let all_tuf_repos = fetch_all_tuf_repos(opctx, datastore, SQL_BATCH_SIZE) - .await - .context("fetching all TUF repos")?; - // After this point, errors are not fatal. They just generate warnings. - let mut status = TufRepoPrunerStatus { - repos_keep: IdOrdMap::new(), - repos_prune: IdOrdMap::new(), - warnings: Vec::new(), + // Finally, make the decision about what to prune. 
+    let mut status = decide_prune(TufRepoPrune {
         nkeep_recent_releases,
         nkeep_recent_uploads,
-    };
-    decide_prune(&all_tuf_repos, &recent_releases.releases, &mut status);
+        all_tuf_repos: &all_tuf_repos,
+        recent_releases: &recent_releases.releases,
+    });
 
-    // We're only going to prune one release at a time.
-    //
-    // We could do more, but that would require making `tuf_repo_mark_pruned()`
-    // accept a set, and then we'd want to bound that set, and in reality we
-    // only ever expect to get one at a time here.
-    let mut to_prune = status.repos_prune.iter();
-    if let Some(repo_to_prune) = to_prune.next() {
-        let prune_id = repo_to_prune.id;
+    // If we decided to prune something, do it.
+    if let Some(to_prune) = &status.repo_prune {
+        let prune_id = to_prune.id;
         if let Err(error) = datastore
-            .tuf_repo_mark_pruned(opctx, &recent_releases, prune_id)
+            .tuf_repo_mark_pruned(
+                opctx,
+                tuf_generation,
+                &recent_releases,
+                prune_id,
+            )
             .await
         {
             status.warnings.push(format!(
-                "failed to prune {}: {}",
+                "failed to prune {} (release {}): {}",
                 prune_id,
+                to_prune.system_version,
                 InlineErrorChain::new(&error),
             ));
-        };
-    }
-
-    for repo in to_prune {
-        status
-            .warnings
-            .push(format!("skipping prune {}: only pruning one", repo.id));
+        }
     }
 
     Ok(status)
 }
 
-async fn fetch_all_tuf_repos(
-    opctx: &OpContext,
-    datastore: &DataStore,
-    batch_size: NonZeroU32,
-) -> Result<IdOrdMap<TufRepoInfo>, anyhow::Error> {
-    let mut paginator =
-        Paginator::new(batch_size, dropshot::PaginationOrder::Ascending);
-    let mut rv = IdOrdMap::new();
-    while let Some(p) = paginator.next() {
-        let batch = datastore
-            .tuf_list_repos_unpruned(opctx, &p.current_pagparams())
-            .await
-            .context("fetching page of TUF repos")?;
-        paginator = p.found_batch(&batch, &|a| a.id.into_untyped_uuid());
-        for repo in batch {
-            // It should be impossible to have duplicates here and there's
-            // nothing to do about it if we find them.
-            rv.insert_overwrite(TufRepoInfo {
-                id: repo.id(),
-                system_version: repo.system_version.into(),
-                time_created: repo.time_created,
-            });
-        }
-    }
-
-    Ok(rv)
+/// Arguments to `decide_prune()`
+struct TufRepoPrune<'a> {
+    /// how many of the most recent target releases to keep
+    nkeep_recent_releases: u8,
+    /// how many recent uploads (that aren't also recent releases) to keep
+    nkeep_recent_uploads: u8,
+    /// description of all unpruned TUF repos in the system
+    all_tuf_repos: &'a IdOrdMap<TufRepoInfo>,
+    /// set of recent target releases
+    recent_releases: &'a BTreeSet<TufRepoUuid>,
 }
 
 /// Given the complete list of TUF repos and a set of recent releases that we
 /// definitely want to keep, decide what to prune and what to keep.
-fn decide_prune(
-    all_repos: &IdOrdMap<TufRepoInfo>,
-    releases_to_keep: &BTreeSet<TufRepoUuid>,
-    status: &mut TufRepoPrunerStatus,
-) {
+fn decide_prune(args: TufRepoPrune) -> TufRepoPrunerStatus {
+    let TufRepoPrune {
+        nkeep_recent_releases,
+        nkeep_recent_uploads,
+        all_tuf_repos,
+        recent_releases,
+    } = args;
+
+    let mut warnings = Vec::new();
+
     // Record that we're keeping all of the `releases_to_keep`.
-    for repo_id in releases_to_keep {
-        match all_repos.get(repo_id) {
+    let mut repos_keep_target_release = IdOrdMap::new();
+    for repo_id in recent_releases {
+        match all_tuf_repos.get(repo_id) {
             Some(repo_info) => {
-                status.repos_keep.insert_overwrite(repo_info.clone());
+                repos_keep_target_release.insert_overwrite(repo_info.clone());
             }
             None => {
                 // This is unusual: there's a recent target release with no
@@ -188,7 +173,7 @@ fn decide_prune(
                 // all would have had to happen between our two database queries
                 // above. Still, it's not a problem. 
We'll note it here, but // otherwise move on. (We will not wind up pruning this.) - status.warnings.push(format!( + warnings.push(format!( "wanting to keep recent target release repo {repo_id}, \ but did not find it in the tuf_repo table", )); @@ -196,156 +181,168 @@ fn decide_prune( } } - // Keep as many other uploaded repos as makes sense, preferring the most - // recently uploaded. - let mut nleft = status.nkeep_recent_uploads; - let mut candidate_repos_by_upload_time = - all_repos.into_iter().collect::>(); - candidate_repos_by_upload_time.sort_by_key(|k| k.time_created); - candidate_repos_by_upload_time.reverse(); - let mut candidates = candidate_repos_by_upload_time.into_iter(); - while nleft > 0 { - let Some(next) = candidates.next() else { - break; - }; + // Partition all TUF repos *other* than those into: + // - ones that we're keeping because they were most recently uploaded + // - the one we want to prune + // - other ones we would have pruned, but won't because we only prune one + let mut non_target_release_repos = all_tuf_repos + .iter() + .filter(|r| !repos_keep_target_release.contains_key(&r.id)) + .collect::>(); + non_target_release_repos.sort_by_key(|k| k.time_created); + non_target_release_repos.reverse(); + let mut repos_keep_recent_uploads = IdOrdMap::new(); + let mut repo_prune = None; + let mut other_repos_eligible_to_prune = IdOrdMap::new(); + for repo in non_target_release_repos { + if repos_keep_recent_uploads.len() < usize::from(nkeep_recent_uploads) { + repos_keep_recent_uploads.insert_overwrite(repo.clone()); + continue; + } - // Only decrement `nleft` if we weren't already keeping this one. - if let Ok(_) = status.repos_keep.insert_unique(next.clone()) { - nleft -= 1; + if repo_prune.is_none() { + repo_prune = Some(repo.clone()); + continue; } + + other_repos_eligible_to_prune.insert_overwrite(repo.clone()); } - // Everything else will be pruned. 
- for repo in all_repos { - if !status.repos_keep.contains_key(&repo.id) { - status.repos_prune.insert_overwrite(repo.clone()); - } + TufRepoPrunerStatus { + nkeep_recent_releases, + nkeep_recent_uploads, + repos_keep_target_release, + repos_keep_recent_uploads, + repo_prune, + other_repos_eligible_to_prune, + warnings, } } #[cfg(test)] mod test { - use super::decide_prune; - use iddqd::IdOrdMap; - use nexus_types::internal_api::background::TufRepoInfo; - use nexus_types::internal_api::background::TufRepoPrunerStatus; - use omicron_uuid_kinds::TufRepoUuid; - use std::collections::BTreeSet; - - fn make_test_repos() -> [TufRepoInfo; 6] { - [ - TufRepoInfo { - id: TufRepoUuid::new_v4(), - time_created: "2025-09-26T01:00:00Z".parse().unwrap(), - system_version: "1.0.0".parse().unwrap(), - }, - TufRepoInfo { - id: TufRepoUuid::new_v4(), - time_created: "2025-09-26T02:00:00Z".parse().unwrap(), - system_version: "2.0.0".parse().unwrap(), - }, - TufRepoInfo { - id: TufRepoUuid::new_v4(), - time_created: "2025-09-26T03:00:00Z".parse().unwrap(), - system_version: "3.0.0".parse().unwrap(), - }, - TufRepoInfo { - id: TufRepoUuid::new_v4(), - time_created: "2025-09-26T04:00:00Z".parse().unwrap(), - system_version: "4.0.0".parse().unwrap(), - }, - TufRepoInfo { - id: TufRepoUuid::new_v4(), - time_created: "2025-09-26T05:00:00Z".parse().unwrap(), - system_version: "4.1.0".parse().unwrap(), - }, - TufRepoInfo { - id: TufRepoUuid::new_v4(), - time_created: "2025-09-26T06:00:00Z".parse().unwrap(), - system_version: "4.2.0".parse().unwrap(), - }, - ] - } - - #[test] - fn test_decide_prune() { - let all_repos = make_test_repos(); - let [r1, r2, r3, r4, r5, r6] = &all_repos; - let all_repos: IdOrdMap<_> = all_repos.clone().into_iter().collect(); + // XXX-dap clean up all imports + // use super::decide_prune; + // use iddqd::IdOrdMap; + // use nexus_types::internal_api::background::TufRepoInfo; + // use nexus_types::internal_api::background::TufRepoPrunerStatus; + // use omicron_uuid_kinds::TufRepoUuid; + // use std::collections::BTreeSet; + // + // fn make_test_repos() -> [TufRepoInfo; 6] { + // [ + // TufRepoInfo { + // id: TufRepoUuid::new_v4(), + // time_created: "2025-09-26T01:00:00Z".parse().unwrap(), + // system_version: "1.0.0".parse().unwrap(), + // }, + // TufRepoInfo { + // id: TufRepoUuid::new_v4(), + // time_created: "2025-09-26T02:00:00Z".parse().unwrap(), + // system_version: "2.0.0".parse().unwrap(), + // }, + // TufRepoInfo { + // id: TufRepoUuid::new_v4(), + // time_created: "2025-09-26T03:00:00Z".parse().unwrap(), + // system_version: "3.0.0".parse().unwrap(), + // }, + // TufRepoInfo { + // id: TufRepoUuid::new_v4(), + // time_created: "2025-09-26T04:00:00Z".parse().unwrap(), + // system_version: "4.0.0".parse().unwrap(), + // }, + // TufRepoInfo { + // id: TufRepoUuid::new_v4(), + // time_created: "2025-09-26T05:00:00Z".parse().unwrap(), + // system_version: "4.1.0".parse().unwrap(), + // }, + // TufRepoInfo { + // id: TufRepoUuid::new_v4(), + // time_created: "2025-09-26T06:00:00Z".parse().unwrap(), + // system_version: "4.2.0".parse().unwrap(), + // }, + // ] + // } + // + // #[test] + // fn test_decide_prune() { + // let all_repos = make_test_repos(); + // let [r1, r2, r3, r4, r5, r6] = &all_repos; + // let all_repos: IdOrdMap<_> = all_repos.clone().into_iter().collect(); - // Trivial case - let mut status = TufRepoPrunerStatus { - repos_keep: IdOrdMap::new(), - repos_prune: IdOrdMap::new(), - warnings: Vec::new(), - nkeep_recent_releases: 2, - nkeep_recent_uploads: 1, - }; - 
decide_prune(&IdOrdMap::new(), &BTreeSet::new(), &mut status); - assert!(status.warnings.is_empty()); - assert!(status.repos_keep.is_empty()); - assert!(status.repos_prune.is_empty()); + // // Trivial case + // let mut status = TufRepoPrunerStatus { + // repos_keep: IdOrdMap::new(), + // repos_prune: IdOrdMap::new(), + // warnings: Vec::new(), + // nkeep_recent_releases: 2, + // nkeep_recent_uploads: 1, + // }; + // decide_prune(&IdOrdMap::new(), &BTreeSet::new(), &mut status); + // assert!(status.warnings.is_empty()); + // assert!(status.repos_keep.is_empty()); + // assert!(status.repos_prune.is_empty()); - // Simple case: have some target releases to keep, no uploads. - let releases_to_keep: BTreeSet<_> = - [r3.id, r4.id].into_iter().collect(); - let mut status = TufRepoPrunerStatus { - repos_keep: IdOrdMap::new(), - repos_prune: IdOrdMap::new(), - warnings: Vec::new(), - nkeep_recent_releases: u8::try_from(releases_to_keep.len()) - .unwrap(), - nkeep_recent_uploads: 0, - }; - decide_prune(&all_repos, &releases_to_keep, &mut status); - assert!(status.warnings.is_empty()); - assert_eq!(status.repos_keep.len(), 2); - assert!(status.repos_keep.contains_key(&r3.id)); - assert!(status.repos_keep.contains_key(&r4.id)); - assert_eq!( - status.repos_prune.len(), - all_repos.len() - releases_to_keep.len() - ); - assert!(status.repos_keep.contains_key(&r1.id)); - assert!(status.repos_keep.contains_key(&r2.id)); - assert!(status.repos_keep.contains_key(&r5.id)); - assert!(status.repos_keep.contains_key(&r6.id)); + // // Simple case: have some target releases to keep, no uploads. + // let releases_to_keep: BTreeSet<_> = + // [r3.id, r4.id].into_iter().collect(); + // let mut status = TufRepoPrunerStatus { + // repos_keep: IdOrdMap::new(), + // repos_prune: IdOrdMap::new(), + // warnings: Vec::new(), + // nkeep_recent_releases: u8::try_from(releases_to_keep.len()) + // .unwrap(), + // nkeep_recent_uploads: 0, + // }; + // decide_prune(&all_repos, &releases_to_keep, &mut status); + // assert!(status.warnings.is_empty()); + // assert_eq!(status.repos_keep.len(), 2); + // assert!(status.repos_keep.contains_key(&r3.id)); + // assert!(status.repos_keep.contains_key(&r4.id)); + // assert_eq!( + // status.repos_prune.len(), + // all_repos.len() - releases_to_keep.len() + // ); + // assert!(status.repos_keep.contains_key(&r1.id)); + // assert!(status.repos_keep.contains_key(&r2.id)); + // assert!(status.repos_keep.contains_key(&r5.id)); + // assert!(status.repos_keep.contains_key(&r6.id)); - // Simple case: have no target releases to keep, but some uploads. - let releases_to_keep = BTreeSet::new(); - let mut status = TufRepoPrunerStatus { - repos_keep: IdOrdMap::new(), - repos_prune: IdOrdMap::new(), - warnings: Vec::new(), - nkeep_recent_releases: u8::try_from(releases_to_keep.len()) - .unwrap(), - nkeep_recent_uploads: 0, - }; - decide_prune(&all_repos, &releases_to_keep, &mut status); - assert!(status.warnings.is_empty()); - assert_eq!(status.repos_keep.len(), 2); - assert!(status.repos_keep.contains_key(&r3.id)); - assert!(status.repos_keep.contains_key(&r4.id)); - assert_eq!( - status.repos_prune.len(), - all_repos.len() - releases_to_keep.len() - ); - assert!(status.repos_keep.contains_key(&r1.id)); - assert!(status.repos_keep.contains_key(&r2.id)); - assert!(status.repos_keep.contains_key(&r5.id)); - assert!(status.repos_keep.contains_key(&r6.id)); + // // Simple case: have no target releases to keep, but some uploads. 
+ // let releases_to_keep = BTreeSet::new(); + // let mut status = TufRepoPrunerStatus { + // repos_keep: IdOrdMap::new(), + // repos_prune: IdOrdMap::new(), + // warnings: Vec::new(), + // nkeep_recent_releases: u8::try_from(releases_to_keep.len()) + // .unwrap(), + // nkeep_recent_uploads: 0, + // }; + // decide_prune(&all_repos, &releases_to_keep, &mut status); + // assert!(status.warnings.is_empty()); + // assert_eq!(status.repos_keep.len(), 2); + // assert!(status.repos_keep.contains_key(&r3.id)); + // assert!(status.repos_keep.contains_key(&r4.id)); + // assert_eq!( + // status.repos_prune.len(), + // all_repos.len() - releases_to_keep.len() + // ); + // assert!(status.repos_keep.contains_key(&r1.id)); + // assert!(status.repos_keep.contains_key(&r2.id)); + // assert!(status.repos_keep.contains_key(&r5.id)); + // assert!(status.repos_keep.contains_key(&r6.id)); - // Case 1: a common case - // XXX-dap - // let releases_to_keep: BTreeSet<_> = - // [r3.id, r4.id].into_iter().collect(); - // let mut status = TufRepoPrunerStatus { - // repos_keep: IdOrdMap::new(), - // repos_prune: IdOrdMap::new(), - // warnings: Vec::new(), - // nkeep_recent_releases: u8::try_from(releases_to_keep.len()) - // .unwrap(), - // nkeep_recent_uploads: 1, - // }; - } + // // Case 1: a common case + // // XXX-dap + // // let releases_to_keep: BTreeSet<_> = + // // [r3.id, r4.id].into_iter().collect(); + // // let mut status = TufRepoPrunerStatus { + // // repos_keep: IdOrdMap::new(), + // // repos_prune: IdOrdMap::new(), + // // warnings: Vec::new(), + // // nkeep_recent_releases: u8::try_from(releases_to_keep.len()) + // // .unwrap(), + // // nkeep_recent_uploads: 1, + // // }; + // } } diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index 5d5b19604a1..c8625f6edae 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -387,11 +387,23 @@ pub enum TufArtifactReplicationOperation { #[derive(Debug, Serialize)] pub struct TufRepoPrunerStatus { - pub repos_keep: IdOrdMap, - pub repos_prune: IdOrdMap, - pub warnings: Vec, + // Input + /// how many recent releases we're configured to keep pub nkeep_recent_releases: u8, + /// how many recent uploads we're configured to keep pub nkeep_recent_uploads: u8, + + // Output + /// repos that we're keeping because they're a recent target release + pub repos_keep_target_release: IdOrdMap, + /// repos that we're keeping because they were recently uploaded + pub repos_keep_recent_uploads: IdOrdMap, + /// repo that we're pruning + pub repo_prune: Option, + /// other repos that were eligible for pruning + pub other_repos_eligible_to_prune: IdOrdMap, + /// runtime warnings while attempting to prune repos + pub warnings: Vec, } #[derive(Clone, Debug, Serialize)] From 393a215b199d1dc34d62124a0a7fd3416da0a49c Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 13:31:22 -0700 Subject: [PATCH 10/19] pull in new index from 9109 --- nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-queries/src/db/datastore/update.rs | 239 ++++++++++++++++++++ schema/crdb/dbinit.sql | 6 +- schema/crdb/tuf-pruned-index/up01.sql | 3 + 4 files changed, 249 insertions(+), 2 deletions(-) create mode 100644 schema/crdb/tuf-pruned-index/up01.sql diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index a20ab1d4600..36cee81bea7 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use 
std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(194, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(195, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(195, "tuf-pruned-index"), KnownVersion::new(194, "tuf-pruned"), KnownVersion::new(193, "nexus-lockstep-port"), KnownVersion::new(192, "blueprint-source"), diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index 2a928bc1d1c..ab8001a328f 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -835,3 +835,242 @@ fn display_nvk( fn display_kind_hash(kind: &str, hash: ArtifactHash) -> String { format!("(kind: {kind}, hash: {hash})") } + +#[cfg(test)] +mod test { + use crate::db::DataStore; + use crate::db::pub_test_utils::TestDatabase; + use nexus_auth::context::OpContext; + use nexus_db_model::TargetRelease; + use omicron_common::api::external::TufArtifactMeta; + use omicron_common::api::external::TufRepoDescription; + use omicron_common::api::external::TufRepoMeta; + use omicron_common::update::ArtifactId; + use omicron_test_utils::dev; + use omicron_uuid_kinds::TufRepoUuid; + use slog_error_chain::InlineErrorChain; + use tufaceous_artifact::ArtifactHash; + use tufaceous_artifact::ArtifactKind; + use tufaceous_artifact::ArtifactVersion; + + fn make_test_repo(version: u8) -> TufRepoDescription { + let hash = ArtifactHash([version; 32]); + let version = semver::Version::new(u64::from(version), 0, 0); + let artifact_version = ArtifactVersion::new(version.to_string()) + .expect("valid artifact version"); + TufRepoDescription { + repo: TufRepoMeta { + hash, + targets_role_version: 0, + valid_until: chrono::Utc::now(), + system_version: version, + file_name: String::new(), + }, + artifacts: vec![TufArtifactMeta { + id: ArtifactId { + name: String::new(), + version: artifact_version, + kind: ArtifactKind::from_static("empty"), + }, + hash, + size: 0, + board: None, + sign: None, + }], + } + } + + async fn make_and_insert( + opctx: &OpContext, + datastore: &DataStore, + version: u8, + ) -> TufRepoUuid { + let repo = make_test_repo(version); + datastore + .tuf_repo_insert(opctx, &repo) + .await + .expect("inserting repo") + .recorded + .repo + .id() + } + + #[tokio::test] + async fn test_repo_mark_pruned() { + let logctx = dev::test_setup_log("test_repo_mark_pruned"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // Initially, there should be no TUF repos. + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert!(repos.is_empty()); + + // Add one TUF repo to the database. + let repo1id = make_and_insert(opctx, datastore, 1).await; + + // Make sure it's there. + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert!(!repos.is_empty()); + assert!(repos.iter().any(|r| r.id() == repo1id)); + + // Now prune that one. 
+ let tuf_generation1 = datastore + .tuf_get_generation(opctx) + .await + .expect("fetching TUF generation"); + let recent1 = datastore + .target_release_fetch_recent_distinct(opctx, 2) + .await + .expect("fetching recent target releases"); + datastore + .tuf_repo_mark_pruned(opctx, tuf_generation1, &recent1, repo1id) + .await + .expect("pruning release"); + + // Make sure it was pruned. + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert!(repos.is_empty()); + + // Now set up a more realistic case. + let old_target_repo1_id = make_and_insert(opctx, datastore, 2).await; + let old_target_repo2_id = make_and_insert(opctx, datastore, 3).await; + let old_target_repo3_id = make_and_insert(opctx, datastore, 4).await; + let old_target_repo4_id = make_and_insert(opctx, datastore, 5).await; + let new_upload1 = make_and_insert(opctx, datastore, 10).await; + let new_upload2 = make_and_insert(opctx, datastore, 11).await; + let new_upload3 = make_and_insert(opctx, datastore, 12).await; + + // Make sure they're all there. + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert_eq!(repos.len(), 7); + + // Set the target release a few times. + let initial = datastore + .target_release_get_current(opctx) + .await + .expect("initial target release"); + let mut next = initial; + for repo_id in + [old_target_repo2_id, old_target_repo3_id, old_target_repo4_id] + { + next = datastore + .target_release_insert( + opctx, + TargetRelease::new_system_version(&next, repo_id.into()), + ) + .await + .expect("setting target release"); + } + + // We should be able to prune the following releases. + for repo_id in + [old_target_repo1_id, new_upload1, new_upload2, new_upload3] + { + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert!(repos.iter().any(|r| r.id() == repo_id)); + + let tuf_generation = datastore + .tuf_get_generation(opctx) + .await + .expect("fetching TUF generation"); + let recent = datastore + .target_release_fetch_recent_distinct(opctx, 3) + .await + .expect("fetching recent target releases"); + + // If we supply the initial TUF generation OR the initial "recent + // releases", this should fail because things have changed. + let error = datastore + .tuf_repo_mark_pruned(opctx, tuf_generation1, &recent, repo_id) + .await + .expect_err( + "unexpectedly succeeded in pruning release with old \ + tuf_generation", + ); + eprintln!( + "got error (expected one): {}", + InlineErrorChain::new(&error) + ); + let error = datastore + .tuf_repo_mark_pruned(opctx, tuf_generation, &recent1, repo_id) + .await + .expect_err( + "unexpectedly succeeded in pruning release with old \ + recent_releases", + ); + eprintln!( + "got error (expected one): {}", + InlineErrorChain::new(&error) + ); + + // It should still be there. + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert!(repos.iter().any(|r| r.id() == repo_id)); + + // With up-to-date info, this should succeed. + datastore + .tuf_repo_mark_pruned(opctx, tuf_generation, &recent, repo_id) + .await + .expect("pruning release"); + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert!(!repos.iter().any(|r| r.id() == repo_id)); + } + + // It should be illegal to prune the following releases because they're + // too recent target releases. 
+ for repo_id in + [old_target_repo2_id, old_target_repo3_id, old_target_repo4_id] + { + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert!(repos.iter().any(|r| r.id() == repo_id)); + let tuf_generation = datastore + .tuf_get_generation(opctx) + .await + .expect("fetching TUF generation"); + let recent = datastore + .target_release_fetch_recent_distinct(opctx, 3) + .await + .expect("fetching recent target releases"); + let error = datastore + .tuf_repo_mark_pruned(opctx, tuf_generation, &recent, repo_id) + .await + .expect_err("unexpectedly pruned recent target release repo"); + eprintln!( + "found error (expected one): {}", + InlineErrorChain::new(&error) + ); + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert!(repos.iter().any(|r| r.id() == repo_id)); + } + + db.terminate().await; + logctx.cleanup_successful(); + } +} diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index e38868033d6..2407b7d7ca5 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -2555,6 +2555,10 @@ CREATE TABLE IF NOT EXISTS omicron.public.tuf_repo ( CONSTRAINT unique_system_version UNIQUE (system_version) ); +CREATE UNIQUE INDEX IF NOT EXISTS tuf_repo_not_pruned + ON omicron.public.tuf_repo (id) + WHERE time_pruned IS NULL; + -- Describes an individual artifact from an uploaded TUF repo. -- -- In the future, this may also be used to describe artifacts that are fetched @@ -6691,7 +6695,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '194.0.0', NULL) + (TRUE, NOW(), NOW(), '195.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; diff --git a/schema/crdb/tuf-pruned-index/up01.sql b/schema/crdb/tuf-pruned-index/up01.sql new file mode 100644 index 00000000000..3adaa2f48c8 --- /dev/null +++ b/schema/crdb/tuf-pruned-index/up01.sql @@ -0,0 +1,3 @@ +CREATE UNIQUE INDEX IF NOT EXISTS tuf_repo_not_pruned + ON omicron.public.tuf_repo (id) + WHERE time_pruned IS NULL; From aefd1dd4d439094a4b47580742665c365c257855 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 14:09:43 -0700 Subject: [PATCH 11/19] add test for listing TUF repos --- nexus/db-queries/src/db/datastore/update.rs | 54 +++++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index ab8001a328f..c6836501551 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -839,6 +839,7 @@ fn display_kind_hash(kind: &str, hash: ArtifactHash) -> String { #[cfg(test)] mod test { use crate::db::DataStore; + use crate::db::datastore::SQL_BATCH_SIZE; use crate::db::pub_test_utils::TestDatabase; use nexus_auth::context::OpContext; use nexus_db_model::TargetRelease; @@ -849,12 +850,18 @@ mod test { use omicron_test_utils::dev; use omicron_uuid_kinds::TufRepoUuid; use slog_error_chain::InlineErrorChain; + use std::collections::BTreeSet; use tufaceous_artifact::ArtifactHash; use tufaceous_artifact::ArtifactKind; use tufaceous_artifact::ArtifactVersion; - fn make_test_repo(version: u8) -> TufRepoDescription { - let hash = ArtifactHash([version; 32]); + fn make_test_repo(version: u32) -> TufRepoDescription { + // We just need a unique hash for each repo. We'll key it on the + // version for determinism. 
+ let version_bytes = version.to_le_bytes(); + let hash_bytes: [u8; 32] = + std::array::from_fn(|i| version_bytes[i % 4]); + let hash = ArtifactHash(hash_bytes); let version = semver::Version::new(u64::from(version), 0, 0); let artifact_version = ArtifactVersion::new(version.to_string()) .expect("valid artifact version"); @@ -883,7 +890,7 @@ mod test { async fn make_and_insert( opctx: &OpContext, datastore: &DataStore, - version: u8, + version: u32, ) -> TufRepoUuid { let repo = make_test_repo(version); datastore @@ -1073,4 +1080,45 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + /// Tests pagination behavior around `tuf_list_repos_unpruned_batched()`. + /// + /// The behavior of filtering out pruned repos is tested in + /// test_repo_mark_pruned(). + #[tokio::test] + async fn test_list_unpruned() { + let logctx = dev::test_setup_log("test_list_unpruned"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert!(repos.is_empty()); + + // Make sure we have more than a page worth of TUF repos. + let count = u32::from(SQL_BATCH_SIZE.get()) + 3; + let mut expected_repos = BTreeSet::new(); + for i in 0..count { + assert!( + expected_repos + .insert(make_and_insert(opctx, datastore, i + 1).await) + ); + } + + // Fetch them. Make sure we got them all and nothing else. + let repos = datastore + .tuf_list_repos_unpruned_batched(opctx) + .await + .expect("listing all repos"); + assert_eq!(repos.len(), usize::try_from(count).unwrap()); + for repo in repos { + assert!(expected_repos.remove(&repo.id())); + } + assert!(expected_repos.is_empty()); + + db.terminate().await; + logctx.cleanup_successful(); + } } From b6c5ef2826fc1d529349eea54c93e92d05a8a6c7 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 14:17:15 -0700 Subject: [PATCH 12/19] fix lint --- nexus/db-queries/src/db/datastore/update.rs | 208 +++++++++++--------- 1 file changed, 118 insertions(+), 90 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index c6836501551..fae3775ade7 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -14,7 +14,7 @@ use crate::db::datastore::target_release::RecentTargetReleases; use crate::db::model::SemverVersion; use crate::db::pagination::Paginator; use crate::db::pagination::paginated; -use async_bb8_diesel::{AsyncConnection, AsyncRunQueryDsl}; +use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use diesel::result::Error as DieselError; use nexus_db_errors::{ErrorHandler, public_error_from_diesel}; @@ -292,98 +292,126 @@ impl DataStore { opctx.authorize(authz::Action::Modify, &authz::FLEET).await?; let conn = self.pool_connection_authorized(opctx).await?; - conn.transaction_async(|txn| async move { - // If the target release generation has changed, bail out. This - // means someone has changed the target release, which means they - // may have set it to the repo we're trying to prune. 
(This check - // could be more fine-grained, but this is adequate for now) - let target_release_generation_now = { - use nexus_db_schema::schema::target_release::dsl; - dsl::target_release - .select(TargetRelease::as_select()) - .order_by(dsl::generation.desc()) - .limit(1) - .first_async(&txn) - .await - .map_err(|err| { - public_error_from_diesel(err, ErrorHandler::Server) - }) - .internal_context( - "fetching latest target_release generation", - )? - .generation - .0 - }; - if target_release_generation_now - != recent_releases.target_release_generation - { - return Err(TransactionError::CustomError(Error::conflict( - format!( - "bailing out to avoid risk of marking current target \ - release pruned: target release has changed since \ - check (currently {}, was {})", - target_release_generation_now, - recent_releases.target_release_generation - ), - ))); - } + let error = OptionalError::new(); + self.transaction_retry_wrapper("tuf_repo_mark_pruned") + .transaction(&conn, |txn| { + let error = error.clone(); + async move { + // If the target release generation has changed, bail out. + // This means someone has changed the target release, which + // means they may have set it to the repo we're trying to + // prune. (This check could be more fine-grained, but this + // is adequate for now) + let target_release_generation_now = { + use nexus_db_schema::schema::target_release::dsl; + dsl::target_release + .select(TargetRelease::as_select()) + .order_by(dsl::generation.desc()) + .limit(1) + .first_async(&txn) + .await + .map_err(|e| { + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + }) + .internal_context( + "fetching latest target_release generation", + ) + .map_err(|e| { + error.bail(TransactionError::CustomError(e)) + })? + .generation + .0 + }; + if target_release_generation_now + != recent_releases.target_release_generation + { + return Err(error.bail(TransactionError::CustomError( + Error::conflict(format!( + "bailing out to avoid risk of marking current \ + target release pruned: target release has \ + changed since check (currently {}, was {})", + target_release_generation_now, + recent_releases.target_release_generation + )), + ))); + } - // If the TUF repo generation has changed, bail out. Someone else - // is adding or pruning repos. Force the caller to re-evaluate. - // This is probably more conservative than necessary, but ensures - // that two Nexus instances don't concurrently decide to prune a lot - // more than either of them would on their own because they chose - // different repos to keep. - let tuf_generation_now = - get_generation(&txn).await.map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - })?; - if tuf_generation_now != initial_tuf_generation { - return Err(TransactionError::CustomError(Error::conflict( - format!( - "bailing out to avoid risk of pruning too much: \ - tuf repo generation has changed since check \ - (currently {}, was {})", - tuf_generation_now, initial_tuf_generation, - ), - ))); - } + // If the TUF repo generation has changed, bail out. + // Someone else is adding or pruning repos. Force the + // caller to re-evaluate. This is probably more + // conservative than necessary, but ensures that two Nexus + // instances don't concurrently decide to prune a lot more + // than either of them would on their own because they chose + // different repos to keep. 
+ let tuf_generation_now = get_generation(&txn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + }) + .internal_context("fetching latest TUF generation") + .map_err(|e| { + error.bail(TransactionError::CustomError(e)) + })?; + if tuf_generation_now != initial_tuf_generation { + return Err(error.bail(TransactionError::CustomError( + Error::conflict(format!( + "bailing out to avoid risk of pruning too much: \ + tuf repo generation has changed since check \ + (currently {}, was {})", + tuf_generation_now, initial_tuf_generation, + )), + ))); + } - // Try to mark the repo pruned. - use nexus_db_schema::schema::tuf_repo::dsl; - let count = diesel::update(dsl::tuf_repo) - .filter(dsl::id.eq(to_db_typed_uuid(tuf_repo_id))) - .filter(dsl::time_pruned.is_null()) - .set(dsl::time_pruned.eq(chrono::Utc::now())) - .execute_async(&txn) - .await - .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) - .internal_context("marking TUF repo pruned")?; - - // If we made any changes, bump the TUF repo generation. This is - // necessary because that generation covers the set of TUF repos - // whose artifacts should be replicated. - if count > 0 { - put_generation( - &txn, - tuf_generation_now.into(), - tuf_generation_now.next().into(), - ) - .await - .map_err(|error| { - public_error_from_diesel(error, ErrorHandler::Server) - })?; - } + // Try to mark the repo pruned. + use nexus_db_schema::schema::tuf_repo::dsl; + let count = diesel::update(dsl::tuf_repo) + .filter(dsl::id.eq(to_db_typed_uuid(tuf_repo_id))) + .filter(dsl::time_pruned.is_null()) + .set(dsl::time_pruned.eq(chrono::Utc::now())) + .execute_async(&txn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + }) + .internal_context("marking TUF repo pruned") + .map_err(|e| { + error.bail(TransactionError::CustomError(e)) + })?; + + // If we made any changes, bump the TUF repo generation. + // This is necessary because that generation covers the set + // of TUF repos whose artifacts should be replicated. + if count > 0 { + put_generation( + &txn, + tuf_generation_now.into(), + tuf_generation_now.next().into(), + ) + .await + .map_err(|error| { + public_error_from_diesel( + error, + ErrorHandler::Server, + ) + }) + .internal_context("bumping TUF generation") + .map_err(|e| { + error.bail(TransactionError::CustomError(e)) + })?; + } - Ok(()) - }) - .await - .map_err(|error: TransactionError| match error { - TransactionError::CustomError(error) => error, - TransactionError::Database(error) => { - public_error_from_diesel(error, ErrorHandler::Server) - } - }) + Ok(()) + } + }) + .await + .map_err(|e| match error.take() { + Some(err) => err.into(), + None => public_error_from_diesel(e, ErrorHandler::Server), + }) } /// Returns the current TUF repo generation number. 
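A rough sketch of the calling pattern that `tuf_get_generation()`,
`target_release_fetch_recent_distinct()`, `tuf_list_repos_unpruned_batched()`,
and `tuf_repo_mark_pruned()` are designed around. This is illustrative only and
not part of the series: the real caller is the `tuf_repo_pruner` background
task, the repo-selection step below is deliberately oversimplified (the task
applies its keep/prune policy in `decide_prune()`), and the function name and
error handling here are made up for the example (imports elided).

    // Sketch only: snapshot state, pick a candidate, then prune conditionally.
    async fn prune_one_repo_sketch(
        opctx: &OpContext,
        datastore: &DataStore,
    ) -> anyhow::Result<()> {
        // Snapshot the state that the pruning decision depends on.
        let tuf_generation = datastore.tuf_get_generation(opctx).await?;
        let recent = datastore
            .target_release_fetch_recent_distinct(opctx, 3)
            .await?;
        let repos = datastore.tuf_list_repos_unpruned_batched(opctx).await?;

        // Pick some candidate that is not a recent target release.  (The real
        // task makes a much more careful choice here.)
        let Some(prune_id) = repos
            .iter()
            .map(|r| r.id())
            .find(|id| !recent.releases.contains(id))
        else {
            return Ok(());
        };

        // The transaction re-checks both generations, so this returns a
        // conflict error if the target release or the set of TUF repos
        // changed after the snapshot above was taken.
        datastore
            .tuf_repo_mark_pruned(opctx, tuf_generation, &recent, prune_id)
            .await?;
        Ok(())
    }
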
From e1f57e9c9d06e2e1ecca76f4229606839caa146b Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 14:45:51 -0700 Subject: [PATCH 13/19] fix lint, add tests --- .../src/db/datastore/target_release.rs | 227 +++++++++++++++++- nexus/db-queries/src/db/datastore/update.rs | 60 +---- 2 files changed, 227 insertions(+), 60 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index 0d14f553687..f4e1fd786be 100644 --- a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -208,6 +208,7 @@ impl DataStore { } /// Represents information fetched about recent target releases +#[derive(Debug)] pub struct RecentTargetReleases { /// distinct target releases found pub releases: BTreeSet, @@ -221,17 +222,21 @@ pub struct RecentTargetReleases { } #[cfg(test)] -mod test { - use super::*; +pub(crate) mod test { + use crate::db::DataStore; use crate::db::model::{Generation, TargetReleaseSource}; use crate::db::pub_test_utils::TestDatabase; use chrono::{TimeDelta, Utc}; + use nexus_auth::context::OpContext; + use nexus_db_model::TargetRelease; use omicron_common::api::external::{ TufArtifactMeta, TufRepoDescription, TufRepoMeta, }; use omicron_common::update::ArtifactId; use omicron_test_utils::dev; + use omicron_uuid_kinds::TufRepoUuid; use semver::Version; + use tufaceous_artifact::ArtifactHash; use tufaceous_artifact::{ArtifactKind, ArtifactVersion}; #[tokio::test] @@ -353,4 +358,222 @@ mod test { db.terminate().await; logctx.cleanup_successful(); } + + pub(crate) fn make_test_repo(version: u32) -> TufRepoDescription { + // We just need a unique hash for each repo. We'll key it on the + // version for determinism. + let version_bytes = version.to_le_bytes(); + let hash_bytes: [u8; 32] = + std::array::from_fn(|i| version_bytes[i % 4]); + let hash = ArtifactHash(hash_bytes); + let version = semver::Version::new(u64::from(version), 0, 0); + let artifact_version = ArtifactVersion::new(version.to_string()) + .expect("valid artifact version"); + TufRepoDescription { + repo: TufRepoMeta { + hash, + targets_role_version: 0, + valid_until: chrono::Utc::now(), + system_version: version, + file_name: String::new(), + }, + artifacts: vec![TufArtifactMeta { + id: ArtifactId { + name: String::new(), + version: artifact_version, + kind: ArtifactKind::from_static("empty"), + }, + hash, + size: 0, + board: None, + sign: None, + }], + } + } + + pub(crate) async fn make_and_insert( + opctx: &OpContext, + datastore: &DataStore, + version: u32, + ) -> TufRepoUuid { + let repo = make_test_repo(version); + datastore + .tuf_repo_insert(opctx, &repo) + .await + .expect("inserting repo") + .recorded + .repo + .id() + } + + #[tokio::test] + async fn test_recent_distinct() { + let logctx = dev::test_setup_log("target_release_datastore"); + let db = TestDatabase::new_with_datastore(&logctx.log).await; + let (opctx, datastore) = (db.opctx(), db.datastore()); + + // From initial conditions, this should succeed but find nothing of + // note. That's because on a freshly installed system, there's a row in + // the target_release table, but it has no system version in it. 
+ let generation = datastore + .target_release_get_current(opctx) + .await + .unwrap() + .generation + .0; + let recent = datastore + .target_release_fetch_recent_distinct(opctx, 3) + .await + .unwrap(); + assert_eq!(recent.count, 3); + assert!(recent.releases.is_empty()); + assert_eq!(recent.target_release_generation, generation); + + // Now insert a TUF repo and try again. That alone shouldn't change + // anything. + let repo1id = make_and_insert(opctx, datastore, 1).await; + let target_release = + datastore.target_release_get_current(opctx).await.unwrap(); + let last_generation = generation; + let generation = target_release.generation.0; + let recent = datastore + .target_release_fetch_recent_distinct(opctx, 3) + .await + .unwrap(); + assert_eq!(recent.count, 3); + assert!(recent.releases.is_empty()); + assert_eq!(last_generation, generation); + assert_eq!(recent.target_release_generation, generation); + + // Now insert a target release and try again. + let target_release = datastore + .target_release_insert( + opctx, + TargetRelease::new_system_version( + &target_release, + repo1id.into(), + ), + ) + .await + .unwrap(); + let last_generation = generation; + let generation = target_release.generation.0; + assert_ne!(last_generation, generation); + let recent = datastore + .target_release_fetch_recent_distinct(opctx, 3) + .await + .unwrap(); + assert_eq!(recent.count, 3); + assert_eq!(recent.releases.len(), 1); + assert!(recent.releases.contains(&repo1id)); + assert_eq!(recent.target_release_generation, generation); + + // Now insert a second distinct target release and try again. + let repo2id = make_and_insert(opctx, datastore, 2).await; + let target_release = datastore + .target_release_insert( + opctx, + TargetRelease::new_system_version( + &target_release, + repo2id.into(), + ), + ) + .await + .unwrap(); + let last_generation = generation; + let generation = target_release.generation.0; + assert_ne!(last_generation, generation); + let recent = datastore + .target_release_fetch_recent_distinct(opctx, 3) + .await + .unwrap(); + assert_eq!(recent.count, 3); + assert_eq!(recent.releases.len(), 2); + assert!(recent.releases.contains(&repo1id)); + assert!(recent.releases.contains(&repo2id)); + assert_eq!(recent.target_release_generation, generation); + + // If we only look back far enough for one, we'll only find one. + let recent = datastore + .target_release_fetch_recent_distinct(opctx, 1) + .await + .unwrap(); + assert_eq!(recent.count, 1); + assert_eq!(recent.releases.len(), 1); + assert!(recent.releases.contains(&repo2id)); + assert_eq!(recent.target_release_generation, generation); + + // Set the target release to the same value again. We'll use this to + // test that it looks back further than two rows when necessary. + let target_release = datastore + .target_release_insert( + opctx, + TargetRelease::new_system_version( + &target_release, + repo2id.into(), + ), + ) + .await + .unwrap(); + let generation = target_release.generation.0; + let recent = datastore + .target_release_fetch_recent_distinct(opctx, 3) + .await + .unwrap(); + assert_eq!(recent.count, 3); + assert_eq!(recent.releases.len(), 2); + assert!(recent.releases.contains(&repo1id)); + assert!(recent.releases.contains(&repo2id)); + assert_eq!(recent.target_release_generation, generation); + + // If we do that a total of 7 times (so, five more times), it will have + // to look back 8 rows to find two distinct releases, and it won't know + // if it's looked back far enough. 
+ let mut target_release = target_release; + for i in 0..6 { + target_release = datastore + .target_release_insert( + opctx, + TargetRelease::new_system_version( + &target_release, + repo2id.into(), + ), + ) + .await + .unwrap(); + let generation = target_release.generation.0; + if i < 5 { + let recent = datastore + .target_release_fetch_recent_distinct(opctx, 2) + .await + .unwrap(); + assert_eq!(recent.count, 2); + assert_eq!(recent.releases.len(), 2); + assert!(recent.releases.contains(&repo1id)); + assert!(recent.releases.contains(&repo2id)); + assert_eq!(recent.target_release_generation, generation); + } else { + let error = datastore + .target_release_fetch_recent_distinct(opctx, 2) + .await + .unwrap_err(); + eprintln!("got error (expected one): {}", error); + // It'll look further if we're looking for more distinct + // releases. + let recent = datastore + .target_release_fetch_recent_distinct(opctx, 3) + .await + .unwrap(); + assert_eq!(recent.count, 3); + assert_eq!(recent.releases.len(), 2); + assert!(recent.releases.contains(&repo1id)); + assert!(recent.releases.contains(&repo2id)); + assert_eq!(recent.target_release_generation, generation); + } + } + + // Clean up. + db.terminate().await; + logctx.cleanup_successful(); + } } diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index fae3775ade7..5ce12339310 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -866,69 +866,13 @@ fn display_kind_hash(kind: &str, hash: ArtifactHash) -> String { #[cfg(test)] mod test { - use crate::db::DataStore; use crate::db::datastore::SQL_BATCH_SIZE; + use crate::db::datastore::target_release::test::make_and_insert; use crate::db::pub_test_utils::TestDatabase; - use nexus_auth::context::OpContext; use nexus_db_model::TargetRelease; - use omicron_common::api::external::TufArtifactMeta; - use omicron_common::api::external::TufRepoDescription; - use omicron_common::api::external::TufRepoMeta; - use omicron_common::update::ArtifactId; use omicron_test_utils::dev; - use omicron_uuid_kinds::TufRepoUuid; use slog_error_chain::InlineErrorChain; use std::collections::BTreeSet; - use tufaceous_artifact::ArtifactHash; - use tufaceous_artifact::ArtifactKind; - use tufaceous_artifact::ArtifactVersion; - - fn make_test_repo(version: u32) -> TufRepoDescription { - // We just need a unique hash for each repo. We'll key it on the - // version for determinism. 
- let version_bytes = version.to_le_bytes(); - let hash_bytes: [u8; 32] = - std::array::from_fn(|i| version_bytes[i % 4]); - let hash = ArtifactHash(hash_bytes); - let version = semver::Version::new(u64::from(version), 0, 0); - let artifact_version = ArtifactVersion::new(version.to_string()) - .expect("valid artifact version"); - TufRepoDescription { - repo: TufRepoMeta { - hash, - targets_role_version: 0, - valid_until: chrono::Utc::now(), - system_version: version, - file_name: String::new(), - }, - artifacts: vec![TufArtifactMeta { - id: ArtifactId { - name: String::new(), - version: artifact_version, - kind: ArtifactKind::from_static("empty"), - }, - hash, - size: 0, - board: None, - sign: None, - }], - } - } - - async fn make_and_insert( - opctx: &OpContext, - datastore: &DataStore, - version: u32, - ) -> TufRepoUuid { - let repo = make_test_repo(version); - datastore - .tuf_repo_insert(opctx, &repo) - .await - .expect("inserting repo") - .recorded - .repo - .id() - } #[tokio::test] async fn test_repo_mark_pruned() { @@ -1126,7 +1070,7 @@ mod test { assert!(repos.is_empty()); // Make sure we have more than a page worth of TUF repos. - let count = u32::from(SQL_BATCH_SIZE.get()) + 3; + let count = SQL_BATCH_SIZE.get() + 3; let mut expected_repos = BTreeSet::new(); for i in 0..count { assert!( From 5b3edcf39f70b145d1ede62c6512251f90c63094 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 15:53:21 -0700 Subject: [PATCH 14/19] use retryable stuff properly --- nexus/db-queries/src/db/datastore/update.rs | 88 ++++++++++----------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index 5ce12339310..ec74f397eb9 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -17,8 +17,8 @@ use crate::db::pagination::paginated; use async_bb8_diesel::AsyncRunQueryDsl; use diesel::prelude::*; use diesel::result::Error as DieselError; +use nexus_db_errors::OptionalError; use nexus_db_errors::{ErrorHandler, public_error_from_diesel}; -use nexus_db_errors::{OptionalError, TransactionError}; use nexus_db_lookup::DbConnection; use nexus_db_model::{ ArtifactHash, TargetRelease, TufArtifact, TufRepo, TufRepoDescription, @@ -311,16 +311,16 @@ impl DataStore { .first_async(&txn) .await .map_err(|e| { - public_error_from_diesel( - e, - ErrorHandler::Server, - ) - }) - .internal_context( - "fetching latest target_release generation", - ) - .map_err(|e| { - error.bail(TransactionError::CustomError(e)) + error.bail_retryable_or_else(e, |e| { + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + .internal_context( + "fetching latest target_release \ + generation", + ) + }) })? .generation .0 @@ -328,15 +328,13 @@ impl DataStore { if target_release_generation_now != recent_releases.target_release_generation { - return Err(error.bail(TransactionError::CustomError( - Error::conflict(format!( - "bailing out to avoid risk of marking current \ + return Err(error.bail(Error::conflict(format!( + "bailing out to avoid risk of marking current \ target release pruned: target release has \ changed since check (currently {}, was {})", - target_release_generation_now, - recent_releases.target_release_generation - )), - ))); + target_release_generation_now, + recent_releases.target_release_generation + )))); } // If the TUF repo generation has changed, bail out. 
@@ -346,24 +344,25 @@ impl DataStore { // instances don't concurrently decide to prune a lot more // than either of them would on their own because they chose // different repos to keep. - let tuf_generation_now = get_generation(&txn) - .await - .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - }) - .internal_context("fetching latest TUF generation") - .map_err(|e| { - error.bail(TransactionError::CustomError(e)) + let tuf_generation_now = + get_generation(&txn).await.map_err(|e| { + error.bail_retryable_or_else(e, |e| { + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + .internal_context( + "fetching latest TUF generation", + ) + }) })?; if tuf_generation_now != initial_tuf_generation { - return Err(error.bail(TransactionError::CustomError( - Error::conflict(format!( + return Err(error.bail(Error::conflict(format!( "bailing out to avoid risk of pruning too much: \ tuf repo generation has changed since check \ (currently {}, was {})", tuf_generation_now, initial_tuf_generation, - )), - ))); + )))); } // Try to mark the repo pruned. @@ -375,11 +374,13 @@ impl DataStore { .execute_async(&txn) .await .map_err(|e| { - public_error_from_diesel(e, ErrorHandler::Server) - }) - .internal_context("marking TUF repo pruned") - .map_err(|e| { - error.bail(TransactionError::CustomError(e)) + error.bail_retryable_or_else(e, |e| { + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + .internal_context("marking TUF repo pruned") + }) })?; // If we made any changes, bump the TUF repo generation. @@ -392,15 +393,14 @@ impl DataStore { tuf_generation_now.next().into(), ) .await - .map_err(|error| { - public_error_from_diesel( - error, - ErrorHandler::Server, - ) - }) - .internal_context("bumping TUF generation") .map_err(|e| { - error.bail(TransactionError::CustomError(e)) + error.bail_retryable_or_else(e, |e| { + public_error_from_diesel( + e, + ErrorHandler::Server, + ) + .internal_context("bumping TUF generation") + }) })?; } @@ -409,7 +409,7 @@ impl DataStore { }) .await .map_err(|e| match error.take() { - Some(err) => err.into(), + Some(err) => err, None => public_error_from_diesel(e, ErrorHandler::Server), }) } From b5d657f7fecf87bbaf394f06a6e952717e319de9 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 16:12:24 -0700 Subject: [PATCH 15/19] fix up task tests --- .../app/background/tasks/tuf_repo_pruner.rs | 252 +++++++++--------- 1 file changed, 132 insertions(+), 120 deletions(-) diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs index 91ff3a83de5..70fc2cce5ce 100644 --- a/nexus/src/app/background/tasks/tuf_repo_pruner.rs +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -221,128 +221,140 @@ fn decide_prune(args: TufRepoPrune) -> TufRepoPrunerStatus { #[cfg(test)] mod test { - // XXX-dap clean up all imports - // use super::decide_prune; - // use iddqd::IdOrdMap; - // use nexus_types::internal_api::background::TufRepoInfo; - // use nexus_types::internal_api::background::TufRepoPrunerStatus; - // use omicron_uuid_kinds::TufRepoUuid; - // use std::collections::BTreeSet; - // - // fn make_test_repos() -> [TufRepoInfo; 6] { - // [ - // TufRepoInfo { - // id: TufRepoUuid::new_v4(), - // time_created: "2025-09-26T01:00:00Z".parse().unwrap(), - // system_version: "1.0.0".parse().unwrap(), - // }, - // TufRepoInfo { - // id: TufRepoUuid::new_v4(), - // time_created: "2025-09-26T02:00:00Z".parse().unwrap(), - // system_version: "2.0.0".parse().unwrap(), - // }, - // TufRepoInfo { 
- // id: TufRepoUuid::new_v4(), - // time_created: "2025-09-26T03:00:00Z".parse().unwrap(), - // system_version: "3.0.0".parse().unwrap(), - // }, - // TufRepoInfo { - // id: TufRepoUuid::new_v4(), - // time_created: "2025-09-26T04:00:00Z".parse().unwrap(), - // system_version: "4.0.0".parse().unwrap(), - // }, - // TufRepoInfo { - // id: TufRepoUuid::new_v4(), - // time_created: "2025-09-26T05:00:00Z".parse().unwrap(), - // system_version: "4.1.0".parse().unwrap(), - // }, - // TufRepoInfo { - // id: TufRepoUuid::new_v4(), - // time_created: "2025-09-26T06:00:00Z".parse().unwrap(), - // system_version: "4.2.0".parse().unwrap(), - // }, - // ] - // } - // - // #[test] - // fn test_decide_prune() { - // let all_repos = make_test_repos(); - // let [r1, r2, r3, r4, r5, r6] = &all_repos; - // let all_repos: IdOrdMap<_> = all_repos.clone().into_iter().collect(); + use super::decide_prune; + use crate::app::background::tasks::tuf_repo_pruner::TufRepoPrune; + use iddqd::IdOrdMap; + use nexus_types::internal_api::background::TufRepoInfo; + use omicron_uuid_kinds::TufRepoUuid; + use std::collections::BTreeSet; - // // Trivial case - // let mut status = TufRepoPrunerStatus { - // repos_keep: IdOrdMap::new(), - // repos_prune: IdOrdMap::new(), - // warnings: Vec::new(), - // nkeep_recent_releases: 2, - // nkeep_recent_uploads: 1, - // }; - // decide_prune(&IdOrdMap::new(), &BTreeSet::new(), &mut status); - // assert!(status.warnings.is_empty()); - // assert!(status.repos_keep.is_empty()); - // assert!(status.repos_prune.is_empty()); + fn make_test_repos() -> [TufRepoInfo; 6] { + [ + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T01:00:00Z".parse().unwrap(), + system_version: "1.0.0".parse().unwrap(), + }, + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T02:00:00Z".parse().unwrap(), + system_version: "2.0.0".parse().unwrap(), + }, + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T03:00:00Z".parse().unwrap(), + system_version: "3.0.0".parse().unwrap(), + }, + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T04:00:00Z".parse().unwrap(), + system_version: "4.0.0".parse().unwrap(), + }, + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T05:00:00Z".parse().unwrap(), + system_version: "4.1.0".parse().unwrap(), + }, + TufRepoInfo { + id: TufRepoUuid::new_v4(), + time_created: "2025-09-26T06:00:00Z".parse().unwrap(), + system_version: "4.2.0".parse().unwrap(), + }, + ] + } + + #[test] + fn test_decide_prune() { + let all_repos = make_test_repos(); + let [r1, r2, r3, r4, r5, r6] = &all_repos; + let all_repos: IdOrdMap<_> = all_repos.clone().into_iter().collect(); - // // Simple case: have some target releases to keep, no uploads. 
- // let releases_to_keep: BTreeSet<_> = - // [r3.id, r4.id].into_iter().collect(); - // let mut status = TufRepoPrunerStatus { - // repos_keep: IdOrdMap::new(), - // repos_prune: IdOrdMap::new(), - // warnings: Vec::new(), - // nkeep_recent_releases: u8::try_from(releases_to_keep.len()) - // .unwrap(), - // nkeep_recent_uploads: 0, - // }; - // decide_prune(&all_repos, &releases_to_keep, &mut status); - // assert!(status.warnings.is_empty()); - // assert_eq!(status.repos_keep.len(), 2); - // assert!(status.repos_keep.contains_key(&r3.id)); - // assert!(status.repos_keep.contains_key(&r4.id)); - // assert_eq!( - // status.repos_prune.len(), - // all_repos.len() - releases_to_keep.len() - // ); - // assert!(status.repos_keep.contains_key(&r1.id)); - // assert!(status.repos_keep.contains_key(&r2.id)); - // assert!(status.repos_keep.contains_key(&r5.id)); - // assert!(status.repos_keep.contains_key(&r6.id)); + // Trivial case: nothing available to keep or prune. + let empty_map = IdOrdMap::new(); + let empty_set = BTreeSet::new(); + let status = decide_prune(TufRepoPrune { + nkeep_recent_releases: 2, + nkeep_recent_uploads: 1, + all_tuf_repos: &empty_map, + recent_releases: &empty_set, + }); + assert_eq!(status.nkeep_recent_releases, 2); + assert_eq!(status.nkeep_recent_uploads, 1); + assert!(status.warnings.is_empty()); + assert!(status.repos_keep_target_release.is_empty()); + assert!(status.repos_keep_recent_uploads.is_empty()); + assert!(status.repo_prune.is_none()); + assert!(status.other_repos_eligible_to_prune.is_empty()); - // // Simple case: have no target releases to keep, but some uploads. - // let releases_to_keep = BTreeSet::new(); - // let mut status = TufRepoPrunerStatus { - // repos_keep: IdOrdMap::new(), - // repos_prune: IdOrdMap::new(), - // warnings: Vec::new(), - // nkeep_recent_releases: u8::try_from(releases_to_keep.len()) - // .unwrap(), - // nkeep_recent_uploads: 0, - // }; - // decide_prune(&all_repos, &releases_to_keep, &mut status); - // assert!(status.warnings.is_empty()); - // assert_eq!(status.repos_keep.len(), 2); - // assert!(status.repos_keep.contains_key(&r3.id)); - // assert!(status.repos_keep.contains_key(&r4.id)); - // assert_eq!( - // status.repos_prune.len(), - // all_repos.len() - releases_to_keep.len() - // ); - // assert!(status.repos_keep.contains_key(&r1.id)); - // assert!(status.repos_keep.contains_key(&r2.id)); - // assert!(status.repos_keep.contains_key(&r5.id)); - // assert!(status.repos_keep.contains_key(&r6.id)); + // Simple case: we're only allowing it to keep recent releases. 
+ let releases_to_keep: BTreeSet<_> = + [r3.id, r4.id].into_iter().collect(); + let status = decide_prune(TufRepoPrune { + nkeep_recent_releases: u8::try_from(releases_to_keep.len()) + .unwrap(), + nkeep_recent_uploads: 0, + all_tuf_repos: &all_repos, + recent_releases: &releases_to_keep, + }); + assert!(status.warnings.is_empty()); + assert_eq!(status.repos_keep_target_release.len(), 2); + assert!(status.repos_keep_target_release.contains_key(&r3.id)); + assert!(status.repos_keep_target_release.contains_key(&r4.id)); + assert!(status.repos_keep_recent_uploads.is_empty()); + assert_eq!(status.repo_prune.expect("repo to prune").id, r1.id); + assert_eq!( + status.other_repos_eligible_to_prune.len(), + all_repos.len() - releases_to_keep.len() + ); + assert!(status.other_repos_eligible_to_prune.contains_key(&r2.id)); + assert!(status.other_repos_eligible_to_prune.contains_key(&r5.id)); + assert!(status.other_repos_eligible_to_prune.contains_key(&r6.id)); - // // Case 1: a common case - // // XXX-dap - // // let releases_to_keep: BTreeSet<_> = - // // [r3.id, r4.id].into_iter().collect(); - // // let mut status = TufRepoPrunerStatus { - // // repos_keep: IdOrdMap::new(), - // // repos_prune: IdOrdMap::new(), - // // warnings: Vec::new(), - // // nkeep_recent_releases: u8::try_from(releases_to_keep.len()) - // // .unwrap(), - // // nkeep_recent_uploads: 1, - // // }; - // } + // Simple case: we're only allowing it to keep recent uploads, + // by virtue of having no recent target releases. + let status = decide_prune(TufRepoPrune { + nkeep_recent_releases: 3, + nkeep_recent_uploads: 2, + all_tuf_repos: &all_repos, + recent_releases: &empty_set, + }); + assert!(status.warnings.is_empty()); + assert!(status.repos_keep_target_release.is_empty()); + assert!(status.repos_keep_target_release.contains_key(&r5.id)); + assert!(status.repos_keep_target_release.contains_key(&r6.id)); + assert_eq!(status.repos_keep_recent_uploads.len(), 2); + assert_eq!(status.repo_prune.expect("repo to prune").id, r1.id); + assert_eq!( + status.other_repos_eligible_to_prune.len(), + all_repos.len() - 1 - status.repos_keep_recent_uploads.len(), + ); + assert!(status.other_repos_eligible_to_prune.contains_key(&r2.id)); + assert!(status.other_repos_eligible_to_prune.contains_key(&r3.id)); + assert!(status.other_repos_eligible_to_prune.contains_key(&r4.id)); + + // Keep a combination of recent target releases and recent uploads. 
+ let releases_to_keep: BTreeSet<_> = + [r3.id, r4.id].into_iter().collect(); + let status = decide_prune(TufRepoPrune { + nkeep_recent_releases: u8::try_from(releases_to_keep.len()) + .unwrap(), + nkeep_recent_uploads: 1, + all_tuf_repos: &all_repos, + recent_releases: &releases_to_keep, + }); + assert!(status.warnings.is_empty()); + assert_eq!(status.repos_keep_target_release.len(), 2); + assert!(status.repos_keep_target_release.contains_key(&r3.id)); + assert!(status.repos_keep_target_release.contains_key(&r4.id)); + assert_eq!(status.repos_keep_recent_uploads.len(), 1); + assert!(status.repos_keep_recent_uploads.contains_key(&r6.id)); + assert_eq!(status.repo_prune.expect("repo to prune").id, r1.id); + assert_eq!( + status.other_repos_eligible_to_prune.len(), + all_repos.len() + - releases_to_keep.len() + - status.repos_keep_recent_uploads.len(), + ); + assert!(status.other_repos_eligible_to_prune.contains_key(&r5.id)); + } } From d381e0a0c1082c521106ddadd7114f77792ec8bd Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 17:21:07 -0700 Subject: [PATCH 16/19] fix the bug again --- .../app/background/tasks/tuf_repo_pruner.rs | 28 +++++-------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs index 70fc2cce5ce..fae158ef625 100644 --- a/nexus/src/app/background/tasks/tuf_repo_pruner.rs +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -200,12 +200,9 @@ fn decide_prune(args: TufRepoPrune) -> TufRepoPrunerStatus { continue; } - if repo_prune.is_none() { - repo_prune = Some(repo.clone()); - continue; + if let Some(last_maybe_prune) = repo_prune.replace(repo.clone()) { + other_repos_eligible_to_prune.insert_overwrite(last_maybe_prune); } - - other_repos_eligible_to_prune.insert_overwrite(repo.clone()); } TufRepoPrunerStatus { @@ -266,6 +263,7 @@ mod test { #[test] fn test_decide_prune() { let all_repos = make_test_repos(); + eprintln!("repos: {:#?}", all_repos); let [r1, r2, r3, r4, r5, r6] = &all_repos; let all_repos: IdOrdMap<_> = all_repos.clone().into_iter().collect(); @@ -302,10 +300,7 @@ mod test { assert!(status.repos_keep_target_release.contains_key(&r4.id)); assert!(status.repos_keep_recent_uploads.is_empty()); assert_eq!(status.repo_prune.expect("repo to prune").id, r1.id); - assert_eq!( - status.other_repos_eligible_to_prune.len(), - all_repos.len() - releases_to_keep.len() - ); + assert_eq!(status.other_repos_eligible_to_prune.len(), 3); assert!(status.other_repos_eligible_to_prune.contains_key(&r2.id)); assert!(status.other_repos_eligible_to_prune.contains_key(&r5.id)); assert!(status.other_repos_eligible_to_prune.contains_key(&r6.id)); @@ -320,14 +315,9 @@ mod test { }); assert!(status.warnings.is_empty()); assert!(status.repos_keep_target_release.is_empty()); - assert!(status.repos_keep_target_release.contains_key(&r5.id)); - assert!(status.repos_keep_target_release.contains_key(&r6.id)); assert_eq!(status.repos_keep_recent_uploads.len(), 2); assert_eq!(status.repo_prune.expect("repo to prune").id, r1.id); - assert_eq!( - status.other_repos_eligible_to_prune.len(), - all_repos.len() - 1 - status.repos_keep_recent_uploads.len(), - ); + assert_eq!(status.other_repos_eligible_to_prune.len(), 3); assert!(status.other_repos_eligible_to_prune.contains_key(&r2.id)); assert!(status.other_repos_eligible_to_prune.contains_key(&r3.id)); assert!(status.other_repos_eligible_to_prune.contains_key(&r4.id)); @@ -349,12 +339,8 @@ mod test { 
assert_eq!(status.repos_keep_recent_uploads.len(), 1); assert!(status.repos_keep_recent_uploads.contains_key(&r6.id)); assert_eq!(status.repo_prune.expect("repo to prune").id, r1.id); - assert_eq!( - status.other_repos_eligible_to_prune.len(), - all_repos.len() - - releases_to_keep.len() - - status.repos_keep_recent_uploads.len(), - ); + assert_eq!(status.other_repos_eligible_to_prune.len(), 2); + assert!(status.other_repos_eligible_to_prune.contains_key(&r2.id)); assert!(status.other_repos_eligible_to_prune.contains_key(&r5.id)); } } From fab9379da27533122b3f7a21b543beaff2d60e94 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 20:18:06 -0700 Subject: [PATCH 17/19] add integration test for repo pruning --- .../src/db/datastore/target_release.rs | 56 +------------------ nexus/db-queries/src/db/datastore/update.rs | 27 +++++---- .../src/db/pub_test_utils/helpers.rs | 53 ++++++++++++++++++ nexus/tests/integration_tests/updates.rs | 43 ++++++++++++++ 4 files changed, 115 insertions(+), 64 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/target_release.rs b/nexus/db-queries/src/db/datastore/target_release.rs index f4e1fd786be..6e8f582ec38 100644 --- a/nexus/db-queries/src/db/datastore/target_release.rs +++ b/nexus/db-queries/src/db/datastore/target_release.rs @@ -223,20 +223,17 @@ pub struct RecentTargetReleases { #[cfg(test)] pub(crate) mod test { - use crate::db::DataStore; use crate::db::model::{Generation, TargetReleaseSource}; use crate::db::pub_test_utils::TestDatabase; + use crate::db::pub_test_utils::helpers::insert_test_tuf_repo; use chrono::{TimeDelta, Utc}; - use nexus_auth::context::OpContext; use nexus_db_model::TargetRelease; use omicron_common::api::external::{ TufArtifactMeta, TufRepoDescription, TufRepoMeta, }; use omicron_common::update::ArtifactId; use omicron_test_utils::dev; - use omicron_uuid_kinds::TufRepoUuid; use semver::Version; - use tufaceous_artifact::ArtifactHash; use tufaceous_artifact::{ArtifactKind, ArtifactVersion}; #[tokio::test] @@ -359,53 +356,6 @@ pub(crate) mod test { logctx.cleanup_successful(); } - pub(crate) fn make_test_repo(version: u32) -> TufRepoDescription { - // We just need a unique hash for each repo. We'll key it on the - // version for determinism. - let version_bytes = version.to_le_bytes(); - let hash_bytes: [u8; 32] = - std::array::from_fn(|i| version_bytes[i % 4]); - let hash = ArtifactHash(hash_bytes); - let version = semver::Version::new(u64::from(version), 0, 0); - let artifact_version = ArtifactVersion::new(version.to_string()) - .expect("valid artifact version"); - TufRepoDescription { - repo: TufRepoMeta { - hash, - targets_role_version: 0, - valid_until: chrono::Utc::now(), - system_version: version, - file_name: String::new(), - }, - artifacts: vec![TufArtifactMeta { - id: ArtifactId { - name: String::new(), - version: artifact_version, - kind: ArtifactKind::from_static("empty"), - }, - hash, - size: 0, - board: None, - sign: None, - }], - } - } - - pub(crate) async fn make_and_insert( - opctx: &OpContext, - datastore: &DataStore, - version: u32, - ) -> TufRepoUuid { - let repo = make_test_repo(version); - datastore - .tuf_repo_insert(opctx, &repo) - .await - .expect("inserting repo") - .recorded - .repo - .id() - } - #[tokio::test] async fn test_recent_distinct() { let logctx = dev::test_setup_log("target_release_datastore"); @@ -431,7 +381,7 @@ pub(crate) mod test { // Now insert a TUF repo and try again. That alone shouldn't change // anything. 
- let repo1id = make_and_insert(opctx, datastore, 1).await; + let repo1id = insert_test_tuf_repo(opctx, datastore, 1).await; let target_release = datastore.target_release_get_current(opctx).await.unwrap(); let last_generation = generation; @@ -469,7 +419,7 @@ pub(crate) mod test { assert_eq!(recent.target_release_generation, generation); // Now insert a second distinct target release and try again. - let repo2id = make_and_insert(opctx, datastore, 2).await; + let repo2id = insert_test_tuf_repo(opctx, datastore, 2).await; let target_release = datastore .target_release_insert( opctx, diff --git a/nexus/db-queries/src/db/datastore/update.rs b/nexus/db-queries/src/db/datastore/update.rs index ec74f397eb9..ac9d3ebca36 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -867,8 +867,8 @@ fn display_kind_hash(kind: &str, hash: ArtifactHash) -> String { #[cfg(test)] mod test { use crate::db::datastore::SQL_BATCH_SIZE; - use crate::db::datastore::target_release::test::make_and_insert; use crate::db::pub_test_utils::TestDatabase; + use crate::db::pub_test_utils::helpers::insert_test_tuf_repo; use nexus_db_model::TargetRelease; use omicron_test_utils::dev; use slog_error_chain::InlineErrorChain; @@ -888,7 +888,7 @@ mod test { assert!(repos.is_empty()); // Add one TUF repo to the database. - let repo1id = make_and_insert(opctx, datastore, 1).await; + let repo1id = insert_test_tuf_repo(opctx, datastore, 1).await; // Make sure it's there. let repos = datastore @@ -920,13 +920,17 @@ mod test { assert!(repos.is_empty()); // Now set up a more realistic case. - let old_target_repo1_id = make_and_insert(opctx, datastore, 2).await; - let old_target_repo2_id = make_and_insert(opctx, datastore, 3).await; - let old_target_repo3_id = make_and_insert(opctx, datastore, 4).await; - let old_target_repo4_id = make_and_insert(opctx, datastore, 5).await; - let new_upload1 = make_and_insert(opctx, datastore, 10).await; - let new_upload2 = make_and_insert(opctx, datastore, 11).await; - let new_upload3 = make_and_insert(opctx, datastore, 12).await; + let old_target_repo1_id = + insert_test_tuf_repo(opctx, datastore, 2).await; + let old_target_repo2_id = + insert_test_tuf_repo(opctx, datastore, 3).await; + let old_target_repo3_id = + insert_test_tuf_repo(opctx, datastore, 4).await; + let old_target_repo4_id = + insert_test_tuf_repo(opctx, datastore, 5).await; + let new_upload1 = insert_test_tuf_repo(opctx, datastore, 10).await; + let new_upload2 = insert_test_tuf_repo(opctx, datastore, 11).await; + let new_upload3 = insert_test_tuf_repo(opctx, datastore, 12).await; // Make sure they're all there. 
let repos = datastore @@ -1074,8 +1078,9 @@ mod test { let mut expected_repos = BTreeSet::new(); for i in 0..count { assert!( - expected_repos - .insert(make_and_insert(opctx, datastore, i + 1).await) + expected_repos.insert( + insert_test_tuf_repo(opctx, datastore, i + 1).await + ) ); } diff --git a/nexus/db-queries/src/db/pub_test_utils/helpers.rs b/nexus/db-queries/src/db/pub_test_utils/helpers.rs index aaae77aeb78..7b9fbc7e00c 100644 --- a/nexus/db-queries/src/db/pub_test_utils/helpers.rs +++ b/nexus/db-queries/src/db/pub_test_utils/helpers.rs @@ -34,13 +34,20 @@ use nexus_db_model::SnapshotState; use nexus_types::external_api::params; use nexus_types::identity::Resource; use omicron_common::api::external; +use omicron_common::api::external::{ + TufArtifactMeta, TufRepoDescription, TufRepoMeta, +}; +use omicron_common::update::ArtifactId; use omicron_uuid_kinds::GenericUuid; use omicron_uuid_kinds::InstanceUuid; use omicron_uuid_kinds::SledUuid; +use omicron_uuid_kinds::TufRepoUuid; use omicron_uuid_kinds::VolumeUuid; use std::net::Ipv6Addr; use std::net::SocketAddrV6; use std::str::FromStr; +use tufaceous_artifact::ArtifactHash; +use tufaceous_artifact::{ArtifactKind, ArtifactVersion}; use uuid::Uuid; /// Creates a project within the silo of "opctx". @@ -495,3 +502,49 @@ pub async fn create_project_image( .await .unwrap() } + +pub async fn insert_test_tuf_repo( + opctx: &OpContext, + datastore: &DataStore, + version: u32, +) -> TufRepoUuid { + let repo = make_test_repo(version); + datastore + .tuf_repo_insert(opctx, &repo) + .await + .expect("inserting repo") + .recorded + .repo + .id() +} + +fn make_test_repo(version: u32) -> TufRepoDescription { + // We just need a unique hash for each repo. We'll key it on the + // version for determinism. + let version_bytes = version.to_le_bytes(); + let hash_bytes: [u8; 32] = std::array::from_fn(|i| version_bytes[i % 4]); + let hash = ArtifactHash(hash_bytes); + let version = semver::Version::new(u64::from(version), 0, 0); + let artifact_version = ArtifactVersion::new(version.to_string()) + .expect("valid artifact version"); + TufRepoDescription { + repo: TufRepoMeta { + hash, + targets_role_version: 0, + valid_until: chrono::Utc::now(), + system_version: version, + file_name: String::new(), + }, + artifacts: vec![TufArtifactMeta { + id: ArtifactId { + name: String::new(), + version: artifact_version, + kind: ArtifactKind::from_static("empty"), + }, + hash, + size: 0, + board: None, + sign: None, + }], + } +} diff --git a/nexus/tests/integration_tests/updates.rs b/nexus/tests/integration_tests/updates.rs index 0f55a3e7f86..45328c4b24b 100644 --- a/nexus/tests/integration_tests/updates.rs +++ b/nexus/tests/integration_tests/updates.rs @@ -9,6 +9,8 @@ use chrono::{DateTime, Duration, Timelike, Utc}; use dropshot::ResultsPage; use http::{Method, StatusCode}; use nexus_db_queries::context::OpContext; +use nexus_db_queries::db::pub_test_utils::helpers::insert_test_tuf_repo; +use nexus_test_utils::background::activate_background_task; use nexus_test_utils::background::run_tuf_artifact_replication_step; use nexus_test_utils::background::wait_tuf_artifact_replication_step; use nexus_test_utils::http_testing::{AuthnMode, NexusRequest, RequestBuilder}; @@ -625,3 +627,44 @@ async fn test_trust_root_operations(cptestctx: &ControlPlaneTestContext) { .expect("failed to parse list after delete response"); assert!(response.items.is_empty()); } + +#[nexus_test] +async fn test_repo_prune(cptestctx: &ControlPlaneTestContext) { + let logctx = &cptestctx.logctx; + 
let datastore = cptestctx.server.server_context().nexus.datastore(); + let opctx = OpContext::for_tests(logctx.log.new(o!()), datastore.clone()); + + // Wait for one activation of the task to avoid racing with it. + let client = &cptestctx.lockstep_client; + activate_background_task(client, "tuf_repo_pruner").await; + + // Insert four TUF repos. + let repo1id = insert_test_tuf_repo(&opctx, datastore, 1).await; + let repo2id = insert_test_tuf_repo(&opctx, datastore, 2).await; + let repo3id = insert_test_tuf_repo(&opctx, datastore, 3).await; + let repo4id = insert_test_tuf_repo(&opctx, datastore, 4).await; + + // Immediately, all four repos ought to be visible. + let repos = datastore + .tuf_list_repos_unpruned_batched(&opctx) + .await + .expect("listing repos"); + assert_eq!(repos.len(), 4); + assert!(repos.iter().any(|r| r.id() == repo1id)); + assert!(repos.iter().any(|r| r.id() == repo2id)); + assert!(repos.iter().any(|r| r.id() == repo3id)); + assert!(repos.iter().any(|r| r.id() == repo4id)); + + // Activate the task again and wait for it to complete. Exactly one of + // the repos should be pruned. + activate_background_task(client, "tuf_repo_pruner").await; + let repos = datastore + .tuf_list_repos_unpruned_batched(&opctx) + .await + .expect("listing repos"); + assert_eq!(repos.len(), 3); + assert!(!repos.iter().any(|r| r.id() == repo1id)); + assert!(repos.iter().any(|r| r.id() == repo2id)); + assert!(repos.iter().any(|r| r.id() == repo3id)); + assert!(repos.iter().any(|r| r.id() == repo4id)); +} From 0a77507cf8ba9cff30bdfc9c6a2b9f8202024fe1 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Mon, 29 Sep 2025 21:06:40 -0700 Subject: [PATCH 18/19] add logging, omdb support --- Cargo.lock | 2 + dev-tools/omdb/src/bin/omdb/nexus.rs | 16 +++ dev-tools/omdb/tests/env.out | 12 ++ dev-tools/omdb/tests/successes.out | 28 +++++ .../app/background/tasks/tuf_repo_pruner.rs | 16 +++ nexus/types/Cargo.toml | 2 + nexus/types/output/tuf_repo_pruner_status.out | 10 ++ nexus/types/src/internal_api/background.rs | 113 +++++++++++++++++- 8 files changed, 197 insertions(+), 2 deletions(-) create mode 100644 nexus/types/output/tuf_repo_pruner_status.out diff --git a/Cargo.lock b/Cargo.lock index b54bb161fdb..4c62aa61162 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7190,6 +7190,7 @@ dependencies = [ "derive-where", "derive_more 0.99.20", "dropshot", + "expectorate", "futures", "gateway-client", "gateway-types", @@ -7226,6 +7227,7 @@ dependencies = [ "slog-error-chain", "steno", "strum 0.27.2", + "swrite", "tabled 0.15.0", "test-strategy", "textwrap", diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index ba395aff074..c40a0a151ab 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -70,6 +70,7 @@ use nexus_types::internal_api::background::SupportBundleEreportStatus; use nexus_types::internal_api::background::TufArtifactReplicationCounters; use nexus_types::internal_api::background::TufArtifactReplicationRequest; use nexus_types::internal_api::background::TufArtifactReplicationStatus; +use nexus_types::internal_api::background::TufRepoPrunerStatus; use nexus_types::inventory::BaseboardId; use omicron_uuid_kinds::BlueprintUuid; use omicron_uuid_kinds::CollectionUuid; @@ -1199,6 +1200,9 @@ fn print_task_details(bgtask: &BackgroundTask, details: &serde_json::Value) { "tuf_artifact_replication" => { print_task_tuf_artifact_replication(details); } + "tuf_repo_pruner" => { + print_task_tuf_repo_pruner(details); + } "alert_dispatcher" 
=> { print_task_alert_dispatcher(details); } @@ -2573,6 +2577,18 @@ fn print_task_tuf_artifact_replication(details: &serde_json::Value) { } } +fn print_task_tuf_repo_pruner(details: &serde_json::Value) { + match serde_json::from_value::(details.clone()) { + Err(error) => eprintln!( + "warning: failed to interpret task details: {:?}: {:?}", + error, details + ), + Ok(status) => { + print!("{}", status); + } + } +} + fn print_task_alert_dispatcher(details: &serde_json::Value) { use nexus_types::internal_api::background::AlertDispatched; use nexus_types::internal_api::background::AlertDispatcherStatus; diff --git a/dev-tools/omdb/tests/env.out b/dev-tools/omdb/tests/env.out index 51d6807144d..b1dae6cf103 100644 --- a/dev-tools/omdb/tests/env.out +++ b/dev-tools/omdb/tests/env.out @@ -199,6 +199,10 @@ task: "tuf_artifact_replication" replicate update repo artifacts across sleds +task: "tuf_repo_pruner" + determine which TUF repos' artifacts can be pruned + + task: "v2p_manager" manages opte v2p mappings for vpc networking @@ -407,6 +411,10 @@ task: "tuf_artifact_replication" replicate update repo artifacts across sleds +task: "tuf_repo_pruner" + determine which TUF repos' artifacts can be pruned + + task: "v2p_manager" manages opte v2p mappings for vpc networking @@ -602,6 +610,10 @@ task: "tuf_artifact_replication" replicate update repo artifacts across sleds +task: "tuf_repo_pruner" + determine which TUF repos' artifacts can be pruned + + task: "v2p_manager" manages opte v2p mappings for vpc networking diff --git a/dev-tools/omdb/tests/successes.out b/dev-tools/omdb/tests/successes.out index fc0fd5ccfde..e5bba7d098f 100644 --- a/dev-tools/omdb/tests/successes.out +++ b/dev-tools/omdb/tests/successes.out @@ -424,6 +424,10 @@ task: "tuf_artifact_replication" replicate update repo artifacts across sleds +task: "tuf_repo_pruner" + determine which TUF repos' artifacts can be pruned + + task: "v2p_manager" manages opte v2p mappings for vpc networking @@ -811,6 +815,18 @@ task: "tuf_artifact_replication" copy err: 0 local repos: 0 +task: "tuf_repo_pruner" + configured period: every m + last completed activation: , triggered by + started at (s ago) and ran for ms + configuration: + nkeep_recent_releases: 3 + nkeep_recent_uploads: 3 + repo pruned: none + repos kept because they're recent target releases: none + repos kept because they're recent uploads: none + other repos eligible for pruning: none + task: "v2p_manager" configured period: every s last completed activation: , triggered by @@ -1329,6 +1345,18 @@ task: "tuf_artifact_replication" copy err: 0 local repos: 0 +task: "tuf_repo_pruner" + configured period: every m + last completed activation: , triggered by + started at (s ago) and ran for ms + configuration: + nkeep_recent_releases: 3 + nkeep_recent_uploads: 3 + repo pruned: none + repos kept because they're recent target releases: none + repos kept because they're recent uploads: none + other repos eligible for pruning: none + task: "v2p_manager" configured period: every s last completed activation: , triggered by diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs index fae158ef625..586caadbf35 100644 --- a/nexus/src/app/background/tasks/tuf_repo_pruner.rs +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -110,8 +110,17 @@ async fn tuf_repos_prune( recent_releases: &recent_releases.releases, }); + info!(&opctx.log, "tuf_repo_prune decision"; "status" => ?status); + // If we decided to prune something, do it. 
if let Some(to_prune) = &status.repo_prune { + info!( + &opctx.log, + "tuf_repo_prune: pruning repo"; + "repo_id" => %to_prune.id, + "system_version" => %to_prune.system_version, + "time_created" => %to_prune.time_created, + ); let prune_id = to_prune.id; if let Err(error) = datastore .tuf_repo_mark_pruned( @@ -122,6 +131,13 @@ async fn tuf_repos_prune( ) .await { + warn!( + &opctx.log, + "tuf_repo_prune: failed to prune"; + "repo_id" => %to_prune.id, + "system_version" => %to_prune.system_version, + "time_created" => %to_prune.time_created, + ); status.warnings.push(format!( "failed to prune {} (release {}): {}", prune_id, diff --git a/nexus/types/Cargo.toml b/nexus/types/Cargo.toml index 5eca41eddca..3218549670b 100644 --- a/nexus/types/Cargo.toml +++ b/nexus/types/Cargo.toml @@ -66,12 +66,14 @@ omicron-common.workspace = true omicron-passwords.workspace = true omicron-workspace-hack.workspace = true semver.workspace = true +swrite.workspace = true tough.workspace = true # Note: we're trying to avoid a dependency from nexus-types to sled-agent-types # because the correct direction of dependency is unclear. If there are types # common to both, put them in `omicron-common` or `nexus-sled-agent-shared`. [dev-dependencies] +expectorate.workspace = true gateway-types = { workspace = true, features = ["testing"] } iddqd = { workspace = true, features = ["proptest"] } newtype-uuid = { workspace = true, features = ["proptest1"] } diff --git a/nexus/types/output/tuf_repo_pruner_status.out b/nexus/types/output/tuf_repo_pruner_status.out new file mode 100644 index 00000000000..5e43e5db4a2 --- /dev/null +++ b/nexus/types/output/tuf_repo_pruner_status.out @@ -0,0 +1,10 @@ + configuration: + nkeep_recent_releases: 1 + nkeep_recent_uploads: 1 + repo pruned: 4e8a87a0-3102-4014-99d3-e1bf486685bd (1.2.3, created 2025-09-29 01:23:45 UTC) + repos kept because they're recent target releases: + 4e8a87a0-3102-4014-99d3-e1bf486685bd (1.2.3, created 2025-09-29 01:23:45 UTC) + repos kept because they're recent uploads: none + other repos eligible for pruning: + 4e8a87a0-3102-4014-99d3-e1bf486685bd (1.2.3, created 2025-09-29 01:23:45 UTC) + 867e42ae-ed72-4dc3-abcd-508b875c9601 (4.5.6, created 2025-09-29 02:34:56 UTC) diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index c8625f6edae..d363202eeb3 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -25,6 +25,8 @@ use serde::Serialize; use std::collections::BTreeMap; use std::collections::VecDeque; use std::sync::Arc; +use swrite::SWrite; +use swrite::swriteln; use tufaceous_artifact::ArtifactHash; use uuid::Uuid; @@ -385,7 +387,7 @@ pub enum TufArtifactReplicationOperation { Copy { hash: ArtifactHash, source_sled: SledUuid }, } -#[derive(Debug, Serialize)] +#[derive(Debug, Deserialize, Serialize)] pub struct TufRepoPrunerStatus { // Input /// how many recent releases we're configured to keep @@ -406,7 +408,74 @@ pub struct TufRepoPrunerStatus { pub warnings: Vec, } -#[derive(Clone, Debug, Serialize)] +impl std::fmt::Display for TufRepoPrunerStatus { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + fn print_collection(c: &IdOrdMap) -> String { + if c.is_empty() { + return String::from("none\n"); + } + + let mut rv = String::from("\n"); + for repo in c { + swriteln!( + rv, + " {} ({}, created {})", + repo.id, + repo.system_version, + repo.time_created, + ); + } + + rv + } + + // This is indented appropriately for use in `omdb`. 
+ writeln!(f, " configuration:")?; + writeln!( + f, + " nkeep_recent_releases: {}", + self.nkeep_recent_releases + )?; + writeln!( + f, + " nkeep_recent_uploads: {}", + self.nkeep_recent_releases + )?; + + write!(f, " repo pruned:")?; + if let Some(pruned) = &self.repo_prune { + writeln!( + f, + " {} ({}, created {})", + pruned.id, pruned.system_version, pruned.time_created + )?; + } else { + writeln!(f, " none")?; + } + + write!( + f, + " repos kept because they're recent target releases: {}", + print_collection(&self.repos_keep_target_release) + )?; + + write!( + f, + " repos kept because they're recent uploads: {}", + print_collection(&self.repos_keep_recent_uploads) + )?; + + write!( + f, + " other repos eligible for pruning: {}", + print_collection(&self.other_repos_eligible_to_prune) + )?; + + Ok(()) + } +} + +#[derive(Clone, Debug, Deserialize, Serialize)] pub struct TufRepoInfo { pub id: TufRepoUuid, pub system_version: Version, @@ -658,3 +727,43 @@ pub struct EreporterStatus { pub requests: usize, pub errors: Vec, } + +#[cfg(test)] +mod test { + use super::TufRepoInfo; + use super::TufRepoPrunerStatus; + use expectorate::assert_contents; + use iddqd::IdOrdMap; + + #[test] + fn test_display_tuf_repo_pruner_status() { + let repo1 = TufRepoInfo { + id: "4e8a87a0-3102-4014-99d3-e1bf486685bd".parse().unwrap(), + system_version: "1.2.3".parse().unwrap(), + time_created: "2025-09-29T01:23:45Z".parse().unwrap(), + }; + let repo2 = TufRepoInfo { + id: "867e42ae-ed72-4dc3-abcd-508b875c9601".parse().unwrap(), + system_version: "4.5.6".parse().unwrap(), + time_created: "2025-09-29T02:34:56Z".parse().unwrap(), + }; + let repo_map: IdOrdMap<_> = std::iter::once(repo1.clone()).collect(); + + let status = TufRepoPrunerStatus { + nkeep_recent_releases: 1, + nkeep_recent_uploads: 2, + repos_keep_target_release: repo_map, + repos_keep_recent_uploads: IdOrdMap::new(), + repo_prune: Some(repo1.clone()), + other_repos_eligible_to_prune: [repo1.clone(), repo2.clone()] + .into_iter() + .collect(), + warnings: vec![String::from("fake-oh problem-oh")], + }; + + assert_contents( + "output/tuf_repo_pruner_status.out", + &status.to_string(), + ); + } +} From f9ea55987c41bd1893f7e6c3a79e82a9353e1440 Mon Sep 17 00:00:00 2001 From: David Pacheco Date: Tue, 30 Sep 2025 13:02:04 -0700 Subject: [PATCH 19/19] review feedback --- dev-tools/omdb/src/bin/omdb/nexus.rs | 5 +++-- nexus/db-queries/src/db/datastore/update.rs | 9 ++++++--- nexus/examples/config-second.toml | 6 ++++++ nexus/examples/config.toml | 6 ++++++ nexus/src/app/background/tasks/tuf_repo_pruner.rs | 12 ++++++------ nexus/tests/config.test.toml | 6 ++++++ smf/nexus/multi-sled/config-partial.toml | 6 ++++++ smf/nexus/single-sled/config-partial.toml | 6 ++++++ 8 files changed, 45 insertions(+), 11 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index c2542a8b1a0..cfbc140bd3e 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -2580,8 +2580,9 @@ fn print_task_tuf_artifact_replication(details: &serde_json::Value) { fn print_task_tuf_repo_pruner(details: &serde_json::Value) { match serde_json::from_value::(details.clone()) { Err(error) => eprintln!( - "warning: failed to interpret task details: {:?}: {:?}", - error, details + "warning: failed to interpret task details: {}: {:?}", + InlineErrorChain::new(&error), + details ), Ok(status) => { print!("{}", status); diff --git a/nexus/db-queries/src/db/datastore/update.rs 
b/nexus/db-queries/src/db/datastore/update.rs index ac9d3ebca36..1f45a2070e9 100644 --- a/nexus/db-queries/src/db/datastore/update.rs +++ b/nexus/db-queries/src/db/datastore/update.rs @@ -945,9 +945,12 @@ mod test { .await .expect("initial target release"); let mut next = initial; - for repo_id in - [old_target_repo2_id, old_target_repo3_id, old_target_repo4_id] - { + for repo_id in [ + old_target_repo1_id, + old_target_repo2_id, + old_target_repo3_id, + old_target_repo4_id, + ] { next = datastore .target_release_insert( opctx, diff --git a/nexus/examples/config-second.toml b/nexus/examples/config-second.toml index 64c9a55c08a..9b24578215c 100644 --- a/nexus/examples/config-second.toml +++ b/nexus/examples/config-second.toml @@ -153,7 +153,13 @@ region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 tuf_artifact_replication.min_sled_replication = 1 tuf_repo_pruner.period_secs = 300 +# How many extra recent target releases to keep +# The system always keeps two: the current release and the previous one. +# This number is in addition to that. tuf_repo_pruner.nkeep_extra_target_releases = 1 +# How many extra recently uploaded repos to keep +# The system always keeps one, assuming that the operator may be about to +# update to it. This number is in addition to that. tuf_repo_pruner.nkeep_extra_newly_uploaded = 1 # In general, the webhook dispatcher will be activated when events are queued, # so we don't need to periodically activate it *that* frequently. diff --git a/nexus/examples/config.toml b/nexus/examples/config.toml index d3888d3b6ad..3a3d3ec294c 100644 --- a/nexus/examples/config.toml +++ b/nexus/examples/config.toml @@ -137,7 +137,13 @@ region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 tuf_artifact_replication.min_sled_replication = 1 tuf_repo_pruner.period_secs = 300 +# How many extra recent target releases to keep +# The system always keeps two: the current release and the previous one. +# This number is in addition to that. tuf_repo_pruner.nkeep_extra_target_releases = 1 +# How many extra recently uploaded repos to keep +# The system always keeps one, assuming that the operator may be about to +# update to it. This number is in addition to that. tuf_repo_pruner.nkeep_extra_newly_uploaded = 1 # In general, the webhook dispatcher will be activated when events are queued, # so we don't need to periodically activate it *that* frequently. diff --git a/nexus/src/app/background/tasks/tuf_repo_pruner.rs b/nexus/src/app/background/tasks/tuf_repo_pruner.rs index 586caadbf35..2f9003baa0f 100644 --- a/nexus/src/app/background/tasks/tuf_repo_pruner.rs +++ b/nexus/src/app/background/tasks/tuf_repo_pruner.rs @@ -16,6 +16,7 @@ use nexus_types::internal_api::background::TufRepoPrunerStatus; use omicron_uuid_kinds::TufRepoUuid; use serde_json::json; use slog_error_chain::InlineErrorChain; +use std::cmp::Reverse; use std::collections::BTreeSet; use std::sync::Arc; @@ -63,7 +64,7 @@ impl BackgroundTask for TufRepoPruner { }), }, Err(error) => json!({ - "error": error.to_string(), + "error": InlineErrorChain::new(&*error).to_string(), }), } }) @@ -77,9 +78,9 @@ async fn tuf_repos_prune( ) -> Result { // Compute configuration. 
let nkeep_recent_releases = NKEEP_RECENT_TARGET_RELEASES_ALWAYS - + config.nkeep_extra_target_releases; - let nkeep_recent_uploads = - NKEEP_RECENT_UPLOADS_ALWAYS + config.nkeep_extra_newly_uploaded; + .saturating_add(config.nkeep_extra_target_releases); + let nkeep_recent_uploads = NKEEP_RECENT_UPLOADS_ALWAYS + .saturating_add(config.nkeep_extra_newly_uploaded); // Fetch the state we need to make a decision. let tuf_generation = datastore @@ -205,8 +206,7 @@ fn decide_prune(args: TufRepoPrune) -> TufRepoPrunerStatus { .iter() .filter(|r| !repos_keep_target_release.contains_key(&r.id)) .collect::>(); - non_target_release_repos.sort_by_key(|k| k.time_created); - non_target_release_repos.reverse(); + non_target_release_repos.sort_by_key(|k| Reverse(k.time_created)); let mut repos_keep_recent_uploads = IdOrdMap::new(); let mut repo_prune = None; let mut other_repos_eligible_to_prune = IdOrdMap::new(); diff --git a/nexus/tests/config.test.toml b/nexus/tests/config.test.toml index 0576e85181b..7f165286eee 100644 --- a/nexus/tests/config.test.toml +++ b/nexus/tests/config.test.toml @@ -170,7 +170,13 @@ tuf_artifact_replication.period_secs = 3600 # Update integration tests are started with 4 sled agents. tuf_artifact_replication.min_sled_replication = 3 tuf_repo_pruner.period_secs = 300 +# How many extra recent target releases to keep +# The system always keeps two: the current release and the previous one. +# This number is in addition to that. tuf_repo_pruner.nkeep_extra_target_releases = 1 +# How many extra recently uploaded repos to keep +# The system always keeps one, assuming that the operator may be about to +# update to it. This number is in addition to that. tuf_repo_pruner.nkeep_extra_newly_uploaded = 1 # In general, the webhook dispatcher will be activated when events are queued, # so we don't need to periodically activate it *that* frequently. diff --git a/smf/nexus/multi-sled/config-partial.toml b/smf/nexus/multi-sled/config-partial.toml index 2f4c0bede44..51ec021af3f 100644 --- a/smf/nexus/multi-sled/config-partial.toml +++ b/smf/nexus/multi-sled/config-partial.toml @@ -77,7 +77,13 @@ region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 tuf_artifact_replication.min_sled_replication = 3 tuf_repo_pruner.period_secs = 300 +# How many extra recent target releases to keep +# The system always keeps two: the current release and the previous one. +# This number is in addition to that. tuf_repo_pruner.nkeep_extra_target_releases = 1 +# How many extra recently uploaded repos to keep +# The system always keeps one, assuming that the operator may be about to +# update to it. This number is in addition to that. tuf_repo_pruner.nkeep_extra_newly_uploaded = 1 read_only_region_replacement_start.period_secs = 30 # In general, the webhook dispatcher will be activated when events are queued, diff --git a/smf/nexus/single-sled/config-partial.toml b/smf/nexus/single-sled/config-partial.toml index d5b3f98a94e..e8e0eea2d6c 100644 --- a/smf/nexus/single-sled/config-partial.toml +++ b/smf/nexus/single-sled/config-partial.toml @@ -77,7 +77,13 @@ region_snapshot_replacement_finish.period_secs = 30 tuf_artifact_replication.period_secs = 300 tuf_artifact_replication.min_sled_replication = 1 tuf_repo_pruner.period_secs = 300 +# How many extra recent target releases to keep +# The system always keeps two: the current release and the previous one. +# This number is in addition to that. 
tuf_repo_pruner.nkeep_extra_target_releases = 1 +# How many extra recently uploaded repos to keep +# The system always keeps one, assuming that the operator may be about to +# update to it. This number is in addition to that. tuf_repo_pruner.nkeep_extra_newly_uploaded = 1 read_only_region_replacement_start.period_secs = 30 # In general, the webhook dispatcher will be activated when events are queued,