From 77684b42d1c2c18d2718049ab9619e9667ea5303 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Mon, 28 Jul 2025 13:43:39 -0700 Subject: [PATCH 01/11] wip --- nexus/db-queries/src/db/datastore/ereport.rs | 124 +++++++++++++++++- nexus/db-queries/src/db/datastore/mod.rs | 1 + .../tasks/support_bundle_collector.rs | 17 ++- 3 files changed, 140 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index a7cab6f951..49692ea20a 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -17,7 +17,7 @@ use crate::db::model::SpMgsSlot; use crate::db::model::SpType; use crate::db::model::SqlU16; use crate::db::model::SqlU32; -use crate::db::pagination::paginated; +use crate::db::pagination::{paginated, paginated_multicolumn}; use async_bb8_diesel::AsyncRunQueryDsl; use chrono::DateTime; use chrono::Utc; @@ -48,6 +48,44 @@ pub struct EreporterRestartBySerial { pub ereports: u32, } +/// A set of filters for fetching ereports. +#[derive(Clone, Debug, Default)] +pub struct EreportFilters { + /// If present, include only ereports that were collected at the specified + /// timestamp or later. + /// + /// If `end_time` is also present, this value *must* be earlier than + /// `end_time`. + pub start_time: Option>, + /// If present, include only ereports that were collected at the specified + /// timestamp or before. + /// + /// If `start_time` is also present, this value *must* be later than + /// `start_time`. + pub end_time: Option>, + /// If this list is non-empty, include only ereports that were reported by + /// systems with the provided serial numbers. + pub only_serials: Vec, + /// If this list is non-empty, include only ereports with the provided class + /// strings. + // TODO(eliza): globbing could be nice to add here eventually... + pub only_classes: Vec, +} + +impl EreportFilters { + fn check_time_range(&self) -> Result<(), Error> { + if let (Some(start), Some(end)) = (self.start_time, self.end_time) { + if start > end { + return Err(Error::invalid_request( + "start time must be before end time", + )); + } + } + + Ok(()) + } +} + impl DataStore { /// Fetch an ereport by its restart ID and ENA. /// @@ -93,6 +131,90 @@ impl DataStore { Err(Error::non_resourcetype_not_found(format!("ereport {id}"))) } + pub async fn host_ereports_fetch_matching( + &self, + opctx: &OpContext, + filters: &EreportFilters, + pagparams: &DataPageParams<'_, (Uuid, DbEna)>, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + filters.check_time_range()?; + + let mut query = paginated_multicolumn( + host_dsl::host_ereport, + (host_dsl::restart_id, host_dsl::ena), + pagparams, + ) + .filter(host_dsl::time_deleted.is_null()) + .select(HostEreport::as_select()); + + if let Some(start) = filters.start_time { + query = query.filter(host_dsl::time_collected.ge(start)); + } + + if let Some(end) = filters.end_time { + query = query.filter(host_dsl::time_collected.le(end)); + } + + if !filters.only_serials.is_empty() { + query = query.filter( + host_dsl::sled_serial.eq_any(filters.only_serials.clone()), + ); + } + + if !filters.only_classes.is_empty() { + query = query + .filter(host_dsl::class.eq_any(filters.only_classes.clone())); + } + + query + .load_async(&*self.pool_connection_authorized(opctx).await?) 
+ .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + + pub async fn sp_ereports_fetch_matching( + &self, + opctx: &OpContext, + filters: &EreportFilters, + pagparams: &DataPageParams<'_, (Uuid, DbEna)>, + ) -> ListResultVec { + opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + filters.check_time_range()?; + + let mut query = paginated_multicolumn( + sp_dsl::sp_ereport, + (sp_dsl::restart_id, sp_dsl::ena), + pagparams, + ) + .filter(sp_dsl::time_deleted.is_null()) + .select(SpEreport::as_select()); + + if let Some(start) = filters.start_time { + query = query.filter(sp_dsl::time_collected.ge(start)); + } + + if let Some(end) = filters.end_time { + query = query.filter(sp_dsl::time_collected.le(end)); + } + + if !filters.only_serials.is_empty() { + query = query.filter( + sp_dsl::serial_number.eq_any(filters.only_serials.clone()), + ); + } + + if !filters.only_classes.is_empty() { + query = query + .filter(sp_dsl::class.eq_any(filters.only_classes.clone())); + } + + query + .load_async(&*self.pool_connection_authorized(opctx).await?) + .await + .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server)) + } + /// List ereports from the SP with the given restart ID. pub async fn sp_ereport_list_by_restart( &self, diff --git a/nexus/db-queries/src/db/datastore/mod.rs b/nexus/db-queries/src/db/datastore/mod.rs index 55f0e58222..ddedac2d0f 100644 --- a/nexus/db-queries/src/db/datastore/mod.rs +++ b/nexus/db-queries/src/db/datastore/mod.rs @@ -121,6 +121,7 @@ mod zpool; pub use address_lot::AddressLotCreateResult; pub use dns::DataStoreDnsTest; pub use dns::DnsVersionUpdateBuilder; +pub use ereport::EreportFilters; pub use instance::{ InstanceAndActiveVmm, InstanceGestalt, InstanceStateComputer, }; diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 74459fc86a..7d8bbe5b88 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -27,6 +27,7 @@ use nexus_db_model::SupportBundleState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::datastore::EreportFilters; use nexus_types::deployment::SledFilter; use nexus_types::identity::Asset; use nexus_types::internal_api::background::SupportBundleCleanupReport; @@ -84,11 +85,20 @@ struct BundleRequest { // // Typically, this is CHUNK_SIZE, but can be modified for testing. 
transfer_chunk_size: NonZeroU64, + + ereport_query: Option, } impl Default for BundleRequest { fn default() -> Self { - Self { skip_sled_info: false, transfer_chunk_size: CHUNK_SIZE } + Self { + skip_sled_info: false, + transfer_chunk_size: CHUNK_SIZE, + ereport_query: Some(EreportFilters { + start_time: Some(chrono::Utc::now() - chrono::Days::new(7)), + ..EreportFilters::default() + }), + } } } @@ -859,6 +869,10 @@ impl BundleCollection { } return Ok(()); } + + async fn collect_ereports(&self, path: Utf8PathBuf) -> Result<(), Error> { + + } } impl BackgroundTask for SupportBundleCollector { @@ -1543,6 +1557,7 @@ mod test { let request = BundleRequest { skip_sled_info: true, transfer_chunk_size: NonZeroU64::new(16).unwrap(), + ereport_query: None, }; let report = collector From 5b3c7ed62166a1a9024610338a11e5a55b05b4fc Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 30 Jul 2025 12:51:17 -0700 Subject: [PATCH 02/11] draw the rest of the owl --- Cargo.lock | 2 + dev-tools/omdb/src/bin/omdb/db/ereport.rs | 10 +- dev-tools/omdb/src/bin/omdb/nexus.rs | 26 +++ nexus/Cargo.toml | 2 +- nexus/db-model/src/ereport.rs | 37 ++- nexus/db-queries/src/db/datastore/ereport.rs | 2 +- .../tasks/support_bundle_collector.rs | 217 +++++++++++++++++- nexus/types/src/internal_api/background.rs | 20 ++ 8 files changed, 301 insertions(+), 15 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 67c5432690..467be1b00f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13625,6 +13625,8 @@ dependencies = [ "bytes", "futures-core", "futures-sink", + "futures-util", + "hashbrown 0.15.4", "pin-project-lite", "slab", "tokio", diff --git a/dev-tools/omdb/src/bin/omdb/db/ereport.rs b/dev-tools/omdb/src/bin/omdb/db/ereport.rs index e0d2d184d3..99db282534 100644 --- a/dev-tools/omdb/src/bin/omdb/db/ereport.rs +++ b/dev-tools/omdb/src/bin/omdb/db/ereport.rs @@ -155,7 +155,10 @@ async fn cmd_db_ereport_list( restart_id: restart_id.into_untyped_uuid(), ena: ena.into(), class: class.clone(), - source: db::model::Reporter::Sp { sp_type, slot: sp_slot.0 }, + source: db::model::Reporter::Sp { + sp_type: sp_type.into(), + slot: sp_slot.0, + }, serial: serial_number.as_deref(), part_number: part_number.as_deref(), } @@ -547,7 +550,10 @@ async fn cmd_db_ereporters( )| { ReporterRow { first_seen, - identity: db::model::Reporter::Sp { slot: slot.0, sp_type }, + identity: db::model::Reporter::Sp { + slot: slot.0, + sp_type: sp_type.into(), + }, serial, part_number, id: restart_id, diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index 6d63240b06..f6f2f333dc 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -66,6 +66,7 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleEreportCollection; use nexus_types::internal_api::background::TufArtifactReplicationCounters; use nexus_types::internal_api::background::TufArtifactReplicationRequest; use nexus_types::internal_api::background::TufArtifactReplicationStatus; @@ -2414,6 +2415,8 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { listed_in_service_sleds, listed_sps, activated_in_db_ok, + sp_ereports, + host_ereports, }) = collection_report { println!(" Support Bundle 
Collection Report:"); @@ -2427,6 +2430,29 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { println!( " Bundle was activated in the database: {activated_in_db_ok}" ); + print_ereport_status("SP", &sp_ereports); + print_ereport_status("Host OS", &host_ereports); + } + } + } + + fn print_ereport_status( + which: &str, + status: &SupportBundleEreportCollection, + ) { + match status { + SupportBundleEreportCollection::NotRequested => { + println!(" {which} ereport collection was not requested"); + } + SupportBundleEreportCollection::Failed { error, n_collected } => { + println!(" {which} ereport collection failed:"); + println!( + " ereports collected successfully: {n_collected}" + ); + println!(" error: {error}"); + } + SupportBundleEreportCollection::Collected { n_collected } => { + println!(" {which} ereports collected: {n_collected}"); } } } diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml index ffe813f80e..3808a96ae8 100644 --- a/nexus/Cargo.toml +++ b/nexus/Cargo.toml @@ -107,7 +107,7 @@ tempfile.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["full"] } tokio-postgres = { workspace = true, features = ["with-serde_json-1"] } -tokio-util = { workspace = true, features = ["codec"] } +tokio-util = { workspace = true, features = ["codec", "rt"] } tough.workspace = true tufaceous-artifact.workspace = true uuid.workspace = true diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index 12320836b5..1129cbc988 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -63,11 +63,14 @@ where } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] pub struct Ereport { + #[serde(flatten)] pub id: EreportId, + #[serde(flatten)] pub metadata: EreportMetadata, pub reporter: Reporter, + #[serde(flatten)] pub report: serde_json::Value, } @@ -96,7 +99,7 @@ impl From for Ereport { serial_number, class, }, - reporter: Reporter::Sp { sp_type, slot: sp_slot.0 }, + reporter: Reporter::Sp { sp_type: sp_type.into(), slot: sp_slot.0 }, report, } } @@ -131,7 +134,7 @@ impl From for Ereport { } } -#[derive(Clone, Debug)] +#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)] pub struct EreportMetadata { pub time_collected: DateTime, pub time_deleted: Option>, @@ -141,22 +144,40 @@ pub struct EreportMetadata { pub class: Option, } -#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd)] +#[derive( + Clone, + Debug, + Eq, + PartialEq, + Ord, + PartialOrd, + serde::Serialize, + serde::Deserialize, +)] pub enum Reporter { - Sp { sp_type: SpType, slot: u16 }, + Sp { sp_type: nexus_types::inventory::SpType, slot: u16 }, HostOs { sled: SledUuid }, } impl std::fmt::Display for Reporter { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Self::Sp { sp_type: SpType::Sled, slot } => { + Self::Sp { + sp_type: nexus_types::inventory::SpType::Sled, + slot, + } => { write!(f, "Sled (SP) {slot:02}") } - Self::Sp { sp_type: SpType::Switch, slot } => { + Self::Sp { + sp_type: nexus_types::inventory::SpType::Switch, + slot, + } => { write!(f, "Switch {slot}") } - Self::Sp { sp_type: SpType::Power, slot } => { + Self::Sp { + sp_type: nexus_types::inventory::SpType::Power, + slot, + } => { write!(f, "PSC {slot}") } Self::HostOs { sled } => { diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 49692ea20a..22a2b19aec 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ 
b/nexus/db-queries/src/db/datastore/ereport.rs @@ -266,7 +266,7 @@ impl DataStore { EreporterRestartBySerial { id: EreporterRestartUuid::from_untyped_uuid(restart_id), reporter_kind: Reporter::Sp { - sp_type, + sp_type: sp_type.into(), slot: sp_slot.into(), }, first_seen_at: first_seen.expect(FIRST_SEEN_NOT_NULL), diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 7d8bbe5b88..116d5c4ce1 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -22,16 +22,20 @@ use gateway_client::types::SpIdentifier; use gateway_client::types::SpIgnition; use internal_dns_resolver::Resolver; use internal_dns_types::names::ServiceName; +use nexus_db_model::Ereport; use nexus_db_model::SupportBundle; use nexus_db_model::SupportBundleState; use nexus_db_queries::authz; use nexus_db_queries::context::OpContext; use nexus_db_queries::db::DataStore; +use nexus_db_queries::db::datastore; use nexus_db_queries::db::datastore::EreportFilters; +use nexus_db_queries::db::pagination::Paginator; use nexus_types::deployment::SledFilter; use nexus_types::identity::Asset; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleEreportCollection; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::LookupType; @@ -53,6 +57,7 @@ use std::sync::Arc; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; use tokio::io::SeekFrom; +use tokio_util::task::AbortOnDropHandle; use tufaceous_artifact::ArtifactHash; use zip::ZipArchive; use zip::ZipWriter; @@ -673,6 +678,78 @@ impl BundleCollection { ) .await?; + let ereport_collection = if let Some(ref ereport_filters) = + self.request.ereport_query + { + // If ereports are to be included in the bundle, have someone go do + // that in the background while we're gathering up other stuff. Note + // that the `JoinHandle`s for these tasks are wrapped in + // `AbortOnDropHandle`s for cancellation correctness; this ensures + // that if collecting the bundle is cancelled and this future is + // dropped, the tasks that we've spawned to collect ereports are + // aborted as well. 
+ let dir = dir.path().join("ereports"); + let host = AbortOnDropHandle::new(tokio::spawn({ + let dir = dir.clone(); + let filters = ereport_filters.clone(); + let collection = self.clone(); + async move { + let mut n_collected = 0; + match collection + .collect_host_ereports(&filters, &dir, &mut n_collected) + .await + { + Ok(_) => SupportBundleEreportCollection::Collected { + n_collected, + }, + Err(e) => { + warn!( + collection.log, + "Failed to collect host OS ereports \ + ({n_collected} written successfully)"; + "err" => ?e, + ); + SupportBundleEreportCollection::Failed { + n_collected, + error: e.to_string(), + } + } + } + } + })); + let sp = AbortOnDropHandle::new(tokio::spawn({ + let filters = ereport_filters.clone(); + let collection = self.clone(); + async move { + let mut n_collected = 0; + match collection + .collect_sp_ereports(&filters, &dir, &mut n_collected) + .await + { + Ok(_) => SupportBundleEreportCollection::Collected { + n_collected, + }, + Err(e) => { + warn!( + collection.log, + "Failed to collect SP ereports ({n_collected} \ + written successfully)"; + "err" => ?e, + ); + SupportBundleEreportCollection::Failed { + n_collected, + error: e.to_string(), + } + } + } + } + })); + Some((host, sp)) + } else { + debug!(log, "Support bundle: ereports not requested"); + None + }; + let sp_dumps_dir = dir.path().join("sp_task_dumps"); tokio::fs::create_dir_all(&sp_dumps_dir).await.with_context(|| { format!("failed to create SP task dump directory {sp_dumps_dir}") @@ -720,6 +797,39 @@ impl BundleCollection { } } + if let Some((host, sp)) = ereport_collection { + let (host, sp) = tokio::join!(host, sp); + match host { + Ok(status) => report.host_ereports = status, + Err(err) => { + warn!( + &self.log, + "Support bundle: host ereport collection task failed"; + "err" => ?err, + ); + report.host_ereports = + SupportBundleEreportCollection::Failed { + n_collected: 0, + error: err.to_string(), + }; + } + } + match sp { + Ok(status) => report.sp_ereports = status, + Err(err) => { + warn!( + &self.log, + "Support bundle: SP ereport collection task failed"; + "err" => ?err, + ); + report.host_ereports = + SupportBundleEreportCollection::Failed { + n_collected: 0, + error: err.to_string(), + }; + } + } + } Ok(report) } @@ -869,9 +979,94 @@ impl BundleCollection { } return Ok(()); } - - async fn collect_ereports(&self, path: Utf8PathBuf) -> Result<(), Error> { - + + async fn collect_sp_ereports( + &self, + filters: &EreportFilters, + dir: &Utf8Path, + ereports_written: &mut usize, + ) -> anyhow::Result<()> { + let mut paginator = Paginator::new( + datastore::SQL_BATCH_SIZE, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let ereports = self + .datastore + .sp_ereports_fetch_matching( + &self.opctx, + &filters, + &p.current_pagparams(), + ) + .await + .map_err(|e| { + e.internal_context("failed to query for SP ereports") + })?; + paginator = p.found_batch(&ereports, &|ereport| { + (ereport.restart_id.into_untyped_uuid(), ereport.ena.into()) + }); + + let n_ereports = ereports.len(); + for ereport in ereports { + write_ereport(ereport.into(), &dir).await?; + } + *ereports_written += n_ereports; + debug!( + self.log, + "Support bundle: added {n_ereports} SP ereports \ + ({ereports_written} total)" + ); + } + + info!( + self.log, + "Support bundle: collected {} total SP ereports", ereports_written + ); + Ok(()) + } + + async fn collect_host_ereports( + &self, + filters: &EreportFilters, + dir: &Utf8Path, + ereports_written: &mut usize, + ) -> 
anyhow::Result<()> { + let mut paginator = Paginator::new( + datastore::SQL_BATCH_SIZE, + dropshot::PaginationOrder::Ascending, + ); + while let Some(p) = paginator.next() { + let ereports = self + .datastore + .host_ereports_fetch_matching( + &self.opctx, + &filters, + &p.current_pagparams(), + ) + .await + .map_err(|e| { + e.internal_context("failed to query for host OS ereports") + })?; + paginator = p.found_batch(&ereports, &|ereport| { + (ereport.restart_id.into_untyped_uuid(), ereport.ena.into()) + }); + let n_ereports = ereports.len(); + for ereport in ereports { + write_ereport(ereport.into(), &dir).await?; + } + *ereports_written += n_ereports; + debug!( + self.log, + "Support bundle: added {n_ereports} host OS ereports \ + ({ereports_written} total)" + ); + } + + info!( + self.log, + "Support bundle: collected {ereports_written} total SP ereports", + ); + Ok(()) } } @@ -918,6 +1113,22 @@ impl BackgroundTask for SupportBundleCollector { } } +async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> { + let sn = + ereport.metadata.serial_number.as_deref().unwrap_or("unknown_serial"); + let dir = dir.join(sn).join(ereport.id.restart_id.to_string()); + tokio::fs::create_dir_all(&dir) + .await + .with_context(|| format!("failed to create directory '{dir}'"))?; + let file_path = dir.join(format!("{}.json", ereport.id.ena)); + let json = serde_json::to_vec(&ereport).with_context(|| { + format!("failed to serialize ereport '{sn}:{}'", ereport.id) + })?; + tokio::fs::write(&file_path, json) + .await + .with_context(|| format!("failed to write {file_path}'")) +} + // Takes a directory "dir", and zips the contents into a single zipfile. fn bundle_to_zipfile(dir: &Utf8TempDir) -> anyhow::Result { let tempfile = tempfile_in(TEMPDIR)?; diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index ca97f5f892..a544a9ef92 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -236,6 +236,24 @@ pub struct SupportBundleCollectionReport { /// True iff the bundle was successfully made 'active' in the database. pub activated_in_db_ok: bool, + + /// Status of host OS ereport collection. + pub host_ereports: SupportBundleEreportCollection, + + /// Status of SP ereport collection. + pub sp_ereports: SupportBundleEreportCollection, +} + +#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] +pub enum SupportBundleEreportCollection { + /// Ereports were not requested for this bundle. + NotRequested, + + /// Ereports were collected successfully. + Collected { n_collected: usize }, + + /// Ereport collection failed, though some ereports may have been written. 
+ Failed { n_collected: usize, error: String }, } impl SupportBundleCollectionReport { @@ -245,6 +263,8 @@ impl SupportBundleCollectionReport { listed_in_service_sleds: false, listed_sps: false, activated_in_db_ok: false, + host_ereports: SupportBundleEreportCollection::NotRequested, + sp_ereports: SupportBundleEreportCollection::NotRequested, } } } From 28cac037b3664a1b1c797547316e6af260b0154e Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 30 Jul 2025 14:39:26 -0700 Subject: [PATCH 03/11] + integration tests --- nexus/tests/integration_tests/support_bundles.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 5bd3f18faf..184426a5b8 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -19,6 +19,7 @@ use nexus_types::external_api::shared::SupportBundleInfo; use nexus_types::external_api::shared::SupportBundleState; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; +use nexus_types::internal_api::background::SupportBundleEreportCollection; use omicron_uuid_kinds::SupportBundleUuid; use serde::Deserialize; use std::io::Cursor; @@ -492,6 +493,12 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { listed_in_service_sleds: true, listed_sps: true, activated_in_db_ok: true, + host_ereports: SupportBundleEreportCollection::Collected { + n_collected: 0 + }, + sp_ereports: SupportBundleEreportCollection::Collected { + n_collected: 0 + } }) ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); @@ -589,6 +596,12 @@ async fn test_support_bundle_range_requests( listed_in_service_sleds: true, listed_sps: true, activated_in_db_ok: true, + host_ereports: SupportBundleEreportCollection::Collected { + n_collected: 0 + }, + sp_ereports: SupportBundleEreportCollection::Collected { + n_collected: 0 + } }) ); let bundle = bundle_get(&client, bundle.id).await.unwrap(); From aced60b74b3e9ef99b8a691deabdf4cb65e06d81 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Wed, 30 Jul 2025 14:39:45 -0700 Subject: [PATCH 04/11] reticulate naming --- dev-tools/omdb/src/bin/omdb/nexus.rs | 13 ++++----- .../tasks/support_bundle_collector.rs | 28 +++++++++---------- .../integration_tests/support_bundles.rs | 10 +++---- nexus/types/src/internal_api/background.rs | 10 +++---- 4 files changed, 28 insertions(+), 33 deletions(-) diff --git a/dev-tools/omdb/src/bin/omdb/nexus.rs b/dev-tools/omdb/src/bin/omdb/nexus.rs index f6f2f333dc..fb455235ba 100644 --- a/dev-tools/omdb/src/bin/omdb/nexus.rs +++ b/dev-tools/omdb/src/bin/omdb/nexus.rs @@ -66,7 +66,7 @@ use nexus_types::internal_api::background::RegionSnapshotReplacementStartStatus; use nexus_types::internal_api::background::RegionSnapshotReplacementStepStatus; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; -use nexus_types::internal_api::background::SupportBundleEreportCollection; +use nexus_types::internal_api::background::SupportBundleEreportStatus; use nexus_types::internal_api::background::TufArtifactReplicationCounters; use nexus_types::internal_api::background::TufArtifactReplicationRequest; use nexus_types::internal_api::background::TufArtifactReplicationStatus; @@ -2436,22 +2436,19 @@ fn print_task_support_bundle_collector(details: &serde_json::Value) { 
} } - fn print_ereport_status( - which: &str, - status: &SupportBundleEreportCollection, - ) { + fn print_ereport_status(which: &str, status: &SupportBundleEreportStatus) { match status { - SupportBundleEreportCollection::NotRequested => { + SupportBundleEreportStatus::NotRequested => { println!(" {which} ereport collection was not requested"); } - SupportBundleEreportCollection::Failed { error, n_collected } => { + SupportBundleEreportStatus::Failed { error, n_collected } => { println!(" {which} ereport collection failed:"); println!( " ereports collected successfully: {n_collected}" ); println!(" error: {error}"); } - SupportBundleEreportCollection::Collected { n_collected } => { + SupportBundleEreportStatus::Collected { n_collected } => { println!(" {which} ereports collected: {n_collected}"); } } diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 116d5c4ce1..67025fb969 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -35,7 +35,7 @@ use nexus_types::deployment::SledFilter; use nexus_types::identity::Asset; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; -use nexus_types::internal_api::background::SupportBundleEreportCollection; +use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_common::api::external::DataPageParams; use omicron_common::api::external::Error; use omicron_common::api::external::LookupType; @@ -699,7 +699,7 @@ impl BundleCollection { .collect_host_ereports(&filters, &dir, &mut n_collected) .await { - Ok(_) => SupportBundleEreportCollection::Collected { + Ok(_) => SupportBundleEreportStatus::Collected { n_collected, }, Err(e) => { @@ -709,7 +709,7 @@ impl BundleCollection { ({n_collected} written successfully)"; "err" => ?e, ); - SupportBundleEreportCollection::Failed { + SupportBundleEreportStatus::Failed { n_collected, error: e.to_string(), } @@ -726,7 +726,7 @@ impl BundleCollection { .collect_sp_ereports(&filters, &dir, &mut n_collected) .await { - Ok(_) => SupportBundleEreportCollection::Collected { + Ok(_) => SupportBundleEreportStatus::Collected { n_collected, }, Err(e) => { @@ -736,7 +736,7 @@ impl BundleCollection { written successfully)"; "err" => ?e, ); - SupportBundleEreportCollection::Failed { + SupportBundleEreportStatus::Failed { n_collected, error: e.to_string(), } @@ -807,11 +807,10 @@ impl BundleCollection { "Support bundle: host ereport collection task failed"; "err" => ?err, ); - report.host_ereports = - SupportBundleEreportCollection::Failed { - n_collected: 0, - error: err.to_string(), - }; + report.host_ereports = SupportBundleEreportStatus::Failed { + n_collected: 0, + error: err.to_string(), + }; } } match sp { @@ -822,11 +821,10 @@ impl BundleCollection { "Support bundle: SP ereport collection task failed"; "err" => ?err, ); - report.host_ereports = - SupportBundleEreportCollection::Failed { - n_collected: 0, - error: err.to_string(), - }; + report.host_ereports = SupportBundleEreportStatus::Failed { + n_collected: 0, + error: err.to_string(), + }; } } } diff --git a/nexus/tests/integration_tests/support_bundles.rs b/nexus/tests/integration_tests/support_bundles.rs index 184426a5b8..215936dbec 100644 --- a/nexus/tests/integration_tests/support_bundles.rs +++ b/nexus/tests/integration_tests/support_bundles.rs @@ -19,7 +19,7 @@ use 
nexus_types::external_api::shared::SupportBundleInfo; use nexus_types::external_api::shared::SupportBundleState; use nexus_types::internal_api::background::SupportBundleCleanupReport; use nexus_types::internal_api::background::SupportBundleCollectionReport; -use nexus_types::internal_api::background::SupportBundleEreportCollection; +use nexus_types::internal_api::background::SupportBundleEreportStatus; use omicron_uuid_kinds::SupportBundleUuid; use serde::Deserialize; use std::io::Cursor; @@ -493,10 +493,10 @@ async fn test_support_bundle_lifecycle(cptestctx: &ControlPlaneTestContext) { listed_in_service_sleds: true, listed_sps: true, activated_in_db_ok: true, - host_ereports: SupportBundleEreportCollection::Collected { + host_ereports: SupportBundleEreportStatus::Collected { n_collected: 0 }, - sp_ereports: SupportBundleEreportCollection::Collected { + sp_ereports: SupportBundleEreportStatus::Collected { n_collected: 0 } }) @@ -596,10 +596,10 @@ async fn test_support_bundle_range_requests( listed_in_service_sleds: true, listed_sps: true, activated_in_db_ok: true, - host_ereports: SupportBundleEreportCollection::Collected { + host_ereports: SupportBundleEreportStatus::Collected { n_collected: 0 }, - sp_ereports: SupportBundleEreportCollection::Collected { + sp_ereports: SupportBundleEreportStatus::Collected { n_collected: 0 } }) diff --git a/nexus/types/src/internal_api/background.rs b/nexus/types/src/internal_api/background.rs index a544a9ef92..d16661782a 100644 --- a/nexus/types/src/internal_api/background.rs +++ b/nexus/types/src/internal_api/background.rs @@ -238,14 +238,14 @@ pub struct SupportBundleCollectionReport { pub activated_in_db_ok: bool, /// Status of host OS ereport collection. - pub host_ereports: SupportBundleEreportCollection, + pub host_ereports: SupportBundleEreportStatus, /// Status of SP ereport collection. - pub sp_ereports: SupportBundleEreportCollection, + pub sp_ereports: SupportBundleEreportStatus, } #[derive(Debug, Deserialize, Serialize, PartialEq, Eq)] -pub enum SupportBundleEreportCollection { +pub enum SupportBundleEreportStatus { /// Ereports were not requested for this bundle. 
NotRequested, @@ -263,8 +263,8 @@ impl SupportBundleCollectionReport { listed_in_service_sleds: false, listed_sps: false, activated_in_db_ok: false, - host_ereports: SupportBundleEreportCollection::NotRequested, - sp_ereports: SupportBundleEreportCollection::NotRequested, + host_ereports: SupportBundleEreportStatus::NotRequested, + sp_ereports: SupportBundleEreportStatus::NotRequested, } } } From 1bcd6614339ac16a5827188a6b952381142edb95 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 31 Jul 2025 10:59:29 -0700 Subject: [PATCH 05/11] add ereports to tests --- nexus/db-queries/src/db/datastore/ereport.rs | 43 ++++- .../tasks/support_bundle_collector.rs | 148 +++++++++++++++++- 2 files changed, 189 insertions(+), 2 deletions(-) diff --git a/nexus/db-queries/src/db/datastore/ereport.rs b/nexus/db-queries/src/db/datastore/ereport.rs index 22a2b19aec..ca5e7f4aae 100644 --- a/nexus/db-queries/src/db/datastore/ereport.rs +++ b/nexus/db-queries/src/db/datastore/ereport.rs @@ -383,8 +383,20 @@ impl DataStore { sled_id: SledUuid, ) -> Result, Error> { opctx.authorize(authz::Action::ListChildren, &authz::FLEET).await?; + self.host_latest_ereport_id_on_conn( + &*self.pool_connection_authorized(opctx).await?, + sled_id, + ) + .await + } + + async fn host_latest_ereport_id_on_conn( + &self, + conn: &async_bb8_diesel::Connection, + sled_id: SledUuid, + ) -> Result, Error> { let id = Self::host_latest_ereport_id_query(sled_id) - .get_result_async(&*self.pool_connection_authorized(opctx).await?) + .get_result_async(conn) .await .optional() .map_err(|e| public_error_from_diesel(e, ErrorHandler::Server))? @@ -436,6 +448,35 @@ impl DataStore { })?; Ok((created, latest)) } + + pub async fn host_ereports_insert( + &self, + opctx: &OpContext, + sled_id: SledUuid, + ereports: Vec, + ) -> CreateResult<(usize, Option)> { + opctx.authorize(authz::Action::CreateChild, &authz::FLEET).await?; + let conn = self.pool_connection_authorized(opctx).await?; + let created = diesel::insert_into(host_dsl::host_ereport) + .values(ereports) + .on_conflict((host_dsl::restart_id, host_dsl::ena)) + .do_nothing() + .execute_async(&*conn) + .await + .map_err(|e| { + public_error_from_diesel(e, ErrorHandler::Server) + .internal_context("failed to insert ereports") + })?; + let latest = self + .host_latest_ereport_id_on_conn(&conn, sled_id) + .await + .map_err(|e| { + e.internal_context(format!( + "failed to refresh latest ereport ID for {sled_id}" + )) + })?; + Ok((created, latest)) + } } fn id_from_tuple( diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 67025fb969..94980939fb 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1429,7 +1429,8 @@ mod test { use omicron_common::disk::SharedDatasetConfig; use omicron_common::zpool_name::ZpoolName; use omicron_uuid_kinds::{ - BlueprintUuid, DatasetUuid, PhysicalDiskUuid, SledUuid, + BlueprintUuid, DatasetUuid, EreporterRestartUuid, OmicronZoneUuid, + PhysicalDiskUuid, SledUuid, }; use uuid::Uuid; @@ -1570,6 +1571,139 @@ mod test { id } + async fn make_fake_ereports(datastore: &DataStore, opctx: &OpContext) { + use crate::db; + + const SP_SERIAL: &str = "BRM42000069"; + const HOST_SERIAL: &str = "BRM66600042"; + const GIMLET_PN: &str = "9130000019"; + // Make some SP ereports... 
+ let sp_restart_id = EreporterRestartUuid::new_v4(); + datastore.sp_ereports_insert(&opctx, db::model::SpType::Sled, 8, vec![ + db::model::SpEreport { + restart_id: sp_restart_id.into(), + ena: ereport_types::Ena(1).into(), + time_collected: chrono::Utc::now(), + time_deleted: None, + collector_id: OmicronZoneUuid::new_v4().into(), + sp_type: db::model::SpType::Sled, + sp_slot: 8.into(), + part_number: Some(GIMLET_PN.to_string()), + serial_number: Some(SP_SERIAL.to_string()), + class: Some("ereport.fake.whatever".to_string()), + report: serde_json::json!({"hello world": true}) + }, + db::model::SpEreport { + restart_id: sp_restart_id.into(), + ena: ereport_types::Ena(2).into(), + time_collected: chrono::Utc::now(), + time_deleted: None, + collector_id: OmicronZoneUuid::new_v4().into(), + sp_type: db::model::SpType::Sled, + sp_slot: 8.into(), + part_number: Some(GIMLET_PN.to_string()), + serial_number: Some(SP_SERIAL.to_string()), + class: Some("ereport.something.blah".to_string()), + report: serde_json::json!({"system_working": "seems to be",}) + }, + db::model::SpEreport { + restart_id: EreporterRestartUuid::new_v4().into(), + ena: ereport_types::Ena(1).into(), + time_collected: chrono::Utc::now(), + time_deleted: None, + collector_id: OmicronZoneUuid::new_v4().into(), + sp_type: db::model::SpType::Sled, + sp_slot: 8.into(), + // Let's do a silly one! No VPD, to make sure that's also + // handled correctly. + part_number: None, + serial_number: None, + class: Some("ereport.fake.whatever".to_string()), + report: serde_json::json!({"hello_world": true}) + }, + ]).await.expect("failed to insert fake SP ereports"); + // And one from a different serial. N.B. that I made sure the number of + // host-OS and SP ereports are different for when we make assertions + // about the bundle report. + datastore + .sp_ereports_insert( + &opctx, + db::model::SpType::Switch, + 1, + vec![db::model::SpEreport { + restart_id: EreporterRestartUuid::new_v4().into(), + ena: ereport_types::Ena(1).into(), + time_collected: chrono::Utc::now(), + time_deleted: None, + collector_id: OmicronZoneUuid::new_v4().into(), + sp_type: db::model::SpType::Switch, + sp_slot: 1.into(), + part_number: Some("9130000006".to_string()), + serial_number: Some("BRM41000555".to_string()), + class: Some("ereport.fake.whatever".to_string()), + report: serde_json::json!({"im_a_sidecar": true}), + }], + ) + .await + .expect("failed to insert another fake SP ereport"); + // And some host OS ones... 
+ let sled_id = SledUuid::new_v4(); + let restart_id = EreporterRestartUuid::new_v4().into(); + datastore + .host_ereports_insert( + &opctx, + sled_id, + vec![ + db::model::HostEreport { + restart_id, + ena: ereport_types::Ena(1).into(), + time_collected: chrono::Utc::now(), + time_deleted: None, + collector_id: OmicronZoneUuid::new_v4().into(), + sled_id: sled_id.into(), + sled_serial: HOST_SERIAL.to_string(), + class: Some("ereport.fake.whatever".to_string()), + report: serde_json::json!({"hello_world": true}), + }, + db::model::HostEreport { + restart_id, + ena: ereport_types::Ena(2).into(), + time_collected: chrono::Utc::now(), + time_deleted: None, + collector_id: OmicronZoneUuid::new_v4().into(), + sled_id: sled_id.into(), + sled_serial: HOST_SERIAL.to_string(), + class: Some("ereport.fake.whatever.thingy".to_string()), + report: serde_json::json!({"goodbye_world": false}), + }, + ], + ) + .await + .expect("failed to insert fake host OS ereports"); + // And another one with the same serial but different restart/sled IDs + let sled_id = SledUuid::new_v4(); + datastore + .host_ereports_insert( + &opctx, + sled_id, + vec![ + db::model::HostEreport { + restart_id: EreporterRestartUuid::new_v4().into(), + ena: ereport_types::Ena(1).into(), + time_collected: chrono::Utc::now(), + time_deleted: None, + collector_id: OmicronZoneUuid::new_v4().into(), + sled_id: sled_id.into(), + sled_serial: HOST_SERIAL.to_string(), + class: Some("ereport.something.hostos_related".to_string()), + report: serde_json::json!({"illumos": "very yes", "whatever": 42}), + }, + ], + ) + .await + .expect("failed to insert another fake host OS ereport"); + } + struct TestDataset { zpool_id: ZpoolUuid, dataset_id: DatasetUuid, @@ -1674,6 +1808,10 @@ mod test { let _datasets = TestDataset::setup(cptestctx, &datastore, &opctx, 1).await; + // Make up some ereports so that we can test that they're included in + // the bundle. + make_fake_ereports(&datastore, &opctx).await; + // Assign a bundle to ourselves. We expect to collect it on // the next call to "collect_bundle". 
let bundle = datastore @@ -1710,6 +1848,14 @@ mod test { assert!(report.listed_in_service_sleds); assert!(report.listed_sps); assert!(report.activated_in_db_ok); + assert_eq!( + report.host_ereports, + SupportBundleEreportStatus::Collected { n_collected: 3 } + ); + assert_eq!( + report.sp_ereports, + SupportBundleEreportStatus::Collected { n_collected: 4 } + ); let observed_bundle = datastore .support_bundle_get(&opctx, bundle.id.into()) From d2c3a7af10572bb20a91e2fa9c26e9115768eaa2 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 31 Jul 2025 11:40:22 -0700 Subject: [PATCH 06/11] hakari --- workspace-hack/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workspace-hack/Cargo.toml b/workspace-hack/Cargo.toml index 3fbbb606e3..04e7e38a3c 100644 --- a/workspace-hack/Cargo.toml +++ b/workspace-hack/Cargo.toml @@ -135,7 +135,7 @@ tokio = { version = "1.47.0", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.13", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-rustls = { version = "0.26.0", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1.17", features = ["net", "sync"] } -tokio-util = { version = "0.7.15", features = ["codec", "io-util", "time"] } +tokio-util = { version = "0.7.15", features = ["codec", "io-util", "rt", "time"] } toml = { version = "0.7.8" } toml_datetime = { version = "0.6.11", default-features = false, features = ["serde"] } toml_edit-3c51e837cfc5589a = { package = "toml_edit", version = "0.22.27", features = ["serde"] } @@ -273,7 +273,7 @@ tokio = { version = "1.47.0", features = ["full", "test-util"] } tokio-postgres = { version = "0.7.13", features = ["with-chrono-0_4", "with-serde_json-1", "with-uuid-1"] } tokio-rustls = { version = "0.26.0", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1.17", features = ["net", "sync"] } -tokio-util = { version = "0.7.15", features = ["codec", "io-util", "time"] } +tokio-util = { version = "0.7.15", features = ["codec", "io-util", "rt", "time"] } toml = { version = "0.7.8" } toml_datetime = { version = "0.6.11", default-features = false, features = ["serde"] } toml_edit-3c51e837cfc5589a = { package = "toml_edit", version = "0.22.27", features = ["serde"] } From 05b0d30ee1b044f439d8b17d5bcf3ea4c49e7509 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 31 Jul 2025 12:00:41 -0700 Subject: [PATCH 07/11] clippy --- nexus/src/app/background/tasks/support_bundle_collector.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 4d9a5613b4..6eb7f010ae 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1003,7 +1003,7 @@ impl BundleCollection { e.internal_context("failed to query for SP ereports") })?; paginator = p.found_batch(&ereports, &|ereport| { - (ereport.restart_id.into_untyped_uuid(), ereport.ena.into()) + (ereport.restart_id.into_untyped_uuid(), ereport.ena) }); let n_ereports = ereports.len(); @@ -1048,7 +1048,7 @@ impl BundleCollection { e.internal_context("failed to query for host OS ereports") })?; paginator = p.found_batch(&ereports, &|ereport| { - (ereport.restart_id.into_untyped_uuid(), ereport.ena.into()) + (ereport.restart_id.into_untyped_uuid(), ereport.ena) }); let n_ereports = ereports.len(); for ereport in 
ereports { From a605c67c9f7e605c445ea097eea3ae5f028c98ea Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Thu, 31 Jul 2025 15:36:03 -0700 Subject: [PATCH 08/11] Update nexus/src/app/background/tasks/support_bundle_collector.rs Co-authored-by: Sean Klein --- nexus/src/app/background/tasks/support_bundle_collector.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 6eb7f010ae..7649005809 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1064,7 +1064,7 @@ impl BundleCollection { info!( self.log, - "Support bundle: collected {ereports_written} total SP ereports", + "Support bundle: collected {ereports_written} total host ereports", ); Ok(()) } From 51fa882288112e70ac41face4814b51181c117e9 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 1 Aug 2025 11:59:19 -0700 Subject: [PATCH 09/11] ensure ereport count is accurate in face of abrupt task failures this shouldn't ever actually happen due to `panic="abort"`, but whatever --- .../tasks/support_bundle_collector.rs | 143 +++++++----------- 1 file changed, 55 insertions(+), 88 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 7649005809..d59ab16ddc 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -54,6 +54,7 @@ use std::future::Future; use std::io::Write; use std::num::NonZeroU64; use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, Ordering}; use tokio::io::AsyncReadExt; use tokio::io::AsyncSeekExt; use tokio::io::AsyncWriteExt; @@ -425,6 +426,8 @@ impl SupportBundleCollector { request: request.clone(), bundle: bundle.clone(), transfer_chunk_size: request.transfer_chunk_size, + host_ereports_collected: AtomicUsize::new(0), + sp_ereports_collected: AtomicUsize::new(0), }); let authz_bundle = authz_support_bundle_from_id(bundle.id.into()); @@ -470,6 +473,8 @@ struct BundleCollection { request: BundleRequest, bundle: SupportBundle, transfer_chunk_size: NonZeroU64, + host_ereports_collected: AtomicUsize, + sp_ereports_collected: AtomicUsize, } impl BundleCollection { @@ -691,61 +696,15 @@ impl BundleCollection { // dropped, the tasks that we've spawned to collect ereports are // aborted as well. 
let dir = dir.path().join("ereports"); - let host = AbortOnDropHandle::new(tokio::spawn({ - let dir = dir.clone(); - let filters = ereport_filters.clone(); - let collection = self.clone(); - async move { - let mut n_collected = 0; - match collection - .collect_host_ereports(&filters, &dir, &mut n_collected) - .await - { - Ok(_) => SupportBundleEreportStatus::Collected { - n_collected, - }, - Err(e) => { - warn!( - collection.log, - "Failed to collect host OS ereports \ - ({n_collected} written successfully)"; - "err" => ?e, - ); - SupportBundleEreportStatus::Failed { - n_collected, - error: e.to_string(), - } - } - } - } - })); - let sp = AbortOnDropHandle::new(tokio::spawn({ - let filters = ereport_filters.clone(); - let collection = self.clone(); - async move { - let mut n_collected = 0; - match collection - .collect_sp_ereports(&filters, &dir, &mut n_collected) - .await - { - Ok(_) => SupportBundleEreportStatus::Collected { - n_collected, - }, - Err(e) => { - warn!( - collection.log, - "Failed to collect SP ereports ({n_collected} \ - written successfully)"; - "err" => ?e, - ); - SupportBundleEreportStatus::Failed { - n_collected, - error: e.to_string(), - } - } - } - } - })); + let host = AbortOnDropHandle::new(tokio::spawn( + self.clone().collect_host_ereports( + ereport_filters.clone(), + dir.clone(), + ), + )); + let sp = AbortOnDropHandle::new(tokio::spawn( + self.clone().collect_sp_ereports(ereport_filters.clone(), dir), + )); Some((host, sp)) } else { debug!(log, "Support bundle: ereports not requested"); @@ -801,34 +760,47 @@ impl BundleCollection { if let Some((host, sp)) = ereport_collection { let (host, sp) = tokio::join!(host, sp); - match host { - Ok(status) => report.host_ereports = status, + const TASK_FAILURE_MSG: &str = "task failed"; + let n_collected = + self.host_ereports_collected.load(Ordering::Acquire); + report.host_ereports = match host + .map_err(|e| anyhow::anyhow!("{TASK_FAILURE_MSG}: {e}")) + .and_then(|x| x) + { + Ok(_) => SupportBundleEreportStatus::Collected { n_collected }, Err(err) => { warn!( &self.log, - "Support bundle: host ereport collection task failed"; + "Support bundle: host ereport collection failed \ + ({n_collected} collected successfully)"; "err" => ?err, ); - report.host_ereports = SupportBundleEreportStatus::Failed { - n_collected: 0, + SupportBundleEreportStatus::Failed { + n_collected, error: err.to_string(), - }; + } } - } - match sp { - Ok(status) => report.sp_ereports = status, + }; + let n_collected = + self.sp_ereports_collected.load(Ordering::Acquire); + report.sp_ereports = match sp + .map_err(|e| anyhow::anyhow!("{TASK_FAILURE_MSG}: {e}")) + .and_then(|x| x) + { + Ok(_) => SupportBundleEreportStatus::Collected { n_collected }, Err(err) => { warn!( &self.log, - "Support bundle: SP ereport collection task failed"; + "Support bundle: SP ereport collection failed \ + ({n_collected} collected successfully)"; "err" => ?err, ); - report.host_ereports = SupportBundleEreportStatus::Failed { - n_collected: 0, + SupportBundleEreportStatus::Failed { + n_collected, error: err.to_string(), - }; + } } - } + }; } Ok(report) } @@ -981,10 +953,9 @@ impl BundleCollection { } async fn collect_sp_ereports( - &self, - filters: &EreportFilters, - dir: &Utf8Path, - ereports_written: &mut usize, + self: Arc, + filters: EreportFilters, + dir: Utf8PathBuf, ) -> anyhow::Result<()> { let mut paginator = Paginator::new( datastore::SQL_BATCH_SIZE, @@ -1009,27 +980,23 @@ impl BundleCollection { let n_ereports = ereports.len(); for ereport in ereports { 
write_ereport(ereport.into(), &dir).await?; + self.sp_ereports_collected.fetch_add(1, Ordering::Release); } - *ereports_written += n_ereports; - debug!( - self.log, - "Support bundle: added {n_ereports} SP ereports \ - ({ereports_written} total)" - ); + debug!(self.log, "Support bundle: added {n_ereports} SP ereports"); } info!( self.log, - "Support bundle: collected {} total SP ereports", ereports_written + "Support bundle: collected {} total SP ereports", + self.sp_ereports_collected.load(Ordering::Relaxed) ); Ok(()) } async fn collect_host_ereports( - &self, - filters: &EreportFilters, - dir: &Utf8Path, - ereports_written: &mut usize, + self: Arc, + filters: EreportFilters, + dir: Utf8PathBuf, ) -> anyhow::Result<()> { let mut paginator = Paginator::new( datastore::SQL_BATCH_SIZE, @@ -1053,18 +1020,18 @@ impl BundleCollection { let n_ereports = ereports.len(); for ereport in ereports { write_ereport(ereport.into(), &dir).await?; + self.host_ereports_collected.fetch_add(1, Ordering::Release); } - *ereports_written += n_ereports; debug!( self.log, - "Support bundle: added {n_ereports} host OS ereports \ - ({ereports_written} total)" + "Support bundle: added {n_ereports} host OS ereports" ); } info!( self.log, - "Support bundle: collected {ereports_written} total host ereports", + "Support bundle: collected {} total host ereports", + self.host_ereports_collected.load(Ordering::Relaxed) ); Ok(()) } From 9c5a4045e5144ca79e515893d3fd887796d31646 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 1 Aug 2025 12:04:06 -0700 Subject: [PATCH 10/11] document + fix ereport paths --- .../tasks/support_bundle_collector.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index d59ab16ddc..f791dafe17 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1081,9 +1081,24 @@ impl BackgroundTask for SupportBundleCollector { } async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> { + // Here's where we construct the file path for each ereport JSON file, given + // the top-lebel ereport directory path. Each ereport is stored in a + // subdirectory for the serial number of the system that produced the + // ereport. These paths take the following form: + // + // {serial_number}/{restart_id}/{ENA}.json + // + // We can assume that the restart ID and serial number consist only of + // filesystem-safe characters, as the restart ID is known to be a UUID, and + // the ENA is just an integer. let sn = ereport.metadata.serial_number.as_deref().unwrap_or("unknown_serial"); - let dir = dir.join(sn).join(ereport.id.restart_id.to_string()); + let dir = dir + .join(sn) + // N.B. that we call `into_untyped_uuid()` here, as the `Display` + // implementation for a typed UUID appends " (ereporter_restart)", which + // we don't want. 
+ .join(ereport.id.restart_id.into_untyped_uuid().to_string()); tokio::fs::create_dir_all(&dir) .await .with_context(|| format!("failed to create directory '{dir}'"))?; From 23f1d3cfc1a2b5b5200ea3b4eb875cc79ee03d79 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 1 Aug 2025 13:40:06 -0700 Subject: [PATCH 11/11] sanitize serial number strings in ereport paths --- .../tasks/support_bundle_collector.rs | 64 +++++++++++++++++-- 1 file changed, 59 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index f791dafe17..69cad6240f 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1082,17 +1082,29 @@ impl BackgroundTask for SupportBundleCollector { async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> { // Here's where we construct the file path for each ereport JSON file, given - // the top-lebel ereport directory path. Each ereport is stored in a + // the top-lebel ereport directory path. Each ereport is stored in a // subdirectory for the serial number of the system that produced the - // ereport. These paths take the following form: + // ereport. These paths take the following form: // // {serial_number}/{restart_id}/{ENA}.json // // We can assume that the restart ID and serial number consist only of // filesystem-safe characters, as the restart ID is known to be a UUID, and - // the ENA is just an integer. - let sn = - ereport.metadata.serial_number.as_deref().unwrap_or("unknown_serial"); + // the ENA is just an integer. For the serial number, which we don't have + // full control over --- it came from the ereport metadata --- we must + // check that it doesn't contain any characters unsuitable for use in a + // filesystem path. + let sn = &ereport + .metadata + .serial_number + .as_deref() + // If the serial number contains any unsavoury characters, it goes in + // the `unknown_serial` hole! Note that the alleged serial number from + // the ereport will still be present in the JSON as a string, so we're + // not *lying* about what was received; we're just giving up on using it + // in the path. + .filter(|&s| is_fs_safe_single_path_component(s)) + .unwrap_or("unknown_serial"); let dir = dir .join(sn) // N.B. that we call `into_untyped_uuid()` here, as the `Display` @@ -1393,6 +1405,48 @@ async fn save_sp_dumps( Ok(()) } +fn is_fs_safe_single_path_component(s: &str) -> bool { + // Might be path traversal... + if s == "." || s == ".." { + return false; + } + + if s == "~" { + return false; + } + + const BANNED_CHARS: &[char] = &[ + // Check for path separators. + // + // Naively, we might reach for `std::path::is_separator()` here. + // However, this function only checks if a path is a permitted + // separator on the *current* platform --- so, running on illumos, we + // will only check for Unix path separators. But, because the support + // bundle may be extracted on a workstation system by Oxide support + // personnel or by the customer, we should also make sure we don't + // allow the use of Windows path separators, which `is_separator()` + // won't check for on Unix systems. + '/', '\\', + // Characters forbidden on Windows, per: + // https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file#naming-conventions + '<', '>', ':', '"', '|', '?', '*', + ]; + + // Rather than using `s.contains()`, we do all the checks in one pass. 
+ for c in s.chars() { + if BANNED_CHARS.contains(&c) { + return false; + } + + // Definitely no control characters! + if c.is_control() { + return false; + } + } + + true +} + #[cfg(test)] mod test { use super::*;
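    // A minimal sketch (illustrative, not part of the patch series) of a
    // unit test exercising `is_fs_safe_single_path_component`, following the
    // rules documented in the helper above. The test name and the specific
    // inputs are assumptions chosen for illustration only.
    #[test]
    fn fs_safe_path_component_sketch() {
        // Typical serial numbers and part numbers are accepted.
        assert!(is_fs_safe_single_path_component("BRM42000069"));
        assert!(is_fs_safe_single_path_component("9130000019"));

        // Path traversal and home-directory expansion are rejected.
        assert!(!is_fs_safe_single_path_component("."));
        assert!(!is_fs_safe_single_path_component(".."));
        assert!(!is_fs_safe_single_path_component("~"));

        // Unix and Windows path separators are rejected, as are characters
        // forbidden in Windows file names.
        assert!(!is_fs_safe_single_path_component("a/b"));
        assert!(!is_fs_safe_single_path_component("a\\b"));
        assert!(!is_fs_safe_single_path_component("C:serial"));

        // Control characters are rejected.
        assert!(!is_fs_safe_single_path_component("BRM\u{7}42"));
    }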