From 44c852326918d92d75355b339017168ea89098ca Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 5 Aug 2025 10:29:40 -0700 Subject: [PATCH 1/4] [nexus] Add part to service bundle ereport paths In #8739, I added code for collecting ereports into support bundles which stores the ereport JSON in directories for each sled/switch/PSC serial number from which an ereport was received. Unfortunately, I failed failed to consider that the version 1 Oxide serial numbers are only unique within the namespace of a particular part, and not globally --- so (for example) a switch and a compute sled may have colliding serials. This means that the current code could incorrectly group ereports reported by two totally different devices. While the ereport JSON files _do_ contain additional information that disambiguates this (it includes includes the part number, as well as MGS metadata with the SP type for SP ereports), and restart IDs are additionally capable of distinguishing between reporters, putting ereports from two different systems within the same directory still has the potential to be quite misleading. Thus, this branch changes the paths for ereports to include the part number as well as the serial number, in the format: ``` {part_number}-{serial_number}/{restart_id}/{ENA}.json ``` In order to include part numbers for host OS ereports, I decided to add a part number column to the `host_ereport` table as well. Initially, I had opted not to do this, as I was thinking that, since `host_ereport` includes a sled UUID, we could just join with the `sled` table to get the part number. However, it occurred to me that ereports may be received from a sled that's later expunged from the rack, and the `sled` record for the sled may eventually be deleted, so such a join would fail. We might retain such ereports past the lifetime of the sled in the rack. So, I thought it was better to always include the part number in the ereport record. I've added a migration that attempts to backfill the `host_ereport.part_number` column from the `sled` table for existing host OS ereport records. In practice, this won't do anything, since we're not collecting them yet,but it seemed nice to have. Sadly, the column had to be left nullable, since we may theoretically encounter an ereport with a sled UUID that points to an already-deleted sled record, but...whatever. Since there aren't currently any host OS ereport records anyway, this shouldn't happen, and we'll just handle the nullability; this isn't terrible as we must already do so for SP ereport records. Fixes #8765 --- nexus/db-model/src/ereport.rs | 9 +++- nexus/db-model/src/schema_versions.rs | 3 +- nexus/db-schema/src/schema.rs | 2 + .../tasks/support_bundle_collector.rs | 50 ++++++++++++------- .../crdb/add-host-ereport-part-number/up1.sql | 2 + .../crdb/add-host-ereport-part-number/up2.sql | 7 +++ schema/crdb/dbinit.sql | 4 +- 7 files changed, 55 insertions(+), 22 deletions(-) create mode 100644 schema/crdb/add-host-ereport-part-number/up1.sql create mode 100644 schema/crdb/add-host-ereport-part-number/up2.sql diff --git a/nexus/db-model/src/ereport.rs b/nexus/db-model/src/ereport.rs index 1129cbc988d..98658e9436a 100644 --- a/nexus/db-model/src/ereport.rs +++ b/nexus/db-model/src/ereport.rs @@ -117,6 +117,7 @@ impl From for Ereport { sled_id, class, report, + part_number, } = host_report; Ereport { id: EreportId { restart_id: restart_id.into(), ena: ena.into() }, @@ -124,7 +125,7 @@ impl From for Ereport { time_collected, time_deleted, collector_id: collector_id.into(), - part_number: None, // TODO + part_number, serial_number: Some(sled_serial), class, }, @@ -253,4 +254,10 @@ pub struct HostEreport { pub class: Option, pub report: serde_json::Value, + + // It's a shame this has to be nullable, while the serial is not. However, + // this field was added in a migration, and we have to be able to handle the + // case where a sled record was hard-deleted when backfilling the ereport + // table's part_number column. Sad. + pub part_number: Option, } diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs index 1507aeab46f..e6a71e95c34 100644 --- a/nexus/db-model/src/schema_versions.rs +++ b/nexus/db-model/src/schema_versions.rs @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock}; /// /// This must be updated when you change the database schema. Refer to /// schema/crdb/README.adoc in the root of this repository for details. -pub const SCHEMA_VERSION: Version = Version::new(173, 0, 0); +pub const SCHEMA_VERSION: Version = Version::new(174, 0, 0); /// List of all past database schema versions, in *reverse* order /// @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock> = LazyLock::new(|| { // | leaving the first copy as an example for the next person. // v // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"), + KnownVersion::new(174, "add-host-ereport-part-number"), KnownVersion::new(173, "inv-internal-dns"), KnownVersion::new(172, "add-zones-with-mupdate-override"), KnownVersion::new(171, "inv-clear-mupdate-override"), diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs index b81f4389218..d0c854af80c 100644 --- a/nexus/db-schema/src/schema.rs +++ b/nexus/db-schema/src/schema.rs @@ -2652,6 +2652,8 @@ table! { class -> Nullable, report -> Jsonb, + + part_number -> Nullable, } } diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 69cad6240f7..4ef4d9737c1 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -1081,32 +1081,41 @@ impl BackgroundTask for SupportBundleCollector { } async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> { - // Here's where we construct the file path for each ereport JSON file, given - // the top-lebel ereport directory path. Each ereport is stored in a - // subdirectory for the serial number of the system that produced the - // ereport. These paths take the following form: + // Here's where we construct the file path for each ereport JSON file, + // given the top-level ereport directory path. Each ereport is stored in a + // subdirectory for the part and serial numbers of the system that produced + // the ereport. Part numbers must be included in addition to serial + // numbers, as the v1 serial scheme only guarantees uniqueness within a + // part number. These paths take the following form: // - // {serial_number}/{restart_id}/{ENA}.json + // {part-number}-{serial_number}/{restart_id}/{ENA}.json // - // We can assume that the restart ID and serial number consist only of + // We can assume that the restart ID and ENA consist only of // filesystem-safe characters, as the restart ID is known to be a UUID, and - // the ENA is just an integer. For the serial number, which we don't have - // full control over --- it came from the ereport metadata --- we must - // check that it doesn't contain any characters unsuitable for use in a - // filesystem path. - let sn = &ereport + // the ENA is just an integer. For the serial and part numbers, which + // Nexus doesn't have full control over --- it came from the ereport + // metadata --- we must check that it doesn't contain any characters + // unsuitable for use in a filesystem path. + let pn = ereport + .metadata + .part_number + .as_deref() + // If the part or serial numbers contain any unsavoury characters, it + // goes in the `unknown_serial` hole! Note that the alleged serial + // number from the ereport will still be present in the JSON as a + // string, so we're not *lying* about what was received; we're just + // giving up on using it in the path. + .filter(|&s| is_fs_safe_single_path_component(s)) + .unwrap_or("unknown_part"); + let sn = ereport .metadata .serial_number .as_deref() - // If the serial number contains any unsavoury characters, it goes in - // the `unknown_serial` hole! Note that the alleged serial number from - // the ereport will still be present in the JSON as a string, so we're - // not *lying* about what was received; we're just giving up on using it - // in the path. .filter(|&s| is_fs_safe_single_path_component(s)) .unwrap_or("unknown_serial"); + let dir = dir - .join(sn) + .join(format!("{pn}-{sn}")) // N.B. that we call `into_untyped_uuid()` here, as the `Display` // implementation for a typed UUID appends " (ereporter_restart)", which // we don't want. @@ -1116,11 +1125,11 @@ async fn write_ereport(ereport: Ereport, dir: &Utf8Path) -> anyhow::Result<()> { .with_context(|| format!("failed to create directory '{dir}'"))?; let file_path = dir.join(format!("{}.json", ereport.id.ena)); let json = serde_json::to_vec(&ereport).with_context(|| { - format!("failed to serialize ereport '{sn}:{}'", ereport.id) + format!("failed to serialize ereport {pn}:{sn}/{}", ereport.id) })?; tokio::fs::write(&file_path, json) .await - .with_context(|| format!("failed to write {file_path}'")) + .with_context(|| format!("failed to write '{file_path}'")) } // Takes a directory "dir", and zips the contents into a single zipfile. @@ -1701,6 +1710,7 @@ mod test { collector_id: OmicronZoneUuid::new_v4().into(), sled_id: sled_id.into(), sled_serial: HOST_SERIAL.to_string(), + part_number: Some(GIMLET_PN.to_string()), class: Some("ereport.fake.whatever".to_string()), report: serde_json::json!({"hello_world": true}), }, @@ -1712,6 +1722,7 @@ mod test { collector_id: OmicronZoneUuid::new_v4().into(), sled_id: sled_id.into(), sled_serial: HOST_SERIAL.to_string(), + part_number: Some(GIMLET_PN.to_string()), class: Some("ereport.fake.whatever.thingy".to_string()), report: serde_json::json!({"goodbye_world": false}), }, @@ -1734,6 +1745,7 @@ mod test { collector_id: OmicronZoneUuid::new_v4().into(), sled_id: sled_id.into(), sled_serial: HOST_SERIAL.to_string(), + part_number: Some(GIMLET_PN.to_string()), class: Some("ereport.something.hostos_related".to_string()), report: serde_json::json!({"illumos": "very yes", "whatever": 42}), }, diff --git a/schema/crdb/add-host-ereport-part-number/up1.sql b/schema/crdb/add-host-ereport-part-number/up1.sql new file mode 100644 index 00000000000..b094b5eb973 --- /dev/null +++ b/schema/crdb/add-host-ereport-part-number/up1.sql @@ -0,0 +1,2 @@ +ALTER TABLE omicron.common.host_ereport + ADD COLUMN IF NOT EXISTS part_number STRING(63); diff --git a/schema/crdb/add-host-ereport-part-number/up2.sql b/schema/crdb/add-host-ereport-part-number/up2.sql new file mode 100644 index 00000000000..0f0cb276c04 --- /dev/null +++ b/schema/crdb/add-host-ereport-part-number/up2.sql @@ -0,0 +1,7 @@ +-- backfill host OS ereports part_number column by querying the sled table. if +-- there isn't a sled for the ereport's sled UUID in the sled table (i.e. if the +-- sled record was hard deleted), leave it null. +SET LOCAL disallow_full_table_scans = off; +UPDATE omicron.public.host_ereport + SET host_ereport.part_number = sled.part_number + FROM host_ereport INNER JOIN sled ON host_ereport.sled_id = sled.id; diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index 1b1b2cd5e1a..d4dfc910f51 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6312,6 +6312,8 @@ CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( */ report JSONB NOT NULL, + part_number TEXT, + PRIMARY KEY (restart_id, ena) ); @@ -6347,7 +6349,7 @@ INSERT INTO omicron.public.db_metadata ( version, target_version ) VALUES - (TRUE, NOW(), NOW(), '173.0.0', NULL) + (TRUE, NOW(), NOW(), '174.0.0', NULL) ON CONFLICT DO NOTHING; COMMIT; From eb5023cdc93772e27f396882cda99f95f0f58931 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Tue, 5 Aug 2025 12:23:32 -0700 Subject: [PATCH 2/4] oops --- schema/crdb/dbinit.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql index d4dfc910f51..913a91bf2e2 100644 --- a/schema/crdb/dbinit.sql +++ b/schema/crdb/dbinit.sql @@ -6312,7 +6312,7 @@ CREATE TABLE IF NOT EXISTS omicron.public.host_ereport ( */ report JSONB NOT NULL, - part_number TEXT, + part_number STRING(63), PRIMARY KEY (restart_id, ena) ); From 7a10c02b0881a73e20246e78d4daa0cfc9f47014 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Fri, 8 Aug 2025 15:34:29 -0700 Subject: [PATCH 3/4] oops haha --- schema/crdb/add-host-ereport-part-number/up1.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema/crdb/add-host-ereport-part-number/up1.sql b/schema/crdb/add-host-ereport-part-number/up1.sql index b094b5eb973..4974255efb5 100644 --- a/schema/crdb/add-host-ereport-part-number/up1.sql +++ b/schema/crdb/add-host-ereport-part-number/up1.sql @@ -1,2 +1,2 @@ -ALTER TABLE omicron.common.host_ereport +ALTER TABLE omicron.public.host_ereport ADD COLUMN IF NOT EXISTS part_number STRING(63); From 36a3f2c10068d65ea9f425d737af15ef9e85a8a3 Mon Sep 17 00:00:00 2001 From: Eliza Weisman Date: Sat, 9 Aug 2025 16:47:32 -0700 Subject: [PATCH 4/4] i always forget how UPDATE ... FROM works lol --- schema/crdb/add-host-ereport-part-number/up2.sql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/schema/crdb/add-host-ereport-part-number/up2.sql b/schema/crdb/add-host-ereport-part-number/up2.sql index 0f0cb276c04..a720f75c248 100644 --- a/schema/crdb/add-host-ereport-part-number/up2.sql +++ b/schema/crdb/add-host-ereport-part-number/up2.sql @@ -3,5 +3,6 @@ -- sled record was hard deleted), leave it null. SET LOCAL disallow_full_table_scans = off; UPDATE omicron.public.host_ereport - SET host_ereport.part_number = sled.part_number - FROM host_ereport INNER JOIN sled ON host_ereport.sled_id = sled.id; + SET part_number = sled.part_number + FROM sled + WHERE host_ereport.sled_id = sled.id;