
Commit 6ea36d1

[sled-agent-config-reconciler] Check zone dataset dependencies before starting zones (#8219)
This dramatically reduces the work that `ServiceManager::start_omicron_zone()` does by moving most of it to the config-reconciler (see the sketch below):

* Moved: shutting down an existing zone of the same name
* Moved: checking for time sync
* Reworked: checking datasets and choosing a root zpool (checks are now performed against the most-recently-reconciled `DatasetConfig`s, and we never choose a root zpool, since every zone has a property specifying which one it should use)

Builds on #8064 and #8218. Fixes #8173.
1 parent cf76de8 commit 6ea36d1
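
As a rough orientation for the diffs below: the reconciler now validates a zone's dataset dependencies against the most-recently-reconciled `DatasetConfig`s before attempting to start it (see `OmicronDatasets::validate_zone_storage` in `reconciler_task/datasets.rs`). The sketch below is illustrative only: `maybe_start_zone` is a hypothetical helper written as if it lived inside the reconciler-task module (where the `pub(super)` items are visible), and the real call path runs through `ReconcilerTask` and `start_zones_if_needed`, whose changes are not all shown in this excerpt.

```rust
// Illustrative sketch, not the actual start_zones_if_needed() body.
// Assumes it lives inside the config-reconciler's `reconciler_task` module,
// where `OmicronDatasets` and its `pub(super)` methods are in scope.
use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
use sled_storage::config::MountConfig;

fn maybe_start_zone(
    datasets: &OmicronDatasets,
    zone: &OmicronZoneConfig,
    mount_config: &MountConfig,
) {
    // New in this commit: check the zone's transient-root and durable
    // datasets against the most-recently-reconciled `DatasetConfig`s.
    match datasets.validate_zone_storage(zone, mount_config) {
        Ok(zone_root_path) => {
            // `zone_root_path` is a `PathInPool` rooted in the zone's
            // `filesystem_pool`; the reconciler no longer picks a root
            // zpool on the zone's behalf.
            let _ = zone_root_path; // hand off to the zone-start machinery
        }
        Err(err) => {
            // Zones whose dataset dependencies aren't ensured yet are not
            // started; the error is surfaced instead.
            eprintln!("not starting zone {}: {err}", zone.zone_name());
        }
    }
}
```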

File tree

15 files changed: +972 −733 lines


Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default.

dev-tools/omdb/src/bin/omdb/db.rs

Lines changed: 22 additions & 24 deletions
@@ -7350,30 +7350,28 @@ fn inv_collection_print_sleds(collection: &Collection) {
         "LAST RECONCILED CONFIG",
         &last_reconciliation.last_reconciled_config,
     );
-    let disk_errs = collect_config_reconciler_errors(
-        &last_reconciliation.external_disks,
-    );
-    let dataset_errs = collect_config_reconciler_errors(
-        &last_reconciliation.datasets,
-    );
-    let zone_errs = collect_config_reconciler_errors(
-        &last_reconciliation.zones,
-    );
-    for (label, errs) in [
-        ("disk", disk_errs),
-        ("dataset", dataset_errs),
-        ("zone", zone_errs),
-    ] {
-        if errs.is_empty() {
-            println!(" all {label}s reconciled successfully");
-        } else {
-            println!(
-                " {} {label} reconciliation errors:",
-                errs.len()
-            );
-            for err in errs {
-                println!(" {err}");
-            }
+    }
+    let disk_errs = collect_config_reconciler_errors(
+        &last_reconciliation.external_disks,
+    );
+    let dataset_errs =
+        collect_config_reconciler_errors(&last_reconciliation.datasets);
+    let zone_errs =
+        collect_config_reconciler_errors(&last_reconciliation.zones);
+    for (label, errs) in [
+        ("disk", disk_errs),
+        ("dataset", dataset_errs),
+        ("zone", zone_errs),
+    ] {
+        if errs.is_empty() {
+            println!(" all {label}s reconciled successfully");
+        } else {
+            println!(
+                " {} {label} reconciliation errors:",
+                errs.len()
+            );
+            for err in errs {
+                println!(" {err}");
             }
         }
     }

nexus-sled-agent-shared/src/inventory.rs

Lines changed: 40 additions & 0 deletions
@@ -12,6 +12,7 @@ use chrono::{DateTime, Utc};
 use daft::Diffable;
 use id_map::IdMap;
 use id_map::IdMappable;
+use omicron_common::disk::{DatasetKind, DatasetName};
 use omicron_common::ledger::Ledgerable;
 use omicron_common::{
     api::{
@@ -327,6 +328,10 @@ impl OmicronZoneConfig {
             Some(self.id),
         )
     }
+
+    pub fn dataset_name(&self) -> Option<DatasetName> {
+        self.zone_type.dataset_name()
+    }
 }
 
 /// Describes a persistent ZFS dataset associated with an Omicron zone
@@ -613,6 +618,41 @@ impl OmicronZoneType {
             | OmicronZoneType::Oximeter { .. } => None,
         }
     }
+
+    /// If this kind of zone has an associated dataset, return the dataset's
+    /// name. Otherwise, return `None`.
+    pub fn dataset_name(&self) -> Option<DatasetName> {
+        let (dataset, dataset_kind) = match self {
+            OmicronZoneType::BoundaryNtp { .. }
+            | OmicronZoneType::InternalNtp { .. }
+            | OmicronZoneType::Nexus { .. }
+            | OmicronZoneType::Oximeter { .. }
+            | OmicronZoneType::CruciblePantry { .. } => None,
+            OmicronZoneType::Clickhouse { dataset, .. } => {
+                Some((dataset, DatasetKind::Clickhouse))
+            }
+            OmicronZoneType::ClickhouseKeeper { dataset, .. } => {
+                Some((dataset, DatasetKind::ClickhouseKeeper))
+            }
+            OmicronZoneType::ClickhouseServer { dataset, .. } => {
+                Some((dataset, DatasetKind::ClickhouseServer))
+            }
+            OmicronZoneType::CockroachDb { dataset, .. } => {
+                Some((dataset, DatasetKind::Cockroach))
+            }
+            OmicronZoneType::Crucible { dataset, .. } => {
+                Some((dataset, DatasetKind::Crucible))
+            }
+            OmicronZoneType::ExternalDns { dataset, .. } => {
+                Some((dataset, DatasetKind::ExternalDns))
+            }
+            OmicronZoneType::InternalDns { dataset, .. } => {
+                Some((dataset, DatasetKind::InternalDns))
+            }
+        }?;
+
+        Some(DatasetName::new(dataset.pool_name, dataset_kind))
+    }
 }
 
 /// Like [`OmicronZoneType`], but without any associated data.
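
For callers, the new helpers mean the question "does this zone have a durable dataset, and what is it called?" no longer requires matching on `OmicronZoneType` variants directly. A minimal illustration follows; `describe_zone_dataset` is a hypothetical function, not part of this commit, and only uses the API shown in the diff above.

```rust
use nexus_sled_agent_shared::inventory::OmicronZoneConfig;

// Hypothetical helper: report a zone's durable dataset, if it has one.
fn describe_zone_dataset(zone: &OmicronZoneConfig) -> String {
    match zone.dataset_name() {
        // Clickhouse, CockroachDB, Crucible, and the DNS zones carry a
        // durable dataset named from their pool plus a kind-specific suffix.
        Some(name) => format!("durable dataset: {}", name.full_name()),
        // NTP, Nexus, Oximeter, and Crucible Pantry zones have none.
        None => String::from("no durable dataset"),
    }
}
```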

schema/all-zones-requests.json

Lines changed: 3 additions & 14 deletions
@@ -1,7 +1,7 @@
 {
   "$schema": "http://json-schema.org/draft-07/schema#",
   "title": "OmicronZonesConfigLocal",
-  "description": "Combines the Nexus-provided `OmicronZonesConfig` (which describes what Nexus wants for all of its zones) with the locally-determined configuration for these zones.",
+  "description": "Legacy type of the ledgered zone config.",
   "type": "object",
   "required": [
     "ledger_generation",
@@ -10,20 +10,10 @@
   ],
   "properties": {
     "ledger_generation": {
-      "description": "ledger-managed generation number\n\nThis generation is managed by the ledger facility itself. It's bumped whenever we write a new ledger. In practice, we don't currently have any reason to bump this _for a given Omicron generation_ so it's somewhat redundant. In principle, if we needed to modify the ledgered configuration due to some event that doesn't change the Omicron config (e.g., if we wanted to move the root filesystem to a different path), we could do that by bumping this generation.",
-      "allOf": [
-        {
-          "$ref": "#/definitions/Generation"
-        }
-      ]
+      "$ref": "#/definitions/Generation"
     },
     "omicron_generation": {
-      "description": "generation of the Omicron-provided part of the configuration\n\nThis generation number is outside of Sled Agent's control. We store exactly what we were given and use this number to decide when to fail requests to establish an outdated configuration.\n\nYou can think of this as a major version number, with `ledger_generation` being a minor version number. See `is_newer_than()`.",
-      "allOf": [
-        {
-          "$ref": "#/definitions/Generation"
-        }
-      ]
+      "$ref": "#/definitions/Generation"
     },
     "zones": {
       "type": "array",
@@ -269,7 +259,6 @@
       }
     },
     "OmicronZoneConfigLocal": {
-      "description": "Combines the Nexus-provided `OmicronZoneConfig` (which describes what Nexus wants for this zone) with any locally-determined configuration (like the path to the root filesystem)",
       "type": "object",
       "required": [
         "root",

sled-agent/config-reconciler/Cargo.toml

Lines changed: 1 addition & 0 deletions
@@ -45,6 +45,7 @@ expectorate.workspace = true
 illumos-utils = { workspace = true, features = ["testing"] }
 omicron-test-utils.workspace = true
 proptest.workspace = true
+schemars.workspace = true
 scopeguard.workspace = true
 serde_json.workspace = true
 sled-storage = { workspace = true, features = ["testing"] }

sled-agent/config-reconciler/src/dataset_serialization_task.rs

Lines changed: 8 additions & 0 deletions
@@ -175,6 +175,14 @@ impl IdMappable for DatasetEnsureResult {
 pub(crate) struct DatasetTaskHandle(mpsc::Sender<DatasetTaskRequest>);
 
 impl DatasetTaskHandle {
+    // For testing, create a handle on which requests will always fail with a
+    // `DatasetTaskError`.
+    #[cfg(any(test, feature = "testing"))]
+    pub(crate) fn spawn_noop() -> Self {
+        let (tx, _rx) = mpsc::channel(1);
+        Self(tx)
+    }
+
     pub fn spawn_dataset_task(
         mount_config: Arc<MountConfig>,
         currently_managed_zpools_rx: CurrentlyManagedZpoolsReceiver,

sled-agent/config-reconciler/src/ledger/legacy_configs.rs

Lines changed: 12 additions & 0 deletions
@@ -218,6 +218,7 @@ fn merge_old_configs(
 
 /// Legacy type of the ledgered zone config.
 #[derive(Debug, Clone, Deserialize, Serialize)]
+#[cfg_attr(test, derive(schemars::JsonSchema))]
 struct OmicronZonesConfigLocal {
     omicron_generation: Generation,
     ledger_generation: Generation,
@@ -237,9 +238,11 @@
 }
 
 #[derive(Debug, Clone, Deserialize, Serialize)]
+#[cfg_attr(test, derive(schemars::JsonSchema))]
 struct OmicronZoneConfigLocal {
     zone: OmicronZoneConfig,
     #[serde(rename = "root")]
+    #[cfg_attr(test, schemars(with = "String"))]
     _root: Utf8PathBuf,
 }
 
@@ -264,6 +267,15 @@ pub(super) mod tests {
     const MERGED_CONFIG_PATH: &str =
        "test-data/expectorate/merged-sled-config.json";
 
+    #[test]
+    fn test_old_config_schema() {
+        let schema = schemars::schema_for!(OmicronZonesConfigLocal);
+        expectorate::assert_contents(
+            "../../schema/all-zones-requests.json",
+            &serde_json::to_string_pretty(&schema).unwrap(),
+        );
+    }
+
     #[test]
     fn test_merge_old_configs() {
         let disks: OmicronPhysicalDisksConfig = {

sled-agent/config-reconciler/src/reconciler_task.rs

Lines changed: 1 addition & 3 deletions
@@ -458,14 +458,12 @@ impl ReconcilerTask {
             // the old instance.
             match zone_shutdown_result {
                 Ok(()) => {
-                    let currently_managed_zpools =
-                        self.external_disks.currently_managed_zpools();
                     self.zones
                         .start_zones_if_needed(
                             &sled_config.zones,
                             sled_agent_facilities,
                             timesync_status.is_synchronized(),
-                            &currently_managed_zpools,
+                            &self.datasets,
                             &self.log,
                         )
                         .await;

sled-agent/config-reconciler/src/reconciler_task/datasets.rs

Lines changed: 103 additions & 0 deletions
@@ -16,26 +16,129 @@ use crate::dataset_serialization_task::DatasetEnsureResult;
 use crate::dataset_serialization_task::DatasetTaskHandle;
 use id_map::IdMap;
 use id_map::IdMappable;
+use illumos_utils::zpool::PathInPool;
+use illumos_utils::zpool::ZpoolOrRamdisk;
 use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult;
+use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
 use omicron_common::disk::DatasetConfig;
+use omicron_common::disk::DatasetKind;
+use omicron_common::disk::DatasetName;
 use omicron_uuid_kinds::DatasetUuid;
+use sled_storage::config::MountConfig;
+use sled_storage::dataset::ZONE_DATASET;
 use slog::Logger;
 use slog::warn;
 use slog_error_chain::InlineErrorChain;
 use std::collections::BTreeMap;
 use std::sync::Arc;
 
+#[derive(Debug, thiserror::Error)]
+pub(super) enum ZoneDatasetDependencyError {
+    #[error("zone config is missing a filesystem pool")]
+    MissingFilesystemPool,
+    #[error(
+        "zone's transient root dataset is not available: {}", .0.full_name(),
+    )]
+    TransientZoneDatasetNotAvailable(DatasetName),
+    #[error("zone's durable dataset is not available: {}", .0.full_name())]
+    DurableDatasetNotAvailable(DatasetName),
+}
+
 #[derive(Debug)]
 pub(super) struct OmicronDatasets {
     datasets: IdMap<OmicronDataset>,
     dataset_task: DatasetTaskHandle,
 }
 
 impl OmicronDatasets {
+    #[cfg(any(test, feature = "testing"))]
+    pub(super) fn with_datasets<I>(datasets: I) -> Self
+    where
+        I: Iterator<Item = (DatasetConfig, Result<(), DatasetEnsureError>)>,
+    {
+        let dataset_task = DatasetTaskHandle::spawn_noop();
+        let datasets = datasets
+            .map(|(config, result)| OmicronDataset {
+                config,
+                state: match result {
+                    Ok(()) => DatasetState::Ensured,
+                    Err(err) => DatasetState::FailedToEnsure(Arc::new(err)),
+                },
+            })
+            .collect();
+        Self { datasets, dataset_task }
+    }
+
     pub(super) fn new(dataset_task: DatasetTaskHandle) -> Self {
         Self { datasets: IdMap::default(), dataset_task }
     }
 
+    /// Confirm that any dataset dependencies of `zone` have been ensured
+    /// successfully, returning the path for the zone's filesystem root.
+    pub(super) fn validate_zone_storage(
+        &self,
+        zone: &OmicronZoneConfig,
+        mount_config: &MountConfig,
+    ) -> Result<PathInPool, ZoneDatasetDependencyError> {
+        // Confirm that there's an ensured `TransientZoneRoot` dataset on this
+        // zone's filesystem pool.
+        let Some(filesystem_pool) = zone.filesystem_pool else {
+            // This should never happen: Reconfigurator guarantees all zones
+            // have filesystem pools. `filesystem_pool` is non-optional in
+            // blueprints; we should make it non-optional in `OmicronZoneConfig`
+            // too: https://github.com/oxidecomputer/omicron/issues/8216
+            return Err(ZoneDatasetDependencyError::MissingFilesystemPool);
+        };
+
+        let transient_dataset_name = DatasetName::new(
+            filesystem_pool,
+            DatasetKind::TransientZone { name: zone.zone_name() },
+        );
+
+        // TODO-cleanup It would be nicer if the zone included the filesystem
+        // dataset ID, so we could just do a lookup here instead of searching.
+        // https://github.com/oxidecomputer/omicron/issues/7214
+        if !self.datasets.iter().any(|d| {
+            matches!(d.state, DatasetState::Ensured)
+                && d.config.name == transient_dataset_name
+        }) {
+            return Err(
+                ZoneDatasetDependencyError::TransientZoneDatasetNotAvailable(
+                    transient_dataset_name,
+                ),
+            );
+        }
+
+        let zone_root_path = PathInPool {
+            pool: ZpoolOrRamdisk::Zpool(filesystem_pool),
+            // TODO-cleanup Should we get this path from the dataset we found
+            // above?
+            path: filesystem_pool
+                .dataset_mountpoint(&mount_config.root, ZONE_DATASET),
+        };
+
+        // Confirm that the durable dataset for this zone has been ensured, if
+        // it has one.
+        let Some(durable_dataset) = zone.dataset_name() else {
+            return Ok(zone_root_path);
+        };
+
+        // TODO-cleanup As above, if we had an ID we could look up instead of
+        // searching.
+        if !self.datasets.iter().any(|d| {
+            matches!(d.state, DatasetState::Ensured)
+                && d.config.name == durable_dataset
+        }) {
+            return Err(
+                ZoneDatasetDependencyError::DurableDatasetNotAvailable(
+                    durable_dataset,
+                ),
+            );
+        }
+
+        Ok(zone_root_path)
+    }
+
     pub(super) fn remove_datasets_if_needed(
         &mut self,
         datasets: &IdMap<DatasetConfig>,
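
The transient-root dependency that `validate_zone_storage` looks for is keyed entirely by data already present on the zone config. The standalone helper below is hypothetical, but it mirrors the construction used inside the method, shown here to make the dependency explicit.

```rust
use nexus_sled_agent_shared::inventory::OmicronZoneConfig;
use omicron_common::disk::{DatasetKind, DatasetName};

// Hypothetical helper mirroring validate_zone_storage(): the transient
// zone-root dataset a zone depends on lives on the zone's `filesystem_pool`
// and is a `TransientZone` dataset named after the zone itself.
fn transient_zone_root_name(zone: &OmicronZoneConfig) -> Option<DatasetName> {
    // Returns `None` only for legacy configs missing a filesystem pool,
    // which validate_zone_storage() treats as an error.
    let filesystem_pool = zone.filesystem_pool?;
    Some(DatasetName::new(
        filesystem_pool,
        DatasetKind::TransientZone { name: zone.zone_name() },
    ))
}
```

If either that transient-root dataset or the zone's durable dataset is absent from the reconciled set, `validate_zone_storage` returns an error and the zone is not started during that reconciliation pass.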
