Skip to content

Commit fe60eb9

Browse files
authored
reconfigurator: Ensure all durable datasets, not just crucible (#6065)
Reconfigurator currently ensures a `Dataset` row exists for all crucible zones, but fails to do so for other zones with durable datasets (notably Cockroach, which we now support adding and expunging!). This PR fixes that. I think we also need reconfigurator to learn how to _delete_ dataset rows for expunged zones, right? That will be a followup PR.
1 parent bedb238 commit fe60eb9

File tree

8 files changed

+242
-142
lines changed

8 files changed

+242
-142
lines changed

nexus/reconfigurator/execution/src/cockroachdb.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,9 @@ mod test {
8888
settings.preserve_downgrade,
8989
CockroachDbClusterVersion::NEWLY_INITIALIZED.to_string()
9090
);
91+
// Record the zpools so we don't fail to ensure datasets (unrelated to
92+
// crdb settings) during blueprint execution.
93+
crate::tests::insert_zpool_records(datastore, &opctx, &blueprint).await;
9194
// Execute the initial blueprint.
9295
let overrides = Overridables::for_test(cptestctx);
9396
crate::realize_blueprint_with_overrides(

nexus/reconfigurator/execution/src/datasets.rs

Lines changed: 89 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -6,41 +6,38 @@
66
77
use anyhow::Context;
88
use nexus_db_model::Dataset;
9-
use nexus_db_model::DatasetKind;
109
use nexus_db_queries::context::OpContext;
1110
use nexus_db_queries::db::DataStore;
12-
use nexus_types::deployment::blueprint_zone_type;
1311
use nexus_types::deployment::BlueprintZoneConfig;
14-
use nexus_types::deployment::BlueprintZoneType;
12+
use nexus_types::deployment::DurableDataset;
1513
use nexus_types::identity::Asset;
1614
use omicron_uuid_kinds::GenericUuid;
1715
use omicron_uuid_kinds::OmicronZoneUuid;
1816
use slog::info;
1917
use slog::warn;
2018
use std::collections::BTreeSet;
2119

22-
/// For each crucible zone in `all_omicron_zones`, ensure that a corresponding
23-
/// dataset record exists in `datastore`
20+
/// For each zone in `all_omicron_zones` that has an associated durable dataset,
21+
/// ensure that a corresponding dataset record exists in `datastore`.
2422
///
2523
/// Does not modify any existing dataset records. Returns the number of
2624
/// datasets inserted.
27-
pub(crate) async fn ensure_crucible_dataset_records_exist(
25+
pub(crate) async fn ensure_dataset_records_exist(
2826
opctx: &OpContext,
2927
datastore: &DataStore,
3028
all_omicron_zones: impl Iterator<Item = &BlueprintZoneConfig>,
3129
) -> anyhow::Result<usize> {
3230
// Before attempting to insert any datasets, first query for any existing
3331
// dataset records so we can filter them out. This looks like a typical
3432
// TOCTOU issue, but it is purely a performance optimization. We expect
35-
// almost all executions of this function to do nothing: new crucible
36-
// datasets are created very rarely relative to how frequently blueprint
37-
// realization happens. We could remove this check and filter and instead
38-
// run the below "insert if not exists" query on every crucible zone, and
39-
// the behavior would still be correct. However, that would issue far more
40-
// queries than necessary in the very common case of "we don't need to do
41-
// anything at all".
42-
let mut crucible_datasets = datastore
43-
.dataset_list_all_batched(opctx, Some(DatasetKind::Crucible))
33+
// almost all executions of this function to do nothing: new datasets are
34+
// created very rarely relative to how frequently blueprint realization
35+
// happens. We could remove this check and filter and instead run the below
36+
// "insert if not exists" query on every zone, and the behavior would still
37+
// be correct. However, that would issue far more queries than necessary in
38+
// the very common case of "we don't need to do anything at all".
39+
let mut existing_datasets = datastore
40+
.dataset_list_all_batched(opctx, None)
4441
.await
4542
.context("failed to list all datasets")?
4643
.into_iter()
@@ -51,18 +48,16 @@ pub(crate) async fn ensure_crucible_dataset_records_exist(
5148
let mut num_already_exist = 0;
5249

5350
for zone in all_omicron_zones {
54-
let BlueprintZoneType::Crucible(blueprint_zone_type::Crucible {
55-
address,
56-
dataset,
57-
}) = &zone.zone_type
51+
let Some(DurableDataset { dataset, kind, address }) =
52+
zone.zone_type.durable_dataset()
5853
else {
5954
continue;
6055
};
6156

6257
let id = zone.id;
6358

6459
// If already present in the datastore, move on.
65-
if crucible_datasets.remove(&id) {
60+
if existing_datasets.remove(&id) {
6661
num_already_exist += 1;
6762
continue;
6863
}
@@ -71,8 +66,8 @@ pub(crate) async fn ensure_crucible_dataset_records_exist(
7166
let dataset = Dataset::new(
7267
id.into_untyped_uuid(),
7368
pool_id.into_untyped_uuid(),
74-
*address,
75-
DatasetKind::Crucible,
69+
address,
70+
kind.into(),
7671
);
7772
let maybe_inserted = datastore
7873
.dataset_insert_if_not_exists(dataset)
@@ -87,8 +82,9 @@ pub(crate) async fn ensure_crucible_dataset_records_exist(
8782
if maybe_inserted.is_some() {
8883
info!(
8984
opctx.log,
90-
"inserted new dataset for crucible zone";
85+
"inserted new dataset for Omicron zone";
9186
"id" => %id,
87+
"kind" => ?kind,
9288
);
9389
num_inserted += 1;
9490
} else {
@@ -99,18 +95,18 @@ pub(crate) async fn ensure_crucible_dataset_records_exist(
9995
// We don't currently support removing datasets, so this would be
10096
// surprising: the database contains dataset records that are no longer in
10197
// our blueprint. We can't do anything about this, so just warn.
102-
if !crucible_datasets.is_empty() {
98+
if !existing_datasets.is_empty() {
10399
warn!(
104100
opctx.log,
105-
"database contains {} unexpected crucible datasets",
106-
crucible_datasets.len();
107-
"dataset_ids" => ?crucible_datasets,
101+
"database contains {} unexpected datasets",
102+
existing_datasets.len();
103+
"dataset_ids" => ?existing_datasets,
108104
);
109105
}
110106

111107
info!(
112108
opctx.log,
113-
"ensured all crucible zones have dataset records";
109+
"ensured all Omicron zones have dataset records";
114110
"num_inserted" => num_inserted,
115111
"num_already_existed" => num_already_exist,
116112
);
@@ -121,30 +117,27 @@ pub(crate) async fn ensure_crucible_dataset_records_exist(
121117
#[cfg(test)]
122118
mod tests {
123119
use super::*;
124-
use nexus_db_model::Generation;
125-
use nexus_db_model::SledBaseboard;
126-
use nexus_db_model::SledSystemHardware;
127-
use nexus_db_model::SledUpdate;
128120
use nexus_db_model::Zpool;
129121
use nexus_reconfigurator_planning::example::example;
130122
use nexus_test_utils_macros::nexus_test;
123+
use nexus_types::deployment::blueprint_zone_type;
131124
use nexus_types::deployment::BlueprintZoneDisposition;
132125
use nexus_types::deployment::BlueprintZoneFilter;
126+
use nexus_types::deployment::BlueprintZoneType;
133127
use omicron_common::zpool_name::ZpoolName;
134128
use omicron_uuid_kinds::GenericUuid;
135129
use omicron_uuid_kinds::ZpoolUuid;
136130
use sled_agent_client::types::OmicronZoneDataset;
137-
use sled_agent_client::types::OmicronZoneType;
138131
use uuid::Uuid;
139132

140133
type ControlPlaneTestContext =
141134
nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
142135

143136
#[nexus_test]
144-
async fn test_ensure_crucible_dataset_records_exist(
137+
async fn test_ensure_dataset_records_exist(
145138
cptestctx: &ControlPlaneTestContext,
146139
) {
147-
const TEST_NAME: &str = "test_ensure_crucible_dataset_records_exist";
140+
const TEST_NAME: &str = "test_ensure_dataset_records_exist";
148141

149142
// Set up.
150143
let nexus = &cptestctx.server.server_context().nexus;
@@ -158,55 +151,14 @@ mod tests {
158151
// Use the standard example system.
159152
let (collection, _, blueprint) = example(&opctx.log, TEST_NAME, 5);
160153

161-
// Record the sleds and zpools contained in this collection.
162-
let rack_id = Uuid::new_v4();
163-
for (&sled_id, config) in &collection.omicron_zones {
164-
let sled = SledUpdate::new(
165-
sled_id.into_untyped_uuid(),
166-
"[::1]:0".parse().unwrap(),
167-
SledBaseboard {
168-
serial_number: format!("test-{sled_id}"),
169-
part_number: "test-sled".to_string(),
170-
revision: 0,
171-
},
172-
SledSystemHardware {
173-
is_scrimlet: false,
174-
usable_hardware_threads: 128,
175-
usable_physical_ram: (64 << 30).try_into().unwrap(),
176-
reservoir_size: (16 << 30).try_into().unwrap(),
177-
},
178-
rack_id,
179-
Generation::new(),
180-
);
181-
datastore.sled_upsert(sled).await.expect("failed to upsert sled");
182-
183-
for zone in &config.zones.zones {
184-
let OmicronZoneType::Crucible { dataset, .. } = &zone.zone_type
185-
else {
186-
continue;
187-
};
188-
let zpool = Zpool::new(
189-
dataset.pool_name.id().into_untyped_uuid(),
190-
sled_id.into_untyped_uuid(),
191-
Uuid::new_v4(), // physical_disk_id
192-
);
193-
datastore
194-
.zpool_insert(opctx, zpool)
195-
.await
196-
.expect("failed to upsert zpool");
197-
}
198-
}
199-
200-
// How many crucible zones are there?
201-
let ncrucible_zones = collection
202-
.all_omicron_zones()
203-
.filter(|z| matches!(z.zone_type, OmicronZoneType::Crucible { .. }))
204-
.count();
154+
// Record the sleds and zpools.
155+
crate::tests::insert_sled_records(datastore, &blueprint).await;
156+
crate::tests::insert_zpool_records(datastore, opctx, &blueprint).await;
205157

206158
// Prior to ensuring datasets exist, there should be none.
207159
assert_eq!(
208160
datastore
209-
.dataset_list_all_batched(opctx, Some(DatasetKind::Crucible))
161+
.dataset_list_all_batched(opctx, None)
210162
.await
211163
.unwrap()
212164
.len(),
@@ -219,46 +171,52 @@ mod tests {
219171
.map(|(_, zone)| zone)
220172
.collect::<Vec<_>>();
221173

222-
let ndatasets_inserted = ensure_crucible_dataset_records_exist(
174+
// How many zones are there with durable datasets?
175+
let nzones_with_durable_datasets = all_omicron_zones
176+
.iter()
177+
.filter(|z| z.zone_type.durable_dataset().is_some())
178+
.count();
179+
180+
let ndatasets_inserted = ensure_dataset_records_exist(
223181
opctx,
224182
datastore,
225183
all_omicron_zones.iter().copied(),
226184
)
227185
.await
228-
.expect("failed to ensure crucible datasets");
186+
.expect("failed to ensure datasets");
229187

230-
// We should have inserted a dataset for each crucible zone.
231-
assert_eq!(ncrucible_zones, ndatasets_inserted);
188+
// We should have inserted a dataset for each zone with a durable
189+
// dataset.
190+
assert_eq!(nzones_with_durable_datasets, ndatasets_inserted);
232191
assert_eq!(
233192
datastore
234-
.dataset_list_all_batched(opctx, Some(DatasetKind::Crucible))
193+
.dataset_list_all_batched(opctx, None)
235194
.await
236195
.unwrap()
237196
.len(),
238-
ncrucible_zones,
197+
nzones_with_durable_datasets,
239198
);
240199

241-
// Ensuring the same crucible datasets again should insert no new
242-
// records.
243-
let ndatasets_inserted = ensure_crucible_dataset_records_exist(
200+
// Ensuring the same datasets again should insert no new records.
201+
let ndatasets_inserted = ensure_dataset_records_exist(
244202
opctx,
245203
datastore,
246204
all_omicron_zones.iter().copied(),
247205
)
248206
.await
249-
.expect("failed to ensure crucible datasets");
207+
.expect("failed to ensure datasets");
250208
assert_eq!(0, ndatasets_inserted);
251209
assert_eq!(
252210
datastore
253-
.dataset_list_all_batched(opctx, Some(DatasetKind::Crucible))
211+
.dataset_list_all_batched(opctx, None)
254212
.await
255213
.unwrap()
256214
.len(),
257-
ncrucible_zones,
215+
nzones_with_durable_datasets,
258216
);
259217

260-
// Create another zpool on one of the sleds, so we can add a new
261-
// crucible zone that uses it.
218+
// Create another zpool on one of the sleds, so we can add new
219+
// zones that use it.
262220
let new_zpool_id = ZpoolUuid::new_v4();
263221
for &sled_id in collection.omicron_zones.keys().take(1) {
264222
let zpool = Zpool::new(
@@ -272,37 +230,53 @@ mod tests {
272230
.expect("failed to upsert zpool");
273231
}
274232

275-
// Call `ensure_crucible_dataset_records_exist` again, adding a new
276-
// crucible zone. It should insert only this new zone.
277-
let new_zone = BlueprintZoneConfig {
278-
disposition: BlueprintZoneDisposition::InService,
279-
id: OmicronZoneUuid::new_v4(),
280-
underlay_address: "::1".parse().unwrap(),
281-
filesystem_pool: Some(ZpoolName::new_external(new_zpool_id)),
282-
zone_type: BlueprintZoneType::Crucible(
283-
blueprint_zone_type::Crucible {
284-
address: "[::1]:0".parse().unwrap(),
285-
dataset: OmicronZoneDataset {
286-
pool_name: ZpoolName::new_external(new_zpool_id),
233+
// Call `ensure_dataset_records_exist` again, adding new crucible and
234+
// cockroach zones. It should insert only these new zones.
235+
let new_zones = [
236+
BlueprintZoneConfig {
237+
disposition: BlueprintZoneDisposition::InService,
238+
id: OmicronZoneUuid::new_v4(),
239+
underlay_address: "::1".parse().unwrap(),
240+
filesystem_pool: Some(ZpoolName::new_external(new_zpool_id)),
241+
zone_type: BlueprintZoneType::Crucible(
242+
blueprint_zone_type::Crucible {
243+
address: "[::1]:0".parse().unwrap(),
244+
dataset: OmicronZoneDataset {
245+
pool_name: ZpoolName::new_external(new_zpool_id),
246+
},
287247
},
288-
},
289-
),
290-
};
291-
let ndatasets_inserted = ensure_crucible_dataset_records_exist(
248+
),
249+
},
250+
BlueprintZoneConfig {
251+
disposition: BlueprintZoneDisposition::InService,
252+
id: OmicronZoneUuid::new_v4(),
253+
underlay_address: "::1".parse().unwrap(),
254+
filesystem_pool: Some(ZpoolName::new_external(new_zpool_id)),
255+
zone_type: BlueprintZoneType::CockroachDb(
256+
blueprint_zone_type::CockroachDb {
257+
address: "[::1]:0".parse().unwrap(),
258+
dataset: OmicronZoneDataset {
259+
pool_name: ZpoolName::new_external(new_zpool_id),
260+
},
261+
},
262+
),
263+
},
264+
];
265+
let ndatasets_inserted = ensure_dataset_records_exist(
292266
opctx,
293267
datastore,
294-
all_omicron_zones.iter().copied().chain(std::iter::once(&new_zone)),
268+
all_omicron_zones.iter().copied().chain(&new_zones),
295269
)
296270
.await
297-
.expect("failed to ensure crucible datasets");
298-
assert_eq!(ndatasets_inserted, 1);
271+
.expect("failed to ensure datasets");
272+
assert_eq!(ndatasets_inserted, 2);
299273
assert_eq!(
300274
datastore
301-
.dataset_list_all_batched(opctx, Some(DatasetKind::Crucible))
275+
.dataset_list_all_batched(opctx, None)
302276
.await
303277
.unwrap()
304278
.len(),
305-
ncrucible_zones + 1,
279+
nzones_with_durable_datasets + 2,
306280
);
307281
}
308282
}

nexus/reconfigurator/execution/src/dns.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1180,6 +1180,10 @@ mod test {
11801180
blueprint.cockroachdb_setting_preserve_downgrade =
11811181
CockroachDbPreserveDowngrade::DoNotModify;
11821182

1183+
// Record the zpools so we don't fail to ensure datasets (unrelated to
1184+
// DNS) during blueprint execution.
1185+
crate::tests::insert_zpool_records(datastore, &opctx, &blueprint).await;
1186+
11831187
// Now, execute the initial blueprint.
11841188
let overrides = Overridables::for_test(cptestctx);
11851189
crate::realize_blueprint_with_overrides(

0 commit comments

Comments
 (0)