Skip to content

Commit 8443ae8

Browse files
committed
fix failing planner tests
1 parent 7c0451c commit 8443ae8

File tree

1 file changed

+120
-66
lines changed

1 file changed

+120
-66
lines changed

nexus/reconfigurator/planning/src/planner.rs

Lines changed: 120 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -420,23 +420,37 @@ impl<'a> Planner<'a> {
420420
.blueprint
421421
.current_sled_zones(sled_id, BlueprintZoneDisposition::is_expunged)
422422
{
423-
match zone.disposition {
423+
// If this is a zone still waiting for cleanup, what generation of
424+
// sled config was it expunged?
425+
let as_of_generation = match zone.disposition {
424426
BlueprintZoneDisposition::Expunged {
425427
as_of_generation,
426428
ready_for_cleanup,
427-
} if !ready_for_cleanup => {
428-
if let Some(reconciled) = &sled_inv.last_reconciliation {
429-
if reconciled.last_reconciled_config.generation
430-
>= as_of_generation
431-
&& !reconciled.zones.contains_key(&zone.id)
432-
{
433-
zones_ready_for_cleanup.push(zone.id);
434-
}
435-
}
436-
}
429+
} if !ready_for_cleanup => as_of_generation,
437430
BlueprintZoneDisposition::InService
438431
| BlueprintZoneDisposition::Expunged { .. } => continue,
432+
};
433+
434+
// If the sled hasn't done any reconciliation, wait until it has.
435+
let Some(reconciliation) = &sled_inv.last_reconciliation else {
436+
continue;
437+
};
438+
439+
// If the sled hasn't reconciled a new-enough generation, wait until
440+
// it has.
441+
if reconciliation.last_reconciled_config.generation
442+
< as_of_generation
443+
{
444+
continue;
439445
}
446+
447+
// If the sled hasn't shut down the zone, wait until it has.
448+
if reconciliation.zones.contains_key(&zone.id) {
449+
continue;
450+
}
451+
452+
// All checks passed: we can mark this zone as ready for cleanup.
453+
zones_ready_for_cleanup.push(zone.id);
440454
}
441455

442456
if !zones_ready_for_cleanup.is_empty() {
@@ -1011,6 +1025,8 @@ pub(crate) mod test {
10111025
use clickhouse_admin_types::ClickhouseKeeperClusterMembership;
10121026
use clickhouse_admin_types::KeeperId;
10131027
use expectorate::assert_contents;
1028+
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventory;
1029+
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryResult;
10141030
use nexus_types::deployment::BlueprintDatasetDisposition;
10151031
use nexus_types::deployment::BlueprintDiffSummary;
10161032
use nexus_types::deployment::BlueprintPhysicalDiskDisposition;
@@ -2133,9 +2149,10 @@ pub(crate) mod test {
21332149
.sled_agents
21342150
.get_mut(&sled_id)
21352151
.unwrap()
2136-
.ledgered_sled_config
2152+
.last_reconciliation
21372153
.as_mut()
21382154
.unwrap()
2155+
.last_reconciled_config
21392156
.generation = Generation::from_u32(3);
21402157

21412158
let blueprint3 = Planner::new_based_on(
@@ -4005,6 +4022,14 @@ pub(crate) mod test {
40054022
// Use our example system.
40064023
let (mut collection, input, blueprint1) = example(&log, TEST_NAME);
40074024

4025+
// Don't start more internal DNS zones (which the planner would, as a
4026+
// side effect of our test details).
4027+
let input = {
4028+
let mut builder = input.into_builder();
4029+
builder.policy_mut().target_internal_dns_zone_count = 0;
4030+
builder.build()
4031+
};
4032+
40084033
// Find a Nexus zone we'll use for our test.
40094034
let (sled_id, nexus_config) = blueprint1
40104035
.sleds
@@ -4037,7 +4062,7 @@ pub(crate) mod test {
40374062
// Run the planner. It should expunge all zones on the disk we just
40384063
// expunged, including our Nexus zone, but not mark them as ready for
40394064
// cleanup yet.
4040-
let blueprint2 = Planner::new_based_on(
4065+
let mut blueprint2 = Planner::new_based_on(
40414066
logctx.log.clone(),
40424067
&blueprint1,
40434068
&input,
@@ -4049,6 +4074,27 @@ pub(crate) mod test {
40494074
.plan()
40504075
.expect("planned");
40514076

4077+
// Mark the disk we expected as "ready for cleanup"; this isn't what
4078+
// we're testing, and failing to do this will interfere with some of the
4079+
// checks we do below.
4080+
for mut disk in
4081+
blueprint2.sleds.get_mut(&sled_id).unwrap().disks.iter_mut()
4082+
{
4083+
match disk.disposition {
4084+
BlueprintPhysicalDiskDisposition::InService => (),
4085+
BlueprintPhysicalDiskDisposition::Expunged {
4086+
as_of_generation,
4087+
..
4088+
} => {
4089+
disk.disposition =
4090+
BlueprintPhysicalDiskDisposition::Expunged {
4091+
as_of_generation,
4092+
ready_for_cleanup: true,
4093+
};
4094+
}
4095+
}
4096+
}
4097+
40524098
// Helper to extract the Nexus zone's disposition in a blueprint.
40534099
let get_nexus_disposition = |bp: &Blueprint| {
40544100
bp.sleds.get(&sled_id).unwrap().zones.iter().find_map(|z| {
@@ -4057,33 +4103,34 @@ pub(crate) mod test {
40574103
};
40584104

40594105
// This sled's config generation should have been bumped...
4060-
let bp2_generation =
4061-
blueprint2.sleds.get(&sled_id).unwrap().sled_agent_generation;
4106+
let bp2_config = blueprint2.sleds.get(&sled_id).unwrap().clone();
4107+
let bp2_sled_config = bp2_config.clone().into_in_service_sled_config();
40624108
assert_eq!(
40634109
blueprint1
40644110
.sleds
40654111
.get(&sled_id)
40664112
.unwrap()
40674113
.sled_agent_generation
40684114
.next(),
4069-
bp2_generation
4115+
bp2_sled_config.generation
40704116
);
40714117
// ... and the Nexus should should have the disposition we expect.
40724118
assert_eq!(
40734119
get_nexus_disposition(&blueprint2),
40744120
Some(BlueprintZoneDisposition::Expunged {
4075-
as_of_generation: bp2_generation,
4121+
as_of_generation: bp2_sled_config.generation,
40764122
ready_for_cleanup: false,
40774123
})
40784124
);
40794125

40804126
// Running the planner again should make no changes until the inventory
4081-
// reports that the zone is not running and that the sled has seen a
4082-
// new-enough generation. Try three variants that should do nothing:
4127+
// reports that the zone is not running and that the sled has reconciled
4128+
// a new-enough generation. Try these variants:
40834129
//
4084-
// * same inventory as above
4085-
// * inventory reports a new generation (but zone still running)
4086-
// * inventory reports zone not running (but still the old generation)
4130+
// * same inventory as above (expect no changes)
4131+
// * new config is ledgered but not reconciled (expect no changes)
4132+
// * new config is reconciled, but zone is in an error state (expect
4133+
// no changes)
40874134
eprintln!("planning with no inventory change...");
40884135
assert_planning_makes_no_changes(
40894136
&logctx.log,
@@ -4092,15 +4139,7 @@ pub(crate) mod test {
40924139
&collection,
40934140
TEST_NAME,
40944141
);
4095-
// TODO-cleanup These checks depend on `last_reconciled_config`, which
4096-
// is not yet populated; uncomment these and check them by mutating
4097-
// `last_reconciled_config` once
4098-
// https://github.com/oxidecomputer/omicron/pull/8064 lands. We could
4099-
// just mutate `ledgered_sled_config` in the meantime (as this
4100-
// commented-out code does below), but that's not really checking what
4101-
// we care about.
4102-
/*
4103-
eprintln!("planning with generation bump but zone still running...");
4142+
eprintln!("planning with config ledgered but not reconciled...");
41044143
assert_planning_makes_no_changes(
41054144
&logctx.log,
41064145
&blueprint2,
@@ -4111,15 +4150,15 @@ pub(crate) mod test {
41114150
.sled_agents
41124151
.get_mut(&sled_id)
41134152
.unwrap()
4114-
.ledgered_sled_config
4115-
.as_mut()
4116-
.unwrap()
4117-
.generation = bp2_generation;
4153+
.ledgered_sled_config = Some(bp2_sled_config.clone());
41184154
collection
41194155
},
41204156
TEST_NAME,
41214157
);
4122-
eprintln!("planning with zone gone but generation not bumped...");
4158+
eprintln!(
4159+
"planning with config ledgered but \
4160+
zones failed to shut down..."
4161+
);
41234162
assert_planning_makes_no_changes(
41244163
&logctx.log,
41254164
&blueprint2,
@@ -4130,28 +4169,42 @@ pub(crate) mod test {
41304169
.sled_agents
41314170
.get_mut(&sled_id)
41324171
.unwrap()
4133-
.ledgered_sled_config
4134-
.as_mut()
4172+
.ledgered_sled_config = Some(bp2_sled_config.clone());
4173+
let mut reconciliation =
4174+
ConfigReconcilerInventory::debug_assume_success(
4175+
bp2_sled_config.clone(),
4176+
);
4177+
// For all the zones that are in bp2_config but not
4178+
// bp2_sled_config (i.e., zones that should have been shut
4179+
// down), insert an error result in the reconciliation.
4180+
for zone_id in bp2_config.zones.keys() {
4181+
if !reconciliation.zones.contains_key(zone_id) {
4182+
reconciliation.zones.insert(
4183+
*zone_id,
4184+
ConfigReconcilerInventoryResult::Err {
4185+
message: "failed to shut down".to_string(),
4186+
},
4187+
);
4188+
}
4189+
}
4190+
collection
4191+
.sled_agents
4192+
.get_mut(&sled_id)
41354193
.unwrap()
4136-
.zones
4137-
.retain(|z| z.id != nexus_config.id);
4194+
.last_reconciliation = Some(reconciliation);
41384195
collection
41394196
},
41404197
TEST_NAME,
41414198
);
4142-
*/
41434199

41444200
// Now make both changes to the inventory.
41454201
{
4146-
let config = &mut collection
4147-
.sled_agents
4148-
.get_mut(&sled_id)
4149-
.unwrap()
4150-
.ledgered_sled_config
4151-
.as_mut()
4152-
.unwrap();
4153-
config.generation = bp2_generation;
4154-
config.zones.retain(|z| z.id != nexus_config.id);
4202+
let config = collection.sled_agents.get_mut(&sled_id).unwrap();
4203+
config.ledgered_sled_config = Some(bp2_sled_config.clone());
4204+
config.last_reconciliation =
4205+
Some(ConfigReconcilerInventory::debug_assume_success(
4206+
bp2_sled_config.clone(),
4207+
));
41554208
}
41564209

41574210
// Run the planner. It mark our Nexus zone as ready for cleanup now that
@@ -4171,7 +4224,7 @@ pub(crate) mod test {
41714224
assert_eq!(
41724225
get_nexus_disposition(&blueprint3),
41734226
Some(BlueprintZoneDisposition::Expunged {
4174-
as_of_generation: bp2_generation,
4227+
as_of_generation: bp2_sled_config.generation,
41754228
ready_for_cleanup: true,
41764229
})
41774230
);
@@ -4180,7 +4233,7 @@ pub(crate) mod test {
41804233
// since it doesn't affect what's sent to sled-agent.
41814234
assert_eq!(
41824235
blueprint3.sleds.get(&sled_id).unwrap().sled_agent_generation,
4183-
bp2_generation
4236+
bp2_sled_config.generation
41844237
);
41854238

41864239
assert_planning_makes_no_changes(
@@ -4260,22 +4313,26 @@ pub(crate) mod test {
42604313
};
42614314

42624315
// This sled's config generation should have been bumped...
4263-
let bp2_generation =
4264-
blueprint2.sleds.get(&sled_id).unwrap().sled_agent_generation;
4316+
let bp2_config = blueprint2
4317+
.sleds
4318+
.get(&sled_id)
4319+
.unwrap()
4320+
.clone()
4321+
.into_in_service_sled_config();
42654322
assert_eq!(
42664323
blueprint1
42674324
.sleds
42684325
.get(&sled_id)
42694326
.unwrap()
42704327
.sled_agent_generation
42714328
.next(),
4272-
bp2_generation
4329+
bp2_config.generation
42734330
);
42744331
// ... and the DNS zone should should have the disposition we expect.
42754332
assert_eq!(
42764333
get_dns_disposition(&blueprint2),
42774334
Some(BlueprintZoneDisposition::Expunged {
4278-
as_of_generation: bp2_generation,
4335+
as_of_generation: bp2_config.generation,
42794336
ready_for_cleanup: false,
42804337
})
42814338
);
@@ -4293,15 +4350,12 @@ pub(crate) mod test {
42934350

42944351
// Make the inventory changes necessary for cleanup to proceed.
42954352
{
4296-
let config = &mut collection
4297-
.sled_agents
4298-
.get_mut(&sled_id)
4299-
.unwrap()
4300-
.ledgered_sled_config
4301-
.as_mut()
4302-
.unwrap();
4303-
config.generation = bp2_generation;
4304-
config.zones.retain(|z| z.id != internal_dns_config.id);
4353+
let config = &mut collection.sled_agents.get_mut(&sled_id).unwrap();
4354+
config.ledgered_sled_config = Some(bp2_config.clone());
4355+
config.last_reconciliation =
4356+
Some(ConfigReconcilerInventory::debug_assume_success(
4357+
bp2_config.clone(),
4358+
));
43054359
}
43064360

43074361
// Run the planner. It should mark our internal DNS zone as ready for
@@ -4323,7 +4377,7 @@ pub(crate) mod test {
43234377
assert_eq!(
43244378
get_dns_disposition(&blueprint3),
43254379
Some(BlueprintZoneDisposition::Expunged {
4326-
as_of_generation: bp2_generation,
4380+
as_of_generation: bp2_config.generation,
43274381
ready_for_cleanup: true,
43284382
})
43294383
);

0 commit comments

Comments
 (0)