@@ -420,23 +420,37 @@ impl<'a> Planner<'a> {
420420 . blueprint
421421 . current_sled_zones ( sled_id, BlueprintZoneDisposition :: is_expunged)
422422 {
423- match zone. disposition {
423+ // If this is a zone still waiting for cleanup, what generation of
424+ // sled config was it expunged?
425+ let as_of_generation = match zone. disposition {
424426 BlueprintZoneDisposition :: Expunged {
425427 as_of_generation,
426428 ready_for_cleanup,
427- } if !ready_for_cleanup => {
428- if let Some ( reconciled) = & sled_inv. last_reconciliation {
429- if reconciled. last_reconciled_config . generation
430- >= as_of_generation
431- && !reconciled. zones . contains_key ( & zone. id )
432- {
433- zones_ready_for_cleanup. push ( zone. id ) ;
434- }
435- }
436- }
429+ } if !ready_for_cleanup => as_of_generation,
437430 BlueprintZoneDisposition :: InService
438431 | BlueprintZoneDisposition :: Expunged { .. } => continue ,
432+ } ;
433+
434+ // If the sled hasn't done any reconciliation, wait until it has.
435+ let Some ( reconciliation) = & sled_inv. last_reconciliation else {
436+ continue ;
437+ } ;
438+
439+ // If the sled hasn't reconciled a new-enough generation, wait until
440+ // it has.
441+ if reconciliation. last_reconciled_config . generation
442+ < as_of_generation
443+ {
444+ continue ;
439445 }
446+
447+ // If the sled hasn't shut down the zone, wait until it has.
448+ if reconciliation. zones . contains_key ( & zone. id ) {
449+ continue ;
450+ }
451+
452+ // All checks passed: we can mark this zone as ready for cleanup.
453+ zones_ready_for_cleanup. push ( zone. id ) ;
440454 }
441455
442456 if !zones_ready_for_cleanup. is_empty ( ) {
@@ -1011,6 +1025,8 @@ pub(crate) mod test {
10111025 use clickhouse_admin_types:: ClickhouseKeeperClusterMembership ;
10121026 use clickhouse_admin_types:: KeeperId ;
10131027 use expectorate:: assert_contents;
1028+ use nexus_sled_agent_shared:: inventory:: ConfigReconcilerInventory ;
1029+ use nexus_sled_agent_shared:: inventory:: ConfigReconcilerInventoryResult ;
10141030 use nexus_types:: deployment:: BlueprintDatasetDisposition ;
10151031 use nexus_types:: deployment:: BlueprintDiffSummary ;
10161032 use nexus_types:: deployment:: BlueprintPhysicalDiskDisposition ;
@@ -2133,9 +2149,10 @@ pub(crate) mod test {
21332149 . sled_agents
21342150 . get_mut ( & sled_id)
21352151 . unwrap ( )
2136- . ledgered_sled_config
2152+ . last_reconciliation
21372153 . as_mut ( )
21382154 . unwrap ( )
2155+ . last_reconciled_config
21392156 . generation = Generation :: from_u32 ( 3 ) ;
21402157
21412158 let blueprint3 = Planner :: new_based_on (
@@ -4005,6 +4022,14 @@ pub(crate) mod test {
40054022 // Use our example system.
40064023 let ( mut collection, input, blueprint1) = example ( & log, TEST_NAME ) ;
40074024
4025+ // Don't start more internal DNS zones (which the planner would, as a
4026+ // side effect of our test details).
4027+ let input = {
4028+ let mut builder = input. into_builder ( ) ;
4029+ builder. policy_mut ( ) . target_internal_dns_zone_count = 0 ;
4030+ builder. build ( )
4031+ } ;
4032+
40084033 // Find a Nexus zone we'll use for our test.
40094034 let ( sled_id, nexus_config) = blueprint1
40104035 . sleds
@@ -4037,7 +4062,7 @@ pub(crate) mod test {
40374062 // Run the planner. It should expunge all zones on the disk we just
40384063 // expunged, including our Nexus zone, but not mark them as ready for
40394064 // cleanup yet.
4040- let blueprint2 = Planner :: new_based_on (
4065+ let mut blueprint2 = Planner :: new_based_on (
40414066 logctx. log . clone ( ) ,
40424067 & blueprint1,
40434068 & input,
@@ -4049,6 +4074,27 @@ pub(crate) mod test {
40494074 . plan ( )
40504075 . expect ( "planned" ) ;
40514076
4077+ // Mark the disk we expected as "ready for cleanup"; this isn't what
4078+ // we're testing, and failing to do this will interfere with some of the
4079+ // checks we do below.
4080+ for mut disk in
4081+ blueprint2. sleds . get_mut ( & sled_id) . unwrap ( ) . disks . iter_mut ( )
4082+ {
4083+ match disk. disposition {
4084+ BlueprintPhysicalDiskDisposition :: InService => ( ) ,
4085+ BlueprintPhysicalDiskDisposition :: Expunged {
4086+ as_of_generation,
4087+ ..
4088+ } => {
4089+ disk. disposition =
4090+ BlueprintPhysicalDiskDisposition :: Expunged {
4091+ as_of_generation,
4092+ ready_for_cleanup : true ,
4093+ } ;
4094+ }
4095+ }
4096+ }
4097+
40524098 // Helper to extract the Nexus zone's disposition in a blueprint.
40534099 let get_nexus_disposition = |bp : & Blueprint | {
40544100 bp. sleds . get ( & sled_id) . unwrap ( ) . zones . iter ( ) . find_map ( |z| {
@@ -4057,33 +4103,34 @@ pub(crate) mod test {
40574103 } ;
40584104
40594105 // This sled's config generation should have been bumped...
4060- let bp2_generation =
4061- blueprint2 . sleds . get ( & sled_id ) . unwrap ( ) . sled_agent_generation ;
4106+ let bp2_config = blueprint2 . sleds . get ( & sled_id ) . unwrap ( ) . clone ( ) ;
4107+ let bp2_sled_config = bp2_config . clone ( ) . into_in_service_sled_config ( ) ;
40624108 assert_eq ! (
40634109 blueprint1
40644110 . sleds
40654111 . get( & sled_id)
40664112 . unwrap( )
40674113 . sled_agent_generation
40684114 . next( ) ,
4069- bp2_generation
4115+ bp2_sled_config . generation
40704116 ) ;
40714117 // ... and the Nexus should should have the disposition we expect.
40724118 assert_eq ! (
40734119 get_nexus_disposition( & blueprint2) ,
40744120 Some ( BlueprintZoneDisposition :: Expunged {
4075- as_of_generation: bp2_generation ,
4121+ as_of_generation: bp2_sled_config . generation ,
40764122 ready_for_cleanup: false ,
40774123 } )
40784124 ) ;
40794125
40804126 // Running the planner again should make no changes until the inventory
4081- // reports that the zone is not running and that the sled has seen a
4082- // new-enough generation. Try three variants that should do nothing :
4127+ // reports that the zone is not running and that the sled has reconciled
4128+ // a new-enough generation. Try these variants:
40834129 //
4084- // * same inventory as above
4085- // * inventory reports a new generation (but zone still running)
4086- // * inventory reports zone not running (but still the old generation)
4130+ // * same inventory as above (expect no changes)
4131+ // * new config is ledgered but not reconciled (expect no changes)
4132+ // * new config is reconciled, but zone is in an error state (expect
4133+ // no changes)
40874134 eprintln ! ( "planning with no inventory change..." ) ;
40884135 assert_planning_makes_no_changes (
40894136 & logctx. log ,
@@ -4092,15 +4139,7 @@ pub(crate) mod test {
40924139 & collection,
40934140 TEST_NAME ,
40944141 ) ;
4095- // TODO-cleanup These checks depend on `last_reconciled_config`, which
4096- // is not yet populated; uncomment these and check them by mutating
4097- // `last_reconciled_config` once
4098- // https://github.com/oxidecomputer/omicron/pull/8064 lands. We could
4099- // just mutate `ledgered_sled_config` in the meantime (as this
4100- // commented-out code does below), but that's not really checking what
4101- // we care about.
4102- /*
4103- eprintln!("planning with generation bump but zone still running...");
4142+ eprintln ! ( "planning with config ledgered but not reconciled..." ) ;
41044143 assert_planning_makes_no_changes (
41054144 & logctx. log ,
41064145 & blueprint2,
@@ -4111,15 +4150,15 @@ pub(crate) mod test {
41114150 . sled_agents
41124151 . get_mut ( & sled_id)
41134152 . unwrap ( )
4114- .ledgered_sled_config
4115- .as_mut()
4116- .unwrap()
4117- .generation = bp2_generation;
4153+ . ledgered_sled_config = Some ( bp2_sled_config. clone ( ) ) ;
41184154 collection
41194155 } ,
41204156 TEST_NAME ,
41214157 ) ;
4122- eprintln!("planning with zone gone but generation not bumped...");
4158+ eprintln ! (
4159+ "planning with config ledgered but \
4160+ zones failed to shut down..."
4161+ ) ;
41234162 assert_planning_makes_no_changes (
41244163 & logctx. log ,
41254164 & blueprint2,
@@ -4130,28 +4169,42 @@ pub(crate) mod test {
41304169 . sled_agents
41314170 . get_mut ( & sled_id)
41324171 . unwrap ( )
4133- .ledgered_sled_config
4134- .as_mut()
4172+ . ledgered_sled_config = Some ( bp2_sled_config. clone ( ) ) ;
4173+ let mut reconciliation =
4174+ ConfigReconcilerInventory :: debug_assume_success (
4175+ bp2_sled_config. clone ( ) ,
4176+ ) ;
4177+ // For all the zones that are in bp2_config but not
4178+ // bp2_sled_config (i.e., zones that should have been shut
4179+ // down), insert an error result in the reconciliation.
4180+ for zone_id in bp2_config. zones . keys ( ) {
4181+ if !reconciliation. zones . contains_key ( zone_id) {
4182+ reconciliation. zones . insert (
4183+ * zone_id,
4184+ ConfigReconcilerInventoryResult :: Err {
4185+ message : "failed to shut down" . to_string ( ) ,
4186+ } ,
4187+ ) ;
4188+ }
4189+ }
4190+ collection
4191+ . sled_agents
4192+ . get_mut ( & sled_id)
41354193 . unwrap ( )
4136- .zones
4137- .retain(|z| z.id != nexus_config.id);
4194+ . last_reconciliation = Some ( reconciliation) ;
41384195 collection
41394196 } ,
41404197 TEST_NAME ,
41414198 ) ;
4142- */
41434199
41444200 // Now make both changes to the inventory.
41454201 {
4146- let config = & mut collection
4147- . sled_agents
4148- . get_mut ( & sled_id)
4149- . unwrap ( )
4150- . ledgered_sled_config
4151- . as_mut ( )
4152- . unwrap ( ) ;
4153- config. generation = bp2_generation;
4154- config. zones . retain ( |z| z. id != nexus_config. id ) ;
4202+ let config = collection. sled_agents . get_mut ( & sled_id) . unwrap ( ) ;
4203+ config. ledgered_sled_config = Some ( bp2_sled_config. clone ( ) ) ;
4204+ config. last_reconciliation =
4205+ Some ( ConfigReconcilerInventory :: debug_assume_success (
4206+ bp2_sled_config. clone ( ) ,
4207+ ) ) ;
41554208 }
41564209
41574210 // Run the planner. It mark our Nexus zone as ready for cleanup now that
@@ -4171,7 +4224,7 @@ pub(crate) mod test {
41714224 assert_eq ! (
41724225 get_nexus_disposition( & blueprint3) ,
41734226 Some ( BlueprintZoneDisposition :: Expunged {
4174- as_of_generation: bp2_generation ,
4227+ as_of_generation: bp2_sled_config . generation ,
41754228 ready_for_cleanup: true ,
41764229 } )
41774230 ) ;
@@ -4180,7 +4233,7 @@ pub(crate) mod test {
41804233 // since it doesn't affect what's sent to sled-agent.
41814234 assert_eq ! (
41824235 blueprint3. sleds. get( & sled_id) . unwrap( ) . sled_agent_generation,
4183- bp2_generation
4236+ bp2_sled_config . generation
41844237 ) ;
41854238
41864239 assert_planning_makes_no_changes (
@@ -4260,22 +4313,26 @@ pub(crate) mod test {
42604313 } ;
42614314
42624315 // This sled's config generation should have been bumped...
4263- let bp2_generation =
4264- blueprint2. sleds . get ( & sled_id) . unwrap ( ) . sled_agent_generation ;
4316+ let bp2_config = blueprint2
4317+ . sleds
4318+ . get ( & sled_id)
4319+ . unwrap ( )
4320+ . clone ( )
4321+ . into_in_service_sled_config ( ) ;
42654322 assert_eq ! (
42664323 blueprint1
42674324 . sleds
42684325 . get( & sled_id)
42694326 . unwrap( )
42704327 . sled_agent_generation
42714328 . next( ) ,
4272- bp2_generation
4329+ bp2_config . generation
42734330 ) ;
42744331 // ... and the DNS zone should should have the disposition we expect.
42754332 assert_eq ! (
42764333 get_dns_disposition( & blueprint2) ,
42774334 Some ( BlueprintZoneDisposition :: Expunged {
4278- as_of_generation: bp2_generation ,
4335+ as_of_generation: bp2_config . generation ,
42794336 ready_for_cleanup: false ,
42804337 } )
42814338 ) ;
@@ -4293,15 +4350,12 @@ pub(crate) mod test {
42934350
42944351 // Make the inventory changes necessary for cleanup to proceed.
42954352 {
4296- let config = & mut collection
4297- . sled_agents
4298- . get_mut ( & sled_id)
4299- . unwrap ( )
4300- . ledgered_sled_config
4301- . as_mut ( )
4302- . unwrap ( ) ;
4303- config. generation = bp2_generation;
4304- config. zones . retain ( |z| z. id != internal_dns_config. id ) ;
4353+ let config = & mut collection. sled_agents . get_mut ( & sled_id) . unwrap ( ) ;
4354+ config. ledgered_sled_config = Some ( bp2_config. clone ( ) ) ;
4355+ config. last_reconciliation =
4356+ Some ( ConfigReconcilerInventory :: debug_assume_success (
4357+ bp2_config. clone ( ) ,
4358+ ) ) ;
43054359 }
43064360
43074361 // Run the planner. It should mark our internal DNS zone as ready for
@@ -4323,7 +4377,7 @@ pub(crate) mod test {
43234377 assert_eq ! (
43244378 get_dns_disposition( & blueprint3) ,
43254379 Some ( BlueprintZoneDisposition :: Expunged {
4326- as_of_generation: bp2_generation ,
4380+ as_of_generation: bp2_config . generation ,
43274381 ready_for_cleanup: true ,
43284382 } )
43294383 ) ;
0 commit comments