Skip to content

Commit 9617f40

Browse files
authored
Don't fail active bundles with expunged Nexuses (reassign only) (#9286)
Previously, the upgrade pathway (involving expunging and adding new Nexuses) would cause all support bundles to be marked "failed". For fully-collected bundles, this is not necessary, and fixed by this PR. Fixes #9257
1 parent 7dbdba0 commit 9617f40

File tree

1 file changed

+56
-36
lines changed

1 file changed

+56
-36
lines changed

nexus/db-queries/src/db/datastore/support_bundle.rs

Lines changed: 56 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,7 @@ impl DataStore {
324324
.execute_async(conn)
325325
.await?;
326326

327-
// Find all bundles on nexuses that no longer exist.
327+
// Find all bundles managed by nexuses that no longer exist.
328328
let bundles_with_bad_nexuses = dsl::support_bundle
329329
.filter(dsl::assigned_nexus.eq_any(invalid_nexus_zones))
330330
.select(SupportBundle::as_select())
@@ -333,15 +333,46 @@ impl DataStore {
333333

334334
let bundles_to_mark_failing = bundles_with_bad_nexuses
335335
.iter()
336-
.map(|b| b.id)
336+
.filter_map(|bundle| {
337+
match bundle.state {
338+
// If the Nexus died mid-collection, we'll mark
339+
// this bundle failing. This should ensure that
340+
// any storage it might have been using is
341+
// cleaned up.
342+
SupportBundleState::Collecting => {
343+
Some(bundle.id)
344+
}
345+
// If the bundle was marked "active", we need to
346+
// re-assign the "owning Nexus" so it can later
347+
// be deleted, but the bundle itself has already
348+
// been stored on a sled.
349+
//
350+
// There's no need to fail it.
351+
SupportBundleState::Active => None,
352+
// If the bundle has already been marked
353+
// "Destroying/Failing/Failed", it's already
354+
// irreversibly marked for deletion.
355+
SupportBundleState::Destroying
356+
| SupportBundleState::Failing
357+
| SupportBundleState::Failed => None,
358+
}
359+
})
337360
.collect::<Vec<_>>();
361+
338362
let bundles_to_reassign = bundles_with_bad_nexuses
339363
.iter()
340364
.filter_map(|bundle| {
341-
if bundle.state != SupportBundleState::Failed {
342-
Some(bundle.id)
343-
} else {
344-
None
365+
match bundle.state {
366+
// If the bundle might be using any storage on
367+
// the provisioned sled, it gets assigned to a
368+
// new Nexus.
369+
SupportBundleState::Collecting
370+
| SupportBundleState::Active
371+
| SupportBundleState::Destroying
372+
| SupportBundleState::Failing => {
373+
Some(bundle.id)
374+
}
375+
SupportBundleState::Failed => None,
345376
}
346377
})
347378
.collect::<Vec<_>>();
@@ -371,30 +402,21 @@ impl DataStore {
371402
bundles_reassigned: 0,
372403
};
373404

374-
// Exit a little early if there are no bundles to re-assign.
375-
//
376-
// This is a tiny optimization, but really, it means that
377-
// tests without Nexuses in their blueprints can succeed if
378-
// they also have no support bundles. In practice, this is
379-
// rare, but in our existing test framework, it's fairly
380-
// common.
381-
if bundles_to_reassign.is_empty() {
382-
return Ok(report);
405+
if !bundles_to_reassign.is_empty() {
406+
// Reassign bundles that haven't been marked "fully failed"
407+
// to ourselves, so we can free their storage if they have
408+
// been provisioned on a sled.
409+
report.bundles_reassigned =
410+
diesel::update(dsl::support_bundle)
411+
.filter(dsl::id.eq_any(bundles_to_reassign))
412+
.set(
413+
dsl::assigned_nexus
414+
.eq(our_nexus_id.into_untyped_uuid()),
415+
)
416+
.execute_async(conn)
417+
.await?;
383418
}
384419

385-
// Reassign bundles that haven't been marked "fully failed"
386-
// to ourselves, so we can free their storage if they have
387-
// been provisioned on a sled.
388-
report.bundles_reassigned =
389-
diesel::update(dsl::support_bundle)
390-
.filter(dsl::id.eq_any(bundles_to_reassign))
391-
.set(
392-
dsl::assigned_nexus
393-
.eq(our_nexus_id.into_untyped_uuid()),
394-
)
395-
.execute_async(conn)
396-
.await?;
397-
398420
Ok(report)
399421
}
400422
.boxed()
@@ -1442,7 +1464,7 @@ mod test {
14421464

14431465
assert_eq!(
14441466
SupportBundleExpungementReport {
1445-
bundles_failing_missing_nexus: 1,
1467+
bundles_failing_missing_nexus: 0,
14461468
bundles_reassigned: 1,
14471469
..Default::default()
14481470
},
@@ -1452,13 +1474,11 @@ mod test {
14521474
let observed_bundle = datastore
14531475
.support_bundle_get(&opctx, bundle.id.into())
14541476
.await
1455-
.expect("Should be able to get bundle we just failed");
1456-
assert_eq!(SupportBundleState::Failing, observed_bundle.state);
1457-
assert!(
1458-
observed_bundle
1459-
.reason_for_failure
1460-
.unwrap()
1461-
.contains(FAILURE_REASON_NO_NEXUS)
1477+
.expect("Should be able to get bundle we reassigned");
1478+
assert_eq!(SupportBundleState::Active, observed_bundle.state);
1479+
assert_eq!(
1480+
observed_bundle.assigned_nexus.unwrap(),
1481+
nexus_ids[1].into(),
14621482
);
14631483

14641484
db.terminate().await;

0 commit comments

Comments
 (0)