From c5a31ce9210a46d0be327ee353d20bc1aae1dc35 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 21 Oct 2025 17:55:39 -0700 Subject: [PATCH 1/7] cancel-task --- .../tasks/support_bundle_collector.rs | 75 +++++++++++++++---- 1 file changed, 59 insertions(+), 16 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 8dc13e7ab4..fed18ccb8a 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -513,32 +513,75 @@ impl BundleCollection { work_duration, ); - loop { - tokio::select! { - // Timer fired mid-collection - let's check if we should stop. - _ = yield_interval.tick() => { + // We run this task to "check for cancellation" as a whole tokio task + // for a critical, but subtle reason: After the tick timer yields, + // we may then try to "await" a database function. + // + // This, at a surface-level glance seems innocent enough. However, there + // is something potentially insidious here: if calling a datastore + // function - such as "support_bundle_get" - blocks on acquiring access + // to a connection from the connection pool, while creating the + // collection ALSO potentially blocks on acquiring access to the + // connection pool, it is possible for: + // + // 1. The "&mut collection" arm to have created a future, currently + // yielded, which wants access to this underlying resource. + // 2. The current task of execution, in "support_bundle_get", to be + // blocked "await-ing" for this same underlying resource. + // + // In this specific case, the connection pool would be attempting to + // yield to the "&mut collection" arm, which cannot run, if we were + // blocking on the body of a different async select arm. This would + // result in a deadlock. + // + // In the future, we may attempt to make access to the connection pool + // safer from concurrent asynchronous access - it is unsettling that + // multiple concurrent ".claim()" functions can cause this behavior - + // but in the meantime, we spawn this cancellation check in an entirely + // new tokio task. Because of this separation, each task (the one + // checking for cancellation, and the main thread attempting to collect + // the bundle) do not risk preventing the other from being polled + // indefinitely. + let mut check_for_cancellation = tokio::task::spawn({ + let s = self.clone(); + async move { + loop { + // Timer fired mid-collection - let's check if we should stop. + yield_interval.tick().await; trace!( - &self.log, + &s.log, "Checking if Bundle Collection cancelled"; - "bundle" => %self.bundle.id + "bundle" => %s.bundle.id ); - let bundle = self.datastore.support_bundle_get( - &self.opctx, - self.bundle.id.into() - ).await?; + let bundle = s + .datastore + .support_bundle_get(&s.opctx, s.bundle.id.into()) + .await?; if !matches!(bundle.state, SupportBundleState::Collecting) { - warn!( - &self.log, - "Support Bundle cancelled - stopping collection"; - "bundle" => %self.bundle.id, - "state" => ?self.bundle.state - ); anyhow::bail!("Support Bundle Cancelled"); } + } + } + }); + + loop { + tokio::select! { + // Returns if the bundle has been cancelled explicitly, or if we + // cannot successfully check the bundle state. + why = &mut check_for_cancellation => { + let why = why.expect("Should not cancel the bundle-checking task without returning"); + warn!( + &self.log, + "Support Bundle cancelled - stopping collection"; + "bundle" => %self.bundle.id, + "state" => ?self.bundle.state + ); + return why; }, // Otherwise, keep making progress on the collection itself. report = &mut collection => { + check_for_cancellation.abort(); info!( &self.log, "Bundle Collection completed"; From 0bdb888a78590debba2ec4ed8d1041f2ad024942 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Tue, 21 Oct 2025 18:29:37 -0700 Subject: [PATCH 2/7] less loopy --- .../tasks/support_bundle_collector.rs | 48 +++++++++---------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index fed18ccb8a..0eb1998cee 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -565,31 +565,29 @@ impl BundleCollection { } }); - loop { - tokio::select! { - // Returns if the bundle has been cancelled explicitly, or if we - // cannot successfully check the bundle state. - why = &mut check_for_cancellation => { - let why = why.expect("Should not cancel the bundle-checking task without returning"); - warn!( - &self.log, - "Support Bundle cancelled - stopping collection"; - "bundle" => %self.bundle.id, - "state" => ?self.bundle.state - ); - return why; - }, - // Otherwise, keep making progress on the collection itself. - report = &mut collection => { - check_for_cancellation.abort(); - info!( - &self.log, - "Bundle Collection completed"; - "bundle" => %self.bundle.id - ); - return report; - }, - } + tokio::select! { + // Returns if the bundle has been cancelled explicitly, or if we + // cannot successfully check the bundle state. + why = &mut check_for_cancellation => { + let why = why.expect("Should not cancel the bundle-checking task without returning"); + warn!( + &self.log, + "Support Bundle cancelled - stopping collection"; + "bundle" => %self.bundle.id, + "state" => ?self.bundle.state + ); + return why; + }, + // Otherwise, keep making progress on the collection itself. + report = &mut collection => { + check_for_cancellation.abort(); + info!( + &self.log, + "Bundle Collection completed"; + "bundle" => %self.bundle.id + ); + return report; + }, } } From f25f00eeaef92a342c8ecc9740cf702096c903b6 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 23 Oct 2025 15:14:33 -0700 Subject: [PATCH 3/7] feedback --- .../tasks/support_bundle_collector.rs | 79 ++++++++++--------- 1 file changed, 41 insertions(+), 38 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 0eb1998cee..5b7b3d86e5 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -502,7 +502,7 @@ impl BundleCollection { self: &Arc, dir: &Utf8TempDir, ) -> anyhow::Result { - let mut collection = Box::pin(self.collect_bundle_as_file(&dir)); + let collection = Box::pin(self.collect_bundle_as_file(&dir)); // We periodically check the state of the support bundle - if a user // explicitly cancels it, we should stop the collection process and @@ -513,54 +513,59 @@ impl BundleCollection { work_duration, ); - // We run this task to "check for cancellation" as a whole tokio task + // TL;DR: This `tokio::select` is allowed to poll multiple futures, but + // should not do any async work within the body of any chosen branch. A + // previous iteration of this code polled the "collection" as "&mut + // collection", and checked the status of the support bundle within a + // branch of the "select" polling "yield_interval.tick()". + // + // We organize this work to "check for cancellation" as a whole future // for a critical, but subtle reason: After the tick timer yields, - // we may then try to "await" a database function. + // we may then try to `await` a database function. // // This, at a surface-level glance seems innocent enough. However, there // is something potentially insidious here: if calling a datastore - // function - such as "support_bundle_get" - blocks on acquiring access + // function - such as "support_bundle_get" - awaits acquiring access // to a connection from the connection pool, while creating the - // collection ALSO potentially blocks on acquiring access to the + // collection ALSO potentially awaits acquiring access to the // connection pool, it is possible for: // - // 1. The "&mut collection" arm to have created a future, currently + // 1. The `&mut collection` arm to have created a future, currently // yielded, which wants access to this underlying resource. - // 2. The current task of execution, in "support_bundle_get", to be - // blocked "await-ing" for this same underlying resource. + // 2. The current operation executing in `support_bundle_get` to + // be awaiting access to this same underlying resource. // // In this specific case, the connection pool would be attempting to - // yield to the "&mut collection" arm, which cannot run, if we were - // blocking on the body of a different async select arm. This would + // yield to the `&mut collection` arm, which cannot run, if we were + // awaiting in the body of a different async select arm. This would // result in a deadlock. // // In the future, we may attempt to make access to the connection pool // safer from concurrent asynchronous access - it is unsettling that - // multiple concurrent ".claim()" functions can cause this behavior - - // but in the meantime, we spawn this cancellation check in an entirely - // new tokio task. Because of this separation, each task (the one - // checking for cancellation, and the main thread attempting to collect - // the bundle) do not risk preventing the other from being polled - // indefinitely. - let mut check_for_cancellation = tokio::task::spawn({ - let s = self.clone(); - async move { - loop { - // Timer fired mid-collection - let's check if we should stop. - yield_interval.tick().await; - trace!( - &s.log, - "Checking if Bundle Collection cancelled"; - "bundle" => %s.bundle.id - ); + // multiple concurrent `.claim()` functions can cause this behavior - + // but in the meantime, we perform this cancellation check in a single + // future that always is polled concurrently with the collection work. + // Because of this separation, each future is polled until one + // completes, at which point we deterministically exit. + // + // For more details, see: + // https://github.com/oxidecomputer/omicron/issues/9259 + let check_for_cancellation = Box::pin(async move { + loop { + // Timer fired mid-collection - check if we should stop. + yield_interval.tick().await; + trace!( + &self.log, + "Checking if Bundle Collection cancelled"; + "bundle" => %self.bundle.id + ); - let bundle = s - .datastore - .support_bundle_get(&s.opctx, s.bundle.id.into()) - .await?; - if !matches!(bundle.state, SupportBundleState::Collecting) { - anyhow::bail!("Support Bundle Cancelled"); - } + let bundle = self + .datastore + .support_bundle_get(&self.opctx, self.bundle.id.into()) + .await?; + if !matches!(bundle.state, SupportBundleState::Collecting) { + anyhow::bail!("Support Bundle Cancelled"); } } }); @@ -568,8 +573,7 @@ impl BundleCollection { tokio::select! { // Returns if the bundle has been cancelled explicitly, or if we // cannot successfully check the bundle state. - why = &mut check_for_cancellation => { - let why = why.expect("Should not cancel the bundle-checking task without returning"); + why = check_for_cancellation => { warn!( &self.log, "Support Bundle cancelled - stopping collection"; @@ -579,8 +583,7 @@ impl BundleCollection { return why; }, // Otherwise, keep making progress on the collection itself. - report = &mut collection => { - check_for_cancellation.abort(); + report = collection => { info!( &self.log, "Bundle Collection completed"; From ba866066e4d2d6486c71de0d0d853d181ce895f7 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 23 Oct 2025 16:15:45 -0700 Subject: [PATCH 4/7] I guess we don't need to pin either --- nexus/src/app/background/tasks/support_bundle_collector.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 5b7b3d86e5..ae9e392887 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -502,7 +502,7 @@ impl BundleCollection { self: &Arc, dir: &Utf8TempDir, ) -> anyhow::Result { - let collection = Box::pin(self.collect_bundle_as_file(&dir)); + let collection = self.collect_bundle_as_file(&dir); // We periodically check the state of the support bundle - if a user // explicitly cancels it, we should stop the collection process and @@ -550,7 +550,7 @@ impl BundleCollection { // // For more details, see: // https://github.com/oxidecomputer/omicron/issues/9259 - let check_for_cancellation = Box::pin(async move { + let check_for_cancellation = async move { loop { // Timer fired mid-collection - check if we should stop. yield_interval.tick().await; @@ -568,7 +568,7 @@ impl BundleCollection { anyhow::bail!("Support Bundle Cancelled"); } } - }); + }; tokio::select! { // Returns if the bundle has been cancelled explicitly, or if we From aac4dc72641b8b7ea9f338c2697181fd8274f9b9 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Thu, 23 Oct 2025 16:24:03 -0700 Subject: [PATCH 5/7] don't bail on db fail --- .../tasks/support_bundle_collector.rs | 36 ++++++++++++++++--- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index ae9e392887..efbbd5e5fa 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -555,17 +555,43 @@ impl BundleCollection { // Timer fired mid-collection - check if we should stop. yield_interval.tick().await; trace!( - &self.log, + self.log, "Checking if Bundle Collection cancelled"; "bundle" => %self.bundle.id ); - let bundle = self + match self .datastore .support_bundle_get(&self.opctx, self.bundle.id.into()) - .await?; - if !matches!(bundle.state, SupportBundleState::Collecting) { - anyhow::bail!("Support Bundle Cancelled"); + .await + { + Ok(bundle) => { + if !matches!( + bundle.state, + SupportBundleState::Collecting + ) { + anyhow::bail!("Support Bundle Cancelled"); + } + + // Bundle still collecting; continue... + continue; + } + Err(err) + if matches!(err, Error::ObjectNotFound { .. }) + || matches!(err, Error::NotFound { .. }) => + { + anyhow::bail!("Support Bundle Deleted"); + } + Err(err) => { + warn!( + self.log, + "Database error checking bundle cancellation"; + InlineErrorChain::new(&err) + ); + + // If we cannot contact the database, retry later + continue; + } } } }; From 4281d27e3eab042145a598e7d5c11912baa3be80 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 24 Oct 2025 09:54:54 -0700 Subject: [PATCH 6/7] refactor --- .../tasks/support_bundle_collector.rs | 124 +++++++++--------- 1 file changed, 63 insertions(+), 61 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index efbbd5e5fa..42014b953d 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -502,17 +502,6 @@ impl BundleCollection { self: &Arc, dir: &Utf8TempDir, ) -> anyhow::Result { - let collection = self.collect_bundle_as_file(&dir); - - // We periodically check the state of the support bundle - if a user - // explicitly cancels it, we should stop the collection process and - // return. - let work_duration = tokio::time::Duration::from_secs(5); - let mut yield_interval = tokio::time::interval_at( - tokio::time::Instant::now() + work_duration, - work_duration, - ); - // TL;DR: This `tokio::select` is allowed to poll multiple futures, but // should not do any async work within the body of any chosen branch. A // previous iteration of this code polled the "collection" as "&mut @@ -550,66 +539,20 @@ impl BundleCollection { // // For more details, see: // https://github.com/oxidecomputer/omicron/issues/9259 - let check_for_cancellation = async move { - loop { - // Timer fired mid-collection - check if we should stop. - yield_interval.tick().await; - trace!( - self.log, - "Checking if Bundle Collection cancelled"; - "bundle" => %self.bundle.id - ); - - match self - .datastore - .support_bundle_get(&self.opctx, self.bundle.id.into()) - .await - { - Ok(bundle) => { - if !matches!( - bundle.state, - SupportBundleState::Collecting - ) { - anyhow::bail!("Support Bundle Cancelled"); - } - - // Bundle still collecting; continue... - continue; - } - Err(err) - if matches!(err, Error::ObjectNotFound { .. }) - || matches!(err, Error::NotFound { .. }) => - { - anyhow::bail!("Support Bundle Deleted"); - } - Err(err) => { - warn!( - self.log, - "Database error checking bundle cancellation"; - InlineErrorChain::new(&err) - ); - - // If we cannot contact the database, retry later - continue; - } - } - } - }; tokio::select! { - // Returns if the bundle has been cancelled explicitly, or if we - // cannot successfully check the bundle state. - why = check_for_cancellation => { + // Returns if the bundle should no longer be collected. + why = self.check_for_cancellation() => { warn!( &self.log, "Support Bundle cancelled - stopping collection"; "bundle" => %self.bundle.id, "state" => ?self.bundle.state ); - return why; + return Err(why); }, // Otherwise, keep making progress on the collection itself. - report = collection => { + report = self.collect_bundle_as_file(&dir) => { info!( &self.log, "Bundle Collection completed"; @@ -726,6 +669,65 @@ impl BundleCollection { Ok(()) } + // Indefinitely perform periodic checks about whether or not we should + // cancel the bundle. + // + // Returns an error if: + // - The bundle state is no longer SupportBundleState::Collecting + // (which happens if the bundle has been explicitly cancelled, or + // if the backing storage has been expunged). + // - The bundle has been deleted + // + // Otherwise, keeps checking indefinitely while polled. + async fn check_for_cancellation(&self) -> anyhow::Error { + let work_duration = tokio::time::Duration::from_secs(5); + let mut yield_interval = tokio::time::interval_at( + tokio::time::Instant::now() + work_duration, + work_duration, + ); + + loop { + // Timer fired mid-collection - check if we should stop. + yield_interval.tick().await; + trace!( + self.log, + "Checking if Bundle Collection cancelled"; + "bundle" => %self.bundle.id + ); + + match self + .datastore + .support_bundle_get(&self.opctx, self.bundle.id.into()) + .await + { + Ok(bundle) => { + if !matches!(bundle.state, SupportBundleState::Collecting) { + return anyhow::anyhow!("Support Bundle Cancelled"); + } + + // Bundle still collecting; continue... + continue; + } + Err(err) + if matches!(err, Error::ObjectNotFound { .. }) + || matches!(err, Error::NotFound { .. }) => + { + return anyhow::anyhow!("Support Bundle Deleted"); + } + Err(err) => { + warn!( + self.log, + "Database error checking bundle cancellation"; + InlineErrorChain::new(&err) + ); + + // If we cannot contact the database, retry later + continue; + } + } + } + } + // Perform the work of collecting the support bundle into a temporary directory // // - "dir" is a directory where data can be stored. From 1bb6e82a80af3f9212c2e1188c73d51e29dfb175 Mon Sep 17 00:00:00 2001 From: Sean Klein Date: Fri, 24 Oct 2025 11:31:15 -0700 Subject: [PATCH 7/7] Feedback --- .../tasks/support_bundle_collector.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/nexus/src/app/background/tasks/support_bundle_collector.rs b/nexus/src/app/background/tasks/support_bundle_collector.rs index 42014b953d..09b66816f7 100644 --- a/nexus/src/app/background/tasks/support_bundle_collector.rs +++ b/nexus/src/app/background/tasks/support_bundle_collector.rs @@ -700,18 +700,18 @@ impl BundleCollection { .support_bundle_get(&self.opctx, self.bundle.id.into()) .await { - Ok(bundle) => { - if !matches!(bundle.state, SupportBundleState::Collecting) { - return anyhow::anyhow!("Support Bundle Cancelled"); - } - + Ok(SupportBundle { + state: SupportBundleState::Collecting, + .. + }) => { // Bundle still collecting; continue... continue; } - Err(err) - if matches!(err, Error::ObjectNotFound { .. }) - || matches!(err, Error::NotFound { .. }) => - { + Ok(_) => { + // Not collecting, for any reason: Time to exit + return anyhow::anyhow!("Support Bundle Cancelled"); + } + Err(Error::ObjectNotFound { .. } | Error::NotFound { .. }) => { return anyhow::anyhow!("Support Bundle Deleted"); } Err(err) => {