From 56f18321188ebfed01c11493e7aab65dedd305a8 Mon Sep 17 00:00:00 2001 From: Scott Dodson Date: Mon, 11 Aug 2025 14:51:48 -0400 Subject: [PATCH] Add retry logic for WellKnownAvailable endpoint checks The operator now retries the well-known endpoint check every second for up to 15 seconds before setting itself unavailable. This prevents premature "unavailable" status during temporary network issues or API server startup scenarios. Changes: - Add retry loop with 15-second timeout and 1-second intervals - Keep the existing ControllerProgressingError handling block after the retry loop (on timeout the loop returns the error directly, so that block is not reached for retry failures) - Set WellKnownAvailable=False with reason "NotReady" after retries exhausted - Maintain proper progressing status during retry attempts This change improves operator reliability by giving well-known endpoints time to become ready before WellKnownAvailable=False is reported. Assisted-by: Cursor, Claude Sonnet 4 --- .../readiness/wellknown_ready_controller.go | 42 ++++++++++++++++++- 1 file changed, 40 insertions(+), 2 deletions(-) diff --git a/pkg/controllers/readiness/wellknown_ready_controller.go b/pkg/controllers/readiness/wellknown_ready_controller.go index c9a8c5fd6..67b117f3f 100644 --- a/pkg/controllers/readiness/wellknown_ready_controller.go +++ b/pkg/controllers/readiness/wellknown_ready_controller.go @@ -169,13 +169,51 @@ func (c *wellKnownReadyController) sync(ctx context.Context, controllerContext f return err } - if err := c.isWellknownEndpointsReady(ctx, operatorSpec, operatorStatus, authConfig, infraConfig); err != nil { + // Retry the well-known endpoint check for up to 15 seconds with 1-second intervals + retryStart := time.Now() + const retryTimeout = 15 * time.Second + const retryInterval = 1 * time.Second + + for { + err := c.isWellknownEndpointsReady(ctx, operatorSpec, operatorStatus, authConfig, infraConfig) + if err == nil { + // Success - well-known endpoints are ready + break + } + + elapsed := time.Since(retryStart) + + // If we've exceeded the retry timeout, handle the 
final error + if elapsed >= retryTimeout { + // After retries are exhausted, set to Unavailable with reason NotReady + available = available. + WithStatus(operatorv1.ConditionFalse). + WithReason("NotReady"). + WithMessage(fmt.Sprintf("The well-known endpoint failed after %v of retries: %s", retryTimeout, err.Error())) + progressing = progressing. + WithStatus(operatorv1.ConditionTrue) + return err + } + + // Wait before retrying, but check context cancellation + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(retryInterval): + // Continue with retry + } + } + + // After retries are exhausted, handle the final error with proper progressingErr logic + if err != nil { available = available. WithStatus(operatorv1.ConditionFalse). WithReason("NotReady"). - WithMessage(fmt.Sprintf("The well-known endpoint is not yet available: %s", err.Error())) + WithMessage(fmt.Sprintf("The well-known endpoint failed after %v of retries: %s", retryTimeout, err.Error())) progressing = progressing. WithStatus(operatorv1.ConditionTrue) + + // Handle ControllerProgressingError specially if progressingErr, ok := err.(*common.ControllerProgressingError); ok { if progressingErr.IsDegraded(controllerName, operatorStatus) { return progressingErr.Unwrap()