From 56f18321188ebfed01c11493e7aab65dedd305a8 Mon Sep 17 00:00:00 2001
From: Scott Dodson <sdodson@redhat.com>
Date: Mon, 11 Aug 2025 14:51:48 -0400
Subject: [PATCH] Add retry logic for WellKnownAvailable endpoint checks

The operator now retries the well-known endpoint check every second for up to 15 seconds before setting itself unavailable. This prevents premature "unavailable" status during temporary network issues or API server startup scenarios.

Changes:
- Add retry loop with 15-second timeout and 1-second intervals
- Preserve existing ControllerProgressingError handling logic
- Set WellKnownAvailable=False with reason "NotReady" after retries exhausted
- Maintain proper progressing status during retry attempts

This change improves operator reliability by giving well-known endpoints time to become ready while maintaining the same final error handling behavior.

Assisted-by: Cursor, Claude Sonnet 4
---
 .../readiness/wellknown_ready_controller.go   | 42 ++++++++++++++++++-
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/pkg/controllers/readiness/wellknown_ready_controller.go b/pkg/controllers/readiness/wellknown_ready_controller.go
index c9a8c5fd6..67b117f3f 100644
--- a/pkg/controllers/readiness/wellknown_ready_controller.go
+++ b/pkg/controllers/readiness/wellknown_ready_controller.go
@@ -169,13 +169,51 @@ func (c *wellKnownReadyController) sync(ctx context.Context, controllerContext f
 		return err
 	}
 
-	if err := c.isWellknownEndpointsReady(ctx, operatorSpec, operatorStatus, authConfig, infraConfig); err != nil {
+	// Retry the well-known endpoint check for up to 15 seconds with 1-second intervals
+	retryStart := time.Now()
+	const retryTimeout = 15 * time.Second
+	const retryInterval = 1 * time.Second
+
+	for {
+		err := c.isWellknownEndpointsReady(ctx, operatorSpec, operatorStatus, authConfig, infraConfig)
+		if err == nil {
+			// Success - well-known endpoints are ready
+			break
+		}
+
+		elapsed := time.Since(retryStart)
+
+		// If we've exceeded the retry timeout, handle the final error
+		if elapsed >= retryTimeout {
+			// After retries are exhausted, set to Unavailable with reason NotReady
+			available = available.
+				WithStatus(operatorv1.ConditionFalse).
+				WithReason("NotReady").
+				WithMessage(fmt.Sprintf("The well-known endpoint failed after %v of retries: %s", retryTimeout, err.Error()))
+			progressing = progressing.
+				WithStatus(operatorv1.ConditionTrue)
+			return err
+		}
+
+		// Wait before retrying, but check context cancellation
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(retryInterval):
+			// Continue with retry
+		}
+	}
+
+	// After retries are exhausted, handle the final error with proper progressingErr logic
+	if err != nil {
 		available = available.
 			WithStatus(operatorv1.ConditionFalse).
 			WithReason("NotReady").
-			WithMessage(fmt.Sprintf("The well-known endpoint is not yet available: %s", err.Error()))
+			WithMessage(fmt.Sprintf("The well-known endpoint failed after %v of retries: %s", retryTimeout, err.Error()))
 		progressing = progressing.
 			WithStatus(operatorv1.ConditionTrue)
+
+		// Handle ControllerProgressingError specially
 		if progressingErr, ok := err.(*common.ControllerProgressingError); ok {
 			if progressingErr.IsDegraded(controllerName, operatorStatus) {
 				return progressingErr.Unwrap()