From ad8ea384b2e3248c966f572b63aa952351c35b56 Mon Sep 17 00:00:00 2001
From: Enxebre
Date: Mon, 28 Feb 2022 16:37:51 -0500
Subject: [PATCH] Bubble up Machine permanent failures as MS/MD conditions

There is currently no UX signal for permanent Machine failures in
MachineSets and MachineDeployments. This PR solves that by bubbling up
permanent Machine failures, i.e. FailureReason/FailureMessage, as a
MachineSet and MachineDeployment condition: MachinesSucceededCondition.
---
 api/v1beta1/condition_consts.go                  |  6 ++++++
 .../machinedeployment/machinedeployment_sync.go  | 13 +++++++++++++
 .../machineset/machineset_controller.go          | 17 +++++++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/api/v1beta1/condition_consts.go b/api/v1beta1/condition_consts.go
index 57fd2b379e8b..4d9aa32562be 100644
--- a/api/v1beta1/condition_consts.go
+++ b/api/v1beta1/condition_consts.go
@@ -250,6 +250,12 @@ const (
 	// ResizedCondition documents a MachineSet is resizing the set of controlled machines.
 	ResizedCondition ConditionType = "Resized"
 
+	// MachinesSucceededCondition reports whether all controlled Machines have succeeded;
+	// it is False when any Machine has a permanent failure (FailureReason/FailureMessage set).
+	MachinesSucceededCondition ConditionType = "MachinesSucceeded"
+
+	// PermanentFailureReason (Severity=Error) documents a Machine that reported a permanent failure.
+	PermanentFailureReason = "PermanentFailure"
+
 	// ScalingUpReason (Severity=Info) documents a MachineSet is increasing the number of replicas.
 	ScalingUpReason = "ScalingUp"
 
diff --git a/internal/controllers/machinedeployment/machinedeployment_sync.go b/internal/controllers/machinedeployment/machinedeployment_sync.go
index 519a96ed17cb..e4a2725b6dde 100644
--- a/internal/controllers/machinedeployment/machinedeployment_sync.go
+++ b/internal/controllers/machinedeployment/machinedeployment_sync.go
@@ -420,6 +420,8 @@ func calculateStatus(allMSs []*clusterv1.MachineSet, newMS *clusterv1.MachineSet
 	if totalReplicas-availableReplicas < 0 {
 		status.Phase = string(clusterv1.MachineDeploymentPhaseScalingDown)
 	}
+
+	var machinesSucceededConditionFalse *clusterv1.Condition
 	for _, ms := range allMSs {
 		if ms != nil {
 			if ms.Status.FailureReason != nil || ms.Status.FailureMessage != nil {
@@ -427,7 +429,18 @@ func calculateStatus(allMSs []*clusterv1.MachineSet, newMS *clusterv1.MachineSet
 				break
 			}
 		}
+
+		if conditions.IsFalse(ms, clusterv1.MachinesSucceededCondition) {
+			machinesSucceededConditionFalse = conditions.Get(ms, clusterv1.MachinesSucceededCondition)
+		}
 	}
+
+	if machinesSucceededConditionFalse != nil {
+		conditions.Set(deployment, machinesSucceededConditionFalse)
+	} else {
+		conditions.MarkTrue(deployment, clusterv1.MachinesSucceededCondition)
+	}
+
 	return status
 }
diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go
index 5d85a596c0e6..86cf3a85af44 100644
--- a/internal/controllers/machineset/machineset_controller.go
+++ b/internal/controllers/machineset/machineset_controller.go
@@ -22,6 +22,8 @@ import (
 	"strings"
 	"time"
 
+	"k8s.io/utils/pointer"
+
 	"github.com/pkg/errors"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -648,6 +650,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
 	availableReplicasCount := 0
 	desiredReplicas := *ms.Spec.Replicas
 	templateLabel := labels.Set(ms.Spec.Template.Labels).AsSelectorPreValidated()
+	var machineFailures []error
 
 	for _, machine := range filteredMachines {
 		if templateLabel.Matches(labels.Set(machine.Labels)) {
@@ -671,6 +674,20 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
 				availableReplicasCount++
 			}
 		}
+
+		if pointer.StringDeref(machine.Status.FailureMessage, "") != "" {
+			machineFailures = append(machineFailures, fmt.Errorf("machine %q failed: %s",
+				machine.GetName(), *machine.Status.FailureMessage))
+		}
+	}
+
+	// TODO (alberto): if len(machineFailures) exceeds a threshold, it is likely all Machines are failing
+	// for a common reason, e.g. a cloud provider quota; we could then collapse them into a single error to control verbosity.
+	if len(machineFailures) > 0 {
+		failuresError := kerrors.NewAggregate(machineFailures)
+		conditions.MarkFalse(ms, clusterv1.MachinesSucceededCondition, clusterv1.PermanentFailureReason, clusterv1.ConditionSeverityError, failuresError.Error())
+	} else {
+		conditions.MarkTrue(ms, clusterv1.MachinesSucceededCondition)
 	}
 
 	newStatus.Replicas = int32(len(filteredMachines))
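
Editor's note, not part of the patch: a minimal, self-contained sketch of how a
consumer (say, a CLI status command or an alerting loop) might surface the new
condition once this patch is applied. reportPermanentFailures is a hypothetical
helper, not a cluster-api API; it relies only on the conditions util and the
constants introduced above.

	package main

	import (
		"fmt"

		corev1 "k8s.io/api/core/v1"
		clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
		"sigs.k8s.io/cluster-api/util/conditions"
	)

	// reportPermanentFailures returns a human-readable summary when the
	// MachinesSucceededCondition reports failed Machines, or "" when all
	// Machines succeeded (or the condition is not set yet).
	func reportPermanentFailures(md *clusterv1.MachineDeployment) string {
		cond := conditions.Get(md, clusterv1.MachinesSucceededCondition)
		if cond == nil || cond.Status == corev1.ConditionTrue {
			return ""
		}
		// The message carries the aggregated per-Machine failures built by
		// updateStatus in the MachineSet controller above.
		if cond.Reason == clusterv1.PermanentFailureReason {
			return fmt.Sprintf("permanent machine failures: %s", cond.Message)
		}
		return fmt.Sprintf("machines not succeeded: %s", cond.Message)
	}

	func main() {
		// Simulate a MachineDeployment whose condition was bubbled up from a
		// failing MachineSet, as calculateStatus does above.
		md := &clusterv1.MachineDeployment{}
		conditions.MarkFalse(md, clusterv1.MachinesSucceededCondition,
			clusterv1.PermanentFailureReason, clusterv1.ConditionSeverityError,
			"machine %q failed: %s", "md-0-abc", "insufficient cloud quota")
		fmt.Println(reportPermanentFailures(md))
	}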