From ad8ea384b2e3248c966f572b63aa952351c35b56 Mon Sep 17 00:00:00 2001
From: Enxebre
Date: Mon, 28 Feb 2022 16:37:51 -0500
Subject: [PATCH] Bubble up Machine permanent failures as MS/MD conditions

There is currently no UX signal for permanent Machine failures in
MachineSets and MachineDeployments. This PR solves that by bubbling up
permanent Machine failures, i.e. FailureReason/FailureMessage, as a
MachineSet and MachineDeployment condition: MachinesSucceededCondition.
---
 api/v1beta1/condition_consts.go                  |  6 ++++++
 .../machinedeployment/machinedeployment_sync.go  | 13 +++++++++++++
 .../machineset/machineset_controller.go          | 17 +++++++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/api/v1beta1/condition_consts.go b/api/v1beta1/condition_consts.go
index 57fd2b379e8b..4d9aa32562be 100644
--- a/api/v1beta1/condition_consts.go
+++ b/api/v1beta1/condition_consts.go
@@ -250,6 +250,12 @@ const (
 	// ResizedCondition documents a MachineSet is resizing the set of controlled machines.
 	ResizedCondition ConditionType = "Resized"
 
+	// MachinesSucceededCondition reports whether all controlled Machines have succeeded;
+	// it is False when any Machine has a permanent failure (FailureReason/FailureMessage set).
+	MachinesSucceededCondition ConditionType = "MachinesSucceeded"
+
+	// PermanentFailureReason (Severity=Error) documents a Machine that reported a permanent failure.
+	PermanentFailureReason = "PermanentFailure"
+
 	// ScalingUpReason (Severity=Info) documents a MachineSet is increasing the number of replicas.
 	ScalingUpReason = "ScalingUp"
 
diff --git a/internal/controllers/machinedeployment/machinedeployment_sync.go b/internal/controllers/machinedeployment/machinedeployment_sync.go
index 519a96ed17cb..e4a2725b6dde 100644
--- a/internal/controllers/machinedeployment/machinedeployment_sync.go
+++ b/internal/controllers/machinedeployment/machinedeployment_sync.go
@@ -420,6 +420,8 @@ func calculateStatus(allMSs []*clusterv1.MachineSet, newMS *clusterv1.MachineSet
 	if totalReplicas-availableReplicas < 0 {
 		status.Phase = string(clusterv1.MachineDeploymentPhaseScalingDown)
 	}
+
+	var machinesSucceededConditionFalse *clusterv1.Condition
 	for _, ms := range allMSs {
 		if ms != nil {
 			if ms.Status.FailureReason != nil || ms.Status.FailureMessage != nil {
@@ -427,7 +429,18 @@ func calculateStatus(allMSs []*clusterv1.MachineSet, newMS *clusterv1.MachineSet
 				break
 			}
 		}
+
+		if conditions.IsFalse(ms, clusterv1.MachinesSucceededCondition) {
+			machinesSucceededConditionFalse = conditions.Get(ms, clusterv1.MachinesSucceededCondition)
+		}
 	}
+
+	if machinesSucceededConditionFalse != nil {
+		conditions.Set(deployment, machinesSucceededConditionFalse)
+	} else {
+		conditions.MarkTrue(deployment, clusterv1.MachinesSucceededCondition)
+	}
+
 	return status
 }
diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go
index 5d85a596c0e6..86cf3a85af44 100644
--- a/internal/controllers/machineset/machineset_controller.go
+++ b/internal/controllers/machineset/machineset_controller.go
@@ -22,6 +22,8 @@ import (
 	"strings"
 	"time"
 
+	"k8s.io/utils/pointer"
+
 	"github.com/pkg/errors"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -648,6 +650,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
 	availableReplicasCount := 0
 	desiredReplicas := *ms.Spec.Replicas
 	templateLabel := labels.Set(ms.Spec.Template.Labels).AsSelectorPreValidated()
+	var machineFailures []error
 
 	for _, machine := range filteredMachines {
 		if templateLabel.Matches(labels.Set(machine.Labels)) {
@@ -671,6 +674,20 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste
 				availableReplicasCount++
 			}
 		}
+
+		if pointer.StringDeref(machine.Status.FailureMessage, "") != "" {
+			machineFailures = append(machineFailures, fmt.Errorf("machine %q failed: %s",
+				machine.GetName(), *machine.Status.FailureMessage))
+		}
+	}
+
+	// TODO (alberto): if len(machineFailures) exceeds a threshold, it is likely all Machines are failing
+	// for a common reason, e.g. a cloud provider quota; we could then collapse them into a single error to control verbosity.
+	if len(machineFailures) > 0 {
+		failuresError := kerrors.NewAggregate(machineFailures)
+		conditions.MarkFalse(ms, clusterv1.MachinesSucceededCondition, clusterv1.PermanentFailureReason, clusterv1.ConditionSeverityError, failuresError.Error())
+	} else {
+		conditions.MarkTrue(ms, clusterv1.MachinesSucceededCondition)
 	}
 
 	newStatus.Replicas = int32(len(filteredMachines))
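
Editor's note, not part of the patch: a minimal, self-contained sketch of how a
consumer (say, a CLI status command or an alerting loop) might surface the new
condition once this patch is applied. reportPermanentFailures is a hypothetical
helper, not a cluster-api API; it relies only on the conditions util and the
constants introduced above.

	package main

	import (
		"fmt"

		corev1 "k8s.io/api/core/v1"
		clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
		"sigs.k8s.io/cluster-api/util/conditions"
	)

	// reportPermanentFailures returns a human-readable summary when the
	// MachinesSucceededCondition reports failed Machines, or "" when all
	// Machines succeeded (or the condition is not set yet).
	func reportPermanentFailures(md *clusterv1.MachineDeployment) string {
		cond := conditions.Get(md, clusterv1.MachinesSucceededCondition)
		if cond == nil || cond.Status == corev1.ConditionTrue {
			return ""
		}
		// The message carries the aggregated per-Machine failures built by
		// updateStatus in the MachineSet controller above.
		if cond.Reason == clusterv1.PermanentFailureReason {
			return fmt.Sprintf("permanent machine failures: %s", cond.Message)
		}
		return fmt.Sprintf("machines not succeeded: %s", cond.Message)
	}

	func main() {
		// Simulate a MachineDeployment whose condition was bubbled up from a
		// failing MachineSet, as calculateStatus does above.
		md := &clusterv1.MachineDeployment{}
		conditions.MarkFalse(md, clusterv1.MachinesSucceededCondition,
			clusterv1.PermanentFailureReason, clusterv1.ConditionSeverityError,
			"machine %q failed: %s", "md-0-abc", "insufficient cloud quota")
		fmt.Println(reportPermanentFailures(md))
	}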