diff --git a/test/infrastructure/docker/exp/internal/controllers/dockermachinepool_controller_phases.go b/test/infrastructure/docker/exp/internal/controllers/dockermachinepool_controller_phases.go
index f01653849cb6..03545c219405 100644
--- a/test/infrastructure/docker/exp/internal/controllers/dockermachinepool_controller_phases.go
+++ b/test/infrastructure/docker/exp/internal/controllers/dockermachinepool_controller_phases.go
@@ -97,6 +97,14 @@ func createDockerContainer(ctx context.Context, name string, cluster *clusterv1.
 		}
 	}
 
+	// If re-entering the reconcile loop and reaching this point, the container is expected to be running. If it is not, delete it so we can try to create it again.
+	if externalMachine.Exists() && !externalMachine.IsRunning() {
+		// This deletes the machine and results in re-creating it below.
+		if err := externalMachine.Delete(ctx); err != nil {
+			return errors.Wrap(err, "Failed to delete not running DockerMachine")
+		}
+	}
+
 	log.Info("Creating container for machinePool", "name", name, "MachinePool", klog.KObj(machinePool))
 	if err := externalMachine.Create(ctx, dockerMachinePool.Spec.Template.CustomImage, constants.WorkerNodeRoleValue, machinePool.Spec.Template.Spec.Version, labels, dockerMachinePool.Spec.Template.ExtraMounts); err != nil {
 		return errors.Wrapf(err, "failed to create docker machine with name %s", name)
diff --git a/test/infrastructure/docker/internal/controllers/backends/docker/dockermachine_backend.go b/test/infrastructure/docker/internal/controllers/backends/docker/dockermachine_backend.go
index df08570e0105..42d8cb21f8f9 100644
--- a/test/infrastructure/docker/internal/controllers/backends/docker/dockermachine_backend.go
+++ b/test/infrastructure/docker/internal/controllers/backends/docker/dockermachine_backend.go
@@ -193,6 +193,14 @@ func (r *MachineBackendReconciler) ReconcileNormal(ctx context.Context, cluster
 		role = constants.ControlPlaneNodeRoleValue
 	}
 
+	// If re-entering the reconcile loop and reaching this point, the container is expected to be running. If it is not, delete it so we can try to create it again.
+	if externalMachine.Exists() && !externalMachine.IsRunning() {
+		// This deletes the machine and results in re-creating it below.
+		if err := externalMachine.Delete(ctx); err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "Failed to delete not running DockerMachine")
+		}
+	}
+
 	// Create the machine if not existing yet
 	if !externalMachine.Exists() {
 		// NOTE: FailureDomains don't mean much in CAPD since it's all local, but we are setting a label on
diff --git a/test/infrastructure/docker/internal/docker/machine.go b/test/infrastructure/docker/internal/docker/machine.go
index 4d261813c3b2..56f1aabdad9d 100644
--- a/test/infrastructure/docker/internal/docker/machine.go
+++ b/test/infrastructure/docker/internal/docker/machine.go
@@ -158,6 +158,15 @@ func (m *Machine) Exists() bool {
 	return m.container != nil
 }
 
+// IsRunning returns true if the container for this machine is running.
+func (m *Machine) IsRunning() bool {
+	if !m.Exists() {
+		return false
+	}
+
+	return m.container.IsRunning()
+}
+
 // Name returns the name of the machine.
 func (m *Machine) Name() string {
 	return m.machine
@@ -540,6 +549,8 @@ func (m *Machine) Delete(ctx context.Context) error {
 		if err := m.container.Delete(ctx); err != nil {
 			return err
 		}
+
+		m.container = nil
 	}
 	return nil
 }