Skip to content

Commit a6f865d

Browse files
author
Naadir Jeewa
committed
:run: cabpk: Add retries to control plane join
Signed-off-by: Naadir Jeewa <[email protected]>
1 parent 04ca7d1 commit a6f865d

File tree

11 files changed

+466
-9
lines changed

11 files changed

+466
-9
lines changed

Makefile

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,9 @@ CONVERSION_GEN := $(abspath $(TOOLS_BIN_DIR)/conversion-gen)
6060
# Bindata.
6161
GOBINDATA := $(abspath $(TOOLS_BIN_DIR)/go-bindata)
6262
GOBINDATA_CLUSTERCTL_DIR := cmd/clusterctl/config
63+
CLOUDINIT_PKG_DIR := bootstrap/kubeadm/internal/cloudinit
64+
CLOUDINIT_GENERATED := $(CLOUDINIT_PKG_DIR)/zz_generated.bindata.go
65+
CLOUDINIT_SCRIPT := $(CLOUDINIT_PKG_DIR)/kubeadm-bootstrap-script.sh
6366
CERTMANAGER_COMPONENTS_GENERATED_FILE := cert-manager.yaml
6467

6568
# Define Docker related variables. Releases should modify and double check these vars.
@@ -242,7 +245,7 @@ generate-go-kubeadm-control-plane: $(CONTROLLER_GEN) $(CONVERSION_GEN) ## Runs G
242245
paths=./controlplane/kubeadm/api/...
243246

244247
.PHONY: generate-bindata
245-
generate-bindata: $(KUSTOMIZE) $(GOBINDATA) clean-bindata ## Generate code for embedding the clusterctl api manifest
248+
generate-bindata: $(KUSTOMIZE) $(GOBINDATA) clean-bindata $(CLOUDINIT_GENERATED) ## Generate code for embedding the clusterctl api manifest
246249
# Package manifest YAML into a single file.
247250
mkdir -p $(GOBINDATA_CLUSTERCTL_DIR)/manifest/
248251
$(KUSTOMIZE) build $(GOBINDATA_CLUSTERCTL_DIR)/crd > $(GOBINDATA_CLUSTERCTL_DIR)/manifest/clusterctl-api.yaml
@@ -255,6 +258,11 @@ generate-bindata: $(KUSTOMIZE) $(GOBINDATA) clean-bindata ## Generate code for e
255258
# Cleanup the manifest folder.
256259
$(MAKE) clean-bindata
257260

261+
# Generate Go source that embeds the kubeadm bootstrap script in the cloudinit
# package. go-bindata writes the raw asset code to a scratch file, the
# generated-code boilerplate header is prepended, and the assembled result is
# moved onto the target only once it is complete, so a failed recipe never
# leaves a truncated target that looks up to date.
$(CLOUDINIT_GENERATED): $(GOBINDATA) $(CLOUDINIT_SCRIPT)
	$(GOBINDATA) -mode=420 -modtime=1 -pkg=cloudinit -o=$@.tmp $(CLOUDINIT_SCRIPT)
	cat ./hack/boilerplate/boilerplate.generatego.txt $@.tmp > $@.new
	mv -f $@.new $@
	rm -f $@.tmp
265+
258266
.PHONY: generate-manifests
259267
generate-manifests: ## Generate manifests e.g. CRD, RBAC etc.
260268
$(MAKE) generate-core-manifests

bootstrap/kubeadm/api/v1alpha2/conversion.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ func (src *KubeadmConfig) ConvertTo(dstRaw conversion.Hub) error {
3838

3939
dst.Status.DataSecretName = restored.Status.DataSecretName
4040
dst.Spec.Verbosity = restored.Spec.Verbosity
41+
dst.Spec.UseExperimentalRetryJoin = restored.Spec.UseExperimentalRetryJoin
4142

4243
return nil
4344
}
@@ -119,7 +120,6 @@ func Convert_v1alpha3_KubeadmConfigStatus_To_v1alpha2_KubeadmConfigStatus(in *ku
119120
return nil
120121
}
121122

122-
123123
// Convert_v1alpha2_KubeadmConfigSpec_To_v1alpha3_KubeadmConfigSpec converts this KubeadmConfigSpec to the Hub version (v1alpha3).
124124
func Convert_v1alpha2_KubeadmConfigSpec_To_v1alpha3_KubeadmConfigSpec(in *KubeadmConfigSpec, out *kubeadmbootstrapv1alpha3.KubeadmConfigSpec, s apiconversion.Scope) error {
125125
return autoConvert_v1alpha2_KubeadmConfigSpec_To_v1alpha3_KubeadmConfigSpec(in, out, s)

bootstrap/kubeadm/api/v1alpha2/zz_generated.conversion.go

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bootstrap/kubeadm/api/v1alpha3/kubeadmbootstrapconfig_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ type KubeadmConfigSpec struct {
7373
// It overrides the `--v` flag in kubeadm commands.
7474
// +optional
7575
Verbosity *int32 `json:"verbosity,omitempty"`
76+
77+
// UseExperimentalRetryJoin replaces a basic kubeadm command with a shell
78+
// script with retries for control plane joins
79+
// +optional
80+
UseExperimentalRetryJoin bool `json:"useExperimentalRetryJoin,omitempty"`
7681
}
7782

7883
// KubeadmConfigStatus defines the observed state of KubeadmConfig

bootstrap/kubeadm/config/crd/bases/bootstrap.cluster.x-k8s.io_kubeadmconfigs.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1554,6 +1554,10 @@ spec:
15541554
items:
15551555
type: string
15561556
type: array
1557+
useExperimentalRetryJoin:
1558+
description: UseExperimentalRetryJoin replaces a basic kubeadm command
1559+
with a shell script with retries for control plane joins
1560+
type: boolean
15571561
users:
15581562
description: Users specifies extra users to add
15591563
items:

bootstrap/kubeadm/config/crd/bases/bootstrap.cluster.x-k8s.io_kubeadmconfigtemplates.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1633,6 +1633,11 @@ spec:
16331633
items:
16341634
type: string
16351635
type: array
1636+
useExperimentalRetryJoin:
1637+
description: UseExperimentalRetryJoin replaces a basic kubeadm
1638+
command with a shell script with retries for control plane
1639+
joins
1640+
type: boolean
16361641
users:
16371642
description: Users specifies extra users to add
16381643
items:

bootstrap/kubeadm/controllers/kubeadmconfig_controller.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -489,8 +489,9 @@ func (r *KubeadmConfigReconciler) joinControlplane(ctx context.Context, scope *S
489489
}
490490

491491
cloudJoinData, err := cloudinit.NewJoinControlPlane(&cloudinit.ControlPlaneJoinInput{
492-
JoinConfiguration: joinData,
493-
Certificates: certificates,
492+
JoinConfiguration: joinData,
493+
Certificates: certificates,
494+
UseExperimentalRetryJoin: scope.Config.Spec.UseExperimentalRetryJoin,
494495
BaseUserData: cloudinit.BaseUserData{
495496
AdditionalFiles: scope.Config.Spec.Files,
496497
NTP: scope.Config.Spec.NTP,

bootstrap/kubeadm/internal/cloudinit/controlplane_join.go

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,16 @@ limitations under the License.
1717
package cloudinit
1818

1919
import (
20-
"github.com/pkg/errors"
20+
"fmt"
2121

22+
"github.com/pkg/errors"
23+
bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1alpha3"
2224
"sigs.k8s.io/cluster-api/util/secret"
2325
)
2426

2527
const (
28+
standardJoinCommand = "kubeadm join --config /tmp/kubeadm-controlplane-join-config.yaml %s"
29+
retriableJoinCommand = "/usr/local/bin/kubeadm-bootstrap-script"
2630
controlPlaneJoinCloudInit = `{{.Header}}
2731
{{template "files" .WriteFiles}}
2832
- path: /tmp/kubeadm-controlplane-join-config.yaml
@@ -32,7 +36,7 @@ const (
3236
{{.JoinConfiguration | Indent 6}}
3337
runcmd:
3438
{{- template "commands" .PreKubeadmCommands }}
35-
- 'kubeadm join --config /tmp/kubeadm-controlplane-join-config.yaml {{.KubeadmVerbosity}}'
39+
- {{ .KubeadmCommand }}
3640
{{- template "commands" .PostKubeadmCommands }}
3741
{{- template "ntp" .NTP }}
3842
{{- template "users" .Users }}
@@ -43,9 +47,10 @@ runcmd:
4347
type ControlPlaneJoinInput struct {
4448
BaseUserData
4549
secret.Certificates
46-
47-
BootstrapToken string
48-
JoinConfiguration string
50+
UseExperimentalRetryJoin bool
51+
KubeadmCommand string
52+
BootstrapToken string
53+
JoinConfiguration string
4954
}
5055

5156
// NewJoinControlPlane returns the user data string to be used on a new control plane instance.
@@ -54,10 +59,37 @@ func NewJoinControlPlane(input *ControlPlaneJoinInput) ([]byte, error) {
5459
// TODO: Consider validating that the correct certificates exist. It is different for external/stacked etcd
5560
input.WriteFiles = input.Certificates.AsFiles()
5661
input.WriteFiles = append(input.WriteFiles, input.AdditionalFiles...)
62+
input.KubeadmCommand = fmt.Sprintf(standardJoinCommand, input.KubeadmVerbosity)
63+
if input.UseExperimentalRetryJoin {
64+
err := input.useBootstrapScript()
65+
if err != nil {
66+
return nil, err
67+
}
68+
}
5769
userData, err := generate("JoinControlplane", controlPlaneJoinCloudInit, input)
5870
if err != nil {
5971
return nil, errors.Wrapf(err, "failed to generate user data for machine joining control plane")
6072
}
6173

6274
return userData, err
6375
}
76+
77+
func (input *ControlPlaneJoinInput) useBootstrapScript() error {
78+
scriptBytes, err := bootstrapKubeadmInternalCloudinitKubeadmBootstrapScriptShBytes()
79+
if err != nil {
80+
return errors.Wrap(err, "couldn't read bootstrap script")
81+
}
82+
joinScript, err := generate("JoinControlplaneScript", string(scriptBytes), input)
83+
if err != nil {
84+
return errors.Wrap(err, "failed to generate user data for machine joining control plane")
85+
}
86+
joinScriptFile := bootstrapv1.File{
87+
Path: "/usr/local/bin/kubeadm-bootstrap-script",
88+
Owner: "root",
89+
Permissions: "0755",
90+
Content: string(joinScript),
91+
}
92+
input.WriteFiles = append(input.WriteFiles, joinScriptFile)
93+
input.KubeadmCommand = retriableJoinCommand
94+
return nil
95+
}
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#!/bin/bash
2+
# Copyright 2020 The Kubernetes Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
# Log an error, run the kubeadm reset steps, and exit.
# Args:
#   $1 Message to log with the error
#   $2 The error code to return (optional, defaults to 1)
log::error_exit() {
  local message="${1}"
  # Callers may omit the code; fall back to a generic failure status rather
  # than handing an empty string to `exit`.
  local code="${2:-1}"

  log::error "${message}"
  # Each reset step is best-effort: a failing step must not stop the next one.
  log::info "Removing member from cluster status"
  kubeadm reset -f update-cluster-status || true
  log::info "Removing etcd member"
  kubeadm reset -f remove-etcd-member || true
  log::info "Resetting kubeadm"
  kubeadm reset -f || true
  log::error "cluster.x-k8s.io kubeadm bootstrap script $0 exiting with status ${code}"
  exit "${code}"
}
34+
35+
# Log that the bootstrap script finished, then exit with status 0.
log::success_exit() {
  local -r done_message="cluster.x-k8s.io kubeadm bootstrap script $0 finished"
  log::info "${done_message}"
  exit 0
}
39+
40+
# Log an error but keep going.
# The first argument is printed on stderr behind a "!!!" marker and an
# ISO-8601 timestamp; each remaining argument is printed on its own
# continuation line, also on stderr.
# Args:
#   $1  Headline message
#   $2+ Optional continuation lines
log::error() {
  # Keep the helper variables local so they do not leak into the caller's scope.
  local timestamp message
  timestamp=$(date --iso-8601=seconds)
  echo "!!! [${timestamp}] ${1}" >&2
  shift
  for message; do
    echo " ${message}" >&2
  done
}
50+
51+
# Print a status line. Formatted to show up in a stream of output.
# The first argument is printed on stdout behind a "+++" marker and an
# ISO-8601 timestamp; each remaining argument is printed on its own
# continuation line.
# Args:
#   $1  Headline message
#   $2+ Optional continuation lines
log::info() {
  # Keep the helper variables local so they do not leak into the caller's scope.
  local timestamp message
  timestamp=$(date --iso-8601=seconds)
  echo "+++ [${timestamp}] ${1}"
  shift
  for message; do
    echo " ${message}"
  done
}
60+
61+
# Log the outcome of a kubeadm invocation based on its exit status.
# Status 3 is reported through log::error_exit, which terminates the script;
# every other status is only logged and control returns to the caller.
# Args:
#   $1 Description of the command that was run (used in the log messages)
#   $2 The exit status the command returned
check_kubeadm_command() {
  local command="${1}"
  local code="${2}"
  case "${code}" in
    "0")
      log::info "kubeadm reported successful execution for ${command}"
      ;;
    "1")
      log::error "kubeadm reported failed action(s) for ${command}"
      ;;
    "2")
      log::error "kubeadm reported preflight check error during ${command}"
      ;;
    "3")
      # Pass the status along so the script exits with kubeadm's own code
      # instead of an empty one.
      log::error_exit "kubeadm reported validation error for ${command}" "${code}"
      ;;
    *)
      log::error "kubeadm reported unknown error ${code} for ${command}"
      ;;
  esac
}
82+
83+
# Run a kubeadm command, retrying when it fails.
# The join config path and the templated verbosity flag are appended to the
# given command. At most 5 attempts are made, with a 15 second pause after
# each failed attempt. The loop stops early on exit status 0 or 2. If the last
# status is non-zero the script exits through log::error_exit with that status.
# NOTE(review): status 2 (preflight error) leaves the loop without retrying but
# still reaches the error exit below, so "allow preflight errors to pass" only
# skips the retries — confirm whether that is the intended behaviour.
# Args:
#   $@ The kubeadm command to run
function retry-command() {
  # Local so the attempt counter does not leak into the rest of the script.
  local attempt=0
  local kubeadm_return
  until [ "${attempt}" -ge 5 ]; do
    log::info "running '$*'"
    # shellcheck disable=SC1083
    "$@" --config /tmp/kubeadm-controlplane-join-config.yaml {{.KubeadmVerbosity}}
    kubeadm_return=$?
    check_kubeadm_command "'$*'" "${kubeadm_return}"
    if [ "${kubeadm_return}" -eq 0 ]; then
      break
    fi
    # We allow preflight errors to pass
    if [ "${kubeadm_return}" -eq 2 ]; then
      break
    fi
    attempt=$((attempt + 1))
    sleep 15
  done
  if [ "${kubeadm_return}" -ne 0 ]; then
    # Exit with kubeadm's own status rather than an empty exit code.
    log::error_exit "too many errors, exiting" "${kubeadm_return}"
  fi
}
106+
107+
# Run a kubeadm command exactly once and abort the script if it fails.
# The join config path and the templated verbosity flag are appended to the
# given command. Any non-zero exit status is logged and then handed to
# log::error_exit, which terminates the script with that status.
# Args:
#   $@ The kubeadm command to run
function try-or-die-command() {
  local kubeadm_return
  log::info "running '$*'"
  # shellcheck disable=SC1083
  "$@" --config /tmp/kubeadm-controlplane-join-config.yaml {{.KubeadmVerbosity}}
  kubeadm_return=$?
  check_kubeadm_command "'$*'" "${kubeadm_return}"
  if [ "${kubeadm_return}" -ne 0 ]; then
    # Exit with kubeadm's own status rather than an empty exit code.
    log::error_exit "fatal error, exiting" "${kubeadm_return}"
  fi
}
118+
119+
# Join the control plane one kubeadm phase at a time, in a fixed order.
retry-command kubeadm join phase preflight
for prepare_phase in download-certs certs kubeconfig control-plane; do
  retry-command kubeadm join phase control-plane-prepare "${prepare_phase}"
done
retry-command kubeadm join phase kubelet-start
# The etcd join is attempted once only; any failure ends the script.
try-or-die-command kubeadm join phase control-plane-join etcd
for join_phase in update-status mark-control-plane; do
  retry-command kubeadm join phase control-plane-join "${join_phase}"
done

log::success_exit

0 commit comments

Comments
 (0)