Skip to content

Commit cc51348

Browse files
author
Naadir Jeewa
committed
:run: cabpk: Add retries to control plane join
Signed-off-by: Naadir Jeewa <[email protected]>
1 parent 04ca7d1 commit cc51348

File tree

2 files changed

+144
-3
lines changed

2 files changed

+144
-3
lines changed

bootstrap/kubeadm/internal/cloudinit/controlplane_join.go

Lines changed: 139 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,136 @@ package cloudinit
1818

1919
import (
2020
"github.com/pkg/errors"
21-
21+
bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1alpha3"
2222
"sigs.k8s.io/cluster-api/util/secret"
2323
)
2424

2525
const (
26+
controlPlaneJoinScript = `#!/bin/bash
27+
# Copyright 2020 The Kubernetes Authors.
28+
#
29+
# Licensed under the Apache License, Version 2.0 (the "License");
30+
# you may not use this file except in compliance with the License.
31+
# You may obtain a copy of the License at
32+
#
33+
# http://www.apache.org/licenses/LICENSE-2.0
34+
#
35+
# Unless required by applicable law or agreed to in writing, software
36+
# distributed under the License is distributed on an "AS IS" BASIS,
37+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
38+
# See the License for the specific language governing permissions and
39+
# limitations under the License.
40+
41+
# Log an error, run best-effort kubeadm reset steps and exit.
# Args:
#   $1 Message to log with the error
#   $2 The error code to return (defaults to 1 when omitted or empty)
log::error_exit() {
  local message="${1}"
  # Some callers do not supply a code; fall back to a generic failure status
  # so that exit always receives a numeric argument.
  local code="${2:-1}"

  log::error "${message}"
  # Every cleanup step below is best-effort: a failing reset must not stop the
  # script from reporting the original error and exiting.
  log::info "Removing member from cluster status"
  kubeadm reset -f update-cluster-status || true
  log::info "Removing etcd member"
  kubeadm reset -f remove-etcd-member || true
  log::info "Resetting kubeadm"
  kubeadm reset -f || true
  log::error "cluster.x-k8s.io kubeadm bootstrap script $0 exiting with status ${code}"
  exit "${code}"
}
59+
60+
# Report that the bootstrap script completed, then terminate with status 0.
log::success_exit() {
  local -r finished_message="cluster.x-k8s.io kubeadm bootstrap script $0 finished"
  log::info "${finished_message}"
  exit 0
}
64+
65+
# Log an error but keep going.
# Args:
#   $1 Headline, written to stderr prefixed with "!!!" and an ISO-8601 timestamp
#   $@ Any further arguments are written to stderr as extra detail lines
log::error() {
  # Keep the scratch variables local so they cannot clobber a caller's
  # variables of the same name.
  local timestamp detail
  timestamp=$(date --iso-8601=seconds)
  echo "!!! [${timestamp}] ${1}" >&2
  shift
  for detail; do
    echo " ${detail}" >&2
  done
}
75+
76+
# Print a status line. Formatted to show up in a stream of output.
# Args:
#   $1 Headline, written to stdout prefixed with "+++" and an ISO-8601 timestamp
#   $@ Any further arguments are written to stdout as extra detail lines
log::info() {
  # Keep the scratch variables local: without this, the loop variable would
  # overwrite a caller's own "message" variable.
  local timestamp message
  timestamp=$(date --iso-8601=seconds)
  echo "+++ [${timestamp}] ${1}"
  shift
  for message; do
    echo " ${message}"
  done
}
85+
86+
# Log the outcome of a kubeadm invocation according to its exit status.
# A validation error (status 3) is fatal and ends the script; every other
# status is only logged and control returns to the caller.
# Args:
#   $1 Description of the command that was run
#   $2 The exit status returned by the command
#   $3 The output captured from the command
check_kubeadm_command() {
  local command="${1}"
  local code="${2}"
  local out="${3}"
  # Strip tabs, carriage returns and newlines so the output logs as one line.
  local sanitised="${out//[$'\t\r\n']/}"
  case ${code} in
  "0")
    log::info "kubeadm reported successful execution for ${command}"
    ;;
  "1")
    log::error "kubeadm reported failed action(s) for ${command}"
    log::error "${sanitised}"
    ;;
  "2")
    log::error "kubeadm reported preflight check error during ${command}"
    log::error "${sanitised}"
    ;;
  "3")
    log::error "kubeadm reported validation error for ${command}"
    # Forward the status so the script exits with a numeric code.
    log::error_exit "${sanitised}" "${code}"
    ;;
  *)
    log::error "kubeadm reported unknown error ${code} for ${command}"
    log::error "${sanitised}"
    ;;
  esac
}
113+
114+
# Run a kubeadm command against the control plane join configuration,
# retrying on failure.
# The command is attempted up to 5 times with a 15 second pause after each
# failed attempt. Status 0 and status 2 (preflight error) both stop the
# retries immediately. Any non-zero final status ends the script.
# Args:
#   $@ The kubeadm command and its arguments
function retry-command() {
  # Keep the attempt counter local so it does not leak into the script scope.
  local n=0
  local kubeadm_return
  local out
  until [ "${n}" -ge 5 ]; do
    # Assign separately from the declaration so $? reflects the command itself.
    out=$(
      "$@" --config /tmp/kubeadm-controlplane-join-config.yaml {{.KubeadmVerbosity}}
    )
    kubeadm_return=$?
    check_kubeadm_command "'$*'" "${kubeadm_return}" "${out}"
    if [ "${kubeadm_return}" -eq 0 ]; then
      break
    fi
    # Preflight errors are not retried: stop looping and let the final status
    # check below decide the outcome.
    if [ "${kubeadm_return}" -eq 2 ]; then
      break
    fi
    n=$((n + 1))
    log::info "'$*' failed with status ${kubeadm_return} (attempt ${n} of 5)"
    sleep 15
  done
  if [ "${kubeadm_return}" -ne 0 ]; then
    # Pass the status through so the script exits with a numeric code.
    log::error_exit "too many errors, exiting" "${kubeadm_return}"
  fi
}
139+
140+
# Run the control plane join one kubeadm phase at a time, in this order, so
# that every phase goes through retry-command on its own.
for phase in \
  "preflight" \
  "control-plane-prepare download-certs" \
  "control-plane-prepare certs" \
  "control-plane-prepare kubeconfig" \
  "kubelet-start" \
  "control-plane-join etcd" \
  "control-plane-join update-status" \
  "control-plane-join mark-control-plane"; do
  # Intentionally unquoted: two-word entries must split into separate arguments.
  # shellcheck disable=SC2086
  retry-command kubeadm join phase ${phase}
done

log::success_exit
150+
`
26151
controlPlaneJoinCloudInit = `{{.Header}}
27152
{{template "files" .WriteFiles}}
28153
- path: /tmp/kubeadm-controlplane-join-config.yaml
@@ -32,7 +157,7 @@ const (
32157
{{.JoinConfiguration | Indent 6}}
33158
runcmd:
34159
{{- template "commands" .PreKubeadmCommands }}
35-
- 'kubeadm join --config /tmp/kubeadm-controlplane-join-config.yaml {{.KubeadmVerbosity}}'
160+
- '/usr/local/bin/kubeadm-bootstrap-script'
36161
{{- template "commands" .PostKubeadmCommands }}
37162
{{- template "ntp" .NTP }}
38163
{{- template "users" .Users }}
@@ -54,10 +179,21 @@ func NewJoinControlPlane(input *ControlPlaneJoinInput) ([]byte, error) {
54179
// TODO: Consider validating that the correct certificates exist. It is different for external/stacked etcd
55180
input.WriteFiles = input.Certificates.AsFiles()
56181
input.WriteFiles = append(input.WriteFiles, input.AdditionalFiles...)
182+
joinScript, err := generate("JoinControlplaneScript", controlPlaneJoinScript, input)
183+
if err != nil {
184+
return nil, errors.Wrapf(err, "failed to generate user data for machine joining control plane")
185+
}
186+
joinScriptFile := bootstrapv1.File{
187+
Path: "/usr/local/bin/kubeadm-bootstrap-script",
188+
Owner: "root",
189+
Permissions: "0755",
190+
Content: string(joinScript),
191+
}
192+
input.WriteFiles = append(input.WriteFiles, input.AdditionalFiles...)
193+
input.WriteFiles = append(input.WriteFiles, joinScriptFile)
57194
userData, err := generate("JoinControlplane", controlPlaneJoinCloudInit, input)
58195
if err != nil {
59196
return nil, errors.Wrapf(err, "failed to generate user data for machine joining control plane")
60197
}
61-
62198
return userData, err
63199
}

feature/feature.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ const (
3131
// owner: @
3232
// alpha: v0.3
3333
MachinePool featuregate.Feature = "MachinePool"
34+
35+
// owner: @randomvariable
36+
// alpha: v0.3
37+
RetriableBootstrap featuregate.Feature = "RetriableBootstrap"
3438
)
3539

3640
func init() {
@@ -42,4 +46,5 @@ func init() {
4246
var defaultClusterAPIFeatureGates = map[featuregate.Feature]featuregate.FeatureSpec{
4347
// Every feature should be initiated here:
4448
MachinePool: {Default: false, PreRelease: featuregate.Alpha},
49+
RetriableBootstrap: {Default: false, PreRelease: featuregate.Alpha}
4550
}

0 commit comments

Comments
 (0)