diff --git a/cmd/node-termination-handler.go b/cmd/node-termination-handler.go index 26b62020..e631df77 100644 --- a/cmd/node-termination-handler.go +++ b/cmd/node-termination-handler.go @@ -93,6 +93,12 @@ func main() { log.Fatal().Err(err).Msg("Unable to instantiate observability metrics,") } + err = observability.InitProbes(nthConfig.EnableProbes, nthConfig.ProbesPort, nthConfig.ProbesEndpoint) + if err != nil { + nthConfig.Print() + log.Fatal().Err(err).Msg("Unable to instantiate probes service,") + } + imds := ec2metadata.New(nthConfig.MetadataURL, nthConfig.MetadataTries) interruptionEventStore := interruptioneventstore.New(nthConfig) diff --git a/config/helm/aws-node-termination-handler/README.md b/config/helm/aws-node-termination-handler/README.md index e2e3f704..d5d9d8f1 100644 --- a/config/helm/aws-node-termination-handler/README.md +++ b/config/helm/aws-node-termination-handler/README.md @@ -74,6 +74,9 @@ Parameter | Description | Default `logLevel` | Sets the log level (INFO, DEBUG, or ERROR) | `INFO` `enablePrometheusServer` | If true, start an http server exposing `/metrics` endpoint for prometheus. | `false` `prometheusServerPort` | Replaces the default HTTP port for exposing prometheus metrics. | `9092` +`enableProbesServer` |If true, start an http server exposing `/healthz` endpoint for probes. | `false` +`probesServerPort` | Replaces the default HTTP port for exposing probes endpoint. | `8080` +`probesServerEndpoint` | Replaces the default endpoint for exposing probes endpoint. 
| `/healthz` `podMonitor.create` | if `true`, create a PodMonitor | `false` `podMonitor.interval` | Prometheus scrape interval | `30s` `podMonitor.sampleLimit` | Number of scraped samples accepted | `5000` diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml index 1b3b08f3..c0e78d8b 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.linux.yaml @@ -166,6 +166,12 @@ spec: value: {{ .Values.enablePrometheusServer | quote }} - name: PROMETHEUS_SERVER_PORT value: {{ .Values.prometheusServerPort | quote }} + - name: ENABLE_PROBES_SERVER + value: {{ .Values.enableProbesServer | quote }} + - name: PROBES_SERVER_PORT + value: {{ .Values.probesServerPort | quote }} + - name: PROBES_SERVER_ENDPOINT + value: {{ .Values.probesServerEndpoint | quote }} resources: {{- toYaml .Values.resources | nindent 12 }} {{- if .Values.enablePrometheusServer }} @@ -175,6 +181,13 @@ spec: name: http-metrics protocol: TCP {{- end }} + {{- if .Values.enableProbesServer }} + ports: + - containerPort: {{ .Values.probesServerPort }} + hostPort: {{ .Values.probesServerPort }} + name: liveness-probe + protocol: TCP + {{- end }} nodeSelector: {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . 
}}: linux {{- with .Values.nodeSelector }} diff --git a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml index 5f3d71dd..cabc2659 100644 --- a/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml +++ b/config/helm/aws-node-termination-handler/templates/daemonset.windows.yaml @@ -140,6 +140,12 @@ spec: value: {{ .Values.enablePrometheusServer | quote }} - name: PROMETHEUS_SERVER_PORT value: {{ .Values.prometheusServerPort | quote }} + - name: ENABLE_PROBES_SERVER + value: {{ .Values.enableProbesServer | quote }} + - name: PROBES_SERVER_PORT + value: {{ .Values.probesServerPort | quote }} + - name: PROBES_SERVER_ENDPOINT + value: {{ .Values.probesServerEndpoint | quote }} resources: {{- toYaml .Values.resources | nindent 12 }} {{- if .Values.enablePrometheusServer }} @@ -149,6 +155,13 @@ spec: name: http-metrics protocol: TCP {{- end }} + {{- if .Values.enableProbesServer }} + ports: + - containerPort: {{ .Values.probesServerPort }} + hostPort: {{ .Values.probesServerPort }} + name: liveness-probe + protocol: TCP + {{- end }} nodeSelector: {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . 
}}: windows {{- with .Values.nodeSelector }} diff --git a/config/helm/aws-node-termination-handler/templates/deployment.yaml b/config/helm/aws-node-termination-handler/templates/deployment.yaml index f183a86a..9b11b05b 100644 --- a/config/helm/aws-node-termination-handler/templates/deployment.yaml +++ b/config/helm/aws-node-termination-handler/templates/deployment.yaml @@ -118,6 +118,8 @@ spec: value: {{ .Values.webhookProxy | quote }} - name: ENABLE_PROMETHEUS_SERVER value: {{ .Values.enablePrometheusServer | quote }} + - name: ENABLE_PROBES_SERVER + value: {{ .Values.enableProbesServer | quote }} - name: ENABLE_SPOT_INTERRUPTION_DRAINING value: "false" - name: ENABLE_SCHEDULED_EVENT_DRAINING @@ -130,6 +132,10 @@ spec: value: {{ .Values.queueURL | quote }} - name: PROMETHEUS_SERVER_PORT value: {{ .Values.prometheusServerPort | quote }} + - name: PROBES_SERVER_PORT + value: {{ .Values.probesServerPort | quote }} + - name: PROBES_SERVER_ENDPOINT + value: {{ .Values.probesServerEndpoint | quote }} - name: AWS_REGION value: {{ .Values.awsRegion | quote }} - name: AWS_ENDPOINT @@ -155,6 +161,13 @@ spec: name: http-metrics protocol: TCP {{- end }} + {{- if .Values.enableProbesServer }} + ports: + - containerPort: {{ .Values.probesServerPort }} + hostPort: {{ .Values.probesServerPort }} + name: liveness-probe + protocol: TCP + {{- end }} nodeSelector: {{ include "aws-node-termination-handler.nodeSelectorTermsOs" . }}: linux {{- with .Values.nodeSelector }} diff --git a/config/helm/aws-node-termination-handler/values.yaml b/config/helm/aws-node-termination-handler/values.yaml index 67138bba..84186fed 100644 --- a/config/helm/aws-node-termination-handler/values.yaml +++ b/config/helm/aws-node-termination-handler/values.yaml @@ -25,6 +25,14 @@ podLabels: {} linuxPodLabels: {} windowsPodLabels: {} +# liveness probe settings. 
+probes: + httpGet: + path: /healthz + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 5 + resources: requests: memory: "64Mi" @@ -144,6 +152,10 @@ nodeSelectorTermsArch: "" enablePrometheusServer: false prometheusServerPort: 9092 +enableProbesServer: false +probesServerPort: 8080 +probesServerEndpoint: "/healthz" + tolerations: - operator: "Exists" diff --git a/pkg/config/config.go b/pkg/config/config.go index 8f9e7b79..c19246ca 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -78,6 +78,13 @@ const ( // https://github.com/prometheus/prometheus/wiki/Default-port-allocations prometheusPortDefault = 9092 prometheusPortConfigKey = "PROMETHEUS_SERVER_PORT" + // probes + enableProbesDefault = false + enableProbesConfigKey = "ENABLE_PROBES_SERVER" + probesPortDefault = 8080 + probesPortConfigKey = "PROBES_SERVER_PORT" + probesEndpointDefault = "/healthz" + probesEndpointConfigKey = "PROBES_SERVER_ENDPOINT" region = "" awsRegionConfigKey = "AWS_REGION" awsEndpointConfigKey = "AWS_ENDPOINT" @@ -115,6 +122,9 @@ type Config struct { UptimeFromFile string EnablePrometheus bool PrometheusPort int + EnableProbes bool + ProbesPort int + ProbesEndpoint string AWSRegion string AWSEndpoint string QueueURL string @@ -162,6 +172,9 @@ func ParseCliArgs() (config Config, err error) { flag.StringVar(&config.UptimeFromFile, "uptime-from-file", getEnv(uptimeFromFileConfigKey, uptimeFromFileDefault), "If specified, read system uptime from the file path (useful for testing).") flag.BoolVar(&config.EnablePrometheus, "enable-prometheus-server", getBoolEnv(enablePrometheusConfigKey, enablePrometheusDefault), "If true, a http server is used for exposing prometheus metrics in /metrics endpoint.") flag.IntVar(&config.PrometheusPort, "prometheus-server-port", getIntEnv(prometheusPortConfigKey, prometheusPortDefault), "The port for running the prometheus http server.") + flag.BoolVar(&config.EnableProbes, "enable-probes-server", getBoolEnv(enableProbesConfigKey, 
enableProbesDefault), "If true, a http server is used for exposing probes in /healthz endpoint.") + flag.IntVar(&config.ProbesPort, "probes-server-port", getIntEnv(probesPortConfigKey, probesPortDefault), "The port for running the probes http server.") + flag.StringVar(&config.ProbesEndpoint, "probes-server-endpoint", getEnv(probesEndpointConfigKey, probesEndpointDefault), "If specified, use this endpoint to make liveness probe") flag.StringVar(&config.AWSRegion, "aws-region", getEnv(awsRegionConfigKey, ""), "If specified, use the AWS region for AWS API calls") flag.StringVar(&config.AWSEndpoint, "aws-endpoint", getEnv(awsEndpointConfigKey, ""), "[testing] If specified, use the AWS endpoint to make API calls") flag.StringVar(&config.QueueURL, "queue-url", getEnv(queueURLConfigKey, ""), "Listens for messages on the specified SQS queue URL") diff --git a/pkg/observability/probes.go b/pkg/observability/probes.go new file mode 100644 index 00000000..887f3ec0 --- /dev/null +++ b/pkg/observability/probes.go @@ -0,0 +1,41 @@ +package observability + +import ( + "net" + "net/http" + "strconv" + "time" + + "github.com/rs/zerolog/log" +) + +// InitProbes will initialize, register and expose, via http server, the probes. 
+//
+// When enabled is false it does nothing and returns nil. Otherwise it registers
+// livenessHandler for endpoint on a dedicated ServeMux and starts an HTTP
+// server listening on port in a background goroutine. Failures of that server
+// are logged by the goroutine rather than returned, so the returned error is
+// currently always nil.
+func InitProbes(enabled bool, port int, endpoint string) error {
+	if !enabled {
+		return nil
+	}
+
+	// Use a dedicated mux instead of http.DefaultServeMux: a Server with a nil
+	// Handler serves the default mux, which would expose every handler
+	// registered on it elsewhere in the process on the probes port, and
+	// registering a pattern that already exists there would panic.
+	mux := http.NewServeMux()
+	mux.HandleFunc(endpoint, livenessHandler)
+
+	probes := &http.Server{
+		Addr:         net.JoinHostPort("", strconv.Itoa(port)),
+		Handler:      mux,
+		ReadTimeout:  1 * time.Second,
+		WriteTimeout: 1 * time.Second,
+	}
+
+	// Starts HTTP server exposing the probes path
+	go func() {
+		log.Info().Msgf("Starting to serve handler %s, port %d", endpoint, port)
+		if err := probes.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+			log.Err(err).Msg("Failed to listen and serve http server")
+		}
+	}()
+
+	return nil
+}
+
+// livenessHandler answers every request with HTTP 200, a JSON content type and
+// the body {"health":"OK"}. A failure to write the body is logged, since the
+// status line has already been sent and cannot be changed at that point.
+func livenessHandler(w http.ResponseWriter, r *http.Request) {
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(http.StatusOK)
+	if _, err := w.Write([]byte(`{"health":"OK"}`)); err != nil {
+		log.Err(err).Msg("Failed to write liveness probe response")
+	}
+}
diff --git a/pkg/observability/probes_test.go b/pkg/observability/probes_test.go
new file mode 100644
index 00000000..c4c69c37
--- /dev/null
+++ b/pkg/observability/probes_test.go
@@ -0,0 +1,34 @@
+package observability
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+// TestLivenessHandler checks that livenessHandler replies to a GET request
+// with a JSON content type, a 200 status code and the expected health payload.
+func TestLivenessHandler(t *testing.T) {
+	const wantBody = `{"health":"OK"}`
+
+	req := httptest.NewRequest(http.MethodGet, "/healthz", nil)
+	rr := httptest.NewRecorder()
+	handler := http.HandlerFunc(livenessHandler)
+
+	handler.ServeHTTP(rr, req)
+
+	if contentType := rr.Header().Get("Content-Type"); contentType != "application/json" {
+		t.Errorf("handler returned wrong content type: got %v want %v",
+			contentType, "application/json")
+	}
+
+	if status := rr.Code; status != http.StatusOK {
+		t.Errorf("handler returned wrong status code: got %v want %v",
+			status, http.StatusOK)
+	}
+
+	if body := rr.Body.String(); body != wantBody {
+		t.Errorf("handler returned wrong body: got %v want %v",
+			body, wantBody)
+	}
+}