From 21939764410f980084536205599f5b98113e3141 Mon Sep 17 00:00:00 2001
From: Vladimir Videlov
Date: Wed, 17 Sep 2025 10:36:13 +0200
Subject: [PATCH] Add alert for when Argora pod not ready for 5 minutes

---
Reviewer notes (this region is ignored by git am):

  * Adds the ArgoraPodNotReadyError rule. The rule is rendered unless
    .Values.prometheusRules.disabled.ArgoraPodNotReadyError is set to true.
  * The expression selects kube_pod_status_ready series with
    condition="true" whose value is 0, for pods matching
    argora-controller-manager-.+ in namespace shoot-control-plane.
  * "for" and "severity" are looked up in .Values.prometheusRules via dig,
    falling back to "5m" and "warning".
  * The description annotation says "5 minutes" literally; it does not
    follow an overridden "for" value.
  * NOTE(review): the leading whitespace of the YAML lines below was
    reconstructed; confirm it matches the target file before applying
    (or apply with --ignore-whitespace).

 .../alerts/controlplane-remote.yaml                | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/charts/controlplane-operations/alerts/controlplane-remote.yaml b/charts/controlplane-operations/alerts/controlplane-remote.yaml
index 12dca6d..1242541 100644
--- a/charts/controlplane-operations/alerts/controlplane-remote.yaml
+++ b/charts/controlplane-operations/alerts/controlplane-remote.yaml
@@ -15,3 +15,17 @@ groups:
       description: "Argora Update CR status is in Error state for more than 1 minute."
       summary: "Update CR in Error state."
 {{- end }}
+
+{{- if not (.Values.prometheusRules.disabled.ArgoraPodNotReadyError | default false) }}
+  - alert: ArgoraPodNotReadyError
+    expr: >
+      kube_pod_status_ready{namespace="shoot-control-plane",pod=~"argora-controller-manager-.+",condition="true"} == 0
+    for: {{ dig "ArgoraPodNotReadyError" "for" "5m" .Values.prometheusRules }}
+    labels:
+      severity: {{ dig "ArgoraPodNotReadyError" "severity" "warning" .Values.prometheusRules }}
+      playbook: https://github.com/cobaltcore-dev/controlplane-operations/playbooks/ArgoraPodNotReadyError.md
+      {{ include "controlplane-operations.additionalRuleLabels" . }}
+    annotations:
+      description: "Argora Pod is not ready for more than 5 minutes."
+      summary: "Pod not ready."
+{{- end }}