diff --git a/cluster-install/README.md b/cluster-install/README.md
new file mode 100644
index 0000000..bc7dd20
--- /dev/null
+++ b/cluster-install/README.md
@@ -0,0 +1,15 @@
+# Install a new node
+1. Install the `gcloud` CLI
+2. Run `00_create_machine.sh` from your local environment
+3. Copy the install scripts to the new machine:
+
+`gcloud compute scp ./cluster-install/install-scripts/*.sh rabbitmq-1:~/`
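+
+Then SSH into the machine and run the install scripts (shown for an instance
+named `rabbitmq-1`; adjust the instance name and the cookie to your setup):
+
+```sh
+gcloud compute ssh rabbitmq-1
+sudo ./01_setup.sh                                        # install the latest RabbitMQ 3.x
+sudo ./02_setup_cluster.sh "rabbit@rabbitmq-1" "<cookie>" # optional: join an existing cluster
+```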
"-------------------" + echo "Installing latest version of rabbitmq" + echo "-------------------" + sudo apt-get install rabbitmq-server -y --fix-missing +else + latest_patch=$(apt list -a rabbitmq-server 2>/dev/null | grep -oP "3.12.\K(\d{1,2}-\d{1,2})" | sort -V | tail -n 1) + if [ -z "$latest_patch" ]; then + echo "...could not find any version of minor $1, aborting" + exit 1 + fi + echo "-------------------" + echo "Installing version 3.$1.$latest_patch of rabbitmq" + echo "-------------------" + sudo apt-get install rabbitmq-server=3.$1.$latest_patch -y --fix-missing +fi + +echo "---------------------------" +echo "Enabling rabbitmq at startup" +echo "---------------------------" +#start rabbitmq and make it run at startup +systemctl enable rabbitmq-server +sleep 30 + +echo "--------------------------" +echo "Enabling management plugin" +echo "--------------------------" +#enable management plugin +rabbitmq-plugins enable rabbitmq_management +#metrics for prometheus +rabbitmq-plugins enable rabbitmq_prometheus + +echo "----------------------------------------------" +echo "Enabling feature flag detailed_queues_endpoint" +echo "----------------------------------------------" +rabbitmqctl enable_feature_flag detailed_queues_endpoint + +echo "-------------------" +echo "Adding default user" +echo "-------------------" +rabbitmqctl add_user rabbit rabbit +rabbitmqctl set_user_tags rabbit administrator +rabbitmqctl set_permissions -p / rabbit ".*" ".*" ".*" diff --git a/cluster-install/install-scripts/02_setup_cluster.sh b/cluster-install/install-scripts/02_setup_cluster.sh new file mode 100755 index 0000000..06fd5e1 --- /dev/null +++ b/cluster-install/install-scripts/02_setup_cluster.sh @@ -0,0 +1,31 @@ +#!/bin/bash +#First setup the node with setup.sh, then use this script to join a cluster. 
+
+# gcloud compute scp ./cluster-install/install-scripts/*.sh rabbitmq-1:~/
\ No newline at end of file
diff --git a/cluster-install/install-scripts/01_setup.sh b/cluster-install/install-scripts/01_setup.sh
new file mode 100755
index 0000000..fca29ce
--- /dev/null
+++ b/cluster-install/install-scripts/01_setup.sh
@@ -0,0 +1,80 @@
+#!/bin/bash
+#usage
+# $> sudo ./01_setup.sh [minor version of rabbitmq-server 3.x, e.g. "12"]
+#
+echo "---------------------"
+echo "Installing base tools"
+echo "---------------------"
+apt-get update -y
+apt-get install curl gnupg apt-transport-https net-tools -y
+
+echo "----------------------------"
+echo "Adding rabbitmq repositories"
+echo "----------------------------"
+./cloudsmith_repos.sh
+
+echo "-------------------------"
+echo "Indexing new repositories"
+echo "-------------------------"
+## Update package indices
+apt-get update -y
+
+echo "-----------------"
+echo "Installing erlang"
+echo "-----------------"
+## Install Erlang packages
+apt-get install -y erlang-base \
+    erlang-asn1 erlang-crypto erlang-eldap erlang-ftp erlang-inets \
+    erlang-mnesia erlang-os-mon erlang-parsetools erlang-public-key \
+    erlang-runtime-tools erlang-snmp erlang-ssl \
+    erlang-syntax-tools erlang-tftp erlang-tools erlang-xmerl
+
+
+## Install rabbitmq-server and its dependencies
+if [ -z "$1" ]; then
+    echo "-------------------"
+    echo "Installing latest version of rabbitmq"
+    echo "-------------------"
+    apt-get install rabbitmq-server -y --fix-missing
+else
+    latest_patch=$(apt list -a rabbitmq-server 2>/dev/null | grep -oP "3\.$1\.\K(\d{1,2}-\d{1,2})" | sort -V | tail -n 1)
+    if [ -z "$latest_patch" ]; then
+        echo "...could not find any version of minor $1, aborting"
+        exit 1
+    fi
+    echo "-------------------"
+    echo "Installing version 3.$1.$latest_patch of rabbitmq"
+    echo "-------------------"
+    apt-get install rabbitmq-server=3.$1.$latest_patch -y --fix-missing
+fi
+
+echo "----------------------------"
+echo "Enabling rabbitmq at startup"
+echo "----------------------------"
+#make rabbitmq run at startup (the apt package already starts the service)
+systemctl enable rabbitmq-server
+sleep 30
+
+echo "--------------------------"
+echo "Enabling management plugin"
+echo "--------------------------"
+#enable management plugin
+rabbitmq-plugins enable rabbitmq_management
+#metrics for prometheus
+rabbitmq-plugins enable rabbitmq_prometheus
+
+echo "----------------------------------------------"
+echo "Enabling feature flag detailed_queues_endpoint"
+echo "----------------------------------------------"
+rabbitmqctl enable_feature_flag detailed_queues_endpoint
+
+echo "-------------------"
+echo "Adding default user"
+echo "-------------------"
+rabbitmqctl add_user rabbit rabbit
+rabbitmqctl set_user_tags rabbit administrator
+rabbitmqctl set_permissions -p / rabbit ".*" ".*" ".*"
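+
+# optional sanity checks (run manually): the node and the management API should be up
+#   rabbitmqctl status
+#   curl -u rabbit:rabbit http://localhost:15672/api/overview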
diff --git a/cluster-install/install-scripts/02_setup_cluster.sh b/cluster-install/install-scripts/02_setup_cluster.sh
new file mode 100755
index 0000000..06fd5e1
--- /dev/null
+++ b/cluster-install/install-scripts/02_setup_cluster.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+#First set up the node with 01_setup.sh, then use this script to join a cluster.
+#usage
+# $> sudo ./02_setup_cluster.sh "rabbit@the-cluster-node" "the-erlang-cookie"
+#
+SETUP_CLUSTER_HOST=$1
+SETUP_ERLANG_COOKIE=$2
+
+#overwrite the erlang cookie
+./set_erlang_cookie.sh "$SETUP_ERLANG_COOKIE"
+
+
+#prepare the local node to join the cluster
+echo "--------------------------------------------"
+echo "Preparing the local node to join the cluster"
+echo "--------------------------------------------"
+rabbitmqctl stop_app
+rabbitmqctl reset
+
+
+#join operation request
+echo "----------------------------------------------------------"
+echo "Issuing cluster join request to node ${SETUP_CLUSTER_HOST}"
+echo "----------------------------------------------------------"
+rabbitmqctl join_cluster "$SETUP_CLUSTER_HOST"
+
+#start node apps
+echo "------------------------------"
+echo "Starting internal rabbitmq app"
+echo "------------------------------"
+rabbitmqctl start_app
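+
+# optional sanity check: every cluster member should now be listed by
+#   rabbitmqctl cluster_status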
diff --git a/cluster-install/install-scripts/cloudsmith_repos.sh b/cluster-install/install-scripts/cloudsmith_repos.sh
new file mode 100755
index 0000000..e6b346f
--- /dev/null
+++ b/cluster-install/install-scripts/cloudsmith_repos.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+distribution="jammy"
+
+## Team RabbitMQ's main signing key
+curl -1sLf "https://keys.openpgp.org/vks/v1/by-fingerprint/0A9AF2115F4687BD29803A206B73A36E6026DFCA" | gpg --dearmor | tee /usr/share/keyrings/com.rabbitmq.team.gpg > /dev/null
+## Community mirror of Cloudsmith: modern Erlang repository
+curl -1sLf https://github.com/rabbitmq/signing-keys/releases/download/3.0/cloudsmith.rabbitmq-erlang.E495BB49CC4BBE5B.key | gpg --dearmor | tee /usr/share/keyrings/rabbitmq.E495BB49CC4BBE5B.gpg > /dev/null
+## Community mirror of Cloudsmith: RabbitMQ repository
+curl -1sLf https://github.com/rabbitmq/signing-keys/releases/download/3.0/cloudsmith.rabbitmq-server.9F4587F226208342.key | gpg --dearmor | tee /usr/share/keyrings/rabbitmq.9F4587F226208342.gpg > /dev/null
+
+## Add apt repositories maintained by Team RabbitMQ
+## (mirror list follows the standard RabbitMQ apt setup for Ubuntu)
+tee /etc/apt/sources.list.d/rabbitmq.list <<EOF
+## Provides modern Erlang/OTP releases
+deb [arch=amd64 signed-by=/usr/share/keyrings/rabbitmq.E495BB49CC4BBE5B.gpg] https://ppa1.novemberain.com/rabbitmq/rabbitmq-erlang/deb/ubuntu $distribution main
+deb-src [signed-by=/usr/share/keyrings/rabbitmq.E495BB49CC4BBE5B.gpg] https://ppa1.novemberain.com/rabbitmq/rabbitmq-erlang/deb/ubuntu $distribution main
+## another mirror for redundancy
+deb [arch=amd64 signed-by=/usr/share/keyrings/rabbitmq.E495BB49CC4BBE5B.gpg] https://ppa2.novemberain.com/rabbitmq/rabbitmq-erlang/deb/ubuntu $distribution main
+deb-src [signed-by=/usr/share/keyrings/rabbitmq.E495BB49CC4BBE5B.gpg] https://ppa2.novemberain.com/rabbitmq/rabbitmq-erlang/deb/ubuntu $distribution main
+
+## Provides RabbitMQ
+deb [arch=amd64 signed-by=/usr/share/keyrings/rabbitmq.9F4587F226208342.gpg] https://ppa1.novemberain.com/rabbitmq/rabbitmq-server/deb/ubuntu $distribution main
+deb-src [signed-by=/usr/share/keyrings/rabbitmq.9F4587F226208342.gpg] https://ppa1.novemberain.com/rabbitmq/rabbitmq-server/deb/ubuntu $distribution main
+## another mirror for redundancy
+deb [arch=amd64 signed-by=/usr/share/keyrings/rabbitmq.9F4587F226208342.gpg] https://ppa2.novemberain.com/rabbitmq/rabbitmq-server/deb/ubuntu $distribution main
+deb-src [signed-by=/usr/share/keyrings/rabbitmq.9F4587F226208342.gpg] https://ppa2.novemberain.com/rabbitmq/rabbitmq-server/deb/ubuntu $distribution main
+EOF
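+
+## sanity check (optional): after the next apt-get update (run by 01_setup.sh),
+## `apt-cache policy rabbitmq-server` should list the repositories above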
diff --git a/cluster-install/install-scripts/set_erlang_cookie.sh b/cluster-install/install-scripts/set_erlang_cookie.sh
new file mode 100755
--- /dev/null
+++ b/cluster-install/install-scripts/set_erlang_cookie.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#usage
+# $> sudo ./set_erlang_cookie.sh "the-erlang-cookie"
+
+SETUP_ERLANG_COOKIE=$1
+
+if [ ! -z "$1" ]
+then
+  echo "--------------------------------------------------------"
+  echo "setting rabbitmq erlang cookie to ${SETUP_ERLANG_COOKIE}"
+  echo "--------------------------------------------------------"
+  #overwrite the erlang cookie
+  systemctl stop rabbitmq-server
+  chmod 666 /var/lib/rabbitmq/.erlang.cookie
+  echo -n "$SETUP_ERLANG_COOKIE" > /var/lib/rabbitmq/.erlang.cookie
+  chmod 400 /var/lib/rabbitmq/.erlang.cookie
+  systemctl start rabbitmq-server
+  sleep 30
+else
+  echo "-------------------------------------"
+  echo "Skipping rabbitmq erlang cookie setup"
+  echo "-------------------------------------"
+fi
+
+
+
+
diff --git a/docker-compose/alert-manager.yml.tpl b/docker-compose/alert-manager.yml.tpl
new file mode 100644
index 0000000..5255449
--- /dev/null
+++ b/docker-compose/alert-manager.yml.tpl
@@ -0,0 +1,99 @@
+global:
+  smtp_smarthost: 'smtp4dev:25'
+  smtp_from: 'alertmanager@example.org'
+  smtp_require_tls: false
+
+route:
+  group_by: ["group"]
+  receiver: team-mails
+  routes:
+    # routes are matched top-down and the first match wins, so the more
+    # specific critical route must come before the warning|critical one
+    - receiver: team-slack-and-emails
+      # time to wait for grouping alarms when sending the first notification
+      group_wait: 30s
+      # time to wait for grouping new alarms when sending further notifications for the same group
+      group_interval: 1m
+      # should be a multiple of group_interval
+      repeat_interval: 2m
+      matchers:
+        - severity =~ "critical"
+    - receiver: team-mails
+      # time to wait for grouping alarms when sending the first notification
+      group_wait: 30s
+      # time to wait for grouping new alarms when sending further notifications for the same group
+      group_interval: 1m
+      # should be a multiple of group_interval
+      repeat_interval: 2h
+      mute_time_intervals: ["weekday-off-evenings", "weekday-off-mornings", "weekends"]
+      matchers:
+        - severity =~ "warning|critical"
+
+time_intervals:
+  - name: "working-hours"
+    time_intervals:
+      - times:
+          - start_time: "09:00"
+            end_time: "18:00"
+        location: "Europe/Rome"
+        weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
+  - name: "weekday-off-evenings"
+    time_intervals:
+      - times:
+          - start_time: "18:00"
+            end_time: "23:59"
+        location: "Europe/Rome"
+        weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
+  - name: "weekday-off-mornings"
+    time_intervals:
+      - times:
+          - start_time: "00:00"
+            end_time: "08:59"
+        location: "Europe/Rome"
+        weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
+  - name: "weekends"
+    time_intervals:
+      - times:
+          - start_time: "00:00"
+            end_time: "23:59"
+        location: "Europe/Rome"
+        weekdays: ['Saturday', 'Sunday']
+
+receivers:
+  - name: "team-mails"
+    email_configs:
+      - to: "test-receiver@example.com"
+        send_resolved: true
+  - name: "team-slack-and-emails"
+    email_configs:
+      - to: "test-receiver@example.com"
+        send_resolved: true
+    slack_configs:
+      - send_resolved: true
+        channel: "#rabbitmq-management"
+        api_url: "env.SLACK_API_URL"
+        title: |-
+          [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
+          {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
+            {{" "}}(
+            {{- with .CommonLabels.Remove .GroupLabels.Names }}
+              {{- range $index, $label := .SortedPairs -}}
+                {{ if $index }}, {{ end }}
+                {{- $label.Name }}="{{ $label.Value -}}"
+              {{- end }}
+            {{- end -}}
+            )
+          {{- end }}
+        text: >-
+          {{ range .Alerts -}}
+          *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
+
+          *Description:* {{ .Annotations.description }}
+
+          *Details:*
+          {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
+          {{ end }}
+          {{ end }}
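+
+# the rendered config can be validated inside the container with:
+#   docker compose exec alert-manager amtool check-config /etc/alertmanager/alert-manager.yml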
diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml
index 3646339..92e79bb 100644
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -1,8 +1,14 @@
-version: '3.9'
+version: "3.9"
+
+networks:
+  backbone1:
+    driver: bridge
+    name: backbone1
 
 services:
   rabbitmq:
-    image: rabbitmq:3.10-management
+    image: rabbitmq:3.12-management
+    container_name: rabbitmq
     hostname: rabbitmq
     ports:
       - 15692:15692
@@ -15,24 +21,18 @@
     environment:
       RABBITMQ_DEFAULT_USER: rabbit
       RABBITMQ_DEFAULT_PASS: rabbit
-
-  federated-rabbitmq:
-    image: rabbitmq:3.10-management
-    hostname: rabbitmq
-    ports:
-      - 25692:15692
-      - 25672:15672
-      - 6672:5672
-    volumes:
-      - type: bind
-        source: ./enabled_plugins
-        target: /etc/rabbitmq/enabled_plugins
-    environment:
-      RABBITMQ_DEFAULT_USER: rabbit
-      RABBITMQ_DEFAULT_PASS: rabbit
+    networks:
+      - backbone1
+    healthcheck:
+      test: rabbitmq-diagnostics -q ping
+      interval: 30s
+      timeout: 30s
+      retries: 3
 
   grafana:
     image: grafana/grafana:latest
+    container_name: grafana
+    hostname: grafana
     volumes:
       - ./grafana:/var/lib/grafana
     user: root
@@ -41,27 +41,68 @@
       - GF_SMTP_HOST=smtp4dev:25
     ports:
       - 3000:3000
+    networks:
+      - backbone1
 
   prometheus:
-    image: prom/prometheus:latest
+    image: prom/prometheus:v2.50.1
+    container_name: prometheus
+    hostname: prometheus
     restart: unless-stopped
     volumes:
       - type: bind
        source: ./prometheus.yml
         target: /etc/prometheus/prometheus.yml
+      - type: bind
+        source: ./rabbitmq-cluster-alert.rules.yml
+        target: /etc/prometheus/rabbitmq-cluster-alert.rules.yml
+      - type: bind
+        source: ./prometheus-self-monitoring-alert.rules.yml
+        target: /etc/prometheus/prometheus-self-monitoring-alert.rules.yml
+      - type: bind
+        source: ./rabbitmq-architecture-alert.rules.yml
+        target: /etc/prometheus/rabbitmq-architecture-alert.rules.yml
     user: root
     command:
-      - '--config.file=/etc/prometheus/prometheus.yml'
-      - '--storage.tsdb.path=/prometheus'
-      - '--web.console.libraries=/etc/prometheus/console_libraries'
-      - '--web.console.templates=/etc/prometheus/consoles'
-      - '--web.enable-lifecycle'
+      - "--config.file=/etc/prometheus/prometheus.yml"
+      - "--storage.tsdb.path=/prometheus"
+      - "--web.console.libraries=/etc/prometheus/console_libraries"
+      - "--web.console.templates=/etc/prometheus/consoles"
+      - "--web.enable-lifecycle"
     ports:
       - 9090:9090
-
+    networks:
+      - backbone1
+
+  alert-manager:
+    image: prom/alertmanager:v0.27.0
+    container_name: alert-manager
+    hostname: alert-manager
+    volumes:
+      - type: bind
+        source: ./alert-manager.yml.tpl
+        target: /etc/alertmanager/alert-manager.yml.tpl
+    ports:
+      - 9093:9093
+    #NOTE: you need to set the env variable SLACK_API_URL when starting docker-compose (e.g. with direnv)
+    entrypoint: >
+      sh -c "
+      sed 's|env.SLACK_API_URL|$SLACK_API_URL|' /etc/alertmanager/alert-manager.yml.tpl > /etc/alertmanager/alert-manager.yml
+      && /bin/alertmanager --config.file=/etc/alertmanager/alert-manager.yml
+      "
+    networks:
+      - backbone1
+
   smtp4dev:
+    container_name: smtp4dev
+    hostname: smtp4dev
     image: rnwood/smtp4dev:v3
     restart: unless-stopped
     ports:
       - 2525:2525
       - 1080:80
+    networks:
+      - backbone1
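+
+# usage (assumes SLACK_API_URL is exported, e.g. via direnv):
+#   SLACK_API_URL=https://hooks.slack.com/services/... docker compose up -d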
diff --git a/docker-compose/prometheus-self-monitoring-alert.rules.yml b/docker-compose/prometheus-self-monitoring-alert.rules.yml
new file mode 100644
index 0000000..e577849
--- /dev/null
+++ b/docker-compose/prometheus-self-monitoring-alert.rules.yml
@@ -0,0 +1,278 @@
+groups:
+- name: PrometheusSelfMonitoring
+  rules:
+    - alert: PrometheusJobMissing
+      expr: 'absent(up{job="prometheus"})'
+      for: 0m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus job missing (instance {{ $labels.instance }})
+        description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - alert: PrometheusTargetMissing
+      expr: 'up == 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus target missing (instance {{ $labels.instance }})
+        description: "A Prometheus target has disappeared. An exporter might have crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - alert: PrometheusAllTargetsMissing
+      expr: 'sum by (job) (up) == 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus all targets missing (instance {{ $labels.instance }})
+        description: "A Prometheus job no longer has any living targets.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - alert: PrometheusTargetMissingWithWarmupTime
+      expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
+        description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - alert: PrometheusConfigurationReloadFailure
+      expr: 'prometheus_config_last_reload_successful != 1'
+      for: 0m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
+        description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - alert: PrometheusTooManyRestarts
+      expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
+      for: 0m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus too many restarts (instance {{ $labels.instance }})
+        description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - alert: PrometheusAlertmanagerJobMissing
+      expr: 'absent(up{job="alertmanager"})'
+      for: 0m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
+        description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+    - alert: PrometheusAlertmanagerConfigurationReloadFailure
+      expr: 'alertmanager_config_last_reload_successful != 1'
+      for: 0m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
+        description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusAlertmanagerConfigNotSynced
+      expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
+      for: 0m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
+        description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # - alert: PrometheusAlertmanagerE2eDeadManSwitch
+    #   expr: 'vector(1)'
+    #   for: 0m
+    #   labels:
+    #     severity: critical
+    #     group: "PrometheusSelfMonitoring"
+    #   annotations:
+    #     summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
+    #     description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusNotConnectedToAlertmanager
+      expr: 'prometheus_notifications_alertmanagers_discovered < 1'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
+        description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusRuleEvaluationFailures
+      expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTemplateTextExpansionFailures
+      expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusRuleEvaluationSlow
+      expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
+      for: 5m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
+        description: "Prometheus rule evaluation took more time than the scheduled interval. This may indicate slow storage backend access or an overly complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusNotificationsBacklog
+      expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
+      for: 0m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus notifications backlog (instance {{ $labels.instance }})
+        description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusAlertmanagerNotificationFailing
+      expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
+        description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTargetEmpty
+      expr: 'prometheus_sd_discovered_targets == 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus target empty (instance {{ $labels.instance }})
+        description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTargetScrapingSlow
+      expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
+      for: 5m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus target scraping slow (instance {{ $labels.instance }})
+        description: "Prometheus is scraping exporters slowly: scrape durations exceed the configured interval. The Prometheus server may be under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusLargeScrape
+      expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
+      for: 5m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus large scrape (instance {{ $labels.instance }})
+        description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTargetScrapeDuplicate
+      expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
+      for: 0m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
+        description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbCheckpointCreationFailures
+      expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbCheckpointDeletionFailures
+      expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbCompactionsFailed
+      expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbHeadTruncationsFailed
+      expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbReloadFailures
+      expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbWalCorruptions
+      expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTsdbWalTruncationsFailed
+      expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
+      for: 0m
+      labels:
+        severity: critical
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
+        description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: PrometheusTimeseriesCardinality
+      expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
+      for: 0m
+      labels:
+        severity: warning
+        group: "PrometheusSelfMonitoring"
+      annotations:
+        summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
+        description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
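+
+# validate with promtool (ships with Prometheus):
+#   promtool check rules prometheus-self-monitoring-alert.rules.yml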
diff --git a/docker-compose/prometheus.yml b/docker-compose/prometheus.yml
index 3b13e40..1c33d81 100644
--- a/docker-compose/prometheus.yml
+++ b/docker-compose/prometheus.yml
@@ -1,5 +1,6 @@
 global:
   scrape_interval: 15s
+  scrape_timeout: 13s
   evaluation_interval: 15s
 
 scrape_configs:
@@ -8,10 +9,15 @@
     static_configs:
       - targets: ["localhost:9090"]
 
+  - job_name: "alertmanager"
+    scrape_interval: 1m
+    static_configs:
+      - targets: ["alert-manager:9093"]
+
   - job_name: "rabbitmq"
     static_configs:
       - targets: ["rabbitmq:15692"]
-  
+
   - job_name: "rabbitmq-detailed"
     metrics_path: "/metrics/detailed"
     params:
@@ -26,3 +32,18 @@
     - source_labels: [__name__]
       regex: "rabbitmq_identity_info"
       action: drop
+
+alerting:
+  alertmanagers:
+    - scheme: "http"
+      static_configs:
+        - targets: ["alert-manager:9093"]
+
+
+rule_files:
+  - "/etc/prometheus/rabbitmq-cluster-alert.rules.yml"
+  - "/etc/prometheus/prometheus-self-monitoring-alert.rules.yml"
+  - "/etc/prometheus/rabbitmq-architecture-alert.rules.yml"
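+
+# validate the full config from inside the container (promtool ships in the prom/prometheus image):
+#   docker compose exec prometheus promtool check config /etc/prometheus/prometheus.yml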
"/etc/prometheus/prometheus-self-monitoring-alert.rules.yml" + - "/etc/prometheus/rabbitmq-architecture-alert.rules.yml" diff --git a/docker-compose/rabbitmq-architecture-alert.rules.yml b/docker-compose/rabbitmq-architecture-alert.rules.yml new file mode 100644 index 0000000..2dbd097 --- /dev/null +++ b/docker-compose/rabbitmq-architecture-alert.rules.yml @@ -0,0 +1,21 @@ +groups: +- name: RabbitMQ-Architecture + rules: + - alert: DeadLetter (dlq) not empty + expr: sum(rabbitmq_detailed_queue_messages{queue="dlq"}) > 0 + for: 0m + labels: + severity: warning + group: "RabbitMQ-Architecture" + annotations: + summary: DeadLetter queue is not empty + description: "The DeadLetter queue (dlq) has some messages" + - alert: Other queue not empty + expr: sum(rabbitmq_detailed_queue_messages{queue="other"}) > 0 + for: 0m + labels: + severity: warning + group: "RabbitMQ-Architecture" + annotations: + summary: Other queue is not empty + description: "The other queue has some messages" \ No newline at end of file diff --git a/docker-compose/rabbitmq-cluster-alert.rules.yml b/docker-compose/rabbitmq-cluster-alert.rules.yml new file mode 100644 index 0000000..cc9d169 --- /dev/null +++ b/docker-compose/rabbitmq-cluster-alert.rules.yml @@ -0,0 +1,96 @@ +groups: + +## From Awesome Prometheus - https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/rabbitmq/rabbitmq-exporter.yml +- name: RabbitMQ + + rules: + + - alert: RabbitmqNodeDown + expr: 'sum(rabbitmq_build_info) < 1' + for: 0m + labels: + severity: critical + group: "RabbitMQ" + annotations: + summary: RabbitMQ node down (instance {{ $labels.instance }}) + description: "Less than 1 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqNodeNotDistributed + expr: 'erlang_vm_dist_node_state < 3' + for: 0m + labels: + severity: critical + group: "RabbitMQ" + annotations: + summary: RabbitMQ node not distributed (instance {{ $labels.instance }}) + description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqInstancesDifferentVersions + expr: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1' + for: 1h + labels: + severity: warning + group: "RabbitMQ" + annotations: + summary: RabbitMQ instances different versions (instance {{ $labels.instance }}) + description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqMemoryHigh + expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90' + for: 2m + labels: + severity: warning + group: "RabbitMQ" + annotations: + summary: RabbitMQ memory high (instance {{ $labels.instance }}) + description: "A node use more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqFileDescriptorsUsage + expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90' + for: 2m + labels: + severity: warning + group: "RabbitMQ" + annotations: + summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }}) + description: "A node use more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqTooManyUnackMessages + expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' + for: 1m + labels: + severity: warning + group: "RabbitMQ" + annotations: + summary: RabbitMQ too many unack messages (instance {{ $labels.instance }}) + description: "Too 
diff --git a/docker-compose/rabbitmq-cluster-alert.rules.yml b/docker-compose/rabbitmq-cluster-alert.rules.yml
new file mode 100644
index 0000000..cc9d169
--- /dev/null
+++ b/docker-compose/rabbitmq-cluster-alert.rules.yml
@@ -0,0 +1,96 @@
+groups:
+
+## From Awesome Prometheus - https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/rabbitmq/rabbitmq-exporter.yml
+- name: RabbitMQ
+
+  rules:
+
+    - alert: RabbitmqNodeDown
+      expr: 'sum(rabbitmq_build_info) < 1'
+      for: 0m
+      labels:
+        severity: critical
+        group: "RabbitMQ"
+      annotations:
+        summary: RabbitMQ node down (instance {{ $labels.instance }})
+        description: "Fewer than 1 node running in the RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RabbitmqNodeNotDistributed
+      expr: 'erlang_vm_dist_node_state < 3'
+      for: 0m
+      labels:
+        severity: critical
+        group: "RabbitMQ"
+      annotations:
+        summary: RabbitMQ node not distributed (instance {{ $labels.instance }})
+        description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RabbitmqInstancesDifferentVersions
+      expr: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1'
+      for: 1h
+      labels:
+        severity: warning
+        group: "RabbitMQ"
+      annotations:
+        summary: RabbitMQ instances different versions (instance {{ $labels.instance }})
+        description: "Running different versions of RabbitMQ in the same cluster can lead to failures.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RabbitmqMemoryHigh
+      expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90'
+      for: 2m
+      labels:
+        severity: warning
+        group: "RabbitMQ"
+      annotations:
+        summary: RabbitMQ memory high (instance {{ $labels.instance }})
+        description: "A node uses more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RabbitmqFileDescriptorsUsage
+      expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90'
+      for: 2m
+      labels:
+        severity: warning
+        group: "RabbitMQ"
+      annotations:
+        summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }})
+        description: "A node uses more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RabbitmqTooManyUnackMessages
+      expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000'
+      for: 1m
+      labels:
+        severity: warning
+        group: "RabbitMQ"
+      annotations:
+        summary: RabbitMQ too many unack messages (instance {{ $labels.instance }})
+        description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RabbitmqTooManyConnections
+      expr: 'rabbitmq_connections > 1000'
+      for: 2m
+      labels:
+        severity: warning
+        group: "RabbitMQ"
+      annotations:
+        summary: RabbitMQ too many connections (instance {{ $labels.instance }})
+        description: "The total number of connections on a node is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    # - alert: RabbitmqNoQueueConsumer
+    #   expr: 'rabbitmq_queue_consumers < 1'
+    #   for: 1m
+    #   labels:
+    #     severity: warning
+    #     group: "RabbitMQ"
+    #   annotations:
+    #     summary: RabbitMQ no queue consumer (instance {{ $labels.instance }})
+    #     description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+    - alert: RabbitmqUnroutableMessages
+      expr: 'increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0'
+      for: 2m
+      labels:
+        severity: warning
+        group: "RabbitMQ"
+      annotations:
+        summary: RabbitMQ unroutable messages (instance {{ $labels.instance }})
+        description: "A queue has unroutable messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
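diff --git a/docker-compose/rabbitmq-cluster-alert.rules.test.yml b/docker-compose/rabbitmq-cluster-alert.rules.test.yml
new file mode 100644
--- /dev/null
+++ b/docker-compose/rabbitmq-cluster-alert.rules.test.yml
@@ -0,0 +1,19 @@
+# Hypothetical promtool unit-test sketch for the RabbitmqNodeDown rule above;
+# not part of the original change. Run with:
+#   promtool test rules rabbitmq-cluster-alert.rules.test.yml
+rule_files:
+  - rabbitmq-cluster-alert.rules.yml
+evaluation_interval: 1m
+tests:
+  - interval: 1m
+    input_series:
+      # a node reporting no build info => sum(rabbitmq_build_info) < 1
+      - series: 'rabbitmq_build_info{instance="rabbitmq:15692"}'
+        values: '0x10'
+    alert_rule_test:
+      - eval_time: 5m
+        alertname: RabbitmqNodeDown
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              group: RabbitMQ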