From d585ab558f0ba700375afa4f53e4badcf4cee965 Mon Sep 17 00:00:00 2001 From: Igor Cappello Date: Wed, 28 Feb 2024 16:27:16 +0100 Subject: [PATCH 1/8] add: basic rabbitmq installation scripts for Ubuntu --- .../install-scripts/cloudsmith_repos.sh | 31 ++++++++++ .../install-scripts/set_erlang_cookie.sh | 28 +++++++++ cluster-install/install-scripts/setup.sh | 58 +++++++++++++++++++ .../install-scripts/setup_cluster.sh | 31 ++++++++++ 4 files changed, 148 insertions(+) create mode 100644 cluster-install/install-scripts/cloudsmith_repos.sh create mode 100755 cluster-install/install-scripts/set_erlang_cookie.sh create mode 100644 cluster-install/install-scripts/setup.sh create mode 100644 cluster-install/install-scripts/setup_cluster.sh diff --git a/cluster-install/install-scripts/cloudsmith_repos.sh b/cluster-install/install-scripts/cloudsmith_repos.sh new file mode 100644 index 0000000..e6b346f --- /dev/null +++ b/cluster-install/install-scripts/cloudsmith_repos.sh @@ -0,0 +1,31 @@ +#!/bin/sh +distribution="jammy" + +## Team RabbitMQ's main signing key +curl -1sLf "https://keys.openpgp.org/vks/v1/by-fingerprint/0A9AF2115F4687BD29803A206B73A36E6026DFCA" | gpg --dearmor | tee /usr/share/keyrings/com.rabbitmq.team.gpg > /dev/null +## Community mirror of Cloudsmith: modern Erlang repository +curl -1sLf https://github.com/rabbitmq/signing-keys/releases/download/3.0/cloudsmith.rabbitmq-erlang.E495BB49CC4BBE5B.key | gpg --dearmor | tee /usr/share/keyrings/rabbitmq.E495BB49CC4BBE5B.gpg > /dev/null +## Community mirror of Cloudsmith: RabbitMQ repository +curl -1sLf https://github.com/rabbitmq/signing-keys/releases/download/3.0/cloudsmith.rabbitmq-server.9F4587F226208342.key | gpg --dearmor | tee /usr/share/keyrings/rabbitmq.9F4587F226208342.gpg > /dev/null + +## Add apt repositories maintained by Team RabbitMQ +tee /etc/apt/sources.list.d/rabbitmq.list < sudo ./set_erlang_cookie.sh "the-erlang-cookie" + +SETUP_ERLANG_COOKIE=$1 + +if [ ! 
-z $1 ] +then + echo "--------------------------------------------------------" + echo "setting rabbitmq erlang cookie to ${SETUP_ERLANG_COOKIE}" + echo "--------------------------------------------------------" + #overwrite the erlang cookie + systemctl stop rabbitmq-server + chmod 666 /var/lib/rabbitmq/.erlang.cookie + echo $SETUP_ERLANG_COOKIE > /var/lib/rabbitmq/.erlang.cookie + chmod 400 /var/lib/rabbitmq/.erlang.cookie + systemctl start rabbitmq-server +else + echo "-------------------------------------" + echo "Skipping rabbitmq erlang cookie setup" + echo "-------------------------------------" +fi + + + + + diff --git a/cluster-install/install-scripts/setup.sh b/cluster-install/install-scripts/setup.sh new file mode 100644 index 0000000..cba9710 --- /dev/null +++ b/cluster-install/install-scripts/setup.sh @@ -0,0 +1,58 @@ +#!/bin/bash +#usage +# $> sudo setup.sh ["the-erlang-cookie"] +# +echo "---------------------" +echo "Installing base tools" +echo "---------------------" +apt-get update -y +apt-get install curl gnupg apt-transport-https net-tools -y + +echo "----------------------------" +echo "Adding rabbitmq repositories" +echo "----------------------------" +./cloudsmith_repos.sh + +echo "-------------------------" +echo "Indexing new repositories" +echo "-------------------------" +## Update package indices +apt-get update -y + +echo "-----------------" +echo "Installing erlang" +echo "-----------------" +## Install Erlang packages +apt-get install -y erlang-base \ + erlang-asn1 erlang-crypto erlang-eldap erlang-ftp erlang-inets \ + erlang-mnesia erlang-os-mon erlang-parsetools erlang-public-key \ + erlang-runtime-tools erlang-snmp erlang-ssl \ + erlang-syntax-tools erlang-tftp erlang-tools erlang-xmerl + +echo "-------------------" +echo "Installing rabbitmq" +echo "-------------------" +## Install rabbitmq-server and its dependencies +apt-get install rabbitmq-server -y --fix-missing + +echo "---------------------------" +echo "Running rabbitmq at 
startup" +echo "---------------------------" +#start rabbitmq and make it run at startup +systemctl enable rabbitmq-server + +echo "--------------------------" +echo "Enabling management plugin" +echo "--------------------------" +#enable management plugin +rabbitmq-plugins enable rabbitmq_management + +echo "-------------------" +echo "Adding default user" +echo "-------------------" +rabbitmqctl add_user rabbit rabbit +rabbitmqctl set_user_tags rabbit administrator +rabbitmqctl set_permissions -p / rabbit ".*" ".*" ".*" + +#overwrite the erlang cookie +./set_erlang_cookie.sh $1 \ No newline at end of file diff --git a/cluster-install/install-scripts/setup_cluster.sh b/cluster-install/install-scripts/setup_cluster.sh new file mode 100644 index 0000000..3b40d46 --- /dev/null +++ b/cluster-install/install-scripts/setup_cluster.sh @@ -0,0 +1,31 @@ +#!/bin/bash +#First setup the node with setup.sh, then use this script to join a cluster. +#usage +# $> sudo setup_cluster.sh "rabbit@the-cluster-node" "the-erlang-cookie" +# +SETUP_CLUSTER_HOST=$1 +SETUP_ERLANG_COOKIE=$2 + +#overwrite the erlang cookie +./set_erlang_cookie.sh $SETUP_ERLANG_COOKIE + + +#prepare the local node to join the cluster +echo "--------------------------------------------" +echo "Preparing the local node to join the cluster" +echo "--------------------------------------------" +rabbitmqctl stop_app +rabbitmqctl reset + + +#join operation request +echo "----------------------------------------------------------" +echo "Issuing cluster join request to node ${SETUP_CLUSTER_HOST}" +echo "----------------------------------------------------------" +rabbitmqctl join_cluster $SETUP_CLUSTER_HOST + +#start node apps +echo "------------------------------" +echo "Starting internal rabbitmq app" +echo "------------------------------" +rabbitmqctl start_app \ No newline at end of file From 411ae1c7ebab65eeebd09cf4c9e411e0e2dbd525 Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 1 Mar 2024 16:58:55 +0100 Subject: 
[PATCH 2/8] chore: organize install scripts, minor changes and init readme/todo --- cluster-install/README.md | 6 ++++++ cluster-install/TODO.md | 8 ++++++++ .../install-scripts/00_create_machine.sh | 19 +++++++++++++++++++ .../install-scripts/{setup.sh => 01_setup.sh} | 5 +++-- .../{setup_cluster.sh => 02_setup_cluster.sh} | 2 +- .../install-scripts/set_erlang_cookie.sh | 3 ++- 6 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 cluster-install/README.md create mode 100644 cluster-install/TODO.md create mode 100755 cluster-install/install-scripts/00_create_machine.sh rename cluster-install/install-scripts/{setup.sh => 01_setup.sh} (94%) rename cluster-install/install-scripts/{setup_cluster.sh => 02_setup_cluster.sh} (91%) diff --git a/cluster-install/README.md b/cluster-install/README.md new file mode 100644 index 0000000..bc7dd20 --- /dev/null +++ b/cluster-install/README.md @@ -0,0 +1,6 @@ +# Install a new node +1. Install the `gcloud` +2. +Run `00_create_machine.sh` from your local environment + +`gcloud compute scp ./cluster-install/install-scripts/*.sh rabbitmq-1:~/` \ No newline at end of file diff --git a/cluster-install/TODO.md b/cluster-install/TODO.md new file mode 100644 index 0000000..6dc52cb --- /dev/null +++ b/cluster-install/TODO.md @@ -0,0 +1,8 @@ +# TODO +1. Check `sudo` +2. Exit at first error +3. First script returns the `ERLANG_COOKIE` +4. 
Set feature flag `detailed_queues_endpoint` as enabled (~ `rabbitmqctl enable_feature_flag detailed_queues_endpoint` ) + + +COOKIE = YVIRLAAERJPTQKTYMACZ \ No newline at end of file diff --git a/cluster-install/install-scripts/00_create_machine.sh b/cluster-install/install-scripts/00_create_machine.sh new file mode 100755 index 0000000..95dbdf8 --- /dev/null +++ b/cluster-install/install-scripts/00_create_machine.sh @@ -0,0 +1,19 @@ +## to be run on local machine +gcloud compute instances create $1 \ + --project=rabbit-test-406509 \ + --zone=europe-west1-b \ + --machine-type=e2-micro \ + --network-interface=network-tier=PREMIUM,stack-type=IPV4_ONLY,subnet=default \ + --maintenance-policy=MIGRATE \ + --provisioning-model=STANDARD \ + --service-account=99473582712-compute@developer.gserviceaccount.com \ + --scopes=https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/logging.write,https://www.googleapis.com/auth/monitoring.write,https://www.googleapis.com/auth/servicecontrol,https://www.googleapis.com/auth/service.management.readonly,https://www.googleapis.com/auth/trace.append \ + --tags=http-server,https-server \ + --create-disk=auto-delete=yes,boot=yes,device-name=$1,image=projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20240228,mode=rw,size=10,type=projects/rabbit-test-406509/zones/europe-west1-b/diskTypes/pd-balanced \ + --no-shielded-secure-boot \ + --shielded-vtpm \ + --shielded-integrity-monitoring \ + --labels=goog-ec-src=vm_add-gcloud \ + --reservation-affinity=any + +# gcloud compute scp ./cluster-install/install-scripts/*.sh rabbitmq-1:~/ \ No newline at end of file diff --git a/cluster-install/install-scripts/setup.sh b/cluster-install/install-scripts/01_setup.sh similarity index 94% rename from cluster-install/install-scripts/setup.sh rename to cluster-install/install-scripts/01_setup.sh index cba9710..6e23acf 100644 --- a/cluster-install/install-scripts/setup.sh +++ b/cluster-install/install-scripts/01_setup.sh 
@@ -1,6 +1,6 @@ #!/bin/bash #usage -# $> sudo setup.sh ["the-erlang-cookie"] +# $> sudo 01_setup.sh ["the-erlang-cookie"] # echo "---------------------" echo "Installing base tools" @@ -36,10 +36,11 @@ echo "-------------------" apt-get install rabbitmq-server -y --fix-missing echo "---------------------------" -echo "Running rabbitmq at startup" +echo "Enabling rabbitmq at startup" echo "---------------------------" #start rabbitmq and make it run at startup systemctl enable rabbitmq-server +sleep 30 echo "--------------------------" echo "Enabling management plugin" diff --git a/cluster-install/install-scripts/setup_cluster.sh b/cluster-install/install-scripts/02_setup_cluster.sh similarity index 91% rename from cluster-install/install-scripts/setup_cluster.sh rename to cluster-install/install-scripts/02_setup_cluster.sh index 3b40d46..06fd5e1 100644 --- a/cluster-install/install-scripts/setup_cluster.sh +++ b/cluster-install/install-scripts/02_setup_cluster.sh @@ -1,7 +1,7 @@ #!/bin/bash #First setup the node with setup.sh, then use this script to join a cluster. 
#usage -# $> sudo setup_cluster.sh "rabbit@the-cluster-node" "the-erlang-cookie" +# $> sudo 02_setup_cluster.sh "rabbit@the-cluster-node" "the-erlang-cookie" # SETUP_CLUSTER_HOST=$1 SETUP_ERLANG_COOKIE=$2 diff --git a/cluster-install/install-scripts/set_erlang_cookie.sh b/cluster-install/install-scripts/set_erlang_cookie.sh index a060b4e..c4d3ebb 100755 --- a/cluster-install/install-scripts/set_erlang_cookie.sh +++ b/cluster-install/install-scripts/set_erlang_cookie.sh @@ -13,9 +13,10 @@ then #overwrite the erlang cookie systemctl stop rabbitmq-server chmod 666 /var/lib/rabbitmq/.erlang.cookie - echo $SETUP_ERLANG_COOKIE > /var/lib/rabbitmq/.erlang.cookie + echo -n $SETUP_ERLANG_COOKIE > /var/lib/rabbitmq/.erlang.cookie chmod 400 /var/lib/rabbitmq/.erlang.cookie systemctl start rabbitmq-server + sleep 30 else echo "-------------------------------------" echo "Skipping rabbitmq erlang cookie setup" From 5f0cff5d1acca340a0e445463a0598f2c6b81ea6 Mon Sep 17 00:00:00 2001 From: Luca Date: Fri, 1 Mar 2024 19:38:03 +0100 Subject: [PATCH 3/8] fix: add prometheus plugin to node setup --- cluster-install/install-scripts/01_setup.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cluster-install/install-scripts/01_setup.sh b/cluster-install/install-scripts/01_setup.sh index 6e23acf..5eb8c95 100644 --- a/cluster-install/install-scripts/01_setup.sh +++ b/cluster-install/install-scripts/01_setup.sh @@ -47,6 +47,8 @@ echo "Enabling management plugin" echo "--------------------------" #enable management plugin rabbitmq-plugins enable rabbitmq_management +#metrics for prometheus +rabbitmq-plugins enable rabbitmq_prometheus echo "-------------------" echo "Adding default user" From b7661b8e40e80c03ce14dcf092b37d66de693a6c Mon Sep 17 00:00:00 2001 From: Luca Date: Tue, 5 Mar 2024 22:21:30 +0100 Subject: [PATCH 4/8] feat: add optional argument for rabbitmq 3 minor version on install script --- cluster-install/install-scripts/01_setup.sh | 26 +++++++++++++------ 
.../install-scripts/02_setup_cluster.sh | 0 .../install-scripts/cloudsmith_repos.sh | 0 3 files changed, 18 insertions(+), 8 deletions(-) mode change 100644 => 100755 cluster-install/install-scripts/01_setup.sh mode change 100644 => 100755 cluster-install/install-scripts/02_setup_cluster.sh mode change 100644 => 100755 cluster-install/install-scripts/cloudsmith_repos.sh diff --git a/cluster-install/install-scripts/01_setup.sh b/cluster-install/install-scripts/01_setup.sh old mode 100644 new mode 100755 index 5eb8c95..bc2641c --- a/cluster-install/install-scripts/01_setup.sh +++ b/cluster-install/install-scripts/01_setup.sh @@ -1,6 +1,6 @@ #!/bin/bash #usage -# $> sudo 01_setup.sh ["the-erlang-cookie"] +# $> sudo 01_setup.sh ["rabbitmq-server 3 minor version"] # echo "---------------------" echo "Installing base tools" @@ -29,11 +29,24 @@ apt-get install -y erlang-base \ erlang-runtime-tools erlang-snmp erlang-ssl \ erlang-syntax-tools erlang-tftp erlang-tools erlang-xmerl -echo "-------------------" -echo "Installing rabbitmq" -echo "-------------------" + ## Install rabbitmq-server and its dependencies -apt-get install rabbitmq-server -y --fix-missing +if [ -z "$1" ]; then + echo "-------------------" + echo "Installing latest version of rabbitmq" + echo "-------------------" + sudo apt-get install rabbitmq-server -y --fix-missing +else + latest_patch=$(apt list -a rabbitmq-server 2>/dev/null | grep -oP "3\.$1\.\K(\d{1,2}-\d{1,2})" | sort -V | tail -n 1) + if [ -z "$latest_patch" ]; then + echo "...could not find any version of minor $1, aborting" + exit 1 + fi + echo "-------------------" + echo "Installing version 3.$1.$latest_patch of rabbitmq" + echo "-------------------" + sudo apt-get install "rabbitmq-server=3.$1.$latest_patch" -y --fix-missing +fi echo "---------------------------" echo "Enabling rabbitmq at startup" @@ -56,6 +69,3 @@ echo "-------------------" rabbitmqctl add_user rabbit rabbit rabbitmqctl set_user_tags rabbit administrator rabbitmqctl 
set_permissions -p / rabbit ".*" ".*" ".*" - -#overwrite the erlang cookie -./set_erlang_cookie.sh $1 \ No newline at end of file diff --git a/cluster-install/install-scripts/02_setup_cluster.sh b/cluster-install/install-scripts/02_setup_cluster.sh old mode 100644 new mode 100755 diff --git a/cluster-install/install-scripts/cloudsmith_repos.sh b/cluster-install/install-scripts/cloudsmith_repos.sh old mode 100644 new mode 100755 From fa86936e18835c1dc8af89d062403e7b6c59a82c Mon Sep 17 00:00:00 2001 From: Igor Cappello Date: Thu, 7 Mar 2024 17:36:57 +0100 Subject: [PATCH 5/8] chore: add minor updates --- cluster-install/TODO.md | 5 +++-- cluster-install/install-scripts/00_create_machine.sh | 2 +- cluster-install/install-scripts/01_setup.sh | 5 +++++ 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cluster-install/TODO.md b/cluster-install/TODO.md index 6dc52cb..e80120e 100644 --- a/cluster-install/TODO.md +++ b/cluster-install/TODO.md @@ -1,8 +1,9 @@ # TODO + 1. Check `sudo` 2. Exit at first error 3. First script returns the `ERLANG_COOKIE` 4. Set feature flag `detailed_queues_endpoint` as enabled (~ `rabbitmqctl enable_feature_flag detailed_queues_endpoint` ) +5. 
track firewall rule creation - -COOKIE = YVIRLAAERJPTQKTYMACZ \ No newline at end of file +COOKIE = YVIRLAAERJPTQKTYMACZ diff --git a/cluster-install/install-scripts/00_create_machine.sh b/cluster-install/install-scripts/00_create_machine.sh index 95dbdf8..1e0d576 100755 --- a/cluster-install/install-scripts/00_create_machine.sh +++ b/cluster-install/install-scripts/00_create_machine.sh @@ -8,7 +8,7 @@ gcloud compute instances create $1 \ --provisioning-model=STANDARD \ --service-account=99473582712-compute@developer.gserviceaccount.com \ --scopes=https://www.googleapis.com/auth/devstorage.read_only,https://www.googleapis.com/auth/logging.write,https://www.googleapis.com/auth/monitoring.write,https://www.googleapis.com/auth/servicecontrol,https://www.googleapis.com/auth/service.management.readonly,https://www.googleapis.com/auth/trace.append \ - --tags=http-server,https-server \ + --tags=allow-tcp-15672,http-server,https-server \ --create-disk=auto-delete=yes,boot=yes,device-name=$1,image=projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20240228,mode=rw,size=10,type=projects/rabbit-test-406509/zones/europe-west1-b/diskTypes/pd-balanced \ --no-shielded-secure-boot \ --shielded-vtpm \ diff --git a/cluster-install/install-scripts/01_setup.sh b/cluster-install/install-scripts/01_setup.sh index bc2641c..fca29ce 100755 --- a/cluster-install/install-scripts/01_setup.sh +++ b/cluster-install/install-scripts/01_setup.sh @@ -63,6 +63,11 @@ rabbitmq-plugins enable rabbitmq_management #metrics for prometheus rabbitmq-plugins enable rabbitmq_prometheus +echo "----------------------------------------------" +echo "Enabling feature flag detailed_queues_endpoint" +echo "----------------------------------------------" +rabbitmqctl enable_feature_flag detailed_queues_endpoint + echo "-------------------" echo "Adding default user" echo "-------------------" From 1ffbdf24eac4ab87c800cc87617d63435bfd2351 Mon Sep 17 00:00:00 2001 From: Luca Date: Tue, 12 Mar 2024 15:16:58 
+0100 Subject: [PATCH 6/8] feat: add prometheus rules to docker compose env --- docker-compose/alert.rules.yml | 87 +++++++++++++++++++++++++++++++ docker-compose/docker-compose.yml | 7 ++- docker-compose/prometheus.yml | 3 ++ 3 files changed, 95 insertions(+), 2 deletions(-) create mode 100644 docker-compose/alert.rules.yml diff --git a/docker-compose/alert.rules.yml b/docker-compose/alert.rules.yml new file mode 100644 index 0000000..3f2f4c1 --- /dev/null +++ b/docker-compose/alert.rules.yml @@ -0,0 +1,87 @@ +groups: + +## From Awesome Prometheus - https://raw.githubusercontent.com/samber/awesome-prometheus-alerts/master/dist/rules/rabbitmq/rabbitmq-exporter.yml +- name: RabbitMQ + + rules: + + - alert: RabbitmqNodeDown + expr: 'sum(rabbitmq_build_info) < 3' + for: 0m + labels: + severity: critical + annotations: + summary: RabbitMQ node down (instance {{ $labels.instance }}) + description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqNodeNotDistributed + expr: 'erlang_vm_dist_node_state < 3' + for: 0m + labels: + severity: critical + annotations: + summary: RabbitMQ node not distributed (instance {{ $labels.instance }}) + description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqInstancesDifferentVersions + expr: 'count(count(rabbitmq_build_info) by (rabbitmq_version)) > 1' + for: 1h + labels: + severity: warning + annotations: + summary: RabbitMQ instances different versions (instance {{ $labels.instance }}) + description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqMemoryHigh + expr: 'rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90' + for: 2m + labels: + severity: warning + annotations: + summary: RabbitMQ memory high (instance {{ $labels.instance }}) + description: "A node use more 
than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqFileDescriptorsUsage + expr: 'rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90' + for: 2m + labels: + severity: warning + annotations: + summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }}) + description: "A node use more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqTooManyUnackMessages + expr: 'sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000' + for: 1m + labels: + severity: warning + annotations: + summary: RabbitMQ too many unack messages (instance {{ $labels.instance }}) + description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqTooManyConnections + expr: 'rabbitmq_connections > 1000' + for: 2m + labels: + severity: warning + annotations: + summary: RabbitMQ too many connections (instance {{ $labels.instance }}) + description: "The total connections of a node is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # - alert: RabbitmqNoQueueConsumer + # expr: 'rabbitmq_queue_consumers < 1' + # for: 1m + # labels: + # severity: warning + # annotations: + # summary: RabbitMQ no queue consumer (instance {{ $labels.instance }}) + # description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: RabbitmqUnroutableMessages + expr: 'increase(rabbitmq_channel_messages_unroutable_returned_total[1m]) > 0 or increase(rabbitmq_channel_messages_unroutable_dropped_total[1m]) > 0' + for: 2m + labels: + severity: warning + annotations: + summary: RabbitMQ unroutable messages (instance {{ $labels.instance }}) + description: "A queue has unroutable messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 3646339..05dc0d9 100644 --- a/docker-compose/docker-compose.yml +++ 
b/docker-compose/docker-compose.yml @@ -2,7 +2,7 @@ version: '3.9' services: rabbitmq: - image: rabbitmq:3.10-management + image: rabbitmq:3.12-management hostname: rabbitmq ports: - 15692:15692 @@ -17,7 +17,7 @@ services: RABBITMQ_DEFAULT_PASS: rabbit federated-rabbitmq: - image: rabbitmq:3.10-management + image: rabbitmq:3.12-management hostname: rabbitmq ports: - 25692:15692 @@ -49,6 +49,9 @@ services: - type: bind source: ./prometheus.yml target: /etc/prometheus/prometheus.yml + - type: bind + source: ./alert.rules.yml + target: /etc/prometheus/alert.rules.yml user: root command: - '--config.file=/etc/prometheus/prometheus.yml' diff --git a/docker-compose/prometheus.yml b/docker-compose/prometheus.yml index 3b13e40..9531f6a 100644 --- a/docker-compose/prometheus.yml +++ b/docker-compose/prometheus.yml @@ -26,3 +26,6 @@ scrape_configs: - source_labels: [__name__] regex: "rabbitmq_identity_info" action: drop + +rule_files: + - "/etc/prometheus/alert.rules.yml" From df4ae45ca7a716076f9bb1c638ad0ee1a3dbbf43 Mon Sep 17 00:00:00 2001 From: Igor Cappello Date: Tue, 19 Mar 2024 10:47:32 +0100 Subject: [PATCH 7/8] update: config alertmanager and prometheus to get email notifications about issues --- docker-compose/alert-manager.yml | 47 ++++ docker-compose/docker-compose.yml | 76 ++++-- ...prometheus-self-monitoring-alert.rules.yml | 257 ++++++++++++++++++ docker-compose/prometheus.yml | 14 +- .../rabbitmq-architecture-alert.rules.yml | 11 + ...s.yml => rabbitmq-cluster-alert.rules.yml} | 0 6 files changed, 378 insertions(+), 27 deletions(-) create mode 100644 docker-compose/alert-manager.yml create mode 100644 docker-compose/prometheus-self-monitoring-alert.rules.yml create mode 100644 docker-compose/rabbitmq-architecture-alert.rules.yml rename docker-compose/{alert.rules.yml => rabbitmq-cluster-alert.rules.yml} (100%) diff --git a/docker-compose/alert-manager.yml b/docker-compose/alert-manager.yml new file mode 100644 index 0000000..a94032b --- /dev/null +++ 
b/docker-compose/alert-manager.yml @@ -0,0 +1,47 @@ +global: + smtp_smarthost: 'smtp4dev:25' + smtp_from: 'alertmanager@example.org' + smtp_require_tls: false + +route: + routes: + - group_by: ["service"] + group_wait: 30s + group_interval: 5m + mute_time_intervals: ["weekday-off-evenings", "weekday-off-mornings", "weekends"] + receiver: team-mails + +time_intervals: + - name: "working-hours" + time_intervals: + - times: + - start_time: "09:00" + end_time: "18:00" + location: "Europe/Rome" + weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] + - name: "weekday-off-evenings" + time_intervals: + - times: + - start_time: "18:00" + end_time: "23:59" + location: "Europe/Rome" + weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] + - name: "weekday-off-mornings" + time_intervals: + - times: + - start_time: "00:00" + end_time: "08:59" + location: "Europe/Rome" + weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] + - name: "weekends" + time_intervals: + - times: + - start_time: "00:00" + end_time: "23:59" + location: "Europe/Rome" + weekdays: ['Saturday', 'Sunday'] + +receivers: + - name: "team-mails" + email_configs: + - to: "test-receiver@example.com" \ No newline at end of file diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 05dc0d9..5083d14 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -1,8 +1,14 @@ -version: '3.9' +version: "3.9" + +networks: + backbone1: + driver: bridge + name: backbone1 services: rabbitmq: image: rabbitmq:3.12-management + container_name: rabbitmq hostname: rabbitmq ports: - 15692:15692 @@ -15,24 +21,13 @@ services: environment: RABBITMQ_DEFAULT_USER: rabbit RABBITMQ_DEFAULT_PASS: rabbit - - federated-rabbitmq: - image: rabbitmq:3.12-management - hostname: rabbitmq - ports: - - 25692:15692 - - 25672:15672 - - 6672:5672 - volumes: - - type: bind - source: ./enabled_plugins - target: /etc/rabbitmq/enabled_plugins - 
environment: - RABBITMQ_DEFAULT_USER: rabbit - RABBITMQ_DEFAULT_PASS: rabbit + networks: + - backbone1 grafana: image: grafana/grafana:latest + container_name: grafana + hostname: grafana volumes: - ./grafana:/var/lib/grafana user: root @@ -41,30 +36,61 @@ services: - GF_SMTP_HOST=smtp4dev:25 ports: - 3000:3000 + networks: + - backbone1 prometheus: - image: prom/prometheus:latest + image: prom/prometheus:v2.50.1 + container_name: prometheus + hostname: prometheus restart: unless-stopped volumes: - type: bind source: ./prometheus.yml target: /etc/prometheus/prometheus.yml - type: bind - source: ./alert.rules.yml - target: /etc/prometheus/alert.rules.yml + source: ./rabbitmq-cluster-alert.rules.yml + target: /etc/prometheus/rabbitmq-cluster-alert.rules.yml + - type: bind + source: ./prometheus-self-monitoring-alert.rules.yml + target: /etc/prometheus/prometheus-self-monitoring-alert.rules.yml + - type: bind + source: ./rabbitmq-architecture-alert.rules.yml + target: /etc/prometheus/rabbitmq-architecture-alert.rules.yml user: root command: - - '--config.file=/etc/prometheus/prometheus.yml' - - '--storage.tsdb.path=/prometheus' - - '--web.console.libraries=/etc/prometheus/console_libraries' - - '--web.console.templates=/etc/prometheus/consoles' - - '--web.enable-lifecycle' + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--web.console.libraries=/etc/prometheus/console_libraries" + - "--web.console.templates=/etc/prometheus/consoles" + - "--web.enable-lifecycle" ports: - 9090:9090 - + networks: + - backbone1 + + alert-manager: + image: prom/alertmanager:v0.27.0 + container_name: alert-manager + hostname: alert-manager + volumes: + - type: bind + source: ./alert-manager.yml + target: /etc/alertmanager/alert-manager.yml + ports: + - 9093:9093 + command: + - "--config.file=/etc/alertmanager/alert-manager.yml" + networks: + - backbone1 + smtp4dev: + container_name: smtp4dev + hostname: smpt4dev image: rnwood/smtp4dev:v3 restart: 
unless-stopped ports: - 2525:2525 - 1080:80 + networks: + - backbone1 diff --git a/docker-compose/prometheus-self-monitoring-alert.rules.yml b/docker-compose/prometheus-self-monitoring-alert.rules.yml new file mode 100644 index 0000000..92644c0 --- /dev/null +++ b/docker-compose/prometheus-self-monitoring-alert.rules.yml @@ -0,0 +1,257 @@ +groups: + +- name: EmbeddedExporter + + rules: + + - alert: PrometheusJobMissing + expr: 'absent(up{job="prometheus"})' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissing + expr: 'up == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAllTargetsMissing + expr: 'sum by (job) (up) == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus all targets missing (instance {{ $labels.instance }}) + description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissingWithWarmupTime + expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing with warmup time (instance {{ $labels.instance }}) + description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusConfigurationReloadFailure + expr: 'prometheus_config_last_reload_successful != 1' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus configuration reload failure 
(instance {{ $labels.instance }}) + description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTooManyRestarts + expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus too many restarts (instance {{ $labels.instance }}) + description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # - alert: PrometheusAlertmanagerJobMissing + # expr: 'absent(up{job="alertmanager"})' + # for: 0m + # labels: + # severity: warning + # annotations: + # summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + # description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: 'alertmanager_config_last_reload_successful != 1' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigNotSynced + expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) + description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + # - alert: PrometheusAlertmanagerE2eDeadManSwitch + # expr: 'vector(1)' + # for: 0m + # labels: + # severity: critical + # annotations: + # summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) + # description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotConnectedToAlertmanager + expr: 'prometheus_notifications_alertmanagers_discovered < 1' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationFailures + expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTemplateTextExpansionFailures + expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationSlow + expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds' + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotificationsBacklog + expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetEmpty + expr: 'prometheus_sd_discovered_targets == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target empty (instance {{ $labels.instance }}) + description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapingSlow + expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05' + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. 
Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusLargeScrape + expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10' + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapeDuplicate + expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointCreationFailures + expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCompactionsFailed + expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB 
compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbHeadTruncationsFailed + expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbReloadFailures + expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalCorruptions + expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalTruncationsFailed + expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTimeseriesCardinality + expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus timeseries cardinality (instance {{ $labels.instance }}) + description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS 
= {{ $labels }}" \ No newline at end of file diff --git a/docker-compose/prometheus.yml b/docker-compose/prometheus.yml index 9531f6a..51a0985 100644 --- a/docker-compose/prometheus.yml +++ b/docker-compose/prometheus.yml @@ -1,5 +1,6 @@ global: scrape_interval: 15s + scrape_timeout: 13s evaluation_interval: 15s scrape_configs: @@ -26,6 +27,15 @@ scrape_configs: - source_labels: [__name__] regex: "rabbitmq_identity_info" action: drop - + +alerting: + alertmanagers: + - scheme: "http" + - static_configs: + - targets: ["alert-manager:9093"] + + rule_files: - - "/etc/prometheus/alert.rules.yml" + - "/etc/prometheus/rabbitmq-cluster-alert.rules.yml" + - "/etc/prometheus/prometheus-self-monitoring-alert.rules.yml" + - "/etc/prometheus/rabbitmq-architecture-alert.rules.yml" diff --git a/docker-compose/rabbitmq-architecture-alert.rules.yml b/docker-compose/rabbitmq-architecture-alert.rules.yml new file mode 100644 index 0000000..5401ed1 --- /dev/null +++ b/docker-compose/rabbitmq-architecture-alert.rules.yml @@ -0,0 +1,11 @@ +groups: +- name: RabbitMQ-Architecture + rules: + - alert: DeadLetter not empty + expr: sum(rabbitmq_detailed_queue_messages{queue=""}) > 0 + for: 0m + labels: + severity: warning + annotations: + summary: DeadLetter queue is not empty + description: "The DeadLetter queue has some messages" \ No newline at end of file diff --git a/docker-compose/alert.rules.yml b/docker-compose/rabbitmq-cluster-alert.rules.yml similarity index 100% rename from docker-compose/alert.rules.yml rename to docker-compose/rabbitmq-cluster-alert.rules.yml From 1dfd3b289c1634ff93630ca6dd0cb7916313429d Mon Sep 17 00:00:00 2001 From: Igor Cappello Date: Mon, 25 Mar 2024 11:42:12 +0100 Subject: [PATCH 8/8] update alertmanager config: slack integration, matchers When running docker-compose, the env var SLACK_API_URL is used to determine the slack integration endpoint --- docker-compose/alert-manager.yml | 47 ---------- docker-compose/alert-manager.yml.tpl | 94 +++++++++++++++++++ 
docker-compose/docker-compose.yml | 17 +++- ...prometheus-self-monitoring-alert.rules.yml | 56 +++++++---- docker-compose/prometheus.yml | 9 +- .../rabbitmq-architecture-alert.rules.yml | 16 +++- .../rabbitmq-cluster-alert.rules.yml | 13 ++- 7 files changed, 175 insertions(+), 77 deletions(-) delete mode 100644 docker-compose/alert-manager.yml create mode 100644 docker-compose/alert-manager.yml.tpl diff --git a/docker-compose/alert-manager.yml b/docker-compose/alert-manager.yml deleted file mode 100644 index a94032b..0000000 --- a/docker-compose/alert-manager.yml +++ /dev/null @@ -1,47 +0,0 @@ -global: - smtp_smarthost: 'smtp4dev:25' - smtp_from: 'alertmanager@example.org' - smtp_require_tls: false - -route: - routes: - - group_by: ["service"] - group_wait: 30s - group_interval: 5m - mute_time_intervals: ["weekday-off-evenings", "weekday-off-mornings", "weekends"] - receiver: team-mails - -time_intervals: - - name: "working-hours" - time_intervals: - - times: - - start_time: "09:00" - end_time: "18:00" - location: "Europe/Rome" - weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] - - name: "weekday-off-evenings" - time_intervals: - - times: - - start_time: "18:00" - end_time: "23:59" - location: "Europe/Rome" - weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] - - name: "weekday-off-mornings" - time_intervals: - - times: - - start_time: "00:00" - end_time: "08:59" - location: "Europe/Rome" - weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] - - name: "weekends" - time_intervals: - - times: - - start_time: "00:00" - end_time: "23:59" - location: "Europe/Rome" - weekdays: ['Saturday', 'Sunday'] - -receivers: - - name: "team-mails" - email_configs: - - to: "test-receiver@example.com" \ No newline at end of file diff --git a/docker-compose/alert-manager.yml.tpl b/docker-compose/alert-manager.yml.tpl new file mode 100644 index 0000000..5255449 --- /dev/null +++ b/docker-compose/alert-manager.yml.tpl @@ -0,0 +1,94 @@ 
+global: + smtp_smarthost: 'smtp4dev:25' + smtp_from: 'alertmanager@example.org' + smtp_require_tls: false + +route: + group_by: ["group"] + receiver: team-mails + routes: + - receiver: team-mails + # time to wait for grouping alarms when sending the first notification + group_wait: 30s + # time to wait for grouping new alarms when sending further notifications for the same group + group_interval: 1m + # should be a multiple of group_interval + repeat_interval: 2h + mute_time_intervals: ["weekday-off-evenings", "weekday-off-mornings", "weekends"] + matchers: + - severity =~ "warning|critical" + - receiver: team-slack-and-emails + # time to wait for grouping alarms when sending the first notification + group_wait: 30s + # time to wait for grouping new alarms when sending further notifications for the same group + group_interval: 1m + # should be a multiple of group_interval + repeat_interval: 2m + matchers: + - severity =~ "critical" + +time_intervals: + - name: "working-hours" + time_intervals: + - times: + - start_time: "09:00" + end_time: "18:00" + location: "Europe/Rome" + weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] + - name: "weekday-off-evenings" + time_intervals: + - times: + - start_time: "18:00" + end_time: "23:59" + location: "Europe/Rome" + weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] + - name: "weekday-off-mornings" + time_intervals: + - times: + - start_time: "00:00" + end_time: "08:59" + location: "Europe/Rome" + weekdays: ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'] + - name: "weekends" + time_intervals: + - times: + - start_time: "00:00" + end_time: "23:59" + location: "Europe/Rome" + weekdays: ['Saturday', 'Sunday'] + +receivers: + - name: "team-mails" + email_configs: + - to: "test-receiver@example.com" + send_resolved: true + - name: "team-slack-and-emails" + email_configs: + - to: "test-receiver@example.com" + send_resolved: true + slack_configs: + - send_resolved: true + channel: 
"#rabbitmq-management" + api_url: "env.SLACK_API_URL" + title: |- + [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }} + {{- if gt (len .CommonLabels) (len .GroupLabels) -}} + {{" "}}( + {{- with .CommonLabels.Remove .GroupLabels.Names }} + {{- range $index, $label := .SortedPairs -}} + {{ if $index }}, {{ end }} + {{- $label.Name }}="{{ $label.Value -}}" + {{- end }} + {{- end -}} + ) + {{- end }} + text: >- + {{ range .Alerts -}} + *Alert:* {{ .Annotations.title }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }} + + *Description:* {{ .Annotations.description }} + + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml index 5083d14..92e79bb 100644 --- a/docker-compose/docker-compose.yml +++ b/docker-compose/docker-compose.yml @@ -23,6 +23,11 @@ services: RABBITMQ_DEFAULT_PASS: rabbit networks: - backbone1 + healthcheck: + test: rabbitmq-diagnostics -q ping + interval: 30s + timeout: 30s + retries: 3 grafana: image: grafana/grafana:latest @@ -75,12 +80,16 @@ services: hostname: alert-manager volumes: - type: bind - source: ./alert-manager.yml - target: /etc/alertmanager/alert-manager.yml + source: ./alert-manager.yml.tpl + target: /etc/alertmanager/alert-manager.yml.tpl ports: - 9093:9093 - command: - - "--config.file=/etc/alertmanager/alert-manager.yml" + #NOTE: you need to setup an env variable SLACK_API_URL when starting docker-compose (e.g. 
with direnv) + entrypoint: > + sh -c " + sed 's|env.SLACK_API_URL|$SLACK_API_URL|' /etc/alertmanager/alert-manager.yml.tpl > /etc/alertmanager/alert-manager.yml \\ + && /bin/alertmanager --config.file=/etc/alertmanager/alert-manager.yml + " networks: - backbone1 diff --git a/docker-compose/prometheus-self-monitoring-alert.rules.yml b/docker-compose/prometheus-self-monitoring-alert.rules.yml index 92644c0..e577849 100644 --- a/docker-compose/prometheus-self-monitoring-alert.rules.yml +++ b/docker-compose/prometheus-self-monitoring-alert.rules.yml @@ -1,77 +1,75 @@ groups: - -- name: EmbeddedExporter - +- name: PrometheusSelfMonitoring rules: - - alert: PrometheusJobMissing expr: 'absent(up{job="prometheus"})' for: 0m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus job missing (instance {{ $labels.instance }}) description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: PrometheusTargetMissing expr: 'up == 0' for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus target missing (instance {{ $labels.instance }}) description: "A Prometheus target has disappeared. 
An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: PrometheusAllTargetsMissing expr: 'sum by (job) (up) == 0' for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus all targets missing (instance {{ $labels.instance }}) description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: PrometheusTargetMissingWithWarmupTime expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))' for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus target missing with warmup time (instance {{ $labels.instance }}) description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: PrometheusConfigurationReloadFailure expr: 'prometheus_config_last_reload_successful != 1' for: 0m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: PrometheusTooManyRestarts expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2' for: 0m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus too many restarts (instance {{ $labels.instance }}) description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - # - alert: PrometheusAlertmanagerJobMissing - # expr: 'absent(up{job="alertmanager"})' - # for: 0m - # labels: - # severity: warning - # annotations: - # summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) - # description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - + - alert: PrometheusAlertmanagerJobMissing + expr: 'absent(up{job="alertmanager"})' + for: 0m + labels: + severity: warning + group: "PrometheusSelfMonitoring" + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: PrometheusAlertmanagerConfigurationReloadFailure expr: 'alertmanager_config_last_reload_successful != 1' for: 0m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -81,6 +79,7 @@ groups: for: 0m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -90,6 +89,7 @@ groups: # for: 0m # labels: # severity: critical + # group: "PrometheusSelfMonitoring" # annotations: # summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) # description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -99,6 +99,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -108,6 +109,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -117,6 +119,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -126,6 +129,7 @@ groups: for: 5m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -135,6 +139,7 @@ groups: for: 0m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus notifications backlog (instance {{ $labels.instance }}) description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -144,6 +149,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -153,6 +159,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus target empty (instance {{ $labels.instance }}) description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -162,6 +169,7 @@ groups: for: 5m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus target scraping slow (instance {{ $labels.instance }}) description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. 
Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -171,6 +179,7 @@ groups: for: 5m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus large scrape (instance {{ $labels.instance }}) description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -180,6 +189,7 @@ groups: for: 0m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -189,6 +199,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -198,6 +209,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -207,6 +219,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -216,6 +229,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value 
}}\n LABELS = {{ $labels }}" @@ -225,6 +239,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -234,6 +249,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -243,6 +259,7 @@ groups: for: 0m labels: severity: critical + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -252,6 +269,7 @@ groups: for: 0m labels: severity: warning + group: "PrometheusSelfMonitoring" annotations: summary: Prometheus timeseries cardinality (instance {{ $labels.instance }}) description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" \ No newline at end of file diff --git a/docker-compose/prometheus.yml b/docker-compose/prometheus.yml index 51a0985..1c33d81 100644 --- a/docker-compose/prometheus.yml +++ b/docker-compose/prometheus.yml @@ -9,10 +9,15 @@ scrape_configs: static_configs: - targets: ["localhost:9090"] + - job_name: "alertmanager" + scrape_interval: 1m + static_configs: + - targets: ["alert-manager:9093"] + - job_name: "rabbitmq" static_configs: - targets: ["rabbitmq:15692"] - + - job_name: "rabbitmq-detailed" metrics_path: "/metrics/detailed" params: @@ -31,7 +36,7 @@ scrape_configs: alerting: alertmanagers: - scheme: "http" - - static_configs: + static_configs: - targets: ["alert-manager:9093"] diff --git 
a/docker-compose/rabbitmq-architecture-alert.rules.yml b/docker-compose/rabbitmq-architecture-alert.rules.yml index 5401ed1..2dbd097 100644 --- a/docker-compose/rabbitmq-architecture-alert.rules.yml +++ b/docker-compose/rabbitmq-architecture-alert.rules.yml @@ -1,11 +1,21 @@ groups: - name: RabbitMQ-Architecture rules: - - alert: DeadLetter not empty - expr: sum(rabbitmq_detailed_queue_messages{queue=""}) > 0 + - alert: DeadLetter (dlq) not empty + expr: sum(rabbitmq_detailed_queue_messages{queue="dlq"}) > 0 for: 0m labels: severity: warning + group: "RabbitMQ-Architecture" annotations: summary: DeadLetter queue is not empty - description: "The DeadLetter queue has some messages" \ No newline at end of file + description: "The DeadLetter queue (dlq) has some messages" + - alert: Other queue not empty + expr: sum(rabbitmq_detailed_queue_messages{queue="other"}) > 0 + for: 0m + labels: + severity: warning + group: "RabbitMQ-Architecture" + annotations: + summary: Other queue is not empty + description: "The other queue has some messages" \ No newline at end of file diff --git a/docker-compose/rabbitmq-cluster-alert.rules.yml b/docker-compose/rabbitmq-cluster-alert.rules.yml index 3f2f4c1..cc9d169 100644 --- a/docker-compose/rabbitmq-cluster-alert.rules.yml +++ b/docker-compose/rabbitmq-cluster-alert.rules.yml @@ -6,19 +6,21 @@ groups: rules: - alert: RabbitmqNodeDown - expr: 'sum(rabbitmq_build_info) < 3' + expr: 'sum(rabbitmq_build_info) < 1' for: 0m labels: severity: critical + group: "RabbitMQ" annotations: summary: RabbitMQ node down (instance {{ $labels.instance }}) - description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Less than 1 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: RabbitmqNodeNotDistributed expr: 'erlang_vm_dist_node_state < 3' for: 0m labels: severity: critical + group: "RabbitMQ" annotations: summary: RabbitMQ node not distributed 
(instance {{ $labels.instance }}) description: "Distribution link state is not 'up'\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -28,6 +30,7 @@ groups: for: 1h labels: severity: warning + group: "RabbitMQ" annotations: summary: RabbitMQ instances different versions (instance {{ $labels.instance }}) description: "Running different version of RabbitMQ in the same cluster, can lead to failure.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -37,6 +40,7 @@ groups: for: 2m labels: severity: warning + group: "RabbitMQ" annotations: summary: RabbitMQ memory high (instance {{ $labels.instance }}) description: "A node use more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -46,6 +50,7 @@ groups: for: 2m labels: severity: warning + group: "RabbitMQ" annotations: summary: RabbitMQ file descriptors usage (instance {{ $labels.instance }}) description: "A node use more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -55,6 +60,7 @@ groups: for: 1m labels: severity: warning + group: "RabbitMQ" annotations: summary: RabbitMQ too many unack messages (instance {{ $labels.instance }}) description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -64,6 +70,7 @@ groups: for: 2m labels: severity: warning + group: "RabbitMQ" annotations: summary: RabbitMQ too many connections (instance {{ $labels.instance }}) description: "The total connections of a node is too high\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -73,6 +80,7 @@ groups: # for: 1m # labels: # severity: warning + # group: "RabbitMQ" # annotations: # summary: RabbitMQ no queue consumer (instance {{ $labels.instance }}) # description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" @@ -82,6 +90,7 @@ groups: for: 2m labels: severity: warning + group: "RabbitMQ" annotations: summary: RabbitMQ unroutable messages (instance {{ $labels.instance }}) description: "A queue has 
unroutable messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"