
Commit cae36dc

Alertmanager alerts limits (#4253)
* Add store limits.
* Expose alerts limiter metrics.
* Fix tests.
* CHANGELOG.md
* Address review feedback.
* Added comment.
* Address review feedback.
* Move check to the top.
* When an existing alert grows and no longer fits the size limit, it is rejected.

Signed-off-by: Peter Štibraný <[email protected]>
1 parent 65fbad2 commit cae36dc

File tree: 9 files changed (+413, -2 lines)

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@
   * `memberlist_client_kv_store_value_tombstones_removed_total`
   * `memberlist_client_messages_to_broadcast_dropped_total`
 * [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-dispatcher-aggregation-groups` option to control max number of active dispatcher groups in Alertmanager (per tenant, also overrideable). When the limit is reached, Dispatcher produces log message and increases `cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total` metric. #4254
+* [ENHANCEMENT] Alertmanager: Added `-alertmanager.max-alerts-count` and `-alertmanager.max-alerts-size-bytes` to control the max number of alerts and the total size of alerts that a single user can have in Alertmanager's memory. Adding more alerts fails with a log message and increments the `cortex_alertmanager_alerts_insert_limited_total` metric (per user). These limits can be overridden by using per-tenant overrides. Current values are tracked in the `cortex_alertmanager_alerts_limiter_current_alerts` and `cortex_alertmanager_alerts_limiter_current_alerts_size_bytes` metrics. #4253
 * [BUGFIX] Purger: fix `Invalid null value in condition for column range` caused by `nil` value in range for WriteBatch query. #4128
 * [BUGFIX] Ingester: fixed infrequent panic caused by a race condition between TSDB mmap-ed head chunks truncation and queries. #4176
 * [BUGFIX] Alertmanager: fix Alertmanager status page if clustering via gossip is disabled or sharding is enabled. #4184

docs/configuration/config-file-reference.md

Lines changed: 11 additions & 0 deletions
@@ -4180,6 +4180,17 @@ The `limits_config` configures default and per-tenant limits imposed by Cortex s
 # 0 = no limit.
 # CLI flag: -alertmanager.max-dispatcher-aggregation-groups
 [alertmanager_max_dispatcher_aggregation_groups: <int> | default = 0]
+
+# Maximum number of alerts that a single user can have. Inserting more alerts
+# will fail with a log message and metric increment. 0 = no limit.
+# CLI flag: -alertmanager.max-alerts-count
+[alertmanager_max_alerts_count: <int> | default = 0]
+
+# Maximum total size of alerts that a single user can have, alert size is the
+# sum of the bytes of its labels, annotations and generatorURL. Inserting more
+# alerts will fail with a log message and metric increment. 0 = no limit.
+# CLI flag: -alertmanager.max-alerts-size-bytes
+[alertmanager_max_alerts_size_bytes: <int> | default = 0]
 ```
 
 ### `redis_config`
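
For illustration, the new options can be set as defaults in the `limits` block of the Cortex configuration, or per tenant through the runtime overrides file, like other Alertmanager limits. The snippet below is a minimal sketch, not taken from this commit: the tenant ID and values are hypothetical, and it assumes the standard Cortex `limits` / `overrides` configuration layout.

limits:
  # Defaults for all tenants; 0 keeps a limit disabled.
  alertmanager_max_alerts_count: 1000
  alertmanager_max_alerts_size_bytes: 1048576    # 1 MiB

# Per-tenant override in the runtime configuration file, for a hypothetical tenant "team-a".
overrides:
  team-a:
    alertmanager_max_alerts_count: 5000
    alertmanager_max_alerts_size_bytes: 5242880  # 5 MiB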

pkg/alertmanager/alertmanager.go

Lines changed: 145 additions & 1 deletion
@@ -166,6 +166,7 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
             Name: "alertmanager_notification_rate_limited_total",
             Help: "Number of rate-limited notifications per integration.",
         }, []string{"integration"}), // "integration" is consistent with other alertmanager metrics.
+
     }
 
     am.registry = reg
@@ -241,7 +242,12 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
         am.wg.Done()
     }()
 
-    am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 30*time.Minute, nil, am.logger)
+    var callback mem.AlertStoreCallback
+    if am.cfg.Limits != nil {
+        callback = newAlertsLimiter(am.cfg.UserID, am.cfg.Limits, reg)
+    }
+
+    am.alerts, err = mem.NewAlerts(context.Background(), am.marker, 30*time.Minute, callback, am.logger)
     if err != nil {
         return nil, fmt.Errorf("failed to create alerts: %v", err)
     }
@@ -584,3 +590,141 @@ type dispatcherLimits struct {
 func (g *dispatcherLimits) MaxNumberOfAggregationGroups() int {
     return g.limits.AlertmanagerMaxDispatcherAggregationGroups(g.tenant)
 }
+
+var (
+    errTooManyAlerts = "too many alerts, limit: %d"
+    errAlertsTooBig  = "alerts too big, total size limit: %d bytes"
+)
+
+// alertsLimiter limits the number and size of alerts being received by the Alertmanager.
+// We consider an alert unique based on its fingerprint (a hash of its labels), and
+// its size is determined by the sum of bytes of its labels, annotations, and generator URL.
+type alertsLimiter struct {
+    tenant string
+    limits Limits
+
+    failureCounter prometheus.Counter
+
+    mx        sync.Mutex
+    sizes     map[model.Fingerprint]int
+    count     int
+    totalSize int
+}
+
+func newAlertsLimiter(tenant string, limits Limits, reg prometheus.Registerer) *alertsLimiter {
+    limiter := &alertsLimiter{
+        tenant: tenant,
+        limits: limits,
+        sizes:  map[model.Fingerprint]int{},
+        failureCounter: promauto.With(reg).NewCounter(prometheus.CounterOpts{
+            Name: "alertmanager_alerts_insert_limited_total",
+            Help: "Number of failures to insert new alerts to in-memory alert store.",
+        }),
+    }
+
+    promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
+        Name: "alertmanager_alerts_limiter_current_alerts",
+        Help: "Number of alerts tracked by alerts limiter.",
+    }, func() float64 {
+        c, _ := limiter.currentStats()
+        return float64(c)
+    })
+
+    promauto.With(reg).NewGaugeFunc(prometheus.GaugeOpts{
+        Name: "alertmanager_alerts_limiter_current_alerts_size_bytes",
+        Help: "Total size of alerts tracked by alerts limiter.",
+    }, func() float64 {
+        _, s := limiter.currentStats()
+        return float64(s)
+    })
+
+    return limiter
+}
+
+func (a *alertsLimiter) PreStore(alert *types.Alert, existing bool) error {
+    if alert == nil {
+        return nil
+    }
+
+    fp := alert.Fingerprint()
+
+    countLimit := a.limits.AlertmanagerMaxAlertsCount(a.tenant)
+    sizeLimit := a.limits.AlertmanagerMaxAlertsSizeBytes(a.tenant)
+
+    sizeDiff := alertSize(alert.Alert)
+
+    a.mx.Lock()
+    defer a.mx.Unlock()
+
+    if !existing && countLimit > 0 && (a.count+1) > countLimit {
+        a.failureCounter.Inc()
+        return fmt.Errorf(errTooManyAlerts, countLimit)
+    }
+
+    if existing {
+        sizeDiff -= a.sizes[fp]
+    }
+
+    if sizeLimit > 0 && (a.totalSize+sizeDiff) > sizeLimit {
+        a.failureCounter.Inc()
+        return fmt.Errorf(errAlertsTooBig, sizeLimit)
+    }
+
+    return nil
+}
+
+func (a *alertsLimiter) PostStore(alert *types.Alert, existing bool) {
+    if alert == nil {
+        return
+    }
+
+    newSize := alertSize(alert.Alert)
+    fp := alert.Fingerprint()
+
+    a.mx.Lock()
+    defer a.mx.Unlock()
+
+    if existing {
+        a.totalSize -= a.sizes[fp]
+    } else {
+        a.count++
+    }
+    a.sizes[fp] = newSize
+    a.totalSize += newSize
+}
+
+func (a *alertsLimiter) PostDelete(alert *types.Alert) {
+    if alert == nil {
+        return
+    }
+
+    fp := alert.Fingerprint()
+
+    a.mx.Lock()
+    defer a.mx.Unlock()
+
+    a.totalSize -= a.sizes[fp]
+    delete(a.sizes, fp)
+    a.count--
+}
+
+func (a *alertsLimiter) currentStats() (count, totalSize int) {
+    a.mx.Lock()
+    defer a.mx.Unlock()
+
+    return a.count, a.totalSize
+}
+
+func alertSize(alert model.Alert) int {
+    size := 0
+    for l, v := range alert.Labels {
+        size += len(l)
+        size += len(v)
+    }
+    for l, v := range alert.Annotations {
+        size += len(l)
+        size += len(v)
+    }
+    size += len(alert.GeneratorURL)
+    return size
+}
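
To make the size accounting concrete, here is a small, self-contained Go sketch (not part of the commit) that reuses the alertSize logic shown above on a sample alert; the label and annotation values are made up for the example, and only the github.com/prometheus/common/model package is assumed.

package main

import (
    "fmt"

    "github.com/prometheus/common/model"
)

// alertSize mirrors the helper added in this commit: an alert's size is the sum of
// the bytes of its label names/values, annotation names/values and generator URL.
func alertSize(alert model.Alert) int {
    size := 0
    for l, v := range alert.Labels {
        size += len(l) + len(v)
    }
    for l, v := range alert.Annotations {
        size += len(l) + len(v)
    }
    size += len(alert.GeneratorURL)
    return size
}

func main() {
    a := model.Alert{
        Labels:       model.LabelSet{"alertname": "HighLatency", "severity": "critical"},
        Annotations:  model.LabelSet{"summary": "p99 latency above 1s"},
        GeneratorURL: "http://prometheus.example/graph",
    }
    // (9+11) + (8+8) + (7+20) + 31 = 94 bytes charged against -alertmanager.max-alerts-size-bytes.
    fmt.Println(alertSize(a)) // prints 94
}

Note that PreStore charges this size against the per-tenant total before the alert is stored, and for an alert that already exists it only charges the size difference, which is why an update that grows an alert beyond the size limit is rejected.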

pkg/alertmanager/alertmanager_metrics.go

Lines changed: 21 additions & 0 deletions
@@ -61,6 +61,9 @@ type alertmanagerMetrics struct {
 
     notificationRateLimited                 *prometheus.Desc
     dispatcherAggregationGroupsLimitReached *prometheus.Desc
+    insertAlertFailures                     *prometheus.Desc
+    alertsLimiterAlertsCount                *prometheus.Desc
+    alertsLimiterAlertsSize                 *prometheus.Desc
 }
 
 func newAlertmanagerMetrics() *alertmanagerMetrics {
@@ -214,6 +217,18 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
             "cortex_alertmanager_dispatcher_aggregation_group_limit_reached_total",
             "Number of times when dispatcher failed to create new aggregation group due to limit.",
             []string{"user"}, nil),
+        insertAlertFailures: prometheus.NewDesc(
+            "cortex_alertmanager_alerts_insert_limited_total",
+            "Total number of failures to store alert due to hitting alertmanager limits.",
+            []string{"user"}, nil),
+        alertsLimiterAlertsCount: prometheus.NewDesc(
+            "cortex_alertmanager_alerts_limiter_current_alerts",
+            "Number of alerts tracked by alerts limiter.",
+            []string{"user"}, nil),
+        alertsLimiterAlertsSize: prometheus.NewDesc(
+            "cortex_alertmanager_alerts_limiter_current_alerts_size_bytes",
+            "Total size of alerts tracked by alerts limiter.",
+            []string{"user"}, nil),
     }
 }
 
@@ -265,6 +280,9 @@ func (m *alertmanagerMetrics) Describe(out chan<- *prometheus.Desc) {
     out <- m.persistFailed
     out <- m.notificationRateLimited
     out <- m.dispatcherAggregationGroupsLimitReached
+    out <- m.insertAlertFailures
+    out <- m.alertsLimiterAlertsCount
+    out <- m.alertsLimiterAlertsSize
 }
 
 func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
@@ -313,4 +331,7 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
 
     data.SendSumOfCountersPerUserWithLabels(out, m.notificationRateLimited, "alertmanager_notification_rate_limited_total", "integration")
     data.SendSumOfCountersPerUser(out, m.dispatcherAggregationGroupsLimitReached, "alertmanager_dispatcher_aggregation_group_limit_reached_total")
+    data.SendSumOfCountersPerUser(out, m.insertAlertFailures, "alertmanager_alerts_insert_limited_total")
+    data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsCount, "alertmanager_alerts_limiter_current_alerts")
+    data.SendSumOfGaugesPerUser(out, m.alertsLimiterAlertsSize, "alertmanager_alerts_limiter_current_alerts_size_bytes")
 }

pkg/alertmanager/alertmanager_metrics_test.go

Lines changed: 80 additions & 0 deletions
@@ -274,6 +274,22 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
         # HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage.
         # TYPE cortex_alertmanager_state_persist_total counter
         cortex_alertmanager_state_persist_total 0
+
+        # HELP cortex_alertmanager_alerts_limiter_current_alerts Number of alerts tracked by alerts limiter.
+        # TYPE cortex_alertmanager_alerts_limiter_current_alerts gauge
+        cortex_alertmanager_alerts_limiter_current_alerts{user="user1"} 10
+        cortex_alertmanager_alerts_limiter_current_alerts{user="user2"} 100
+        cortex_alertmanager_alerts_limiter_current_alerts{user="user3"} 1000
+        # HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter.
+        # TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge
+        cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100
+        cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000
+        cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user3"} 10000
+        # HELP cortex_alertmanager_alerts_insert_limited_total Total number of failures to store alert due to hitting alertmanager limits.
+        # TYPE cortex_alertmanager_alerts_insert_limited_total counter
+        cortex_alertmanager_alerts_insert_limited_total{user="user1"} 7
+        cortex_alertmanager_alerts_insert_limited_total{user="user2"} 70
+        cortex_alertmanager_alerts_insert_limited_total{user="user3"} 700
 `))
     require.NoError(t, err)
 }
@@ -557,6 +573,23 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
         # HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage.
         # TYPE cortex_alertmanager_state_persist_total counter
         cortex_alertmanager_state_persist_total 0
+
+        # HELP cortex_alertmanager_alerts_limiter_current_alerts Number of alerts tracked by alerts limiter.
+        # TYPE cortex_alertmanager_alerts_limiter_current_alerts gauge
+        cortex_alertmanager_alerts_limiter_current_alerts{user="user1"} 10
+        cortex_alertmanager_alerts_limiter_current_alerts{user="user2"} 100
+        cortex_alertmanager_alerts_limiter_current_alerts{user="user3"} 1000
+        # HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter.
+        # TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge
+        cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100
+        cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000
+        cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user3"} 10000
+        # HELP cortex_alertmanager_alerts_insert_limited_total Total number of failures to store alert due to hitting alertmanager limits.
+        # TYPE cortex_alertmanager_alerts_insert_limited_total counter
+        cortex_alertmanager_alerts_insert_limited_total{user="user1"} 7
+        cortex_alertmanager_alerts_insert_limited_total{user="user2"} 70
+        cortex_alertmanager_alerts_insert_limited_total{user="user3"} 700
+
 `))
     require.NoError(t, err)
 
@@ -788,6 +821,19 @@ func TestAlertmanagerMetricsRemoval(t *testing.T) {
         # HELP cortex_alertmanager_state_persist_total Number of times we have tried to persist the running state to storage.
         # TYPE cortex_alertmanager_state_persist_total counter
        cortex_alertmanager_state_persist_total 0
+
+        # HELP cortex_alertmanager_alerts_limiter_current_alerts Number of alerts tracked by alerts limiter.
+        # TYPE cortex_alertmanager_alerts_limiter_current_alerts gauge
+        cortex_alertmanager_alerts_limiter_current_alerts{user="user1"} 10
+        cortex_alertmanager_alerts_limiter_current_alerts{user="user2"} 100
+        # HELP cortex_alertmanager_alerts_limiter_current_alerts_size_bytes Total size of alerts tracked by alerts limiter.
+        # TYPE cortex_alertmanager_alerts_limiter_current_alerts_size_bytes gauge
+        cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user1"} 100
+        cortex_alertmanager_alerts_limiter_current_alerts_size_bytes{user="user2"} 1000
+        # HELP cortex_alertmanager_alerts_insert_limited_total Total number of failures to store alert due to hitting alertmanager limits.
+        # TYPE cortex_alertmanager_alerts_insert_limited_total counter
+        cortex_alertmanager_alerts_insert_limited_total{user="user1"} 7
+        cortex_alertmanager_alerts_insert_limited_total{user="user2"} 70
 `))
     require.NoError(t, err)
 }
@@ -838,6 +884,11 @@ func populateAlertmanager(base float64) *prometheus.Registry {
     v2APIMetrics.invalid.Add(base)
     v2APIMetrics.resolved.Add(base * 3)
 
+    lm := newLimiterMetrics(reg)
+    lm.count.Set(10 * base)
+    lm.size.Set(100 * base)
+    lm.insertFailures.Add(7 * base)
+
     return reg
 }
 
@@ -1041,3 +1092,32 @@ func newAPIMetrics(version string, r prometheus.Registerer) *apiMetrics {
         invalid: numInvalidAlerts,
     }
 }
+
+type limiterMetrics struct {
+    count          prometheus.Gauge
+    size           prometheus.Gauge
+    insertFailures prometheus.Counter
+}
+
+func newLimiterMetrics(r prometheus.Registerer) *limiterMetrics {
+    count := promauto.With(r).NewGauge(prometheus.GaugeOpts{
+        Name: "alertmanager_alerts_limiter_current_alerts",
+        Help: "Number of alerts tracked by alerts limiter.",
+    })
+
+    size := promauto.With(r).NewGauge(prometheus.GaugeOpts{
+        Name: "alertmanager_alerts_limiter_current_alerts_size_bytes",
+        Help: "Total size of alerts tracked by alerts limiter.",
+    })
+
+    insertAlertFailures := promauto.With(r).NewCounter(prometheus.CounterOpts{
+        Name: "alertmanager_alerts_insert_limited_total",
+        Help: "Number of failures to insert new alerts to in-memory alert store.",
+    })
+
+    return &limiterMetrics{
+        count:          count,
+        size:           size,
+        insertFailures: insertAlertFailures,
+    }
+}
