Skip to content

Commit 9960321

Browse files
authored
new flag: ruler.for-outage-tolerance (#2783)
* new flag: ruler.outage-tolerance Signed-off-by: Owen Diehl <[email protected]> * more ruler options Signed-off-by: Owen Diehl <[email protected]>
1 parent 7f6f879 commit 9960321

File tree

3 files changed

+37
-8
lines changed

3 files changed

+37
-8
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22

33
## master / unreleased
44

5+
* [FEATURE] Introduced `ruler.for-outage-tolerance`, Max time to tolerate outage for restoring "for" state of alert. #2783
6+
* [FEATURE] Introduced `ruler.for-grace-period`, Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period. #2783
7+
* [FEATURE] Introduced `ruler.resend-delay`, Minimum amount of time to wait before resending an alert to Alertmanager. #2783
58
* [CHANGE] Metric `cortex_kv_request_duration_seconds` now includes `name` label to denote which client is being used as well as the `backend` label to denote the KV backend implementation in use. #2648
69
* [CHANGE] Experimental Ruler: Rule groups persisted to object storage using the experimental API have an updated object key encoding to better handle special characters. Rule groups previously-stored using object storage must be renamed to the new format. #2646
710
* [CHANGE] Query Frontend now uses Round Robin to choose a tenant queue to service next. #2553

docs/configuration/config-file-reference.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -971,6 +971,19 @@ storage:
971971
# CLI flag: -ruler.notification-timeout
972972
[notification_timeout: <duration> | default = 10s]
973973

974+
# Max time to tolerate outage for restoring "for" state of alert.
975+
# CLI flag: -ruler.for-outage-tolerance
976+
[for_outage_tolerance: <duration> | default = 1h]
977+
978+
# Minimum duration between alert and restored "for" state. This is maintained
979+
# only for alerts with configured "for" time greater than grace period.
980+
# CLI flag: -ruler.for-grace-period
981+
[for_grace_period: <duration> | default = 10m]
982+
983+
# Minimum amount of time to wait before resending an alert to Alertmanager.
984+
# CLI flag: -ruler.resend-delay
985+
[resend_delay: <duration> | default = 1m]
986+
974987
# Distribute rule evaluation using ring backend
975988
# CLI flag: -ruler.enable-sharding
976989
[enable_sharding: <boolean> | default = false]

pkg/ruler/ruler.go

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,13 @@ type Config struct {
8888
// HTTP timeout duration when sending notifications to the Alertmanager.
8989
NotificationTimeout time.Duration `yaml:"notification_timeout"`
9090

91+
// Max time to tolerate outage for restoring "for" state of alert.
92+
OutageTolerance time.Duration `yaml:"for_outage_tolerance"`
93+
// Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.
94+
ForGracePeriod time.Duration `yaml:"for_grace_period"`
95+
// Minimum amount of time to wait before resending an alert to Alertmanager.
96+
ResendDelay time.Duration `yaml:"resend_delay"`
97+
9198
// Enable sharding rule groups.
9299
EnableSharding bool `yaml:"enable_sharding"`
93100
SearchPendingFor time.Duration `yaml:"search_pending_for"`
@@ -132,6 +139,9 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
132139
f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")
133140
f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers")
134141
f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api")
142+
f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`)
143+
f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.`)
144+
f.DurationVar(&cfg.ResendDelay, "ruler.resend-delay", time.Minute, `Minimum amount of time to wait before resending an alert to Alertmanager.`)
135145
}
136146

137147
// Ruler evaluates rules.
@@ -523,14 +533,17 @@ func (r *Ruler) newManager(ctx context.Context, userID string) (*promRules.Manag
523533
reg = prometheus.WrapRegistererWithPrefix("cortex_", reg)
524534
logger := log.With(r.logger, "user", userID)
525535
opts := &promRules.ManagerOptions{
526-
Appendable: tsdb,
527-
TSDB: tsdb,
528-
QueryFunc: engineQueryFunc(r.engine, r.queryable, r.cfg.EvaluationDelay),
529-
Context: user.InjectOrgID(ctx, userID),
530-
ExternalURL: r.alertURL,
531-
NotifyFunc: sendAlerts(notifier, r.alertURL.String()),
532-
Logger: logger,
533-
Registerer: reg,
536+
Appendable: tsdb,
537+
TSDB: tsdb,
538+
QueryFunc: engineQueryFunc(r.engine, r.queryable, r.cfg.EvaluationDelay),
539+
Context: user.InjectOrgID(ctx, userID),
540+
ExternalURL: r.alertURL,
541+
NotifyFunc: sendAlerts(notifier, r.alertURL.String()),
542+
Logger: logger,
543+
Registerer: reg,
544+
OutageTolerance: r.cfg.OutageTolerance,
545+
ForGracePeriod: r.cfg.ForGracePeriod,
546+
ResendDelay: r.cfg.ResendDelay,
534547
}
535548
return promRules.NewManager(opts), nil
536549
}

0 commit comments

Comments
 (0)