new flag: ruler.for-outage-tolerance (#2783)

owen-d · web-flow · commit 996032198d5f · 2020-06-25T08:14:45.000+02:00
* new flag: ruler.outage-tolerance
Signed-off-by: Owen Diehl <ow.diehl@gmail.com>

* more ruler options
Signed-off-by: Owen Diehl <ow.diehl@gmail.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ## master / unreleased
 
+* [FEATURE] Introduced `ruler.for-outage-tolerance`, Max time to tolerate outage for restoring "for" state of alert. #2783
+* [FEATURE] Introduced `ruler.for-grace-period`, Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period. #2783
+* [FEATURE] Introduced `ruler.resend-delay`, Minimum amount of time to wait before resending an alert to Alertmanager. #2783
 * [CHANGE] Metric `cortex_kv_request_duration_seconds` now includes `name` label to denote which client is being used as well as the `backend` label to denote the KV backend implementation in use. #2648
 * [CHANGE] Experimental Ruler: Rule groups persisted to object storage using the experimental API have an updated object key encoding to better handle special characters. Rule groups previously-stored using object storage must be renamed to the new format. #2646
 * [CHANGE] Query Frontend now uses Round Robin to choose a tenant queue to service next. #2553
diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md
@@ -971,6 +971,19 @@ storage:
 # CLI flag: -ruler.notification-timeout
 [notification_timeout: <duration> | default = 10s]
 
+# Max time to tolerate outage for restoring "for" state of alert.
+# CLI flag: -ruler.for-outage-tolerance
+[for_outage_tolerance: <duration> | default = 1h]
+
+# Minimum duration between alert and restored "for" state. This is maintained
+# only for alerts with configured "for" time greater than grace period.
+# CLI flag: -ruler.for-grace-period
+[for_grace_period: <duration> | default = 10m]
+
+# Minimum amount of time to wait before resending an alert to Alertmanager.
+# CLI flag: -ruler.resend-delay
+[resend_delay: <duration> | default = 1m]
+
 # Distribute rule evaluation using ring backend
 # CLI flag: -ruler.enable-sharding
 [enable_sharding: <boolean> | default = false]
diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go
@@ -88,6 +88,13 @@ type Config struct {
 	// HTTP timeout duration when sending notifications to the Alertmanager.
 	NotificationTimeout time.Duration `yaml:"notification_timeout"`
 
+	// Max time to tolerate outage for restoring "for" state of alert.
+	OutageTolerance time.Duration `yaml:"for_outage_tolerance"`
+	// Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.
+	ForGracePeriod time.Duration `yaml:"for_grace_period"`
+	// Minimum amount of time to wait before resending an alert to Alertmanager.
+	ResendDelay time.Duration `yaml:"resend_delay"`
+
 	// Enable sharding rule groups.
 	EnableSharding   bool          `yaml:"enable_sharding"`
 	SearchPendingFor time.Duration `yaml:"search_pending_for"`
@@ -132,6 +139,9 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
 	f.DurationVar(&cfg.FlushCheckPeriod, "ruler.flush-period", 1*time.Minute, "Period with which to attempt to flush rule groups.")
 	f.StringVar(&cfg.RulePath, "ruler.rule-path", "/rules", "file path to store temporary rule files for the prometheus rule managers")
 	f.BoolVar(&cfg.EnableAPI, "experimental.ruler.enable-api", false, "Enable the ruler api")
+	f.DurationVar(&cfg.OutageTolerance, "ruler.for-outage-tolerance", time.Hour, `Max time to tolerate outage for restoring "for" state of alert.`)
+	f.DurationVar(&cfg.ForGracePeriod, "ruler.for-grace-period", 10*time.Minute, `Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period.`)
+	f.DurationVar(&cfg.ResendDelay, "ruler.resend-delay", time.Minute, `Minimum amount of time to wait before resending an alert to Alertmanager.`)
 }
 
 // Ruler evaluates rules.
@@ -523,14 +533,17 @@ func (r *Ruler) newManager(ctx context.Context, userID string) (*promRules.Manag
 	reg = prometheus.WrapRegistererWithPrefix("cortex_", reg)
 	logger := log.With(r.logger, "user", userID)
 	opts := &promRules.ManagerOptions{
-		Appendable:  tsdb,
-		TSDB:        tsdb,
-		QueryFunc:   engineQueryFunc(r.engine, r.queryable, r.cfg.EvaluationDelay),
-		Context:     user.InjectOrgID(ctx, userID),
-		ExternalURL: r.alertURL,
-		NotifyFunc:  sendAlerts(notifier, r.alertURL.String()),
-		Logger:      logger,
-		Registerer:  reg,
+		Appendable:      tsdb,
+		TSDB:            tsdb,
+		QueryFunc:       engineQueryFunc(r.engine, r.queryable, r.cfg.EvaluationDelay),
+		Context:         user.InjectOrgID(ctx, userID),
+		ExternalURL:     r.alertURL,
+		NotifyFunc:      sendAlerts(notifier, r.alertURL.String()),
+		Logger:          logger,
+		Registerer:      reg,
+		OutageTolerance: r.cfg.OutageTolerance,
+		ForGracePeriod:  r.cfg.ForGracePeriod,
+		ResendDelay:     r.cfg.ResendDelay,
 	}
 	return promRules.NewManager(opts), nil
 }