Skip to content

Commit bb4a1f0

Browse files
committed
Add a timeout in ruler for each rule-group evaluation
1 parent fe0d7de commit bb4a1f0

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

pkg/ruler/ruler.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ type Config struct {
8787
NotificationQueueCapacity int
8888
// HTTP timeout duration when sending notifications to the Alertmanager.
8989
NotificationTimeout time.Duration
90+
// Timeout for rule group evaluation, including sending result to ingester
91+
GroupTimeout time.Duration
9092
}
9193

9294
// RegisterFlags adds the flags required to config this to the given FlagSet
@@ -100,6 +102,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
100102
f.DurationVar(&cfg.AlertmanagerRefreshInterval, "ruler.alertmanager-refresh-interval", 1*time.Minute, "How long to wait between refreshing alertmanager hosts.")
101103
f.IntVar(&cfg.NotificationQueueCapacity, "ruler.notification-queue-capacity", 10000, "Capacity of the queue for notifications to be sent to the Alertmanager.")
102104
f.DurationVar(&cfg.NotificationTimeout, "ruler.notification-timeout", 10*time.Second, "HTTP timeout duration when sending notifications to the Alertmanager.")
105+
f.DurationVar(&cfg.GroupTimeout, "ruler.group-timeout", 10*time.Second, "Timeout for rule group evaluation, including sending result to ingester")
103106
}
104107

105108
// Ruler evaluates rules.
@@ -109,6 +112,7 @@ type Ruler struct {
109112
alertURL *url.URL
110113
notifierCfg *config.Config
111114
queueCapacity int
115+
groupTimeout time.Duration
112116

113117
// Per-user notifiers with separate queues.
114118
notifiersMtx sync.Mutex
@@ -191,6 +195,7 @@ func NewRuler(cfg Config, d *distributor.Distributor, c *chunk.Store) (*Ruler, e
191195
notifierCfg: ncfg,
192196
queueCapacity: cfg.NotificationQueueCapacity,
193197
notifiers: map[string]*rulerNotifier{},
198+
groupTimeout: cfg.GroupTimeout,
194199
}, nil
195200
}
196201

@@ -352,12 +357,18 @@ func (r *Ruler) Evaluate(ctx context.Context, rs []rules.Rule) {
352357
logger := util.WithContext(ctx, util.Logger)
353358
level.Debug(logger).Log("msg", "evaluating rules...", "num_rules", len(rs))
354359
start := time.Now()
360+
ctx, cancelTimeout := context.WithTimeout(ctx, r.groupTimeout)
355361
g, err := r.newGroup(ctx, rs)
356362
if err != nil {
357363
level.Error(logger).Log("msg", "failed to create rule group", "err", err)
358364
return
359365
}
360366
g.Eval(ctx, start)
367+
if err := ctx.Err(); err == nil {
368+
cancelTimeout() // release resources
369+
} else {
370+
level.Warn(util.Logger).Log("msg", "context error", "error", err)
371+
}
361372

362373
// The prometheus routines we're calling have their own instrumentation
363374
// but, a) it's rule-based, not group-based, b) it's a summary, not a

0 commit comments

Comments
 (0)