Skip to content

Commit e0bd75e

Browse files
committed
Allow setting ring heartbeat timeout to zero to disable timeout check.
This change allows the various ring heartbeat timeouts to be configured with zero, as a means of disabling the timeout. This is expected to be used with a separate enhancement to allow disabling heartbeats. When the heartbeat timeout is disabled, instances will always appear as healthy in the ring. Signed-off-by: Steve Simpson <[email protected]>
1 parent 95fedaa commit e0bd75e

File tree

13 files changed

+78
-19
lines changed

13 files changed

+78
-19
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@
55
* [CHANGE] Querier / ruler: Change `-querier.max-fetched-chunks-per-query` configuration to limit the maximum number of chunks that can be fetched in a single query. The number of chunks fetched by ingesters AND long-term storage combined should not exceed the value configured on `-querier.max-fetched-chunks-per-query`. #4260
66
* [ENHANCEMENT] Add timeout for waiting on compactor to become ACTIVE in the ring. #4262
77
* [ENHANCEMENT] Reduce memory used by streaming queries, particularly in ruler. #4341
8+
* [ENHANCEMENT] Ring: allow heartbeat timeouts to be disabled (experimental) by setting the relevant configuration value to zero. Applies to the following: #4342
9+
* `-distributor.ring.heartbeat-timeout`
10+
* `-ring.heartbeat-timeout`
11+
* `-ruler.ring.heartbeat-timeout`
12+
* `-alertmanager.sharding-ring.heartbeat-timeout`
13+
* `-compactor.ring.heartbeat-timeout`
14+
* `-store-gateway.sharding-ring.heartbeat-timeout`
815
* [BUGFIX] HA Tracker: when cleaning up obsolete elected replicas from KV store, tracker didn't update number of cluster per user correctly. #4336
916

1017
## 1.10.0-rc.0 / 2021-06-28

docs/blocks-storage/compactor.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ compactor:
214214
[heartbeat_period: <duration> | default = 5s]
215215

216216
# The heartbeat timeout after which compactors are considered unhealthy
217-
# within the ring.
217+
# within the ring. 0 = never (timeout disabled).
218218
# CLI flag: -compactor.ring.heartbeat-timeout
219219
[heartbeat_timeout: <duration> | default = 1m]
220220

docs/blocks-storage/store-gateway.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,8 +237,8 @@ store_gateway:
237237
[heartbeat_period: <duration> | default = 15s]
238238

239239
# The heartbeat timeout after which store gateways are considered unhealthy
240-
# within the ring. This option needs be set both on the store-gateway and
241-
# querier when running in microservices mode.
240+
# within the ring. 0 = never (timeout disabled). This option needs to be set
241+
# both on the store-gateway and querier when running in microservices mode.
242242
# CLI flag: -store-gateway.sharding-ring.heartbeat-timeout
243243
[heartbeat_timeout: <duration> | default = 1m]
244244

docs/configuration/config-file-reference.md

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,7 @@ ring:
568568
[heartbeat_period: <duration> | default = 5s]
569569
570570
# The heartbeat timeout after which distributors are considered unhealthy
571-
# within the ring.
571+
# within the ring. 0 = never (timeout disabled).
572572
# CLI flag: -distributor.ring.heartbeat-timeout
573573
[heartbeat_timeout: <duration> | default = 1m]
574574
@@ -662,6 +662,7 @@ lifecycler:
662662
[mirror_timeout: <duration> | default = 2s]
663663
664664
# The heartbeat timeout after which ingesters are skipped for reads/writes.
665+
# 0 = never (timeout disabled).
665666
# CLI flag: -ring.heartbeat-timeout
666667
[heartbeat_timeout: <duration> | default = 1m]
667668
@@ -1585,7 +1586,7 @@ ring:
15851586
[heartbeat_period: <duration> | default = 5s]
15861587

15871588
# The heartbeat timeout after which rulers are considered unhealthy within the
1588-
# ring.
1589+
# ring. 0 = never (timeout disabled).
15891590
# CLI flag: -ruler.ring.heartbeat-timeout
15901591
[heartbeat_timeout: <duration> | default = 1m]
15911592

@@ -1906,7 +1907,7 @@ sharding_ring:
19061907
[heartbeat_period: <duration> | default = 15s]
19071908
19081909
# The heartbeat timeout after which alertmanagers are considered unhealthy
1909-
# within the ring.
1910+
# within the ring. 0 = never (timeout disabled).
19101911
# CLI flag: -alertmanager.sharding-ring.heartbeat-timeout
19111912
[heartbeat_timeout: <duration> | default = 1m]
19121913
@@ -5179,7 +5180,7 @@ sharding_ring:
51795180
[heartbeat_period: <duration> | default = 5s]
51805181
51815182
# The heartbeat timeout after which compactors are considered unhealthy within
5182-
# the ring.
5183+
# the ring. 0 = never (timeout disabled).
51835184
# CLI flag: -compactor.ring.heartbeat-timeout
51845185
[heartbeat_timeout: <duration> | default = 1m]
51855186
@@ -5257,8 +5258,8 @@ sharding_ring:
52575258
[heartbeat_period: <duration> | default = 15s]
52585259
52595260
# The heartbeat timeout after which store gateways are considered unhealthy
5260-
# within the ring. This option needs be set both on the store-gateway and
5261-
# querier when running in microservices mode.
5261+
# within the ring. 0 = never (timeout disabled). This option needs to be set both
5262+
# on the store-gateway and querier when running in microservices mode.
52625263
# CLI flag: -store-gateway.sharding-ring.heartbeat-timeout
52635264
[heartbeat_timeout: <duration> | default = 1m]
52645265

pkg/alertmanager/alertmanager_ring.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
7777
// Ring flags
7878
cfg.KVStore.RegisterFlagsWithPrefix(rfprefix, "alertmanagers/", f)
7979
f.DurationVar(&cfg.HeartbeatPeriod, rfprefix+"heartbeat-period", 15*time.Second, "Period at which to heartbeat to the ring.")
80-
f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring.")
80+
f.DurationVar(&cfg.HeartbeatTimeout, rfprefix+"heartbeat-timeout", time.Minute, "The heartbeat timeout after which alertmanagers are considered unhealthy within the ring. 0 = never (timeout disabled).")
8181
f.IntVar(&cfg.ReplicationFactor, rfprefix+"replication-factor", 3, "The replication factor to use when sharding the alertmanager.")
8282
f.BoolVar(&cfg.ZoneAwarenessEnabled, rfprefix+"zone-awareness-enabled", false, "True to enable zone-awareness and replicate alerts across different availability zones.")
8383

pkg/compactor/compactor_ring.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
5151
// Ring flags
5252
cfg.KVStore.RegisterFlagsWithPrefix("compactor.ring.", "collectors/", f)
5353
f.DurationVar(&cfg.HeartbeatPeriod, "compactor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.")
54-
f.DurationVar(&cfg.HeartbeatTimeout, "compactor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which compactors are considered unhealthy within the ring.")
54+
f.DurationVar(&cfg.HeartbeatTimeout, "compactor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which compactors are considered unhealthy within the ring. 0 = never (timeout disabled).")
5555

5656
// Wait stability flags.
5757
f.DurationVar(&cfg.WaitStabilityMinDuration, "compactor.ring.wait-stability-min-duration", time.Minute, "Minimum time to wait for ring stability at startup. 0 to disable.")

pkg/distributor/distributor_ring.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
4343
// Ring flags
4444
cfg.KVStore.RegisterFlagsWithPrefix("distributor.ring.", "collectors/", f)
4545
f.DurationVar(&cfg.HeartbeatPeriod, "distributor.ring.heartbeat-period", 5*time.Second, "Period at which to heartbeat to the ring.")
46-
f.DurationVar(&cfg.HeartbeatTimeout, "distributor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which distributors are considered unhealthy within the ring.")
46+
f.DurationVar(&cfg.HeartbeatTimeout, "distributor.ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which distributors are considered unhealthy within the ring. 0 = never (timeout disabled).")
4747

4848
// Instance flags
4949
cfg.InstanceInterfaceNames = []string{"eth0", "en0"}

pkg/ring/model.go

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ func (d *Desc) FindIngestersByState(state InstanceState) []InstanceDesc {
101101
func (d *Desc) Ready(now time.Time, heartbeatTimeout time.Duration) error {
102102
numTokens := 0
103103
for id, ingester := range d.Ingesters {
104-
if now.Sub(time.Unix(ingester.Timestamp, 0)) > heartbeatTimeout {
104+
if !ingester.IsHeartbeatHealthy(heartbeatTimeout, now) {
105105
return fmt.Errorf("instance %s past heartbeat timeout", id)
106106
} else if ingester.State != ACTIVE {
107107
return fmt.Errorf("instance %s in state %v", id, ingester.State)
@@ -136,7 +136,16 @@ func (i *InstanceDesc) GetRegisteredAt() time.Time {
136136
func (i *InstanceDesc) IsHealthy(op Operation, heartbeatTimeout time.Duration, now time.Time) bool {
137137
healthy := op.IsInstanceInStateHealthy(i.State)
138138

139-
return healthy && now.Unix()-i.Timestamp <= heartbeatTimeout.Milliseconds()/1000
139+
return healthy && i.IsHeartbeatHealthy(heartbeatTimeout, now)
140+
}
141+
142+
// IsHeartbeatHealthy returns whether the heartbeat timestamp for the ingester is within the
143+
// specified timeout period. A timeout of zero disables the timeout; the heartbeat is ignored.
144+
func (i *InstanceDesc) IsHeartbeatHealthy(heartbeatTimeout time.Duration, now time.Time) bool {
145+
if heartbeatTimeout == 0 {
146+
return true
147+
}
148+
return now.Sub(time.Unix(i.Timestamp, 0)) <= heartbeatTimeout
140149
}
141150

142151
// Merge merges other ring into this one. Returns sub-ring that represents the change,

pkg/ring/model_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,10 +136,18 @@ func TestDesc_Ready(t *testing.T) {
136136
t.Fatal("expected ready, got", err)
137137
}
138138

139+
if err := r.Ready(now, 0); err != nil {
140+
t.Fatal("expected ready, got", err)
141+
}
142+
139143
if err := r.Ready(now.Add(5*time.Minute), 10*time.Second); err == nil {
140144
t.Fatal("expected !ready (no heartbeat from active ingester), but got no error")
141145
}
142146

147+
if err := r.Ready(now.Add(5*time.Minute), 0); err != nil {
148+
t.Fatal("expected ready (no heartbeat but timeout disabled), got", err)
149+
}
150+
143151
r = &Desc{
144152
Ingesters: map[string]InstanceDesc{
145153
"ing1": {

pkg/ring/ring.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
147147
func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) {
148148
cfg.KVStore.RegisterFlagsWithPrefix(prefix, "collectors/", f)
149149

150-
f.DurationVar(&cfg.HeartbeatTimeout, prefix+"ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which ingesters are skipped for reads/writes.")
150+
f.DurationVar(&cfg.HeartbeatTimeout, prefix+"ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which ingesters are skipped for reads/writes. 0 = never (timeout disabled).")
151151
f.IntVar(&cfg.ReplicationFactor, prefix+"distributor.replication-factor", 3, "The number of ingesters to write to and read from.")
152152
f.BoolVar(&cfg.ZoneAwarenessEnabled, prefix+"distributor.zone-awareness-enabled", false, "True to enable the zone-awareness and replicate ingested samples across different availability zones.")
153153
}

0 commit comments

Comments
 (0)