
Commit b4af68a

ac1214 and pracucci authored
Add timeout for waiting on compactor to become ACTIVE in the ring. (#4262)
* add MaxRetries to WaitInstanceState (Signed-off-by: Albert <[email protected]>)
* update CHANGELOG.md (Signed-off-by: Albert <[email protected]>)
* Add timeout for waiting on compactor to become ACTIVE in the ring. (Signed-off-by: Albert <[email protected]>)
* add MaxRetries variable back to WaitInstanceState (Signed-off-by: Albert <[email protected]>)
* Fix linting issues (Signed-off-by: Albert <[email protected]>)
* Remove duplicate entry from changelog (Signed-off-by: Albert <[email protected]>)
* Address PR comments and set timeout to be configurable (Signed-off-by: Albert <[email protected]>)
* Address PR comments and fix tests (Signed-off-by: Albert <[email protected]>)
* Update unit tests (Signed-off-by: Albert <[email protected]>)
* Update changelog and fix linting (Signed-off-by: Albert <[email protected]>)
* Fixed CHANGELOG entry order (Signed-off-by: Marco Pracucci <[email protected]>)

Co-authored-by: Albert <[email protected]>
Co-authored-by: Marco Pracucci <[email protected]>
1 parent ff0d1a6 commit b4af68a

9 files changed: +175 -3 lines changed


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@
 ## master / unreleased
 
 * [CHANGE] Querier / ruler: Change `-querier.max-fetched-chunks-per-query` configuration to limit to maximum number of chunks that can be fetched in a single query. The number of chunks fetched by ingesters AND long-term storare combined should not exceed the value configured on `-querier.max-fetched-chunks-per-query`. #4260
+* [ENHANCEMENT] Add timeout for waiting on compactor to become ACTIVE in the ring. #4262
 * [BUGFIX] HA Tracker: when cleaning up obsolete elected replicas from KV store, tracker didn't update number of cluster per user correctly. #4336
 
 ## 1.10.0-rc.0 / 2021-06-28

docs/blocks-storage/compactor.md

Lines changed: 4 additions & 0 deletions
@@ -230,4 +230,8 @@ compactor:
 # Name of network interface to read address from.
 # CLI flag: -compactor.ring.instance-interface-names
 [instance_interface_names: <list of string> | default = [eth0 en0]]
+
+# Timeout for waiting on compactor to become ACTIVE in the ring.
+# CLI flag: -compactor.ring.wait-active-instance-timeout
+[wait_active_instance_timeout: <duration> | default = 10m]
 ```

docs/configuration/config-file-reference.md

Lines changed: 4 additions & 0 deletions
@@ -5195,6 +5195,10 @@ sharding_ring:
 # Name of network interface to read address from.
 # CLI flag: -compactor.ring.instance-interface-names
 [instance_interface_names: <list of string> | default = [eth0 en0]]
+
+# Timeout for waiting on compactor to become ACTIVE in the ring.
+# CLI flag: -compactor.ring.wait-active-instance-timeout
+[wait_active_instance_timeout: <duration> | default = 10m]
 ```
 
 ### `store_gateway_config`

pkg/compactor/compactor.go

Lines changed: 5 additions & 1 deletion
@@ -393,7 +393,11 @@ func (c *Compactor) starting(ctx context.Context) error {
     // users scanner depends on the ring (to check whether an user belongs
     // to this shard or not).
     level.Info(c.logger).Log("msg", "waiting until compactor is ACTIVE in the ring")
-    if err := ring.WaitInstanceState(ctx, c.ring, c.ringLifecycler.ID, ring.ACTIVE); err != nil {
+
+    ctxWithTimeout, cancel := context.WithTimeout(ctx, c.compactorCfg.ShardingRing.WaitActiveInstanceTimeout)
+    defer cancel()
+    if err := ring.WaitInstanceState(ctxWithTimeout, c.ring, c.ringLifecycler.ID, ring.ACTIVE); err != nil {
+        level.Error(c.logger).Log("msg", "compactor failed to become ACTIVE in the ring", "err", err)
         return err
     }
     level.Info(c.logger).Log("msg", "compactor is ACTIVE in the ring")
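The change above follows the standard Go pattern for bounding a blocking wait: derive a child context with context.WithTimeout, hand it to the wait call, and let the wait surface context.DeadlineExceeded once the deadline passes. Below is a minimal, self-contained sketch of that pattern; waitUntilActive is a hypothetical stand-in for ring.WaitInstanceState, and the 500ms timeout replaces the 10-minute default purely for demonstration.

package main

import (
    "context"
    "errors"
    "fmt"
    "time"
)

// waitUntilActive is a hypothetical stand-in for ring.WaitInstanceState:
// it polls until the context ends. In this sketch the instance never
// becomes ACTIVE, so only the context can stop the wait.
func waitUntilActive(ctx context.Context) error {
    ticker := time.NewTicker(100 * time.Millisecond)
    defer ticker.Stop()
    for {
        select {
        case <-ctx.Done():
            return ctx.Err()
        case <-ticker.C:
            // Poll again on the next tick.
        }
    }
}

func main() {
    // Bound the wait the same way the compactor's starting() now does.
    ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
    defer cancel()

    if err := waitUntilActive(ctx); errors.Is(err, context.DeadlineExceeded) {
        fmt.Println("compactor failed to become ACTIVE in the ring:", err)
    }
}

Because starting() now returns that error, the compactor fails its starting phase instead of hanging, which is what TestCompactor_ShouldFailCompactionOnTimeout further down asserts via the context.DeadlineExceeded check.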

pkg/compactor/compactor_ring.go

Lines changed: 8 additions & 1 deletion
@@ -34,6 +34,10 @@ type RingConfig struct {
 
     // Injected internally
     ListenPort int `yaml:"-"`
+
+    WaitActiveInstanceTimeout time.Duration `yaml:"wait_active_instance_timeout"`
+
+    ObservePeriod time.Duration `yaml:"-"`
 }
 
 // RegisterFlags adds the flags required to config this to the given FlagSet
@@ -59,6 +63,9 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) {
     f.StringVar(&cfg.InstanceAddr, "compactor.ring.instance-addr", "", "IP address to advertise in the ring.")
    f.IntVar(&cfg.InstancePort, "compactor.ring.instance-port", 0, "Port to advertise in the ring (defaults to server.grpc-listen-port).")
     f.StringVar(&cfg.InstanceID, "compactor.ring.instance-id", hostname, "Instance ID to register in the ring.")
+
+    // Timeout durations
+    f.DurationVar(&cfg.WaitActiveInstanceTimeout, "compactor.ring.wait-active-instance-timeout", 10*time.Minute, "Timeout for waiting on compactor to become ACTIVE in the ring.")
 }
 
 // ToLifecyclerConfig returns a LifecyclerConfig based on the compactor
@@ -87,7 +94,7 @@ func (cfg *RingConfig) ToLifecyclerConfig() ring.LifecyclerConfig {
     lc.InfNames = cfg.InstanceInterfaceNames
     lc.UnregisterOnShutdown = true
     lc.HeartbeatPeriod = cfg.HeartbeatPeriod
-    lc.ObservePeriod = 0
+    lc.ObservePeriod = cfg.ObservePeriod
     lc.JoinAfter = 0
     lc.MinReadyDuration = 0
     lc.FinalSleep = 0

pkg/compactor/compactor_test.go

Lines changed: 33 additions & 0 deletions
@@ -1092,6 +1092,9 @@ func prepareConfig() Config {
     compactorCfg.ShardingRing.WaitStabilityMinDuration = 0
     compactorCfg.ShardingRing.WaitStabilityMaxDuration = 0
 
+    // Set lower timeout for waiting on compactor to become ACTIVE in the ring for unit tests
+    compactorCfg.ShardingRing.WaitActiveInstanceTimeout = 5 * time.Second
+
     return compactorCfg
 }
 
@@ -1279,3 +1282,33 @@ func TestCompactor_DeleteLocalSyncFiles(t *testing.T) {
     require.NotEqual(t, numUsers, c1Users)
     require.Equal(t, numUsers, c1Users+c2Users)
 }
+
+func TestCompactor_ShouldFailCompactionOnTimeout(t *testing.T) {
+    t.Parallel()
+
+    // Mock the bucket
+    bucketClient := &bucket.ClientMock{}
+    bucketClient.MockIter("", []string{}, nil)
+
+    cfg := prepareConfig()
+    cfg.ShardingEnabled = true
+    cfg.ShardingRing.InstanceID = "compactor-1"
+    cfg.ShardingRing.InstanceAddr = "1.2.3.4"
+    cfg.ShardingRing.KVStore.Mock = consul.NewInMemoryClient(ring.GetCodec())
+
+    // Set ObservePeriod to longer than the timeout period to mock a timeout while waiting on ring to become ACTIVE
+    cfg.ShardingRing.ObservePeriod = time.Second * 10
+
+    c, _, _, logs, _ := prepare(t, cfg, bucketClient)
+
+    // Try to start the compactor with a bad consul kv-store. The
+    err := services.StartAndAwaitRunning(context.Background(), c)
+
+    // Assert that the compactor timed out
+    assert.Equal(t, context.DeadlineExceeded, err)
+
+    assert.ElementsMatch(t, []string{
+        `level=info component=compactor msg="waiting until compactor is ACTIVE in the ring"`,
+        `level=error component=compactor msg="compactor failed to become ACTIVE in the ring" err="context deadline exceeded"`,
+    }, removeIgnoredLogs(strings.Split(strings.TrimSpace(logs.String()), "\n")))
+}

pkg/ring/ring.go

Lines changed: 4 additions & 0 deletions
@@ -71,6 +71,10 @@ type ReadRing interface {
     // and size (number of instances).
     ShuffleShard(identifier string, size int) ReadRing
 
+    // GetInstanceState returns the current state of an instance or an error if the
+    // instance does not exist in the ring.
+    GetInstanceState(instanceID string) (InstanceState, error)
+
     // ShuffleShardWithLookback is like ShuffleShard() but the returned subring includes
     // all instances that have been part of the identifier's shard since "now - lookbackPeriod".
     ShuffleShardWithLookback(identifier string, size int, lookbackPeriod time.Duration, now time.Time) ReadRing

pkg/ring/util.go

Lines changed: 1 addition & 1 deletion
@@ -69,7 +69,7 @@ func GetInstancePort(configPort, listenPort int) int {
 
 // WaitInstanceState waits until the input instanceID is registered within the
 // ring matching the provided state. A timeout should be provided within the context.
-func WaitInstanceState(ctx context.Context, r *Ring, instanceID string, state InstanceState) error {
+func WaitInstanceState(ctx context.Context, r ReadRing, instanceID string, state InstanceState) error {
     backoff := util.NewBackoff(ctx, util.BackoffConfig{
         MinBackoff: 100 * time.Millisecond,
         MaxBackoff: time.Second,
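Only the new signature and the backoff prologue of WaitInstanceState are visible in this hunk. As a rough sketch of the loop it presumably runs (an assumption inferred from the unit tests below, not the verbatim Cortex implementation), the function keeps asking the ring for the instance state until it matches or the caller's context expires; switching the parameter from *Ring to the ReadRing interface is what lets the tests substitute the RingMock defined in util_test.go.

package main

import (
    "context"
    "fmt"
    "time"
)

// instanceStateReader is the minimal slice of the ReadRing interface this
// sketch needs; the real code uses ring.ReadRing and ring.InstanceState.
type instanceStateReader interface {
    GetInstanceState(instanceID string) (string, error)
}

// waitInstanceState is a simplified, assumed version of the polling loop:
// the real function uses util.NewBackoff (shown above) rather than a ticker.
func waitInstanceState(ctx context.Context, r instanceStateReader, instanceID, state string) error {
    ticker := time.NewTicker(100 * time.Millisecond)
    defer ticker.Stop()

    for {
        if actual, err := r.GetInstanceState(instanceID); err == nil && actual == state {
            return nil
        }
        select {
        case <-ctx.Done():
            // The caller's timeout (e.g. the compactor's configurable
            // wait-active-instance-timeout) surfaces here as
            // context.DeadlineExceeded.
            return ctx.Err()
        case <-ticker.C:
        }
    }
}

// staticRing always reports the same state, mimicking an instance that
// never leaves PENDING.
type staticRing struct{ state string }

func (s staticRing) GetInstanceState(string) (string, error) { return s.state, nil }

func main() {
    ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond)
    defer cancel()

    err := waitInstanceState(ctx, staticRing{state: "PENDING"}, "compactor-1", "ACTIVE")
    fmt.Println(err) // context deadline exceeded
}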

pkg/ring/util_test.go

Lines changed: 115 additions & 0 deletions
@@ -6,10 +6,65 @@ import (
     "testing"
     "time"
 
+    "github.com/pkg/errors"
+    "github.com/prometheus/client_golang/prometheus"
     "github.com/stretchr/testify/assert"
+    "github.com/stretchr/testify/mock"
     "github.com/stretchr/testify/require"
 )
 
+type RingMock struct {
+    mock.Mock
+}
+
+func (r *RingMock) Collect(ch chan<- prometheus.Metric) {}
+
+func (r *RingMock) Describe(ch chan<- *prometheus.Desc) {}
+
+func (r *RingMock) Get(key uint32, op Operation, bufDescs []InstanceDesc, bufHosts, bufZones []string) (ReplicationSet, error) {
+    args := r.Called(key, op, bufDescs, bufHosts, bufZones)
+    return args.Get(0).(ReplicationSet), args.Error(1)
+}
+
+func (r *RingMock) GetAllHealthy(op Operation) (ReplicationSet, error) {
+    args := r.Called(op)
+    return args.Get(0).(ReplicationSet), args.Error(1)
+}
+
+func (r *RingMock) GetReplicationSetForOperation(op Operation) (ReplicationSet, error) {
+    args := r.Called(op)
+    return args.Get(0).(ReplicationSet), args.Error(1)
+}
+
+func (r *RingMock) ReplicationFactor() int {
+    return 0
+}
+
+func (r *RingMock) InstancesCount() int {
+    return 0
+}
+
+func (r *RingMock) ShuffleShard(identifier string, size int) ReadRing {
+    args := r.Called(identifier, size)
+    return args.Get(0).(ReadRing)
+}
+
+func (r *RingMock) GetInstanceState(instanceID string) (InstanceState, error) {
+    args := r.Called(instanceID)
+    return args.Get(0).(InstanceState), args.Error(1)
+}
+
+func (r *RingMock) ShuffleShardWithLookback(identifier string, size int, lookbackPeriod time.Duration, now time.Time) ReadRing {
+    args := r.Called(identifier, size, lookbackPeriod, now)
+    return args.Get(0).(ReadRing)
+}
+
+func (r *RingMock) HasInstance(instanceID string) bool {
+    return true
+}
+
+func (r *RingMock) CleanupShuffleShardCache(identifier string) {}
+
 func TestGenerateTokens(t *testing.T) {
     tokens := GenerateTokens(1000000, nil)
 
@@ -184,3 +239,63 @@ func TestWaitRingStabilityShouldReturnErrorIfMaxWaitingIsReached(t *testing.T) {
 
     assert.InDelta(t, maxWaiting, elapsedTime, float64(2*time.Second))
 }
+
+func TestWaitInstanceStateTimeout(t *testing.T) {
+    t.Parallel()
+
+    const (
+        instanceID      = "test"
+        timeoutDuration = time.Second
+    )
+
+    ctx, cancel := context.WithTimeout(context.Background(), timeoutDuration)
+    defer cancel()
+
+    ring := &RingMock{}
+    ring.On("GetInstanceState", mock.Anything, mock.Anything).Return(ACTIVE, nil)
+
+    err := WaitInstanceState(ctx, ring, instanceID, PENDING)
+
+    assert.Equal(t, context.DeadlineExceeded, err)
+    ring.AssertCalled(t, "GetInstanceState", instanceID)
+}
+
+func TestWaitInstanceStateTimeoutOnError(t *testing.T) {
+    t.Parallel()
+
+    const (
+        instanceID      = "test"
+        timeoutDuration = time.Second
+    )
+
+    ctx, cancel := context.WithTimeout(context.Background(), timeoutDuration)
+    defer cancel()
+
+    ring := &RingMock{}
+    ring.On("GetInstanceState", mock.Anything, mock.Anything).Return(PENDING, errors.New("instance not found in the ring"))
+
+    err := WaitInstanceState(ctx, ring, instanceID, ACTIVE)
+
+    assert.Equal(t, context.DeadlineExceeded, err)
+    ring.AssertCalled(t, "GetInstanceState", instanceID)
+}
+
+func TestWaitInstanceStateExitsAfterActualStateEqualsState(t *testing.T) {
+    t.Parallel()
+
+    const (
+        instanceID      = "test"
+        timeoutDuration = time.Second
+    )
+
+    ctx, cancel := context.WithTimeout(context.Background(), timeoutDuration)
+    defer cancel()
+
+    ring := &RingMock{}
+    ring.On("GetInstanceState", mock.Anything, mock.Anything).Return(ACTIVE, nil)
+
+    err := WaitInstanceState(ctx, ring, instanceID, ACTIVE)
+
+    assert.Nil(t, err)
+    ring.AssertNumberOfCalls(t, "GetInstanceState", 1)
+}
