Skip to content

Commit 43e3745

Browse files
Allow starting ingester in JOINING state to auto join (#2565)
If an ingester crashes during token transfer, or fails to roll back a token transfer, then it will start up with a ring entry in the JOINING state. This means the ingester will no longer auto join, because it is not in the PENDING state, and it cannot receive a transfer request either. It will just stay in the JOINING state until it is manually deleted and forgotten from the ring. Instead, handle this case upon init, and set the state back to PENDING. This will allow the newly started ingester to properly receive a transfer request or auto join. Signed-off-by: Chris Marchbanks <[email protected]> Co-authored-by: Peter Štibraný <[email protected]>
1 parent aa10e35 commit 43e3745

File tree

3 files changed

+64
-0
lines changed

3 files changed

+64
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@
9595
* [BUGFIX] Querier: Fixed a situation where querier would crash because of an unresponsive frontend instance. #2569
9696
* [BUGFIX] Fixed collection of tracing spans from Thanos components used internally. #2584
9797
* [BUGFIX] Experimental TSDB: fixed memory leak in ingesters. #2586
98+
* [BUGFIX] Ingester: Fix an ingester starting up in the JOINING state and staying there forever. #2565
9899

99100
## 1.0.1 / 2020-04-23
100101

pkg/ring/lifecycler.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,17 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
511511
return ringDesc, true, nil
512512
}
513513

514+
// If the ingester is in the JOINING state this means it crashed due to
515+
// a failed token transfer or some other reason during startup. We want
516+
// to set it back to PENDING in order to start the lifecycle from the
517+
// beginning.
518+
if ingesterDesc.State == JOINING {
519+
level.Warn(util.Logger).Log("msg", "instance found in ring as JOINING, setting to PENDING",
520+
"ring", i.RingName)
521+
ingesterDesc.State = PENDING
522+
return ringDesc, true, nil
523+
}
524+
514525
// If the ingester failed to clean its ring entry up, it can leave its state in LEAVING.
515526
// Move it into ACTIVE to ensure the ingester joins the ring.
516527
if ingesterDesc.State == LEAVING && len(ingesterDesc.Tokens) == i.cfg.NumTokens {

pkg/ring/lifecycler_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,58 @@ func TestJoinInLeavingState(t *testing.T) {
413413
})
414414
}
415415

416+
// TestJoinInJoiningState ensures that if the lifecycler starts up while the ring already has it in the JOINING state, it is still able to auto join.
417+
func TestJoinInJoiningState(t *testing.T) {
418+
var ringConfig Config
419+
flagext.DefaultValues(&ringConfig)
420+
c := GetCodec()
421+
ringConfig.KVStore.Mock = consul.NewInMemoryClient(c)
422+
423+
r, err := New(ringConfig, "ingester", IngesterRingKey)
424+
require.NoError(t, err)
425+
require.NoError(t, services.StartAndAwaitRunning(context.Background(), r))
426+
defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck
427+
428+
cfg := testLifecyclerConfig(ringConfig, "ing1")
429+
cfg.NumTokens = 2
430+
cfg.MinReadyDuration = 1 * time.Nanosecond
431+
432+
// Set state as JOINING
433+
err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (interface{}, bool, error) {
434+
r := &Desc{
435+
Ingesters: map[string]IngesterDesc{
436+
"ing1": {
437+
State: JOINING,
438+
Tokens: []uint32{1, 4},
439+
},
440+
"ing2": {
441+
Tokens: []uint32{2, 3},
442+
},
443+
},
444+
}
445+
446+
return r, true, nil
447+
})
448+
require.NoError(t, err)
449+
450+
l1, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester", IngesterRingKey, true)
451+
require.NoError(t, err)
452+
require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1))
453+
454+
// Check that the lifecycler was able to join after coming up in JOINING
455+
test.Poll(t, 1000*time.Millisecond, true, func() interface{} {
456+
d, err := r.KVClient.Get(context.Background(), IngesterRingKey)
457+
require.NoError(t, err)
458+
459+
desc, ok := d.(*Desc)
460+
return ok &&
461+
len(desc.Ingesters) == 2 &&
462+
desc.Ingesters["ing1"].State == ACTIVE &&
463+
len(desc.Ingesters["ing1"].Tokens) == cfg.NumTokens &&
464+
len(desc.Ingesters["ing2"].Tokens) == 2
465+
})
466+
}
467+
416468
func TestRestoreOfZoneWhenOverwritten(t *testing.T) {
417469
// This test is simulating a case during upgrade of pre 1.0 cortex where
418470
// older ingesters do not have the zone field in their ring structs

0 commit comments

Comments
 (0)