Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@
* [BUGFIX] Querier: Fixed a situation where querier would crash because of an unresponsive frontend instance. #2569
* [BUGFIX] Fixed collection of tracing spans from Thanos components used internally. #2584
* [BUGFIX] Experimental TSDB: fixed memory leak in ingesters. #2586
* [BUGFIX] Ingester: Fix an ingester starting up in the JOINING state and staying there forever. #2565

## 1.0.1 / 2020-04-23

Expand Down
11 changes: 11 additions & 0 deletions pkg/ring/lifecycler.go
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,17 @@ func (i *Lifecycler) initRing(ctx context.Context) error {
return ringDesc, true, nil
}

// If the ingester is in the JOINING state this means it crashed due to
// a failed token transfer or some other reason during startup. We want
// to set it back to PENDING in order to start the lifecycle from the
// beginning.
if ingesterDesc.State == JOINING {
level.Warn(util.Logger).Log("msg", "instance found in ring as JOINING, setting to PENDING",
"ring", i.RingName)
ingesterDesc.State = PENDING
return ringDesc, true, nil
}

// If the ingester failed to clean its ring entry up, it can leave its state in LEAVING.
// Move it into ACTIVE to ensure the ingester joins the ring.
if ingesterDesc.State == LEAVING && len(ingesterDesc.Tokens) == i.cfg.NumTokens {
Expand Down
52 changes: 52 additions & 0 deletions pkg/ring/lifecycler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,58 @@ func TestJoinInLeavingState(t *testing.T) {
})
}

// TestJoinInJoiningState ensures that a lifecycler which starts up while the
// ring already lists it in the JOINING state is still able to auto join and
// end up ACTIVE with its configured number of tokens.
func TestJoinInJoiningState(t *testing.T) {
	var ringConfig Config
	flagext.DefaultValues(&ringConfig)
	c := GetCodec()
	ringConfig.KVStore.Mock = consul.NewInMemoryClient(c)

	r, err := New(ringConfig, "ingester", IngesterRingKey)
	require.NoError(t, err)
	require.NoError(t, services.StartAndAwaitRunning(context.Background(), r))
	defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck

	cfg := testLifecyclerConfig(ringConfig, "ing1")
	cfg.NumTokens = 2
	cfg.MinReadyDuration = 1 * time.Nanosecond

	// Seed the ring so that "ing1" is already present in the JOINING state
	// before its lifecycler starts. "ing2" only carries tokens; its state is
	// left at the zero value.
	err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (interface{}, bool, error) {
		// Named differently from the outer ring `r` to avoid shadowing it.
		seeded := &Desc{
			Ingesters: map[string]IngesterDesc{
				"ing1": {
					State:  JOINING,
					Tokens: []uint32{1, 4},
				},
				"ing2": {
					Tokens: []uint32{2, 3},
				},
			},
		}

		return seeded, true, nil
	})
	require.NoError(t, err)

	l1, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester", IngesterRingKey, true)
	require.NoError(t, err)
	require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1))
	// Stop the lifecycler when the test finishes so it does not keep running
	// after the test returns. Deferred calls run in LIFO order, so the
	// lifecycler is stopped before the ring it talks to.
	defer services.StopAndAwaitTerminated(context.Background(), l1) //nolint:errcheck

	// Check that the lifecycler was able to join after coming up in JOINING
	test.Poll(t, 1000*time.Millisecond, true, func() interface{} {
		d, err := r.KVClient.Get(context.Background(), IngesterRingKey)
		require.NoError(t, err)

		desc, ok := d.(*Desc)
		return ok &&
			len(desc.Ingesters) == 2 &&
			desc.Ingesters["ing1"].State == ACTIVE &&
			len(desc.Ingesters["ing1"].Tokens) == cfg.NumTokens &&
			len(desc.Ingesters["ing2"].Tokens) == 2
	})
}

func TestRestoreOfZoneWhenOverwritten(t *testing.T) {
// This test is simulating a case during upgrade of pre 1.0 cortex where
// older ingesters do not have the zone field in their ring structs
Expand Down