From 06ef63dc43edc7f7a59b3bfd858e951d60698835 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Wed, 27 Oct 2021 16:47:38 -0500 Subject: [PATCH 1/6] Migrate to dskit/ring Signed-off-by: Tyler Reid --- go.mod | 2 +- go.sum | 2 + pkg/alertmanager/alertmanager_client.go | 2 +- pkg/alertmanager/alertmanager_ring.go | 7 +- pkg/alertmanager/alertmanager_ring_test.go | 3 +- pkg/alertmanager/distributor.go | 4 +- pkg/alertmanager/distributor_test.go | 4 +- pkg/alertmanager/lifecycle.go | 2 +- pkg/alertmanager/multitenant.go | 12 +- pkg/alertmanager/multitenant_test.go | 4 +- pkg/api/api.go | 2 +- pkg/compactor/compactor.go | 6 +- pkg/compactor/compactor_ring.go | 2 +- pkg/compactor/compactor_ring_test.go | 3 +- pkg/compactor/compactor_test.go | 23 +- pkg/cortex/cortex.go | 2 +- pkg/cortex/cortex_test.go | 2 +- pkg/cortex/modules.go | 5 +- pkg/distributor/distributor.go | 8 +- pkg/distributor/distributor_ring.go | 2 +- pkg/distributor/distributor_ring_test.go | 3 +- pkg/distributor/distributor_test.go | 8 +- pkg/distributor/ha_tracker_test.go | 2 +- pkg/distributor/ingester_client_pool.go | 5 +- pkg/distributor/query.go | 2 +- pkg/ingester/flush_test.go | 2 +- pkg/ingester/ingester.go | 4 +- pkg/ingester/ingester_test.go | 2 +- pkg/ingester/ingester_v2.go | 4 +- pkg/ingester/ingester_v2_test.go | 2 +- pkg/ingester/lifecycle_test.go | 26 +- pkg/ingester/transfer.go | 2 +- pkg/querier/blocks_store_balanced_set.go | 2 +- pkg/querier/blocks_store_queryable.go | 8 +- pkg/querier/blocks_store_replicated_set.go | 4 +- .../blocks_store_replicated_set_test.go | 6 +- pkg/querier/store_gateway_client.go | 2 +- pkg/querier/worker/scheduler_processor.go | 2 +- pkg/ring/basic_lifecycler_delegates_test.go | 308 --- pkg/ring/basic_lifecycler_test.go | 469 ---- pkg/ring/client/pool_test.go | 146 -- .../client/ring_service_discovery_test.go | 67 - pkg/ring/lifecycler_test.go | 686 ------ pkg/ring/merge_test.go | 453 ---- pkg/ring/model_test.go | 418 ---- pkg/ring/replication_set_test.go | 243 -- pkg/ring/replication_set_tracker_test.go | 266 -- pkg/ring/replication_strategy_test.go | 165 -- pkg/ring/ring_test.go | 2136 ----------------- pkg/ring/testutils/testutils.go | 26 - pkg/ring/tokens_test.go | 77 - pkg/ring/util_test.go | 301 --- pkg/ruler/client_pool.go | 3 +- pkg/ruler/lifecycle.go | 2 +- pkg/ruler/lifecycle_test.go | 22 +- pkg/ruler/ruler.go | 12 +- pkg/ruler/ruler_ring.go | 8 +- pkg/ruler/ruler_test.go | 2 +- pkg/storegateway/gateway.go | 12 +- pkg/storegateway/gateway_ring.go | 7 +- pkg/storegateway/gateway_ring_test.go | 3 +- pkg/storegateway/gateway_test.go | 2 +- pkg/storegateway/sharding_strategy.go | 2 +- pkg/storegateway/sharding_strategy_test.go | 6 +- .../grafana/dskit}/ring/basic_lifecycler.go | 15 +- .../dskit}/ring/basic_lifecycler_delegates.go | 0 .../dskit}/ring/basic_lifecycler_metrics.go | 6 +- .../github.com/grafana/dskit}/ring/batch.go | 4 + .../grafana/dskit}/ring/client/pool.go | 11 +- .../ring/client/ring_service_discovery.go | 2 +- .../github.com/grafana/dskit}/ring/flush.go | 0 .../github.com/grafana/dskit}/ring/http.go | 44 +- .../grafana/dskit}/ring/lifecycler.go | 229 +- .../grafana/dskit/ring/lifecycler_metrics.go | 40 + .../github.com/grafana/dskit}/ring/model.go | 32 +- .../grafana/dskit}/ring/replication_set.go | 0 .../dskit}/ring/replication_set_tracker.go | 0 .../dskit}/ring/replication_strategy.go | 0 .../github.com/grafana/dskit}/ring/ring.go | 198 +- .../github.com/grafana/dskit}/ring/ring.pb.go | 2 +- .../github.com/grafana/dskit}/ring/ring.proto | 2 +- 
.../grafana/dskit/ring/shard/shard.go | 45 + .../github.com/grafana/dskit}/ring/tokens.go | 2 +- .../github.com/grafana/dskit}/ring/util.go | 63 +- .../grafana/dskit/ring/util/string_utils.go | 12 + vendor/github.com/grafana/dskit/time/time.go | 96 + vendor/modules.txt | 7 +- 87 files changed, 702 insertions(+), 6133 deletions(-) delete mode 100644 pkg/ring/basic_lifecycler_delegates_test.go delete mode 100644 pkg/ring/basic_lifecycler_test.go delete mode 100644 pkg/ring/client/pool_test.go delete mode 100644 pkg/ring/client/ring_service_discovery_test.go delete mode 100644 pkg/ring/lifecycler_test.go delete mode 100644 pkg/ring/merge_test.go delete mode 100644 pkg/ring/model_test.go delete mode 100644 pkg/ring/replication_set_test.go delete mode 100644 pkg/ring/replication_set_tracker_test.go delete mode 100644 pkg/ring/replication_strategy_test.go delete mode 100644 pkg/ring/ring_test.go delete mode 100644 pkg/ring/testutils/testutils.go delete mode 100644 pkg/ring/tokens_test.go delete mode 100644 pkg/ring/util_test.go rename {pkg => vendor/github.com/grafana/dskit}/ring/basic_lifecycler.go (96%) rename {pkg => vendor/github.com/grafana/dskit}/ring/basic_lifecycler_delegates.go (100%) rename {pkg => vendor/github.com/grafana/dskit}/ring/basic_lifecycler_metrics.go (85%) rename {pkg => vendor/github.com/grafana/dskit}/ring/batch.go (96%) rename {pkg => vendor/github.com/grafana/dskit}/ring/client/pool.go (92%) rename {pkg => vendor/github.com/grafana/dskit}/ring/client/ring_service_discovery.go (91%) rename {pkg => vendor/github.com/grafana/dskit}/ring/flush.go (100%) rename {pkg => vendor/github.com/grafana/dskit}/ring/http.go (79%) rename {pkg => vendor/github.com/grafana/dskit}/ring/lifecycler.go (74%) create mode 100644 vendor/github.com/grafana/dskit/ring/lifecycler_metrics.go rename {pkg => vendor/github.com/grafana/dskit}/ring/model.go (94%) rename {pkg => vendor/github.com/grafana/dskit}/ring/replication_set.go (100%) rename {pkg => vendor/github.com/grafana/dskit}/ring/replication_set_tracker.go (100%) rename {pkg => vendor/github.com/grafana/dskit}/ring/replication_strategy.go (100%) rename {pkg => vendor/github.com/grafana/dskit}/ring/ring.go (86%) rename {pkg => vendor/github.com/grafana/dskit}/ring/ring.pb.go (99%) rename {pkg => vendor/github.com/grafana/dskit}/ring/ring.proto (97%) create mode 100644 vendor/github.com/grafana/dskit/ring/shard/shard.go rename {pkg => vendor/github.com/grafana/dskit}/ring/tokens.go (96%) rename {pkg => vendor/github.com/grafana/dskit}/ring/util.go (67%) create mode 100644 vendor/github.com/grafana/dskit/ring/util/string_utils.go create mode 100644 vendor/github.com/grafana/dskit/time/time.go diff --git a/go.mod b/go.mod index 8dfad1048c7..36ab042e6b2 100644 --- a/go.mod +++ b/go.mod @@ -29,7 +29,7 @@ require ( github.com/golang/protobuf v1.5.2 github.com/golang/snappy v0.0.4 github.com/gorilla/mux v1.8.0 - github.com/grafana/dskit v0.0.0-20211011144203-3a88ec0b675f + github.com/grafana/dskit v0.0.0-20211021180445-3bd016e9d7f1 github.com/json-iterator/go v1.1.11 github.com/lib/pq v1.3.0 github.com/minio/minio-go/v7 v7.0.10 diff --git a/go.sum b/go.sum index e4165c77937..0086dca9650 100644 --- a/go.sum +++ b/go.sum @@ -1014,6 +1014,8 @@ github.com/grafana/dskit v0.0.0-20210818123532-6645f87e9e12/go.mod h1:QaNAQaCSFO github.com/grafana/dskit v0.0.0-20210819132858-471020752967/go.mod h1:uF46UNN1/feB1egpq8UGbBBKvJjGgZauW7pcVbeFLLM= github.com/grafana/dskit v0.0.0-20211011144203-3a88ec0b675f h1:FvvSVEbnGeM2bUivGmsiXTi8URJyBU7TcFEEoRe5wWI= 
github.com/grafana/dskit v0.0.0-20211011144203-3a88ec0b675f/go.mod h1:uPG2nyK4CtgNDmWv7qyzYcdI+S90kHHRWvHnBtEMBXM= +github.com/grafana/dskit v0.0.0-20211021180445-3bd016e9d7f1 h1:Qf+/W3Tup0nO21tgJmO14WJK0yyrm4L2UJipZP+Zoow= +github.com/grafana/dskit v0.0.0-20211021180445-3bd016e9d7f1/go.mod h1:uPG2nyK4CtgNDmWv7qyzYcdI+S90kHHRWvHnBtEMBXM= github.com/grafana/gocql v0.0.0-20200605141915-ba5dc39ece85 h1:xLuzPoOzdfNb/RF/IENCw+oLVdZB4G21VPhkHBgwSHY= github.com/grafana/gocql v0.0.0-20200605141915-ba5dc39ece85/go.mod h1:crI9WX6p0IhrqB+DqIUHulRW853PaNFf7o4UprV//3I= github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA= diff --git a/pkg/alertmanager/alertmanager_client.go b/pkg/alertmanager/alertmanager_client.go index 57571a31464..51ce369c39a 100644 --- a/pkg/alertmanager/alertmanager_client.go +++ b/pkg/alertmanager/alertmanager_client.go @@ -7,6 +7,7 @@ import ( "github.com/go-kit/log" "github.com/grafana/dskit/crypto/tls" "github.com/grafana/dskit/grpcclient" + "github.com/grafana/dskit/ring/client" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" @@ -14,7 +15,6 @@ import ( "google.golang.org/grpc/health/grpc_health_v1" "github.com/cortexproject/cortex/pkg/alertmanager/alertmanagerpb" - "github.com/cortexproject/cortex/pkg/ring/client" ) // ClientsPool is the interface used to get the client from the pool for a specified address. diff --git a/pkg/alertmanager/alertmanager_ring.go b/pkg/alertmanager/alertmanager_ring.go index 57b300099c7..6c0eefc7c8b 100644 --- a/pkg/alertmanager/alertmanager_ring.go +++ b/pkg/alertmanager/alertmanager_ring.go @@ -6,11 +6,12 @@ import ( "os" "time" + "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/ring" - "github.com/cortexproject/cortex/pkg/ring" util_log "github.com/cortexproject/cortex/pkg/util/log" ) @@ -94,8 +95,8 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { // ToLifecyclerConfig returns a LifecyclerConfig based on the alertmanager // ring config. 
-func (cfg *RingConfig) ToLifecyclerConfig() (ring.BasicLifecyclerConfig, error) { - instanceAddr, err := ring.GetInstanceAddr(cfg.InstanceAddr, cfg.InstanceInterfaceNames) +func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecyclerConfig, error) { + instanceAddr, err := ring.GetInstanceAddr(cfg.InstanceAddr, cfg.InstanceInterfaceNames, logger) if err != nil { return ring.BasicLifecyclerConfig{}, err } diff --git a/pkg/alertmanager/alertmanager_ring_test.go b/pkg/alertmanager/alertmanager_ring_test.go index 3e4d460252e..266e28659ac 100644 --- a/pkg/alertmanager/alertmanager_ring_test.go +++ b/pkg/alertmanager/alertmanager_ring_test.go @@ -4,9 +4,8 @@ import ( "testing" "time" + "github.com/grafana/dskit/ring" "github.com/stretchr/testify/assert" - - "github.com/cortexproject/cortex/pkg/ring" ) func TestIsHealthyForAlertmanagerOperations(t *testing.T) { diff --git a/pkg/alertmanager/distributor.go b/pkg/alertmanager/distributor.go index 53112160a1d..b8db92f6a98 100644 --- a/pkg/alertmanager/distributor.go +++ b/pkg/alertmanager/distributor.go @@ -12,6 +12,8 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/grafana/dskit/ring" + "github.com/grafana/dskit/ring/client" "github.com/grafana/dskit/services" "github.com/opentracing/opentracing-go" "github.com/pkg/errors" @@ -20,8 +22,6 @@ import ( "github.com/weaveworks/common/user" "github.com/cortexproject/cortex/pkg/alertmanager/merger" - "github.com/cortexproject/cortex/pkg/ring" - "github.com/cortexproject/cortex/pkg/ring/client" "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util" util_log "github.com/cortexproject/cortex/pkg/util/log" diff --git a/pkg/alertmanager/distributor_test.go b/pkg/alertmanager/distributor_test.go index 6dafc36a786..6ea6edb19d1 100644 --- a/pkg/alertmanager/distributor_test.go +++ b/pkg/alertmanager/distributor_test.go @@ -17,6 +17,7 @@ import ( "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" @@ -27,7 +28,6 @@ import ( "google.golang.org/grpc/health/grpc_health_v1" "github.com/cortexproject/cortex/pkg/alertmanager/alertmanagerpb" - "github.com/cortexproject/cortex/pkg/ring" util_log "github.com/cortexproject/cortex/pkg/util/log" "github.com/cortexproject/cortex/pkg/util/test" ) @@ -355,7 +355,7 @@ func prepare(t *testing.T, numAM, numHappyAM, replicationFactor int, responseBod }, HeartbeatTimeout: 60 * time.Minute, ReplicationFactor: replicationFactor, - }, RingNameForServer, RingKey, nil) + }, RingNameForServer, RingKey, nil, nil) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), amRing)) test.Poll(t, time.Second, numAM, func() interface{} { diff --git a/pkg/alertmanager/lifecycle.go b/pkg/alertmanager/lifecycle.go index 54e420701a4..b3a9e894045 100644 --- a/pkg/alertmanager/lifecycle.go +++ b/pkg/alertmanager/lifecycle.go @@ -1,7 +1,7 @@ package alertmanager import ( - "github.com/cortexproject/cortex/pkg/ring" + "github.com/grafana/dskit/ring" ) func (r *MultitenantAlertmanager) OnRingInstanceRegister(_ *ring.BasicLifecycler, ringDesc ring.Desc, instanceExists bool, instanceID string, instanceDesc ring.InstanceDesc) (ring.InstanceState, ring.Tokens) { diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 42ec392b571..208bb24fec0 100644 --- 
a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -18,6 +18,8 @@ import ( "github.com/grafana/dskit/concurrency" "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/ring" + "github.com/grafana/dskit/ring/client" "github.com/grafana/dskit/services" "github.com/pkg/errors" "github.com/prometheus/alertmanager/cluster" @@ -34,8 +36,6 @@ import ( "github.com/cortexproject/cortex/pkg/alertmanager/alertmanagerpb" "github.com/cortexproject/cortex/pkg/alertmanager/alertspb" "github.com/cortexproject/cortex/pkg/alertmanager/alertstore" - "github.com/cortexproject/cortex/pkg/ring" - "github.com/cortexproject/cortex/pkg/ring/client" "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util" util_log "github.com/cortexproject/cortex/pkg/util/log" @@ -398,7 +398,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC } if cfg.ShardingEnabled { - lifecyclerCfg, err := am.cfg.ShardingRing.ToLifecyclerConfig() + lifecyclerCfg, err := am.cfg.ShardingRing.ToLifecyclerConfig(am.logger) if err != nil { return nil, errors.Wrap(err, "failed to initialize Alertmanager's lifecycler config") } @@ -414,15 +414,11 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC return nil, errors.Wrap(err, "failed to initialize Alertmanager's lifecycler") } - am.ring, err = ring.NewWithStoreClientAndStrategy(am.cfg.ShardingRing.ToRingConfig(), RingNameForServer, RingKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy()) + am.ring, err = ring.NewWithStoreClientAndStrategy(am.cfg.ShardingRing.ToRingConfig(), RingNameForServer, RingKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", am.registry), am.logger) if err != nil { return nil, errors.Wrap(err, "failed to initialize Alertmanager's ring") } - if am.registry != nil { - am.registry.MustRegister(am.ring) - } - am.grpcServer = server.NewServer(&handlerForGRPCServer{am: am}) am.alertmanagerClientsPool = newAlertmanagerClientsPool(client.NewRingServiceDiscovery(am.ring), cfg.AlertmanagerClient, logger, am.registry) diff --git a/pkg/alertmanager/multitenant_test.go b/pkg/alertmanager/multitenant_test.go index 94cd067ec8d..068213599ae 100644 --- a/pkg/alertmanager/multitenant_test.go +++ b/pkg/alertmanager/multitenant_test.go @@ -23,6 +23,7 @@ import ( "github.com/grafana/dskit/concurrency" "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/prometheus/alertmanager/cluster/clusterpb" "github.com/prometheus/alertmanager/notify" @@ -44,7 +45,6 @@ import ( "github.com/cortexproject/cortex/pkg/alertmanager/alertspb" "github.com/cortexproject/cortex/pkg/alertmanager/alertstore" "github.com/cortexproject/cortex/pkg/alertmanager/alertstore/bucketclient" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/storage/bucket" "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/test" @@ -1445,7 +1445,7 @@ func TestAlertmanager_ReplicasPosition(t *testing.T) { // First, create the alertmanager instances, we'll use a replication factor of 3 and create 3 instances so that we can get the tenant on each replica. 
for i := 1; i <= 3; i++ { - //instanceIDs = append(instanceIDs, fmt.Sprintf("alertmanager-%d", i)) + // instanceIDs = append(instanceIDs, fmt.Sprintf("alertmanager-%d", i)) instanceID := fmt.Sprintf("alertmanager-%d", i) amConfig := mockAlertmanagerConfig(t) diff --git a/pkg/api/api.go b/pkg/api/api.go index ced7685448f..d7635d2c56d 100644 --- a/pkg/api/api.go +++ b/pkg/api/api.go @@ -12,6 +12,7 @@ import ( "github.com/felixge/fgprof" "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/grafana/dskit/ring" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/prometheus/storage" "github.com/weaveworks/common/middleware" @@ -30,7 +31,6 @@ import ( "github.com/cortexproject/cortex/pkg/frontend/v2/frontendv2pb" "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/querier" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/ruler" "github.com/cortexproject/cortex/pkg/scheduler" "github.com/cortexproject/cortex/pkg/scheduler/schedulerpb" diff --git a/pkg/compactor/compactor.go b/pkg/compactor/compactor.go index d1a5a69c4bb..8ba7b7933d2 100644 --- a/pkg/compactor/compactor.go +++ b/pkg/compactor/compactor.go @@ -17,6 +17,7 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/dskit/backoff" "github.com/grafana/dskit/flagext" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -28,7 +29,6 @@ import ( "github.com/thanos-io/thanos/pkg/compact/downsample" "github.com/thanos-io/thanos/pkg/objstore" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/storage/bucket" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" "github.com/cortexproject/cortex/pkg/storage/tsdb/bucketindex" @@ -367,12 +367,12 @@ func (c *Compactor) starting(ctx context.Context) error { // Initialize the compactors ring if sharding is enabled. 
if c.compactorCfg.ShardingEnabled { lifecyclerCfg := c.compactorCfg.ShardingRing.ToLifecyclerConfig() - c.ringLifecycler, err = ring.NewLifecycler(lifecyclerCfg, ring.NewNoopFlushTransferer(), "compactor", ring.CompactorRingKey, false, c.registerer) + c.ringLifecycler, err = ring.NewLifecycler(lifecyclerCfg, ring.NewNoopFlushTransferer(), "compactor", ring.CompactorRingKey, false, c.logger, prometheus.WrapRegistererWithPrefix("cortex_", c.registerer)) if err != nil { return errors.Wrap(err, "unable to initialize compactor ring lifecycler") } - c.ring, err = ring.New(lifecyclerCfg.RingConfig, "compactor", ring.CompactorRingKey, c.registerer) + c.ring, err = ring.New(lifecyclerCfg.RingConfig, "compactor", ring.CompactorRingKey, c.logger, prometheus.WrapRegistererWithPrefix("cortex_", c.registerer)) if err != nil { return errors.Wrap(err, "unable to initialize compactor ring") } diff --git a/pkg/compactor/compactor_ring.go b/pkg/compactor/compactor_ring.go index 28b20e6734c..b0683c02dda 100644 --- a/pkg/compactor/compactor_ring.go +++ b/pkg/compactor/compactor_ring.go @@ -8,8 +8,8 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/ring" - "github.com/cortexproject/cortex/pkg/ring" util_log "github.com/cortexproject/cortex/pkg/util/log" ) diff --git a/pkg/compactor/compactor_ring_test.go b/pkg/compactor/compactor_ring_test.go index 3a70ccde2b9..0ef8f10bdbc 100644 --- a/pkg/compactor/compactor_ring_test.go +++ b/pkg/compactor/compactor_ring_test.go @@ -5,9 +5,8 @@ import ( "time" "github.com/grafana/dskit/flagext" + "github.com/grafana/dskit/ring" "github.com/stretchr/testify/assert" - - "github.com/cortexproject/cortex/pkg/ring" ) func TestRingConfig_DefaultConfigToLifecyclerConfig(t *testing.T) { diff --git a/pkg/compactor/compactor_test.go b/pkg/compactor/compactor_test.go index eb5738e6396..5a6ab55ad25 100644 --- a/pkg/compactor/compactor_test.go +++ b/pkg/compactor/compactor_test.go @@ -36,7 +36,8 @@ import ( "github.com/thanos-io/thanos/pkg/objstore" "gopkg.in/yaml.v2" - "github.com/cortexproject/cortex/pkg/ring" + "github.com/grafana/dskit/ring" + "github.com/cortexproject/cortex/pkg/storage/bucket" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" cortex_testutil "github.com/cortexproject/cortex/pkg/util/test" @@ -1060,6 +1061,22 @@ func findCompactorByUserID(compactors []*Compactor, logs []*concurrency.SyncBuff } func removeIgnoredLogs(input []string) []string { + ignoredLogStringsMap := map[string]struct{}{ + // Since we moved to the component logger from the global logger for the ring in dskit these lines are now expected but are just ring setup information. 
+ `level=info component=compactor msg="ring doesn't exist in KV store yet"`: {}, + `level=info component=compactor msg="not loading tokens from file, tokens file path is empty"`: {}, + `level=info component=compactor msg="instance not found in ring, adding with no tokens" ring=compactor`: {}, + `level=debug component=compactor msg="JoinAfter expired" ring=compactor`: {}, + `level=info component=compactor msg="auto-joining cluster after timeout" ring=compactor`: {}, + `level=info component=compactor msg="lifecycler loop() exited gracefully" ring=compactor`: {}, + `level=info component=compactor msg="changing instance state from" old_state=ACTIVE new_state=LEAVING ring=compactor`: {}, + `level=error component=compactor msg="failed to set state to LEAVING" ring=compactor err="Changing instance state from LEAVING -> LEAVING is disallowed"`: {}, + `level=error component=compactor msg="failed to set state to LEAVING" ring=compactor err="Changing instance state from JOINING -> LEAVING is disallowed"`: {}, + `level=debug component=compactor msg="unregistering instance from ring" ring=compactor`: {}, + `level=info component=compactor msg="instance removed from the KV store" ring=compactor`: {}, + `level=info component=compactor msg="observing tokens before going ACTIVE" ring=compactor`: {}, + } + out := make([]string, 0, len(input)) durationRe := regexp.MustCompile(`\s?duration=\S+`) @@ -1069,6 +1086,10 @@ func removeIgnoredLogs(input []string) []string { continue } + if _, exists := ignoredLogStringsMap[log]; exists { + continue + } + // Remove any duration from logs. log = durationRe.ReplaceAllString(log, "") diff --git a/pkg/cortex/cortex.go b/pkg/cortex/cortex.go index 0a6ca6c2c1f..31ff0d67281 100644 --- a/pkg/cortex/cortex.go +++ b/pkg/cortex/cortex.go @@ -16,6 +16,7 @@ import ( "github.com/grafana/dskit/grpcutil" "github.com/grafana/dskit/kv/memberlist" "github.com/grafana/dskit/modules" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/runtimeconfig" "github.com/grafana/dskit/services" "github.com/pkg/errors" @@ -50,7 +51,6 @@ import ( "github.com/cortexproject/cortex/pkg/querier/queryrange" "github.com/cortexproject/cortex/pkg/querier/tenantfederation" querier_worker "github.com/cortexproject/cortex/pkg/querier/worker" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/ruler" "github.com/cortexproject/cortex/pkg/ruler/rulestore" "github.com/cortexproject/cortex/pkg/scheduler" diff --git a/pkg/cortex/cortex_test.go b/pkg/cortex/cortex_test.go index 7c7c2c615c4..1584b27854e 100644 --- a/pkg/cortex/cortex_test.go +++ b/pkg/cortex/cortex_test.go @@ -14,6 +14,7 @@ import ( "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" @@ -26,7 +27,6 @@ import ( "github.com/cortexproject/cortex/pkg/chunk/storage" "github.com/cortexproject/cortex/pkg/frontend/v1/frontendv1pb" "github.com/cortexproject/cortex/pkg/ingester" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/ruler" "github.com/cortexproject/cortex/pkg/scheduler/schedulerpb" "github.com/cortexproject/cortex/pkg/storage/bucket" diff --git a/pkg/cortex/modules.go b/pkg/cortex/modules.go index a6aa9171ba1..d19e964634f 100644 --- a/pkg/cortex/modules.go +++ b/pkg/cortex/modules.go @@ -12,6 +12,7 @@ import ( "github.com/grafana/dskit/kv/codec" "github.com/grafana/dskit/kv/memberlist" 
"github.com/grafana/dskit/modules" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/runtimeconfig" "github.com/grafana/dskit/services" "github.com/opentracing-contrib/go-stdlib/nethttp" @@ -43,7 +44,6 @@ import ( "github.com/cortexproject/cortex/pkg/querier/queryrange" "github.com/cortexproject/cortex/pkg/querier/tenantfederation" querier_worker "github.com/cortexproject/cortex/pkg/querier/worker" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/ruler" "github.com/cortexproject/cortex/pkg/scheduler" "github.com/cortexproject/cortex/pkg/storegateway" @@ -137,11 +137,10 @@ func (t *Cortex) initServer() (services.Service, error) { func (t *Cortex) initRing() (serv services.Service, err error) { t.Cfg.Ingester.LifecyclerConfig.RingConfig.KVStore.Multi.ConfigProvider = multiClientRuntimeConfigChannel(t.RuntimeConfig) - t.Ring, err = ring.New(t.Cfg.Ingester.LifecyclerConfig.RingConfig, "ingester", ring.IngesterRingKey, prometheus.DefaultRegisterer) + t.Ring, err = ring.New(t.Cfg.Ingester.LifecyclerConfig.RingConfig, "ingester", ring.IngesterRingKey, util_log.Logger, prometheus.WrapRegistererWithPrefix("cortex_", prometheus.DefaultRegisterer)) if err != nil { return nil, err } - prometheus.MustRegister(t.Ring) t.API.RegisterRing(t.Ring) diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 995176316f1..8bd5cdc9927 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -12,6 +12,8 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/dskit/limiter" + "github.com/grafana/dskit/ring" + ring_client "github.com/grafana/dskit/ring/client" "github.com/grafana/dskit/services" "github.com/opentracing/opentracing-go" "github.com/pkg/errors" @@ -29,8 +31,6 @@ import ( "github.com/cortexproject/cortex/pkg/cortexpb" ingester_client "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/prom1/storage/metric" - "github.com/cortexproject/cortex/pkg/ring" - ring_client "github.com/cortexproject/cortex/pkg/ring/client" "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/extract" @@ -211,12 +211,12 @@ func New(cfg Config, clientConfig ingester_client.Config, limits *validation.Ove if !canJoinDistributorsRing { ingestionRateStrategy = newInfiniteIngestionRateStrategy() } else if limits.IngestionRateStrategy() == validation.GlobalIngestionRateStrategy { - distributorsLifeCycler, err = ring.NewLifecycler(cfg.DistributorRing.ToLifecyclerConfig(), nil, "distributor", ring.DistributorRingKey, true, reg) + distributorsLifeCycler, err = ring.NewLifecycler(cfg.DistributorRing.ToLifecyclerConfig(), nil, "distributor", ring.DistributorRingKey, true, log, prometheus.WrapRegistererWithPrefix("cortex_", reg)) if err != nil { return nil, err } - distributorsRing, err = ring.New(cfg.DistributorRing.ToRingConfig(), "distributor", ring.DistributorRingKey, reg) + distributorsRing, err = ring.New(cfg.DistributorRing.ToRingConfig(), "distributor", ring.DistributorRingKey, log, prometheus.WrapRegistererWithPrefix("cortex_", reg)) if err != nil { return nil, errors.Wrap(err, "failed to initialize distributors' ring client") } diff --git a/pkg/distributor/distributor_ring.go b/pkg/distributor/distributor_ring.go index e1e6135d2cd..ceefa198c2d 100644 --- a/pkg/distributor/distributor_ring.go +++ b/pkg/distributor/distributor_ring.go @@ -8,8 +8,8 @@ import ( "github.com/go-kit/log/level" 
"github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/ring" - "github.com/cortexproject/cortex/pkg/ring" util_log "github.com/cortexproject/cortex/pkg/util/log" ) diff --git a/pkg/distributor/distributor_ring_test.go b/pkg/distributor/distributor_ring_test.go index b34a8326c82..15d696fb819 100644 --- a/pkg/distributor/distributor_ring_test.go +++ b/pkg/distributor/distributor_ring_test.go @@ -5,9 +5,8 @@ import ( "time" "github.com/grafana/dskit/flagext" + "github.com/grafana/dskit/ring" "github.com/stretchr/testify/assert" - - "github.com/cortexproject/cortex/pkg/ring" ) func TestRingConfig_DefaultConfigToLifecyclerConfig(t *testing.T) { diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 564638eb7ae..e220eecd439 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -18,6 +18,8 @@ import ( "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" + ring_client "github.com/grafana/dskit/ring/client" "github.com/grafana/dskit/services" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" @@ -36,8 +38,6 @@ import ( "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/prom1/storage/metric" - "github.com/cortexproject/cortex/pkg/ring" - ring_client "github.com/cortexproject/cortex/pkg/ring/client" "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/chunkcompat" @@ -1598,7 +1598,7 @@ func BenchmarkDistributor_Push(b *testing.B) { KVStore: kv.Config{Mock: kvStore}, HeartbeatTimeout: 60 * time.Minute, ReplicationFactor: 1, - }, ring.IngesterRingKey, ring.IngesterRingKey, nil) + }, ring.IngesterRingKey, ring.IngesterRingKey, nil, nil) require.NoError(b, err) require.NoError(b, services.StartAndAwaitRunning(context.Background(), ingestersRing)) b.Cleanup(func() { @@ -1962,7 +1962,7 @@ func prepare(t *testing.T, cfg prepConfig) ([]*Distributor, []mockIngester, []*p }, HeartbeatTimeout: 60 * time.Minute, ReplicationFactor: rf, - }, ring.IngesterRingKey, ring.IngesterRingKey, nil) + }, ring.IngesterRingKey, ring.IngesterRingKey, nil, nil) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(context.Background(), ingestersRing)) diff --git a/pkg/distributor/ha_tracker_test.go b/pkg/distributor/ha_tracker_test.go index 952e8ad7bd4..18147886748 100644 --- a/pkg/distributor/ha_tracker_test.go +++ b/pkg/distributor/ha_tracker_test.go @@ -11,6 +11,7 @@ import ( "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -22,7 +23,6 @@ import ( "github.com/weaveworks/common/user" "github.com/cortexproject/cortex/pkg/cortexpb" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/util" util_log "github.com/cortexproject/cortex/pkg/util/log" "github.com/cortexproject/cortex/pkg/util/test" diff --git a/pkg/distributor/ingester_client_pool.go b/pkg/distributor/ingester_client_pool.go index 007d8872ebb..6f97702e75c 100644 --- a/pkg/distributor/ingester_client_pool.go +++ b/pkg/distributor/ingester_client_pool.go @@ -5,11 +5,10 @@ import ( "time" "github.com/go-kit/log" + 
"github.com/grafana/dskit/ring" + ring_client "github.com/grafana/dskit/ring/client" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" - - "github.com/cortexproject/cortex/pkg/ring" - ring_client "github.com/cortexproject/cortex/pkg/ring/client" ) var clients = promauto.NewGauge(prometheus.GaugeOpts{ diff --git a/pkg/distributor/query.go b/pkg/distributor/query.go index e3d5fe2e9ec..72f2e039553 100644 --- a/pkg/distributor/query.go +++ b/pkg/distributor/query.go @@ -7,6 +7,7 @@ import ( "time" "github.com/grafana/dskit/grpcutil" + "github.com/grafana/dskit/ring" "github.com/opentracing/opentracing-go" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/pkg/labels" @@ -15,7 +16,6 @@ import ( "github.com/cortexproject/cortex/pkg/cortexpb" ingester_client "github.com/cortexproject/cortex/pkg/ingester/client" "github.com/cortexproject/cortex/pkg/querier/stats" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/extract" diff --git a/pkg/ingester/flush_test.go b/pkg/ingester/flush_test.go index 5dba0837c0f..b8346b91c90 100644 --- a/pkg/ingester/flush_test.go +++ b/pkg/ingester/flush_test.go @@ -10,6 +10,7 @@ import ( "github.com/go-kit/log" "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/pkg/labels" @@ -20,7 +21,6 @@ import ( "github.com/cortexproject/cortex/pkg/chunk" "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/ingester/client" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/validation" ) diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index ada5aa7ec2e..3b34a2e1a5c 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -13,6 +13,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/gogo/status" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -27,7 +28,6 @@ import ( cortex_chunk "github.com/cortexproject/cortex/pkg/chunk" "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/ingester/client" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/storage/tsdb" "github.com/cortexproject/cortex/pkg/tenant" "github.com/cortexproject/cortex/pkg/util" @@ -266,7 +266,7 @@ func New(cfg Config, clientConfig client.Config, limits *validation.Overrides, c // During WAL recovery, it will create new user states which requires the limiter. // Hence initialise the limiter before creating the WAL. // The '!cfg.WALConfig.WALEnabled' argument says don't flush on shutdown if the WAL is enabled. 
- i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", ring.IngesterRingKey, !cfg.WALConfig.WALEnabled || cfg.WALConfig.FlushOnShutdown, registerer) + i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", ring.IngesterRingKey, !cfg.WALConfig.WALEnabled || cfg.WALConfig.FlushOnShutdown, logger, prometheus.WrapRegistererWithPrefix("cortex_", registerer)) if err != nil { return nil, err } diff --git a/pkg/ingester/ingester_test.go b/pkg/ingester/ingester_test.go index d31340995ac..57ae360b6ec 100644 --- a/pkg/ingester/ingester_test.go +++ b/pkg/ingester/ingester_test.go @@ -17,6 +17,7 @@ import ( "time" "github.com/go-kit/log" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/testutil" @@ -32,7 +33,6 @@ import ( promchunk "github.com/cortexproject/cortex/pkg/chunk/encoding" "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/ingester/client" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/util/chunkcompat" "github.com/cortexproject/cortex/pkg/util/test" "github.com/cortexproject/cortex/pkg/util/validation" diff --git a/pkg/ingester/ingester_v2.go b/pkg/ingester/ingester_v2.go index ccf32e56c15..416750d8179 100644 --- a/pkg/ingester/ingester_v2.go +++ b/pkg/ingester/ingester_v2.go @@ -14,6 +14,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/dskit/concurrency" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/oklog/ulid" "github.com/pkg/errors" @@ -35,7 +36,6 @@ import ( "github.com/cortexproject/cortex/pkg/chunk/encoding" "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/ingester/client" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/storage/bucket" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" "github.com/cortexproject/cortex/pkg/tenant" @@ -511,7 +511,7 @@ func NewV2(cfg Config, clientConfig client.Config, limits *validation.Overrides, }, i.getOldestUnshippedBlockMetric) } - i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", ring.IngesterRingKey, cfg.BlocksStorageConfig.TSDB.FlushBlocksOnShutdown, registerer) + i.lifecycler, err = ring.NewLifecycler(cfg.LifecyclerConfig, i, "ingester", ring.IngesterRingKey, cfg.BlocksStorageConfig.TSDB.FlushBlocksOnShutdown, logger, prometheus.WrapRegistererWithPrefix("cortex_", registerer)) if err != nil { return nil, err } diff --git a/pkg/ingester/ingester_v2_test.go b/pkg/ingester/ingester_v2_test.go index 7e586445a9d..9c5e7e136f0 100644 --- a/pkg/ingester/ingester_v2_test.go +++ b/pkg/ingester/ingester_v2_test.go @@ -21,6 +21,7 @@ import ( "time" "github.com/go-kit/log" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/oklog/ulid" "github.com/pkg/errors" @@ -45,7 +46,6 @@ import ( "github.com/cortexproject/cortex/pkg/chunk/encoding" "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/ingester/client" - "github.com/cortexproject/cortex/pkg/ring" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" "github.com/cortexproject/cortex/pkg/util" util_math "github.com/cortexproject/cortex/pkg/util/math" diff --git a/pkg/ingester/lifecycle_test.go b/pkg/ingester/lifecycle_test.go index 344168a8b6b..6e085ea5d89 100644 --- a/pkg/ingester/lifecycle_test.go +++ 
b/pkg/ingester/lifecycle_test.go @@ -12,7 +12,9 @@ import ( "github.com/go-kit/log" "github.com/grafana/dskit/flagext" + "github.com/grafana/dskit/kv" "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/prometheus/common/model" "github.com/prometheus/prometheus/pkg/labels" @@ -25,8 +27,6 @@ import ( "github.com/cortexproject/cortex/pkg/chunk" "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/ingester/client" - "github.com/cortexproject/cortex/pkg/ring" - "github.com/cortexproject/cortex/pkg/ring/testutils" "github.com/cortexproject/cortex/pkg/util/test" "github.com/cortexproject/cortex/pkg/util/validation" ) @@ -83,7 +83,7 @@ func TestIngesterRestart(t *testing.T) { } test.Poll(t, 100*time.Millisecond, 1, func() interface{} { - return testutils.NumTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.IngesterRingKey) + return numTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.IngesterRingKey) }) { @@ -96,7 +96,7 @@ func TestIngesterRestart(t *testing.T) { time.Sleep(200 * time.Millisecond) test.Poll(t, 100*time.Millisecond, 1, func() interface{} { - return testutils.NumTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.IngesterRingKey) + return numTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.IngesterRingKey) }) } @@ -111,7 +111,7 @@ func TestIngester_ShutdownHandler(t *testing.T) { // Make sure the ingester has been added to the ring. test.Poll(t, 100*time.Millisecond, 1, func() interface{} { - return testutils.NumTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.IngesterRingKey) + return numTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.IngesterRingKey) }) recorder := httptest.NewRecorder() @@ -120,7 +120,7 @@ func TestIngester_ShutdownHandler(t *testing.T) { // Make sure the ingester has been removed from the ring even when UnregisterFromRing is false. test.Poll(t, 100*time.Millisecond, 0, func() interface{} { - return testutils.NumTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.IngesterRingKey) + return numTokens(config.LifecyclerConfig.RingConfig.KVStore.Mock, "localhost", ring.IngesterRingKey) }) }) } @@ -362,3 +362,17 @@ func TestIngesterFlush(t *testing.T) { }, }, res) } + +// numTokens determines the number of tokens owned by the specified +// address +func numTokens(c kv.Client, name, ringKey string) int { + ringDesc, err := c.Get(context.Background(), ringKey) + + // The ringDesc may be null if the lifecycler hasn't stored the ring + // to the KVStore yet. 
+ if ringDesc == nil || err != nil { + return 0 + } + rd := ringDesc.(*ring.Desc) + return len(rd.Ingesters[name].Tokens) +} diff --git a/pkg/ingester/transfer.go b/pkg/ingester/transfer.go index a0a3ab2a610..d2466cbaff0 100644 --- a/pkg/ingester/transfer.go +++ b/pkg/ingester/transfer.go @@ -10,6 +10,7 @@ import ( "github.com/go-kit/log/level" "github.com/grafana/dskit/backoff" + "github.com/grafana/dskit/ring" "github.com/pkg/errors" "github.com/prometheus/common/model" "github.com/weaveworks/common/user" @@ -17,7 +18,6 @@ import ( "github.com/cortexproject/cortex/pkg/chunk/encoding" "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/ingester/client" - "github.com/cortexproject/cortex/pkg/ring" ) var ( diff --git a/pkg/querier/blocks_store_balanced_set.go b/pkg/querier/blocks_store_balanced_set.go index 0494c054f5b..8f58c123d14 100644 --- a/pkg/querier/blocks_store_balanced_set.go +++ b/pkg/querier/blocks_store_balanced_set.go @@ -9,6 +9,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/grafana/dskit/ring/client" "github.com/grafana/dskit/services" "github.com/oklog/ulid" "github.com/pkg/errors" @@ -16,7 +17,6 @@ import ( "github.com/thanos-io/thanos/pkg/discovery/dns" "github.com/thanos-io/thanos/pkg/extprom" - "github.com/cortexproject/cortex/pkg/ring/client" "github.com/cortexproject/cortex/pkg/util" ) diff --git a/pkg/querier/blocks_store_queryable.go b/pkg/querier/blocks_store_queryable.go index b327a2ec809..0447762ee03 100644 --- a/pkg/querier/blocks_store_queryable.go +++ b/pkg/querier/blocks_store_queryable.go @@ -13,6 +13,7 @@ import ( "github.com/go-kit/log/level" "github.com/gogo/protobuf/types" "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/oklog/ulid" "github.com/pkg/errors" @@ -32,7 +33,6 @@ import ( "github.com/cortexproject/cortex/pkg/cortexpb" "github.com/cortexproject/cortex/pkg/querier/series" "github.com/cortexproject/cortex/pkg/querier/stats" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/storage/bucket" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" "github.com/cortexproject/cortex/pkg/storage/tsdb/bucketindex" @@ -217,15 +217,11 @@ func NewBlocksStoreQueryableFromConfig(querierCfg Config, gatewayCfg storegatewa return nil, errors.Wrap(err, "failed to create store-gateway ring backend") } - storesRing, err := ring.NewWithStoreClientAndStrategy(storesRingCfg, storegateway.RingNameForClient, storegateway.RingKey, storesRingBackend, ring.NewIgnoreUnhealthyInstancesReplicationStrategy()) + storesRing, err := ring.NewWithStoreClientAndStrategy(storesRingCfg, storegateway.RingNameForClient, storegateway.RingKey, storesRingBackend, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", reg), logger) if err != nil { return nil, errors.Wrap(err, "failed to create store-gateway ring client") } - if reg != nil { - reg.MustRegister(storesRing) - } - stores, err = newBlocksStoreReplicationSet(storesRing, gatewayCfg.ShardingStrategy, randomLoadBalancing, limits, querierCfg.StoreGatewayClient, logger, reg) if err != nil { return nil, errors.Wrap(err, "failed to create store set") diff --git a/pkg/querier/blocks_store_replicated_set.go b/pkg/querier/blocks_store_replicated_set.go index 435a8aa60b2..9ae4ce3f448 100644 --- a/pkg/querier/blocks_store_replicated_set.go +++ b/pkg/querier/blocks_store_replicated_set.go @@ -6,13 +6,13 @@ import ( "math/rand" 
"github.com/go-kit/log" + "github.com/grafana/dskit/ring" + "github.com/grafana/dskit/ring/client" "github.com/grafana/dskit/services" "github.com/oklog/ulid" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" - "github.com/cortexproject/cortex/pkg/ring" - "github.com/cortexproject/cortex/pkg/ring/client" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" "github.com/cortexproject/cortex/pkg/storegateway" "github.com/cortexproject/cortex/pkg/util" diff --git a/pkg/querier/blocks_store_replicated_set_test.go b/pkg/querier/blocks_store_replicated_set_test.go index 50e424e33b3..c7af92d8e06 100644 --- a/pkg/querier/blocks_store_replicated_set_test.go +++ b/pkg/querier/blocks_store_replicated_set_test.go @@ -10,6 +10,7 @@ import ( "github.com/go-kit/log" "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/oklog/ulid" "github.com/prometheus/client_golang/prometheus" @@ -17,7 +18,6 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/cortexproject/cortex/pkg/ring" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" "github.com/cortexproject/cortex/pkg/util" "github.com/cortexproject/cortex/pkg/util/test" @@ -341,7 +341,7 @@ func TestBlocksStoreReplicationSet_GetClientsFor(t *testing.T) { flagext.DefaultValues(&ringCfg) ringCfg.ReplicationFactor = testData.replicationFactor - r, err := ring.NewWithStoreClientAndStrategy(ringCfg, "test", "test", ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy()) + r, err := ring.NewWithStoreClientAndStrategy(ringCfg, "test", "test", ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), nil, nil) require.NoError(t, err) limits := &blocksStoreLimitsMock{ @@ -404,7 +404,7 @@ func TestBlocksStoreReplicationSet_GetClientsFor_ShouldSupportRandomLoadBalancin flagext.DefaultValues(&ringCfg) ringCfg.ReplicationFactor = numInstances - r, err := ring.NewWithStoreClientAndStrategy(ringCfg, "test", "test", ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy()) + r, err := ring.NewWithStoreClientAndStrategy(ringCfg, "test", "test", ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), nil, nil) require.NoError(t, err) limits := &blocksStoreLimitsMock{} diff --git a/pkg/querier/store_gateway_client.go b/pkg/querier/store_gateway_client.go index 528c3c2fd93..e5fa71e747d 100644 --- a/pkg/querier/store_gateway_client.go +++ b/pkg/querier/store_gateway_client.go @@ -7,13 +7,13 @@ import ( "github.com/go-kit/log" "github.com/grafana/dskit/crypto/tls" "github.com/grafana/dskit/grpcclient" + "github.com/grafana/dskit/ring/client" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "google.golang.org/grpc" "google.golang.org/grpc/health/grpc_health_v1" - "github.com/cortexproject/cortex/pkg/ring/client" "github.com/cortexproject/cortex/pkg/storegateway/storegatewaypb" ) diff --git a/pkg/querier/worker/scheduler_processor.go b/pkg/querier/worker/scheduler_processor.go index ee0b2028425..d881ea61410 100644 --- a/pkg/querier/worker/scheduler_processor.go +++ b/pkg/querier/worker/scheduler_processor.go @@ -11,6 +11,7 @@ import ( "github.com/grafana/dskit/backoff" "github.com/grafana/dskit/grpcclient" dsmiddleware "github.com/grafana/dskit/middleware" + "github.com/grafana/dskit/ring/client" "github.com/grafana/dskit/services" otgrpc "github.com/opentracing-contrib/go-grpc" 
"github.com/opentracing/opentracing-go" @@ -24,7 +25,6 @@ import ( "github.com/cortexproject/cortex/pkg/frontend/v2/frontendv2pb" querier_stats "github.com/cortexproject/cortex/pkg/querier/stats" - "github.com/cortexproject/cortex/pkg/ring/client" "github.com/cortexproject/cortex/pkg/scheduler/schedulerpb" "github.com/cortexproject/cortex/pkg/util/httpgrpcutil" util_log "github.com/cortexproject/cortex/pkg/util/log" diff --git a/pkg/ring/basic_lifecycler_delegates_test.go b/pkg/ring/basic_lifecycler_delegates_test.go deleted file mode 100644 index 077b2b0ed82..00000000000 --- a/pkg/ring/basic_lifecycler_delegates_test.go +++ /dev/null @@ -1,308 +0,0 @@ -package ring - -import ( - "context" - "io/ioutil" - "os" - "testing" - "time" - - "github.com/go-kit/log" - "github.com/grafana/dskit/concurrency" - "github.com/grafana/dskit/services" - "github.com/prometheus/client_golang/prometheus/testutil" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/cortexproject/cortex/pkg/util/test" -) - -func TestLeaveOnStoppingDelegate(t *testing.T) { - onStoppingCalled := false - - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - - testDelegate := &mockDelegate{ - onStopping: func(l *BasicLifecycler) { - assert.Equal(t, LEAVING, l.GetState()) - onStoppingCalled = true - }, - } - - leaveDelegate := NewLeaveOnStoppingDelegate(testDelegate, log.NewNopLogger()) - lifecycler, _, err := prepareBasicLifecyclerWithDelegate(t, cfg, leaveDelegate) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - - assert.NoError(t, services.StopAndAwaitTerminated(ctx, lifecycler)) - assert.True(t, onStoppingCalled) -} - -func TestTokensPersistencyDelegate_ShouldSkipTokensLoadingIfFileDoesNotExist(t *testing.T) { - // Create a temporary file and immediately delete it. - tokensFile, err := ioutil.TempFile(os.TempDir(), "tokens-*") - require.NoError(t, err) - require.NoError(t, os.Remove(tokensFile.Name())) - - testDelegate := &mockDelegate{ - onRegister: func(lifecycler *BasicLifecycler, ringDesc Desc, instanceExists bool, instanceID string, instanceDesc InstanceDesc) (InstanceState, Tokens) { - assert.False(t, instanceExists) - return JOINING, Tokens{1, 2, 3, 4, 5} - }, - } - - logs := &concurrency.SyncBuffer{} - logger := log.NewLogfmtLogger(logs) - persistencyDelegate := NewTokensPersistencyDelegate(tokensFile.Name(), ACTIVE, testDelegate, logger) - - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - lifecycler, _, err := prepareBasicLifecyclerWithDelegate(t, cfg, persistencyDelegate) - require.NoError(t, err) - defer services.StopAndAwaitTerminated(ctx, lifecycler) //nolint:errcheck - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - assert.Equal(t, JOINING, lifecycler.GetState()) - assert.Equal(t, Tokens{1, 2, 3, 4, 5}, lifecycler.GetTokens()) - assert.True(t, lifecycler.IsRegistered()) - - require.NoError(t, services.StopAndAwaitTerminated(ctx, lifecycler)) - - // Ensure tokens have been stored. - actualTokens, err := LoadTokensFromFile(tokensFile.Name()) - require.NoError(t, err) - assert.Equal(t, Tokens{1, 2, 3, 4, 5}, actualTokens) - - // Ensure no error has been logged. - assert.Empty(t, logs.String()) -} - -func TestTokensPersistencyDelegate_ShouldLoadTokensFromFileIfFileExist(t *testing.T) { - tokensFile, err := ioutil.TempFile(os.TempDir(), "tokens-*") - require.NoError(t, err) - defer os.Remove(tokensFile.Name()) //nolint:errcheck - - // Store some tokens to the file. 
- storedTokens := Tokens{6, 7, 8, 9, 10} - require.NoError(t, storedTokens.StoreToFile(tokensFile.Name())) - - testDelegate := &mockDelegate{ - onRegister: func(lifecycler *BasicLifecycler, ringDesc Desc, instanceExists bool, instanceID string, instanceDesc InstanceDesc) (InstanceState, Tokens) { - assert.True(t, instanceExists) - assert.Equal(t, ACTIVE, instanceDesc.GetState()) - assert.Equal(t, storedTokens, Tokens(instanceDesc.GetTokens())) - assert.True(t, instanceDesc.GetRegisteredAt().IsZero()) - - return instanceDesc.GetState(), instanceDesc.GetTokens() - }, - } - - persistencyDelegate := NewTokensPersistencyDelegate(tokensFile.Name(), ACTIVE, testDelegate, log.NewNopLogger()) - - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - lifecycler, _, err := prepareBasicLifecyclerWithDelegate(t, cfg, persistencyDelegate) - require.NoError(t, err) - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - assert.Equal(t, ACTIVE, lifecycler.GetState()) - assert.Equal(t, storedTokens, lifecycler.GetTokens()) - assert.True(t, lifecycler.IsRegistered()) - assert.InDelta(t, time.Now().Unix(), lifecycler.GetRegisteredAt().Unix(), 2) - - require.NoError(t, services.StopAndAwaitTerminated(ctx, lifecycler)) - - // Ensure we can still read back the tokens file. - actualTokens, err := LoadTokensFromFile(tokensFile.Name()) - require.NoError(t, err) - assert.Equal(t, storedTokens, actualTokens) -} - -func TestTokensPersistencyDelegate_ShouldHandleTheCaseTheInstanceIsAlreadyInTheRing(t *testing.T) { - storedTokens := Tokens{6, 7, 8, 9, 10} - differentTokens := Tokens{1, 2, 3, 4, 5} - - tests := map[string]struct { - storedTokens Tokens - initialState InstanceState - initialTokens Tokens - expectedState InstanceState - expectedTokens Tokens - }{ - "instance already registered in the ring without tokens": { - initialState: PENDING, - initialTokens: nil, - expectedState: ACTIVE, - expectedTokens: storedTokens, - }, - "instance already registered in the ring with tokens": { - initialState: JOINING, - initialTokens: differentTokens, - expectedState: JOINING, - expectedTokens: differentTokens, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - tokensFile, err := ioutil.TempFile(os.TempDir(), "tokens-*") - require.NoError(t, err) - defer os.Remove(tokensFile.Name()) //nolint:errcheck - - // Store some tokens to the file. - require.NoError(t, storedTokens.StoreToFile(tokensFile.Name())) - - // We assume is already registered to the ring. - registeredAt := time.Now().Add(-time.Hour) - - testDelegate := &mockDelegate{ - onRegister: func(lifecycler *BasicLifecycler, ringDesc Desc, instanceExists bool, instanceID string, instanceDesc InstanceDesc) (InstanceState, Tokens) { - return instanceDesc.GetState(), instanceDesc.GetTokens() - }, - } - - persistencyDelegate := NewTokensPersistencyDelegate(tokensFile.Name(), ACTIVE, testDelegate, log.NewNopLogger()) - - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - lifecycler, store, err := prepareBasicLifecyclerWithDelegate(t, cfg, persistencyDelegate) - require.NoError(t, err) - defer services.StopAndAwaitTerminated(ctx, lifecycler) //nolint:errcheck - - // Add the instance to the ring. 
- require.NoError(t, store.CAS(ctx, testRingKey, func(in interface{}) (out interface{}, retry bool, err error) { - ringDesc := NewDesc() - ringDesc.AddIngester(cfg.ID, cfg.Addr, cfg.Zone, testData.initialTokens, testData.initialState, registeredAt) - return ringDesc, true, nil - })) - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - assert.Equal(t, testData.expectedState, lifecycler.GetState()) - assert.Equal(t, testData.expectedTokens, lifecycler.GetTokens()) - assert.True(t, lifecycler.IsRegistered()) - assert.Equal(t, registeredAt.Unix(), lifecycler.GetRegisteredAt().Unix()) - }) - } -} - -// TestDelegatesChain tests chaining all provided delegates together. -func TestDelegatesChain(t *testing.T) { - onStoppingCalled := false - - // Create a temporary file and immediately delete it. - tokensFile, err := ioutil.TempFile(os.TempDir(), "tokens-*") - require.NoError(t, err) - require.NoError(t, os.Remove(tokensFile.Name())) - - // Chain delegates together. - var chain BasicLifecyclerDelegate - chain = &mockDelegate{ - onRegister: func(lifecycler *BasicLifecycler, ringDesc Desc, instanceExists bool, instanceID string, instanceDesc InstanceDesc) (InstanceState, Tokens) { - assert.False(t, instanceExists) - return JOINING, Tokens{1, 2, 3, 4, 5} - }, - onStopping: func(l *BasicLifecycler) { - assert.Equal(t, LEAVING, l.GetState()) - onStoppingCalled = true - }, - } - - chain = NewTokensPersistencyDelegate(tokensFile.Name(), ACTIVE, chain, log.NewNopLogger()) - chain = NewLeaveOnStoppingDelegate(chain, log.NewNopLogger()) - chain = NewAutoForgetDelegate(time.Minute, chain, log.NewNopLogger()) - - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - lifecycler, _, err := prepareBasicLifecyclerWithDelegate(t, cfg, chain) - require.NoError(t, err) - defer services.StopAndAwaitTerminated(ctx, lifecycler) //nolint:errcheck - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - assert.Equal(t, JOINING, lifecycler.GetState()) - assert.Equal(t, Tokens{1, 2, 3, 4, 5}, lifecycler.GetTokens()) - assert.True(t, lifecycler.IsRegistered()) - - require.NoError(t, services.StopAndAwaitTerminated(ctx, lifecycler)) - assert.True(t, onStoppingCalled) - - // Ensure tokens have been stored. 
- actualTokens, err := LoadTokensFromFile(tokensFile.Name()) - require.NoError(t, err) - assert.Equal(t, Tokens{1, 2, 3, 4, 5}, actualTokens) -} - -func TestAutoForgetDelegate(t *testing.T) { - const forgetPeriod = time.Minute - registeredAt := time.Now() - - tests := map[string]struct { - setup func(ringDesc *Desc) - expectedInstances []string - }{ - "no unhealthy instance in the ring": { - setup: func(ringDesc *Desc) { - ringDesc.AddIngester("instance-1", "1.1.1.1", "", nil, ACTIVE, registeredAt) - }, - expectedInstances: []string{testInstanceID, "instance-1"}, - }, - "unhealthy instance in the ring that has NOTreached the forget period yet": { - setup: func(ringDesc *Desc) { - i := ringDesc.AddIngester("instance-1", "1.1.1.1", "", nil, ACTIVE, registeredAt) - i.Timestamp = time.Now().Add(-forgetPeriod).Add(5 * time.Second).Unix() - ringDesc.Ingesters["instance-1"] = i - }, - expectedInstances: []string{testInstanceID, "instance-1"}, - }, - "unhealthy instance in the ring that has reached the forget period": { - setup: func(ringDesc *Desc) { - i := ringDesc.AddIngester("instance-1", "1.1.1.1", "", nil, ACTIVE, registeredAt) - i.Timestamp = time.Now().Add(-forgetPeriod).Add(-5 * time.Second).Unix() - ringDesc.Ingesters["instance-1"] = i - }, - expectedInstances: []string{testInstanceID}, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - cfg.HeartbeatPeriod = 100 * time.Millisecond - - testDelegate := &mockDelegate{} - - autoForgetDelegate := NewAutoForgetDelegate(forgetPeriod, testDelegate, log.NewNopLogger()) - lifecycler, store, err := prepareBasicLifecyclerWithDelegate(t, cfg, autoForgetDelegate) - require.NoError(t, err) - - // Setup the initial state of the ring. - require.NoError(t, store.CAS(ctx, testRingKey, func(in interface{}) (out interface{}, retry bool, err error) { - ringDesc := NewDesc() - testData.setup(ringDesc) - return ringDesc, true, nil - })) - - // Start the lifecycler. - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - defer services.StopAndAwaitTerminated(ctx, lifecycler) //nolint:errcheck - - // Wait until an heartbeat has been sent. - test.Poll(t, time.Second, true, func() interface{} { - return testutil.ToFloat64(lifecycler.metrics.heartbeats) > 0 - }) - - // Read back the ring status from the store. 
- v, err := store.Get(ctx, testRingKey) - require.NoError(t, err) - require.NotNil(t, v) - - var actualInstances []string - for id := range GetOrCreateRingDesc(v).GetIngesters() { - actualInstances = append(actualInstances, id) - } - - assert.ElementsMatch(t, testData.expectedInstances, actualInstances) - }) - } -} diff --git a/pkg/ring/basic_lifecycler_test.go b/pkg/ring/basic_lifecycler_test.go deleted file mode 100644 index 2ec483881ee..00000000000 --- a/pkg/ring/basic_lifecycler_test.go +++ /dev/null @@ -1,469 +0,0 @@ -package ring - -import ( - "context" - "testing" - "time" - - "github.com/go-kit/log" - "github.com/grafana/dskit/kv" - "github.com/grafana/dskit/kv/consul" - "github.com/grafana/dskit/services" - "github.com/prometheus/client_golang/prometheus/testutil" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/cortexproject/cortex/pkg/util/test" -) - -const ( - testRingKey = "test" - testRingName = "test" - testInstanceID = "test-id" -) - -func TestBasicLifecycler_RegisterOnStart(t *testing.T) { - tests := map[string]struct { - initialInstanceID string - initialInstanceDesc *InstanceDesc - registerState InstanceState - registerTokens Tokens - }{ - "initial ring is empty": { - registerState: ACTIVE, - registerTokens: Tokens{1, 2, 3, 4, 5}, - }, - "initial ring non empty (containing another instance)": { - initialInstanceID: "instance-1", - initialInstanceDesc: &InstanceDesc{ - Addr: "1.1.1.1", - State: ACTIVE, - Tokens: Tokens{6, 7, 8, 9, 10}, - RegisteredTimestamp: time.Now().Add(-time.Hour).Unix(), - }, - registerState: ACTIVE, - registerTokens: Tokens{1, 2, 3, 4, 5}, - }, - "initial ring contains the same instance with different state, tokens and address (new one is 127.0.0.1)": { - initialInstanceID: testInstanceID, - initialInstanceDesc: &InstanceDesc{ - Addr: "1.1.1.1", - State: ACTIVE, - Tokens: Tokens{6, 7, 8, 9, 10}, - RegisteredTimestamp: time.Now().Add(-time.Hour).Unix(), - }, - registerState: JOINING, - registerTokens: Tokens{1, 2, 3, 4, 5}, - }, - "initial ring contains the same instance with different address (new one is 127.0.0.1)": { - initialInstanceID: testInstanceID, - initialInstanceDesc: &InstanceDesc{ - Addr: "1.1.1.1", - State: ACTIVE, - Tokens: Tokens{1, 2, 3, 4, 5}, - RegisteredTimestamp: time.Now().Add(-time.Hour).Unix(), - }, - registerState: ACTIVE, - registerTokens: Tokens{1, 2, 3, 4, 5}, - }, - "initial ring contains the same instance with registered timestamp == 0": { - initialInstanceID: testInstanceID, - initialInstanceDesc: &InstanceDesc{ - Addr: "1.1.1.1", - State: ACTIVE, - Tokens: Tokens{1, 2, 3, 4, 5}, - RegisteredTimestamp: 0, - }, - registerState: ACTIVE, - registerTokens: Tokens{1, 2, 3, 4, 5}, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - lifecycler, delegate, store, err := prepareBasicLifecycler(t, cfg) - require.NoError(t, err) - defer services.StopAndAwaitTerminated(ctx, lifecycler) //nolint:errcheck - - // Add an initial instance to the ring. 
- if testData.initialInstanceDesc != nil { - require.NoError(t, store.CAS(ctx, testRingKey, func(in interface{}) (out interface{}, retry bool, err error) { - desc := testData.initialInstanceDesc - - ringDesc := GetOrCreateRingDesc(in) - ringDesc.AddIngester(testData.initialInstanceID, desc.Addr, desc.Zone, desc.Tokens, desc.State, desc.GetRegisteredAt()) - return ringDesc, true, nil - })) - } - - // Assert on the lifecycler state once the instance register delegate function will be called. - delegate.onRegister = func(_ *BasicLifecycler, ringDesc Desc, instanceExists bool, instanceID string, instanceDesc InstanceDesc) (InstanceState, Tokens) { - assert.Equal(t, services.Starting, lifecycler.State()) - assert.False(t, lifecycler.IsRegistered()) - assert.Equal(t, testInstanceID, instanceID) - assert.NotNil(t, ringDesc) - - if testData.initialInstanceID == instanceID { - assert.True(t, instanceExists) - assert.Equal(t, testData.initialInstanceDesc.Addr, instanceDesc.Addr) - assert.Equal(t, testData.initialInstanceDesc.Zone, instanceDesc.Zone) - assert.Equal(t, testData.initialInstanceDesc.State, instanceDesc.State) - assert.Equal(t, testData.initialInstanceDesc.Tokens, instanceDesc.Tokens) - assert.Equal(t, testData.initialInstanceDesc.RegisteredTimestamp, instanceDesc.RegisteredTimestamp) - } else { - assert.False(t, instanceExists) - } - - return testData.registerState, testData.registerTokens - } - - assert.Equal(t, testInstanceID, lifecycler.GetInstanceID()) - assert.Equal(t, services.New, lifecycler.State()) - assert.Equal(t, PENDING, lifecycler.GetState()) - assert.Empty(t, lifecycler.GetTokens()) - assert.False(t, lifecycler.IsRegistered()) - assert.Equal(t, float64(0), testutil.ToFloat64(lifecycler.metrics.tokensOwned)) - assert.Equal(t, float64(cfg.NumTokens), testutil.ToFloat64(lifecycler.metrics.tokensToOwn)) - assert.Zero(t, lifecycler.GetRegisteredAt()) - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - - assert.Equal(t, services.Running, lifecycler.State()) - assert.Equal(t, testData.registerState, lifecycler.GetState()) - assert.Equal(t, testData.registerTokens, lifecycler.GetTokens()) - assert.True(t, lifecycler.IsRegistered()) - assert.Equal(t, float64(cfg.NumTokens), testutil.ToFloat64(lifecycler.metrics.tokensOwned)) - assert.Equal(t, float64(cfg.NumTokens), testutil.ToFloat64(lifecycler.metrics.tokensToOwn)) - - // Assert on the instance registered within the ring. - instanceDesc, ok := getInstanceFromStore(t, store, testInstanceID) - assert.True(t, ok) - assert.Equal(t, cfg.Addr, instanceDesc.GetAddr()) - assert.Equal(t, testData.registerState, instanceDesc.GetState()) - assert.Equal(t, testData.registerTokens, Tokens(instanceDesc.GetTokens())) - assert.Equal(t, cfg.Zone, instanceDesc.GetZone()) - - // The expected registered timestamp is "now" if the instance didn't exist in the ring yet - // or the already existing value. 
- if testData.initialInstanceID == testInstanceID { - assert.Equal(t, testData.initialInstanceDesc.RegisteredTimestamp, instanceDesc.RegisteredTimestamp) - } else { - assert.InDelta(t, time.Now().Unix(), instanceDesc.RegisteredTimestamp, 2) - } - }) - } -} - -func TestBasicLifecycler_UnregisterOnStop(t *testing.T) { - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - lifecycler, delegate, store, err := prepareBasicLifecycler(t, cfg) - require.NoError(t, err) - - delegate.onRegister = func(_ *BasicLifecycler, _ Desc, _ bool, _ string, _ InstanceDesc) (InstanceState, Tokens) { - return ACTIVE, Tokens{1, 2, 3, 4, 5} - } - delegate.onStopping = func(_ *BasicLifecycler) { - assert.Equal(t, services.Stopping, lifecycler.State()) - } - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - assert.Equal(t, ACTIVE, lifecycler.GetState()) - assert.Equal(t, Tokens{1, 2, 3, 4, 5}, lifecycler.GetTokens()) - assert.True(t, lifecycler.IsRegistered()) - assert.NotZero(t, lifecycler.GetRegisteredAt()) - assert.Equal(t, float64(cfg.NumTokens), testutil.ToFloat64(lifecycler.metrics.tokensOwned)) - assert.Equal(t, float64(cfg.NumTokens), testutil.ToFloat64(lifecycler.metrics.tokensToOwn)) - - require.NoError(t, services.StopAndAwaitTerminated(ctx, lifecycler)) - assert.Equal(t, PENDING, lifecycler.GetState()) - assert.Equal(t, Tokens{}, lifecycler.GetTokens()) - assert.False(t, lifecycler.IsRegistered()) - assert.Zero(t, lifecycler.GetRegisteredAt()) - assert.Equal(t, float64(0), testutil.ToFloat64(lifecycler.metrics.tokensOwned)) - assert.Equal(t, float64(0), testutil.ToFloat64(lifecycler.metrics.tokensToOwn)) - - // Assert on the instance removed from the ring. - _, ok := getInstanceFromStore(t, store, testInstanceID) - assert.False(t, ok) -} - -func TestBasicLifecycler_HeartbeatWhileRunning(t *testing.T) { - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - cfg.HeartbeatPeriod = 10 * time.Millisecond - - lifecycler, _, store, err := prepareBasicLifecycler(t, cfg) - require.NoError(t, err) - defer services.StopAndAwaitTerminated(ctx, lifecycler) //nolint:errcheck - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - - // Get the initial timestamp so that we can then assert on the timestamp updated. - desc, _ := getInstanceFromStore(t, store, testInstanceID) - initialTimestamp := desc.GetTimestamp() - - test.Poll(t, time.Second, true, func() interface{} { - desc, _ := getInstanceFromStore(t, store, testInstanceID) - currTimestamp := desc.GetTimestamp() - - return currTimestamp > initialTimestamp - }) - - assert.Greater(t, testutil.ToFloat64(lifecycler.metrics.heartbeats), float64(0)) -} - -func TestBasicLifecycler_HeartbeatWhileStopping(t *testing.T) { - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - cfg.HeartbeatPeriod = 10 * time.Millisecond - - lifecycler, delegate, store, err := prepareBasicLifecycler(t, cfg) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - - onStoppingCalled := false - - delegate.onStopping = func(_ *BasicLifecycler) { - // Since the hearbeat timestamp is in seconds we would have to wait 1s before we can assert - // on it being changed, regardless the heartbeat period. To speed up this test, we're going - // to reset the timestamp to 0 and then assert it has been updated. 
- require.NoError(t, store.CAS(ctx, testRingKey, func(in interface{}) (out interface{}, retry bool, err error) { - ringDesc := GetOrCreateRingDesc(in) - instanceDesc := ringDesc.Ingesters[testInstanceID] - instanceDesc.Timestamp = 0 - ringDesc.Ingesters[testInstanceID] = instanceDesc - return ringDesc, true, nil - })) - - // Wait until the timestamp has been updated. - test.Poll(t, time.Second, true, func() interface{} { - desc, _ := getInstanceFromStore(t, store, testInstanceID) - currTimestamp := desc.GetTimestamp() - - return currTimestamp != 0 - }) - - onStoppingCalled = true - } - - assert.NoError(t, services.StopAndAwaitTerminated(ctx, lifecycler)) - assert.True(t, onStoppingCalled) -} - -func TestBasicLifecycler_HeartbeatAfterBackendRest(t *testing.T) { - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - cfg.HeartbeatPeriod = 10 * time.Millisecond - - lifecycler, delegate, store, err := prepareBasicLifecycler(t, cfg) - require.NoError(t, err) - defer services.StopAndAwaitTerminated(ctx, lifecycler) //nolint:errcheck - - registerTokens := Tokens{1, 2, 3, 4, 5} - delegate.onRegister = func(_ *BasicLifecycler, _ Desc, _ bool, _ string, _ InstanceDesc) (state InstanceState, tokens Tokens) { - return ACTIVE, registerTokens - } - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - - // At this point the instance has been registered to the ring. - expectedRegisteredAt := lifecycler.GetRegisteredAt() - - // Now we delete it from the ring to simulate a ring storage reset and we expect the next heartbeat - // will restore it. - require.NoError(t, store.CAS(ctx, testRingKey, func(in interface{}) (out interface{}, retry bool, err error) { - return NewDesc(), true, nil - })) - - test.Poll(t, time.Second, true, func() interface{} { - desc, ok := getInstanceFromStore(t, store, testInstanceID) - return ok && - desc.GetTimestamp() > 0 && - desc.GetState() == ACTIVE && - Tokens(desc.GetTokens()).Equals(registerTokens) && - desc.GetAddr() == cfg.Addr && - desc.GetRegisteredAt().Unix() == expectedRegisteredAt.Unix() - }) -} - -func TestBasicLifecycler_ChangeState(t *testing.T) { - ctx := context.Background() - cfg := prepareBasicLifecyclerConfig() - lifecycler, delegate, store, err := prepareBasicLifecycler(t, cfg) - require.NoError(t, err) - defer services.StopAndAwaitTerminated(ctx, lifecycler) //nolint:errcheck - - delegate.onRegister = func(_ *BasicLifecycler, _ Desc, _ bool, _ string, _ InstanceDesc) (InstanceState, Tokens) { - return JOINING, Tokens{1, 2, 3, 4, 5} - } - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - assert.Equal(t, JOINING, lifecycler.GetState()) - - for _, state := range []InstanceState{ACTIVE, LEAVING} { - assert.NoError(t, lifecycler.ChangeState(ctx, state)) - assert.Equal(t, state, lifecycler.GetState()) - - // Assert on the instance state read from the ring. 
-			desc, ok := getInstanceFromStore(t, store, testInstanceID)
-			assert.True(t, ok)
-			assert.Equal(t, state, desc.GetState())
-	}
-}
-
-func TestBasicLifecycler_TokensObservePeriod(t *testing.T) {
-	ctx := context.Background()
-	cfg := prepareBasicLifecyclerConfig()
-	cfg.NumTokens = 5
-	cfg.TokensObservePeriod = time.Second
-
-	lifecycler, delegate, store, err := prepareBasicLifecycler(t, cfg)
-	require.NoError(t, err)
-
-	delegate.onRegister = func(_ *BasicLifecycler, _ Desc, _ bool, _ string, _ InstanceDesc) (InstanceState, Tokens) {
-		return ACTIVE, Tokens{1, 2, 3, 4, 5}
-	}
-
-	require.NoError(t, lifecycler.StartAsync(ctx))
-
-	// While the lifecycler is starting we poll the ring. As soon as the instance
-	// is registered, we remove some tokens to simulate how gossip memberlist
-	// reconciliation works in case of clashing tokens.
-	test.Poll(t, time.Second, true, func() interface{} {
-		// Ensure the instance has been registered in the ring.
-		desc, ok := getInstanceFromStore(t, store, testInstanceID)
-		if !ok {
-			return false
-		}
-
-		// Remove some tokens.
-		return store.CAS(ctx, testRingKey, func(in interface{}) (out interface{}, retry bool, err error) {
-			ringDesc := GetOrCreateRingDesc(in)
-			ringDesc.AddIngester(testInstanceID, desc.Addr, desc.Zone, Tokens{4, 5}, desc.State, time.Now())
-			return ringDesc, true, nil
-		}) == nil
-	})
-
-	require.NoError(t, lifecycler.AwaitRunning(ctx))
-	assert.Subset(t, lifecycler.GetTokens(), Tokens{4, 5})
-	assert.NotContains(t, lifecycler.GetTokens(), uint32(1))
-	assert.NotContains(t, lifecycler.GetTokens(), uint32(2))
-	assert.NotContains(t, lifecycler.GetTokens(), uint32(3))
-}
-
-func TestBasicLifecycler_updateInstance_ShouldAddInstanceToTheRingIfDoesNotExistEvenIfNotChanged(t *testing.T) {
-	ctx := context.Background()
-	cfg := prepareBasicLifecyclerConfig()
-	cfg.HeartbeatPeriod = time.Hour // No heartbeat during the test.
-
-	lifecycler, delegate, store, err := prepareBasicLifecycler(t, cfg)
-	require.NoError(t, err)
-	defer services.StopAndAwaitTerminated(ctx, lifecycler) //nolint:errcheck
-
-	registerTokens := Tokens{1, 2, 3, 4, 5}
-	delegate.onRegister = func(_ *BasicLifecycler, _ Desc, _ bool, _ string, _ InstanceDesc) (state InstanceState, tokens Tokens) {
-		return ACTIVE, registerTokens
-	}
-
-	require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler))
-
-	// At this point the instance has been registered to the ring.
-	expectedRegisteredAt := lifecycler.GetRegisteredAt()
-
-	// Now we delete it from the ring to simulate a ring storage reset.
-	require.NoError(t, store.CAS(ctx, testRingKey, func(in interface{}) (out interface{}, retry bool, err error) {
-		return NewDesc(), true, nil
-	}))
-
-	// Run a noop update instance, but since the instance is not in the ring we do expect
-	// it will be added back anyway.
- require.NoError(t, lifecycler.updateInstance(ctx, func(_ *Desc, desc *InstanceDesc) bool { - return false - })) - - desc, ok := getInstanceFromStore(t, store, testInstanceID) - require.True(t, ok) - assert.Equal(t, ACTIVE, desc.GetState()) - assert.Equal(t, registerTokens, Tokens(desc.GetTokens())) - assert.Equal(t, cfg.Addr, desc.GetAddr()) - assert.Equal(t, expectedRegisteredAt.Unix(), desc.RegisteredTimestamp) - assert.Equal(t, expectedRegisteredAt.Unix(), desc.GetRegisteredAt().Unix()) -} - -func prepareBasicLifecyclerConfig() BasicLifecyclerConfig { - return BasicLifecyclerConfig{ - ID: testInstanceID, - Addr: "127.0.0.1:12345", - Zone: "test-zone", - HeartbeatPeriod: time.Minute, - TokensObservePeriod: 0, - NumTokens: 5, - } -} - -func prepareBasicLifecycler(t testing.TB, cfg BasicLifecyclerConfig) (*BasicLifecycler, *mockDelegate, kv.Client, error) { - t.Helper() - - delegate := &mockDelegate{} - lifecycler, store, err := prepareBasicLifecyclerWithDelegate(t, cfg, delegate) - return lifecycler, delegate, store, err -} - -func prepareBasicLifecyclerWithDelegate(t testing.TB, cfg BasicLifecyclerConfig, delegate BasicLifecyclerDelegate) (*BasicLifecycler, kv.Client, error) { - t.Helper() - - store, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - lifecycler, err := NewBasicLifecycler(cfg, testRingName, testRingKey, store, delegate, log.NewNopLogger(), nil) - return lifecycler, store, err -} - -type mockDelegate struct { - onRegister func(lifecycler *BasicLifecycler, ringDesc Desc, instanceExists bool, instanceID string, instanceDesc InstanceDesc) (InstanceState, Tokens) - onTokensChanged func(lifecycler *BasicLifecycler, tokens Tokens) - onStopping func(lifecycler *BasicLifecycler) - onHeartbeat func(lifecycler *BasicLifecycler, ringDesc *Desc, instanceDesc *InstanceDesc) -} - -func (m *mockDelegate) OnRingInstanceRegister(lifecycler *BasicLifecycler, ringDesc Desc, instanceExists bool, instanceID string, instanceDesc InstanceDesc) (InstanceState, Tokens) { - if m.onRegister == nil { - return PENDING, Tokens{} - } - - return m.onRegister(lifecycler, ringDesc, instanceExists, instanceID, instanceDesc) -} - -func (m *mockDelegate) OnRingInstanceTokens(lifecycler *BasicLifecycler, tokens Tokens) { - if m.onTokensChanged != nil { - m.onTokensChanged(lifecycler, tokens) - } -} - -func (m *mockDelegate) OnRingInstanceStopping(lifecycler *BasicLifecycler) { - if m.onStopping != nil { - m.onStopping(lifecycler) - } -} - -func (m *mockDelegate) OnRingInstanceHeartbeat(lifecycler *BasicLifecycler, ringDesc *Desc, instanceDesc *InstanceDesc) { - if m.onHeartbeat != nil { - m.onHeartbeat(lifecycler, ringDesc, instanceDesc) - } -} - -func getInstanceFromStore(t *testing.T, store kv.Client, instanceID string) (InstanceDesc, bool) { - out, err := store.Get(context.Background(), testRingKey) - require.NoError(t, err) - - if out == nil { - return InstanceDesc{}, false - } - - ringDesc := out.(*Desc) - instanceDesc, ok := ringDesc.GetIngesters()[instanceID] - - return instanceDesc, ok -} diff --git a/pkg/ring/client/pool_test.go b/pkg/ring/client/pool_test.go deleted file mode 100644 index 4037458f327..00000000000 --- a/pkg/ring/client/pool_test.go +++ /dev/null @@ -1,146 +0,0 @@ -package client - -import ( - "context" - fmt "fmt" - "testing" - "time" - - "github.com/go-kit/log" - "github.com/gogo/status" - "github.com/grafana/dskit/services" - "github.com/stretchr/testify/require" - "google.golang.org/grpc" - 
"google.golang.org/grpc/codes" - "google.golang.org/grpc/health/grpc_health_v1" -) - -type mockClient struct { - happy bool - status grpc_health_v1.HealthCheckResponse_ServingStatus -} - -func (i mockClient) Check(ctx context.Context, in *grpc_health_v1.HealthCheckRequest, opts ...grpc.CallOption) (*grpc_health_v1.HealthCheckResponse, error) { - if !i.happy { - return nil, fmt.Errorf("Fail") - } - return &grpc_health_v1.HealthCheckResponse{Status: i.status}, nil -} - -func (i mockClient) Close() error { - return nil -} - -func (i mockClient) Watch(ctx context.Context, in *grpc_health_v1.HealthCheckRequest, opts ...grpc.CallOption) (grpc_health_v1.Health_WatchClient, error) { - return nil, status.Error(codes.Unimplemented, "Watching is not supported") -} - -func TestHealthCheck(t *testing.T) { - tcs := []struct { - client mockClient - hasError bool - }{ - {mockClient{happy: true, status: grpc_health_v1.HealthCheckResponse_UNKNOWN}, true}, - {mockClient{happy: true, status: grpc_health_v1.HealthCheckResponse_SERVING}, false}, - {mockClient{happy: true, status: grpc_health_v1.HealthCheckResponse_NOT_SERVING}, true}, - {mockClient{happy: false, status: grpc_health_v1.HealthCheckResponse_UNKNOWN}, true}, - {mockClient{happy: false, status: grpc_health_v1.HealthCheckResponse_SERVING}, true}, - {mockClient{happy: false, status: grpc_health_v1.HealthCheckResponse_NOT_SERVING}, true}, - } - for _, tc := range tcs { - err := healthCheck(tc.client, 50*time.Millisecond) - hasError := err != nil - if hasError != tc.hasError { - t.Errorf("Expected error: %t, error: %v", tc.hasError, err) - } - } -} - -func TestPoolCache(t *testing.T) { - buildCount := 0 - factory := func(addr string) (PoolClient, error) { - if addr == "bad" { - return nil, fmt.Errorf("Fail") - } - buildCount++ - return mockClient{happy: true, status: grpc_health_v1.HealthCheckResponse_SERVING}, nil - } - - cfg := PoolConfig{ - HealthCheckTimeout: 50 * time.Millisecond, - CheckInterval: 10 * time.Second, - } - - pool := NewPool("test", cfg, nil, factory, nil, log.NewNopLogger()) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), pool)) - defer services.StopAndAwaitTerminated(context.Background(), pool) //nolint:errcheck - - _, err := pool.GetClientFor("1") - require.NoError(t, err) - if buildCount != 1 { - t.Errorf("Did not create client") - } - - _, err = pool.GetClientFor("1") - require.NoError(t, err) - if buildCount != 1 { - t.Errorf("Created client that should have been cached") - } - - _, err = pool.GetClientFor("2") - require.NoError(t, err) - if pool.Count() != 2 { - t.Errorf("Expected Count() = 2, got %d", pool.Count()) - } - - pool.RemoveClientFor("1") - if pool.Count() != 1 { - t.Errorf("Expected Count() = 1, got %d", pool.Count()) - } - - _, err = pool.GetClientFor("1") - require.NoError(t, err) - if buildCount != 3 || pool.Count() != 2 { - t.Errorf("Did not re-create client correctly") - } - - _, err = pool.GetClientFor("bad") - if err == nil { - t.Errorf("Bad create should have thrown an error") - } - if pool.Count() != 2 { - t.Errorf("Bad create should not have been added to cache") - } - - addrs := pool.RegisteredAddresses() - if len(addrs) != pool.Count() { - t.Errorf("Lengths of registered addresses and cache.Count() do not match") - } -} - -func TestCleanUnhealthy(t *testing.T) { - goodAddrs := []string{"good1", "good2"} - badAddrs := []string{"bad1", "bad2"} - clients := map[string]PoolClient{} - for _, addr := range goodAddrs { - clients[addr] = mockClient{happy: true, status: 
grpc_health_v1.HealthCheckResponse_SERVING} - } - for _, addr := range badAddrs { - clients[addr] = mockClient{happy: false, status: grpc_health_v1.HealthCheckResponse_NOT_SERVING} - } - pool := &Pool{ - clients: clients, - logger: log.NewNopLogger(), - } - pool.cleanUnhealthy() - for _, addr := range badAddrs { - if _, ok := pool.clients[addr]; ok { - t.Errorf("Found bad client after clean: %s\n", addr) - } - } - for _, addr := range goodAddrs { - if _, ok := pool.clients[addr]; !ok { - t.Errorf("Could not find good client after clean: %s\n", addr) - } - } -} diff --git a/pkg/ring/client/ring_service_discovery_test.go b/pkg/ring/client/ring_service_discovery_test.go deleted file mode 100644 index d161b724239..00000000000 --- a/pkg/ring/client/ring_service_discovery_test.go +++ /dev/null @@ -1,67 +0,0 @@ -package client - -import ( - "errors" - "testing" - - "github.com/stretchr/testify/assert" - - "github.com/cortexproject/cortex/pkg/ring" -) - -func TestNewRingServiceDiscovery(t *testing.T) { - tests := map[string]struct { - ringReplicationSet ring.ReplicationSet - ringErr error - expectedAddrs []string - expectedErr error - }{ - "discovery failure": { - ringErr: errors.New("mocked error"), - expectedErr: errors.New("mocked error"), - }, - "empty ring": { - ringErr: ring.ErrEmptyRing, - expectedAddrs: nil, - }, - "empty replication set": { - ringReplicationSet: ring.ReplicationSet{ - Instances: []ring.InstanceDesc{}, - }, - expectedAddrs: nil, - }, - "replication containing some endpoints": { - ringReplicationSet: ring.ReplicationSet{ - Instances: []ring.InstanceDesc{ - {Addr: "1.1.1.1"}, - {Addr: "2.2.2.2"}, - }, - }, - expectedAddrs: []string{"1.1.1.1", "2.2.2.2"}, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - r := &mockReadRing{} - r.mockedReplicationSet = testData.ringReplicationSet - r.mockedErr = testData.ringErr - - d := NewRingServiceDiscovery(r) - addrs, err := d() - assert.Equal(t, testData.expectedErr, err) - assert.Equal(t, testData.expectedAddrs, addrs) - }) - } -} - -type mockReadRing struct { - ring.ReadRing - - mockedReplicationSet ring.ReplicationSet - mockedErr error -} - -func (m *mockReadRing) GetAllHealthy(_ ring.Operation) (ring.ReplicationSet, error) { - return m.mockedReplicationSet, m.mockedErr -} diff --git a/pkg/ring/lifecycler_test.go b/pkg/ring/lifecycler_test.go deleted file mode 100644 index 1bc82274f5f..00000000000 --- a/pkg/ring/lifecycler_test.go +++ /dev/null @@ -1,686 +0,0 @@ -package ring - -import ( - "context" - "fmt" - "io/ioutil" - "os" - "sort" - "testing" - "time" - - "github.com/go-kit/log" - "github.com/grafana/dskit/flagext" - "github.com/grafana/dskit/kv/consul" - "github.com/grafana/dskit/services" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/cortexproject/cortex/pkg/util/test" -) - -func testLifecyclerConfig(ringConfig Config, id string) LifecyclerConfig { - var lifecyclerConfig LifecyclerConfig - flagext.DefaultValues(&lifecyclerConfig) - lifecyclerConfig.Addr = "0.0.0.0" - lifecyclerConfig.Port = 1 - lifecyclerConfig.ListenPort = 0 - lifecyclerConfig.RingConfig = ringConfig - lifecyclerConfig.NumTokens = 1 - lifecyclerConfig.ID = id - lifecyclerConfig.Zone = "zone1" - lifecyclerConfig.FinalSleep = 0 - lifecyclerConfig.HeartbeatPeriod = 100 * time.Millisecond - - return lifecyclerConfig -} - -func checkNormalised(d interface{}, id string) bool { - desc, ok := d.(*Desc) - return ok && - len(desc.Ingesters) == 1 && - desc.Ingesters[id].State == 
ACTIVE && - len(desc.Ingesters[id].Tokens) == 1 -} - -func TestLifecycler_HealthyInstancesCount(t *testing.T) { - ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - var ringConfig Config - flagext.DefaultValues(&ringConfig) - ringConfig.KVStore.Mock = ringStore - - ctx := context.Background() - - // Add the first ingester to the ring - lifecyclerConfig1 := testLifecyclerConfig(ringConfig, "ing1") - lifecyclerConfig1.HeartbeatPeriod = 100 * time.Millisecond - lifecyclerConfig1.JoinAfter = 100 * time.Millisecond - - lifecycler1, err := NewLifecycler(lifecyclerConfig1, &nopFlushTransferer{}, "ingester", IngesterRingKey, true, nil) - require.NoError(t, err) - assert.Equal(t, 0, lifecycler1.HealthyInstancesCount()) - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler1)) - defer services.StopAndAwaitTerminated(ctx, lifecycler1) // nolint:errcheck - - // Assert the first ingester joined the ring - test.Poll(t, 1000*time.Millisecond, true, func() interface{} { - return lifecycler1.HealthyInstancesCount() == 1 - }) - - // Add the second ingester to the ring - lifecyclerConfig2 := testLifecyclerConfig(ringConfig, "ing2") - lifecyclerConfig2.HeartbeatPeriod = 100 * time.Millisecond - lifecyclerConfig2.JoinAfter = 100 * time.Millisecond - - lifecycler2, err := NewLifecycler(lifecyclerConfig2, &nopFlushTransferer{}, "ingester", IngesterRingKey, true, nil) - require.NoError(t, err) - assert.Equal(t, 0, lifecycler2.HealthyInstancesCount()) - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler2)) - defer services.StopAndAwaitTerminated(ctx, lifecycler2) // nolint:errcheck - - // Assert the second ingester joined the ring - test.Poll(t, 1000*time.Millisecond, true, func() interface{} { - return lifecycler2.HealthyInstancesCount() == 2 - }) - - // Assert the first ingester count is updated - test.Poll(t, 1000*time.Millisecond, true, func() interface{} { - return lifecycler1.HealthyInstancesCount() == 2 - }) -} - -func TestLifecycler_ZonesCount(t *testing.T) { - ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - var ringConfig Config - flagext.DefaultValues(&ringConfig) - ringConfig.KVStore.Mock = ringStore - - events := []struct { - zone string - expectedZones int - }{ - {"zone-a", 1}, - {"zone-b", 2}, - {"zone-a", 2}, - {"zone-c", 3}, - } - - for idx, event := range events { - ctx := context.Background() - - // Register an ingester to the ring. - cfg := testLifecyclerConfig(ringConfig, fmt.Sprintf("instance-%d", idx)) - cfg.HeartbeatPeriod = 100 * time.Millisecond - cfg.JoinAfter = 100 * time.Millisecond - cfg.Zone = event.zone - - lifecycler, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester", IngesterRingKey, true, nil) - require.NoError(t, err) - assert.Equal(t, 0, lifecycler.ZonesCount()) - - require.NoError(t, services.StartAndAwaitRunning(ctx, lifecycler)) - defer services.StopAndAwaitTerminated(ctx, lifecycler) // nolint:errcheck - - // Wait until joined. 
- test.Poll(t, time.Second, idx+1, func() interface{} { - return lifecycler.HealthyInstancesCount() - }) - - assert.Equal(t, event.expectedZones, lifecycler.ZonesCount()) - } -} - -func TestLifecycler_NilFlushTransferer(t *testing.T) { - ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - var ringConfig Config - flagext.DefaultValues(&ringConfig) - ringConfig.KVStore.Mock = ringStore - lifecyclerConfig := testLifecyclerConfig(ringConfig, "ing1") - - // Create a lifecycler with nil FlushTransferer to make sure it operates correctly - lifecycler, err := NewLifecycler(lifecyclerConfig, nil, "ingester", IngesterRingKey, true, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), lifecycler)) - - // Ensure the lifecycler joined the ring - test.Poll(t, time.Second, 1, func() interface{} { - return lifecycler.HealthyInstancesCount() - }) - - require.NoError(t, services.StopAndAwaitTerminated(context.Background(), lifecycler)) - - assert.Equal(t, 0, lifecycler.HealthyInstancesCount()) -} - -func TestLifecycler_TwoRingsWithDifferentKeysOnTheSameKVStore(t *testing.T) { - // Create a shared ring - ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - var ringConfig Config - flagext.DefaultValues(&ringConfig) - ringConfig.KVStore.Mock = ringStore - - // Create two lifecyclers, each on a separate ring - lifecyclerConfig1 := testLifecyclerConfig(ringConfig, "instance-1") - lifecyclerConfig2 := testLifecyclerConfig(ringConfig, "instance-2") - - lifecycler1, err := NewLifecycler(lifecyclerConfig1, nil, "service-1", "ring-1", true, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), lifecycler1)) - defer services.StopAndAwaitTerminated(context.Background(), lifecycler1) //nolint:errcheck - - lifecycler2, err := NewLifecycler(lifecyclerConfig2, nil, "service-2", "ring-2", true, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), lifecycler2)) - defer services.StopAndAwaitTerminated(context.Background(), lifecycler2) //nolint:errcheck - - // Ensure each lifecycler reports 1 healthy instance, because they're - // in a different ring - test.Poll(t, time.Second, 1, func() interface{} { - return lifecycler1.HealthyInstancesCount() - }) - - test.Poll(t, time.Second, 1, func() interface{} { - return lifecycler2.HealthyInstancesCount() - }) -} - -type nopFlushTransferer struct{} - -func (f *nopFlushTransferer) Flush() {} -func (f *nopFlushTransferer) TransferOut(_ context.Context) error { - return nil -} - -func TestLifecycler_ShouldHandleInstanceAbruptlyRestarted(t *testing.T) { - ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - var ringConfig Config - flagext.DefaultValues(&ringConfig) - ringConfig.KVStore.Mock = ringStore - - r, err := New(ringConfig, "ingester", IngesterRingKey, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), r)) - defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck - - // Add an 'ingester' with normalised tokens. 
- lifecyclerConfig1 := testLifecyclerConfig(ringConfig, "ing1") - l1, err := NewLifecycler(lifecyclerConfig1, &nopFlushTransferer{}, "ingester", IngesterRingKey, true, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1)) - - // Check this ingester joined, is active, and has one token. - test.Poll(t, 1000*time.Millisecond, true, func() interface{} { - d, err := r.KVClient.Get(context.Background(), IngesterRingKey) - require.NoError(t, err) - return checkNormalised(d, "ing1") - }) - - expectedTokens := l1.getTokens() - expectedRegisteredAt := l1.getRegisteredAt() - - // Wait 1 second because the registered timestamp has second precision. Without waiting - // we wouldn't have the guarantee the previous registered timestamp is preserved. - time.Sleep(time.Second) - - // Add a second ingester with the same settings, so it will think it has restarted - l2, err := NewLifecycler(lifecyclerConfig1, &nopFlushTransferer{}, "ingester", IngesterRingKey, true, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), l2)) - - // Check the new ingester picked up the same tokens and registered timestamp. - test.Poll(t, 1000*time.Millisecond, true, func() interface{} { - d, err := r.KVClient.Get(context.Background(), IngesterRingKey) - require.NoError(t, err) - - return checkNormalised(d, "ing1") && - expectedTokens.Equals(l2.getTokens()) && - expectedRegisteredAt.Unix() == l2.getRegisteredAt().Unix() - }) -} - -type MockClient struct { - ListFunc func(ctx context.Context, prefix string) ([]string, error) - GetFunc func(ctx context.Context, key string) (interface{}, error) - DeleteFunc func(ctx context.Context, key string) error - CASFunc func(ctx context.Context, key string, f func(in interface{}) (out interface{}, retry bool, err error)) error - WatchKeyFunc func(ctx context.Context, key string, f func(interface{}) bool) - WatchPrefixFunc func(ctx context.Context, prefix string, f func(string, interface{}) bool) -} - -func (m *MockClient) List(ctx context.Context, prefix string) ([]string, error) { - if m.ListFunc != nil { - return m.ListFunc(ctx, prefix) - } - - return nil, nil -} - -func (m *MockClient) Get(ctx context.Context, key string) (interface{}, error) { - if m.GetFunc != nil { - return m.GetFunc(ctx, key) - } - - return nil, nil -} - -func (m *MockClient) Delete(ctx context.Context, key string) error { - if m.DeleteFunc != nil { - return m.DeleteFunc(ctx, key) - } - - return nil -} - -func (m *MockClient) CAS(ctx context.Context, key string, f func(in interface{}) (out interface{}, retry bool, err error)) error { - if m.CASFunc != nil { - return m.CASFunc(ctx, key, f) - } - - return nil -} - -func (m *MockClient) WatchKey(ctx context.Context, key string, f func(interface{}) bool) { - if m.WatchKeyFunc != nil { - m.WatchKeyFunc(ctx, key, f) - } -} - -func (m *MockClient) WatchPrefix(ctx context.Context, prefix string, f func(string, interface{}) bool) { - if m.WatchPrefixFunc != nil { - m.WatchPrefixFunc(ctx, prefix, f) - } -} - -// Ensure a check ready returns error when consul returns a nil key and the ingester already holds keys. 
This happens if the ring key gets deleted -func TestCheckReady(t *testing.T) { - var ringConfig Config - flagext.DefaultValues(&ringConfig) - ringConfig.KVStore.Mock = &MockClient{} - - r, err := New(ringConfig, "ingester", IngesterRingKey, nil) - require.NoError(t, err) - require.NoError(t, r.StartAsync(context.Background())) - // This is very atypical, but if we used AwaitRunning, that would fail, because of how quickly service terminates ... - // by the time we check for Running state, it is already terminated, because mock ring has no WatchFunc, so it - // will just exit. - require.NoError(t, r.AwaitTerminated(context.Background())) - - cfg := testLifecyclerConfig(ringConfig, "ring1") - cfg.MinReadyDuration = 1 * time.Nanosecond - l1, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester", IngesterRingKey, true, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1)) - - l1.setTokens(Tokens([]uint32{1})) - - // Delete the ring key before checking ready - err = l1.CheckReady(context.Background()) - require.Error(t, err) -} - -type noopFlushTransferer struct { -} - -func (f *noopFlushTransferer) Flush() {} -func (f *noopFlushTransferer) TransferOut(ctx context.Context) error { return nil } - -func TestRestartIngester_DisabledHeartbeat_unregister_on_shutdown_false(t *testing.T) { - ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - var ringConfig Config - flagext.DefaultValues(&ringConfig) - ringConfig.KVStore.Mock = ringStore - - r, err := New(ringConfig, "ingester", IngesterRingKey, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), r)) - - // poll function waits for a condition and returning actual state of the ingesters after the condition succeed. 
- poll := func(condition func(*Desc) bool) map[string]InstanceDesc { - var ingesters map[string]InstanceDesc - test.Poll(t, 5*time.Second, true, func() interface{} { - d, err := r.KVClient.Get(context.Background(), IngesterRingKey) - require.NoError(t, err) - - desc, ok := d.(*Desc) - - if ok { - ingesters = desc.Ingesters - } - return ok && condition(desc) - }) - - return ingesters - } - - // Starts Ingester and wait it to became active - startIngesterAndWaitActive := func(ingId string) *Lifecycler { - lifecyclerConfig := testLifecyclerConfig(ringConfig, ingId) - // Disabling heartBeat and unregister_on_shutdown - lifecyclerConfig.UnregisterOnShutdown = false - lifecyclerConfig.HeartbeatPeriod = 0 - lifecycler, err := NewLifecycler(lifecyclerConfig, &noopFlushTransferer{}, "lifecycler", IngesterRingKey, true, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), lifecycler)) - poll(func(desc *Desc) bool { - return desc.Ingesters[ingId].State == ACTIVE - }) - return lifecycler - } - - // We are going to create 2 fake ingester with disabled heart beat and `unregister_on_shutdown=false` then - // test if the ingester 2 became active after: - // * Clean Shutdown (LEAVING after shutdown) - // * Crashes while in the PENDING or JOINING state - l1 := startIngesterAndWaitActive("ing1") - defer services.StopAndAwaitTerminated(context.Background(), l1) //nolint:errcheck - - l2 := startIngesterAndWaitActive("ing2") - - ingesters := poll(func(desc *Desc) bool { - return len(desc.Ingesters) == 2 && desc.Ingesters["ing1"].State == ACTIVE && desc.Ingesters["ing2"].State == ACTIVE - }) - - // Both Ingester should be active and running - assert.Equal(t, ACTIVE, ingesters["ing1"].State) - assert.Equal(t, ACTIVE, ingesters["ing2"].State) - - // Stop One ingester gracefully should leave it on LEAVING STATE on the ring - require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l2)) - - ingesters = poll(func(desc *Desc) bool { - return len(desc.Ingesters) == 2 && desc.Ingesters["ing2"].State == LEAVING - }) - assert.Equal(t, LEAVING, ingesters["ing2"].State) - - // Start Ingester2 again - Should flip back to ACTIVE in the ring - l2 = startIngesterAndWaitActive("ing2") - require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l2)) - - // Simulate ingester2 crash on startup and left the ring with JOINING state - err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (out interface{}, retry bool, err error) { - desc, ok := in.(*Desc) - require.Equal(t, true, ok) - ingester2Desc := desc.Ingesters["ing2"] - ingester2Desc.State = JOINING - desc.Ingesters["ing2"] = ingester2Desc - return desc, true, nil - }) - require.NoError(t, err) - - l2 = startIngesterAndWaitActive("ing2") - require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l2)) - - // Simulate ingester2 crash on startup and left the ring with PENDING state - err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (out interface{}, retry bool, err error) { - desc, ok := in.(*Desc) - require.Equal(t, true, ok) - ingester2Desc := desc.Ingesters["ing2"] - ingester2Desc.State = PENDING - desc.Ingesters["ing2"] = ingester2Desc - return desc, true, nil - }) - require.NoError(t, err) - - l2 = startIngesterAndWaitActive("ing2") - require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l2)) -} - -func TestTokensOnDisk(t *testing.T) { - ringStore, closer := consul.NewInMemoryClient(GetCodec(), 
log.NewNopLogger(), nil)
-	t.Cleanup(func() { assert.NoError(t, closer.Close()) })
-
-	var ringConfig Config
-	flagext.DefaultValues(&ringConfig)
-	ringConfig.KVStore.Mock = ringStore
-
-	r, err := New(ringConfig, "ingester", IngesterRingKey, nil)
-	require.NoError(t, err)
-	require.NoError(t, services.StartAndAwaitRunning(context.Background(), r))
-	defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck
-
-	tokenDir, err := ioutil.TempDir(os.TempDir(), "tokens_on_disk")
-	require.NoError(t, err)
-	defer func() {
-		require.NoError(t, os.RemoveAll(tokenDir))
-	}()
-
-	lifecyclerConfig := testLifecyclerConfig(ringConfig, "ing1")
-	lifecyclerConfig.NumTokens = 512
-	lifecyclerConfig.TokensFilePath = tokenDir + "/tokens"
-
-	// Start first ingester.
-	l1, err := NewLifecycler(lifecyclerConfig, &noopFlushTransferer{}, "ingester", IngesterRingKey, true, nil)
-	require.NoError(t, err)
-	require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1))
-
-	// Check this ingester joined, is active, and has 512 tokens.
-	var expTokens []uint32
-	test.Poll(t, 1000*time.Millisecond, true, func() interface{} {
-		d, err := r.KVClient.Get(context.Background(), IngesterRingKey)
-		require.NoError(t, err)
-
-		desc, ok := d.(*Desc)
-		if ok {
-			expTokens = desc.Ingesters["ing1"].Tokens
-		}
-		return ok &&
-			len(desc.Ingesters) == 1 &&
-			desc.Ingesters["ing1"].State == ACTIVE &&
-			len(desc.Ingesters["ing1"].Tokens) == 512
-	})
-
-	require.NoError(t, services.StopAndAwaitTerminated(context.Background(), l1))
-
-	// Start new ingester at same token directory.
-	lifecyclerConfig.ID = "ing2"
-	l2, err := NewLifecycler(lifecyclerConfig, &noopFlushTransferer{}, "ingester", IngesterRingKey, true, nil)
-	require.NoError(t, err)
-	require.NoError(t, services.StartAndAwaitRunning(context.Background(), l2))
-	defer services.StopAndAwaitTerminated(context.Background(), l2) //nolint:errcheck
-
-	// Check this ingester joined, is active, and has 512 tokens.
-	var actTokens []uint32
-	test.Poll(t, 1000*time.Millisecond, true, func() interface{} {
-		d, err := r.KVClient.Get(context.Background(), IngesterRingKey)
-		require.NoError(t, err)
-		desc, ok := d.(*Desc)
-		if ok {
-			actTokens = desc.Ingesters["ing2"].Tokens
-		}
-		return ok &&
-			len(desc.Ingesters) == 1 &&
-			desc.Ingesters["ing2"].State == ACTIVE &&
-			len(desc.Ingesters["ing2"].Tokens) == 512
-	})
-
-	// Check for same tokens.
- sort.Slice(expTokens, func(i, j int) bool { return expTokens[i] < expTokens[j] }) - sort.Slice(actTokens, func(i, j int) bool { return actTokens[i] < actTokens[j] }) - for i := 0; i < 512; i++ { - require.Equal(t, expTokens, actTokens) - } -} - -// JoinInLeavingState ensures that if the lifecycler starts up and the ring already has it in a LEAVING state that it still is able to auto join -func TestJoinInLeavingState(t *testing.T) { - ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - var ringConfig Config - flagext.DefaultValues(&ringConfig) - ringConfig.KVStore.Mock = ringStore - - r, err := New(ringConfig, "ingester", IngesterRingKey, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), r)) - defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck - - cfg := testLifecyclerConfig(ringConfig, "ing1") - cfg.NumTokens = 2 - cfg.MinReadyDuration = 1 * time.Nanosecond - - // Set state as LEAVING - err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (interface{}, bool, error) { - r := &Desc{ - Ingesters: map[string]InstanceDesc{ - "ing1": { - State: LEAVING, - Tokens: []uint32{1, 4}, - }, - "ing2": { - Tokens: []uint32{2, 3}, - }, - }, - } - - return r, true, nil - }) - require.NoError(t, err) - - l1, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester", IngesterRingKey, true, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1)) - - // Check that the lifecycler was able to join after coming up in LEAVING - test.Poll(t, 1000*time.Millisecond, true, func() interface{} { - d, err := r.KVClient.Get(context.Background(), IngesterRingKey) - require.NoError(t, err) - - desc, ok := d.(*Desc) - return ok && - len(desc.Ingesters) == 2 && - desc.Ingesters["ing1"].State == ACTIVE && - len(desc.Ingesters["ing1"].Tokens) == cfg.NumTokens && - len(desc.Ingesters["ing2"].Tokens) == 2 - }) -} - -// JoinInJoiningState ensures that if the lifecycler starts up and the ring already has it in a JOINING state that it still is able to auto join -func TestJoinInJoiningState(t *testing.T) { - ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - var ringConfig Config - flagext.DefaultValues(&ringConfig) - ringConfig.KVStore.Mock = ringStore - - r, err := New(ringConfig, "ingester", IngesterRingKey, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), r)) - defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck - - cfg := testLifecyclerConfig(ringConfig, "ing1") - cfg.NumTokens = 2 - cfg.MinReadyDuration = 1 * time.Nanosecond - instance1RegisteredAt := time.Now().Add(-1 * time.Hour) - instance2RegisteredAt := time.Now().Add(-2 * time.Hour) - - // Set state as JOINING - err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (interface{}, bool, error) { - r := &Desc{ - Ingesters: map[string]InstanceDesc{ - "ing1": { - State: JOINING, - Tokens: []uint32{1, 4}, - RegisteredTimestamp: instance1RegisteredAt.Unix(), - }, - "ing2": { - Tokens: []uint32{2, 3}, - RegisteredTimestamp: instance2RegisteredAt.Unix(), - }, - }, - } - - return r, true, nil - }) - require.NoError(t, err) - - l1, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester", IngesterRingKey, true, nil) - 
require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1)) - - // Check that the lifecycler was able to join after coming up in JOINING - test.Poll(t, 1000*time.Millisecond, true, func() interface{} { - d, err := r.KVClient.Get(context.Background(), IngesterRingKey) - require.NoError(t, err) - - desc, ok := d.(*Desc) - return ok && - len(desc.Ingesters) == 2 && - desc.Ingesters["ing1"].State == ACTIVE && - len(desc.Ingesters["ing1"].Tokens) == cfg.NumTokens && - len(desc.Ingesters["ing2"].Tokens) == 2 && - desc.Ingesters["ing1"].RegisteredTimestamp == instance1RegisteredAt.Unix() && - desc.Ingesters["ing2"].RegisteredTimestamp == instance2RegisteredAt.Unix() - }) -} - -func TestRestoreOfZoneWhenOverwritten(t *testing.T) { - // This test is simulating a case during upgrade of pre 1.0 cortex where - // older ingesters do not have the zone field in their ring structs - // so it gets removed. The current version of the lifecylcer should - // write it back on update during its next heartbeat. - - ringStore, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - var ringConfig Config - flagext.DefaultValues(&ringConfig) - ringConfig.KVStore.Mock = ringStore - - r, err := New(ringConfig, "ingester", IngesterRingKey, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), r)) - defer services.StopAndAwaitTerminated(context.Background(), r) //nolint:errcheck - - cfg := testLifecyclerConfig(ringConfig, "ing1") - - // Set ing1 to not have a zone - err = r.KVClient.CAS(context.Background(), IngesterRingKey, func(in interface{}) (interface{}, bool, error) { - r := &Desc{ - Ingesters: map[string]InstanceDesc{ - "ing1": { - State: ACTIVE, - Addr: "0.0.0.0", - Tokens: []uint32{1, 4}, - }, - "ing2": { - Tokens: []uint32{2, 3}, - }, - }, - } - - return r, true, nil - }) - require.NoError(t, err) - - l1, err := NewLifecycler(cfg, &nopFlushTransferer{}, "ingester", IngesterRingKey, true, nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), l1)) - - // Check that the lifecycler was able to reset the zone value to the expected setting - test.Poll(t, 1000*time.Millisecond, true, func() interface{} { - d, err := r.KVClient.Get(context.Background(), IngesterRingKey) - require.NoError(t, err) - desc, ok := d.(*Desc) - return ok && - len(desc.Ingesters) == 2 && - desc.Ingesters["ing1"].Zone == l1.Zone && - desc.Ingesters["ing2"].Zone == "" - - }) -} diff --git a/pkg/ring/merge_test.go b/pkg/ring/merge_test.go deleted file mode 100644 index 08edb24fade..00000000000 --- a/pkg/ring/merge_test.go +++ /dev/null @@ -1,453 +0,0 @@ -package ring - -import ( - "testing" - "time" - - "github.com/stretchr/testify/assert" -) - -func TestNormalizationAndConflictResolution(t *testing.T) { - now := time.Now().Unix() - - first := &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{50, 40, 40, 30}}, - "Ing 2": {Addr: "addr2", Timestamp: 123456, State: LEAVING, Tokens: []uint32{100, 5, 5, 100, 100, 200, 20, 10}}, - "Ing 3": {Addr: "addr3", Timestamp: now, State: LEFT, Tokens: []uint32{100, 200, 300}}, - "Ing 4": {Addr: "addr4", Timestamp: now, State: LEAVING, Tokens: []uint32{30, 40, 50}}, - "Unknown": {Tokens: []uint32{100}}, - }, - } - - second := &Desc{ - Ingesters: map[string]InstanceDesc{ - "Unknown": { - Timestamp: now + 10, - Tokens: 
[]uint32{1000, 2000}, - }, - }, - } - - change, err := first.Merge(second, false) - if err != nil { - t.Fatal(err) - } - changeRing := (*Desc)(nil) - if change != nil { - changeRing = change.(*Desc) - } - - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: 123456, State: LEAVING, Tokens: []uint32{5, 10, 20, 100, 200}}, - "Ing 3": {Addr: "addr3", Timestamp: now, State: LEFT}, - "Ing 4": {Addr: "addr4", Timestamp: now, State: LEAVING}, - "Unknown": {Timestamp: now + 10, Tokens: []uint32{1000, 2000}}, - }, - }, first) - - assert.Equal(t, &Desc{ - // change ring is always normalized, "Unknown" ingester has lost two tokens: 100 from first ring (because of second ring), and 1000 (conflict resolution) - Ingesters: map[string]InstanceDesc{ - "Unknown": {Timestamp: now + 10, Tokens: []uint32{1000, 2000}}, - }, - }, changeRing) -} - -func merge(ring1, ring2 *Desc) (*Desc, *Desc) { - change, err := ring1.Merge(ring2, false) - if err != nil { - panic(err) - } - - if change == nil { - return ring1, nil - } - - changeRing := change.(*Desc) - return ring1, changeRing -} - -func TestMerge(t *testing.T) { - now := time.Now().Unix() - - firstRing := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now, State: JOINING, Tokens: []uint32{5, 10, 20, 100, 200}}, - }, - } - } - - secondRing := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 3": {Addr: "addr3", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{150, 250, 350}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - }, - } - } - - thirdRing := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now + 10, State: LEAVING, Tokens: []uint32{30, 40, 50}}, - "Ing 3": {Addr: "addr3", Timestamp: now + 10, State: ACTIVE, Tokens: []uint32{150, 250, 350}}, - }, - } - } - - expectedFirstSecondMerge := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - "Ing 3": {Addr: "addr3", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{150, 250, 350}}, - }, - } - } - - expectedFirstSecondThirdMerge := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now + 10, State: LEAVING, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - "Ing 3": {Addr: "addr3", Timestamp: now + 10, State: ACTIVE, Tokens: []uint32{150, 250, 350}}, - }, - } - } - - fourthRing := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now + 10, State: LEFT, Tokens: []uint32{30, 40, 50}}, - }, - } - } - - expectedFirstSecondThirdFourthMerge := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now + 10, State: LEFT, Tokens: nil}, - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - "Ing 3": {Addr: "addr3", Timestamp: now + 10, State: ACTIVE, Tokens: []uint32{150, 250, 350}}, - }, - } - } - 
- { - our, ch := merge(firstRing(), secondRing()) - assert.Equal(t, expectedFirstSecondMerge(), our) - assert.Equal(t, secondRing(), ch) // entire second ring is new - } - - { // idempotency: (no change after applying same ring again) - our, ch := merge(expectedFirstSecondMerge(), secondRing()) - assert.Equal(t, expectedFirstSecondMerge(), our) - assert.Equal(t, (*Desc)(nil), ch) - } - - { // commutativity: Merge(first, second) == Merge(second, first) - our, ch := merge(secondRing(), firstRing()) - assert.Equal(t, expectedFirstSecondMerge(), our) - // when merging first into second ring, only "Ing 1" is new - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - }, - }, ch) - } - - { // associativity: Merge(Merge(first, second), third) == Merge(first, Merge(second, third)) - our1, _ := merge(firstRing(), secondRing()) - our1, _ = merge(our1, thirdRing()) - assert.Equal(t, expectedFirstSecondThirdMerge(), our1) - - our2, _ := merge(secondRing(), thirdRing()) - our2, _ = merge(our2, firstRing()) - assert.Equal(t, expectedFirstSecondThirdMerge(), our2) - } - - { - out, ch := merge(expectedFirstSecondThirdMerge(), fourthRing()) - assert.Equal(t, expectedFirstSecondThirdFourthMerge(), out) - // entire fourth ring is the update -- but without tokens - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now + 10, State: LEFT, Tokens: nil}, - }, - }, ch) - } -} - -func TestTokensTakeover(t *testing.T) { - now := time.Now().Unix() - - first := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now, State: JOINING, Tokens: []uint32{5, 10, 20}}, // partially migrated from Ing 3 - }, - } - } - - second := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20}}, - "Ing 3": {Addr: "addr3", Timestamp: now + 5, State: LEAVING, Tokens: []uint32{5, 10, 20, 100, 200}}, - }, - } - } - - merged := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20}}, - "Ing 3": {Addr: "addr3", Timestamp: now + 5, State: LEAVING, Tokens: []uint32{100, 200}}, - }, - } - } - - { - our, ch := merge(first(), second()) - assert.Equal(t, merged(), our) - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20}}, - "Ing 3": {Addr: "addr3", Timestamp: now + 5, State: LEAVING, Tokens: []uint32{100, 200}}, // change doesn't contain conflicted tokens - }, - }, ch) - } - - { // idempotency: (no change after applying same ring again) - our, ch := merge(merged(), second()) - assert.Equal(t, merged(), our) - assert.Equal(t, (*Desc)(nil), ch) - } - - { // commutativity: (Merge(first, second) == Merge(second, first) - our, ch := merge(second(), first()) - assert.Equal(t, merged(), our) - - // change is different though - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - }, - }, ch) - } -} - -func TestMergeLeft(t *testing.T) { - now := time.Now().Unix() - - firstRing := 
func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now, State: JOINING, Tokens: []uint32{5, 10, 20, 100, 200}}, - }, - } - } - - secondRing := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 2": {Addr: "addr2", Timestamp: now, State: LEFT, Tokens: []uint32{5, 10, 20, 100, 200}}, - }, - } - } - - expectedFirstSecondMerge := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now, State: LEFT}, - }, - } - } - - thirdRing := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now + 10, State: LEAVING, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now, State: JOINING, Tokens: []uint32{5, 10, 20, 100, 200}}, // from firstRing - }, - } - } - - expectedFirstSecondThirdMerge := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now + 10, State: LEAVING, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now, State: LEFT}, - }, - } - } - - { - our, ch := merge(firstRing(), secondRing()) - assert.Equal(t, expectedFirstSecondMerge(), our) - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 2": {Addr: "addr2", Timestamp: now, State: LEFT}, - }, - }, ch) - } - - { // idempotency: (no change after applying same ring again) - our, ch := merge(expectedFirstSecondMerge(), secondRing()) - assert.Equal(t, expectedFirstSecondMerge(), our) - assert.Equal(t, (*Desc)(nil), ch) - } - - { // commutativity: Merge(first, second) == Merge(second, first) - our, ch := merge(secondRing(), firstRing()) - assert.Equal(t, expectedFirstSecondMerge(), our) - // when merging first into second ring, only "Ing 1" is new - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - }, - }, ch) - } - - { // associativity: Merge(Merge(first, second), third) == Merge(first, Merge(second, third)) - our1, _ := merge(firstRing(), secondRing()) - our1, _ = merge(our1, thirdRing()) - assert.Equal(t, expectedFirstSecondThirdMerge(), our1) - - our2, _ := merge(secondRing(), thirdRing()) - our2, _ = merge(our2, firstRing()) - assert.Equal(t, expectedFirstSecondThirdMerge(), our2) - } -} - -func TestMergeRemoveMissing(t *testing.T) { - now := time.Now().Unix() - - firstRing := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now, State: JOINING, Tokens: []uint32{5, 10, 20, 100, 200}}, - "Ing 3": {Addr: "addr3", Timestamp: now, State: LEAVING, Tokens: []uint32{5, 10, 20, 100, 200}}, - }, - } - } - - secondRing := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - }, - } - } - - expectedFirstSecondMerge := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, 
Tokens: []uint32{5, 10, 20, 100, 200}}, - "Ing 3": {Addr: "addr3", Timestamp: now + 3, State: LEFT}, // When deleting, time depends on value passed to merge function. - }, - } - } - - { - our, ch := mergeLocalCAS(firstRing(), secondRing(), now+3) - assert.Equal(t, expectedFirstSecondMerge(), our) - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - "Ing 3": {Addr: "addr3", Timestamp: now + 3, State: LEFT}, // When deleting, time depends on value passed to merge function. - }, - }, ch) // entire second ring is new - } - - { // idempotency: (no change after applying same ring again, even if time has advanced) - our, ch := mergeLocalCAS(expectedFirstSecondMerge(), secondRing(), now+10) - assert.Equal(t, expectedFirstSecondMerge(), our) - assert.Equal(t, (*Desc)(nil), ch) - } - - { // commutativity is broken when deleting missing entries. But let's make sure we get reasonable results at least. - our, ch := mergeLocalCAS(secondRing(), firstRing(), now+3) - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - "Ing 3": {Addr: "addr3", Timestamp: now, State: LEAVING}, - }, - }, our) - - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 3": {Addr: "addr3", Timestamp: now, State: LEAVING}, - }, - }, ch) - } -} - -func TestMergeMissingIntoLeft(t *testing.T) { - now := time.Now().Unix() - - ring1 := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 5, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - "Ing 3": {Addr: "addr3", Timestamp: now, State: LEFT}, - }, - } - } - - ring2 := func() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now + 10, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 10, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - }, - } - } - - { - our, ch := mergeLocalCAS(ring1(), ring2(), now+10) - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now + 10, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 10, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - "Ing 3": {Addr: "addr3", Timestamp: now, State: LEFT}, - }, - }, our) - - assert.Equal(t, &Desc{ - Ingesters: map[string]InstanceDesc{ - "Ing 1": {Addr: "addr1", Timestamp: now + 10, State: ACTIVE, Tokens: []uint32{30, 40, 50}}, - "Ing 2": {Addr: "addr2", Timestamp: now + 10, State: ACTIVE, Tokens: []uint32{5, 10, 20, 100, 200}}, - // Ing 3 is not changed, it was already LEFT - }, - }, ch) - } -} - -func mergeLocalCAS(ring1, ring2 *Desc, nowUnixTime int64) (*Desc, *Desc) { - change, err := ring1.mergeWithTime(ring2, true, time.Unix(nowUnixTime, 0)) - if err != nil { - panic(err) - } - - if change == nil { - return ring1, nil - } - - changeRing := change.(*Desc) - return ring1, changeRing -} diff --git a/pkg/ring/model_test.go b/pkg/ring/model_test.go deleted file mode 100644 index 1d73e6f98b9..00000000000 --- a/pkg/ring/model_test.go +++ /dev/null @@ -1,418 +0,0 @@ -package ring - -import ( - "testing" - "time" - - "github.com/stretchr/testify/assert" -) 
- -func TestInstanceDesc_IsHealthy_ForIngesterOperations(t *testing.T) { - t.Parallel() - - tests := map[string]struct { - ingester *InstanceDesc - timeout time.Duration - writeExpected bool - readExpected bool - reportExpected bool - }{ - "ACTIVE ingester with last keepalive newer than timeout": { - ingester: &InstanceDesc{State: ACTIVE, Timestamp: time.Now().Add(-30 * time.Second).Unix()}, - timeout: time.Minute, - writeExpected: true, - readExpected: true, - reportExpected: true, - }, - "ACTIVE ingester with last keepalive older than timeout": { - ingester: &InstanceDesc{State: ACTIVE, Timestamp: time.Now().Add(-90 * time.Second).Unix()}, - timeout: time.Minute, - writeExpected: false, - readExpected: false, - reportExpected: false, - }, - "JOINING ingester with last keepalive newer than timeout": { - ingester: &InstanceDesc{State: JOINING, Timestamp: time.Now().Add(-30 * time.Second).Unix()}, - timeout: time.Minute, - writeExpected: false, - readExpected: false, - reportExpected: true, - }, - "LEAVING ingester with last keepalive newer than timeout": { - ingester: &InstanceDesc{State: LEAVING, Timestamp: time.Now().Add(-30 * time.Second).Unix()}, - timeout: time.Minute, - writeExpected: false, - readExpected: true, - reportExpected: true, - }, - } - - for testName, testData := range tests { - testData := testData - - t.Run(testName, func(t *testing.T) { - actual := testData.ingester.IsHealthy(Write, testData.timeout, time.Now()) - assert.Equal(t, testData.writeExpected, actual) - - actual = testData.ingester.IsHealthy(Read, testData.timeout, time.Now()) - assert.Equal(t, testData.readExpected, actual) - - actual = testData.ingester.IsHealthy(Reporting, testData.timeout, time.Now()) - assert.Equal(t, testData.reportExpected, actual) - }) - } -} - -func TestInstanceDesc_GetRegisteredAt(t *testing.T) { - tests := map[string]struct { - desc *InstanceDesc - expected time.Time - }{ - "should return zero value on nil desc": { - desc: nil, - expected: time.Time{}, - }, - "should return zero value registered timestamp = 0": { - desc: &InstanceDesc{ - RegisteredTimestamp: 0, - }, - expected: time.Time{}, - }, - "should return timestamp parsed from desc": { - desc: &InstanceDesc{ - RegisteredTimestamp: time.Unix(10000000, 0).Unix(), - }, - expected: time.Unix(10000000, 0), - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - assert.True(t, testData.desc.GetRegisteredAt().Equal(testData.expected)) - }) - } -} - -func normalizedSource() *Desc { - r := NewDesc() - r.Ingesters["first"] = InstanceDesc{ - Tokens: []uint32{100, 200, 300}, - } - r.Ingesters["second"] = InstanceDesc{} - return r -} - -func normalizedOutput() *Desc { - return &Desc{ - Ingesters: map[string]InstanceDesc{ - "first": {}, - "second": {Tokens: []uint32{100, 200, 300}}, - }, - } -} - -func TestClaimTokensFromNormalizedToNormalized(t *testing.T) { - r := normalizedSource() - result := r.ClaimTokens("first", "second") - - assert.Equal(t, Tokens{100, 200, 300}, result) - assert.Equal(t, normalizedOutput(), r) -} - -func TestDesc_Ready(t *testing.T) { - now := time.Now() - - r := &Desc{ - Ingesters: map[string]InstanceDesc{ - "ing1": { - Tokens: []uint32{100, 200, 300}, - State: ACTIVE, - Timestamp: now.Unix(), - }, - }, - } - - if err := r.Ready(now, 10*time.Second); err != nil { - t.Fatal("expected ready, got", err) - } - - if err := r.Ready(now, 0); err != nil { - t.Fatal("expected ready, got", err) - } - - if err := r.Ready(now.Add(5*time.Minute), 10*time.Second); err == nil { - 
t.Fatal("expected !ready (no heartbeat from active ingester), but got no error") - } - - if err := r.Ready(now.Add(5*time.Minute), 0); err != nil { - t.Fatal("expected ready (no heartbeat but timeout disabled), got", err) - } - - r = &Desc{ - Ingesters: map[string]InstanceDesc{ - "ing1": { - State: ACTIVE, - Timestamp: now.Unix(), - }, - }, - } - - if err := r.Ready(now, 10*time.Second); err == nil { - t.Fatal("expected !ready (no tokens), but got no error") - } - - r.Ingesters["some ingester"] = InstanceDesc{ - Tokens: []uint32{12345}, - Timestamp: now.Unix(), - } - - if err := r.Ready(now, 10*time.Second); err != nil { - t.Fatal("expected ready, got", err) - } -} - -func TestDesc_getTokensByZone(t *testing.T) { - tests := map[string]struct { - desc *Desc - expected map[string][]uint32 - }{ - "empty ring": { - desc: &Desc{Ingesters: map[string]InstanceDesc{}}, - expected: map[string][]uint32{}, - }, - "single zone": { - desc: &Desc{Ingesters: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Tokens: []uint32{1, 5}, Zone: ""}, - "instance-2": {Addr: "127.0.0.1", Tokens: []uint32{2, 4}, Zone: ""}, - "instance-3": {Addr: "127.0.0.1", Tokens: []uint32{3, 6}, Zone: ""}, - }}, - expected: map[string][]uint32{ - "": {1, 2, 3, 4, 5, 6}, - }, - }, - "multiple zones": { - desc: &Desc{Ingesters: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Tokens: []uint32{1, 5}, Zone: "zone-1"}, - "instance-2": {Addr: "127.0.0.1", Tokens: []uint32{2, 4}, Zone: "zone-1"}, - "instance-3": {Addr: "127.0.0.1", Tokens: []uint32{3, 6}, Zone: "zone-2"}, - }}, - expected: map[string][]uint32{ - "zone-1": {1, 2, 4, 5}, - "zone-2": {3, 6}, - }, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - assert.Equal(t, testData.expected, testData.desc.getTokensByZone()) - }) - } -} - -func TestDesc_TokensFor(t *testing.T) { - tests := map[string]struct { - desc *Desc - expectedMine Tokens - expectedAll Tokens - }{ - "empty ring": { - desc: &Desc{Ingesters: map[string]InstanceDesc{}}, - expectedMine: Tokens(nil), - expectedAll: Tokens{}, - }, - "single zone": { - desc: &Desc{Ingesters: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Tokens: []uint32{1, 5}, Zone: ""}, - "instance-2": {Addr: "127.0.0.1", Tokens: []uint32{2, 4}, Zone: ""}, - "instance-3": {Addr: "127.0.0.1", Tokens: []uint32{3, 6}, Zone: ""}, - }}, - expectedMine: Tokens{1, 5}, - expectedAll: Tokens{1, 2, 3, 4, 5, 6}, - }, - "multiple zones": { - desc: &Desc{Ingesters: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Tokens: []uint32{1, 5}, Zone: "zone-1"}, - "instance-2": {Addr: "127.0.0.1", Tokens: []uint32{2, 4}, Zone: "zone-1"}, - "instance-3": {Addr: "127.0.0.1", Tokens: []uint32{3, 6}, Zone: "zone-2"}, - }}, - expectedMine: Tokens{1, 5}, - expectedAll: Tokens{1, 2, 3, 4, 5, 6}, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - actualMine, actualAll := testData.desc.TokensFor("instance-1") - assert.Equal(t, testData.expectedMine, actualMine) - assert.Equal(t, testData.expectedAll, actualAll) - }) - } -} - -func TestDesc_RingsCompare(t *testing.T) { - tests := map[string]struct { - r1, r2 *Desc - expected CompareResult - }{ - "nil rings": { - r1: nil, - r2: nil, - expected: Equal, - }, - "one nil, one empty ring": { - r1: nil, - r2: &Desc{Ingesters: map[string]InstanceDesc{}}, - expected: Equal, - }, - "two empty rings": { - r1: &Desc{Ingesters: map[string]InstanceDesc{}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{}}, - 
expected: Equal, - }, - "same single instance": { - r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1"}}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1"}}}, - expected: Equal, - }, - "same single instance, different timestamp": { - r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Timestamp: 123456}}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Timestamp: 789012}}}, - expected: EqualButStatesAndTimestamps, - }, - "same single instance, different state": { - r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", State: ACTIVE}}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", State: JOINING}}}, - expected: EqualButStatesAndTimestamps, - }, - "same single instance, different registered timestamp": { - r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", State: ACTIVE, RegisteredTimestamp: 1}}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", State: ACTIVE, RegisteredTimestamp: 2}}}, - expected: Different, - }, - "instance in different zone": { - r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Zone: "one"}}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Zone: "two"}}}, - expected: Different, - }, - "same instance, different address": { - r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1"}}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr2"}}}, - expected: Different, - }, - "more instances in one ring": { - r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1"}, "ing2": {Addr: "ing2"}}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1"}}}, - expected: Different, - }, - "different tokens": { - r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Tokens: []uint32{1, 2, 3}}}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1"}}}, - expected: Different, - }, - "different tokens 2": { - r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Tokens: []uint32{1, 2, 3}}}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Tokens: []uint32{1, 2, 4}}}}, - expected: Different, - }, - "same number of instances, using different IDs": { - r1: &Desc{Ingesters: map[string]InstanceDesc{"ing1": {Addr: "addr1", Tokens: []uint32{1, 2, 3}}}}, - r2: &Desc{Ingesters: map[string]InstanceDesc{"ing2": {Addr: "addr1", Tokens: []uint32{1, 2, 3}}}}, - expected: Different, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - assert.Equal(t, testData.expected, testData.r1.RingCompare(testData.r2)) - assert.Equal(t, testData.expected, testData.r2.RingCompare(testData.r1)) - }) - } -} - -func TestMergeTokens(t *testing.T) { - tests := map[string]struct { - input [][]uint32 - expected []uint32 - }{ - "empty input": { - input: nil, - expected: []uint32{}, - }, - "single instance in input": { - input: [][]uint32{ - {1, 3, 4, 8}, - }, - expected: []uint32{1, 3, 4, 8}, - }, - "multiple instances in input": { - input: [][]uint32{ - {1, 3, 4, 8}, - {0, 2, 6, 9}, - {5, 7, 10, 11}, - }, - expected: []uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, - }, - "some instances have no tokens": { - input: [][]uint32{ - {1, 3, 4, 8}, - {}, - {0, 2, 6, 9}, - {}, - {5, 7, 10, 11}, - }, - expected: []uint32{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - assert.Equal(t, 
testData.expected, MergeTokens(testData.input)) - }) - } -} - -func TestMergeTokensByZone(t *testing.T) { - tests := map[string]struct { - input map[string][][]uint32 - expected map[string][]uint32 - }{ - "empty input": { - input: nil, - expected: map[string][]uint32{}, - }, - "single zone": { - input: map[string][][]uint32{ - "zone-1": { - {1, 3, 4, 8}, - {2, 5, 6, 7}, - }, - }, - expected: map[string][]uint32{ - "zone-1": {1, 2, 3, 4, 5, 6, 7, 8}, - }, - }, - "multiple zones": { - input: map[string][][]uint32{ - "zone-1": { - {1, 3, 4, 8}, - {2, 5, 6, 7}, - }, - "zone-2": { - {3, 5}, - {2, 4}, - }, - }, - expected: map[string][]uint32{ - "zone-1": {1, 2, 3, 4, 5, 6, 7, 8}, - "zone-2": {2, 3, 4, 5}, - }, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - assert.Equal(t, testData.expected, MergeTokensByZone(testData.input)) - }) - } -} diff --git a/pkg/ring/replication_set_test.go b/pkg/ring/replication_set_test.go deleted file mode 100644 index 74ef17ba163..00000000000 --- a/pkg/ring/replication_set_test.go +++ /dev/null @@ -1,243 +0,0 @@ -package ring - -import ( - "context" - "errors" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "go.uber.org/atomic" -) - -func TestReplicationSet_GetAddresses(t *testing.T) { - tests := map[string]struct { - rs ReplicationSet - expected []string - }{ - "should return an empty slice on empty replication set": { - rs: ReplicationSet{}, - expected: []string{}, - }, - "should return instances addresses (no order guaranteed)": { - rs: ReplicationSet{ - Instances: []InstanceDesc{ - {Addr: "127.0.0.1"}, - {Addr: "127.0.0.2"}, - {Addr: "127.0.0.3"}, - }, - }, - expected: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3"}, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - assert.ElementsMatch(t, testData.expected, testData.rs.GetAddresses()) - }) - } -} - -func TestReplicationSet_GetAddressesWithout(t *testing.T) { - tests := map[string]struct { - rs ReplicationSet - expected []string - exclude string - }{ - "should return an empty slice on empty replication set": { - rs: ReplicationSet{}, - expected: []string{}, - exclude: "127.0.0.1", - }, - "non-matching exclusion, should return all addresses": { - rs: ReplicationSet{ - Instances: []InstanceDesc{ - {Addr: "127.0.0.1"}, - {Addr: "127.0.0.2"}, - {Addr: "127.0.0.3"}, - }, - }, - expected: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3"}, - exclude: "127.0.0.4", - }, - "matching exclusion, should return non-excluded addresses": { - rs: ReplicationSet{ - Instances: []InstanceDesc{ - {Addr: "127.0.0.1"}, - {Addr: "127.0.0.2"}, - {Addr: "127.0.0.3"}, - }, - }, - expected: []string{"127.0.0.1", "127.0.0.3"}, - exclude: "127.0.0.2", - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - assert.ElementsMatch(t, testData.expected, testData.rs.GetAddressesWithout(testData.exclude)) - }) - } -} - -var ( - errFailure = errors.New("failed") - errZoneFailure = errors.New("zone failed") -) - -// Return a function that fails starting from failAfter times -func failingFunctionAfter(failAfter int32, delay time.Duration) func(context.Context, *InstanceDesc) (interface{}, error) { - count := atomic.NewInt32(0) - return func(context.Context, *InstanceDesc) (interface{}, error) { - time.Sleep(delay) - if count.Inc() > failAfter { - return nil, errFailure - } - return 1, nil - } -} - -func failingFunctionOnZones(zones ...string) func(context.Context, 
*InstanceDesc) (interface{}, error) { - return func(ctx context.Context, ing *InstanceDesc) (interface{}, error) { - for _, zone := range zones { - if ing.Zone == zone { - return nil, errZoneFailure - } - } - return 1, nil - } -} - -func TestReplicationSet_Do(t *testing.T) { - tests := []struct { - name string - instances []InstanceDesc - maxErrors int - maxUnavailableZones int - f func(context.Context, *InstanceDesc) (interface{}, error) - delay time.Duration - cancelContextDelay time.Duration - want []interface{} - expectedError error - }{ - { - name: "max errors = 0, no errors no delay", - instances: []InstanceDesc{ - {}, - }, - f: func(c context.Context, id *InstanceDesc) (interface{}, error) { - return 1, nil - }, - want: []interface{}{1}, - }, - { - name: "max errors = 0, should fail on 1 error out of 1 instance", - instances: []InstanceDesc{{}}, - f: func(c context.Context, id *InstanceDesc) (interface{}, error) { - return nil, errFailure - }, - want: nil, - expectedError: errFailure, - }, - { - name: "max errors = 0, should fail on 1 error out of 3 instances (last call fails)", - instances: []InstanceDesc{{}, {}, {}}, - f: failingFunctionAfter(2, 10*time.Millisecond), - want: nil, - expectedError: errFailure, - }, - { - name: "max errors = 1, should fail on 3 errors out of 5 instances (last calls fail)", - instances: []InstanceDesc{{}, {}, {}, {}, {}}, - maxErrors: 1, - f: failingFunctionAfter(2, 10*time.Millisecond), - delay: 100 * time.Millisecond, - want: nil, - expectedError: errFailure, - }, - { - name: "max errors = 1, should handle context canceled", - instances: []InstanceDesc{{}, {}, {}}, - maxErrors: 1, - f: func(c context.Context, id *InstanceDesc) (interface{}, error) { - time.Sleep(300 * time.Millisecond) - return 1, nil - }, - cancelContextDelay: 100 * time.Millisecond, - want: nil, - expectedError: context.Canceled, - }, - { - name: "max errors = 0, should succeed on all successful instances", - instances: []InstanceDesc{{Zone: "zone1"}, {Zone: "zone2"}, {Zone: "zone3"}}, - f: func(c context.Context, id *InstanceDesc) (interface{}, error) { - return 1, nil - }, - want: []interface{}{1, 1, 1}, - }, - { - name: "max unavailable zones = 1, should succeed on instances failing in 1 out of 3 zones (3 instances)", - instances: []InstanceDesc{{Zone: "zone1"}, {Zone: "zone2"}, {Zone: "zone3"}}, - f: failingFunctionOnZones("zone1"), - maxUnavailableZones: 1, - want: []interface{}{1, 1}, - }, - { - name: "max unavailable zones = 1, should fail on instances failing in 2 out of 3 zones (3 instances)", - instances: []InstanceDesc{{Zone: "zone1"}, {Zone: "zone2"}, {Zone: "zone3"}}, - f: failingFunctionOnZones("zone1", "zone2"), - maxUnavailableZones: 1, - expectedError: errZoneFailure, - }, - { - name: "max unavailable zones = 1, should succeed on instances failing in 1 out of 3 zones (6 instances)", - instances: []InstanceDesc{{Zone: "zone1"}, {Zone: "zone1"}, {Zone: "zone2"}, {Zone: "zone2"}, {Zone: "zone3"}, {Zone: "zone3"}}, - f: failingFunctionOnZones("zone1"), - maxUnavailableZones: 1, - want: []interface{}{1, 1, 1, 1}, - }, - { - name: "max unavailable zones = 2, should fail on instances failing in 3 out of 5 zones (5 instances)", - instances: []InstanceDesc{{Zone: "zone1"}, {Zone: "zone2"}, {Zone: "zone3"}, {Zone: "zone4"}, {Zone: "zone5"}}, - f: failingFunctionOnZones("zone1", "zone2", "zone3"), - maxUnavailableZones: 2, - expectedError: errZoneFailure, - }, - { - name: "max unavailable zones = 2, should succeed on instances failing in 2 out of 5 zones (10 instances)", - 
instances: []InstanceDesc{{Zone: "zone1"}, {Zone: "zone1"}, {Zone: "zone2"}, {Zone: "zone2"}, {Zone: "zone3"}, {Zone: "zone3"}, {Zone: "zone4"}, {Zone: "zone4"}, {Zone: "zone5"}, {Zone: "zone5"}}, - f: failingFunctionOnZones("zone1", "zone5"), - maxUnavailableZones: 2, - want: []interface{}{1, 1, 1, 1, 1, 1}, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - // Ensure the test case has been correctly setup (max errors and max unavailable zones are - // mutually exclusive). - require.False(t, tt.maxErrors > 0 && tt.maxUnavailableZones > 0) - - r := ReplicationSet{ - Instances: tt.instances, - MaxErrors: tt.maxErrors, - MaxUnavailableZones: tt.maxUnavailableZones, - } - ctx := context.Background() - if tt.cancelContextDelay > 0 { - var cancel context.CancelFunc - ctx, cancel = context.WithCancel(ctx) - time.AfterFunc(tt.cancelContextDelay, func() { - cancel() - }) - } - got, err := r.Do(ctx, tt.delay, tt.f) - if tt.expectedError != nil { - assert.Equal(t, tt.expectedError, err) - } else { - assert.NoError(t, err) - } - assert.Equal(t, tt.want, got) - }) - } -} diff --git a/pkg/ring/replication_set_tracker_test.go b/pkg/ring/replication_set_tracker_test.go deleted file mode 100644 index f24d23c00a2..00000000000 --- a/pkg/ring/replication_set_tracker_test.go +++ /dev/null @@ -1,266 +0,0 @@ -package ring - -import ( - "errors" - "testing" - - "github.com/stretchr/testify/assert" -) - -func TestDefaultResultTracker(t *testing.T) { - instance1 := InstanceDesc{Addr: "127.0.0.1"} - instance2 := InstanceDesc{Addr: "127.0.0.2"} - instance3 := InstanceDesc{Addr: "127.0.0.3"} - instance4 := InstanceDesc{Addr: "127.0.0.4"} - - tests := map[string]struct { - instances []InstanceDesc - maxErrors int - run func(t *testing.T, tracker *defaultResultTracker) - }{ - "should succeed on no instances to track": { - instances: nil, - maxErrors: 0, - run: func(t *testing.T, tracker *defaultResultTracker) { - assert.True(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - }, - }, - "should succeed once all instances succeed on max errors = 0": { - instances: []InstanceDesc{instance1, instance2, instance3, instance4}, - maxErrors: 0, - run: func(t *testing.T, tracker *defaultResultTracker) { - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance1, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance2, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance3, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance4, nil) - assert.True(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - }, - }, - "should fail on 1st failing instance on max errors = 0": { - instances: []InstanceDesc{instance1, instance2, instance3, instance4}, - maxErrors: 0, - run: func(t *testing.T, tracker *defaultResultTracker) { - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance1, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance2, errors.New("test")) - assert.False(t, tracker.succeeded()) - assert.True(t, tracker.failed()) - }, - }, - "should fail on 2nd failing instance on max errors = 1": { - instances: []InstanceDesc{instance1, instance2, instance3, instance4}, - maxErrors: 1, - run: func(t *testing.T, tracker *defaultResultTracker) { - assert.False(t, tracker.succeeded()) - assert.False(t, 
tracker.failed()) - - tracker.done(&instance1, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance2, errors.New("test")) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance3, errors.New("test")) - assert.False(t, tracker.succeeded()) - assert.True(t, tracker.failed()) - }, - }, - "should fail on 3rd failing instance on max errors = 2": { - instances: []InstanceDesc{instance1, instance2, instance3, instance4}, - maxErrors: 2, - run: func(t *testing.T, tracker *defaultResultTracker) { - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance1, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance2, errors.New("test")) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance3, errors.New("test")) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance4, errors.New("test")) - assert.False(t, tracker.succeeded()) - assert.True(t, tracker.failed()) - }, - }, - } - - for testName, testCase := range tests { - t.Run(testName, func(t *testing.T) { - testCase.run(t, newDefaultResultTracker(testCase.instances, testCase.maxErrors)) - }) - } -} - -func TestZoneAwareResultTracker(t *testing.T) { - instance1 := InstanceDesc{Addr: "127.0.0.1", Zone: "zone-a"} - instance2 := InstanceDesc{Addr: "127.0.0.2", Zone: "zone-a"} - instance3 := InstanceDesc{Addr: "127.0.0.3", Zone: "zone-b"} - instance4 := InstanceDesc{Addr: "127.0.0.4", Zone: "zone-b"} - instance5 := InstanceDesc{Addr: "127.0.0.5", Zone: "zone-c"} - instance6 := InstanceDesc{Addr: "127.0.0.6", Zone: "zone-c"} - - tests := map[string]struct { - instances []InstanceDesc - maxUnavailableZones int - run func(t *testing.T, tracker *zoneAwareResultTracker) - }{ - "should succeed on no instances to track": { - instances: nil, - maxUnavailableZones: 0, - run: func(t *testing.T, tracker *zoneAwareResultTracker) { - assert.True(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - }, - }, - "should succeed once all instances succeed on max unavailable zones = 0": { - instances: []InstanceDesc{instance1, instance2, instance3}, - maxUnavailableZones: 0, - run: func(t *testing.T, tracker *zoneAwareResultTracker) { - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance1, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance2, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance3, nil) - assert.True(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - }, - }, - "should fail on 1st failing instance on max unavailable zones = 0": { - instances: []InstanceDesc{instance1, instance2, instance3, instance4, instance5, instance6}, - maxUnavailableZones: 0, - run: func(t *testing.T, tracker *zoneAwareResultTracker) { - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance1, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance2, errors.New("test")) - assert.False(t, tracker.succeeded()) - assert.True(t, tracker.failed()) - }, - }, - "should succeed on 2 failing instances within the same zone on max unavailable zones = 1": { - instances: []InstanceDesc{instance1, instance2, instance3, instance4, instance5, instance6}, - 
maxUnavailableZones: 1, - run: func(t *testing.T, tracker *zoneAwareResultTracker) { - // Track failing instances. - for _, instance := range []InstanceDesc{instance1, instance2} { - tracker.done(&instance, errors.New("test")) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - } - - // Track successful instances. - for _, instance := range []InstanceDesc{instance3, instance4, instance5} { - tracker.done(&instance, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - } - - tracker.done(&instance6, nil) - assert.True(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - }, - }, - "should succeed as soon as the response has been successfully received from 'all zones - 1' on max unavailable zones = 1": { - instances: []InstanceDesc{instance1, instance2, instance3, instance4, instance5, instance6}, - maxUnavailableZones: 1, - run: func(t *testing.T, tracker *zoneAwareResultTracker) { - // Track successful instances. - for _, instance := range []InstanceDesc{instance1, instance2, instance3} { - tracker.done(&instance, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - } - - tracker.done(&instance4, nil) - assert.True(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - }, - }, - "should succeed on failing instances within 2 zones on max unavailable zones = 2": { - instances: []InstanceDesc{instance1, instance2, instance3, instance4, instance5, instance6}, - maxUnavailableZones: 2, - run: func(t *testing.T, tracker *zoneAwareResultTracker) { - // Track failing instances. - for _, instance := range []InstanceDesc{instance1, instance2, instance3, instance4} { - tracker.done(&instance, errors.New("test")) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - } - - // Track successful instances. - tracker.done(&instance5, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - tracker.done(&instance6, nil) - assert.True(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - }, - }, - "should succeed as soon as the response has been successfully received from 'all zones - 2' on max unavailable zones = 2": { - instances: []InstanceDesc{instance1, instance2, instance3, instance4, instance5, instance6}, - maxUnavailableZones: 2, - run: func(t *testing.T, tracker *zoneAwareResultTracker) { - // Zone-a - tracker.done(&instance1, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - // Zone-b - tracker.done(&instance3, nil) - assert.False(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - - // Zone-a - tracker.done(&instance2, nil) - assert.True(t, tracker.succeeded()) - assert.False(t, tracker.failed()) - }, - }, - } - - for testName, testCase := range tests { - t.Run(testName, func(t *testing.T) { - testCase.run(t, newZoneAwareResultTracker(testCase.instances, testCase.maxUnavailableZones)) - }) - } -} diff --git a/pkg/ring/replication_strategy_test.go b/pkg/ring/replication_strategy_test.go deleted file mode 100644 index 0bce73350b2..00000000000 --- a/pkg/ring/replication_strategy_test.go +++ /dev/null @@ -1,165 +0,0 @@ -package ring - -import ( - "fmt" - "testing" - "time" - - "github.com/stretchr/testify/assert" -) - -func TestRingReplicationStrategy(t *testing.T) { - for i, tc := range []struct { - replicationFactor, liveIngesters, deadIngesters int - expectedMaxFailure int - expectedError string - }{ - // Ensure it works for a single ingester, for local testing. 
- { - replicationFactor: 1, - liveIngesters: 1, - expectedMaxFailure: 0, - }, - - { - replicationFactor: 1, - deadIngesters: 1, - expectedError: "at least 1 live replicas required, could only find 0", - }, - - // Ensure it works for RF=3 and 2 ingesters. - { - replicationFactor: 3, - liveIngesters: 2, - expectedMaxFailure: 0, - }, - - // Ensure it works for the default production config. - { - replicationFactor: 3, - liveIngesters: 3, - expectedMaxFailure: 1, - }, - - { - replicationFactor: 3, - liveIngesters: 2, - deadIngesters: 1, - expectedMaxFailure: 0, - }, - - { - replicationFactor: 3, - liveIngesters: 1, - deadIngesters: 2, - expectedError: "at least 2 live replicas required, could only find 1", - }, - - // Ensure it works when adding / removing nodes. - - // A node is joining or leaving, replica set expands. - { - replicationFactor: 3, - liveIngesters: 4, - expectedMaxFailure: 1, - }, - - { - replicationFactor: 3, - liveIngesters: 3, - deadIngesters: 1, - expectedMaxFailure: 0, - }, - - { - replicationFactor: 3, - liveIngesters: 2, - deadIngesters: 2, - expectedError: "at least 3 live replicas required, could only find 2", - }, - } { - ingesters := []InstanceDesc{} - for i := 0; i < tc.liveIngesters; i++ { - ingesters = append(ingesters, InstanceDesc{ - Timestamp: time.Now().Unix(), - }) - } - for i := 0; i < tc.deadIngesters; i++ { - ingesters = append(ingesters, InstanceDesc{}) - } - - t.Run(fmt.Sprintf("[%d]", i), func(t *testing.T) { - strategy := NewDefaultReplicationStrategy() - liveIngesters, maxFailure, err := strategy.Filter(ingesters, Read, tc.replicationFactor, 100*time.Second, false) - if tc.expectedError == "" { - assert.NoError(t, err) - assert.Equal(t, tc.liveIngesters, len(liveIngesters)) - assert.Equal(t, tc.expectedMaxFailure, maxFailure) - } else { - assert.EqualError(t, err, tc.expectedError) - } - }) - } -} - -func TestIgnoreUnhealthyInstancesReplicationStrategy(t *testing.T) { - for _, tc := range []struct { - name string - liveIngesters, deadIngesters int - expectedMaxFailure int - expectedError string - }{ - { - name: "with at least 1 healthy instance", - liveIngesters: 1, - expectedMaxFailure: 0, - }, - { - name: "with more healthy instances than unhealthy", - deadIngesters: 1, - liveIngesters: 2, - expectedMaxFailure: 1, - }, - { - name: "with more unhealthy instances than healthy", - deadIngesters: 1, - liveIngesters: 2, - expectedMaxFailure: 1, - }, - { - name: "with equal number of healthy and unhealthy instances", - deadIngesters: 2, - liveIngesters: 2, - expectedMaxFailure: 1, - }, - { - name: "with no healthy instances", - liveIngesters: 0, - deadIngesters: 3, - expectedMaxFailure: 0, - expectedError: "at least 1 healthy replica required, could only find 0", - }, - } { - ingesters := []InstanceDesc{} - for i := 0; i < tc.liveIngesters; i++ { - ingesters = append(ingesters, InstanceDesc{ - Timestamp: time.Now().Unix(), - }) - } - for i := 0; i < tc.deadIngesters; i++ { - ingesters = append(ingesters, InstanceDesc{}) - } - - t.Run(tc.name, func(t *testing.T) { - strategy := NewIgnoreUnhealthyInstancesReplicationStrategy() - liveIngesters, maxFailure, err := strategy.Filter(ingesters, Read, 3, 100*time.Second, false) - if tc.expectedError == "" { - assert.NoError(t, err) - assert.Equal(t, tc.liveIngesters, len(liveIngesters)) - assert.Equal(t, tc.expectedMaxFailure, maxFailure) - } else { - assert.EqualError(t, err, tc.expectedError) - } - }) - } -} diff --git a/pkg/ring/ring_test.go b/pkg/ring/ring_test.go deleted file mode 100644 index 
dc49046e7ae..00000000000 --- a/pkg/ring/ring_test.go +++ /dev/null @@ -1,2136 +0,0 @@ -package ring - -import ( - "context" - "fmt" - "math" - "math/rand" - "sort" - "strconv" - "strings" - "testing" - "time" - - "github.com/go-kit/log" - "github.com/grafana/dskit/flagext" - "github.com/grafana/dskit/kv" - "github.com/grafana/dskit/kv/consul" - "github.com/grafana/dskit/services" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/cortexproject/cortex/pkg/util" - "github.com/cortexproject/cortex/pkg/util/test" -) - -const ( - numTokens = 512 -) - -func BenchmarkBatch10x100(b *testing.B) { - benchmarkBatch(b, 10, 100) -} - -func BenchmarkBatch100x100(b *testing.B) { - benchmarkBatch(b, 100, 100) -} - -func BenchmarkBatch100x1000(b *testing.B) { - benchmarkBatch(b, 100, 1000) -} - -func benchmarkBatch(b *testing.B, numInstances, numKeys int) { - // Make a random ring with N instances, and M tokens per ingests - desc := NewDesc() - takenTokens := []uint32{} - for i := 0; i < numInstances; i++ { - tokens := GenerateTokens(numTokens, takenTokens) - takenTokens = append(takenTokens, tokens...) - desc.AddIngester(fmt.Sprintf("%d", i), fmt.Sprintf("instance-%d", i), strconv.Itoa(i), tokens, ACTIVE, time.Now()) - } - - cfg := Config{} - flagext.DefaultValues(&cfg) - r := Ring{ - cfg: cfg, - ringDesc: desc, - strategy: NewDefaultReplicationStrategy(), - } - - ctx := context.Background() - callback := func(InstanceDesc, []int) error { - return nil - } - cleanup := func() { - } - rnd := rand.New(rand.NewSource(time.Now().UnixNano())) - keys := make([]uint32, numKeys) - // Generate a batch of N random keys, and look them up - b.ResetTimer() - for i := 0; i < b.N; i++ { - generateKeys(rnd, numKeys, keys) - err := DoBatch(ctx, Write, &r, keys, callback, cleanup) - require.NoError(b, err) - } -} - -func generateKeys(r *rand.Rand, numTokens int, dest []uint32) { - for i := 0; i < numTokens; i++ { - dest[i] = r.Uint32() - } -} - -func TestDoBatchZeroInstances(t *testing.T) { - ctx := context.Background() - numKeys := 10 - keys := make([]uint32, numKeys) - rnd := rand.New(rand.NewSource(time.Now().UnixNano())) - generateKeys(rnd, numKeys, keys) - callback := func(InstanceDesc, []int) error { - return nil - } - cleanup := func() { - } - desc := NewDesc() - r := Ring{ - cfg: Config{}, - ringDesc: desc, - strategy: NewDefaultReplicationStrategy(), - } - require.Error(t, DoBatch(ctx, Write, &r, keys, callback, cleanup)) -} - -func TestAddIngester(t *testing.T) { - r := NewDesc() - - const ingName = "ing1" - - now := time.Now() - ing1Tokens := GenerateTokens(128, nil) - - r.AddIngester(ingName, "addr", "1", ing1Tokens, ACTIVE, now) - - assert.Equal(t, "addr", r.Ingesters[ingName].Addr) - assert.Equal(t, ing1Tokens, r.Ingesters[ingName].Tokens) - assert.InDelta(t, time.Now().Unix(), r.Ingesters[ingName].Timestamp, 2) - assert.Equal(t, now.Unix(), r.Ingesters[ingName].RegisteredTimestamp) -} - -func TestAddIngesterReplacesExistingTokens(t *testing.T) { - r := NewDesc() - - const ing1Name = "ing1" - - // old tokens will be replaced - r.Ingesters[ing1Name] = InstanceDesc{ - Tokens: []uint32{11111, 22222, 33333}, - } - - newTokens := GenerateTokens(128, nil) - - r.AddIngester(ing1Name, "addr", "1", newTokens, ACTIVE, time.Now()) - - require.Equal(t, newTokens, r.Ingesters[ing1Name].Tokens) -} - -func TestRing_Get_ZoneAwarenessWithIngesterLeaving(t *testing.T) { - const testCount = 10000 - - tests := map[string]struct { - replicationFactor int - expectedInstances int - 
expectedZones int - }{ - "should succeed if there are enough instances per zone on RF = 3": { - replicationFactor: 3, - expectedInstances: 3, - expectedZones: 3, - }, - "should succeed if there are enough instances per zone on RF = 2": { - replicationFactor: 2, - expectedInstances: 2, - expectedZones: 2, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - r := NewDesc() - instances := map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", State: ACTIVE}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", State: ACTIVE}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", State: ACTIVE}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", State: ACTIVE}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-c", State: LEAVING}, - "instance-6": {Addr: "127.0.0.6", Zone: "zone-c", State: ACTIVE}, - } - var prevTokens []uint32 - for id, instance := range instances { - ingTokens := GenerateTokens(128, prevTokens) - r.AddIngester(id, instance.Addr, instance.Zone, ingTokens, instance.State, time.Now()) - prevTokens = append(prevTokens, ingTokens...) - } - instancesList := make([]InstanceDesc, 0, len(r.GetIngesters())) - for _, v := range r.GetIngesters() { - instancesList = append(instancesList, v) - } - - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Hour, - ReplicationFactor: testData.replicationFactor, - ZoneAwarenessEnabled: true, - }, - ringDesc: r, - ringTokens: r.GetTokens(), - ringTokensByZone: r.getTokensByZone(), - ringInstanceByToken: r.getTokensInfo(), - ringZones: getZones(r.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - _, bufHosts, bufZones := MakeBuffersForGet() - - // Use the GenerateTokens to get an array of random uint32 values. - testValues := GenerateTokens(testCount, nil) - - for i := 0; i < testCount; i++ { - set, err := ring.Get(testValues[i], Write, instancesList, bufHosts, bufZones) - require.NoError(t, err) - - distinctZones := map[string]int{} - for _, instance := range set.Instances { - distinctZones[instance.Zone]++ - } - - assert.Len(t, set.Instances, testData.expectedInstances) - assert.Len(t, distinctZones, testData.expectedZones) - } - }) - } -} - -func TestRing_Get_ZoneAwareness(t *testing.T) { - // Number of tests to run. - const testCount = 10000 - - tests := map[string]struct { - numInstances int - numZones int - replicationFactor int - zoneAwarenessEnabled bool - expectedErr string - expectedInstances int - }{ - "should succeed if there are enough instances per zone on RF = 3": { - numInstances: 16, - numZones: 3, - replicationFactor: 3, - zoneAwarenessEnabled: true, - expectedInstances: 3, - }, - "should fail if there are instances in 1 zone only on RF = 3": { - numInstances: 16, - numZones: 1, - replicationFactor: 3, - zoneAwarenessEnabled: true, - expectedErr: "at least 2 live replicas required across different availability zones, could only find 1", - }, - "should succeed if there are instances in 2 zones on RF = 3": { - numInstances: 16, - numZones: 2, - replicationFactor: 3, - zoneAwarenessEnabled: true, - expectedInstances: 2, - }, - "should succeed if there are instances in 1 zone only on RF = 3 but zone-awareness is disabled": { - numInstances: 16, - numZones: 1, - replicationFactor: 3, - zoneAwarenessEnabled: false, - expectedInstances: 3, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - // Add instances to the ring. 
- r := NewDesc() - var prevTokens []uint32 - for i := 0; i < testData.numInstances; i++ { - name := fmt.Sprintf("ing%v", i) - ingTokens := GenerateTokens(128, prevTokens) - - r.AddIngester(name, fmt.Sprintf("127.0.0.%d", i), fmt.Sprintf("zone-%v", i%testData.numZones), ingTokens, ACTIVE, time.Now()) - - prevTokens = append(prevTokens, ingTokens...) - } - - // Create a ring with the instances - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Hour, - ReplicationFactor: testData.replicationFactor, - ZoneAwarenessEnabled: testData.zoneAwarenessEnabled, - }, - ringDesc: r, - ringTokens: r.GetTokens(), - ringTokensByZone: r.getTokensByZone(), - ringInstanceByToken: r.getTokensInfo(), - ringZones: getZones(r.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - instances := make([]InstanceDesc, 0, len(r.GetIngesters())) - for _, v := range r.GetIngesters() { - instances = append(instances, v) - } - - _, bufHosts, bufZones := MakeBuffersForGet() - - // Use the GenerateTokens to get an array of random uint32 values. - testValues := GenerateTokens(testCount, nil) - - var set ReplicationSet - var err error - for i := 0; i < testCount; i++ { - set, err = ring.Get(testValues[i], Write, instances, bufHosts, bufZones) - if testData.expectedErr != "" { - require.EqualError(t, err, testData.expectedErr) - } else { - require.NoError(t, err) - } - - // Skip the rest of the assertions if we were expecting an error. - if testData.expectedErr != "" { - continue - } - - // Check that we have the expected number of instances for replication. - assert.Equal(t, testData.expectedInstances, len(set.Instances)) - - // Ensure all instances are in a different zone (only if zone-awareness is enabled). - if testData.zoneAwarenessEnabled { - zones := make(map[string]struct{}) - for i := 0; i < len(set.Instances); i++ { - if _, ok := zones[set.Instances[i].Zone]; ok { - t.Fatal("found multiple instances in the same zone") - } - zones[set.Instances[i].Zone] = struct{}{} - } - } - } - }) - } -} - -func TestRing_GetAllHealthy(t *testing.T) { - const heartbeatTimeout = time.Minute - now := time.Now() - - tests := map[string]struct { - ringInstances map[string]InstanceDesc - expectedErrForRead error - expectedSetForRead []string - expectedErrForWrite error - expectedSetForWrite []string - expectedErrForReporting error - expectedSetForReporting []string - }{ - "should return error on empty ring": { - ringInstances: nil, - expectedErrForRead: ErrEmptyRing, - expectedErrForWrite: ErrEmptyRing, - expectedErrForReporting: ErrEmptyRing, - }, - "should return all healthy instances for the given operation": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: now.Unix()}, - "instance-2": {Addr: "127.0.0.2", State: PENDING, Timestamp: now.Add(-10 * time.Second).Unix()}, - "instance-3": {Addr: "127.0.0.3", State: JOINING, Timestamp: now.Add(-20 * time.Second).Unix()}, - "instance-4": {Addr: "127.0.0.4", State: LEAVING, Timestamp: now.Add(-30 * time.Second).Unix()}, - "instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix()}, - }, - expectedSetForRead: []string{"127.0.0.1", "127.0.0.2", "127.0.0.4"}, - expectedSetForWrite: []string{"127.0.0.1"}, - expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4"}, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - // Init the ring. 
- ringDesc := &Desc{Ingesters: testData.ringInstances} - for id, instance := range ringDesc.Ingesters { - ringDesc.Ingesters[id] = instance - } - - ring := Ring{ - cfg: Config{HeartbeatTimeout: heartbeatTimeout}, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - set, err := ring.GetAllHealthy(Read) - require.Equal(t, testData.expectedErrForRead, err) - assert.ElementsMatch(t, testData.expectedSetForRead, set.GetAddresses()) - - set, err = ring.GetAllHealthy(Write) - require.Equal(t, testData.expectedErrForWrite, err) - assert.ElementsMatch(t, testData.expectedSetForWrite, set.GetAddresses()) - - set, err = ring.GetAllHealthy(Reporting) - require.Equal(t, testData.expectedErrForReporting, err) - assert.ElementsMatch(t, testData.expectedSetForReporting, set.GetAddresses()) - }) - } -} - -func TestRing_GetReplicationSetForOperation(t *testing.T) { - now := time.Now() - - tests := map[string]struct { - ringInstances map[string]InstanceDesc - ringHeartbeatTimeout time.Duration - ringReplicationFactor int - expectedErrForRead error - expectedSetForRead []string - expectedErrForWrite error - expectedSetForWrite []string - expectedErrForReporting error - expectedSetForReporting []string - }{ - "should return error on empty ring": { - ringInstances: nil, - ringHeartbeatTimeout: time.Minute, - ringReplicationFactor: 1, - expectedErrForRead: ErrEmptyRing, - expectedErrForWrite: ErrEmptyRing, - expectedErrForReporting: ErrEmptyRing, - }, - "should succeed on all healthy instances and RF=1": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: now.Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", State: ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", State: ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-40 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - }, - ringHeartbeatTimeout: time.Minute, - ringReplicationFactor: 1, - expectedSetForRead: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"}, - expectedSetForWrite: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"}, - expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"}, - }, - "should succeed on instances with old timestamps but heartbeat timeout disabled": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)}, - }, 
- ringHeartbeatTimeout: 0, - ringReplicationFactor: 1, - expectedSetForRead: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"}, - expectedSetForWrite: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"}, - expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5"}, - }, - "should fail on 1 unhealthy instance and RF=1": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: now.Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", State: ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", State: ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)}, - }, - ringHeartbeatTimeout: time.Minute, - ringReplicationFactor: 1, - expectedErrForRead: ErrTooManyUnhealthyInstances, - expectedErrForWrite: ErrTooManyUnhealthyInstances, - expectedErrForReporting: ErrTooManyUnhealthyInstances, - }, - "should succeed on 1 unhealthy instances and RF=3": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: now.Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", State: ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", State: ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-30 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)}, - }, - ringHeartbeatTimeout: time.Minute, - ringReplicationFactor: 3, - expectedSetForRead: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4"}, - expectedSetForWrite: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4"}, - expectedSetForReporting: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4"}, - }, - "should fail on 2 unhealthy instances and RF=3": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: now.Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", State: ACTIVE, Timestamp: now.Add(-10 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", State: ACTIVE, Timestamp: now.Add(-20 * time.Second).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: now.Add(-2 * time.Minute).Unix(), Tokens: GenerateTokens(128, nil)}, - }, - ringHeartbeatTimeout: time.Minute, - ringReplicationFactor: 3, - expectedErrForRead: ErrTooManyUnhealthyInstances, - expectedErrForWrite: ErrTooManyUnhealthyInstances, - expectedErrForReporting: ErrTooManyUnhealthyInstances, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - // Init the ring. 
- ringDesc := &Desc{Ingesters: testData.ringInstances} - for id, instance := range ringDesc.Ingesters { - ringDesc.Ingesters[id] = instance - } - - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: testData.ringHeartbeatTimeout, - ReplicationFactor: testData.ringReplicationFactor, - }, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - set, err := ring.GetReplicationSetForOperation(Read) - require.Equal(t, testData.expectedErrForRead, err) - assert.ElementsMatch(t, testData.expectedSetForRead, set.GetAddresses()) - - set, err = ring.GetReplicationSetForOperation(Write) - require.Equal(t, testData.expectedErrForWrite, err) - assert.ElementsMatch(t, testData.expectedSetForWrite, set.GetAddresses()) - - set, err = ring.GetReplicationSetForOperation(Reporting) - require.Equal(t, testData.expectedErrForReporting, err) - assert.ElementsMatch(t, testData.expectedSetForReporting, set.GetAddresses()) - }) - } -} - -func TestRing_GetReplicationSetForOperation_WithZoneAwarenessEnabled(t *testing.T) { - tests := map[string]struct { - ringInstances map[string]InstanceDesc - unhealthyInstances []string - expectedAddresses []string - replicationFactor int - expectedError error - expectedMaxErrors int - expectedMaxUnavailableZones int - }{ - "empty ring": { - ringInstances: nil, - expectedError: ErrEmptyRing, - }, - "RF=1, 1 zone": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2"}, - replicationFactor: 1, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 0, - }, - "RF=1, 1 zone, one unhealthy instance": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - }, - unhealthyInstances: []string{"instance-2"}, - replicationFactor: 1, - expectedError: ErrTooManyUnhealthyInstances, - }, - "RF=1, 3 zones, one unhealthy instance": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - unhealthyInstances: []string{"instance-3"}, - replicationFactor: 1, - expectedError: ErrTooManyUnhealthyInstances, - }, - "RF=2, 2 zones": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2"}, - replicationFactor: 2, - expectedMaxUnavailableZones: 1, - }, - "RF=2, 2 zones, one unhealthy instance": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1"}, - unhealthyInstances: 
[]string{"instance-2"}, - replicationFactor: 2, - }, - "RF=3, 3 zones, one instance per zone": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3"}, - replicationFactor: 3, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 1, - }, - "RF=3, 3 zones, one instance per zone, one instance unhealthy": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.2", "127.0.0.3"}, - unhealthyInstances: []string{"instance-1"}, - replicationFactor: 3, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 0, - }, - "RF=3, 3 zones, one instance per zone, two instances unhealthy in separate zones": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - unhealthyInstances: []string{"instance-1", "instance-2"}, - replicationFactor: 3, - expectedError: ErrTooManyUnhealthyInstances, - }, - "RF=3, 3 zones, one instance per zone, all instances unhealthy": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - unhealthyInstances: []string{"instance-1", "instance-2", "instance-3"}, - replicationFactor: 3, - expectedError: ErrTooManyUnhealthyInstances, - }, - "RF=3, 3 zones, two instances per zone": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5", "127.0.0.6"}, - replicationFactor: 3, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 1, - }, - "RF=3, 3 zones, two instances per zone, two instances unhealthy in same zone": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-c", Tokens: GenerateTokens(128, 
nil)}, - "instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2", "127.0.0.5", "127.0.0.6"}, - unhealthyInstances: []string{"instance-3", "instance-4"}, - replicationFactor: 3, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 0, - }, - "RF=3, 3 zones, three instances per zone, two instances unhealthy in same zone": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-6": {Addr: "127.0.0.6", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-7": {Addr: "127.0.0.7", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-8": {Addr: "127.0.0.8", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-9": {Addr: "127.0.0.9", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.7", "127.0.0.8", "127.0.0.9"}, - unhealthyInstances: []string{"instance-4", "instance-6"}, - replicationFactor: 3, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 0, - }, - "RF=3, only 2 zones, two instances per zone": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4"}, - replicationFactor: 3, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 1, - }, - "RF=3, only 2 zones, two instances per zone, one instance unhealthy": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2"}, - unhealthyInstances: []string{"instance-4"}, - replicationFactor: 3, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 0, - }, - "RF=3, only 1 zone, two instances per zone": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2"}, - replicationFactor: 3, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 0, - }, - "RF=3, only 1 zone, two instances per zone, one instance unhealthy": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - }, - unhealthyInstances: []string{"instance-2"}, - replicationFactor: 3, - 
expectedError: ErrTooManyUnhealthyInstances, - }, - "RF=5, 5 zones, two instances per zone except for one zone which has three": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-7": {Addr: "127.0.0.7", Zone: "zone-d", Tokens: GenerateTokens(128, nil)}, - "instance-8": {Addr: "127.0.0.8", Zone: "zone-d", Tokens: GenerateTokens(128, nil)}, - "instance-9": {Addr: "127.0.0.9", Zone: "zone-e", Tokens: GenerateTokens(128, nil)}, - "instance-10": {Addr: "127.0.0.10", Zone: "zone-e", Tokens: GenerateTokens(128, nil)}, - "instance-11": {Addr: "127.0.0.11", Zone: "zone-e", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2", "127.0.0.3", "127.0.0.4", "127.0.0.5", - "127.0.0.6", "127.0.0.7", "127.0.0.8", "127.0.0.9", "127.0.0.10", "127.0.0.11"}, - replicationFactor: 5, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 2, - }, - "RF=5, 5 zones, two instances per zone except for one zone which has three, 2 unhealthy nodes in same zones": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-7": {Addr: "127.0.0.7", Zone: "zone-d", Tokens: GenerateTokens(128, nil)}, - "instance-8": {Addr: "127.0.0.8", Zone: "zone-d", Tokens: GenerateTokens(128, nil)}, - "instance-9": {Addr: "127.0.0.9", Zone: "zone-e", Tokens: GenerateTokens(128, nil)}, - "instance-10": {Addr: "127.0.0.10", Zone: "zone-e", Tokens: GenerateTokens(128, nil)}, - "instance-11": {Addr: "127.0.0.11", Zone: "zone-e", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2", "127.0.0.5", "127.0.0.6", "127.0.0.7", "127.0.0.8", "127.0.0.9", "127.0.0.10", "127.0.0.11"}, - unhealthyInstances: []string{"instance-3", "instance-4"}, - replicationFactor: 5, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 1, - }, - "RF=5, 5 zones, two instances per zone except for one zone which has three, 2 unhealthy nodes in separate zones": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-7": {Addr: "127.0.0.7", Zone: 
"zone-d", Tokens: GenerateTokens(128, nil)}, - "instance-8": {Addr: "127.0.0.8", Zone: "zone-d", Tokens: GenerateTokens(128, nil)}, - "instance-9": {Addr: "127.0.0.9", Zone: "zone-e", Tokens: GenerateTokens(128, nil)}, - "instance-10": {Addr: "127.0.0.10", Zone: "zone-e", Tokens: GenerateTokens(128, nil)}, - "instance-11": {Addr: "127.0.0.11", Zone: "zone-e", Tokens: GenerateTokens(128, nil)}, - }, - expectedAddresses: []string{"127.0.0.1", "127.0.0.2", "127.0.0.7", "127.0.0.8", "127.0.0.9", "127.0.0.10", "127.0.0.11"}, - unhealthyInstances: []string{"instance-3", "instance-5"}, - replicationFactor: 5, - expectedMaxErrors: 0, - expectedMaxUnavailableZones: 0, - }, - "RF=5, 5 zones, one instances per zone, three unhealthy instances": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-d", Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-e", Tokens: GenerateTokens(128, nil)}, - }, - unhealthyInstances: []string{"instance-2", "instance-4", "instance-5"}, - replicationFactor: 5, - expectedError: ErrTooManyUnhealthyInstances, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - // Ensure the test case has been correctly setup (max errors and max unavailable zones are - // mutually exclusive). - require.False(t, testData.expectedMaxErrors > 0 && testData.expectedMaxUnavailableZones > 0) - - // Init the ring. - ringDesc := &Desc{Ingesters: testData.ringInstances} - for id, instance := range ringDesc.Ingesters { - instance.Timestamp = time.Now().Unix() - instance.State = ACTIVE - for _, instanceName := range testData.unhealthyInstances { - if instanceName == id { - instance.Timestamp = time.Now().Add(-time.Hour).Unix() - } - } - ringDesc.Ingesters[id] = instance - } - - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Minute, - ZoneAwarenessEnabled: true, - ReplicationFactor: testData.replicationFactor, - }, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - // Check the replication set has the correct settings - replicationSet, err := ring.GetReplicationSetForOperation(Read) - if testData.expectedError == nil { - require.NoError(t, err) - } else { - require.Equal(t, testData.expectedError, err) - } - - assert.Equal(t, testData.expectedMaxErrors, replicationSet.MaxErrors) - assert.Equal(t, testData.expectedMaxUnavailableZones, replicationSet.MaxUnavailableZones) - - returnAddresses := []string{} - for _, instance := range replicationSet.Instances { - returnAddresses = append(returnAddresses, instance.Addr) - } - for _, addr := range testData.expectedAddresses { - assert.Contains(t, returnAddresses, addr) - } - assert.Equal(t, len(testData.expectedAddresses), len(replicationSet.Instances)) - }) - } -} - -func TestRing_ShuffleShard(t *testing.T) { - tests := map[string]struct { - ringInstances map[string]InstanceDesc - shardSize int - zoneAwarenessEnabled bool - expectedSize int - expectedDistribution []int - }{ - "empty ring": { - ringInstances: nil, - shardSize: 2, - zoneAwarenessEnabled: true, - expectedSize: 0, - 
expectedDistribution: []int{}, - }, - "single zone, shard size > num instances": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - }, - shardSize: 3, - zoneAwarenessEnabled: true, - expectedSize: 2, - expectedDistribution: []int{2}, - }, - "single zone, shard size < num instances": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - }, - shardSize: 2, - zoneAwarenessEnabled: true, - expectedSize: 2, - expectedDistribution: []int{2}, - }, - "multiple zones, shard size < num zones": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - shardSize: 2, - zoneAwarenessEnabled: true, - expectedSize: 3, - expectedDistribution: []int{1, 1, 1}, - }, - "multiple zones, shard size divisible by num zones": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - shardSize: 3, - zoneAwarenessEnabled: true, - expectedSize: 3, - expectedDistribution: []int{1, 1, 1}, - }, - "multiple zones, shard size NOT divisible by num zones": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - shardSize: 4, - zoneAwarenessEnabled: true, - expectedSize: 6, - expectedDistribution: []int{2, 2, 2}, - }, - "multiple zones, shard size NOT divisible by num zones, but zone awareness is disabled": { - ringInstances: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-2": {Addr: "127.0.0.2", Zone: "zone-a", Tokens: GenerateTokens(128, nil)}, - "instance-3": {Addr: "127.0.0.3", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-4": {Addr: "127.0.0.4", Zone: "zone-b", Tokens: GenerateTokens(128, nil)}, - "instance-5": {Addr: "127.0.0.5", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - "instance-6": {Addr: "127.0.0.6", Zone: "zone-c", Tokens: GenerateTokens(128, nil)}, - }, - shardSize: 4, - zoneAwarenessEnabled: false, - expectedSize: 4, - }, - } - - for 
testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - // Init the ring. - ringDesc := &Desc{Ingesters: testData.ringInstances} - for id, instance := range ringDesc.Ingesters { - instance.Timestamp = time.Now().Unix() - instance.State = ACTIVE - ringDesc.Ingesters[id] = instance - } - - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Hour, - ZoneAwarenessEnabled: testData.zoneAwarenessEnabled, - }, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - shardRing := ring.ShuffleShard("tenant-id", testData.shardSize) - assert.Equal(t, testData.expectedSize, shardRing.InstancesCount()) - - // Compute the actual distribution of instances across zones. - if testData.zoneAwarenessEnabled { - var actualDistribution []int - - if shardRing.InstancesCount() > 0 { - all, err := shardRing.GetAllHealthy(Read) - require.NoError(t, err) - - countByZone := map[string]int{} - for _, instance := range all.Instances { - countByZone[instance.Zone]++ - } - - for _, count := range countByZone { - actualDistribution = append(actualDistribution, count) - } - } - - assert.ElementsMatch(t, testData.expectedDistribution, actualDistribution) - } - }) - } -} - -// This test asserts on shard stability across multiple invocations and given the same input ring. -func TestRing_ShuffleShard_Stability(t *testing.T) { - var ( - numTenants = 100 - numInstances = 50 - numZones = 3 - numInvocations = 10 - shardSizes = []int{3, 6, 9, 12, 15} - ) - - // Initialise the ring. - ringDesc := &Desc{Ingesters: generateRingInstances(numInstances, numZones, 128)} - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Hour, - ZoneAwarenessEnabled: true, - }, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - for i := 1; i <= numTenants; i++ { - tenantID := fmt.Sprintf("%d", i) - - for _, size := range shardSizes { - r := ring.ShuffleShard(tenantID, size) - expected, err := r.GetAllHealthy(Read) - require.NoError(t, err) - - // Assert that multiple invocations generate the same exact shard. - for n := 0; n < numInvocations; n++ { - r := ring.ShuffleShard(tenantID, size) - actual, err := r.GetAllHealthy(Read) - require.NoError(t, err) - assert.ElementsMatch(t, expected.Instances, actual.Instances) - } - } - } -} - -func TestRing_ShuffleShard_Shuffling(t *testing.T) { - var ( - numTenants = 1000 - numInstances = 90 - numZones = 3 - shardSize = 3 - - // This is the expected theoretical distribution of matching instances - // between different shards, given the settings above. It has been computed - // using this spreadsheet: - // https://docs.google.com/spreadsheets/d/1FXbiWTXi6bdERtamH-IfmpgFq1fNL4GP_KX_yJvbRi4/edit - theoreticalMatchings = map[int]float64{ - 0: 90.2239, - 1: 9.55312, - 2: 0.22217, - 3: 0.00085, - } - ) - - // Initialise the ring instances. To have stable tests we generate tokens using a linear - // distribution. Tokens within the same zone are evenly distributed too. 
- instances := make(map[string]InstanceDesc, numInstances) - for i := 0; i < numInstances; i++ { - id := fmt.Sprintf("instance-%d", i) - instances[id] = InstanceDesc{ - Addr: fmt.Sprintf("127.0.0.%d", i), - Timestamp: time.Now().Unix(), - RegisteredTimestamp: time.Now().Unix(), - State: ACTIVE, - Tokens: generateTokensLinear(i, numInstances, 128), - Zone: fmt.Sprintf("zone-%d", i%numZones), - } - } - - // Initialise the ring. - ringDesc := &Desc{Ingesters: instances} - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Hour, - ZoneAwarenessEnabled: true, - }, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - // Compute the shard for each tenant. - shards := map[string][]string{} - - for i := 1; i <= numTenants; i++ { - tenantID := fmt.Sprintf("%d", i) - r := ring.ShuffleShard(tenantID, shardSize) - set, err := r.GetAllHealthy(Read) - require.NoError(t, err) - - instances := make([]string, 0, len(set.Instances)) - for _, instance := range set.Instances { - instances = append(instances, instance.Addr) - } - - shards[tenantID] = instances - } - - // Compute the distribution of matching instances between every combination of shards. - // The shards comparison is not optimized, but it's fine for a test. - distribution := map[int]int{} - - for currID, currShard := range shards { - for otherID, otherShard := range shards { - if currID == otherID { - continue - } - - numMatching := 0 - for _, c := range currShard { - if util.StringsContain(otherShard, c) { - numMatching++ - } - } - - distribution[numMatching]++ - } - } - - maxCombinations := int(math.Pow(float64(numTenants), 2)) - numTenants - for numMatching, probability := range theoreticalMatchings { - // We allow a max deviance of 10% compared to the theoretical probability, - // clamping it between 1% and 0.2% boundaries. - maxDeviance := math.Min(1, math.Max(0.2, probability*0.1)) - - actual := (float64(distribution[numMatching]) / float64(maxCombinations)) * 100 - assert.InDelta(t, probability, actual, maxDeviance, "numMatching: %d", numMatching) - } -} - -func TestRing_ShuffleShard_Consistency(t *testing.T) { - type change string - - type scenario struct { - name string - numInstances int - numZones int - shardSize int - ringChange change - } - - const ( - numTenants = 100 - add = change("add-instance") - remove = change("remove-instance") - ) - - // Generate all test scenarios. - var scenarios []scenario - for _, numInstances := range []int{20, 30, 40, 50} { - for _, shardSize := range []int{3, 6, 9, 12, 15} { - for _, c := range []change{add, remove} { - scenarios = append(scenarios, scenario{ - name: fmt.Sprintf("instances = %d, shard size = %d, ring operation = %s", numInstances, shardSize, c), - numInstances: numInstances, - numZones: 3, - shardSize: shardSize, - ringChange: c, - }) - } - } - } - - for _, s := range scenarios { - t.Run(s.name, func(t *testing.T) { - // Initialise the ring. 
- ringDesc := &Desc{Ingesters: generateRingInstances(s.numInstances, s.numZones, 128)} - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Hour, - ZoneAwarenessEnabled: true, - }, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - // Compute the initial shard for each tenant. - initial := map[int]ReplicationSet{} - for id := 0; id < numTenants; id++ { - set, err := ring.ShuffleShard(fmt.Sprintf("%d", id), s.shardSize).GetAllHealthy(Read) - require.NoError(t, err) - initial[id] = set - } - - // Update the ring. - switch s.ringChange { - case add: - newID, newDesc := generateRingInstance(s.numInstances+1, 0, 128) - ringDesc.Ingesters[newID] = newDesc - case remove: - // Remove the first one. - for id := range ringDesc.Ingesters { - delete(ringDesc.Ingesters, id) - break - } - } - - ring.ringTokens = ringDesc.GetTokens() - ring.ringTokensByZone = ringDesc.getTokensByZone() - ring.ringInstanceByToken = ringDesc.getTokensInfo() - ring.ringZones = getZones(ringDesc.getTokensByZone()) - - // Compute the update shard for each tenant and compare it with the initial one. - // If the "consistency" property is guaranteed, we expect no more then 1 different instance - // in the updated shard. - for id := 0; id < numTenants; id++ { - updated, err := ring.ShuffleShard(fmt.Sprintf("%d", id), s.shardSize).GetAllHealthy(Read) - require.NoError(t, err) - - added, removed := compareReplicationSets(initial[id], updated) - assert.LessOrEqual(t, len(added), 1) - assert.LessOrEqual(t, len(removed), 1) - } - }) - } -} - -func TestRing_ShuffleShard_ConsistencyOnShardSizeChanged(t *testing.T) { - // Create 30 instances in 3 zones. - ringInstances := map[string]InstanceDesc{} - for i := 0; i < 30; i++ { - name, desc := generateRingInstance(i, i%3, 128) - ringInstances[name] = desc - } - - // Init the ring. - ringDesc := &Desc{Ingesters: ringInstances} - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Hour, - ZoneAwarenessEnabled: true, - }, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - // Get the replication set with shard size = 3. - firstShard := ring.ShuffleShard("tenant-id", 3) - assert.Equal(t, 3, firstShard.InstancesCount()) - - firstSet, err := firstShard.GetAllHealthy(Read) - require.NoError(t, err) - - // Increase shard size to 6. - secondShard := ring.ShuffleShard("tenant-id", 6) - assert.Equal(t, 6, secondShard.InstancesCount()) - - secondSet, err := secondShard.GetAllHealthy(Read) - require.NoError(t, err) - - for _, firstInstance := range firstSet.Instances { - assert.True(t, secondSet.Includes(firstInstance.Addr), "new replication set is expected to include previous instance %s", firstInstance.Addr) - } - - // Increase shard size to 9. - thirdShard := ring.ShuffleShard("tenant-id", 9) - assert.Equal(t, 9, thirdShard.InstancesCount()) - - thirdSet, err := thirdShard.GetAllHealthy(Read) - require.NoError(t, err) - - for _, secondInstance := range secondSet.Instances { - assert.True(t, thirdSet.Includes(secondInstance.Addr), "new replication set is expected to include previous instance %s", secondInstance.Addr) - } - - // Decrease shard size to 6. 
- fourthShard := ring.ShuffleShard("tenant-id", 6) - assert.Equal(t, 6, fourthShard.InstancesCount()) - - fourthSet, err := fourthShard.GetAllHealthy(Read) - require.NoError(t, err) - - // We expect to have the same exact instances we had when the shard size was 6. - for _, secondInstance := range secondSet.Instances { - assert.True(t, fourthSet.Includes(secondInstance.Addr), "new replication set is expected to include previous instance %s", secondInstance.Addr) - } - - // Decrease shard size to 3. - fifthShard := ring.ShuffleShard("tenant-id", 3) - assert.Equal(t, 3, fifthShard.InstancesCount()) - - fifthSet, err := fifthShard.GetAllHealthy(Read) - require.NoError(t, err) - - // We expect to have the same exact instances we had when the shard size was 3. - for _, firstInstance := range firstSet.Instances { - assert.True(t, fifthSet.Includes(firstInstance.Addr), "new replication set is expected to include previous instance %s", firstInstance.Addr) - } -} - -func TestRing_ShuffleShard_ConsistencyOnZonesChanged(t *testing.T) { - // Create 20 instances in 2 zones. - ringInstances := map[string]InstanceDesc{} - for i := 0; i < 20; i++ { - name, desc := generateRingInstance(i, i%2, 128) - ringInstances[name] = desc - } - - // Init the ring. - ringDesc := &Desc{Ingesters: ringInstances} - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Hour, - ZoneAwarenessEnabled: true, - }, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - // Get the replication set with shard size = 2. - firstShard := ring.ShuffleShard("tenant-id", 2) - assert.Equal(t, 2, firstShard.InstancesCount()) - - firstSet, err := firstShard.GetAllHealthy(Read) - require.NoError(t, err) - - // Increase shard size to 4. - secondShard := ring.ShuffleShard("tenant-id", 4) - assert.Equal(t, 4, secondShard.InstancesCount()) - - secondSet, err := secondShard.GetAllHealthy(Read) - require.NoError(t, err) - - for _, firstInstance := range firstSet.Instances { - assert.True(t, secondSet.Includes(firstInstance.Addr), "new replication set is expected to include previous instance %s", firstInstance.Addr) - } - - // Scale up cluster, adding 10 instances in 1 new zone. - for i := 20; i < 30; i++ { - name, desc := generateRingInstance(i, 2, 128) - ringInstances[name] = desc - } - - ring.ringDesc.Ingesters = ringInstances - ring.ringTokens = ringDesc.GetTokens() - ring.ringTokensByZone = ringDesc.getTokensByZone() - ring.ringInstanceByToken = ringDesc.getTokensInfo() - ring.ringZones = getZones(ringDesc.getTokensByZone()) - - // Increase shard size to 6. - thirdShard := ring.ShuffleShard("tenant-id", 6) - assert.Equal(t, 6, thirdShard.InstancesCount()) - - thirdSet, err := thirdShard.GetAllHealthy(Read) - require.NoError(t, err) - - for _, secondInstance := range secondSet.Instances { - assert.True(t, thirdSet.Includes(secondInstance.Addr), "new replication set is expected to include previous instance %s", secondInstance.Addr) - } - - // Increase shard size to 9. 
- fourthShard := ring.ShuffleShard("tenant-id", 9) - assert.Equal(t, 9, fourthShard.InstancesCount()) - - fourthSet, err := fourthShard.GetAllHealthy(Read) - require.NoError(t, err) - - for _, thirdInstance := range thirdSet.Instances { - assert.True(t, fourthSet.Includes(thirdInstance.Addr), "new replication set is expected to include previous instance %s", thirdInstance.Addr) - } -} - -func TestRing_ShuffleShardWithLookback(t *testing.T) { - type eventType int - - const ( - add eventType = iota - remove - test - - lookbackPeriod = time.Hour - userID = "user-1" - ) - - var ( - now = time.Now() - ) - - type event struct { - what eventType - instanceID string - instanceDesc InstanceDesc - shardSize int - expected []string - } - - tests := map[string]struct { - timeline []event - }{ - "single zone, shard size = 1, recently bootstrapped cluster": { - timeline: []event{ - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now.Add(-time.Minute))}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-a", []uint32{userToken(userID, "zone-a", 1) + 1}, now.Add(-time.Minute))}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-a", []uint32{userToken(userID, "zone-a", 2) + 1}, now.Add(-time.Minute))}, - {what: test, shardSize: 1, expected: []string{"instance-1", "instance-2", "instance-3"}}, - }, - }, - "single zone, shard size = 1, instances scale up": { - timeline: []event{ - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 3}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-a", []uint32{userToken(userID, "zone-a", 1) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-a", []uint32{userToken(userID, "zone-a", 2) + 1}, now.Add(-2*lookbackPeriod))}, - {what: test, shardSize: 1, expected: []string{"instance-1"}}, - {what: add, instanceID: "instance-4", instanceDesc: generateRingInstanceWithInfo("instance-4", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 2}, now.Add(-10*time.Minute))}, - {what: test, shardSize: 1, expected: []string{"instance-4" /* lookback: */, "instance-1"}}, - {what: add, instanceID: "instance-5", instanceDesc: generateRingInstanceWithInfo("instance-5", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now.Add(-5*time.Minute))}, - {what: test, shardSize: 1, expected: []string{"instance-5" /* lookback: */, "instance-4", "instance-1"}}, - }, - }, - "single zone, shard size = 1, instances scale down": { - timeline: []event{ - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-a", []uint32{userToken(userID, "zone-a", 1) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-a", []uint32{userToken(userID, "zone-a", 2) + 1}, now.Add(-2*lookbackPeriod))}, - {what: test, shardSize: 1, expected: []string{"instance-1"}}, - {what: remove, instanceID: "instance-3"}, - {what: test, shardSize: 1, expected: 
[]string{"instance-1"}}, - {what: remove, instanceID: "instance-1"}, - {what: test, shardSize: 1, expected: []string{"instance-2"}}, - }, - }, - "single zone, shard size = 1, rollout with instances unregistered (removed and re-added one by one)": { - timeline: []event{ - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-a", []uint32{userToken(userID, "zone-a", 1) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-a", []uint32{userToken(userID, "zone-a", 2) + 1}, now.Add(-2*lookbackPeriod))}, - {what: test, shardSize: 1, expected: []string{"instance-1"}}, - // Rollout instance-3. - {what: remove, instanceID: "instance-3"}, - {what: test, shardSize: 1, expected: []string{"instance-1"}}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-a", []uint32{userToken(userID, "zone-a", 2) + 1}, now)}, - {what: test, shardSize: 1, expected: []string{"instance-1"}}, - // Rollout instance-2. - {what: remove, instanceID: "instance-2"}, - {what: test, shardSize: 1, expected: []string{"instance-1"}}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-a", []uint32{userToken(userID, "zone-a", 1) + 1}, now)}, - {what: test, shardSize: 1, expected: []string{"instance-1"}}, - // Rollout instance-1. - {what: remove, instanceID: "instance-1"}, - {what: test, shardSize: 1, expected: []string{"instance-2" /* side effect: */, "instance-3"}}, - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now)}, - {what: test, shardSize: 1, expected: []string{"instance-1" /* lookback: */, "instance-2" /* side effect: */, "instance-3"}}, - }, - }, - "single zone, shard size = 2, rollout with instances unregistered (removed and re-added one by one)": { - timeline: []event{ - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-a", []uint32{userToken(userID, "zone-a", 1) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-a", []uint32{userToken(userID, "zone-a", 2) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-4", instanceDesc: generateRingInstanceWithInfo("instance-4", "zone-a", []uint32{userToken(userID, "zone-a", 3) + 1}, now.Add(-2*lookbackPeriod))}, - {what: test, shardSize: 2, expected: []string{"instance-1", "instance-2"}}, - // Rollout instance-4. - {what: remove, instanceID: "instance-4"}, - {what: test, shardSize: 2, expected: []string{"instance-1", "instance-2"}}, - {what: add, instanceID: "instance-4", instanceDesc: generateRingInstanceWithInfo("instance-4", "zone-a", []uint32{userToken(userID, "zone-a", 3) + 1}, now)}, - {what: test, shardSize: 2, expected: []string{"instance-1", "instance-2"}}, - // Rollout instance-3. 
- {what: remove, instanceID: "instance-3"}, - {what: test, shardSize: 2, expected: []string{"instance-1", "instance-2"}}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-a", []uint32{userToken(userID, "zone-a", 2) + 1}, now)}, - {what: test, shardSize: 2, expected: []string{"instance-1", "instance-2"}}, - // Rollout instance-2. - {what: remove, instanceID: "instance-2"}, - {what: test, shardSize: 2, expected: []string{"instance-1", "instance-3" /* side effect:*/, "instance-4"}}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-a", []uint32{userToken(userID, "zone-a", 1) + 1}, now)}, - {what: test, shardSize: 2, expected: []string{"instance-1", "instance-2" /* lookback: */, "instance-3" /* side effect:*/, "instance-4"}}, - // Rollout instance-1. - {what: remove, instanceID: "instance-1"}, - {what: test, shardSize: 2, expected: []string{"instance-2" /* lookback: */, "instance-3" /* side effect:*/, "instance-4"}}, - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now)}, - {what: test, shardSize: 2, expected: []string{"instance-1", "instance-2" /* lookback: */, "instance-3" /* side effect:*/, "instance-4"}}, - }, - }, - "single zone, increase shard size": { - timeline: []event{ - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-a", []uint32{userToken(userID, "zone-a", 1) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-a", []uint32{userToken(userID, "zone-a", 2) + 1}, now.Add(-2*lookbackPeriod))}, - {what: test, shardSize: 1, expected: []string{"instance-1"}}, - {what: test, shardSize: 2, expected: []string{"instance-1", "instance-2"}}, - {what: test, shardSize: 3, expected: []string{"instance-1", "instance-2", "instance-3"}}, - {what: test, shardSize: 4, expected: []string{"instance-1", "instance-2", "instance-3"}}, - }, - }, - "multi zone, shard size = 3, recently bootstrapped cluster": { - timeline: []event{ - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now.Add(-time.Minute))}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-b", []uint32{userToken(userID, "zone-b", 1) + 1}, now.Add(-time.Minute))}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-c", []uint32{userToken(userID, "zone-c", 2) + 1}, now.Add(-time.Minute))}, - {what: add, instanceID: "instance-4", instanceDesc: generateRingInstanceWithInfo("instance-4", "zone-a", []uint32{userToken(userID, "zone-a", 3) + 1}, now.Add(-time.Minute))}, - {what: add, instanceID: "instance-5", instanceDesc: generateRingInstanceWithInfo("instance-5", "zone-b", []uint32{userToken(userID, "zone-b", 4) + 1}, now.Add(-time.Minute))}, - {what: add, instanceID: "instance-6", instanceDesc: generateRingInstanceWithInfo("instance-6", "zone-c", []uint32{userToken(userID, "zone-c", 5) + 1}, now.Add(-time.Minute))}, - {what: test, shardSize: 3, expected: []string{"instance-1", "instance-2", "instance-3", "instance-4", 
"instance-5", "instance-6"}}, - }, - }, - "multi zone, shard size = 3, instances scale up": { - timeline: []event{ - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 2}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-b", []uint32{userToken(userID, "zone-b", 1) + 2}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-c", []uint32{userToken(userID, "zone-c", 2) + 2}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-4", instanceDesc: generateRingInstanceWithInfo("instance-4", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 3}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-5", instanceDesc: generateRingInstanceWithInfo("instance-5", "zone-b", []uint32{userToken(userID, "zone-b", 1) + 3}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-6", instanceDesc: generateRingInstanceWithInfo("instance-6", "zone-c", []uint32{userToken(userID, "zone-c", 2) + 3}, now.Add(-2*lookbackPeriod))}, - {what: test, shardSize: 3, expected: []string{"instance-1", "instance-2", "instance-3"}}, - // Scale up. - {what: add, instanceID: "instance-7", instanceDesc: generateRingInstanceWithInfo("instance-7", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now)}, - {what: test, shardSize: 3, expected: []string{"instance-7", "instance-2", "instance-3" /* lookback: */, "instance-1"}}, - {what: add, instanceID: "instance-8", instanceDesc: generateRingInstanceWithInfo("instance-8", "zone-b", []uint32{userToken(userID, "zone-b", 1) + 1}, now)}, - {what: test, shardSize: 3, expected: []string{"instance-7", "instance-8", "instance-3" /* lookback: */, "instance-1", "instance-2"}}, - {what: add, instanceID: "instance-9", instanceDesc: generateRingInstanceWithInfo("instance-9", "zone-c", []uint32{userToken(userID, "zone-c", 2) + 1}, now)}, - {what: test, shardSize: 3, expected: []string{"instance-7", "instance-8", "instance-9" /* lookback: */, "instance-1", "instance-2", "instance-3"}}, - }, - }, - "multi zone, shard size = 3, instances scale down": { - timeline: []event{ - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-b", []uint32{userToken(userID, "zone-b", 1) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-c", []uint32{userToken(userID, "zone-c", 2) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-4", instanceDesc: generateRingInstanceWithInfo("instance-4", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 3}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-5", instanceDesc: generateRingInstanceWithInfo("instance-5", "zone-b", []uint32{userToken(userID, "zone-b", 1) + 3}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-6", instanceDesc: generateRingInstanceWithInfo("instance-6", "zone-c", []uint32{userToken(userID, "zone-c", 2) + 3}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-7", instanceDesc: generateRingInstanceWithInfo("instance-7", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 2}, 
now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-8", instanceDesc: generateRingInstanceWithInfo("instance-8", "zone-b", []uint32{userToken(userID, "zone-b", 1) + 2}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-9", instanceDesc: generateRingInstanceWithInfo("instance-9", "zone-c", []uint32{userToken(userID, "zone-c", 2) + 2}, now.Add(-2*lookbackPeriod))}, - {what: test, shardSize: 3, expected: []string{"instance-1", "instance-2", "instance-3"}}, - // Scale down. - {what: remove, instanceID: "instance-1"}, - {what: test, shardSize: 3, expected: []string{"instance-7", "instance-2", "instance-3"}}, - {what: remove, instanceID: "instance-2"}, - {what: test, shardSize: 3, expected: []string{"instance-7", "instance-8", "instance-3"}}, - {what: remove, instanceID: "instance-3"}, - {what: test, shardSize: 3, expected: []string{"instance-7", "instance-8", "instance-9"}}, - }, - }, - "multi zone, increase shard size": { - timeline: []event{ - {what: add, instanceID: "instance-1", instanceDesc: generateRingInstanceWithInfo("instance-1", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-2", instanceDesc: generateRingInstanceWithInfo("instance-2", "zone-b", []uint32{userToken(userID, "zone-b", 1) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-3", instanceDesc: generateRingInstanceWithInfo("instance-3", "zone-c", []uint32{userToken(userID, "zone-c", 2) + 1}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-4", instanceDesc: generateRingInstanceWithInfo("instance-4", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 3}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-5", instanceDesc: generateRingInstanceWithInfo("instance-5", "zone-b", []uint32{userToken(userID, "zone-b", 1) + 3}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-6", instanceDesc: generateRingInstanceWithInfo("instance-6", "zone-c", []uint32{userToken(userID, "zone-c", 2) + 3}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-7", instanceDesc: generateRingInstanceWithInfo("instance-7", "zone-a", []uint32{userToken(userID, "zone-a", 0) + 2}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-8", instanceDesc: generateRingInstanceWithInfo("instance-8", "zone-b", []uint32{userToken(userID, "zone-b", 1) + 2}, now.Add(-2*lookbackPeriod))}, - {what: add, instanceID: "instance-9", instanceDesc: generateRingInstanceWithInfo("instance-9", "zone-c", []uint32{userToken(userID, "zone-c", 2) + 2}, now.Add(-2*lookbackPeriod))}, - {what: test, shardSize: 3, expected: []string{"instance-1", "instance-2", "instance-3"}}, - {what: test, shardSize: 6, expected: []string{"instance-1", "instance-2", "instance-3", "instance-7", "instance-8", "instance-9"}}, - }, - }, - } - - for testName, testData := range tests { - t.Run(testName, func(t *testing.T) { - // Initialise the ring. - ringDesc := &Desc{Ingesters: map[string]InstanceDesc{}} - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Hour, - ZoneAwarenessEnabled: true, - }, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - // Replay the events on the timeline. 
- for _, event := range testData.timeline { - switch event.what { - case add: - ringDesc.Ingesters[event.instanceID] = event.instanceDesc - - ring.ringTokens = ringDesc.GetTokens() - ring.ringTokensByZone = ringDesc.getTokensByZone() - ring.ringInstanceByToken = ringDesc.getTokensInfo() - ring.ringZones = getZones(ringDesc.getTokensByZone()) - case remove: - delete(ringDesc.Ingesters, event.instanceID) - - ring.ringTokens = ringDesc.GetTokens() - ring.ringTokensByZone = ringDesc.getTokensByZone() - ring.ringInstanceByToken = ringDesc.getTokensInfo() - ring.ringZones = getZones(ringDesc.getTokensByZone()) - case test: - rs, err := ring.ShuffleShardWithLookback(userID, event.shardSize, lookbackPeriod, time.Now()).GetAllHealthy(Read) - require.NoError(t, err) - assert.ElementsMatch(t, event.expected, rs.GetAddresses()) - } - } - }) - } -} - -func TestRing_ShuffleShardWithLookback_CorrectnessWithFuzzy(t *testing.T) { - // The goal of this test is NOT to ensure that the minimum required number of instances - // are returned at any given time, BUT at least all required instances are returned. - var ( - numInitialInstances = []int{9, 30, 60, 90} - numInitialZones = []int{1, 3} - numEvents = 100 - lookbackPeriod = time.Hour - delayBetweenEvents = 5 * time.Minute // 12 events / hour - userID = "user-1" - ) - - for _, numInstances := range numInitialInstances { - for _, numZones := range numInitialZones { - testName := fmt.Sprintf("num instances = %d, num zones = %d", numInstances, numZones) - - t.Run(testName, func(t *testing.T) { - // Randomise the seed but log it in case we need to reproduce the test on failure. - seed := time.Now().UnixNano() - rand.Seed(seed) - t.Log("random generator seed:", seed) - - // Initialise the ring. - ringDesc := &Desc{Ingesters: generateRingInstances(numInstances, numZones, 128)} - ring := Ring{ - cfg: Config{ - HeartbeatTimeout: time.Hour, - ZoneAwarenessEnabled: true, - ReplicationFactor: 3, - }, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - // The simulation starts with the minimum shard size. Random events can later increase it. - shardSize := numZones - - // The simulation assumes the initial ring contains instances registered - // since more than the lookback period. - currTime := time.Now().Add(lookbackPeriod).Add(time.Minute) - - // Add the initial shard to the history. - rs, err := ring.shuffleShard(userID, shardSize, 0, time.Now()).GetReplicationSetForOperation(Read) - require.NoError(t, err) - - history := map[time.Time]ReplicationSet{ - currTime: rs, - } - - // Simulate a progression of random events over the time and, at each iteration of the simuation, - // make sure the subring includes all non-removed instances picked from previous versions of the - // ring up until the lookback period. - nextInstanceID := len(ringDesc.Ingesters) + 1 - - for i := 1; i <= numEvents; i++ { - currTime = currTime.Add(delayBetweenEvents) - - switch r := rand.Intn(100); { - case r < 80: - // Scale up instances by 1. 
- instanceID := fmt.Sprintf("instance-%d", nextInstanceID) - zoneID := fmt.Sprintf("zone-%d", nextInstanceID%numZones) - nextInstanceID++ - - ringDesc.Ingesters[instanceID] = generateRingInstanceWithInfo(instanceID, zoneID, GenerateTokens(128, nil), currTime) - - ring.ringTokens = ringDesc.GetTokens() - ring.ringTokensByZone = ringDesc.getTokensByZone() - ring.ringInstanceByToken = ringDesc.getTokensInfo() - ring.ringZones = getZones(ringDesc.getTokensByZone()) - case r < 90: - // Scale down instances by 1. To make tests reproducible we get the instance IDs, sort them - // and then get a random index (using the random generator initialized with a constant seed). - instanceIDs := make([]string, 0, len(ringDesc.Ingesters)) - for id := range ringDesc.Ingesters { - instanceIDs = append(instanceIDs, id) - } - - sort.Strings(instanceIDs) - - idxToRemove := rand.Intn(len(instanceIDs)) - idToRemove := instanceIDs[idxToRemove] - delete(ringDesc.Ingesters, idToRemove) - - ring.ringTokens = ringDesc.GetTokens() - ring.ringTokensByZone = ringDesc.getTokensByZone() - ring.ringInstanceByToken = ringDesc.getTokensInfo() - ring.ringZones = getZones(ringDesc.getTokensByZone()) - - // Remove the terminated instance from the history. - for ringTime, ringState := range history { - for idx, desc := range ringState.Instances { - // In this simulation instance ID == instance address. - if desc.Addr != idToRemove { - continue - } - - ringState.Instances = append(ringState.Instances[:idx], ringState.Instances[idx+1:]...) - history[ringTime] = ringState - break - } - } - default: - // Scale up shard size (keeping the per-zone balance). - shardSize += numZones - } - - // Add the current shard to the history. - rs, err = ring.shuffleShard(userID, shardSize, 0, time.Now()).GetReplicationSetForOperation(Read) - require.NoError(t, err) - history[currTime] = rs - - // Ensure the shard with lookback includes all instances from previous states of the ring. - rsWithLookback, err := ring.ShuffleShardWithLookback(userID, shardSize, lookbackPeriod, currTime).GetReplicationSetForOperation(Read) - require.NoError(t, err) - - for ringTime, ringState := range history { - if ringTime.Before(currTime.Add(-lookbackPeriod)) { - // This entry from the history is obsolete, we can remove it. 
- delete(history, ringTime) - continue - } - - for _, expectedAddr := range ringState.GetAddresses() { - if !rsWithLookback.Includes(expectedAddr) { - t.Fatalf( - "subring generated after event %d is expected to include instance %s from ring state at time %s but it's missing (actual instances are: %s)", - i, expectedAddr, ringTime.String(), strings.Join(rsWithLookback.GetAddresses(), ", ")) - } - } - } - } - }) - } - } -} - -func BenchmarkRing_ShuffleShard(b *testing.B) { - for _, numInstances := range []int{50, 100, 1000} { - for _, numZones := range []int{1, 3} { - for _, shardSize := range []int{3, 10, 30} { - b.Run(fmt.Sprintf("num instances = %d, num zones = %d, shard size = %d", numInstances, numZones, shardSize), func(b *testing.B) { - benchmarkShuffleSharding(b, numInstances, numZones, 128, shardSize, false) - }) - } - } - } -} - -func BenchmarkRing_ShuffleShardCached(b *testing.B) { - for _, numInstances := range []int{50, 100, 1000} { - for _, numZones := range []int{1, 3} { - for _, shardSize := range []int{3, 10, 30} { - b.Run(fmt.Sprintf("num instances = %d, num zones = %d, shard size = %d", numInstances, numZones, shardSize), func(b *testing.B) { - benchmarkShuffleSharding(b, numInstances, numZones, 128, shardSize, true) - }) - } - } - } -} - -func BenchmarkRing_ShuffleShard_512Tokens(b *testing.B) { - const ( - numInstances = 30 - numZones = 3 - numTokens = 512 - shardSize = 9 - cacheEnabled = false - ) - - benchmarkShuffleSharding(b, numInstances, numZones, numTokens, shardSize, cacheEnabled) -} - -func benchmarkShuffleSharding(b *testing.B, numInstances, numZones, numTokens, shardSize int, cache bool) { - // Initialise the ring. - ringDesc := &Desc{Ingesters: generateRingInstances(numInstances, numZones, numTokens)} - ring := Ring{ - cfg: Config{HeartbeatTimeout: time.Hour, ZoneAwarenessEnabled: true, SubringCacheDisabled: !cache}, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - shuffledSubringCache: map[subringCacheKey]*Ring{}, - strategy: NewDefaultReplicationStrategy(), - lastTopologyChange: time.Now(), - } - - b.ResetTimer() - - for n := 0; n < b.N; n++ { - ring.ShuffleShard("tenant-1", shardSize) - } -} - -func BenchmarkRing_Get(b *testing.B) { - const ( - numInstances = 100 - numZones = 3 - replicationFactor = 3 - ) - - // Initialise the ring. - ringDesc := &Desc{Ingesters: generateRingInstances(numInstances, numZones, numTokens)} - ring := Ring{ - cfg: Config{HeartbeatTimeout: time.Hour, ZoneAwarenessEnabled: true, SubringCacheDisabled: true, ReplicationFactor: replicationFactor}, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - shuffledSubringCache: map[subringCacheKey]*Ring{}, - strategy: NewDefaultReplicationStrategy(), - lastTopologyChange: time.Now(), - } - - buf, bufHosts, bufZones := MakeBuffersForGet() - r := rand.New(rand.NewSource(time.Now().UnixNano())) - - b.ResetTimer() - - for n := 0; n < b.N; n++ { - set, err := ring.Get(r.Uint32(), Write, buf, bufHosts, bufZones) - if err != nil || len(set.Instances) != replicationFactor { - b.Fatal() - } - } -} - -func TestRing_Get_NoMemoryAllocations(t *testing.T) { - // Initialise the ring. 
- ringDesc := &Desc{Ingesters: generateRingInstances(3, 3, 128)} - ring := Ring{ - cfg: Config{HeartbeatTimeout: time.Hour, ZoneAwarenessEnabled: true, SubringCacheDisabled: true, ReplicationFactor: 3}, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - shuffledSubringCache: map[subringCacheKey]*Ring{}, - strategy: NewDefaultReplicationStrategy(), - lastTopologyChange: time.Now(), - } - - buf, bufHosts, bufZones := MakeBuffersForGet() - r := rand.New(rand.NewSource(time.Now().UnixNano())) - - numAllocs := testing.AllocsPerRun(10, func() { - set, err := ring.Get(r.Uint32(), Write, buf, bufHosts, bufZones) - if err != nil || len(set.Instances) != 3 { - t.Fail() - } - }) - - assert.Equal(t, float64(0), numAllocs) -} - -// generateTokensLinear returns tokens with a linear distribution. -func generateTokensLinear(instanceID, numInstances, numTokens int) []uint32 { - tokens := make([]uint32, 0, numTokens) - step := math.MaxUint32 / numTokens - offset := (step / numInstances) * instanceID - - for t := offset; t <= math.MaxUint32; t += step { - tokens = append(tokens, uint32(t)) - } - - return tokens -} - -func generateRingInstances(numInstances, numZones, numTokens int) map[string]InstanceDesc { - instances := make(map[string]InstanceDesc, numInstances) - - for i := 1; i <= numInstances; i++ { - id, desc := generateRingInstance(i, i%numZones, numTokens) - instances[id] = desc - } - - return instances -} - -func generateRingInstance(id, zone, numTokens int) (string, InstanceDesc) { - instanceID := fmt.Sprintf("instance-%d", id) - zoneID := fmt.Sprintf("zone-%d", zone) - - return instanceID, generateRingInstanceWithInfo(instanceID, zoneID, GenerateTokens(numTokens, nil), time.Now()) -} - -func generateRingInstanceWithInfo(addr, zone string, tokens []uint32, registeredAt time.Time) InstanceDesc { - return InstanceDesc{ - Addr: addr, - Timestamp: time.Now().Unix(), - RegisteredTimestamp: registeredAt.Unix(), - State: ACTIVE, - Tokens: tokens, - Zone: zone, - } -} - -// compareReplicationSets returns the list of instance addresses which differ between the two sets. -func compareReplicationSets(first, second ReplicationSet) (added, removed []string) { - for _, instance := range first.Instances { - if !second.Includes(instance.Addr) { - added = append(added, instance.Addr) - } - } - - for _, instance := range second.Instances { - if !first.Includes(instance.Addr) { - removed = append(removed, instance.Addr) - } - } - - return -} - -// This test verifies that ring is getting updates, even after extending check in the loop method. 
-func TestRingUpdates(t *testing.T) { - inmem, closer := consul.NewInMemoryClient(GetCodec(), log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - cfg := Config{ - KVStore: kv.Config{Mock: inmem}, - HeartbeatTimeout: 1 * time.Minute, - ReplicationFactor: 3, - } - - ring, err := New(cfg, "test", "test", nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), ring)) - t.Cleanup(func() { - _ = services.StopAndAwaitTerminated(context.Background(), ring) - }) - - require.Equal(t, 0, ring.InstancesCount()) - - lc1 := startLifecycler(t, cfg, 100*time.Millisecond, 1, 3) - test.Poll(t, 1*time.Second, 1, func() interface{} { - return ring.InstancesCount() - }) - - lc2 := startLifecycler(t, cfg, 100*time.Millisecond, 2, 3) - test.Poll(t, 1*time.Second, 2, func() interface{} { - return ring.InstancesCount() - }) - - lc3 := startLifecycler(t, cfg, 100*time.Millisecond, 3, 3) - test.Poll(t, 1*time.Second, 3, func() interface{} { - return ring.InstancesCount() - }) - - // Sleep for a few seconds (ring timestamp resolution is 1 second, so to verify that ring is updated in the background, - // sleep for 2 seconds) - time.Sleep(2 * time.Second) - - rs, err := ring.GetAllHealthy(Read) - require.NoError(t, err) - - now := time.Now() - for _, ing := range rs.Instances { - require.InDelta(t, now.UnixNano(), time.Unix(ing.Timestamp, 0).UnixNano(), float64(1500*time.Millisecond.Nanoseconds())) - } - - require.NoError(t, services.StopAndAwaitTerminated(context.Background(), lc2)) - test.Poll(t, 1*time.Second, 2, func() interface{} { - return ring.InstancesCount() - }) - - require.NoError(t, services.StopAndAwaitTerminated(context.Background(), lc1)) - test.Poll(t, 1*time.Second, 1, func() interface{} { - return ring.InstancesCount() - }) - - require.NoError(t, services.StopAndAwaitTerminated(context.Background(), lc3)) - test.Poll(t, 1*time.Second, 0, func() interface{} { - return ring.InstancesCount() - }) -} - -func startLifecycler(t *testing.T, cfg Config, heartbeat time.Duration, lifecyclerID int, zones int) *Lifecycler { - lcCfg := LifecyclerConfig{ - RingConfig: cfg, - NumTokens: 16, - HeartbeatPeriod: heartbeat, - ObservePeriod: 0, - JoinAfter: 0, - Zone: fmt.Sprintf("zone-%d", lifecyclerID%zones), - Addr: fmt.Sprintf("addr-%d", lifecyclerID), - ID: fmt.Sprintf("instance-%d", lifecyclerID), - UnregisterOnShutdown: true, - } - - lc, err := NewLifecycler(lcCfg, &noopFlushTransferer{}, "test", "test", false, nil) - require.NoError(t, err) - - lc.AddListener(services.NewListener(nil, nil, nil, nil, func(from services.State, failure error) { - t.Log("lifecycler", lifecyclerID, "failed:", failure) - t.Fail() - })) - - require.NoError(t, services.StartAndAwaitRunning(context.Background(), lc)) - - t.Cleanup(func() { - _ = services.StopAndAwaitTerminated(context.Background(), lc) - }) - - return lc -} - -// This test checks if shuffle-sharded ring can be reused, and whether it receives -// updates from "main" ring. 
-func TestShuffleShardWithCaching(t *testing.T) { - inmem, closer := consul.NewInMemoryClientWithConfig(GetCodec(), consul.Config{ - MaxCasRetries: 20, - CasRetryDelay: 500 * time.Millisecond, - }, log.NewNopLogger(), nil) - t.Cleanup(func() { assert.NoError(t, closer.Close()) }) - - cfg := Config{ - KVStore: kv.Config{Mock: inmem}, - HeartbeatTimeout: 1 * time.Minute, - ReplicationFactor: 3, - ZoneAwarenessEnabled: true, - } - - ring, err := New(cfg, "test", "test", nil) - require.NoError(t, err) - require.NoError(t, services.StartAndAwaitRunning(context.Background(), ring)) - t.Cleanup(func() { - _ = services.StartAndAwaitRunning(context.Background(), ring) - }) - - // We will stop instances later, to see that subring is recomputed. - const numLifecyclers = 6 - const zones = 3 - - lcs := []*Lifecycler(nil) - for i := 0; i < numLifecyclers; i++ { - lc := startLifecycler(t, cfg, 500*time.Millisecond, i, zones) - - lcs = append(lcs, lc) - } - - // Wait until all instances in the ring are ACTIVE. - test.Poll(t, 5*time.Second, numLifecyclers, func() interface{} { - active := 0 - rs, _ := ring.GetReplicationSetForOperation(Read) - for _, ing := range rs.Instances { - if ing.State == ACTIVE { - active++ - } - } - return active - }) - - // Use shardSize = zones, to get one instance from each zone. - const shardSize = zones - const user = "user" - - // This subring should be cached, and reused. - subring := ring.ShuffleShard(user, shardSize) - - // Do 100 iterations over two seconds. Make sure we get the same subring. - const iters = 100 - sleep := (2 * time.Second) / iters - for i := 0; i < iters; i++ { - newSubring := ring.ShuffleShard(user, shardSize) - require.True(t, subring == newSubring, "cached subring reused") - require.Equal(t, shardSize, subring.InstancesCount()) - time.Sleep(sleep) - } - - // Make sure subring has up-to-date timestamps. - { - rs, err := subring.GetReplicationSetForOperation(Read) - require.NoError(t, err) - - now := time.Now() - for _, ing := range rs.Instances { - // Lifecyclers use 500ms refresh, but timestamps use 1s resolution, so we better give it some extra buffer. - assert.InDelta(t, now.UnixNano(), time.Unix(ing.Timestamp, 0).UnixNano(), float64(2*time.Second.Nanoseconds())) - } - } - - // Now stop one lifecycler from each zone. Subring needs to be recomputed. - for i := 0; i < zones; i++ { - require.NoError(t, services.StopAndAwaitTerminated(context.Background(), lcs[i])) - } - - test.Poll(t, 5*time.Second, numLifecyclers-zones, func() interface{} { - return ring.InstancesCount() - }) - - // Change of instances -> new subring needed. - newSubring := ring.ShuffleShard("user", zones) - require.False(t, subring == newSubring) - require.Equal(t, zones, subring.InstancesCount()) - - // Change of shard size -> new subring needed. - subring = newSubring - newSubring = ring.ShuffleShard("user", 1) - require.False(t, subring == newSubring) - // Zone-aware shuffle-shard gives all zones the same number of instances (at least one). - require.Equal(t, zones, newSubring.InstancesCount()) - - // Verify that getting the same subring uses cached instance. - subring = newSubring - newSubring = ring.ShuffleShard("user", 1) - require.True(t, subring == newSubring) - - // But after cleanup, it doesn't. - ring.CleanupShuffleShardCache("user") - newSubring = ring.ShuffleShard("user", 1) - require.False(t, subring == newSubring) -} - -// User shuffle shard token. 
-func userToken(user, zone string, skip int) uint32 { - r := rand.New(rand.NewSource(util.ShuffleShardSeed(user, zone))) - - for ; skip > 0; skip-- { - _ = r.Uint32() - } - return r.Uint32() -} diff --git a/pkg/ring/testutils/testutils.go b/pkg/ring/testutils/testutils.go deleted file mode 100644 index 8d0b1d91cea..00000000000 --- a/pkg/ring/testutils/testutils.go +++ /dev/null @@ -1,26 +0,0 @@ -package testutils - -import ( - "context" - - "github.com/go-kit/log/level" - "github.com/grafana/dskit/kv" - - "github.com/cortexproject/cortex/pkg/ring" - util_log "github.com/cortexproject/cortex/pkg/util/log" -) - -// NumTokens determines the number of tokens owned by the specified -// address -func NumTokens(c kv.Client, name, ringKey string) int { - ringDesc, err := c.Get(context.Background(), ringKey) - - // The ringDesc may be null if the lifecycler hasn't stored the ring - // to the KVStore yet. - if ringDesc == nil || err != nil { - level.Error(util_log.Logger).Log("msg", "error reading consul", "err", err) - return 0 - } - rd := ringDesc.(*ring.Desc) - return len(rd.Ingesters[name].Tokens) -} diff --git a/pkg/ring/tokens_test.go b/pkg/ring/tokens_test.go deleted file mode 100644 index 97d5914e464..00000000000 --- a/pkg/ring/tokens_test.go +++ /dev/null @@ -1,77 +0,0 @@ -package ring - -import ( - "io/ioutil" - "math/rand" - "os" - "path/filepath" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestTokens_Serialization(t *testing.T) { - tokens := make(Tokens, 512) - for i := 0; i < 512; i++ { - tokens = append(tokens, uint32(rand.Int31())) - } - - b, err := tokens.Marshal() - require.NoError(t, err) - - var unmarshaledTokens Tokens - require.NoError(t, unmarshaledTokens.Unmarshal(b)) - require.Equal(t, tokens, unmarshaledTokens) -} - -func TestTokens_Equals(t *testing.T) { - tests := []struct { - first Tokens - second Tokens - expected bool - }{ - { - first: Tokens{}, - second: Tokens{}, - expected: true, - }, - { - first: Tokens{1, 2, 3}, - second: Tokens{1, 2, 3}, - expected: true, - }, - { - first: Tokens{1, 2, 3}, - second: Tokens{3, 2, 1}, - expected: true, - }, - { - first: Tokens{1, 2}, - second: Tokens{1, 2, 3}, - expected: false, - }, - } - - for _, c := range tests { - assert.Equal(t, c.expected, c.first.Equals(c.second)) - assert.Equal(t, c.expected, c.second.Equals(c.first)) - } -} - -func TestLoadTokensFromFile_ShouldGuaranteeSortedTokens(t *testing.T) { - tmpDir, err := ioutil.TempDir("", "test-tokens") - require.NoError(t, err) - t.Cleanup(func() { - os.RemoveAll(tmpDir) - }) - - // Store tokens to file. - orig := Tokens{1, 5, 3} - require.NoError(t, orig.StoreToFile(filepath.Join(tmpDir, "tokens"))) - - // Read back and ensure they're sorted. 
- actual, err := LoadTokensFromFile(filepath.Join(tmpDir, "tokens")) - require.NoError(t, err) - assert.Equal(t, Tokens{1, 3, 5}, actual) -} diff --git a/pkg/ring/util_test.go b/pkg/ring/util_test.go deleted file mode 100644 index 9a4a69c6904..00000000000 --- a/pkg/ring/util_test.go +++ /dev/null @@ -1,301 +0,0 @@ -package ring - -import ( - "context" - "fmt" - "testing" - "time" - - "github.com/pkg/errors" - "github.com/prometheus/client_golang/prometheus" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/mock" - "github.com/stretchr/testify/require" -) - -type RingMock struct { - mock.Mock -} - -func (r *RingMock) Collect(ch chan<- prometheus.Metric) {} - -func (r *RingMock) Describe(ch chan<- *prometheus.Desc) {} - -func (r *RingMock) Get(key uint32, op Operation, bufDescs []InstanceDesc, bufHosts, bufZones []string) (ReplicationSet, error) { - args := r.Called(key, op, bufDescs, bufHosts, bufZones) - return args.Get(0).(ReplicationSet), args.Error(1) -} - -func (r *RingMock) GetAllHealthy(op Operation) (ReplicationSet, error) { - args := r.Called(op) - return args.Get(0).(ReplicationSet), args.Error(1) -} - -func (r *RingMock) GetReplicationSetForOperation(op Operation) (ReplicationSet, error) { - args := r.Called(op) - return args.Get(0).(ReplicationSet), args.Error(1) -} - -func (r *RingMock) ReplicationFactor() int { - return 0 -} - -func (r *RingMock) InstancesCount() int { - return 0 -} - -func (r *RingMock) ShuffleShard(identifier string, size int) ReadRing { - args := r.Called(identifier, size) - return args.Get(0).(ReadRing) -} - -func (r *RingMock) GetInstanceState(instanceID string) (InstanceState, error) { - args := r.Called(instanceID) - return args.Get(0).(InstanceState), args.Error(1) -} - -func (r *RingMock) ShuffleShardWithLookback(identifier string, size int, lookbackPeriod time.Duration, now time.Time) ReadRing { - args := r.Called(identifier, size, lookbackPeriod, now) - return args.Get(0).(ReadRing) -} - -func (r *RingMock) HasInstance(instanceID string) bool { - return true -} - -func (r *RingMock) CleanupShuffleShardCache(identifier string) {} - -func TestGenerateTokens(t *testing.T) { - tokens := GenerateTokens(1000000, nil) - - dups := make(map[uint32]int) - - for ix, v := range tokens { - if ox, ok := dups[v]; ok { - t.Errorf("Found duplicate token %d, tokens[%d]=%d, tokens[%d]=%d", v, ix, tokens[ix], ox, tokens[ox]) - } else { - dups[v] = ix - } - } -} - -func TestGenerateTokensIgnoresOldTokens(t *testing.T) { - first := GenerateTokens(1000000, nil) - second := GenerateTokens(1000000, first) - - dups := make(map[uint32]bool) - - for _, v := range first { - dups[v] = true - } - - for _, v := range second { - if dups[v] { - t.Fatal("GenerateTokens returned old token") - } - } -} - -func TestWaitRingStabilityShouldReturnAsSoonAsMinStabilityIsReachedOnNoChanges(t *testing.T) { - t.Parallel() - - const ( - minStability = 2 * time.Second - maxWaiting = 10 * time.Second - ) - - // Init the ring. 
- ringDesc := &Desc{Ingesters: map[string]InstanceDesc{ - "instance-1": {Addr: "127.0.0.1", State: ACTIVE, Timestamp: time.Now().Unix()}, - "instance-2": {Addr: "127.0.0.2", State: PENDING, Timestamp: time.Now().Unix()}, - "instance-3": {Addr: "127.0.0.3", State: JOINING, Timestamp: time.Now().Unix()}, - "instance-4": {Addr: "127.0.0.4", State: LEAVING, Timestamp: time.Now().Unix()}, - "instance-5": {Addr: "127.0.0.5", State: ACTIVE, Timestamp: time.Now().Unix()}, - }} - - ring := &Ring{ - cfg: Config{HeartbeatTimeout: time.Minute}, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - startTime := time.Now() - require.NoError(t, WaitRingStability(context.Background(), ring, Reporting, minStability, maxWaiting)) - elapsedTime := time.Since(startTime) - - assert.InDelta(t, minStability, elapsedTime, float64(2*time.Second)) -} - -func TestWaitRingStabilityShouldReturnOnceMinStabilityHasBeenReached(t *testing.T) { - t.Parallel() - - const ( - minStability = 3 * time.Second - addInstanceAfter = 2 * time.Second - maxWaiting = 15 * time.Second - ) - - // Init the ring. - ringDesc := &Desc{Ingesters: map[string]InstanceDesc{ - "instance-1": {Addr: "instance-1", State: ACTIVE, Timestamp: time.Now().Unix()}, - "instance-2": {Addr: "instance-2", State: PENDING, Timestamp: time.Now().Unix()}, - "instance-3": {Addr: "instance-3", State: JOINING, Timestamp: time.Now().Unix()}, - "instance-4": {Addr: "instance-4", State: LEAVING, Timestamp: time.Now().Unix()}, - "instance-5": {Addr: "instance-5", State: ACTIVE, Timestamp: time.Now().Unix()}, - }} - - ring := &Ring{ - cfg: Config{HeartbeatTimeout: time.Minute}, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - // Add 1 new instance after some time. - go func() { - time.Sleep(addInstanceAfter) - - ring.mtx.Lock() - defer ring.mtx.Unlock() - - instanceID := fmt.Sprintf("instance-%d", len(ringDesc.Ingesters)+1) - ringDesc.Ingesters[instanceID] = InstanceDesc{Addr: instanceID, State: ACTIVE, Timestamp: time.Now().Unix()} - ring.ringDesc = ringDesc - ring.ringTokens = ringDesc.GetTokens() - ring.ringTokensByZone = ringDesc.getTokensByZone() - ring.ringInstanceByToken = ringDesc.getTokensInfo() - ring.ringZones = getZones(ringDesc.getTokensByZone()) - }() - - startTime := time.Now() - require.NoError(t, WaitRingStability(context.Background(), ring, Reporting, minStability, maxWaiting)) - elapsedTime := time.Since(startTime) - - assert.GreaterOrEqual(t, elapsedTime.Milliseconds(), (minStability + addInstanceAfter).Milliseconds()) - assert.LessOrEqual(t, elapsedTime.Milliseconds(), (minStability + addInstanceAfter + 3*time.Second).Milliseconds()) -} - -func TestWaitRingStabilityShouldReturnErrorIfMaxWaitingIsReached(t *testing.T) { - t.Parallel() - - const ( - minStability = 2 * time.Second - maxWaiting = 7 * time.Second - ) - - // Init the ring. 
- ringDesc := &Desc{Ingesters: map[string]InstanceDesc{ - "instance-1": {Addr: "instance-1", State: ACTIVE, Timestamp: time.Now().Unix()}, - "instance-2": {Addr: "instance-2", State: PENDING, Timestamp: time.Now().Unix()}, - "instance-3": {Addr: "instance-3", State: JOINING, Timestamp: time.Now().Unix()}, - "instance-4": {Addr: "instance-4", State: LEAVING, Timestamp: time.Now().Unix()}, - "instance-5": {Addr: "instance-5", State: ACTIVE, Timestamp: time.Now().Unix()}, - }} - - ring := &Ring{ - cfg: Config{HeartbeatTimeout: time.Minute}, - ringDesc: ringDesc, - ringTokens: ringDesc.GetTokens(), - ringTokensByZone: ringDesc.getTokensByZone(), - ringInstanceByToken: ringDesc.getTokensInfo(), - ringZones: getZones(ringDesc.getTokensByZone()), - strategy: NewDefaultReplicationStrategy(), - } - - // Keep changing the ring. - done := make(chan struct{}) - defer close(done) - go func() { - for { - select { - case <-done: - return - case <-time.After(time.Second): - ring.mtx.Lock() - - instanceID := fmt.Sprintf("instance-%d", len(ringDesc.Ingesters)+1) - ringDesc.Ingesters[instanceID] = InstanceDesc{Addr: instanceID, State: ACTIVE, Timestamp: time.Now().Unix()} - ring.ringDesc = ringDesc - ring.ringTokens = ringDesc.GetTokens() - ring.ringTokensByZone = ringDesc.getTokensByZone() - ring.ringInstanceByToken = ringDesc.getTokensInfo() - ring.ringZones = getZones(ringDesc.getTokensByZone()) - - ring.mtx.Unlock() - } - } - }() - - startTime := time.Now() - require.Equal(t, context.DeadlineExceeded, WaitRingStability(context.Background(), ring, Reporting, minStability, maxWaiting)) - elapsedTime := time.Since(startTime) - - assert.InDelta(t, maxWaiting, elapsedTime, float64(2*time.Second)) -} - -func TestWaitInstanceStateTimeout(t *testing.T) { - t.Parallel() - - const ( - instanceID = "test" - timeoutDuration = time.Second - ) - - ctx, cancel := context.WithTimeout(context.Background(), timeoutDuration) - defer cancel() - - ring := &RingMock{} - ring.On("GetInstanceState", mock.Anything, mock.Anything).Return(ACTIVE, nil) - - err := WaitInstanceState(ctx, ring, instanceID, PENDING) - - assert.Equal(t, context.DeadlineExceeded, err) - ring.AssertCalled(t, "GetInstanceState", instanceID) -} - -func TestWaitInstanceStateTimeoutOnError(t *testing.T) { - t.Parallel() - - const ( - instanceID = "test" - timeoutDuration = time.Second - ) - - ctx, cancel := context.WithTimeout(context.Background(), timeoutDuration) - defer cancel() - - ring := &RingMock{} - ring.On("GetInstanceState", mock.Anything, mock.Anything).Return(PENDING, errors.New("instance not found in the ring")) - - err := WaitInstanceState(ctx, ring, instanceID, ACTIVE) - - assert.Equal(t, context.DeadlineExceeded, err) - ring.AssertCalled(t, "GetInstanceState", instanceID) -} - -func TestWaitInstanceStateExitsAfterActualStateEqualsState(t *testing.T) { - t.Parallel() - - const ( - instanceID = "test" - timeoutDuration = time.Second - ) - - ctx, cancel := context.WithTimeout(context.Background(), timeoutDuration) - defer cancel() - - ring := &RingMock{} - ring.On("GetInstanceState", mock.Anything, mock.Anything).Return(ACTIVE, nil) - - err := WaitInstanceState(ctx, ring, instanceID, ACTIVE) - - assert.Nil(t, err) - ring.AssertNumberOfCalls(t, "GetInstanceState", 1) -} diff --git a/pkg/ruler/client_pool.go b/pkg/ruler/client_pool.go index 717d154e0fc..5338da713d1 100644 --- a/pkg/ruler/client_pool.go +++ b/pkg/ruler/client_pool.go @@ -5,14 +5,13 @@ import ( "github.com/go-kit/log" "github.com/grafana/dskit/grpcclient" + 
"github.com/grafana/dskit/ring/client" "github.com/grafana/dskit/services" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promauto" "google.golang.org/grpc" "google.golang.org/grpc/health/grpc_health_v1" - - "github.com/cortexproject/cortex/pkg/ring/client" ) // ClientsPool is the interface used to get the client from the pool for a specified address. diff --git a/pkg/ruler/lifecycle.go b/pkg/ruler/lifecycle.go index 65bb4bf7123..efa6ae205f9 100644 --- a/pkg/ruler/lifecycle.go +++ b/pkg/ruler/lifecycle.go @@ -1,7 +1,7 @@ package ruler import ( - "github.com/cortexproject/cortex/pkg/ring" + "github.com/grafana/dskit/ring" ) func (r *Ruler) OnRingInstanceRegister(_ *ring.BasicLifecycler, ringDesc ring.Desc, instanceExists bool, instanceID string, instanceDesc ring.InstanceDesc) (ring.InstanceState, ring.Tokens) { diff --git a/pkg/ruler/lifecycle_test.go b/pkg/ruler/lifecycle_test.go index ce5ff570958..c39923c3279 100644 --- a/pkg/ruler/lifecycle_test.go +++ b/pkg/ruler/lifecycle_test.go @@ -7,13 +7,13 @@ import ( "time" "github.com/go-kit/log" + "github.com/grafana/dskit/kv" "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - "github.com/cortexproject/cortex/pkg/ring" - "github.com/cortexproject/cortex/pkg/ring/testutils" "github.com/cortexproject/cortex/pkg/util/test" ) @@ -39,7 +39,7 @@ func TestRulerShutdown(t *testing.T) { // Wait until the tokens are registered in the ring test.Poll(t, 100*time.Millisecond, config.Ring.NumTokens, func() interface{} { - return testutils.NumTokens(ringStore, "localhost", ring.RulerRingKey) + return numTokens(ringStore, "localhost", ring.RulerRingKey) }) require.Equal(t, ring.ACTIVE, r.lifecycler.GetState()) @@ -48,7 +48,7 @@ func TestRulerShutdown(t *testing.T) { // Wait until the tokens are unregistered from the ring test.Poll(t, 100*time.Millisecond, 0, func() interface{} { - return testutils.NumTokens(ringStore, "localhost", ring.RulerRingKey) + return numTokens(ringStore, "localhost", ring.RulerRingKey) }) } @@ -107,3 +107,17 @@ func generateSortedTokens(numTokens int) ring.Tokens { return ring.Tokens(tokens) } + +// numTokens determines the number of tokens owned by the specified +// address +func numTokens(c kv.Client, name, ringKey string) int { + ringDesc, err := c.Get(context.Background(), ringKey) + + // The ringDesc may be null if the lifecycler hasn't stored the ring + // to the KVStore yet. 
+ if ringDesc == nil || err != nil { + return 0 + } + rd := ringDesc.(*ring.Desc) + return len(rd.Ingesters[name].Tokens) +} diff --git a/pkg/ruler/ruler.go b/pkg/ruler/ruler.go index 40d95ffb93c..6cb506dc68c 100644 --- a/pkg/ruler/ruler.go +++ b/pkg/ruler/ruler.go @@ -18,6 +18,7 @@ import ( "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/grpcclient" "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -30,7 +31,6 @@ import ( "golang.org/x/sync/errgroup" "github.com/cortexproject/cortex/pkg/cortexpb" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/ruler/rulespb" "github.com/cortexproject/cortex/pkg/ruler/rulestore" "github.com/cortexproject/cortex/pkg/tenant" @@ -295,10 +295,6 @@ func newRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer, if err = enableSharding(ruler, ringStore); err != nil { return nil, errors.Wrap(err, "setup ruler sharding ring") } - - if reg != nil { - reg.MustRegister(ruler.ring) - } } ruler.Service = services.NewBasicService(ruler.starting, ruler.run, ruler.stopping) @@ -306,7 +302,7 @@ func newRuler(cfg Config, manager MultiTenantManager, reg prometheus.Registerer, } func enableSharding(r *Ruler, ringStore kv.Client) error { - lifecyclerCfg, err := r.cfg.Ring.ToLifecyclerConfig() + lifecyclerCfg, err := r.cfg.Ring.ToLifecyclerConfig(r.logger) if err != nil { return errors.Wrap(err, "failed to initialize ruler's lifecycler config") } @@ -318,12 +314,12 @@ func enableSharding(r *Ruler, ringStore kv.Client) error { delegate = ring.NewAutoForgetDelegate(r.cfg.Ring.HeartbeatTimeout*ringAutoForgetUnhealthyPeriods, delegate, r.logger) rulerRingName := "ruler" - r.lifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, rulerRingName, ring.RulerRingKey, ringStore, delegate, r.logger, r.registry) + r.lifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, rulerRingName, ring.RulerRingKey, ringStore, delegate, r.logger, prometheus.WrapRegistererWithPrefix("cortex_", r.registry)) if err != nil { return errors.Wrap(err, "failed to initialize ruler's lifecycler") } - r.ring, err = ring.NewWithStoreClientAndStrategy(r.cfg.Ring.ToRingConfig(), rulerRingName, ring.RulerRingKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy()) + r.ring, err = ring.NewWithStoreClientAndStrategy(r.cfg.Ring.ToRingConfig(), rulerRingName, ring.RulerRingKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", r.registry), r.logger) if err != nil { return errors.Wrap(err, "failed to initialize ruler's ring") } diff --git a/pkg/ruler/ruler_ring.go b/pkg/ruler/ruler_ring.go index 2231858b03f..b22ce6b4186 100644 --- a/pkg/ruler/ruler_ring.go +++ b/pkg/ruler/ruler_ring.go @@ -6,10 +6,10 @@ import ( "os" "time" + "github.com/go-kit/log" "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" - - "github.com/cortexproject/cortex/pkg/ring" + "github.com/grafana/dskit/ring" ) const ( @@ -71,8 +71,8 @@ func (cfg *RingConfig) RegisterFlags(f *flag.FlagSet) { // ToLifecyclerConfig returns a LifecyclerConfig based on the ruler // ring config. 
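Reviewer aside (illustrative, not from this patch): dskit's ring components no longer hard-code the cortex_ metric prefix (see the basic_lifecycler_metrics.go hunk further down), which is why call sites such as enableSharding above now wrap their registerer with prometheus.WrapRegistererWithPrefix("cortex_", ...). Below is a minimal sketch of that pattern; the package and function name are made up, the metric name is taken from this patch. The ToLifecyclerConfig hunk introduced by the comment above continues right after this note.

package example

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// newHeartbeatCounter shows why the exported series name stays the same:
// dskit now registers "ring_member_heartbeats_total", and the wrapped
// registerer prepends "cortex_", so the collected series is still
// "cortex_ring_member_heartbeats_total".
func newHeartbeatCounter(reg prometheus.Registerer) prometheus.Counter {
	wrapped := prometheus.WrapRegistererWithPrefix("cortex_", reg)
	return promauto.With(wrapped).NewCounter(prometheus.CounterOpts{
		Name: "ring_member_heartbeats_total",
		Help: "The total number of heartbeats sent.",
	})
}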
-func (cfg *RingConfig) ToLifecyclerConfig() (ring.BasicLifecyclerConfig, error) { - instanceAddr, err := ring.GetInstanceAddr(cfg.InstanceAddr, cfg.InstanceInterfaceNames) +func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecyclerConfig, error) { + instanceAddr, err := ring.GetInstanceAddr(cfg.InstanceAddr, cfg.InstanceInterfaceNames, logger) if err != nil { return ring.BasicLifecyclerConfig{}, err } diff --git a/pkg/ruler/ruler_test.go b/pkg/ruler/ruler_test.go index 11c1b85feb4..36feb7a69e8 100644 --- a/pkg/ruler/ruler_test.go +++ b/pkg/ruler/ruler_test.go @@ -34,6 +34,7 @@ import ( "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/prometheus/client_golang/prometheus" prom_testutil "github.com/prometheus/client_golang/prometheus/testutil" @@ -50,7 +51,6 @@ import ( "github.com/cortexproject/cortex/pkg/chunk" "github.com/cortexproject/cortex/pkg/cortexpb" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/ruler/rulespb" "github.com/cortexproject/cortex/pkg/ruler/rulestore" "github.com/cortexproject/cortex/pkg/ruler/rulestore/objectclient" diff --git a/pkg/storegateway/gateway.go b/pkg/storegateway/gateway.go index d5708c609ff..f085364490a 100644 --- a/pkg/storegateway/gateway.go +++ b/pkg/storegateway/gateway.go @@ -10,6 +10,7 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" @@ -19,7 +20,6 @@ import ( "github.com/thanos-io/thanos/pkg/store/storepb" "github.com/weaveworks/common/logging" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/storage/bucket" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" "github.com/cortexproject/cortex/pkg/storegateway/storegatewaypb" @@ -146,7 +146,7 @@ func newStoreGateway(gatewayCfg Config, storageCfg cortex_tsdb.BlocksStorageConf var shardingStrategy ShardingStrategy if gatewayCfg.ShardingEnabled { - lifecyclerCfg, err := gatewayCfg.ShardingRing.ToLifecyclerConfig() + lifecyclerCfg, err := gatewayCfg.ShardingRing.ToLifecyclerConfig(logger) if err != nil { return nil, errors.Wrap(err, "invalid ring lifecycler config") } @@ -158,21 +158,17 @@ func newStoreGateway(gatewayCfg Config, storageCfg cortex_tsdb.BlocksStorageConf delegate = ring.NewTokensPersistencyDelegate(gatewayCfg.ShardingRing.TokensFilePath, ring.JOINING, delegate, logger) delegate = ring.NewAutoForgetDelegate(ringAutoForgetUnhealthyPeriods*gatewayCfg.ShardingRing.HeartbeatTimeout, delegate, logger) - g.ringLifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, RingNameForServer, RingKey, ringStore, delegate, logger, reg) + g.ringLifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, RingNameForServer, RingKey, ringStore, delegate, logger, prometheus.WrapRegistererWithPrefix("cortex_", reg)) if err != nil { return nil, errors.Wrap(err, "create ring lifecycler") } ringCfg := gatewayCfg.ShardingRing.ToRingConfig() - g.ring, err = ring.NewWithStoreClientAndStrategy(ringCfg, RingNameForServer, RingKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy()) + g.ring, err = ring.NewWithStoreClientAndStrategy(ringCfg, RingNameForServer, RingKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", reg), 
logger) if err != nil { return nil, errors.Wrap(err, "create ring client") } - if reg != nil { - reg.MustRegister(g.ring) - } - // Instance the right strategy. switch gatewayCfg.ShardingStrategy { case util.ShardingStrategyDefault: diff --git a/pkg/storegateway/gateway_ring.go b/pkg/storegateway/gateway_ring.go index e31f2eedd8f..636cdce9c3c 100644 --- a/pkg/storegateway/gateway_ring.go +++ b/pkg/storegateway/gateway_ring.go @@ -6,11 +6,12 @@ import ( "os" "time" + "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/ring" - "github.com/cortexproject/cortex/pkg/ring" util_log "github.com/cortexproject/cortex/pkg/util/log" ) @@ -129,8 +130,8 @@ func (cfg *RingConfig) ToRingConfig() ring.Config { return rc } -func (cfg *RingConfig) ToLifecyclerConfig() (ring.BasicLifecyclerConfig, error) { - instanceAddr, err := ring.GetInstanceAddr(cfg.InstanceAddr, cfg.InstanceInterfaceNames) +func (cfg *RingConfig) ToLifecyclerConfig(logger log.Logger) (ring.BasicLifecyclerConfig, error) { + instanceAddr, err := ring.GetInstanceAddr(cfg.InstanceAddr, cfg.InstanceInterfaceNames, logger) if err != nil { return ring.BasicLifecyclerConfig{}, err } diff --git a/pkg/storegateway/gateway_ring_test.go b/pkg/storegateway/gateway_ring_test.go index ef1908f653b..b621a566adf 100644 --- a/pkg/storegateway/gateway_ring_test.go +++ b/pkg/storegateway/gateway_ring_test.go @@ -4,9 +4,8 @@ import ( "testing" "time" + "github.com/grafana/dskit/ring" "github.com/stretchr/testify/assert" - - "github.com/cortexproject/cortex/pkg/ring" ) func TestIsHealthyForStoreGatewayOperations(t *testing.T) { diff --git a/pkg/storegateway/gateway_test.go b/pkg/storegateway/gateway_test.go index e614e17d9be..d27d170f652 100644 --- a/pkg/storegateway/gateway_test.go +++ b/pkg/storegateway/gateway_test.go @@ -19,6 +19,7 @@ import ( "github.com/go-kit/log" "github.com/grafana/dskit/flagext" "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/oklog/ulid" "github.com/pkg/errors" @@ -37,7 +38,6 @@ import ( "github.com/thanos-io/thanos/pkg/store/storepb" "google.golang.org/grpc/status" - "github.com/cortexproject/cortex/pkg/ring" "github.com/cortexproject/cortex/pkg/storage/bucket" "github.com/cortexproject/cortex/pkg/storage/bucket/filesystem" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" diff --git a/pkg/storegateway/sharding_strategy.go b/pkg/storegateway/sharding_strategy.go index 64f0f19de6a..b9a27f5dd70 100644 --- a/pkg/storegateway/sharding_strategy.go +++ b/pkg/storegateway/sharding_strategy.go @@ -5,13 +5,13 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/grafana/dskit/ring" "github.com/oklog/ulid" "github.com/thanos-io/thanos/pkg/block" "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/extprom" "github.com/thanos-io/thanos/pkg/objstore" - "github.com/cortexproject/cortex/pkg/ring" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" ) diff --git a/pkg/storegateway/sharding_strategy_test.go b/pkg/storegateway/sharding_strategy_test.go index ce3298b29d2..9ba549f6275 100644 --- a/pkg/storegateway/sharding_strategy_test.go +++ b/pkg/storegateway/sharding_strategy_test.go @@ -7,6 +7,7 @@ import ( "github.com/go-kit/log" "github.com/grafana/dskit/kv/consul" + "github.com/grafana/dskit/ring" "github.com/grafana/dskit/services" "github.com/oklog/ulid" "github.com/prometheus/client_golang/prometheus" 
@@ -16,7 +17,6 @@ import ( "github.com/thanos-io/thanos/pkg/block/metadata" "github.com/thanos-io/thanos/pkg/extprom" - "github.com/cortexproject/cortex/pkg/ring" cortex_tsdb "github.com/cortexproject/cortex/pkg/storage/tsdb" ) @@ -258,7 +258,7 @@ func TestDefaultShardingStrategy(t *testing.T) { ZoneAwarenessEnabled: testData.zoneAwarenessEnabled, } - r, err := ring.NewWithStoreClientAndStrategy(cfg, "test", "test", store, ring.NewIgnoreUnhealthyInstancesReplicationStrategy()) + r, err := ring.NewWithStoreClientAndStrategy(cfg, "test", "test", store, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), nil, nil) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(ctx, r)) defer services.StopAndAwaitTerminated(ctx, r) //nolint:errcheck @@ -616,7 +616,7 @@ func TestShuffleShardingStrategy(t *testing.T) { SubringCacheDisabled: true, } - r, err := ring.NewWithStoreClientAndStrategy(cfg, "test", "test", store, ring.NewIgnoreUnhealthyInstancesReplicationStrategy()) + r, err := ring.NewWithStoreClientAndStrategy(cfg, "test", "test", store, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), nil, nil) require.NoError(t, err) require.NoError(t, services.StartAndAwaitRunning(ctx, r)) defer services.StopAndAwaitTerminated(ctx, r) //nolint:errcheck diff --git a/pkg/ring/basic_lifecycler.go b/vendor/github.com/grafana/dskit/ring/basic_lifecycler.go similarity index 96% rename from pkg/ring/basic_lifecycler.go rename to vendor/github.com/grafana/dskit/ring/basic_lifecycler.go index 94e464035b6..237bf49c6f2 100644 --- a/pkg/ring/basic_lifecycler.go +++ b/vendor/github.com/grafana/dskit/ring/basic_lifecycler.go @@ -9,13 +9,12 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" - "github.com/grafana/dskit/kv" - "github.com/grafana/dskit/services" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" - "github.com/cortexproject/cortex/pkg/util" - util_log "github.com/cortexproject/cortex/pkg/util/log" + "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/services" + dstime "github.com/grafana/dskit/time" ) type BasicLifecyclerDelegate interface { @@ -183,7 +182,7 @@ func (l *BasicLifecycler) starting(ctx context.Context) error { } func (l *BasicLifecycler) running(ctx context.Context) error { - heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(l.cfg.HeartbeatPeriod) + heartbeatTickerStop, heartbeatTickerChan := dstime.NewDisableableTicker(l.cfg.HeartbeatPeriod) defer heartbeatTickerStop() for { @@ -195,7 +194,7 @@ func (l *BasicLifecycler) running(ctx context.Context) error { f() case <-ctx.Done(): - level.Info(util_log.Logger).Log("msg", "ring lifecycler is shutting down", "ring", l.ringName) + level.Info(l.logger).Log("msg", "ring lifecycler is shutting down", "ring", l.ringName) return nil } } @@ -215,7 +214,7 @@ func (l *BasicLifecycler) stopping(runningError error) error { }() // Heartbeat while the stopping delegate function is running. 
- heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(l.cfg.HeartbeatPeriod) + heartbeatTickerStop, heartbeatTickerChan := dstime.NewDisableableTicker(l.cfg.HeartbeatPeriod) defer heartbeatTickerStop() heartbeatLoop: @@ -293,7 +292,7 @@ func (l *BasicLifecycler) registerInstance(ctx context.Context) error { } func (l *BasicLifecycler) waitStableTokens(ctx context.Context, period time.Duration) error { - heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(l.cfg.HeartbeatPeriod) + heartbeatTickerStop, heartbeatTickerChan := dstime.NewDisableableTicker(l.cfg.HeartbeatPeriod) defer heartbeatTickerStop() // The first observation will occur after the specified period. diff --git a/pkg/ring/basic_lifecycler_delegates.go b/vendor/github.com/grafana/dskit/ring/basic_lifecycler_delegates.go similarity index 100% rename from pkg/ring/basic_lifecycler_delegates.go rename to vendor/github.com/grafana/dskit/ring/basic_lifecycler_delegates.go diff --git a/pkg/ring/basic_lifecycler_metrics.go b/vendor/github.com/grafana/dskit/ring/basic_lifecycler_metrics.go similarity index 85% rename from pkg/ring/basic_lifecycler_metrics.go rename to vendor/github.com/grafana/dskit/ring/basic_lifecycler_metrics.go index d23e9402336..6503baa4a0f 100644 --- a/pkg/ring/basic_lifecycler_metrics.go +++ b/vendor/github.com/grafana/dskit/ring/basic_lifecycler_metrics.go @@ -14,17 +14,17 @@ type BasicLifecyclerMetrics struct { func NewBasicLifecyclerMetrics(ringName string, reg prometheus.Registerer) *BasicLifecyclerMetrics { return &BasicLifecyclerMetrics{ heartbeats: promauto.With(reg).NewCounter(prometheus.CounterOpts{ - Name: "cortex_ring_member_heartbeats_total", + Name: "ring_member_heartbeats_total", Help: "The total number of heartbeats sent.", ConstLabels: prometheus.Labels{"name": ringName}, }), tokensOwned: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ - Name: "cortex_ring_member_tokens_owned", + Name: "ring_member_tokens_owned", Help: "The number of tokens owned in the ring.", ConstLabels: prometheus.Labels{"name": ringName}, }), tokensToOwn: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ - Name: "cortex_ring_member_tokens_to_own", + Name: "ring_member_tokens_to_own", Help: "The number of tokens to own in the ring.", ConstLabels: prometheus.Labels{"name": ringName}, }), diff --git a/pkg/ring/batch.go b/vendor/github.com/grafana/dskit/ring/batch.go similarity index 96% rename from pkg/ring/batch.go rename to vendor/github.com/grafana/dskit/ring/batch.go index 2528569fb35..1e4ee446d91 100644 --- a/pkg/ring/batch.go +++ b/vendor/github.com/grafana/dskit/ring/batch.go @@ -37,9 +37,12 @@ type itemTracker struct { // Callback is passed the instance to target, and the indexes of the keys // to send to that instance. // +// cleanup() is always called, either on an error before starting the batches or after they all finish. +// // Not implemented as a method on Ring so we can test separately. 
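Reviewer aside (illustrative, not from this patch): the cleanup() guarantee documented above means callers no longer need to special-case early failures, since cleanup runs exactly once whether DoBatch errors out before dispatching any batch or after all batches finish. A rough usage sketch follows, with a hypothetical caller and placeholder callback bodies; the DoBatch hunk itself continues immediately below.

package example

import (
	"context"

	"github.com/grafana/dskit/ring"
)

// pushKeys is a hypothetical caller; only the cleanup wiring matters here.
func pushKeys(ctx context.Context, r ring.ReadRing, keys []uint32, release func()) error {
	return ring.DoBatch(ctx, ring.Write, r, keys,
		func(instance ring.InstanceDesc, indexes []int) error {
			// send the keys at `indexes` to `instance` (placeholder body)
			return nil
		},
		// With this patch, release() runs exactly once: on the early error
		// paths added above, or after every batch has completed.
		release,
	)
}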
func DoBatch(ctx context.Context, op Operation, r ReadRing, keys []uint32, callback func(InstanceDesc, []int) error, cleanup func()) error { if r.InstancesCount() <= 0 { + cleanup() return fmt.Errorf("DoBatch: InstancesCount <= 0") } expectedTrackers := len(keys) * (r.ReplicationFactor() + 1) / r.InstancesCount() @@ -54,6 +57,7 @@ func DoBatch(ctx context.Context, op Operation, r ReadRing, keys []uint32, callb for i, key := range keys { replicationSet, err := r.Get(key, op, bufDescs[:0], bufHosts[:0], bufZones[:0]) if err != nil { + cleanup() return err } itemTrackers[i].minSuccess = len(replicationSet.Instances) - replicationSet.MaxErrors diff --git a/pkg/ring/client/pool.go b/vendor/github.com/grafana/dskit/ring/client/pool.go similarity index 92% rename from pkg/ring/client/pool.go rename to vendor/github.com/grafana/dskit/ring/client/pool.go index 243ac0de820..57b462cc414 100644 --- a/pkg/ring/client/pool.go +++ b/vendor/github.com/grafana/dskit/ring/client/pool.go @@ -9,13 +9,12 @@ import ( "github.com/go-kit/log" "github.com/go-kit/log/level" - "github.com/grafana/dskit/services" "github.com/prometheus/client_golang/prometheus" "github.com/weaveworks/common/user" "google.golang.org/grpc/health/grpc_health_v1" - "github.com/cortexproject/cortex/pkg/util" - util_log "github.com/cortexproject/cortex/pkg/util/log" + "github.com/grafana/dskit/ring/util" + "github.com/grafana/dskit/services" ) // PoolClient is the interface that should be implemented by a @@ -161,7 +160,7 @@ func (p *Pool) removeStaleClients() { serviceAddrs, err := p.discovery() if err != nil { - level.Error(util_log.Logger).Log("msg", "error removing stale clients", "err", err) + level.Error(p.logger).Log("msg", "error removing stale clients", "err", err) return } @@ -169,7 +168,7 @@ func (p *Pool) removeStaleClients() { if util.StringsContain(serviceAddrs, addr) { continue } - level.Info(util_log.Logger).Log("msg", "removing stale client", "addr", addr) + level.Info(p.logger).Log("msg", "removing stale client", "addr", addr) p.RemoveClientFor(addr) } } @@ -182,7 +181,7 @@ func (p *Pool) cleanUnhealthy() { if ok { err := healthCheck(client, p.cfg.HealthCheckTimeout) if err != nil { - level.Warn(util_log.Logger).Log("msg", fmt.Sprintf("removing %s failing healthcheck", p.clientName), "addr", addr, "reason", err) + level.Warn(p.logger).Log("msg", fmt.Sprintf("removing %s failing healthcheck", p.clientName), "addr", addr, "reason", err) p.RemoveClientFor(addr) } } diff --git a/pkg/ring/client/ring_service_discovery.go b/vendor/github.com/grafana/dskit/ring/client/ring_service_discovery.go similarity index 91% rename from pkg/ring/client/ring_service_discovery.go rename to vendor/github.com/grafana/dskit/ring/client/ring_service_discovery.go index 797b171c074..2210e754917 100644 --- a/pkg/ring/client/ring_service_discovery.go +++ b/vendor/github.com/grafana/dskit/ring/client/ring_service_discovery.go @@ -3,7 +3,7 @@ package client import ( "errors" - "github.com/cortexproject/cortex/pkg/ring" + "github.com/grafana/dskit/ring" ) func NewRingServiceDiscovery(r ring.ReadRing) PoolServiceDiscovery { diff --git a/pkg/ring/flush.go b/vendor/github.com/grafana/dskit/ring/flush.go similarity index 100% rename from pkg/ring/flush.go rename to vendor/github.com/grafana/dskit/ring/flush.go diff --git a/pkg/ring/http.go b/vendor/github.com/grafana/dskit/ring/http.go similarity index 79% rename from pkg/ring/http.go rename to vendor/github.com/grafana/dskit/ring/http.go index 5da19e30060..9f961cde30e 100644 --- a/pkg/ring/http.go +++ 
b/vendor/github.com/grafana/dskit/ring/http.go @@ -2,17 +2,16 @@ package ring import ( "context" + "encoding/json" "fmt" "html/template" "math" "net/http" "sort" + "strings" "time" "github.com/go-kit/log/level" - - "github.com/cortexproject/cortex/pkg/util" - "github.com/cortexproject/cortex/pkg/util/log" ) const pageContent = ` @@ -20,10 +19,10 @@ const pageContent = ` - Cortex Ring Status + Ring Status -

		<h1>Cortex Ring Status</h1>
+		<h1>Ring Status</h1>
 		<p>Current time: {{ .Now }}</p>

@@ -108,7 +107,7 @@ func (r *Ring) ServeHTTP(w http.ResponseWriter, req *http.Request) { if req.Method == http.MethodPost { ingesterID := req.FormValue("forget") if err := r.forget(req.Context(), ingesterID); err != nil { - level.Error(log.WithContext(req.Context(), log.Logger)).Log("msg", "error forgetting instance", "err", err) + level.Error(r.logger).Log("msg", "error forgetting instance", "err", err) } // Implement PRG pattern to prevent double-POST and work with CSRF middleware. @@ -174,7 +173,7 @@ func (r *Ring) ServeHTTP(w http.ResponseWriter, req *http.Request) { tokensParam := req.URL.Query().Get("tokens") - util.RenderHTTPResponse(w, struct { + renderHTTPResponse(w, struct { Ingesters []interface{} `json:"shards"` Now time.Time `json:"now"` ShowTokens bool `json:"-"` @@ -184,3 +183,34 @@ func (r *Ring) ServeHTTP(w http.ResponseWriter, req *http.Request) { ShowTokens: tokensParam == "true", }, pageTemplate, req) } + +// RenderHTTPResponse either responds with json or a rendered html page using the passed in template +// by checking the Accepts header +func renderHTTPResponse(w http.ResponseWriter, v interface{}, t *template.Template, r *http.Request) { + accept := r.Header.Get("Accept") + if strings.Contains(accept, "application/json") { + writeJSONResponse(w, v) + return + } + + err := t.Execute(w, v) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + } +} + +// WriteJSONResponse writes some JSON as a HTTP response. +func writeJSONResponse(w http.ResponseWriter, v interface{}) { + w.Header().Set("Content-Type", "application/json") + + data, err := json.Marshal(v) + if err != nil { + http.Error(w, err.Error(), http.StatusInternalServerError) + return + } + + // We ignore errors here, because we cannot do anything about them. + // Write will trigger sending Status code, so we cannot send a different status code afterwards. + // Also this isn't internal error, but error communicating with client. 
+ _, _ = w.Write(data) +} diff --git a/pkg/ring/lifecycler.go b/vendor/github.com/grafana/dskit/ring/lifecycler.go similarity index 74% rename from pkg/ring/lifecycler.go rename to vendor/github.com/grafana/dskit/ring/lifecycler.go index ed69a5611f7..b13776042fd 100644 --- a/pkg/ring/lifecycler.go +++ b/vendor/github.com/grafana/dskit/ring/lifecycler.go @@ -9,38 +9,17 @@ import ( "sync" "time" + "github.com/go-kit/log" "github.com/go-kit/log/level" - "github.com/grafana/dskit/flagext" - "github.com/grafana/dskit/kv" - "github.com/grafana/dskit/services" "github.com/pkg/errors" perrors "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" - "github.com/prometheus/client_golang/prometheus/promauto" "go.uber.org/atomic" - "github.com/cortexproject/cortex/pkg/util" - "github.com/cortexproject/cortex/pkg/util/log" -) - -var ( - consulHeartbeats = promauto.NewCounterVec(prometheus.CounterOpts{ - Name: "cortex_member_consul_heartbeats_total", - Help: "The total number of heartbeats sent to consul.", - }, []string{"name"}) - tokensOwned = promauto.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_member_ring_tokens_owned", - Help: "The number of tokens owned in the ring.", - }, []string{"name"}) - tokensToOwn = promauto.NewGaugeVec(prometheus.GaugeOpts{ - Name: "cortex_member_ring_tokens_to_own", - Help: "The number of tokens to own in the ring.", - }, []string{"name"}) - shutdownDuration = promauto.NewHistogramVec(prometheus.HistogramOpts{ - Name: "cortex_shutdown_duration_seconds", - Help: "Duration (in seconds) of cortex shutdown procedure (ie transfer or flush).", - Buckets: prometheus.ExponentialBuckets(10, 2, 8), // Biggest bucket is 10*2^(9-1) = 2560, or 42 mins. - }, []string{"op", "status", "name"}) + "github.com/grafana/dskit/flagext" + "github.com/grafana/dskit/kv" + "github.com/grafana/dskit/services" + dstime "github.com/grafana/dskit/time" ) // LifecyclerConfig is the config to build a Lifecycler. @@ -48,16 +27,17 @@ type LifecyclerConfig struct { RingConfig Config `yaml:"ring"` // Config for the ingester lifecycle control - NumTokens int `yaml:"num_tokens"` - HeartbeatPeriod time.Duration `yaml:"heartbeat_period"` - ObservePeriod time.Duration `yaml:"observe_period"` - JoinAfter time.Duration `yaml:"join_after"` - MinReadyDuration time.Duration `yaml:"min_ready_duration"` - InfNames []string `yaml:"interface_names"` - FinalSleep time.Duration `yaml:"final_sleep"` - TokensFilePath string `yaml:"tokens_file_path"` - Zone string `yaml:"availability_zone"` - UnregisterOnShutdown bool `yaml:"unregister_on_shutdown"` + NumTokens int `yaml:"num_tokens"` + HeartbeatPeriod time.Duration `yaml:"heartbeat_period"` + ObservePeriod time.Duration `yaml:"observe_period"` + JoinAfter time.Duration `yaml:"join_after"` + MinReadyDuration time.Duration `yaml:"min_ready_duration"` + InfNames []string `yaml:"interface_names"` + FinalSleep time.Duration `yaml:"final_sleep"` + TokensFilePath string `yaml:"tokens_file_path"` + Zone string `yaml:"availability_zone"` + UnregisterOnShutdown bool `yaml:"unregister_on_shutdown"` + ReadinessCheckRingHealth bool `yaml:"readiness_check_ring_health"` // For testing, you can override the address and ID of this ingester Addr string `yaml:"address" doc:"hidden"` @@ -87,14 +67,13 @@ func (cfg *LifecyclerConfig) RegisterFlagsWithPrefix(prefix string, f *flag.Flag f.DurationVar(&cfg.HeartbeatPeriod, prefix+"heartbeat-period", 5*time.Second, "Period at which to heartbeat to consul. 
0 = disabled.") f.DurationVar(&cfg.JoinAfter, prefix+"join-after", 0*time.Second, "Period to wait for a claim from another member; will join automatically after this.") f.DurationVar(&cfg.ObservePeriod, prefix+"observe-period", 0*time.Second, "Observe tokens after generating to resolve collisions. Useful when using gossiping ring.") - f.DurationVar(&cfg.MinReadyDuration, prefix+"min-ready-duration", 1*time.Minute, "Minimum duration to wait before becoming ready. This is to work around race conditions with ingesters exiting and updating the ring.") + f.DurationVar(&cfg.MinReadyDuration, prefix+"min-ready-duration", 15*time.Second, "Minimum duration to wait after the internal readiness checks have passed but before succeeding the readiness endpoint. This is used to slowdown deployment controllers (eg. Kubernetes) after an instance is ready and before they proceed with a rolling update, to give the rest of the cluster instances enough time to receive ring updates.") f.DurationVar(&cfg.FinalSleep, prefix+"final-sleep", 30*time.Second, "Duration to sleep for before exiting, to ensure metrics are scraped.") f.StringVar(&cfg.TokensFilePath, prefix+"tokens-file-path", "", "File path where tokens are stored. If empty, tokens are not stored at shutdown and restored at startup.") hostname, err := os.Hostname() if err != nil { - level.Error(log.Logger).Log("msg", "failed to get hostname", "err", err) - os.Exit(1) + panic(fmt.Errorf("failed to get hostname %s", err)) } cfg.InfNames = []string{"eth0", "en0"} @@ -104,6 +83,7 @@ func (cfg *LifecyclerConfig) RegisterFlagsWithPrefix(prefix string, f *flag.Flag f.StringVar(&cfg.ID, prefix+"lifecycler.ID", hostname, "ID to register in the ring.") f.StringVar(&cfg.Zone, prefix+"availability-zone", "", "The availability zone where this instance is running.") f.BoolVar(&cfg.UnregisterOnShutdown, prefix+"unregister-on-shutdown", true, "Unregister from the ring upon clean shutdown. It can be useful to disable for rolling restarts with consistent naming in conjunction with -distributor.extend-writes=false.") + f.BoolVar(&cfg.ReadinessCheckRingHealth, prefix+"readiness-check-ring-health", true, "When enabled the readiness probe succeeds only after all instances are ACTIVE and healthy in the ring, otherwise only the instance itself is checked. This option should be disabled if in your cluster multiple instances can be rolled out simultaneously, otherwise rolling updates may be slowed down.") } // Lifecycler is responsible for managing the lifecycle of entries in the ring. @@ -135,19 +115,22 @@ type Lifecycler struct { registeredAt time.Time // Controls the ready-reporting - readyLock sync.Mutex - startTime time.Time - ready bool + readyLock sync.Mutex + ready bool + readySince time.Time // Keeps stats updated at every heartbeat period countersLock sync.RWMutex healthyInstancesCount int zonesCount int + + lifecyclerMetrics *LifecyclerMetrics + logger log.Logger } // NewLifecycler creates new Lifecycler. It must be started via StartAsync. 
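Reviewer aside (illustrative, not from this patch): the flag changes above alter readiness behaviour. MinReadyDuration (new default 15s) is now counted from the moment the readiness checks first pass, and the new ReadinessCheckRingHealth option lets operators skip the whole-ring health check when several instances can be rolled at once. A minimal sketch of the resulting config fields, with hypothetical values; the NewLifecycler hunk announced by the comment above continues right below.

package example

import (
	"time"

	"github.com/grafana/dskit/ring"
)

// readinessConfig shows the two knobs introduced above. Disabling the ring
// health check means only this instance must be healthy before the readiness
// endpoint succeeds.
func readinessConfig() ring.LifecyclerConfig {
	var cfg ring.LifecyclerConfig
	cfg.ReadinessCheckRingHealth = false    // equivalent to -<prefix>readiness-check-ring-health=false
	cfg.MinReadyDuration = 15 * time.Second // counted after the readiness checks pass
	return cfg
}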
-func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, ringName, ringKey string, flushOnShutdown bool, reg prometheus.Registerer) (*Lifecycler, error) { - addr, err := GetInstanceAddr(cfg.Addr, cfg.InfNames) +func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, ringName, ringKey string, flushOnShutdown bool, logger log.Logger, reg prometheus.Registerer) (*Lifecycler, error) { + addr, err := GetInstanceAddr(cfg.Addr, cfg.InfNames, logger) if err != nil { return nil, err } @@ -157,8 +140,8 @@ func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, ringNa store, err := kv.NewClient( cfg.RingConfig.KVStore, codec, - kv.RegistererWithKVName(prometheus.WrapRegistererWithPrefix("cortex_", reg), ringName+"-lifecycler"), - log.Logger, + kv.RegistererWithKVName(reg, ringName+"-lifecycler"), + logger, ) if err != nil { return nil, err @@ -166,7 +149,7 @@ func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, ringNa zone := cfg.Zone if zone != "" { - log.WarnExperimentalUse("Zone aware replication") + level.Warn(logger).Log("msg", "experimental feature in use", "feature", "Zone aware replication") } // We do allow a nil FlushTransferer, but to keep the ring logic easier we assume @@ -176,10 +159,9 @@ func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, ringNa } l := &Lifecycler{ - cfg: cfg, - flushTransferer: flushTransferer, - KVStore: store, - + cfg: cfg, + flushTransferer: flushTransferer, + KVStore: store, Addr: fmt.Sprintf("%s:%d", addr, port), ID: cfg.ID, RingName: ringName, @@ -187,14 +169,13 @@ func NewLifecycler(cfg LifecyclerConfig, flushTransferer FlushTransferer, ringNa flushOnShutdown: atomic.NewBool(flushOnShutdown), unregisterOnShutdown: atomic.NewBool(cfg.UnregisterOnShutdown), Zone: zone, - - actorChan: make(chan func()), - - state: PENDING, - startTime: time.Now(), + actorChan: make(chan func()), + state: PENDING, + lifecyclerMetrics: NewLifecyclerMetrics(ringName, reg), + logger: logger, } - tokensToOwn.WithLabelValues(l.RingName).Set(float64(cfg.NumTokens)) + l.lifecyclerMetrics.tokensToOwn.Set(float64(cfg.NumTokens)) l.BasicService = services. NewBasicService(nil, l.loop, l.stopping). @@ -214,36 +195,64 @@ func (i *Lifecycler) CheckReady(ctx context.Context) error { return nil } - // Ingester always take at least minReadyDuration to become ready to work - // around race conditions with ingesters exiting and updating the ring - if time.Since(i.startTime) < i.cfg.MinReadyDuration { - return fmt.Errorf("waiting for %v after startup", i.cfg.MinReadyDuration) + if err := i.checkRingHealthForReadiness(ctx); err != nil { + // Reset the min ready duration counter. + i.readySince = time.Time{} + + return err } - desc, err := i.KVStore.Get(ctx, i.RingKey) - if err != nil { - level.Error(log.Logger).Log("msg", "error talking to the KV store", "ring", i.RingName, "err", err) - return fmt.Errorf("error talking to the KV store: %s", err) + // Honor the min ready duration. The duration counter start after all readiness checks have + // passed. + if i.readySince.IsZero() { + i.readySince = time.Now() + } + if time.Since(i.readySince) < i.cfg.MinReadyDuration { + return fmt.Errorf("waiting for %v after being ready", i.cfg.MinReadyDuration) } + i.ready = true + return nil +} + +func (i *Lifecycler) checkRingHealthForReadiness(ctx context.Context) error { + // Ensure the instance holds some tokens. 
if len(i.getTokens()) == 0 { return fmt.Errorf("this instance owns no tokens") } + // If ring health checking is enabled we make sure all instances in the ring are ACTIVE and healthy, + // otherwise we just check this instance. + desc, err := i.KVStore.Get(ctx, i.RingKey) + if err != nil { + level.Error(i.logger).Log("msg", "error talking to the KV store", "ring", i.RingName, "err", err) + return fmt.Errorf("error talking to the KV store: %s", err) + } + ringDesc, ok := desc.(*Desc) if !ok || ringDesc == nil { return fmt.Errorf("no ring returned from the KV store") } - if err := ringDesc.Ready(time.Now(), i.cfg.RingConfig.HeartbeatTimeout); err != nil { - level.Warn(log.Logger).Log("msg", "found an existing instance(s) with a problem in the ring, "+ - "this instance cannot become ready until this problem is resolved. "+ - "The /ring http endpoint on the distributor (or single binary) provides visibility into the ring.", - "ring", i.RingName, "err", err) - return err + if i.cfg.ReadinessCheckRingHealth { + if err := ringDesc.IsReady(time.Now(), i.cfg.RingConfig.HeartbeatTimeout); err != nil { + level.Warn(i.logger).Log("msg", "found an existing instance(s) with a problem in the ring, "+ + "this instance cannot become ready until this problem is resolved. "+ + "The /ring http endpoint on the distributor (or single binary) provides visibility into the ring.", + "ring", i.RingName, "err", err) + return err + } + } else { + instance, ok := ringDesc.Ingesters[i.ID] + if !ok { + return fmt.Errorf("instance %s not found in the ring", i.ID) + } + + if err := instance.IsReady(time.Now(), i.cfg.RingConfig.HeartbeatTimeout); err != nil { + return err + } } - i.ready = true return nil } @@ -294,7 +303,7 @@ func (i *Lifecycler) getTokens() Tokens { } func (i *Lifecycler) setTokens(tokens Tokens) { - tokensOwned.WithLabelValues(i.RingName).Set(float64(len(tokens))) + i.lifecyclerMetrics.tokensOwned.Set(float64(len(tokens))) i.stateMtx.Lock() defer i.stateMtx.Unlock() @@ -302,7 +311,7 @@ func (i *Lifecycler) setTokens(tokens Tokens) { i.tokens = tokens if i.cfg.TokensFilePath != "" { if err := i.tokens.StoreToFile(i.cfg.TokensFilePath); err != nil { - level.Error(log.Logger).Log("msg", "error storing tokens to disk", "path", i.cfg.TokensFilePath, "err", err) + level.Error(i.logger).Log("msg", "error storing tokens to disk", "path", i.cfg.TokensFilePath, "err", err) } } } @@ -342,7 +351,7 @@ func (i *Lifecycler) ClaimTokensFor(ctx context.Context, ingesterID string) erro ing := ringDesc.Ingesters[i.ID] ing.Timestamp = time.Now().Unix() - // Tokens of the leaving ingester may have been generated by an older version of Cortex which + // Tokens of the leaving ingester may have been generated by an older version which // doesn't guarantee sorted tokens, so we enforce sorting here. 
sort.Sort(tokens) ing.Tokens = tokens @@ -352,7 +361,7 @@ func (i *Lifecycler) ClaimTokensFor(ctx context.Context, ingesterID string) erro } if err := i.KVStore.CAS(ctx, i.RingKey, claimTokens); err != nil { - level.Error(log.Logger).Log("msg", "Failed to write to the KV store", "ring", i.RingName, "err", err) + level.Error(i.logger).Log("msg", "Failed to write to the KV store", "ring", i.RingName, "err", err) } i.setTokens(tokens) @@ -392,19 +401,19 @@ func (i *Lifecycler) loop(ctx context.Context) error { // We do various period tasks autoJoinAfter := time.After(i.cfg.JoinAfter) - var observeChan <-chan time.Time = nil + var observeChan <-chan time.Time - heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(i.cfg.HeartbeatPeriod) + heartbeatTickerStop, heartbeatTickerChan := dstime.NewDisableableTicker(i.cfg.HeartbeatPeriod) defer heartbeatTickerStop() for { select { case <-autoJoinAfter: - level.Debug(log.Logger).Log("msg", "JoinAfter expired", "ring", i.RingName) + level.Debug(i.logger).Log("msg", "JoinAfter expired", "ring", i.RingName) // Will only fire once, after auto join timeout. If we haven't entered "JOINING" state, // then pick some tokens and enter ACTIVE state. if i.GetState() == PENDING { - level.Info(log.Logger).Log("msg", "auto-joining cluster after timeout", "ring", i.RingName) + level.Info(i.logger).Log("msg", "auto-joining cluster after timeout", "ring", i.RingName) if i.cfg.ObservePeriod > 0 { // let's observe the ring. By using JOINING state, this ingester will be ignored by LEAVING @@ -413,7 +422,7 @@ func (i *Lifecycler) loop(ctx context.Context) error { return perrors.Wrapf(err, "failed to pick tokens in the KV store, ring: %s", i.RingName) } - level.Info(log.Logger).Log("msg", "observing tokens before going ACTIVE", "ring", i.RingName) + level.Info(i.logger).Log("msg", "observing tokens before going ACTIVE", "ring", i.RingName) observeChan = time.After(i.cfg.ObservePeriod) } else { if err := i.autoJoin(context.Background(), ACTIVE); err != nil { @@ -428,33 +437,33 @@ func (i *Lifecycler) loop(ctx context.Context) error { observeChan = nil if s := i.GetState(); s != JOINING { - level.Error(log.Logger).Log("msg", "unexpected state while observing tokens", "state", s, "ring", i.RingName) + level.Error(i.logger).Log("msg", "unexpected state while observing tokens", "state", s, "ring", i.RingName) } if i.verifyTokens(context.Background()) { - level.Info(log.Logger).Log("msg", "token verification successful", "ring", i.RingName) + level.Info(i.logger).Log("msg", "token verification successful", "ring", i.RingName) err := i.changeState(context.Background(), ACTIVE) if err != nil { - level.Error(log.Logger).Log("msg", "failed to set state to ACTIVE", "ring", i.RingName, "err", err) + level.Error(i.logger).Log("msg", "failed to set state to ACTIVE", "ring", i.RingName, "err", err) } } else { - level.Info(log.Logger).Log("msg", "token verification failed, observing", "ring", i.RingName) + level.Info(i.logger).Log("msg", "token verification failed, observing", "ring", i.RingName) // keep observing observeChan = time.After(i.cfg.ObservePeriod) } case <-heartbeatTickerChan: - consulHeartbeats.WithLabelValues(i.RingName).Inc() + i.lifecyclerMetrics.consulHeartbeats.Inc() if err := i.updateConsul(context.Background()); err != nil { - level.Error(log.Logger).Log("msg", "failed to write to the KV store, sleeping", "ring", i.RingName, "err", err) + level.Error(i.logger).Log("msg", "failed to write to the KV store, sleeping", "ring", i.RingName, "err", err) } case 
f := <-i.actorChan: f() case <-ctx.Done(): - level.Info(log.Logger).Log("msg", "lifecycler loop() exited gracefully", "ring", i.RingName) + level.Info(i.logger).Log("msg", "lifecycler loop() exited gracefully", "ring", i.RingName) return nil } } @@ -471,13 +480,13 @@ func (i *Lifecycler) stopping(runningError error) error { return nil } - heartbeatTickerStop, heartbeatTickerChan := util.NewDisableableTicker(i.cfg.HeartbeatPeriod) + heartbeatTickerStop, heartbeatTickerChan := dstime.NewDisableableTicker(i.cfg.HeartbeatPeriod) defer heartbeatTickerStop() // Mark ourselved as Leaving so no more samples are send to us. err := i.changeState(context.Background(), LEAVING) if err != nil { - level.Error(log.Logger).Log("msg", "failed to set state to LEAVING", "ring", i.RingName, "err", err) + level.Error(i.logger).Log("msg", "failed to set state to LEAVING", "ring", i.RingName, "err", err) } // Do the transferring / flushing on a background goroutine so we can continue @@ -492,9 +501,9 @@ heartbeatLoop: for { select { case <-heartbeatTickerChan: - consulHeartbeats.WithLabelValues(i.RingName).Inc() + i.lifecyclerMetrics.consulHeartbeats.Inc() if err := i.updateConsul(context.Background()); err != nil { - level.Error(log.Logger).Log("msg", "failed to write to the KV store, sleeping", "ring", i.RingName, "err", err) + level.Error(i.logger).Log("msg", "failed to write to the KV store, sleeping", "ring", i.RingName, "err", err) } case <-done: @@ -506,7 +515,7 @@ heartbeatLoop: if err := i.unregister(context.Background()); err != nil { return perrors.Wrapf(err, "failed to unregister from the KV store, ring: %s", i.RingName) } - level.Info(log.Logger).Log("msg", "instance removed from the KV store", "ring", i.RingName) + level.Info(i.logger).Log("msg", "instance removed from the KV store", "ring", i.RingName) } return nil @@ -525,10 +534,10 @@ func (i *Lifecycler) initRing(ctx context.Context) error { if i.cfg.TokensFilePath != "" { tokensFromFile, err = LoadTokensFromFile(i.cfg.TokensFilePath) if err != nil && !os.IsNotExist(err) { - level.Error(log.Logger).Log("msg", "error loading tokens from file", "err", err) + level.Error(i.logger).Log("msg", "error loading tokens from file", "err", err) } } else { - level.Info(log.Logger).Log("msg", "not loading tokens from file, tokens file path is empty") + level.Info(i.logger).Log("msg", "not loading tokens from file, tokens file path is empty") } err = i.KVStore.CAS(ctx, i.RingKey, func(in interface{}) (out interface{}, retry bool, err error) { @@ -547,7 +556,7 @@ func (i *Lifecycler) initRing(ctx context.Context) error { // We use the tokens from the file only if it does not exist in the ring yet. 
if len(tokensFromFile) > 0 { - level.Info(log.Logger).Log("msg", "adding tokens from file", "num_tokens", len(tokensFromFile)) + level.Info(i.logger).Log("msg", "adding tokens from file", "num_tokens", len(tokensFromFile)) if len(tokensFromFile) >= i.cfg.NumTokens { i.setState(ACTIVE) } @@ -557,7 +566,7 @@ func (i *Lifecycler) initRing(ctx context.Context) error { } // Either we are a new ingester, or consul must have restarted - level.Info(log.Logger).Log("msg", "instance not found in ring, adding with no tokens", "ring", i.RingName) + level.Info(i.logger).Log("msg", "instance not found in ring, adding with no tokens", "ring", i.RingName) ringDesc.AddIngester(i.ID, i.Addr, i.Zone, []uint32{}, i.GetState(), registeredAt) return ringDesc, true, nil } @@ -571,7 +580,7 @@ func (i *Lifecycler) initRing(ctx context.Context) error { // to set it back to PENDING in order to start the lifecycle from the // beginning. if instanceDesc.State == JOINING { - level.Warn(log.Logger).Log("msg", "instance found in ring as JOINING, setting to PENDING", + level.Warn(i.logger).Log("msg", "instance found in ring as JOINING, setting to PENDING", "ring", i.RingName) instanceDesc.State = PENDING return ringDesc, true, nil @@ -589,7 +598,7 @@ func (i *Lifecycler) initRing(ctx context.Context) error { tokens, _ := ringDesc.TokensFor(i.ID) i.setTokens(tokens) - level.Info(log.Logger).Log("msg", "existing entry found in ring", "state", i.GetState(), "tokens", len(tokens), "ring", i.RingName) + level.Info(i.logger).Log("msg", "existing entry found in ring", "state", i.GetState(), "tokens", len(tokens), "ring", i.RingName) // Update the ring if the instance has been changed and the heartbeat is disabled. // We dont need to update KV here when heartbeat is enabled as this info will eventually be update on KV @@ -634,7 +643,7 @@ func (i *Lifecycler) verifyTokens(ctx context.Context) bool { // uh, oh... our tokens are not our anymore. Let's try new ones. needTokens := i.cfg.NumTokens - len(ringTokens) - level.Info(log.Logger).Log("msg", "generating new tokens", "count", needTokens, "ring", i.RingName) + level.Info(i.logger).Log("msg", "generating new tokens", "count", needTokens, "ring", i.RingName) newTokens := GenerateTokens(needTokens, takenTokens) ringTokens = append(ringTokens, newTokens...) @@ -653,7 +662,7 @@ func (i *Lifecycler) verifyTokens(ctx context.Context) bool { }) if err != nil { - level.Error(log.Logger).Log("msg", "failed to verify tokens", "ring", i.RingName, "err", err) + level.Error(i.logger).Log("msg", "failed to verify tokens", "ring", i.RingName, "err", err) return false } @@ -692,7 +701,7 @@ func (i *Lifecycler) autoJoin(ctx context.Context, targetState InstanceState) er // At this point, we should not have any tokens, and we should be in PENDING state. 
myTokens, takenTokens := ringDesc.TokensFor(i.ID) if len(myTokens) > 0 { - level.Error(log.Logger).Log("msg", "tokens already exist for this instance - wasn't expecting any!", "num_tokens", len(myTokens), "ring", i.RingName) + level.Error(i.logger).Log("msg", "tokens already exist for this instance - wasn't expecting any!", "num_tokens", len(myTokens), "ring", i.RingName) } newTokens := GenerateTokens(i.cfg.NumTokens-len(myTokens), takenTokens) @@ -730,7 +739,7 @@ func (i *Lifecycler) updateConsul(ctx context.Context) error { instanceDesc, ok := ringDesc.Ingesters[i.ID] if !ok { // consul must have restarted - level.Info(log.Logger).Log("msg", "found empty ring, inserting tokens", "ring", i.RingName) + level.Info(i.logger).Log("msg", "found empty ring, inserting tokens", "ring", i.RingName) ringDesc.AddIngester(i.ID, i.Addr, i.Zone, i.getTokens(), i.GetState(), i.getRegisteredAt()) } else { instanceDesc.Timestamp = time.Now().Unix() @@ -765,7 +774,7 @@ func (i *Lifecycler) changeState(ctx context.Context, state InstanceState) error return fmt.Errorf("Changing instance state from %v -> %v is disallowed", currState, state) } - level.Info(log.Logger).Log("msg", "changing instance state from", "old_state", currState, "new_state", state, "ring", i.RingName) + level.Info(i.logger).Log("msg", "changing instance state from", "old_state", currState, "new_state", state, "ring", i.RingName) i.setState(state) return i.updateConsul(ctx) } @@ -820,20 +829,20 @@ func (i *Lifecycler) processShutdown(ctx context.Context) { transferStart := time.Now() if err := i.flushTransferer.TransferOut(ctx); err != nil { if err == ErrTransferDisabled { - level.Info(log.Logger).Log("msg", "transfers are disabled") + level.Info(i.logger).Log("msg", "transfers are disabled") } else { - level.Error(log.Logger).Log("msg", "failed to transfer chunks to another instance", "ring", i.RingName, "err", err) - shutdownDuration.WithLabelValues("transfer", "fail", i.RingName).Observe(time.Since(transferStart).Seconds()) + level.Error(i.logger).Log("msg", "failed to transfer chunks to another instance", "ring", i.RingName, "err", err) + i.lifecyclerMetrics.shutdownDuration.WithLabelValues("transfer", "fail").Observe(time.Since(transferStart).Seconds()) } } else { flushRequired = false - shutdownDuration.WithLabelValues("transfer", "success", i.RingName).Observe(time.Since(transferStart).Seconds()) + i.lifecyclerMetrics.shutdownDuration.WithLabelValues("transfer", "success").Observe(time.Since(transferStart).Seconds()) } if flushRequired { flushStart := time.Now() i.flushTransferer.Flush() - shutdownDuration.WithLabelValues("flush", "success", i.RingName).Observe(time.Since(flushStart).Seconds()) + i.lifecyclerMetrics.shutdownDuration.WithLabelValues("flush", "success").Observe(time.Since(flushStart).Seconds()) } // Sleep so the shutdownDuration metric can be collected. @@ -842,7 +851,7 @@ func (i *Lifecycler) processShutdown(ctx context.Context) { // unregister removes our entry from consul. 
func (i *Lifecycler) unregister(ctx context.Context) error { - level.Debug(log.Logger).Log("msg", "unregistering instance from ring", "ring", i.RingName) + level.Debug(i.logger).Log("msg", "unregistering instance from ring", "ring", i.RingName) return i.KVStore.CAS(ctx, i.RingKey, func(in interface{}) (out interface{}, retry bool, err error) { if in == nil { diff --git a/vendor/github.com/grafana/dskit/ring/lifecycler_metrics.go b/vendor/github.com/grafana/dskit/ring/lifecycler_metrics.go new file mode 100644 index 00000000000..422a564c18b --- /dev/null +++ b/vendor/github.com/grafana/dskit/ring/lifecycler_metrics.go @@ -0,0 +1,40 @@ +package ring + +import ( + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" +) + +type LifecyclerMetrics struct { + consulHeartbeats prometheus.Counter + tokensOwned prometheus.Gauge + tokensToOwn prometheus.Gauge + shutdownDuration *prometheus.HistogramVec +} + +func NewLifecyclerMetrics(ringName string, reg prometheus.Registerer) *LifecyclerMetrics { + return &LifecyclerMetrics{ + consulHeartbeats: promauto.With(reg).NewCounter(prometheus.CounterOpts{ + Name: "member_consul_heartbeats_total", + Help: "The total number of heartbeats sent to consul.", + ConstLabels: prometheus.Labels{"name": ringName}, + }), + tokensOwned: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ + Name: "member_ring_tokens_owned", + Help: "The number of tokens owned in the ring.", + ConstLabels: prometheus.Labels{"name": ringName}, + }), + tokensToOwn: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ + Name: "member_ring_tokens_to_own", + Help: "The number of tokens to own in the ring.", + ConstLabels: prometheus.Labels{"name": ringName}, + }), + shutdownDuration: promauto.With(reg).NewHistogramVec(prometheus.HistogramOpts{ + Name: "shutdown_duration_seconds", + Help: "Duration (in seconds) of shutdown procedure (ie transfer or flush).", + Buckets: prometheus.ExponentialBuckets(10, 2, 8), // Biggest bucket is 10*2^(9-1) = 2560, or 42 mins. + ConstLabels: prometheus.Labels{"name": ringName}, + }, []string{"op", "status"}), + } + +} diff --git a/pkg/ring/model.go b/vendor/github.com/grafana/dskit/ring/model.go similarity index 94% rename from pkg/ring/model.go rename to vendor/github.com/grafana/dskit/ring/model.go index 26487c050bc..cb2d7c7870d 100644 --- a/pkg/ring/model.go +++ b/vendor/github.com/grafana/dskit/ring/model.go @@ -7,6 +7,7 @@ import ( "time" "github.com/gogo/protobuf/proto" + "github.com/grafana/dskit/kv/codec" "github.com/grafana/dskit/kv/memberlist" ) @@ -96,16 +97,15 @@ func (d *Desc) FindIngestersByState(state InstanceState) []InstanceDesc { return result } -// Ready returns no error when all ingesters are active and healthy. -func (d *Desc) Ready(now time.Time, heartbeatTimeout time.Duration) error { +// IsReady returns no error when all instance are ACTIVE and healthy, +// and the ring has some tokens. 
+func (d *Desc) IsReady(now time.Time, heartbeatTimeout time.Duration) error { numTokens := 0 - for id, ingester := range d.Ingesters { - if !ingester.IsHeartbeatHealthy(heartbeatTimeout, now) { - return fmt.Errorf("instance %s past heartbeat timeout", id) - } else if ingester.State != ACTIVE { - return fmt.Errorf("instance %s in state %v", id, ingester.State) + for _, instance := range d.Ingesters { + if err := instance.IsReady(now, heartbeatTimeout); err != nil { + return err } - numTokens += len(ingester.Tokens) + numTokens += len(instance.Tokens) } if numTokens == 0 { @@ -147,6 +147,17 @@ func (i *InstanceDesc) IsHeartbeatHealthy(heartbeatTimeout time.Duration, now ti return now.Sub(time.Unix(i.Timestamp, 0)) <= heartbeatTimeout } +// IsReady returns no error if the instance is ACTIVE and healthy. +func (i *InstanceDesc) IsReady(now time.Time, heartbeatTimeout time.Duration) error { + if !i.IsHeartbeatHealthy(heartbeatTimeout, now) { + return fmt.Errorf("instance %s past heartbeat timeout", i.Addr) + } + if i.State != ACTIVE { + return fmt.Errorf("instance %s in state %v", i.Addr, i.State) + } + return nil +} + // Merge merges other ring into this one. Returns sub-ring that represents the change, // and can be sent out to other clients. // @@ -423,7 +434,7 @@ func (d *Desc) getTokensInfo() map[uint32]instanceInfo { func (d *Desc) GetTokens() []uint32 { instances := make([][]uint32, 0, len(d.Ingesters)) for _, instance := range d.Ingesters { - // Tokens may not be sorted for an older version of Cortex which, so we enforce sorting here. + // Tokens may not be sorted for an older version which, so we enforce sorting here. tokens := instance.Tokens if !sort.IsSorted(Tokens(tokens)) { sort.Sort(Tokens(tokens)) @@ -440,7 +451,7 @@ func (d *Desc) GetTokens() []uint32 { func (d *Desc) getTokensByZone() map[string][]uint32 { zones := map[string][][]uint32{} for _, instance := range d.Ingesters { - // Tokens may not be sorted for an older version of Cortex which, so we enforce sorting here. + // Tokens may not be sorted for an older version which, so we enforce sorting here. tokens := instance.Tokens if !sort.IsSorted(Tokens(tokens)) { sort.Sort(Tokens(tokens)) @@ -455,6 +466,7 @@ func (d *Desc) getTokensByZone() map[string][]uint32 { type CompareResult int +// CompareResult responses const ( Equal CompareResult = iota // Both rings contain same exact instances. EqualButStatesAndTimestamps // Both rings contain the same instances with the same data except states and timestamps (may differ). 
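The hunks above change the lifecycler and ring model in two caller-visible ways: NewLifecycler now takes an explicit log.Logger, and neither the lifecycler metrics (member_consul_heartbeats_total, member_ring_tokens_owned, ...) nor the embedded KV client prepend the cortex_ prefix any more, so callers that want the old metric names have to wrap the registerer themselves (patch 4/6 below does exactly that for the alertmanager's basic lifecycler). A minimal caller-side sketch; the ring name "ingester", ring key "ring" and the helper name are illustrative, not taken from this patch:

import (
	"github.com/go-kit/log"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/dskit/ring"
)

// newIngesterLifecycler sketches one way a component can wire the dskit
// lifecycler after this migration.
func newIngesterLifecycler(cfg ring.LifecyclerConfig, ft ring.FlushTransferer, logger log.Logger, reg prometheus.Registerer) (*ring.Lifecycler, error) {
	// dskit no longer adds the "cortex_" prefix internally; wrap the registerer
	// to keep the previous metric names (e.g. cortex_member_consul_heartbeats_total).
	wrapped := prometheus.WrapRegistererWithPrefix("cortex_", reg)

	// The logger is now an explicit dependency instead of the global util log.Logger.
	return ring.NewLifecycler(cfg, ft, "ingester", "ring", true /* flushOnShutdown */, logger, wrapped)
}

The same prefix-wrapping concern applies to ring.New, which (as shown further down in ring.go) now also takes the logger and registerer explicitly and registers gauges such as ring_members instead of exposing cortex_ring_* via a custom Collector.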
diff --git a/pkg/ring/replication_set.go b/vendor/github.com/grafana/dskit/ring/replication_set.go similarity index 100% rename from pkg/ring/replication_set.go rename to vendor/github.com/grafana/dskit/ring/replication_set.go diff --git a/pkg/ring/replication_set_tracker.go b/vendor/github.com/grafana/dskit/ring/replication_set_tracker.go similarity index 100% rename from pkg/ring/replication_set_tracker.go rename to vendor/github.com/grafana/dskit/ring/replication_set_tracker.go diff --git a/pkg/ring/replication_strategy.go b/vendor/github.com/grafana/dskit/ring/replication_strategy.go similarity index 100% rename from pkg/ring/replication_strategy.go rename to vendor/github.com/grafana/dskit/ring/replication_strategy.go diff --git a/pkg/ring/ring.go b/vendor/github.com/grafana/dskit/ring/ring.go similarity index 86% rename from pkg/ring/ring.go rename to vendor/github.com/grafana/dskit/ring/ring.go index 4577b677414..63e3a547c48 100644 --- a/pkg/ring/ring.go +++ b/vendor/github.com/grafana/dskit/ring/ring.go @@ -11,15 +11,19 @@ import ( "sync" "time" + "github.com/go-kit/log" "github.com/go-kit/log/level" - "github.com/grafana/dskit/kv" - "github.com/grafana/dskit/services" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promauto" + + "github.com/grafana/dskit/kv" + shardUtil "github.com/grafana/dskit/ring/shard" + "github.com/grafana/dskit/ring/util" + "github.com/grafana/dskit/services" - "github.com/cortexproject/cortex/pkg/util" - "github.com/cortexproject/cortex/pkg/util/log" - util_math "github.com/cortexproject/cortex/pkg/util/math" + "github.com/grafana/dskit/flagext" + dsmath "github.com/grafana/dskit/math" ) const ( @@ -44,7 +48,6 @@ const ( // ReadRing represents the read interface to the ring. type ReadRing interface { - prometheus.Collector // Get returns n (or more) instances which form the replicas for the given key. // bufDescs, bufHosts and bufZones are slices to be overwritten for the return value @@ -99,6 +102,7 @@ var ( // WriteNoExtend is like Write, but with no replicaset extension. WriteNoExtend = NewOp([]InstanceState{ACTIVE}, nil) + // Read operation that extends the replica set if an instance is not ACTIVE or LEAVING Read = NewOp([]InstanceState{ACTIVE, PENDING, LEAVING}, func(s InstanceState) bool { // To match Write with extended replica set we have to also increase the // size of the replica set for Read, but we can read from LEAVING ingesters. @@ -128,10 +132,11 @@ var ( // Config for a Ring type Config struct { - KVStore kv.Config `yaml:"kvstore"` - HeartbeatTimeout time.Duration `yaml:"heartbeat_timeout"` - ReplicationFactor int `yaml:"replication_factor"` - ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"` + KVStore kv.Config `yaml:"kvstore"` + HeartbeatTimeout time.Duration `yaml:"heartbeat_timeout"` + ReplicationFactor int `yaml:"replication_factor"` + ZoneAwarenessEnabled bool `yaml:"zone_awareness_enabled"` + ExcludedZones flagext.StringSliceCSV `yaml:"excluded_zones"` // Whether the shuffle-sharding subring cache is disabled. This option is set // internally and never exposed to the user. @@ -150,6 +155,7 @@ func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { f.DurationVar(&cfg.HeartbeatTimeout, prefix+"ring.heartbeat-timeout", time.Minute, "The heartbeat timeout after which ingesters are skipped for reads/writes. 
0 = never (timeout disabled).") f.IntVar(&cfg.ReplicationFactor, prefix+"distributor.replication-factor", 3, "The number of ingesters to write to and read from.") f.BoolVar(&cfg.ZoneAwarenessEnabled, prefix+"distributor.zone-awareness-enabled", false, "True to enable the zone-awareness and replicate ingested samples across different availability zones.") + f.Var(&cfg.ExcludedZones, prefix+"distributor.excluded-zones", "Comma-separated list of zones to exclude from the ring. Instances in excluded zones will be filtered out from the ring.") } type instanceInfo struct { @@ -187,11 +193,13 @@ type Ring struct { // If set to nil, no caching is done (used by tests, and subrings). shuffledSubringCache map[subringCacheKey]*Ring - memberOwnershipDesc *prometheus.Desc - numMembersDesc *prometheus.Desc - totalTokensDesc *prometheus.Desc - numTokensDesc *prometheus.Desc - oldestTimestampDesc *prometheus.Desc + memberOwnershipGaugeVec *prometheus.GaugeVec + numMembersGaugeVec *prometheus.GaugeVec + totalTokensGauge prometheus.Gauge + numTokensGaugeVec *prometheus.GaugeVec + oldestTimestampGaugeVec *prometheus.GaugeVec + + logger log.Logger } type subringCacheKey struct { @@ -200,23 +208,23 @@ type subringCacheKey struct { } // New creates a new Ring. Being a service, Ring needs to be started to do anything. -func New(cfg Config, name, key string, reg prometheus.Registerer) (*Ring, error) { +func New(cfg Config, name, key string, logger log.Logger, reg prometheus.Registerer) (*Ring, error) { codec := GetCodec() // Suffix all client names with "-ring" to denote this kv client is used by the ring store, err := kv.NewClient( cfg.KVStore, codec, - kv.RegistererWithKVName(prometheus.WrapRegistererWithPrefix("cortex_", reg), name+"-ring"), - log.Logger, + kv.RegistererWithKVName(reg, name+"-ring"), + logger, ) if err != nil { return nil, err } - return NewWithStoreClientAndStrategy(cfg, name, key, store, NewDefaultReplicationStrategy()) + return NewWithStoreClientAndStrategy(cfg, name, key, store, NewDefaultReplicationStrategy(), reg, logger) } -func NewWithStoreClientAndStrategy(cfg Config, name, key string, store kv.Client, strategy ReplicationStrategy) (*Ring, error) { +func NewWithStoreClientAndStrategy(cfg Config, name, key string, store kv.Client, strategy ReplicationStrategy, reg prometheus.Registerer, logger log.Logger) (*Ring, error) { if cfg.ReplicationFactor <= 0 { return nil, fmt.Errorf("ReplicationFactor must be greater than zero: %d", cfg.ReplicationFactor) } @@ -228,36 +236,31 @@ func NewWithStoreClientAndStrategy(cfg Config, name, key string, store kv.Client strategy: strategy, ringDesc: &Desc{}, shuffledSubringCache: map[subringCacheKey]*Ring{}, - memberOwnershipDesc: prometheus.NewDesc( - "cortex_ring_member_ownership_percent", - "The percent ownership of the ring by member", - []string{"member"}, - map[string]string{"name": name}, - ), - numMembersDesc: prometheus.NewDesc( - "cortex_ring_members", - "Number of members in the ring", - []string{"state"}, - map[string]string{"name": name}, - ), - totalTokensDesc: prometheus.NewDesc( - "cortex_ring_tokens_total", - "Number of tokens in the ring", - nil, - map[string]string{"name": name}, - ), - numTokensDesc: prometheus.NewDesc( - "cortex_ring_tokens_owned", - "The number of tokens in the ring owned by the member", - []string{"member"}, - map[string]string{"name": name}, - ), - oldestTimestampDesc: prometheus.NewDesc( - "cortex_ring_oldest_member_timestamp", - "Timestamp of the oldest member in the ring.", - []string{"state"}, - 
map[string]string{"name": name}, - ), + memberOwnershipGaugeVec: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Name: "ring_member_ownership_percent", + Help: "The percent ownership of the ring by member", + ConstLabels: map[string]string{"name": name}}, + []string{"member"}), + numMembersGaugeVec: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Name: "ring_members", + Help: "Number of members in the ring", + ConstLabels: map[string]string{"name": name}}, + []string{"state"}), + totalTokensGauge: promauto.With(reg).NewGauge(prometheus.GaugeOpts{ + Name: "ring_tokens_total", + Help: "Number of tokens in the ring", + ConstLabels: map[string]string{"name": name}}), + numTokensGaugeVec: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Name: "ring_tokens_owned", + Help: "The number of tokens in the ring owned by the member", + ConstLabels: map[string]string{"name": name}}, + []string{"member"}), + oldestTimestampGaugeVec: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{ + Name: "ring_oldest_member_timestamp", + Help: "Timestamp of the oldest member in the ring.", + ConstLabels: map[string]string{"name": name}}, + []string{"state"}), + logger: logger, } r.Service = services.NewBasicService(r.starting, r.loop, nil).WithName(fmt.Sprintf("%s ring client", name)) @@ -272,19 +275,35 @@ func (r *Ring) starting(ctx context.Context) error { if err != nil { return errors.Wrap(err, "unable to initialise ring state") } - if value == nil { - level.Info(log.Logger).Log("msg", "ring doesn't exist in KV store yet") - return nil + if value != nil { + r.updateRingState(value.(*Desc)) + } else { + level.Info(r.logger).Log("msg", "ring doesn't exist in KV store yet") } - - r.updateRingState(value.(*Desc)) return nil } func (r *Ring) loop(ctx context.Context) error { + // Update the ring metrics at start of the main loop. + r.updateRingMetrics() + go func() { + // Start metrics update ticker to update the ring metrics. + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + r.updateRingMetrics() + case <-ctx.Done(): + return + } + } + }() + r.KVClient.WatchKey(ctx, r.key, func(value interface{}) bool { if value == nil { - level.Info(log.Logger).Log("msg", "ring doesn't exist in KV store yet") + level.Info(r.logger).Log("msg", "ring doesn't exist in KV store yet") return true } @@ -299,6 +318,15 @@ func (r *Ring) updateRingState(ringDesc *Desc) { prevRing := r.ringDesc r.mtx.RUnlock() + // Filter out all instances belonging to excluded zones. + if len(r.cfg.ExcludedZones) > 0 { + for instanceID, instance := range ringDesc.Ingesters { + if util.StringsContain(r.cfg.ExcludedZones, instance.Zone) { + delete(ringDesc.Ingesters, instanceID) + } + } + } + rc := prevRing.RingCompare(ringDesc) if rc == Equal || rc == EqualButStatesAndTimestamps { // No need to update tokens or zones. Only states and timestamps @@ -454,7 +482,7 @@ func (r *Ring) GetReplicationSetForOperation(op Operation) (ReplicationSet, erro // Given data is replicated to RF different zones, we can tolerate a number of // RF/2 failing zones. However, we need to protect from the case the ring currently // contains instances in a number of zones < RF. 
- numReplicatedZones := util_math.Min(len(r.ringZones), r.cfg.ReplicationFactor) + numReplicatedZones := dsmath.Min(len(r.ringZones), r.cfg.ReplicationFactor) minSuccessZones := (numReplicatedZones / 2) + 1 maxUnavailableZones = minSuccessZones - 1 @@ -505,15 +533,6 @@ func (r *Ring) GetReplicationSetForOperation(op Operation) (ReplicationSet, erro }, nil } -// Describe implements prometheus.Collector. -func (r *Ring) Describe(ch chan<- *prometheus.Desc) { - ch <- r.memberOwnershipDesc - ch <- r.numMembersDesc - ch <- r.totalTokensDesc - ch <- r.oldestTimestampDesc - ch <- r.numTokensDesc -} - // countTokens returns the number of tokens and tokens within the range for each instance. // The ring read lock must be already taken when calling this function. func (r *Ring) countTokens() (map[string]uint32, map[string]uint32) { @@ -545,25 +564,15 @@ func (r *Ring) countTokens() (map[string]uint32, map[string]uint32) { return numTokens, owned } -// Collect implements prometheus.Collector. -func (r *Ring) Collect(ch chan<- prometheus.Metric) { +// updateRingMetrics updates ring metrics. +func (r *Ring) updateRingMetrics() { r.mtx.RLock() defer r.mtx.RUnlock() numTokens, ownedRange := r.countTokens() for id, totalOwned := range ownedRange { - ch <- prometheus.MustNewConstMetric( - r.memberOwnershipDesc, - prometheus.GaugeValue, - float64(totalOwned)/float64(math.MaxUint32), - id, - ) - ch <- prometheus.MustNewConstMetric( - r.numTokensDesc, - prometheus.GaugeValue, - float64(numTokens[id]), - id, - ) + r.memberOwnershipGaugeVec.WithLabelValues(id).Set(float64(totalOwned) / float64(math.MaxUint32)) + r.numTokensGaugeVec.WithLabelValues(id).Set(float64(numTokens[id])) } numByState := map[string]int{} @@ -587,27 +596,12 @@ func (r *Ring) Collect(ch chan<- prometheus.Metric) { } for state, count := range numByState { - ch <- prometheus.MustNewConstMetric( - r.numMembersDesc, - prometheus.GaugeValue, - float64(count), - state, - ) + r.numMembersGaugeVec.WithLabelValues(state).Set(float64(count)) } for state, timestamp := range oldestTimestampByState { - ch <- prometheus.MustNewConstMetric( - r.oldestTimestampDesc, - prometheus.GaugeValue, - float64(timestamp), - state, - ) - } - - ch <- prometheus.MustNewConstMetric( - r.totalTokensDesc, - prometheus.GaugeValue, - float64(len(r.ringTokens)), - ) + r.oldestTimestampGaugeVec.WithLabelValues(state).Set(float64(timestamp)) + } + r.totalTokensGauge.Set(float64(len(r.ringTokens))) } // ShuffleShard returns a subring for the provided identifier (eg. a tenant ID) @@ -671,7 +665,7 @@ func (r *Ring) shuffleShard(identifier string, size int, lookbackPeriod time.Dur var actualZones []string if r.cfg.ZoneAwarenessEnabled { - numInstancesPerZone = util.ShuffleShardExpectedInstancesPerZone(size, len(r.ringZones)) + numInstancesPerZone = shardUtil.ShuffleShardExpectedInstancesPerZone(size, len(r.ringZones)) actualZones = r.ringZones } else { numInstancesPerZone = size @@ -696,7 +690,7 @@ func (r *Ring) shuffleShard(identifier string, size int, lookbackPeriod time.Dur // Since we consider each zone like an independent ring, we have to use dedicated // pseudo-random generator for each zone, in order to guarantee the "consistency" // property when the shard size changes or a new zone is added. 
- random := rand.New(rand.NewSource(util.ShuffleShardSeed(identifier, zone))) + random := rand.New(rand.NewSource(shardUtil.ShuffleShardSeed(identifier, zone))) // To select one more instance while guaranteeing the "consistency" property, // we do pick a random value from the generator and resolve uniqueness collisions @@ -866,7 +860,7 @@ func NewOp(healthyStates []InstanceState, shouldExtendReplicaSet func(s Instance } if shouldExtendReplicaSet != nil { - for _, s := range []InstanceState{ACTIVE, LEAVING, PENDING, JOINING, LEFT} { + for _, s := range []InstanceState{ACTIVE, LEAVING, PENDING, JOINING, LEAVING, LEFT} { if shouldExtendReplicaSet(s) { op |= (0x10000 << s) } diff --git a/pkg/ring/ring.pb.go b/vendor/github.com/grafana/dskit/ring/ring.pb.go similarity index 99% rename from pkg/ring/ring.pb.go rename to vendor/github.com/grafana/dskit/ring/ring.pb.go index 8029a514343..453ac4c2e40 100644 --- a/pkg/ring/ring.pb.go +++ b/vendor/github.com/grafana/dskit/ring/ring.pb.go @@ -122,7 +122,7 @@ type InstanceDesc struct { // When an instance is already registered in the ring with a value of 0 it's NOT safe to // update the timestamp to "now" because it would break the contract, given the instance // was already registered before "now". If unknown (0), it should be left as is, and the - // Cortex code will properly deal with that. + // code will properly deal with that. RegisteredTimestamp int64 `protobuf:"varint,8,opt,name=registered_timestamp,json=registeredTimestamp,proto3" json:"registered_timestamp,omitempty"` } diff --git a/pkg/ring/ring.proto b/vendor/github.com/grafana/dskit/ring/ring.proto similarity index 97% rename from pkg/ring/ring.proto rename to vendor/github.com/grafana/dskit/ring/ring.proto index 1310e9b1c47..8f464dc2c8c 100644 --- a/pkg/ring/ring.proto +++ b/vendor/github.com/grafana/dskit/ring/ring.proto @@ -37,7 +37,7 @@ message InstanceDesc { // When an instance is already registered in the ring with a value of 0 it's NOT safe to // update the timestamp to "now" because it would break the contract, given the instance // was already registered before "now". If unknown (0), it should be left as is, and the - // Cortex code will properly deal with that. + // code will properly deal with that. int64 registered_timestamp = 8; } diff --git a/vendor/github.com/grafana/dskit/ring/shard/shard.go b/vendor/github.com/grafana/dskit/ring/shard/shard.go new file mode 100644 index 00000000000..1d70eb6283b --- /dev/null +++ b/vendor/github.com/grafana/dskit/ring/shard/shard.go @@ -0,0 +1,45 @@ +package shard + +import ( + "crypto/md5" + "encoding/binary" + "math" + "unsafe" +) + +var ( + seedSeparator = []byte{0} +) + +// ShuffleShardSeed returns seed for random number generator, computed from provided identifier. +func ShuffleShardSeed(identifier, zone string) int64 { + // Use the identifier to compute a hash we'll use to seed the random. + hasher := md5.New() + hasher.Write(yoloBuf(identifier)) // nolint:errcheck + if zone != "" { + hasher.Write(seedSeparator) // nolint:errcheck + hasher.Write(yoloBuf(zone)) // nolint:errcheck + } + checksum := hasher.Sum(nil) + + // Generate the seed based on the first 64 bits of the checksum. + return int64(binary.BigEndian.Uint64(checksum)) +} + +// ShuffleShardExpectedInstancesPerZone returns the number of instances that should be selected for each +// zone when zone-aware replication is enabled. The algorithm expects the shard size to be divisible +// by the number of zones, in order to have nodes balanced across zones. 
If it's not, we do round up. +func ShuffleShardExpectedInstancesPerZone(shardSize, numZones int) int { + return int(math.Ceil(float64(shardSize) / float64(numZones))) +} + +// ShuffleShardExpectedInstances returns the total number of instances that should be selected for a given +// tenant. If zone-aware replication is disabled, the input numZones should be 1. +func ShuffleShardExpectedInstances(shardSize, numZones int) int { + return ShuffleShardExpectedInstancesPerZone(shardSize, numZones) * numZones +} + +// yoloBuf will return an unsafe pointer to a string, as the name yolo.yoloBuf implies use at your own risk. +func yoloBuf(s string) []byte { + return *((*[]byte)(unsafe.Pointer(&s))) +} diff --git a/pkg/ring/tokens.go b/vendor/github.com/grafana/dskit/ring/tokens.go similarity index 96% rename from pkg/ring/tokens.go rename to vendor/github.com/grafana/dskit/ring/tokens.go index 919c626bc6a..51b7d830919 100644 --- a/pkg/ring/tokens.go +++ b/vendor/github.com/grafana/dskit/ring/tokens.go @@ -79,7 +79,7 @@ func LoadTokensFromFile(tokenFilePath string) (Tokens, error) { var t Tokens err = t.Unmarshal(b) - // Tokens may have been written to file by an older version of Cortex which + // Tokens may have been written to file by an older version which // doesn't guarantee sorted tokens, so we enforce sorting here. if !sort.IsSorted(t) { sort.Sort(t) diff --git a/pkg/ring/util.go b/vendor/github.com/grafana/dskit/ring/util.go similarity index 67% rename from pkg/ring/util.go rename to vendor/github.com/grafana/dskit/ring/util.go index 7754ec49d59..a836aa2fca1 100644 --- a/pkg/ring/util.go +++ b/vendor/github.com/grafana/dskit/ring/util.go @@ -2,13 +2,17 @@ package ring import ( "context" + "fmt" "math/rand" + "net" "sort" + "strings" "time" - "github.com/grafana/dskit/backoff" + "github.com/go-kit/log" + "github.com/go-kit/log/level" - "github.com/cortexproject/cortex/pkg/util" + "github.com/grafana/dskit/backoff" ) // GenerateTokens make numTokens unique random tokens, none of which clash @@ -46,12 +50,12 @@ func GenerateTokens(numTokens int, takenTokens []uint32) []uint32 { // GetInstanceAddr returns the address to use to register the instance // in the ring. -func GetInstanceAddr(configAddr string, netInterfaces []string) (string, error) { +func GetInstanceAddr(configAddr string, netInterfaces []string, logger log.Logger) (string, error) { if configAddr != "" { return configAddr, nil } - addr, err := util.GetFirstAddressOf(netInterfaces) + addr, err := getFirstAddressOf(netInterfaces, logger) if err != nil { return "", err } @@ -154,3 +158,54 @@ func searchToken(tokens []uint32, key uint32) int { } return i } + +// GetFirstAddressOf returns the first IPv4 address of the supplied interface names, omitting any 169.254.x.x automatic private IPs if possible. 
+func getFirstAddressOf(names []string, logger log.Logger) (string, error) { + var ipAddr string + for _, name := range names { + inf, err := net.InterfaceByName(name) + if err != nil { + level.Warn(logger).Log("msg", "error getting interface", "inf", name, "err", err) + continue + } + addrs, err := inf.Addrs() + if err != nil { + level.Warn(logger).Log("msg", "error getting addresses for interface", "inf", name, "err", err) + continue + } + if len(addrs) <= 0 { + level.Warn(logger).Log("msg", "no addresses found for interface", "inf", name, "err", err) + continue + } + if ip := filterIPs(addrs); ip != "" { + ipAddr = ip + } + if strings.HasPrefix(ipAddr, `169.254.`) || ipAddr == "" { + continue + } + return ipAddr, nil + } + if ipAddr == "" { + return "", fmt.Errorf("No address found for %s", names) + } + if strings.HasPrefix(ipAddr, `169.254.`) { + level.Warn(logger).Log("msg", "using automatic private ip", "address", ipAddr) + } + return ipAddr, nil +} + +// filterIPs attempts to return the first non automatic private IP (APIPA / 169.254.x.x) if possible, only returning APIPA if available and no other valid IP is found. +func filterIPs(addrs []net.Addr) string { + var ipAddr string + for _, addr := range addrs { + if v, ok := addr.(*net.IPNet); ok { + if ip := v.IP.To4(); ip != nil { + ipAddr = v.IP.String() + if !strings.HasPrefix(ipAddr, `169.254.`) { + return ipAddr + } + } + } + } + return ipAddr +} diff --git a/vendor/github.com/grafana/dskit/ring/util/string_utils.go b/vendor/github.com/grafana/dskit/ring/util/string_utils.go new file mode 100644 index 00000000000..39868e1d1cb --- /dev/null +++ b/vendor/github.com/grafana/dskit/ring/util/string_utils.go @@ -0,0 +1,12 @@ +package util + +// StringsContain returns true if the search value is within the list of input values. +func StringsContain(values []string, search string) bool { + for _, v := range values { + if search == v { + return true + } + } + + return false +} diff --git a/vendor/github.com/grafana/dskit/time/time.go b/vendor/github.com/grafana/dskit/time/time.go new file mode 100644 index 00000000000..84c698a18e6 --- /dev/null +++ b/vendor/github.com/grafana/dskit/time/time.go @@ -0,0 +1,96 @@ +package time + +import ( + "math" + "math/rand" + "net/http" + "strconv" + "time" + + "github.com/prometheus/common/model" + "github.com/weaveworks/common/httpgrpc" +) + +const ( + nanosecondsInMillisecond = int64(time.Millisecond / time.Nanosecond) +) + +func ToMillis(t time.Time) int64 { + return t.UnixNano() / nanosecondsInMillisecond +} + +// FromMillis is a helper to turn milliseconds -> time.Time +func FromMillis(ms int64) time.Time { + return time.Unix(0, ms*nanosecondsInMillisecond) +} + +// FormatTimeMillis returns a human readable version of the input time (in milliseconds). +func FormatTimeMillis(ms int64) string { + return FromMillis(ms).String() +} + +// FormatTimeModel returns a human readable version of the input time. +func FormatTimeModel(t model.Time) string { + return FromMillis(int64(t)).String() +} + +// ParseTime parses the string into an int64, milliseconds since epoch. 
+func ParseTime(s string) (int64, error) { + if t, err := strconv.ParseFloat(s, 64); err == nil { + s, ns := math.Modf(t) + ns = math.Round(ns*1000) / 1000 + tm := time.Unix(int64(s), int64(ns*float64(time.Second))) + return ToMillis(tm), nil + } + if t, err := time.Parse(time.RFC3339Nano, s); err == nil { + return ToMillis(t), nil + } + return 0, httpgrpc.Errorf(http.StatusBadRequest, "cannot parse %q to a valid timestamp", s) +} + +// DurationWithJitter returns random duration from "input - input*variance" to "input + input*variance" interval. +func DurationWithJitter(input time.Duration, variancePerc float64) time.Duration { + // No duration? No jitter. + if input == 0 { + return 0 + } + + variance := int64(float64(input) * variancePerc) + if variance == 0 { + // Values too low + return input + } + + jitter := rand.Int63n(variance*2) - variance + + return input + time.Duration(jitter) +} + +// DurationWithPositiveJitter returns random duration from "input" to "input + input*variance" interval. +func DurationWithPositiveJitter(input time.Duration, variancePerc float64) time.Duration { + // No duration? No jitter. + if input == 0 { + return 0 + } + + variance := int64(float64(input) * variancePerc) + if variance == 0 { + // Values too low + return input + } + + jitter := rand.Int63n(variance) + + return input + time.Duration(jitter) +} + +// NewDisableableTicker essentially wraps NewTicker but allows the ticker to be disabled by passing +// zero duration as the interval. Returns a function for stopping the ticker, and the ticker channel. +func NewDisableableTicker(interval time.Duration) (func(), <-chan time.Time) { + if interval == 0 { + return func() {}, nil + } + + tick := time.NewTicker(interval) + return func() { tick.Stop() }, tick.C +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 4a0212a9a21..279d8ace7be 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -293,7 +293,7 @@ github.com/googleapis/gax-go/v2/apierror/internal/proto # github.com/gorilla/mux v1.8.0 ## explicit github.com/gorilla/mux -# github.com/grafana/dskit v0.0.0-20211011144203-3a88ec0b675f +# github.com/grafana/dskit v0.0.0-20211021180445-3bd016e9d7f1 ## explicit github.com/grafana/dskit/backoff github.com/grafana/dskit/closer @@ -313,9 +313,14 @@ github.com/grafana/dskit/math github.com/grafana/dskit/middleware github.com/grafana/dskit/modules github.com/grafana/dskit/multierror +github.com/grafana/dskit/ring +github.com/grafana/dskit/ring/client +github.com/grafana/dskit/ring/shard +github.com/grafana/dskit/ring/util github.com/grafana/dskit/runtimeconfig github.com/grafana/dskit/runutil github.com/grafana/dskit/services +github.com/grafana/dskit/time # github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 github.com/grpc-ecosystem/go-grpc-middleware # github.com/grpc-ecosystem/go-grpc-middleware/v2 v2.0.0-rc.2.0.20201207153454-9f6bf00c00a7 From b0897803335a66034ec009809e63b55f2faeb3dd Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Wed, 27 Oct 2021 17:08:53 -0500 Subject: [PATCH 2/6] Tidy those mods Signed-off-by: Tyler Reid --- go.sum | 2 -- 1 file changed, 2 deletions(-) diff --git a/go.sum b/go.sum index 0086dca9650..9cd35a78c12 100644 --- a/go.sum +++ b/go.sum @@ -1012,8 +1012,6 @@ github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoA github.com/gorilla/websocket v1.4.2/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE= github.com/grafana/dskit v0.0.0-20210818123532-6645f87e9e12/go.mod h1:QaNAQaCSFOtG/NHf6Jd/zh67H25kkrVCq36U61Y2Mhw= github.com/grafana/dskit 
v0.0.0-20210819132858-471020752967/go.mod h1:uF46UNN1/feB1egpq8UGbBBKvJjGgZauW7pcVbeFLLM= -github.com/grafana/dskit v0.0.0-20211011144203-3a88ec0b675f h1:FvvSVEbnGeM2bUivGmsiXTi8URJyBU7TcFEEoRe5wWI= -github.com/grafana/dskit v0.0.0-20211011144203-3a88ec0b675f/go.mod h1:uPG2nyK4CtgNDmWv7qyzYcdI+S90kHHRWvHnBtEMBXM= github.com/grafana/dskit v0.0.0-20211021180445-3bd016e9d7f1 h1:Qf+/W3Tup0nO21tgJmO14WJK0yyrm4L2UJipZP+Zoow= github.com/grafana/dskit v0.0.0-20211021180445-3bd016e9d7f1/go.mod h1:uPG2nyK4CtgNDmWv7qyzYcdI+S90kHHRWvHnBtEMBXM= github.com/grafana/gocql v0.0.0-20200605141915-ba5dc39ece85 h1:xLuzPoOzdfNb/RF/IENCw+oLVdZB4G21VPhkHBgwSHY= From a4e2d83a6fb50259e162fe86ba4204070d6690ef Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Wed, 27 Oct 2021 17:23:13 -0500 Subject: [PATCH 3/6] Update docs on my branch Signed-off-by: Tyler Reid --- docs/configuration/config-file-reference.md | 22 ++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/docs/configuration/config-file-reference.md b/docs/configuration/config-file-reference.md index 3d3554e66f8..7d2db4cb6a8 100644 --- a/docs/configuration/config-file-reference.md +++ b/docs/configuration/config-file-reference.md @@ -683,6 +683,11 @@ lifecycler: # CLI flag: -distributor.zone-awareness-enabled [zone_awareness_enabled: | default = false] + # Comma-separated list of zones to exclude from the ring. Instances in + # excluded zones will be filtered out from the ring. + # CLI flag: -distributor.excluded-zones + [excluded_zones: | default = ""] + # Number of tokens for each ingester. # CLI flag: -ingester.num-tokens [num_tokens: | default = 128] @@ -701,10 +706,13 @@ lifecycler: # CLI flag: -ingester.join-after [join_after: | default = 0s] - # Minimum duration to wait before becoming ready. This is to work around race - # conditions with ingesters exiting and updating the ring. + # Minimum duration to wait after the internal readiness checks have passed but + # before succeeding the readiness endpoint. This is used to slowdown + # deployment controllers (eg. Kubernetes) after an instance is ready and + # before they proceed with a rolling update, to give the rest of the cluster + # instances enough time to receive ring updates. # CLI flag: -ingester.min-ready-duration - [min_ready_duration: | default = 1m] + [min_ready_duration: | default = 15s] # Name of network interface to read address from. # CLI flag: -ingester.lifecycler.interface @@ -729,6 +737,14 @@ lifecycler: # CLI flag: -ingester.unregister-on-shutdown [unregister_on_shutdown: | default = true] + # When enabled the readiness probe succeeds only after all instances are + # ACTIVE and healthy in the ring, otherwise only the instance itself is + # checked. This option should be disabled if in your cluster multiple + # instances can be rolled out simultaneously, otherwise rolling updates may be + # slowed down. + # CLI flag: -ingester.readiness-check-ring-health + [readiness_check_ring_health: | default = true] + # Number of times to try and transfer chunks before falling back to flushing. # Negative value or zero disables hand-over. This feature is supported only by # the chunks storage. 
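The options documented above can also be set programmatically on the dskit config structs. A minimal sketch: the field names are the ones used in this patch where shown, but kv.Config's Store field and the flagext.StringSliceCSV literal are assumptions about dskit not visible here, and the store name "memberlist" and zone "zone-c" are placeholder values.

import (
	"time"

	"github.com/grafana/dskit/flagext"
	"github.com/grafana/dskit/kv"
	"github.com/grafana/dskit/ring"
)

func exampleLifecyclerConfig() ring.LifecyclerConfig {
	return ring.LifecyclerConfig{
		RingConfig: ring.Config{
			KVStore:              kv.Config{Store: "memberlist"},
			HeartbeatTimeout:     time.Minute,
			ReplicationFactor:    3,
			ZoneAwarenessEnabled: true,
			// -distributor.excluded-zones: instances in these zones are
			// filtered out of the ring view.
			ExcludedZones: flagext.StringSliceCSV{"zone-c"},
		},
		NumTokens:       128,
		HeartbeatPeriod: 5 * time.Second,
		// -ingester.min-ready-duration: now counted after the readiness checks
		// pass rather than from startup; the default dropped from 1m to 15s.
		MinReadyDuration: 15 * time.Second,
		// -ingester.readiness-check-ring-health: when false, readiness checks
		// only this instance instead of every ring member.
		ReadinessCheckRingHealth: true,
		UnregisterOnShutdown:     true,
	}
}

Leaving ReadinessCheckRingHealth enabled keeps the existing behaviour; per the docs above, it should be disabled only when multiple instances can be rolled out simultaneously.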
From a98a506dc2008354c15167c853d8630ceaa233d7 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Tue, 2 Nov 2021 16:34:58 -0500 Subject: [PATCH 4/6] Wrap multi-tenant alert manager lifecycler registerer with cortex prefix Signed-off-by: Tyler Reid --- pkg/alertmanager/multitenant.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/alertmanager/multitenant.go b/pkg/alertmanager/multitenant.go index 208bb24fec0..46fa8a8931d 100644 --- a/pkg/alertmanager/multitenant.go +++ b/pkg/alertmanager/multitenant.go @@ -409,7 +409,7 @@ func createMultitenantAlertmanager(cfg *MultitenantAlertmanagerConfig, fallbackC delegate = ring.NewLeaveOnStoppingDelegate(delegate, am.logger) delegate = ring.NewAutoForgetDelegate(am.cfg.ShardingRing.HeartbeatTimeout*ringAutoForgetUnhealthyPeriods, delegate, am.logger) - am.ringLifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, RingNameForServer, RingKey, ringStore, delegate, am.logger, am.registry) + am.ringLifecycler, err = ring.NewBasicLifecycler(lifecyclerCfg, RingNameForServer, RingKey, ringStore, delegate, am.logger, prometheus.WrapRegistererWithPrefix("cortex_", am.registry)) if err != nil { return nil, errors.Wrap(err, "failed to initialize Alertmanager's lifecycler") } From a842683d88dbbb9974ed700f9701fe16a216c4bc Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Tue, 2 Nov 2021 18:10:17 -0500 Subject: [PATCH 5/6] Add changelog for new configs brought in via dskit Signed-off-by: Tyler Reid --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1e949d4f376..0eac17fabdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ## master / unreleased +* [CHANGE] Changed default for `-ingester.min-ready-duration` from 1 minute to 15 seconds. #4539 +* [ENHANCEMENT] Added new ring related config `-ingester.readiness-check-ring-health` when enabled the readiness probe will succeed only after all instances are ACTIVE and healthy in the ring, this is enabled by default. #4539 +* [ENHANCEMENT] Added new ring related config `-distributor.excluded-zones` when set this will exclude the comma-separated zones from the ring, default is "". #4539 * [ENHANCEMENT] Upgraded Docker base images to `alpine:3.14`. #4514 * [ENHANCEMENT] Updated Prometheus to latest. Includes changes from prometheus#9239, adding 15 new functions. Multiple TSDB bugfixes prometheus#9438 & prometheus#9381. #4524 From 324b50b7878b4c9f0a9583e69fd7b76bef407a33 Mon Sep 17 00:00:00 2001 From: Tyler Reid Date: Wed, 3 Nov 2021 00:20:58 -0500 Subject: [PATCH 6/6] Remove whitespace in changelog Signed-off-by: Tyler Reid --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0eac17fabdd..76578bbe973 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ * [CHANGE] Changed default for `-ingester.min-ready-duration` from 1 minute to 15 seconds. #4539 * [ENHANCEMENT] Added new ring related config `-ingester.readiness-check-ring-health` when enabled the readiness probe will succeed only after all instances are ACTIVE and healthy in the ring, this is enabled by default. #4539 -* [ENHANCEMENT] Added new ring related config `-distributor.excluded-zones` when set this will exclude the comma-separated zones from the ring, default is "". #4539 +* [ENHANCEMENT] Added new ring related config `-distributor.excluded-zones` when set this will exclude the comma-separated zones from the ring, default is "". #4539 * [ENHANCEMENT] Upgraded Docker base images to `alpine:3.14`. 
#4514 * [ENHANCEMENT] Updated Prometheus to latest. Includes changes from prometheus#9239, adding 15 new functions. Multiple TSDB bugfixes prometheus#9438 & prometheus#9381. #4524
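The newly vendored github.com/grafana/dskit/time package (imported as dstime in the lifecycler hunks above) replaces the Cortex util ticker helper. A small self-contained sketch of the pattern the lifecycler relies on; the function and variable names here are illustrative only:

import (
	"context"
	"fmt"
	"time"

	dstime "github.com/grafana/dskit/time"
)

// heartbeatLoop shows why a zero heartbeat period disables heartbeating:
// NewDisableableTicker returns a nil channel in that case, and receiving from
// a nil channel blocks forever, so the tick case is simply never selected.
func heartbeatLoop(ctx context.Context, period time.Duration) {
	stop, tick := dstime.NewDisableableTicker(period)
	defer stop()

	for {
		select {
		case <-tick:
			fmt.Println("heartbeat") // placeholder for the real KV store update
		case <-ctx.Done():
			return
		}
	}
}

GetInstanceAddr and the vendored getFirstAddressOf follow the same pattern of taking the logger explicitly, so none of the vendored ring code depends on a global logger.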