@@ -5,6 +5,7 @@ package integration
55import (
66 "crypto/x509"
77 "crypto/x509/pkix"
8+ "fmt"
89 "os"
910 "path/filepath"
1011 "testing"
@@ -109,16 +110,16 @@ func testSingleBinaryEnv(t *testing.T, tlsEnabled bool) {
109110
110111func newSingleBinary (name string , servername string , join string ) * e2ecortex.CortexService {
111112 flags := map [string ]string {
112- "-ingester.final-sleep" : "0s" ,
113- "-ingester.join-after" : "0s" , // join quickly
114- "-ingester.min-ready-duration" : "0s" ,
115- "-ingester.concurrent-flushes" : "10" ,
116- "-ingester.max-transfer-retries" : "0" , // disable
117- "-ingester.num-tokens" : "512" ,
118- "-ingester.observe-period" : "5s" , // to avoid conflicts in tokens
119- "-ring.store" : "memberlist" ,
120- "-memberlist.bind-port" : "8000" ,
121- "-memberlist.pullpush-interval " : "3s " , // speed up state convergence to make test faster and avoid flakiness
113+ "-ingester.final-sleep" : "0s" ,
114+ "-ingester.join-after" : "0s" , // join quickly
115+ "-ingester.min-ready-duration" : "0s" ,
116+ "-ingester.concurrent-flushes" : "10" ,
117+ "-ingester.max-transfer-retries" : "0" , // disable
118+ "-ingester.num-tokens" : "512" ,
119+ "-ingester.observe-period" : "5s" , // to avoid conflicts in tokens
120+ "-ring.store" : "memberlist" ,
121+ "-memberlist.bind-port" : "8000" ,
122+ "-memberlist.left-ingesters-timeout " : "600s " , // effectively disable
122123 }
123124
124125 if join != "" {
@@ -145,3 +146,77 @@ func newSingleBinary(name string, servername string, join string) *e2ecortex.Cor
145146 serv .SetBackoff (backOff )
146147 return serv
147148}
149+
150+ func TestSingleBinaryWithMemberlistScaling (t * testing.T ) {
151+ s , err := e2e .NewScenario (networkName )
152+ require .NoError (t , err )
153+ defer s .Close ()
154+
155+ dynamo := e2edb .NewDynamoDB ()
156+ require .NoError (t , s .StartAndWaitReady (dynamo ))
157+ require .NoError (t , writeFileToSharedDir (s , cortexSchemaConfigFile , []byte (cortexSchemaConfigYaml )))
158+
159+ // Scale up instances. These numbers seem enough to reliably reproduce some unwanted
160+ // consequences of slow propagation, such as missing tombstones.
161+
162+ maxCortex := 20
163+ minCortex := 3
164+ instances := make ([]* e2ecortex.CortexService , 0 )
165+
166+ for i := 0 ; i < maxCortex ; i ++ {
167+ name := fmt .Sprintf ("cortex-%d" , i + 1 )
168+ join := ""
169+ if i > 0 {
170+ join = fmt .Sprintf ("%s-cortex-1:8000" , networkName )
171+ }
172+ c := newSingleBinary (name , "" , join )
173+ require .NoError (t , s .StartAndWaitReady (c ))
174+ instances = append (instances , c )
175+ }
176+
177+ // Sanity check the ring membership and give each instance time to see every other instance.
178+
179+ for _ , c := range instances {
180+ require .NoError (t , c .WaitSumMetrics (e2e .Equals (float64 (maxCortex )), "cortex_ring_members" ))
181+ require .NoError (t , c .WaitSumMetrics (e2e .Equals (0 ), "memberlist_client_kv_store_value_tombstones" ))
182+ }
183+
184+ // Scale down as fast as possible but cleanly, in order to send out tombstones.
185+
186+ for len (instances ) > minCortex {
187+ i := len (instances ) - 1
188+ require .NoError (t , s .Stop (instances [i ]))
189+ instances = instances [:i ]
190+ }
191+
192+ // If all is working as expected, then tombstones should have propagated easily within this time period.
193+ // The logging is mildly spammy, but it has proven extremely useful for debugging convergence cases.
194+ // We don't use WaitSumMetrics [over all instances] here so we can log the per-instance metrics.
195+
196+ expectedRingMembers := float64 (minCortex )
197+ expectedTombstones := float64 (maxCortex - minCortex )
198+
199+ require .Eventually (t , func () bool {
200+ ok := true
201+ for _ , c := range instances {
202+ metrics , err := c .SumMetrics ([]string {
203+ "cortex_ring_members" , "memberlist_client_kv_store_value_tombstones" ,
204+ })
205+ require .NoError (t , err )
206+ t .Logf ("%s: cortex_ring_members=%f memberlist_client_kv_store_value_tombstones=%f\n " ,
207+ c .Name (), metrics [0 ], metrics [1 ])
208+
209+ // Don't short circuit the check, so we log the state for all instances.
210+ if metrics [0 ] != expectedRingMembers {
211+ ok = false
212+ }
213+ if metrics [1 ] != expectedTombstones {
214+ ok = false
215+ }
216+
217+ }
218+ return ok
219+ }, 30 * time .Second , 2 * time .Second ,
220+ "expected all instances to have %f ring members and %f tombstones" ,
221+ expectedRingMembers , expectedTombstones )
222+ }
0 commit comments