Skip to content

Commit d4198ca

Browse files
Modifying #4422 to retry only for GET requests. Retrying AlertManager UnaryPath GET Requests on next replica if one fail.
Signed-off-by: Krishna Teja Puttagunta <[email protected]>
1 parent c815b3c commit d4198ca

File tree

3 files changed

+47
-6
lines changed

3 files changed

+47
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2088,3 +2088,4 @@ This release has several exciting features, the most notable of them being setti
20882088
* [FEATURE] You can specify "heap ballast" to reduce Go GC Churn #1489
20892089
* [BUGFIX] HA Tracker no longer always makes a request to Consul/Etcd when a request is not from the active replica #1516
20902090
* [BUGFIX] Queries are now correctly cancelled by the query-frontend #1508
2091+
* [ENHANCEMENT] AlertManager: Retrying AlertManager Get Requests (Get Alertmanager status, Get Alertmanager Receivers) on next replica on error #4840

pkg/alertmanager/distributor.go

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -254,14 +254,36 @@ func (d *Distributor) doUnary(userID string, w http.ResponseWriter, r *http.Requ
254254
defer sp.Finish()
255255
// Until we have a mechanism to combine the results from multiple alertmanagers,
256256
// we forward the request to only only of the alertmanagers.
257-
amDesc := replicationSet.Instances[rand.Intn(len(replicationSet.Instances))]
258-
resp, err := d.doRequest(ctx, amDesc, req)
259-
if err != nil {
260-
respondFromError(err, w, logger)
261-
return
257+
258+
var instances []ring.InstanceDesc
259+
if req.GetMethod() == "GET" && d.isUnaryReadPath(r.URL.Path) {
260+
instances = replicationSet.Instances
261+
// Randomize the list of instances to not always query the same one.
262+
rand.Shuffle(len(instances), func(i, j int) {
263+
instances[i], instances[j] = instances[j], instances[i]
264+
})
265+
} else {
266+
//Picking 1 instance at Random for Non-Get Unary Read requests, as shuffling through large number of instances might increase complexity
267+
randN := rand.Intn(len(replicationSet.Instances))
268+
instances = replicationSet.Instances[randN : randN+1]
262269
}
263270

264-
respondFromHTTPGRPCResponse(w, resp)
271+
var lastErr error
272+
for _, instance := range instances {
273+
resp, err := d.doRequest(ctx, instance, req)
274+
// storing the last error message
275+
if err != nil {
276+
lastErr = err
277+
continue
278+
}
279+
// Return on the first succeeded request
280+
respondFromHTTPGRPCResponse(w, resp)
281+
return
282+
}
283+
// throwing the last error if the for loop finish without succeeding
284+
if lastErr != nil {
285+
respondFromError(lastErr, w, logger)
286+
}
265287
}
266288

267289
func respondFromError(err error, w http.ResponseWriter, logger log.Logger) {

pkg/alertmanager/distributor_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,24 @@ func TestDistributor_DistributeRequest(t *testing.T) {
196196
expStatusCode: http.StatusOK,
197197
expectedTotalCalls: 1,
198198
route: "/status",
199+
}, {
200+
name: "Read /status should try all alert managers on error",
201+
numAM: 3,
202+
numHappyAM: 0,
203+
replicationFactor: 3,
204+
isRead: true,
205+
expStatusCode: http.StatusInternalServerError,
206+
expectedTotalCalls: 3,
207+
route: "/status",
208+
}, {
209+
name: "Read /status is sent to 3 AM when 2 are not happy",
210+
numAM: 3,
211+
numHappyAM: 1,
212+
replicationFactor: 3,
213+
isRead: true,
214+
expStatusCode: http.StatusOK,
215+
expectedTotalCalls: 3,
216+
route: "/status",
199217
}, {
200218
name: "Write /status not supported",
201219
numAM: 5,

0 commit comments

Comments
 (0)