-
Notifications
You must be signed in to change notification settings - Fork 34
feat: parse server response and react to error messages #281
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e7bfb87
ab0e3c5
6d5679d
161bb54
fc16906
c138a40
8923a9f
b4a0c11
1576234
e2fbd6e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ import ( | |
| "bytes" | ||
| "compress/gzip" | ||
| "context" | ||
| "encoding/json" | ||
| "errors" | ||
| "fmt" | ||
| "io" | ||
|
|
@@ -30,11 +31,20 @@ import ( | |
| "time" | ||
| ) | ||
|
|
||
| type jsonResult struct { | ||
| Errors []jsonError `json:"errors,omitempty"` | ||
| } | ||
|
|
||
| type jsonError struct { | ||
| Message string `json:"message"` | ||
| Document string `json:"document,omitempty"` | ||
| } | ||
|
|
||
| // ForwardApmData receives agent data as it comes in and posts it to the APM server. | ||
| // Stop checking for, and sending agent data when the function invocation | ||
| // has completed, signaled via a channel. | ||
| func (c *Client) ForwardApmData(ctx context.Context, metadataContainer *MetadataContainer) error { | ||
| if c.Status == Failing { | ||
| if c.IsUnhealthy() { | ||
| return nil | ||
| } | ||
| for { | ||
|
|
@@ -59,7 +69,7 @@ func (c *Client) ForwardApmData(ctx context.Context, metadataContainer *Metadata | |
|
|
||
| // FlushAPMData reads all the apm data in the apm data channel and sends it to the APM server. | ||
| func (c *Client) FlushAPMData(ctx context.Context) { | ||
| if c.Status == Failing { | ||
| if c.IsUnhealthy() { | ||
| c.logger.Debug("Flush skipped - Transport failing") | ||
| return | ||
| } | ||
|
|
@@ -86,7 +96,7 @@ func (c *Client) FlushAPMData(ctx context.Context) { | |
| func (c *Client) PostToApmServer(ctx context.Context, agentData AgentData) error { | ||
| // todo: can this be a streaming or streaming style call that keeps the | ||
| // connection open across invocations? | ||
| if c.Status == Failing { | ||
| if c.IsUnhealthy() { | ||
| return errors.New("transport status is unhealthy") | ||
| } | ||
|
|
||
|
|
@@ -131,61 +141,124 @@ func (c *Client) PostToApmServer(ctx context.Context, agentData AgentData) error | |
| c.logger.Debug("Sending data chunk to APM server") | ||
| resp, err := c.client.Do(req) | ||
| if err != nil { | ||
| c.SetApmServerTransportState(ctx, Failing) | ||
| c.UpdateStatus(ctx, Failing) | ||
| return fmt.Errorf("failed to post to APM server: %v", err) | ||
| } | ||
|
|
||
| //Read the response body | ||
| defer resp.Body.Close() | ||
| body, err := io.ReadAll(resp.Body) | ||
| if err != nil { | ||
| c.SetApmServerTransportState(ctx, Failing) | ||
| return fmt.Errorf("failed to read the response body after posting to the APM server") | ||
|
|
||
| // On success, the server will respond with a 202 Accepted status code and no body. | ||
| if resp.StatusCode == http.StatusAccepted { | ||
| c.UpdateStatus(ctx, Healthy) | ||
| return nil | ||
| } | ||
|
|
||
| // RateLimited | ||
| if resp.StatusCode == http.StatusTooManyRequests { | ||
| c.logger.Warnf("Transport has been rate limited: response status code: %d", resp.StatusCode) | ||
| c.UpdateStatus(ctx, RateLimited) | ||
| return nil | ||
| } | ||
|
|
||
| if resp.StatusCode == http.StatusUnauthorized { | ||
| jErr := jsonResult{} | ||
| if err := json.NewDecoder(resp.Body).Decode(&jErr); err != nil { | ||
| // non critical error. | ||
| // Log a warning and continue. | ||
| c.logger.Warnf("failed to decode response body: %v", err) | ||
| } | ||
|
|
||
| // Auth errors | ||
| if resp.StatusCode == http.StatusUnauthorized || resp.StatusCode == http.StatusForbidden { | ||
| c.logger.Warnf("Authentication with the APM server failed: response status code: %d", resp.StatusCode) | ||
| c.logger.Debugf("APM server response body: %v", string(body)) | ||
| for _, err := range jErr.Errors { | ||
| c.logger.Warnf("failed to authenticate: document %s: message: %s", err.Document, err.Message) | ||
| } | ||
| c.UpdateStatus(ctx, Failing) | ||
| return nil | ||
| } | ||
|
|
||
| // ClientErrors | ||
| if resp.StatusCode >= 400 && resp.StatusCode < 500 { | ||
| c.logger.Warnf("client error: response status code: %d", resp.StatusCode) | ||
| for _, err := range jErr.Errors { | ||
| c.logger.Warnf("client error: document %s: message: %s", err.Document, err.Message) | ||
| } | ||
| c.UpdateStatus(ctx, ClientFailing) | ||
| return nil | ||
| } | ||
|
|
||
| c.SetApmServerTransportState(ctx, Healthy) | ||
| c.logger.Debug("Transport status set to healthy") | ||
| c.logger.Debugf("APM server response body: %v", string(body)) | ||
| c.logger.Debugf("APM server response status code: %v", resp.StatusCode) | ||
| // critical errors | ||
| if resp.StatusCode == http.StatusInternalServerError || resp.StatusCode == http.StatusServiceUnavailable { | ||
| c.logger.Warnf("failed to post data to APM server: response status code: %d", resp.StatusCode) | ||
| for _, err := range jErr.Errors { | ||
| c.logger.Warnf("critical error: document %s: message: %s", err.Document, err.Message) | ||
| } | ||
| c.UpdateStatus(ctx, Failing) | ||
| return nil | ||
| } | ||
|
|
||
| c.logger.Warnf("unhandled status code: %d", resp.StatusCode) | ||
| return nil | ||
| } | ||
|
|
||
| // SetApmServerTransportState takes a state of the APM server transport and updates | ||
| // IsUnhealthy returns true if the apmproxy is not healthy. | ||
| func (c *Client) IsUnhealthy() bool { | ||
| c.mu.RLock() | ||
| defer c.mu.RUnlock() | ||
| return c.Status == Failing | ||
simitt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| // UpdateStatus takes a state of the APM server transport and updates | ||
| // the current state of the transport. For a change to a failing state, the grace period | ||
| // is calculated and a go routine is started that waits for that period to complete | ||
| // before changing the status to "pending". This would allow a subsequent send attempt | ||
| // to the APM server. | ||
| // | ||
| // This function is public for use in tests. | ||
| func (c *Client) SetApmServerTransportState(ctx context.Context, status Status) { | ||
| func (c *Client) UpdateStatus(ctx context.Context, status Status) { | ||
| // Reduce lock contention as UpdateStatus is called on every | ||
| // successful request | ||
| c.mu.RLock() | ||
| if status == c.Status { | ||
| c.mu.RUnlock() | ||
| return | ||
| } | ||
| c.mu.RUnlock() | ||
|
|
||
| switch status { | ||
| case Healthy: | ||
| c.mu.Lock() | ||
kruskall marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| if c.Status == status { | ||
| return | ||
| } | ||
| c.Status = status | ||
| c.logger.Debugf("APM server Transport status set to %s", c.Status) | ||
| c.ReconnectionCount = -1 | ||
| c.mu.Unlock() | ||
| case RateLimited, ClientFailing: | ||
| // No need to start backoff, this is a temporary status. It usually | ||
| // means we went over the limit of events/s. | ||
|
Comment on lines
+238
to
+239
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When would ClientFailing be temporary? From what I can see above, it's either due to auth failure (probably not temporary without user intervention?) or some validation error (probably implies a bug in the agent or server?)
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Ah, good point! I've made auth errors a critical failure
From what I could see from the middlewares in the APM server repository, this would happen on data decoding/validation errors, request body too large or invalid query. Those are errors tied to a specific request, I don't think we should trigger a backoff and associated delay.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
good point, I think this is fine as is now. |
||
| c.mu.Lock() | ||
| c.Status = status | ||
| c.logger.Debugf("APM server Transport status set to %s", c.Status) | ||
| c.mu.Unlock() | ||
| case Failing: | ||
| c.mu.Lock() | ||
| c.Status = status | ||
| c.logger.Debugf("APM server Transport status set to %s", c.Status) | ||
| c.ReconnectionCount++ | ||
| gracePeriodTimer := time.NewTimer(c.ComputeGracePeriod()) | ||
| c.logger.Debugf("Grace period entered, reconnection count : %d", c.ReconnectionCount) | ||
| c.mu.Unlock() | ||
|
|
||
| go func() { | ||
| select { | ||
| case <-gracePeriodTimer.C: | ||
| c.logger.Debug("Grace period over - timer timed out") | ||
| case <-ctx.Done(): | ||
| c.logger.Debug("Grace period over - context done") | ||
| } | ||
| c.Status = Pending | ||
| c.mu.Lock() | ||
| c.Status = Started | ||
| c.logger.Debugf("APM server Transport status set to %s", c.Status) | ||
| c.mu.Unlock() | ||
| }() | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.