diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index 7ce834cb..56b31636 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -25,6 +25,7 @@ https://github.com/elastic/apm-aws-lambda/compare/v1.4.0...main[View commits] [float] ===== Bug fixes - Log a warning, instead of failing a Lambda function, if auth retrieval from AWS Secrets Manager fails. Reporting APM data will not work, but the Lambda function invocations will proceed. {lambda-pull}401[401] +- Fix incorrect proxy transaction handling at shutdown due to not flushing the data before processing shutdown event. {lambda-pull}412[412]. [float] ===== Features diff --git a/app/run.go b/app/run.go index 9fa2d08f..940b12ef 100644 --- a/app/run.go +++ b/app/run.go @@ -96,6 +96,28 @@ func (app *App) Run(ctx context.Context) error { backgroundDataSendWg.Wait() if event.EventType == extension.Shutdown { app.logger.Infof("Exiting due to shutdown event with reason %s", event.ShutdownReason) + // Since we have waited for the processEvent loop to finish we + // already have received all the data we can from the agent. So, we + // flush all the data to make sure that shutdown can correctly deduce + // any pending transactions. + app.apmClient.FlushAPMData(ctx) + // At shutdown we can not expect platform.runtimeDone events to be + // reported for the remaining invocations. If we haven't received the + // transaction from agents at this point then it is safe to assume + // that the function failed. We will create proxy transaction for all + // invocations that haven't received a full transaction from the agent + // yet. If extension doesn't have enough CPU time it is possible that + // the extension might not receive the shutdown signal for timeouts + // or runtime crashes. In these cases we will miss the transaction. + // + // TODO (lahsivjar): Any partial transaction remaining will be added + // to a new batch by OnShutdown and flushed from the defer call to + // flush all data when this function exits. This causes 2 triggers + // of flush, we can optimize this by clearing all buffered channel + // then calling OnShutdown and finally flushing any remaining data. + if err := app.batch.OnShutdown(event.ShutdownReason); err != nil { + app.logger.Errorf("Error finalizing invocation on shutdown: %v", err) + } return nil } if app.apmClient.ShouldFlush() { @@ -153,21 +175,6 @@ func (app *App) processEvent( event.Timestamp, ) case extension.Shutdown: - // At shutdown we can not expect platform.runtimeDone events to be reported - // for the remaining invocations. If we haven't received the transaction - // from agents at this point then it is safe to assume that the function - // failed. We will create proxy transaction for all invocations that - // haven't received a full transaction from the agent yet. If extension - // doesn't have enough CPU time it is possible that the extension might - // not receive the shutdown signal for timeouts or runtime crashes. In - // these cases we will miss the transaction. - app.logger.Debugf("Received shutdown event with reason %s", event.ShutdownReason) - defer func() { - if err := app.batch.OnShutdown(event.ShutdownReason); err != nil { - app.logger.Errorf("Error finalizing invocation on shutdown: %v", err) - } - }() - // platform.report metric (and some other metrics) might not have been // reported by the logs API even till shutdown. At shutdown we will make // a last attempt to collect and report these metrics. However, it is