From e66fd25bbbafe2075e401cc9dc90016f607e2317 Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Wed, 21 Sep 2022 14:08:55 +0800 Subject: [PATCH 01/15] Introduce apm data type --- apmproxy/apmserver.go | 4 ++-- apmproxy/apmserver_test.go | 20 ++++++++++---------- apmproxy/client.go | 4 ++-- apmproxy/metadata.go | 2 +- apmproxy/metadata_test.go | 7 ++++--- apmproxy/option.go | 2 +- apmproxy/receiver.go | 19 ++++++++++++++++--- logsapi/functionlogs.go | 8 ++++---- logsapi/functionlogs_test.go | 1 + logsapi/metrics.go | 8 ++++---- logsapi/metrics_test.go | 11 +++++++---- 11 files changed, 52 insertions(+), 34 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index 35fb4396..fa3d9ee5 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -93,7 +93,7 @@ func (c *Client) FlushAPMData(ctx context.Context) { // The function compresses the APM agent data, if it's not already compressed. // It sets the APM transport status to failing upon errors, as part of the backoff // strategy. -func (c *Client) PostToApmServer(ctx context.Context, agentData AgentData) error { +func (c *Client) PostToApmServer(ctx context.Context, agentData APMData) error { // todo: can this be a streaming or streaming style call that keeps the // connection open across invocations? if c.IsUnhealthy() { @@ -283,7 +283,7 @@ func (c *Client) ComputeGracePeriod() time.Duration { // EnqueueAPMData adds a AgentData struct to the agent data channel, effectively queueing for a send // to the APM server. -func (c *Client) EnqueueAPMData(agentData AgentData) { +func (c *Client) EnqueueAPMData(agentData APMData) { select { case c.DataChannel <- agentData: c.logger.Debug("Adding agent data to buffer to be sent to apm server") diff --git a/apmproxy/apmserver_test.go b/apmproxy/apmserver_test.go index 3f028ff0..eb34e010 100644 --- a/apmproxy/apmserver_test.go +++ b/apmproxy/apmserver_test.go @@ -57,7 +57,7 @@ func TestPostToApmServerDataCompressed(t *testing.T) { // Create AgentData struct with compressed data data, _ := io.ReadAll(pr) - agentData := apmproxy.AgentData{Data: data, ContentEncoding: "gzip"} + agentData := apmproxy.APMData{Data: data, ContentEncoding: "gzip"} // Create apm server and handler apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -83,7 +83,7 @@ func TestPostToApmServerDataCompressed(t *testing.T) { func TestPostToApmServerDataNotCompressed(t *testing.T) { s := "A long time ago in a galaxy far, far away..." 
body := []byte(s) - agentData := apmproxy.AgentData{Data: body, ContentEncoding: ""} + agentData := apmproxy.APMData{Data: body, ContentEncoding: ""} // Compress the data, so it can be compared with what // the apm server receives @@ -251,7 +251,7 @@ func TestEnterBackoffFromHealthy(t *testing.T) { // Create AgentData struct with compressed data data, _ := io.ReadAll(pr) - agentData := apmproxy.AgentData{Data: data, ContentEncoding: "gzip"} + agentData := apmproxy.APMData{Data: data, ContentEncoding: "gzip"} // Create apm server and handler apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -302,7 +302,7 @@ func TestEnterBackoffFromFailing(t *testing.T) { // Create AgentData struct with compressed data data, _ := io.ReadAll(pr) - agentData := apmproxy.AgentData{Data: data, ContentEncoding: "gzip"} + agentData := apmproxy.APMData{Data: data, ContentEncoding: "gzip"} // Create apm server and handler apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -356,7 +356,7 @@ func TestAPMServerRecovery(t *testing.T) { // Create AgentData struct with compressed data data, _ := io.ReadAll(pr) - agentData := apmproxy.AgentData{Data: data, ContentEncoding: "gzip"} + agentData := apmproxy.APMData{Data: data, ContentEncoding: "gzip"} // Create apm server and handler apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -409,7 +409,7 @@ func TestAPMServerAuthFails(t *testing.T) { // Create AgentData struct with compressed data data, _ := io.ReadAll(pr) - agentData := apmproxy.AgentData{Data: data, ContentEncoding: "gzip"} + agentData := apmproxy.APMData{Data: data, ContentEncoding: "gzip"} // Create apm server and handler apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -453,7 +453,7 @@ func TestAPMServerRatelimit(t *testing.T) { // Create AgentData struct with compressed data data, _ := io.ReadAll(pr) - agentData := apmproxy.AgentData{Data: data, ContentEncoding: "gzip"} + agentData := apmproxy.APMData{Data: data, ContentEncoding: "gzip"} // Create apm server and handler var shouldSucceed atomic.Bool @@ -506,7 +506,7 @@ func TestAPMServerClientFail(t *testing.T) { // Create AgentData struct with compressed data data, _ := io.ReadAll(pr) - agentData := apmproxy.AgentData{Data: data, ContentEncoding: "gzip"} + agentData := apmproxy.APMData{Data: data, ContentEncoding: "gzip"} // Create apm server and handler var shouldSucceed atomic.Bool @@ -558,7 +558,7 @@ func TestContinuedAPMServerFailure(t *testing.T) { // Create AgentData struct with compressed data data, _ := io.ReadAll(pr) - agentData := apmproxy.AgentData{Data: data, ContentEncoding: "gzip"} + agentData := apmproxy.APMData{Data: data, ContentEncoding: "gzip"} // Create apm server and handler apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -615,7 +615,7 @@ func BenchmarkPostToAPM(b *testing.B) { {"transaction": { "id": "00xxxxFFaaaa1234", "trace_id": "0123456789abcdef0123456789abcdef", "name": "amqp receive", "parent_id": "abcdefabcdef01234567", "type": "messaging", "duration": 3, "span_count": { "started": 1 }, "context": {"message": {"queue": { "name": "new_users"}, "age":{ "ms": 1577958057123}, "headers": {"user_id": "1ax3", "involved_services": ["user", "auth"]}, "body": "user created", "routing_key": "user-created-transaction"}},"session":{"id":"sunday","sequence":123}}} {"transaction": { "name": 
"july-2021-delete-after-july-31", "type": "lambda", "result": "success", "id": "142e61450efb8574", "trace_id": "eb56529a1f461c5e7e2f66ecb075e983", "subtype": null, "action": null, "duration": 38.853, "timestamp": 1631736666365048, "sampled": true, "context": { "cloud": { "origin": { "account": { "id": "abc123" }, "provider": "aws", "region": "us-east-1", "service": { "name": "serviceName" } } }, "service": { "origin": { "id": "abc123", "name": "service-name", "version": "1.0" } }, "user": {}, "tags": {}, "custom": { } }, "sync": true, "span_count": { "started": 0 }, "outcome": "unknown", "faas": { "coldstart": false, "execution": "2e13b309-23e1-417f-8bf7-074fc96bc683", "trigger": { "request_id": "FuH2Cir_vHcEMUA=", "type": "http" } }, "sample_rate": 1 } } `) - agentData := apmproxy.AgentData{Data: benchBody, ContentEncoding: ""} + agentData := apmproxy.APMData{Data: benchBody, ContentEncoding: ""} b.ResetTimer() for i := 0; i < b.N; i++ { diff --git a/apmproxy/client.go b/apmproxy/client.go index 7b161abe..6899133f 100644 --- a/apmproxy/client.go +++ b/apmproxy/client.go @@ -52,7 +52,7 @@ const ( type Client struct { mu sync.RWMutex bufferPool sync.Pool - DataChannel chan AgentData + DataChannel chan APMData client *http.Client Status Status ReconnectionCount int @@ -72,7 +72,7 @@ func NewClient(opts ...Option) (*Client, error) { bufferPool: sync.Pool{New: func() interface{} { return &bytes.Buffer{} }}, - DataChannel: make(chan AgentData, defaultAgentBufferSize), + DataChannel: make(chan APMData, defaultAgentBufferSize), client: &http.Client{ Transport: http.DefaultTransport.(*http.Transport).Clone(), }, diff --git a/apmproxy/metadata.go b/apmproxy/metadata.go index d0becec0..2f10a2ae 100644 --- a/apmproxy/metadata.go +++ b/apmproxy/metadata.go @@ -31,7 +31,7 @@ type MetadataContainer struct { // ProcessMetadata return a byte array containing the Metadata marshaled in JSON // In case we want to update the Metadata values, usage of https://github.com/tidwall/sjson is advised -func ProcessMetadata(data AgentData) ([]byte, error) { +func ProcessMetadata(data APMData) ([]byte, error) { uncompressedData, err := GetUncompressedBytes(data.Data, data.ContentEncoding) if err != nil { return nil, fmt.Errorf("error uncompressing agent data for metadata extraction: %w", err) diff --git a/apmproxy/metadata_test.go b/apmproxy/metadata_test.go index be88a0f3..0bc3b3c3 100644 --- a/apmproxy/metadata_test.go +++ b/apmproxy/metadata_test.go @@ -21,10 +21,11 @@ import ( "bytes" "compress/gzip" "compress/zlib" - "github.com/elastic/apm-aws-lambda/apmproxy" "io" "testing" + "github.com/elastic/apm-aws-lambda/apmproxy" + "github.com/stretchr/testify/require" ) @@ -133,7 +134,7 @@ func Test_processMetadata(t *testing.T) { for name, tc := range testCases { t.Run(name, func(t *testing.T) { - agentData := apmproxy.AgentData{Data: tc.data(), ContentEncoding: tc.encodingType} + agentData := apmproxy.APMData{Data: tc.data(), ContentEncoding: tc.encodingType} extractedMetadata, err := apmproxy.ProcessMetadata(agentData) if tc.expectError != nil { @@ -167,7 +168,7 @@ func BenchmarkProcessMetadata(b *testing.B) { } for _, bench := range benchmarks { - agentData := apmproxy.AgentData{Data: bench.body, ContentEncoding: ""} + agentData := apmproxy.APMData{Data: bench.body, ContentEncoding: ""} b.Run(bench.name, func(b *testing.B) { b.ReportAllocs() diff --git a/apmproxy/option.go b/apmproxy/option.go index 60b353f5..827ed218 100644 --- a/apmproxy/option.go +++ b/apmproxy/option.go @@ -74,7 +74,7 @@ func 
WithSendStrategy(strategy SendStrategy) Option { // WithAgentDataBufferSize sets the agent data buffer size. func WithAgentDataBufferSize(size int) Option { return func(c *Client) { - c.DataChannel = make(chan AgentData, size) + c.DataChannel = make(chan APMData, size) } } diff --git a/apmproxy/receiver.go b/apmproxy/receiver.go index aca911f4..b385c9b1 100644 --- a/apmproxy/receiver.go +++ b/apmproxy/receiver.go @@ -29,12 +29,24 @@ import ( "time" ) -type AgentData struct { +// APMDataType represents source of APMData +type APMDataType string + +const ( + // Agent data type represents APMData collected from APM agents + Agent APMDataType = "agent" + // Lambda data type represents APMData collected from logs API + Lambda APMDataType = "lambda" +) + +// APMData represents data to be sent to APMServer +type APMData struct { Data []byte + Type APMDataType ContentEncoding string } -// StartHttpServer starts the server listening for APM agent data. +// StartReceiver starts the server listening for APM agent data. func (c *Client) StartReceiver() error { mux := http.NewServeMux() @@ -123,8 +135,9 @@ func (c *Client) handleIntakeV2Events() func(w http.ResponseWriter, r *http.Requ agentFlushed := r.URL.Query().Get("flushed") == "true" - agentData := AgentData{ + agentData := APMData{ Data: rawBytes, + Type: Agent, ContentEncoding: r.Header.Get("Content-Encoding"), } diff --git a/logsapi/functionlogs.go b/logsapi/functionlogs.go index 77298890..f5e2f594 100644 --- a/logsapi/functionlogs.go +++ b/logsapi/functionlogs.go @@ -92,9 +92,9 @@ func ProcessFunctionLog( requestID string, invokedFnArn string, log LogEvent, -) (apmproxy.AgentData, error) { +) (apmproxy.APMData, error) { if metadataContainer == nil || len(metadataContainer.Metadata) == 0 { - return apmproxy.AgentData{}, errors.New("metadata is required") + return apmproxy.APMData{}, errors.New("metadata is required") } lc := logContainer{ @@ -111,7 +111,7 @@ func ProcessFunctionLog( var jsonWriter fastjson.Writer if err := lc.MarshalFastJSON(&jsonWriter); err != nil { - return apmproxy.AgentData{}, err + return apmproxy.APMData{}, err } capacity := len(metadataContainer.Metadata) + jsonWriter.Size() + 1 @@ -120,5 +120,5 @@ func ProcessFunctionLog( logData = append(logData, '\n') logData = append(logData, jsonWriter.Bytes()...) 
- return apmproxy.AgentData{Data: logData}, nil + return apmproxy.APMData{Type: apmproxy.Lambda, Data: logData}, nil } diff --git a/logsapi/functionlogs_test.go b/logsapi/functionlogs_test.go index 5aa4034b..e8658a9e 100644 --- a/logsapi/functionlogs_test.go +++ b/logsapi/functionlogs_test.go @@ -50,5 +50,6 @@ func TestProcessFunctionLog(t *testing.T) { apmData, err := ProcessFunctionLog(metadataContainer, reqID, invokedFnArn, event) require.NoError(t, err) + assert.Equal(t, apmproxy.Lambda, apmData.Type) assert.Equal(t, expectedData, string(apmData.Data)) } diff --git a/logsapi/metrics.go b/logsapi/metrics.go index 6108b22c..6aeac9d6 100644 --- a/logsapi/metrics.go +++ b/logsapi/metrics.go @@ -63,10 +63,10 @@ func (mc MetricsContainer) MarshalFastJSON(json *fastjson.Writer) error { return nil } -func ProcessPlatformReport(metadataContainer *apmproxy.MetadataContainer, functionData *extension.NextEventResponse, platformReport LogEvent) (apmproxy.AgentData, error) { +func ProcessPlatformReport(metadataContainer *apmproxy.MetadataContainer, functionData *extension.NextEventResponse, platformReport LogEvent) (apmproxy.APMData, error) { if metadataContainer == nil || len(metadataContainer.Metadata) == 0 { - return apmproxy.AgentData{}, errors.New("metadata is not populated") + return apmproxy.APMData{}, errors.New("metadata is not populated") } metricsContainer := MetricsContainer{ @@ -103,7 +103,7 @@ func ProcessPlatformReport(metadataContainer *apmproxy.MetadataContainer, functi var jsonWriter fastjson.Writer if err := metricsContainer.MarshalFastJSON(&jsonWriter); err != nil { - return apmproxy.AgentData{}, err + return apmproxy.APMData{}, err } capacity := len(metadataContainer.Metadata) + jsonWriter.Size() + 1 // 1 for newline @@ -112,5 +112,5 @@ func ProcessPlatformReport(metadataContainer *apmproxy.MetadataContainer, functi metricsData = append(metricsData, []byte("\n")...) metricsData = append(metricsData, jsonWriter.Bytes()...) 
- return apmproxy.AgentData{Data: metricsData}, nil + return apmproxy.APMData{Type: apmproxy.Lambda, Data: metricsData}, nil } diff --git a/logsapi/metrics_test.go b/logsapi/metrics_test.go index 7ca086a5..8de5ccfd 100644 --- a/logsapi/metrics_test.go +++ b/logsapi/metrics_test.go @@ -75,10 +75,11 @@ func TestProcessPlatformReport_Coldstart(t *testing.T) { desiredOutputMetrics := fmt.Sprintf(`{"metricset":{"samples":{"faas.coldstart_duration":{"value":422.9700012207031},"faas.timeout":{"value":5000},"system.memory.total":{"value":1.34217728e+08},"system.memory.actual.free":{"value":5.4525952e+07},"faas.duration":{"value":182.42999267578125},"faas.billed_duration":{"value":183}},"timestamp":%d,"faas":{"coldstart":true,"execution":"6f7f0961f83442118a7af6fe80b88d56","id":"arn:aws:lambda:us-east-2:123456789012:function:custom-runtime"}}}`, timestamp.UnixNano()/1e3) - rawBytes, err := ProcessPlatformReport(&mc, &event, logEvent) + apmData, err := ProcessPlatformReport(&mc, &event, logEvent) require.NoError(t, err) + assert.Equal(t, apmproxy.Lambda, apmData.Type) - requestBytes, err := apmproxy.GetUncompressedBytes(rawBytes.Data, "") + requestBytes, err := apmproxy.GetUncompressedBytes(apmData.Data, "") require.NoError(t, err) out := string(requestBytes) @@ -135,10 +136,11 @@ func TestProcessPlatformReport_NoColdstart(t *testing.T) { desiredOutputMetrics := fmt.Sprintf(`{"metricset":{"samples":{"faas.coldstart_duration":{"value":0},"faas.timeout":{"value":5000},"system.memory.total":{"value":1.34217728e+08},"system.memory.actual.free":{"value":5.4525952e+07},"faas.duration":{"value":182.42999267578125},"faas.billed_duration":{"value":183}},"timestamp":%d,"faas":{"coldstart":false,"execution":"6f7f0961f83442118a7af6fe80b88d56","id":"arn:aws:lambda:us-east-2:123456789012:function:custom-runtime"}}}`, timestamp.UnixNano()/1e3) - rawBytes, err := ProcessPlatformReport(&mc, &event, logEvent) + apmData, err := ProcessPlatformReport(&mc, &event, logEvent) require.NoError(t, err) + assert.Equal(t, apmproxy.Lambda, apmData.Type) - requestBytes, err := apmproxy.GetUncompressedBytes(rawBytes.Data, "") + requestBytes, err := apmproxy.GetUncompressedBytes(apmData.Data, "") require.NoError(t, err) out := string(requestBytes) @@ -183,6 +185,7 @@ func TestProcessPlatformReport_DataCorruption(t *testing.T) { agentData, err := ProcessPlatformReport(mc, nextEventResp, logEvent) require.NoError(t, err) + assert.Equal(t, apmproxy.Lambda, agentData.Type) // process another platform report to ensure the previous payload is not corrupted logEvent.Record.RequestID = "corrupt-req-id" From 5346d96c8d54a30b3c9b41d4ee342beedda109bd Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Thu, 22 Sep 2022 15:46:25 +0800 Subject: [PATCH 02/15] Batch lambda logs API data before sending to APM-Server --- apmproxy/apmserver.go | 92 +++++++++++++++++++++++----- apmproxy/apmserver_test.go | 114 +++++++++++++++++++++++++++++++++++ apmproxy/batch.go | 70 +++++++++++++++++++++ apmproxy/batch_test.go | 61 +++++++++++++++++++ apmproxy/client.go | 3 + apmproxy/metadata.go | 4 -- apmproxy/receiver.go | 4 +- app/run.go | 10 +-- logsapi/event.go | 6 +- logsapi/functionlogs.go | 18 ++---- logsapi/functionlogs_test.go | 8 +-- logsapi/metrics.go | 19 ++---- logsapi/metrics_test.go | 81 ++----------------------- 13 files changed, 347 insertions(+), 143 deletions(-) create mode 100644 apmproxy/batch.go create mode 100644 apmproxy/batch_test.go diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index fa3d9ee5..0d9dcdca 100644 --- 
a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -31,6 +31,8 @@ import ( "time" ) +const defaultBatchSize = 20 + type jsonResult struct { Errors []jsonError `json:"errors,omitempty"` } @@ -40,27 +42,21 @@ type jsonError struct { Document string `json:"document,omitempty"` } -// ForwardApmData receives agent data as it comes in and posts it to the APM server. -// Stop checking for, and sending agent data when the function invocation +// ForwardApmData receives apm data as it comes in and posts it to the APM server. +// Stop checking for, and sending apm data when the function invocation // has completed, signaled via a channel. -func (c *Client) ForwardApmData(ctx context.Context, metadataContainer *MetadataContainer) error { +func (c *Client) ForwardApmData(ctx context.Context) error { if c.IsUnhealthy() { return nil } + batch := NewBatch(defaultBatchSize) for { select { case <-ctx.Done(): c.logger.Debug("Invocation context cancelled, not processing any more agent data") - return nil - case agentData := <-c.DataChannel: - if metadataContainer.Metadata == nil { - metadata, err := ProcessMetadata(agentData) - if err != nil { - return fmt.Errorf("failed to extract metadata from agent payload %w", err) - } - metadataContainer.Metadata = metadata - } - if err := c.PostToApmServer(ctx, agentData); err != nil { + return c.sendBatch(ctx, batch) + case apmData := <-c.DataChannel: + if err := c.forwardAPMDataByType(ctx, apmData, batch); err != nil { return fmt.Errorf("error sending to APM server, skipping: %v", err) } } @@ -74,15 +70,20 @@ func (c *Client) FlushAPMData(ctx context.Context) { return } c.logger.Debug("Flush started - Checking for agent data") + batch := NewBatch(defaultBatchSize) for { select { - case agentData := <-c.DataChannel: + case apmData := <-c.DataChannel: c.logger.Debug("Flush in progress - Processing agent data") - if err := c.PostToApmServer(ctx, agentData); err != nil { + if err := c.forwardAPMDataByType(ctx, apmData, batch); err != nil { c.logger.Errorf("Error sending to APM server, skipping: %v", err) } default: c.logger.Debug("Flush ended - No agent data on buffer") + // Flush any remaining data in batch + if err := c.sendBatch(ctx, batch); err != nil { + c.logger.Errorf("Error sending to APM server, skipping: %v", err) + } return } } @@ -313,3 +314,64 @@ func (c *Client) WaitForFlush() <-chan struct{} { defer c.flushMutex.Unlock() return c.flushCh } + +func (c *Client) forwardAPMDataByType(ctx context.Context, apmData APMData, batch *BatchData) error { + switch apmData.Type { + case Agent: + if len(c.metadata) == 0 { + metadata, err := ProcessMetadata(apmData) + if err != nil { + return fmt.Errorf("failed to extract metadata from agent payload %w", err) + } + c.setMetadata(metadata) + } + return c.PostToApmServer(ctx, apmData) + case Lambda: + if err := batch.Add(apmData); err != nil { + c.logger.Warnf("Dropping data due to error: %v", err) + } + if batch.ShouldFlush() { + return c.sendBatch(ctx, batch) + } + return nil + default: + return errors.New("invalid apm data type") + } +} + +func (c *Client) sendBatch(ctx context.Context, batch *BatchData) error { + // TODO: @lahsivjar keep buffering logs in DataChannel until metadata + // is available. 
+ if len(c.metadata) == 0 { + return errors.New("metadata not yet populated, dropping data") + } + if batch.Size() == 0 { + return nil + } + defer batch.Reset() + return c.PostToApmServer(ctx, c.getWithMetadata(batch)) +} + +func (c *Client) setMetadata(metadata []byte) { + c.metadataMutex.Lock() + defer c.metadataMutex.Unlock() + c.metadata = metadata +} + +func (c *Client) getWithMetadata(batch *BatchData) APMData { + c.metadataMutex.RLock() + defer c.metadataMutex.RUnlock() + + capacity := len(c.metadata) + 1 + for _, d := range batch.agentData { + capacity += len(d.Data) + 1 + } + + withMeta := make([]byte, len(c.metadata), capacity) + copy(withMeta, c.metadata) + for _, d := range batch.agentData { + withMeta = append(withMeta, '\n') + withMeta = append(withMeta, d.Data...) + } + return APMData{Data: withMeta} +} diff --git a/apmproxy/apmserver_test.go b/apmproxy/apmserver_test.go index eb34e010..2e313161 100644 --- a/apmproxy/apmserver_test.go +++ b/apmproxy/apmserver_test.go @@ -18,11 +18,14 @@ package apmproxy_test import ( + "bytes" "compress/gzip" "context" + "fmt" "io" "net/http" "net/http/httptest" + "sync" "sync/atomic" "testing" "time" @@ -586,6 +589,115 @@ func TestContinuedAPMServerFailure(t *testing.T) { assert.Equal(t, apmClient.Status, apmproxy.Failing) } +func TestForwardApmData(t *testing.T) { + receivedReqBodyChan := make(chan []byte) + apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + bytes, _ := io.ReadAll(r.Body) + receivedReqBodyChan <- bytes + w.WriteHeader(http.StatusAccepted) + })) + t.Cleanup(apmServer.Close) + metadata := `{"metadata":{"service":{"agent":{"name":"apm-lambda-extension","version":"1.1.0"},"framework":{"name":"AWS Lambda","version":""},"language":{"name":"python","version":"3.9.8"},"runtime":{"name":"","version":""},"node":{}},"user":{},"process":{"pid":0},"system":{"container":{"id":""},"kubernetes":{"node":{},"pod":{}}},"cloud":{"provider":"","instance":{},"machine":{},"account":{},"project":{},"service":{}}}}` + assertGzipBody := func(expected string) { + buf := bytes.NewReader(<-receivedReqBodyChan) + r, err := gzip.NewReader(buf) + require.NoError(t, err) + out, err := io.ReadAll(r) + require.NoError(t, err) + assert.Equal(t, expected, string(out)) + } + agentData := fmt.Sprintf("%s\n%s", metadata, `{"log": {"message": "test"}}`) + lambdaData := `{"log": {"message": "test"}}` + apmClient, err := apmproxy.NewClient( + apmproxy.WithURL(apmServer.URL), + apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + ) + require.NoError(t, err) + // Override DataChannel to be unbuffered for ease of testing + apmClient.DataChannel = make(chan apmproxy.APMData) + + // Start forwarding APM data + ctx, cancel := context.WithCancel(context.Background()) + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + assert.NoError(t, apmClient.ForwardApmData(ctx)) + }() + + // Populate metadata by sending agent data + apmClient.DataChannel <- apmproxy.APMData{ + Data: []byte(agentData), + Type: apmproxy.Agent, + } + assertGzipBody(agentData) + + // Send lambda logs API data + var expected bytes.Buffer + expected.WriteString(metadata) + // Send multiple lambda logs to batch data + for i := 0; i < 5; i++ { + apmClient.DataChannel <- apmproxy.APMData{ + Data: []byte(lambdaData), + Type: apmproxy.Lambda, + } + expected.WriteByte('\n') + expected.WriteString(lambdaData) + } + + // Trigger a batch send by cancelling context + cancel() + assertGzipBody(expected.String()) + // Wait for ForwardApmData to exit + 
wg.Wait() +} + +func BenchmarkFlushAPMData(b *testing.B) { + // Create apm server and handler + apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if _, err := io.Copy(io.Discard, r.Body); err != nil { + return + } + if err := r.Body.Close(); err != nil { + return + } + w.WriteHeader(202) + if _, err := w.Write([]byte(`{}`)); err != nil { + return + } + })) + b.Cleanup(apmServer.Close) + + apmClient, err := apmproxy.NewClient( + apmproxy.WithURL(apmServer.URL), + apmproxy.WithLogger(zaptest.NewLogger(b).Sugar()), + ) + require.NoError(b, err) + + // Copied from https://github.com/elastic/apm-server/blob/master/testdata/intake-v2/transactions.ndjson. + agentData := []byte(`{"metadata": {"service": {"name": "1234_service-12a3","node": {"configured_name": "node-123"},"version": "5.1.3","environment": "staging","language": {"name": "ecmascript","version": "8"},"runtime": {"name": "node","version": "8.0.0"},"framework": {"name": "Express","version": "1.2.3"},"agent": {"name": "elastic-node","version": "3.14.0"}},"user": {"id": "123user", "username": "bar", "email": "bar@user.com"}, "labels": {"tag0": null, "tag1": "one", "tag2": 2}, "process": {"pid": 1234,"ppid": 6789,"title": "node","argv": ["node","server.js"]},"system": {"hostname": "prod1.example.com","architecture": "x64","platform": "darwin", "container": {"id": "container-id"}, "kubernetes": {"namespace": "namespace1", "pod": {"uid": "pod-uid", "name": "pod-name"}, "node": {"name": "node-name"}}},"cloud":{"account":{"id":"account_id","name":"account_name"},"availability_zone":"cloud_availability_zone","instance":{"id":"instance_id","name":"instance_name"},"machine":{"type":"machine_type"},"project":{"id":"project_id","name":"project_name"},"provider":"cloud_provider","region":"cloud_region","service":{"name":"lambda"}}}} +{"transaction": { "id": "945254c567a5417e", "trace_id": "0123456789abcdef0123456789abcdef", "parent_id": "abcdefabcdef01234567", "type": "request", "duration": 32.592981, "span_count": { "started": 43 }}} +{"transaction": {"id": "4340a8e0df1906ecbfa9", "trace_id": "0acd456789abcdef0123456789abcdef", "name": "GET /api/types","type": "request","duration": 32.592981,"outcome":"success", "result": "success", "timestamp": 1496170407154000, "sampled": true, "span_count": {"started": 17},"context": {"service": {"runtime": {"version": "7.0"}},"page":{"referer":"http://localhost:8000/test/e2e/","url":"http://localhost:8000/test/e2e/general-usecase/"}, "request": {"socket": {"remote_address": "12.53.12.1","encrypted": true},"http_version": "1.1","method": "POST","url": {"protocol": "https:","full": "https://www.example.com/p/a/t/h?query=string#hash","hostname": "www.example.com","port": "8080","pathname": "/p/a/t/h","search": "?query=string","hash": "#hash","raw": "/p/a/t/h?query=string#hash"},"headers": {"user-agent":["Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36","Mozilla Chrome Edge"],"content-type": "text/html","cookie": "c1=v1, c2=v2","some-other-header": "foo","array": ["foo","bar","baz"]},"cookies": {"c1": "v1","c2": "v2"},"env": {"SERVER_SOFTWARE": "nginx","GATEWAY_INTERFACE": "CGI/1.1"},"body": {"str": "hello world","additional": { "foo": {},"bar": 123,"req": "additional information"}}},"response": {"status_code": 200,"headers": {"content-type": "application/json"},"headers_sent": true,"finished": true,"transfer_size":25.8,"encoded_body_size":26.90,"decoded_body_size":29.90}, "user": {"domain": 
"ldap://abc","id": "99","username": "foo"},"tags": {"organization_uuid": "9f0e9d64-c185-4d21-a6f4-4673ed561ec8", "tag2": 12, "tag3": 12.45, "tag4": false, "tag5": null },"custom": {"my_key": 1,"some_other_value": "foo bar","and_objects": {"foo": ["bar","baz"]},"(": "not a valid regex and that is fine"}}}} +{"transaction": { "id": "cdef4340a8e0df19", "trace_id": "0acd456789abcdef0123456789abcdef", "type": "request", "duration": 13.980558, "timestamp": 1532976822281000, "sampled": null, "span_count": { "dropped": 55, "started": 436 }, "marks": {"navigationTiming": {"appBeforeBootstrap": 608.9300000000001,"navigationStart": -21},"another_mark": {"some_long": 10,"some_float": 10.0}, "performance": {}}, "context": { "request": { "socket": { "remote_address": "192.0.1", "encrypted": null }, "method": "POST", "headers": { "user-agent": null, "content-type": null, "cookie": null }, "url": { "protocol": null, "full": null, "hostname": null, "port": null, "pathname": null, "search": null, "hash": null, "raw": null } }, "response": { "headers": { "content-type": null } }, "service": {"environment":"testing","name": "service1","node": {"configured_name": "node-ABC"}, "language": {"version": "2.5", "name": "ruby"}, "agent": {"version": "2.2", "name": "elastic-ruby", "ephemeral_id": "justanid"}, "framework": {"version": "5.0", "name": "Rails"}, "version": "2", "runtime": {"version": "2.5", "name": "cruby"}}},"experience":{"cls":1,"fid":2.0,"tbt":3.4,"longtask":{"count":3,"sum":2.5,"max":1}}}} +{"transaction": { "id": "00xxxxFFaaaa1234", "trace_id": "0123456789abcdef0123456789abcdef", "name": "amqp receive", "parent_id": "abcdefabcdef01234567", "type": "messaging", "duration": 3, "span_count": { "started": 1 }, "context": {"message": {"queue": { "name": "new_users"}, "age":{ "ms": 1577958057123}, "headers": {"user_id": "1ax3", "involved_services": ["user", "auth"]}, "body": "user created", "routing_key": "user-created-transaction"}},"session":{"id":"sunday","sequence":123}}} +{"transaction": { "name": "july-2021-delete-after-july-31", "type": "lambda", "result": "success", "id": "142e61450efb8574", "trace_id": "eb56529a1f461c5e7e2f66ecb075e983", "subtype": null, "action": null, "duration": 38.853, "timestamp": 1631736666365048, "sampled": true, "context": { "cloud": { "origin": { "account": { "id": "abc123" }, "provider": "aws", "region": "us-east-1", "service": { "name": "serviceName" } } }, "service": { "origin": { "id": "abc123", "name": "service-name", "version": "1.0" } }, "user": {}, "tags": {}, "custom": { } }, "sync": true, "span_count": { "started": 0 }, "outcome": "unknown", "faas": { "coldstart": false, "execution": "2e13b309-23e1-417f-8bf7-074fc96bc683", "trigger": { "request_id": "FuH2Cir_vHcEMUA=", "type": "http" } }, "sample_rate": 1 } } +`) + agentAPMData := apmproxy.APMData{Data: agentData, Type: apmproxy.Agent} + + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + apmClient.DataChannel <- agentAPMData + for j := 0; j < 99; j++ { + apmClient.DataChannel <- apmproxy.APMData{ + Data: []byte("this is test log"), + Type: apmproxy.Lambda, + } + } + apmClient.FlushAPMData(context.Background()) + } +} + func BenchmarkPostToAPM(b *testing.B) { // Create apm server and handler apmServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { @@ -600,6 +712,7 @@ func BenchmarkPostToAPM(b *testing.B) { return } })) + b.Cleanup(apmServer.Close) apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), @@ -617,6 +730,7 @@ func 
BenchmarkPostToAPM(b *testing.B) {
 `)
 	agentData := apmproxy.APMData{Data: benchBody, ContentEncoding: ""}
 
+	b.ReportAllocs()
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
 		if err := apmClient.PostToApmServer(context.Background(), agentData); err != nil {
diff --git a/apmproxy/batch.go b/apmproxy/batch.go
new file mode 100644
index 00000000..a56ad926
--- /dev/null
+++ b/apmproxy/batch.go
@@ -0,0 +1,70 @@
+// Licensed to Elasticsearch B.V. under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Elasticsearch B.V. licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package apmproxy
+
+import "errors"
+
+// ErrBatchFull signifies that the batch has reached full
+// capacity and cannot accept more entries.
+var ErrBatchFull = errors.New("batch is full")
+
+// BatchData represents a batch of data without metadata
+// that will be sent to APMServer. BatchData is not safe
+// for concurrent access.
+type BatchData struct {
+	agentData []APMData
+	maxSize int
+}
+
+// NewBatch creates a new BatchData which can accept a
+// maximum number of entries as specified by the argument.
+func NewBatch(maxSize int) *BatchData {
+	return &BatchData{
+		maxSize: maxSize,
+		agentData: make([]APMData, 0, maxSize),
+	}
+}
+
+// Add adds a new entry to the batch. Returns ErrBatchFull
+// if the batch has reached its maximum size.
+func (b *BatchData) Add(d APMData) error {
+	if len(b.agentData) >= b.maxSize {
+		return ErrBatchFull
+	}
+
+	b.agentData = append(b.agentData, d)
+	return nil
+}
+
+// Size returns the number of entries in the batch.
+func (b *BatchData) Size() int {
+	return len(b.agentData)
+}
+
+// ShouldFlush indicates when a batch is ready for flush.
+// A batch is marked as ready for flush once it reaches
+// 90% of its max size.
+func (b *BatchData) ShouldFlush() bool {
+	return len(b.agentData) >= int(float32(b.maxSize)*0.9)
+}
+
+// Reset resets the batch to prepare for a new set of data.
+func (b *BatchData) Reset() {
+	b.agentData = nil
+	b.agentData = make([]APMData, 0, b.maxSize)
+}
diff --git a/apmproxy/batch_test.go b/apmproxy/batch_test.go
new file mode 100644
index 00000000..f71b7de9
--- /dev/null
+++ b/apmproxy/batch_test.go
@@ -0,0 +1,61 @@
+// Licensed to Elasticsearch B.V. under one or more contributor
+// license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright
+// ownership. Elasticsearch B.V. licenses this file to you under
+// the Apache License, Version 2.0 (the "License"); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.
See the License for the +// specific language governing permissions and limitations +// under the License. + +package apmproxy + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestAdd(t *testing.T) { + t.Run("empty", func(t *testing.T) { + b := NewBatch(1) + + assert.NoError(t, b.Add(APMData{})) + }) + t.Run("full", func(t *testing.T) { + b := NewBatch(1) + require.NoError(t, b.Add(APMData{})) + + assert.ErrorIs(t, ErrBatchFull, b.Add(APMData{})) + }) +} + +func TestReset(t *testing.T) { + b := NewBatch(1) + require.NoError(t, b.Add(APMData{})) + require.Equal(t, 1, b.Size()) + b.Reset() + + assert.Equal(t, 0, b.Size()) +} + +func TestShouldFlush(t *testing.T) { + b := NewBatch(10) + + // Should flush at 90% full + for i := 0; i < 9; i++ { + assert.False(t, b.ShouldFlush()) + require.NoError(t, b.Add(APMData{})) + } + + require.Equal(t, 9, b.Size()) + assert.True(t, b.ShouldFlush()) +} diff --git a/apmproxy/client.go b/apmproxy/client.go index 6899133f..1269b2c9 100644 --- a/apmproxy/client.go +++ b/apmproxy/client.go @@ -65,6 +65,9 @@ type Client struct { flushMutex sync.Mutex flushCh chan struct{} + + metadataMutex sync.RWMutex + metadata []byte } func NewClient(opts ...Option) (*Client, error) { diff --git a/apmproxy/metadata.go b/apmproxy/metadata.go index 2f10a2ae..6d63b292 100644 --- a/apmproxy/metadata.go +++ b/apmproxy/metadata.go @@ -25,10 +25,6 @@ import ( "io" ) -type MetadataContainer struct { - Metadata []byte -} - // ProcessMetadata return a byte array containing the Metadata marshaled in JSON // In case we want to update the Metadata values, usage of https://github.com/tidwall/sjson is advised func ProcessMetadata(data APMData) ([]byte, error) { diff --git a/apmproxy/receiver.go b/apmproxy/receiver.go index b385c9b1..74843ad8 100644 --- a/apmproxy/receiver.go +++ b/apmproxy/receiver.go @@ -39,7 +39,9 @@ const ( Lambda APMDataType = "lambda" ) -// APMData represents data to be sent to APMServer +// APMData represents data to be sent to APMServer. `Agent` type +// data will have `metadata` as ndjson whereas `lambda` type data +// will be without metadata. type APMData struct { Data []byte Type APMDataType diff --git a/app/run.go b/app/run.go index 8a9a1438..f4035660 100644 --- a/app/run.go +++ b/app/run.go @@ -23,7 +23,6 @@ import ( "sync" "time" - "github.com/elastic/apm-aws-lambda/apmproxy" "github.com/elastic/apm-aws-lambda/extension" ) @@ -82,9 +81,6 @@ func (app *App) Run(ctx context.Context) error { // The previous event id is used to validate the received Lambda metrics var prevEvent *extension.NextEventResponse - // This data structure contains metadata tied to the current Lambda instance. If empty, it is populated once for each - // active Lambda environment - metadataContainer := apmproxy.MetadataContainer{} for { select { @@ -96,7 +92,7 @@ func (app *App) Run(ctx context.Context) error { // Use a wait group to ensure the background go routine sending to the APM server // completes before signaling that the extension is ready for the next invocation. 
var backgroundDataSendWg sync.WaitGroup - event, err := app.processEvent(ctx, &backgroundDataSendWg, prevEvent, &metadataContainer) + event, err := app.processEvent(ctx, &backgroundDataSendWg, prevEvent) if err != nil { return err } @@ -120,7 +116,6 @@ func (app *App) processEvent( ctx context.Context, backgroundDataSendWg *sync.WaitGroup, prevEvent *extension.NextEventResponse, - metadataContainer *apmproxy.MetadataContainer, ) (*extension.NextEventResponse, error) { // Reset flush state for future events. defer app.apmClient.ResetFlush() @@ -159,7 +154,7 @@ func (app *App) processEvent( backgroundDataSendWg.Add(1) go func() { defer backgroundDataSendWg.Done() - if err := app.apmClient.ForwardApmData(invocationCtx, metadataContainer); err != nil { + if err := app.apmClient.ForwardApmData(invocationCtx); err != nil { app.logger.Error(err) } }() @@ -174,7 +169,6 @@ func (app *App) processEvent( event.RequestID, event.InvokedFunctionArn, app.apmClient, - metadataContainer, runtimeDone, prevEvent, ); err != nil { diff --git a/logsapi/event.go b/logsapi/event.go index 0cefa6be..0a23e316 100644 --- a/logsapi/event.go +++ b/logsapi/event.go @@ -60,7 +60,6 @@ func (lc *Client) ProcessLogs( requestID string, invokedFnArn string, apmClient *apmproxy.Client, - metadataContainer *apmproxy.MetadataContainer, runtimeDoneSignal chan struct{}, prevEvent *extension.NextEventResponse, ) error { @@ -89,7 +88,7 @@ func (lc *Client) ProcessLogs( case PlatformReport: if prevEvent != nil && logEvent.Record.RequestID == prevEvent.RequestID { lc.logger.Debug("Received platform report for the previous function invocation") - processedMetrics, err := ProcessPlatformReport(metadataContainer, prevEvent, logEvent) + processedMetrics, err := ProcessPlatformReport(prevEvent, logEvent) if err != nil { lc.logger.Errorf("Error processing Lambda platform metrics : %v", err) } else { @@ -100,11 +99,8 @@ func (lc *Client) ProcessLogs( lc.logger.Debug("Log API runtimeDone event request id didn't match") } case FunctionLog: - // TODO: @lahsivjar Buffer logs and send batches of data to APM-Server. - // Buffering should account for metadata being available before sending. lc.logger.Debug("Received function log") processedLog, err := ProcessFunctionLog( - metadataContainer, platformStartReqID, invokedFnArn, logEvent, diff --git a/logsapi/functionlogs.go b/logsapi/functionlogs.go index f5e2f594..d32219fd 100644 --- a/logsapi/functionlogs.go +++ b/logsapi/functionlogs.go @@ -18,8 +18,6 @@ package logsapi import ( - "errors" - "github.com/elastic/apm-aws-lambda/apmproxy" "go.elastic.co/apm/v2/model" "go.elastic.co/fastjson" @@ -88,15 +86,10 @@ func (lc logContainer) MarshalFastJSON(json *fastjson.Writer) error { // ProcessFunctionLog consumes agent metadata and log event from Lambda // logs API to create a payload for APM server. func ProcessFunctionLog( - metadataContainer *apmproxy.MetadataContainer, requestID string, invokedFnArn string, log LogEvent, ) (apmproxy.APMData, error) { - if metadataContainer == nil || len(metadataContainer.Metadata) == 0 { - return apmproxy.APMData{}, errors.New("metadata is required") - } - lc := logContainer{ Log: &logLine{ Timestamp: model.Time(log.Time), @@ -114,11 +107,8 @@ func ProcessFunctionLog( return apmproxy.APMData{}, err } - capacity := len(metadataContainer.Metadata) + jsonWriter.Size() + 1 - logData := make([]byte, len(metadataContainer.Metadata), capacity) - copy(logData, metadataContainer.Metadata) - - logData = append(logData, '\n') - logData = append(logData, jsonWriter.Bytes()...) 
- return apmproxy.APMData{Type: apmproxy.Lambda, Data: logData}, nil + return apmproxy.APMData{ + Type: apmproxy.Lambda, + Data: jsonWriter.Bytes(), + }, nil } diff --git a/logsapi/functionlogs_test.go b/logsapi/functionlogs_test.go index e8658a9e..7952bf04 100644 --- a/logsapi/functionlogs_test.go +++ b/logsapi/functionlogs_test.go @@ -28,9 +28,6 @@ import ( ) func TestProcessFunctionLog(t *testing.T) { - metadataContainer := &apmproxy.MetadataContainer{ - Metadata: []byte(`{"metadata":{"service":{"agent":{"name":"apm-lambda-extension","version":"1.1.0"},"framework":{"name":"AWS Lambda","version":""},"language":{"name":"python","version":"3.9.8"},"runtime":{"name":"","version":""},"node":{}},"user":{},"process":{"pid":0},"system":{"container":{"id":""},"kubernetes":{"node":{},"pod":{}}},"cloud":{"provider":"","instance":{},"machine":{},"account":{},"project":{},"service":{}}}}`), - } event := LogEvent{ Time: time.Date(2022, 11, 12, 0, 0, 0, 0, time.UTC), Type: FunctionLog, @@ -39,15 +36,14 @@ func TestProcessFunctionLog(t *testing.T) { reqID := "8476a536-e9f4-11e8-9739-2dfe598c3fcd" invokedFnArn := "arn:aws:lambda:us-east-2:123456789012:function:custom-runtime" expectedData := fmt.Sprintf( - "%s\n{\"log\":{\"message\":\"%s\",\"@timestamp\":%d,\"faas\":{\"id\":\"%s\",\"execution\":\"%s\"}}}", - metadataContainer.Metadata, + "{\"log\":{\"message\":\"%s\",\"@timestamp\":%d,\"faas\":{\"id\":\"%s\",\"execution\":\"%s\"}}}", event.StringRecord, event.Time.UnixNano()/int64(time.Microsecond), invokedFnArn, reqID, ) - apmData, err := ProcessFunctionLog(metadataContainer, reqID, invokedFnArn, event) + apmData, err := ProcessFunctionLog(reqID, invokedFnArn, event) require.NoError(t, err) assert.Equal(t, apmproxy.Lambda, apmData.Type) diff --git a/logsapi/metrics.go b/logsapi/metrics.go index 6aeac9d6..dc670047 100644 --- a/logsapi/metrics.go +++ b/logsapi/metrics.go @@ -18,7 +18,6 @@ package logsapi import ( - "errors" "math" "github.com/elastic/apm-aws-lambda/apmproxy" @@ -63,12 +62,7 @@ func (mc MetricsContainer) MarshalFastJSON(json *fastjson.Writer) error { return nil } -func ProcessPlatformReport(metadataContainer *apmproxy.MetadataContainer, functionData *extension.NextEventResponse, platformReport LogEvent) (apmproxy.APMData, error) { - - if metadataContainer == nil || len(metadataContainer.Metadata) == 0 { - return apmproxy.APMData{}, errors.New("metadata is not populated") - } - +func ProcessPlatformReport(functionData *extension.NextEventResponse, platformReport LogEvent) (apmproxy.APMData, error) { metricsContainer := MetricsContainer{ Metrics: &model.Metrics{}, } @@ -106,11 +100,8 @@ func ProcessPlatformReport(metadataContainer *apmproxy.MetadataContainer, functi return apmproxy.APMData{}, err } - capacity := len(metadataContainer.Metadata) + jsonWriter.Size() + 1 // 1 for newline - metricsData := make([]byte, len(metadataContainer.Metadata), capacity) - copy(metricsData, metadataContainer.Metadata) - - metricsData = append(metricsData, []byte("\n")...) - metricsData = append(metricsData, jsonWriter.Bytes()...) 
- return apmproxy.APMData{Type: apmproxy.Lambda, Data: metricsData}, nil + return apmproxy.APMData{ + Type: apmproxy.Lambda, + Data: jsonWriter.Bytes(), + }, nil } diff --git a/logsapi/metrics_test.go b/logsapi/metrics_test.go index 8de5ccfd..837276d7 100644 --- a/logsapi/metrics_test.go +++ b/logsapi/metrics_test.go @@ -20,7 +20,6 @@ package logsapi import ( "fmt" "log" - "strings" "testing" "time" @@ -32,10 +31,6 @@ import ( ) func TestProcessPlatformReport_Coldstart(t *testing.T) { - mc := apmproxy.MetadataContainer{ - Metadata: []byte(fmt.Sprintf(`{"metadata":{"service":{"agent":{"name":"apm-lambda-extension","version":"%s"},"framework":{"name":"AWS Lambda","version":""},"language":{"name":"python","version":"3.9.8"},"runtime":{"name":"","version":""},"node":{}},"user":{},"process":{"pid":0},"system":{"container":{"id":""},"kubernetes":{"node":{},"pod":{}}},"cloud":{"provider":"","instance":{},"machine":{},"account":{},"project":{},"service":{}}}}`, extension.Version)), - } - timestamp := time.Now() pm := PlatformMetrics{ @@ -71,11 +66,9 @@ func TestProcessPlatformReport_Coldstart(t *testing.T) { }, } - desiredOutputMetadata := fmt.Sprintf(`{"metadata":{"service":{"agent":{"name":"apm-lambda-extension","version":"%s"},"framework":{"name":"AWS Lambda","version":""},"language":{"name":"python","version":"3.9.8"},"runtime":{"name":"","version":""},"node":{}},"user":{},"process":{"pid":0},"system":{"container":{"id":""},"kubernetes":{"node":{},"pod":{}}},"cloud":{"provider":"","instance":{},"machine":{},"account":{},"project":{},"service":{}}}}`, extension.Version) - desiredOutputMetrics := fmt.Sprintf(`{"metricset":{"samples":{"faas.coldstart_duration":{"value":422.9700012207031},"faas.timeout":{"value":5000},"system.memory.total":{"value":1.34217728e+08},"system.memory.actual.free":{"value":5.4525952e+07},"faas.duration":{"value":182.42999267578125},"faas.billed_duration":{"value":183}},"timestamp":%d,"faas":{"coldstart":true,"execution":"6f7f0961f83442118a7af6fe80b88d56","id":"arn:aws:lambda:us-east-2:123456789012:function:custom-runtime"}}}`, timestamp.UnixNano()/1e3) - apmData, err := ProcessPlatformReport(&mc, &event, logEvent) + apmData, err := ProcessPlatformReport(&event, logEvent) require.NoError(t, err) assert.Equal(t, apmproxy.Lambda, apmData.Type) @@ -85,18 +78,10 @@ func TestProcessPlatformReport_Coldstart(t *testing.T) { out := string(requestBytes) log.Println(out) - processingResult := strings.Split(string(requestBytes), "\n") - - assert.JSONEq(t, desiredOutputMetadata, processingResult[0]) - assert.JSONEq(t, desiredOutputMetrics, processingResult[1]) + assert.JSONEq(t, desiredOutputMetrics, string(requestBytes)) } func TestProcessPlatformReport_NoColdstart(t *testing.T) { - - mc := apmproxy.MetadataContainer{ - Metadata: []byte(fmt.Sprintf(`{"metadata":{"service":{"agent":{"name":"apm-lambda-extension","version":"%s"},"framework":{"name":"AWS Lambda","version":""},"language":{"name":"python","version":"3.9.8"},"runtime":{"name":"","version":""},"node":{}},"user":{},"process":{"pid":0},"system":{"container":{"id":""},"kubernetes":{"node":{},"pod":{}}},"cloud":{"provider":"","instance":{},"machine":{},"account":{},"project":{},"service":{}}}}`, extension.Version)), - } - timestamp := time.Now() pm := PlatformMetrics{ @@ -132,75 +117,19 @@ func TestProcessPlatformReport_NoColdstart(t *testing.T) { }, } - desiredOutputMetadata := fmt.Sprintf(`{"metadata":{"service":{"agent":{"name":"apm-lambda-extension","version":"%s"},"framework":{"name":"AWS 
Lambda","version":""},"language":{"name":"python","version":"3.9.8"},"runtime":{"name":"","version":""},"node":{}},"user":{},"process":{"pid":0},"system":{"container":{"id":""},"kubernetes":{"node":{},"pod":{}}},"cloud":{"provider":"","instance":{},"machine":{},"account":{},"project":{},"service":{}}}}`, extension.Version) - desiredOutputMetrics := fmt.Sprintf(`{"metricset":{"samples":{"faas.coldstart_duration":{"value":0},"faas.timeout":{"value":5000},"system.memory.total":{"value":1.34217728e+08},"system.memory.actual.free":{"value":5.4525952e+07},"faas.duration":{"value":182.42999267578125},"faas.billed_duration":{"value":183}},"timestamp":%d,"faas":{"coldstart":false,"execution":"6f7f0961f83442118a7af6fe80b88d56","id":"arn:aws:lambda:us-east-2:123456789012:function:custom-runtime"}}}`, timestamp.UnixNano()/1e3) - apmData, err := ProcessPlatformReport(&mc, &event, logEvent) + apmData, err := ProcessPlatformReport(&event, logEvent) require.NoError(t, err) assert.Equal(t, apmproxy.Lambda, apmData.Type) requestBytes, err := apmproxy.GetUncompressedBytes(apmData.Data, "") require.NoError(t, err) - out := string(requestBytes) - log.Println(out) - - processingResult := strings.Split(string(requestBytes), "\n") - - assert.JSONEq(t, desiredOutputMetadata, processingResult[0]) - assert.JSONEq(t, desiredOutputMetrics, processingResult[1]) -} - -func TestProcessPlatformReport_DataCorruption(t *testing.T) { - // Allocate big capacity metadata array to prevent reallocation - raw := make([]byte, 0, 7000) - mc := &apmproxy.MetadataContainer{ - Metadata: append(raw, []byte(fmt.Sprintf(`{"metadata":{"service":{"agent":{"name":"apm-lambda-extension","version":"%s"},"framework":{"name":"AWS Lambda","version":""},"language":{"name":"python","version":"3.9.8"},"runtime":{"name":"","version":""},"node":{}},"user":{},"process":{"pid":0},"system":{"container":{"id":""},"kubernetes":{"node":{},"pod":{}}},"cloud":{"provider":"","instance":{},"machine":{},"account":{},"project":{},"service":{}}}}`, extension.Version))...), - } - reqID := "8476a536-e9f4-11e8-9739-2dfe598c3fcd" - invokedFnArn := "arn:aws:lambda:us-east-2:123456789012:function:custom-runtime" - timestamp := time.Date(2022, 11, 12, 0, 0, 0, 0, time.UTC) - logEvent := LogEvent{ - Time: timestamp, - Type: PlatformReport, - Record: LogEventRecord{ - RequestID: reqID, - Metrics: PlatformMetrics{ - DurationMs: 1.0, - BilledDurationMs: 1, - MemorySizeMB: 1, - MaxMemoryUsedMB: 1, - InitDurationMs: 1.0, - }, - }, - } - nextEventResp := &extension.NextEventResponse{ - Timestamp: timestamp, - EventType: extension.Invoke, - RequestID: reqID, - InvokedFunctionArn: invokedFnArn, - } - expected := "{\"metricset\":{\"samples\":{\"system.memory.total\":{\"value\":1.048576e+06},\"system.memory.actual.free\":{\"value\":0},\"faas.duration\":{\"value\":1},\"faas.billed_duration\":{\"value\":1},\"faas.coldstart_duration\":{\"value\":1},\"faas.timeout\":{\"value\":-1.6682112e+12}},\"timestamp\":1668211200000000,\"faas\":{\"coldstart\":true,\"execution\":\"8476a536-e9f4-11e8-9739-2dfe598c3fcd\",\"id\":\"arn:aws:lambda:us-east-2:123456789012:function:custom-runtime\"}}}" - - agentData, err := ProcessPlatformReport(mc, nextEventResp, logEvent) - require.NoError(t, err) - assert.Equal(t, apmproxy.Lambda, agentData.Type) - - // process another platform report to ensure the previous payload is not corrupted - logEvent.Record.RequestID = "corrupt-req-id" - nextEventResp.RequestID = "corrupt-req-id" - _, err = ProcessPlatformReport(mc, nextEventResp, logEvent) - 
require.NoError(t, err) - - data := strings.Split(string(agentData.Data), "\n") - assert.JSONEq(t, expected, data[1]) + assert.JSONEq(t, desiredOutputMetrics, string(requestBytes)) } func BenchmarkPlatformReport(b *testing.B) { - mc := &apmproxy.MetadataContainer{ - Metadata: []byte(`{"metadata":{"service":{"agent":{"name":"apm-lambda-extension","version":"1.1.0"},"framework":{"name":"AWS Lambda","version":""},"language":{"name":"python","version":"3.9.8"},"runtime":{"name":"","version":""},"node":{}},"user":{},"process":{"pid":0},"system":{"container":{"id":""},"kubernetes":{"node":{},"pod":{}}},"cloud":{"provider":"","instance":{},"machine":{},"account":{},"project":{},"service":{}}}}`), - } reqID := "8476a536-e9f4-11e8-9739-2dfe598c3fcd" invokedFnArn := "arn:aws:lambda:us-east-2:123456789012:function:custom-runtime" timestamp := time.Date(2022, 11, 12, 0, 0, 0, 0, time.UTC) @@ -226,7 +155,7 @@ func BenchmarkPlatformReport(b *testing.B) { } for n := 0; n < b.N; n++ { - _, err := ProcessPlatformReport(mc, nextEventResp, logEvent) + _, err := ProcessPlatformReport(nextEventResp, logEvent) require.NoError(b, err) } } From 3ff264be427b70994f496335bbe99663e5a6e48e Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Sun, 25 Sep 2022 15:43:34 +0800 Subject: [PATCH 03/15] Pushback on lambda logs API until metadata is available --- apmproxy/apmserver.go | 6 +++--- apmproxy/apmserver_test.go | 14 +++++++++++--- apmproxy/client.go | 5 +++-- apmproxy/option.go | 8 ++++++++ app/app.go | 3 +++ logsapi/client.go | 3 ++- logsapi/option.go | 8 ++++++++ logsapi/route_handlers.go | 16 +++++++++++++++- 8 files changed, 53 insertions(+), 10 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index 0d9dcdca..86ced85c 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -324,6 +324,8 @@ func (c *Client) forwardAPMDataByType(ctx context.Context, apmData APMData, batc return fmt.Errorf("failed to extract metadata from agent payload %w", err) } c.setMetadata(metadata) + // broadcast that metadata is available + close(c.metadataAvailable) } return c.PostToApmServer(ctx, apmData) case Lambda: @@ -340,10 +342,8 @@ func (c *Client) forwardAPMDataByType(ctx context.Context, apmData APMData, batc } func (c *Client) sendBatch(ctx context.Context, batch *BatchData) error { - // TODO: @lahsivjar keep buffering logs in DataChannel until metadata - // is available. 
if len(c.metadata) == 0 { - return errors.New("metadata not yet populated, dropping data") + return errors.New("unexpected state, metadata should always be populated") } if batch.Size() == 0 { return nil diff --git a/apmproxy/apmserver_test.go b/apmproxy/apmserver_test.go index 2e313161..3bcc4d04 100644 --- a/apmproxy/apmserver_test.go +++ b/apmproxy/apmserver_test.go @@ -599,7 +599,13 @@ func TestForwardApmData(t *testing.T) { t.Cleanup(apmServer.Close) metadata := `{"metadata":{"service":{"agent":{"name":"apm-lambda-extension","version":"1.1.0"},"framework":{"name":"AWS Lambda","version":""},"language":{"name":"python","version":"3.9.8"},"runtime":{"name":"","version":""},"node":{}},"user":{},"process":{"pid":0},"system":{"container":{"id":""},"kubernetes":{"node":{},"pod":{}}},"cloud":{"provider":"","instance":{},"machine":{},"account":{},"project":{},"service":{}}}}` assertGzipBody := func(expected string) { - buf := bytes.NewReader(<-receivedReqBodyChan) + var body []byte + select { + case body = <-receivedReqBodyChan: + case <-time.After(1 * time.Second): + require.Fail(t, "mock APM-Server timed out waiting for request") + } + buf := bytes.NewReader(body) r, err := gzip.NewReader(buf) require.NoError(t, err) out, err := io.ReadAll(r) @@ -608,13 +614,15 @@ func TestForwardApmData(t *testing.T) { } agentData := fmt.Sprintf("%s\n%s", metadata, `{"log": {"message": "test"}}`) lambdaData := `{"log": {"message": "test"}}` + metaAvailable := make(chan struct{}) apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(metaAvailable), + // Use unbuffered channel for ease of testing + apmproxy.WithAgentDataBufferSize(0), ) require.NoError(t, err) - // Override DataChannel to be unbuffered for ease of testing - apmClient.DataChannel = make(chan apmproxy.APMData) // Start forwarding APM data ctx, cancel := context.WithCancel(context.Background()) diff --git a/apmproxy/client.go b/apmproxy/client.go index 1269b2c9..3d2194d8 100644 --- a/apmproxy/client.go +++ b/apmproxy/client.go @@ -66,8 +66,9 @@ type Client struct { flushMutex sync.Mutex flushCh chan struct{} - metadataMutex sync.RWMutex - metadata []byte + metadataAvailable chan<- struct{} + metadataMutex sync.RWMutex + metadata []byte } func NewClient(opts ...Option) (*Client, error) { diff --git a/apmproxy/option.go b/apmproxy/option.go index 827ed218..8ec4ba75 100644 --- a/apmproxy/option.go +++ b/apmproxy/option.go @@ -83,3 +83,11 @@ func WithLogger(logger *zap.SugaredLogger) Option { c.logger = logger } } + +// WithMetadataAvailableIndicator configures a channel +// which will broadcast metadata available event on close +func WithMetadataAvailableIndicator(ch chan<- struct{}) Option { + return func(c *Client) { + c.metadataAvailable = ch + } +} diff --git a/app/app.go b/app/app.go index d2b9f808..e3667533 100644 --- a/app/app.go +++ b/app/app.go @@ -67,6 +67,7 @@ func New(ctx context.Context, opts ...ConfigOption) (*App, error) { return nil, err } + metadataAvailable := make(chan struct{}) app.extensionClient = extension.NewClient(c.awsLambdaRuntimeAPI, app.logger) if !c.disableLogsAPI { @@ -86,6 +87,7 @@ func New(ctx context.Context, opts ...ConfigOption) (*App, error) { logsapi.WithLogBuffer(100), logsapi.WithLogger(app.logger), logsapi.WithSubscriptionTypes(subscriptionLogStreams...), + logsapi.WithMetadataAvailableIndicator(metadataAvailable), ) if err != nil { return nil, err @@ -132,6 +134,7 @@ func New(ctx context.Context, 
opts ...ConfigOption) (*App, error) { apmproxy.WithLogger(app.logger), apmproxy.WithAPIKey(apmServerAPIKey), apmproxy.WithSecretToken(apmServerSecretToken), + apmproxy.WithMetadataAvailableIndicator(metadataAvailable), ) ac, err := apmproxy.NewClient(apmOpts...) diff --git a/logsapi/client.go b/logsapi/client.go index b7d18ac0..b0d7bcf6 100644 --- a/logsapi/client.go +++ b/logsapi/client.go @@ -52,6 +52,7 @@ type Client struct { logsAPIBaseURL string logsAPISubscriptionTypes []SubscriptionType logsChannel chan LogEvent + metadataAvailable <-chan struct{} listenerAddr string server *http.Server logger *zap.SugaredLogger @@ -69,7 +70,7 @@ func NewClient(opts ...ClientOption) (*Client, error) { } mux := http.NewServeMux() - mux.HandleFunc("/", handleLogEventsRequest(c.logger, c.logsChannel)) + mux.HandleFunc("/", handleLogEventsRequest(c.logger, c.logsChannel, c.metadataAvailable)) c.server.Handler = mux diff --git a/logsapi/option.go b/logsapi/option.go index e431477e..677cbf4d 100644 --- a/logsapi/option.go +++ b/logsapi/option.go @@ -55,3 +55,11 @@ func WithSubscriptionTypes(types ...SubscriptionType) ClientOption { c.logsAPISubscriptionTypes = types } } + +// WithMetadataAvailableIndicator configures a channel +// which will broadcast metadata available event on close +func WithMetadataAvailableIndicator(ch <-chan struct{}) ClientOption { + return func(c *Client) { + c.metadataAvailable = ch + } +} diff --git a/logsapi/route_handlers.go b/logsapi/route_handlers.go index 0981d497..7c10a956 100644 --- a/logsapi/route_handlers.go +++ b/logsapi/route_handlers.go @@ -25,8 +25,22 @@ import ( "go.uber.org/zap" ) -func handleLogEventsRequest(logger *zap.SugaredLogger, logsChannel chan LogEvent) func(w http.ResponseWriter, r *http.Request) { +func handleLogEventsRequest( + logger *zap.SugaredLogger, + logsChannel chan LogEvent, + metadataAvailable <-chan struct{}, +) func(w http.ResponseWriter, r *http.Request) { return func(w http.ResponseWriter, r *http.Request) { + select { + case <-metadataAvailable: + case <-time.After(100 * time.Millisecond): + // Without metadata the events cannot be processed + // thus we indicate lambda to keep the buffer + logger.Warnf("Metadata unavailable, signaling lambda to buffer log events") + w.WriteHeader(http.StatusInternalServerError) + return + } + var logEvents []LogEvent if err := json.NewDecoder(r.Body).Decode(&logEvents); err != nil { logger.Errorf("Error unmarshalling log events: %+v", err) From d5d624edc69effdf848ce5b7fb37c2e84c18a7fd Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Mon, 26 Sep 2022 17:39:34 +0800 Subject: [PATCH 04/15] Tweak default buffering settings --- logsapi/subscribe.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/logsapi/subscribe.go b/logsapi/subscribe.go index 9e96a096..c13cadcb 100644 --- a/logsapi/subscribe.go +++ b/logsapi/subscribe.go @@ -86,8 +86,8 @@ func (lc *Client) subscribe(types []SubscriptionType, extensionID string, uri st LogTypes: types, BufferingCfg: BufferingCfg{ MaxItems: 10000, - MaxBytes: 262144, - TimeoutMS: 25, + MaxBytes: 1024 * 1024, + TimeoutMS: 100, }, Destination: Destination{ Protocol: "HTTP", From 3d3c1b9b633159dbf062ea4051f2255d9989929c Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Tue, 27 Sep 2022 11:35:15 +0800 Subject: [PATCH 05/15] Refactor metadata handling and add age to batch --- apmproxy/apmserver.go | 72 ++++++++++------------------- apmproxy/apmserver_test.go | 25 ++++++++++ apmproxy/batch.go | 93 ++++++++++++++++++++++++++++---------- apmproxy/batch_test.go | 51 
+++++++++++++++------ apmproxy/client.go | 14 +++++- apmproxy/client_test.go | 14 ++++-- apmproxy/receiver_test.go | 9 +++- app/app.go | 10 ++-- app/config.go | 9 ++++ logsapi/client_test.go | 33 +++++++++++--- logsapi/event.go | 5 +- main.go | 1 + main_test.go | 4 ++ 13 files changed, 235 insertions(+), 105 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index 86ced85c..40bf5cd6 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -31,8 +31,6 @@ import ( "time" ) -const defaultBatchSize = 20 - type jsonResult struct { Errors []jsonError `json:"errors,omitempty"` } @@ -49,14 +47,13 @@ func (c *Client) ForwardApmData(ctx context.Context) error { if c.IsUnhealthy() { return nil } - batch := NewBatch(defaultBatchSize) for { select { case <-ctx.Done(): c.logger.Debug("Invocation context cancelled, not processing any more agent data") - return c.sendBatch(ctx, batch) + return c.sendBatch(ctx) case apmData := <-c.DataChannel: - if err := c.forwardAPMDataByType(ctx, apmData, batch); err != nil { + if err := c.forwardAPMDataByType(ctx, apmData); err != nil { return fmt.Errorf("error sending to APM server, skipping: %v", err) } } @@ -70,18 +67,17 @@ func (c *Client) FlushAPMData(ctx context.Context) { return } c.logger.Debug("Flush started - Checking for agent data") - batch := NewBatch(defaultBatchSize) for { select { case apmData := <-c.DataChannel: c.logger.Debug("Flush in progress - Processing agent data") - if err := c.forwardAPMDataByType(ctx, apmData, batch); err != nil { + if err := c.forwardAPMDataByType(ctx, apmData); err != nil { c.logger.Errorf("Error sending to APM server, skipping: %v", err) } default: c.logger.Debug("Flush ended - No agent data on buffer") // Flush any remaining data in batch - if err := c.sendBatch(ctx, batch); err != nil { + if err := c.sendBatch(ctx); err != nil { c.logger.Errorf("Error sending to APM server, skipping: %v", err) } return @@ -282,12 +278,12 @@ func (c *Client) ComputeGracePeriod() time.Duration { return time.Duration((gracePeriodWithoutJitter + jitter*gracePeriodWithoutJitter) * float64(time.Second)) } -// EnqueueAPMData adds a AgentData struct to the agent data channel, effectively queueing for a send +// EnqueueAPMData adds a apm data struct to the agent data channel, effectively queueing for a send // to the APM server. 
-func (c *Client) EnqueueAPMData(agentData APMData) { +func (c *Client) EnqueueAPMData(apmData APMData) { select { - case c.DataChannel <- agentData: - c.logger.Debug("Adding agent data to buffer to be sent to apm server") + case c.DataChannel <- apmData: + c.logger.Debug("Adding APM data to buffer to be sent to apm server") default: c.logger.Warn("Channel full: dropping a subset of agent data") } @@ -315,25 +311,30 @@ func (c *Client) WaitForFlush() <-chan struct{} { return c.flushCh } -func (c *Client) forwardAPMDataByType(ctx context.Context, apmData APMData, batch *BatchData) error { +func (c *Client) forwardAPMDataByType(ctx context.Context, apmData APMData) error { switch apmData.Type { case Agent: - if len(c.metadata) == 0 { + if c.batch == nil { metadata, err := ProcessMetadata(apmData) if err != nil { return fmt.Errorf("failed to extract metadata from agent payload %w", err) } - c.setMetadata(metadata) + c.batch = NewBatch(c.maxBatchSize, metadata) // broadcast that metadata is available close(c.metadataAvailable) } return c.PostToApmServer(ctx, apmData) case Lambda: - if err := batch.Add(apmData); err != nil { + if c.batch == nil { + // This state is not possible since we are pushing back on + // lambda logs API until metadata is available. + return errors.New("unexpected state, metadata not yet set") + } + if err := c.batch.Add(apmData); err != nil { c.logger.Warnf("Dropping data due to error: %v", err) } - if batch.ShouldFlush() { - return c.sendBatch(ctx, batch) + if c.batch.ShouldShip() { + return c.sendBatch(ctx) } return nil default: @@ -341,37 +342,10 @@ func (c *Client) forwardAPMDataByType(ctx context.Context, apmData APMData, batc } } -func (c *Client) sendBatch(ctx context.Context, batch *BatchData) error { - if len(c.metadata) == 0 { - return errors.New("unexpected state, metadata should always be populated") - } - if batch.Size() == 0 { +func (c *Client) sendBatch(ctx context.Context) error { + if c.batch == nil || c.batch.Count() == 0 { return nil } - defer batch.Reset() - return c.PostToApmServer(ctx, c.getWithMetadata(batch)) -} - -func (c *Client) setMetadata(metadata []byte) { - c.metadataMutex.Lock() - defer c.metadataMutex.Unlock() - c.metadata = metadata -} - -func (c *Client) getWithMetadata(batch *BatchData) APMData { - c.metadataMutex.RLock() - defer c.metadataMutex.RUnlock() - - capacity := len(c.metadata) + 1 - for _, d := range batch.agentData { - capacity += len(d.Data) + 1 - } - - withMeta := make([]byte, len(c.metadata), capacity) - copy(withMeta, c.metadata) - for _, d := range batch.agentData { - withMeta = append(withMeta, '\n') - withMeta = append(withMeta, d.Data...) 
- } - return APMData{Data: withMeta} + defer c.batch.Reset() + return c.PostToApmServer(ctx, c.batch.ToAPMData()) } diff --git a/apmproxy/apmserver_test.go b/apmproxy/apmserver_test.go index 3bcc4d04..604faaab 100644 --- a/apmproxy/apmserver_test.go +++ b/apmproxy/apmserver_test.go @@ -78,6 +78,7 @@ func TestPostToApmServerDataCompressed(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) require.NoError(t, apmClient.PostToApmServer(context.Background(), agentData)) @@ -124,6 +125,7 @@ func TestPostToApmServerDataNotCompressed(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) require.NoError(t, apmClient.PostToApmServer(context.Background(), agentData)) @@ -133,6 +135,7 @@ func TestGracePeriod(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -173,6 +176,7 @@ func TestSetHealthyTransport(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -186,6 +190,7 @@ func TestSetFailingTransport(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.ReconnectionCount = 0 @@ -198,6 +203,7 @@ func TestSetPendingTransport(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -213,6 +219,7 @@ func TestSetPendingTransportExplicitly(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -225,6 +232,7 @@ func TestSetInvalidTransport(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -269,6 +277,7 @@ func TestEnterBackoffFromHealthy(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -323,6 +332,7 @@ func TestEnterBackoffFromFailing(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + 
apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -377,6 +387,7 @@ func TestAPMServerRecovery(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -423,6 +434,7 @@ func TestAPMServerAuthFails(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -474,6 +486,7 @@ func TestAPMServerRatelimit(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) assert.Equal(t, apmClient.Status, apmproxy.Started) @@ -527,6 +540,7 @@ func TestAPMServerClientFail(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) assert.Equal(t, apmClient.Status, apmproxy.Started) @@ -577,6 +591,7 @@ func TestContinuedAPMServerFailure(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -639,6 +654,14 @@ func TestForwardApmData(t *testing.T) { Type: apmproxy.Agent, } assertGzipBody(agentData) + // After reading agent data with metadata the metadata indicator + // channel should be closed + select { + case _, ok := <-metaAvailable: + require.False(t, ok) + default: + require.Fail(t, "meta channel should be closed after reading agent data") + } // Send lambda logs API data var expected bytes.Buffer @@ -679,6 +702,7 @@ func BenchmarkFlushAPMData(b *testing.B) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(b).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(b, err) @@ -725,6 +749,7 @@ func BenchmarkPostToAPM(b *testing.B) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(b).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(b, err) diff --git a/apmproxy/batch.go b/apmproxy/batch.go index a56ad926..7d2400a8 100644 --- a/apmproxy/batch.go +++ b/apmproxy/batch.go @@ -17,54 +17,101 @@ package apmproxy -import "errors" +import ( + "bytes" + "errors" + "time" +) -// ErrBatchFull signfies that the batch has reached full -// capacity and cannot accept more entires. -var ErrBatchFull = errors.New("batch is full") +var ( + // ErrBatchFull signfies that the batch has reached full capacity + // and cannot accept more entires. 
+ ErrBatchFull = errors.New("batch is full") + // ErrInvalidType is returned for any APMData that is not Lambda type + ErrInvalidType = errors.New("only accepts lambda type data") + // ErrInvalidEncoding is returned for any APMData that is encoded + // with any encoding format + ErrInvalidEncoding = errors.New("encoded data not supported") +) + +var ( + maxSizeThreshold = 0.9 +) // BatchData represents a batch of data without metadata // that will be sent to APMServer. BatchData is not safe // concurrent access. type BatchData struct { - agentData []APMData - maxSize int + metadataBytes int + buf bytes.Buffer + count int + maxSize int + maxAgeInSeconds int + age int64 } // NewBatch creates a new BatchData which can accept a // maximum number of entires as specified by the argument -func NewBatch(maxSize int) *BatchData { - return &BatchData{ - maxSize: maxSize, - agentData: make([]APMData, 0, maxSize), +func NewBatch(maxSize int, maxAgeInSeconds int, metadata []byte) *BatchData { + b := &BatchData{ + maxSize: maxSize, + maxAgeInSeconds: maxAgeInSeconds, } + b.metadataBytes, _ = b.buf.Write(metadata) + return b } // Add adds a new entry to the batch. Returns ErrBatchFull // if batch has reached it's maximum size. func (b *BatchData) Add(d APMData) error { - if len(b.agentData) >= b.maxSize { + if b.count >= b.maxSize { return ErrBatchFull } - - b.agentData = append(b.agentData, d) + if d.Type != Lambda { + return ErrInvalidType + } + if d.ContentEncoding != "" { + return ErrInvalidEncoding + } + if err := b.buf.WriteByte('\n'); err != nil { + return err + } + if _, err := b.buf.Write(d.Data); err != nil { + return err + } + if b.count == 0 { + // For first entry, set the age of the batch + b.age = time.Now().Unix() + } + b.count++ return nil } -// Size return the number of entries in batch. -func (b *BatchData) Size() int { - return len(b.agentData) +// Count return the number of APMData entries in batch. +func (b *BatchData) Count() int { + return b.count } -// ShouldFlush indicates when a batch is ready for flush. -// A batch is marked as ready for flush once it reaches -// 90% of its max size. -func (b *BatchData) ShouldFlush() bool { - return len(b.agentData) >= int(float32(b.maxSize)*0.9) +// ShouldShip indicates when a batch is ready for sending. +// A batch is marked as ready for flush when one of the +// below conditions is reached: +// 1. max size is greater than threshold (90% of maxSize) +// 2. 
batch is older than maturity age +func (b *BatchData) ShouldShip() bool { + return b.count >= int(float64(b.maxSize)*maxSizeThreshold) || + (b.age > 0 && time.Now().Unix()-b.age > int64(b.maxAgeInSeconds)) } // Reset resets the batch to prepare for new set of data func (b *BatchData) Reset() { - b.agentData = nil - b.agentData = make([]APMData, 0, b.maxSize) + b.count, b.age = 0, 0 + b.buf.Truncate(b.metadataBytes) +} + +// ToAPMData returns APMData with metadata and the accumulated batch +func (b *BatchData) ToAPMData() APMData { + return APMData{ + Data: b.buf.Bytes(), + Type: Lambda, + } } diff --git a/apmproxy/batch_test.go b/apmproxy/batch_test.go index f71b7de9..b58f5787 100644 --- a/apmproxy/batch_test.go +++ b/apmproxy/batch_test.go @@ -19,43 +19,64 @@ package apmproxy import ( "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) +const metadata = `{"metadata":{"service":{"agent":{"name":"apm-lambda-extension","version":"1.1.0"},"framework":{"name":"AWS Lambda","version":""},"language":{"name":"python","version":"3.9.8"},"runtime":{"name":"","version":""},"node":{}},"user":{},"process":{"pid":0},"system":{"container":{"id":""},"kubernetes":{"node":{},"pod":{}}},"cloud":{"provider":"","instance":{},"machine":{},"account":{},"project":{},"service":{}}}}` + func TestAdd(t *testing.T) { t.Run("empty", func(t *testing.T) { - b := NewBatch(1) + b := NewBatch(1, 100, []byte(metadata)) - assert.NoError(t, b.Add(APMData{})) + assert.NoError(t, b.Add(APMData{Type: Lambda})) }) t.Run("full", func(t *testing.T) { - b := NewBatch(1) - require.NoError(t, b.Add(APMData{})) + b := NewBatch(1, 100, []byte(metadata)) + require.NoError(t, b.Add(APMData{Type: Lambda})) - assert.ErrorIs(t, ErrBatchFull, b.Add(APMData{})) + assert.ErrorIs(t, ErrBatchFull, b.Add(APMData{Type: Lambda})) }) } func TestReset(t *testing.T) { - b := NewBatch(1) - require.NoError(t, b.Add(APMData{})) - require.Equal(t, 1, b.Size()) + b := NewBatch(1, 100, []byte(metadata)) + require.NoError(t, b.Add(APMData{Type: Lambda})) + require.Equal(t, 1, b.Count()) b.Reset() - assert.Equal(t, 0, b.Size()) + assert.Equal(t, 0, b.Count()) + assert.Equal(t, 0, b.age) } -func TestShouldFlush(t *testing.T) { - b := NewBatch(10) +func TestShouldShip_ReasonSize(t *testing.T) { + b := NewBatch(10, 100, []byte(metadata)) // Should flush at 90% full for i := 0; i < 9; i++ { - assert.False(t, b.ShouldFlush()) - require.NoError(t, b.Add(APMData{})) + assert.False(t, b.ShouldShip()) + require.NoError(t, b.Add(APMData{Type: Lambda})) + } + + require.Equal(t, 9, b.Count()) + assert.True(t, b.ShouldShip()) +} + +func TestShouldShip_ReasonAge(t *testing.T) { + if testing.Short() { + t.Skip("skipping testing in short mode") } - require.Equal(t, 9, b.Size()) - assert.True(t, b.ShouldFlush()) + b := NewBatch(10, 10, []byte(metadata)) + + assert.False(t, b.ShouldShip()) + require.NoError(t, b.Add(APMData{Type: Lambda})) + + <-time.After(11 * time.Second) + + // Should flush after 10 seconds + require.Equal(t, 1, b.Count()) + assert.True(t, b.ShouldShip()) } diff --git a/apmproxy/client.go b/apmproxy/client.go index 3d2194d8..b5a44fcc 100644 --- a/apmproxy/client.go +++ b/apmproxy/client.go @@ -46,6 +46,8 @@ const ( defaultDataForwarderTimeout time.Duration = 3 * time.Second defaultReceiverAddr = ":8200" defaultAgentBufferSize int = 100 + defaultMaxBatchSize int = 50 + defaultMaxBatchAge int = 10 ) // Client is the client used to communicate with the apm server. 
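// The BatchData introduced above keeps everything in a single bytes.Buffer:
// the metadata document is written once at construction, every Add appends a
// newline followed by one intake event (the newline-delimited JSON framing the
// intake/v2 endpoint expects), and Reset truncates back to the metadata prefix
// so it is never copied again. A minimal sketch of the buffer lifecycle,
// assuming metadata and event each hold a single ND-JSON line:
//
//	var buf bytes.Buffer
//	n, _ := buf.Write(metadata) // metadata is always the first line
//	buf.WriteByte('\n')
//	buf.Write(event)            // one event per line
//	buf.Truncate(n)             // Reset: drop events, keep the metadata prefix
//
// ShouldShip then marks the batch ready once it holds 90% of maxSize entries
// or once the first entry is older than the configured maximum age.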
@@ -66,9 +68,11 @@ type Client struct { flushMutex sync.Mutex flushCh chan struct{} + batch *BatchData + maxBatchSize int + maxBatchAge int + metadataAvailable chan<- struct{} - metadataMutex sync.RWMutex - metadata []byte } func NewClient(opts ...Option) (*Client, error) { @@ -90,6 +94,8 @@ func NewClient(opts ...Option) (*Client, error) { }, sendStrategy: SyncFlush, flushCh: make(chan struct{}), + maxBatchSize: defaultMaxBatchSize, + maxBatchAge: defaultMaxBatchAge, } c.client.Timeout = defaultDataForwarderTimeout @@ -106,6 +112,10 @@ func NewClient(opts ...Option) (*Client, error) { return nil, errors.New("logger cannot be empty") } + if c.metadataAvailable == nil { + return nil, errors.New("metadata available indicator is required") + } + // normalize server URL if !strings.HasSuffix(c.serverURL, "/") { c.serverURL = c.serverURL + "/" diff --git a/apmproxy/client_test.go b/apmproxy/client_test.go index 8dd75d36..3acab6db 100644 --- a/apmproxy/client_test.go +++ b/apmproxy/client_test.go @@ -18,9 +18,10 @@ package apmproxy_test import ( - "github.com/elastic/apm-aws-lambda/apmproxy" "testing" + "github.com/elastic/apm-aws-lambda/apmproxy" + "github.com/stretchr/testify/require" "go.uber.org/zap/zaptest" ) @@ -34,15 +35,19 @@ func TestClient(t *testing.T) { expectedErr: true, }, "missing base url": { + opts: []apmproxy.Option{}, + expectedErr: true, + }, + "missing logger": { opts: []apmproxy.Option{ - apmproxy.WithURL(""), - apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithURL("https://example.com"), }, expectedErr: true, }, - "missing logger": { + "missing metadata indicator": { opts: []apmproxy.Option{ apmproxy.WithURL("https://example.com"), + apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), }, expectedErr: true, }, @@ -50,6 +55,7 @@ func TestClient(t *testing.T) { opts: []apmproxy.Option{ apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), }, }, } diff --git a/apmproxy/receiver_test.go b/apmproxy/receiver_test.go index a66de72c..2179a1cc 100644 --- a/apmproxy/receiver_test.go +++ b/apmproxy/receiver_test.go @@ -19,7 +19,6 @@ package apmproxy_test import ( "bytes" - "github.com/elastic/apm-aws-lambda/apmproxy" "io" "net" "net/http" @@ -28,6 +27,8 @@ import ( "testing" "time" + "github.com/elastic/apm-aws-lambda/apmproxy" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/zap/zaptest" @@ -57,6 +58,7 @@ func TestInfoProxy(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -102,6 +104,7 @@ func TestInfoProxyErrorStatusCode(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -140,6 +143,7 @@ func Test_handleInfoRequest(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -180,6 +184,7 @@ func Test_handleIntakeV2EventsQueryParam(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), 
apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) require.NoError(t, apmClient.StartReceiver()) @@ -222,6 +227,7 @@ func Test_handleIntakeV2EventsNoQueryParam(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) require.NoError(t, apmClient.StartReceiver()) @@ -265,6 +271,7 @@ func Test_handleIntakeV2EventsQueryParamEmptyData(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), + apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) require.NoError(t, apmClient.StartReceiver()) diff --git a/app/app.go b/app/app.go index e3667533..9d39bffb 100644 --- a/app/app.go +++ b/app/app.go @@ -19,6 +19,7 @@ package app import ( "context" + "errors" "fmt" "os" "strconv" @@ -52,6 +53,10 @@ func New(ctx context.Context, opts ...ConfigOption) (*App, error) { opt(&c) } + if c.metadataAvailable == nil { + return nil, errors.New("metadata available indicator is required") + } + app := &App{ extensionName: c.extensionName, } @@ -67,7 +72,6 @@ func New(ctx context.Context, opts ...ConfigOption) (*App, error) { return nil, err } - metadataAvailable := make(chan struct{}) app.extensionClient = extension.NewClient(c.awsLambdaRuntimeAPI, app.logger) if !c.disableLogsAPI { @@ -87,7 +91,7 @@ func New(ctx context.Context, opts ...ConfigOption) (*App, error) { logsapi.WithLogBuffer(100), logsapi.WithLogger(app.logger), logsapi.WithSubscriptionTypes(subscriptionLogStreams...), - logsapi.WithMetadataAvailableIndicator(metadataAvailable), + logsapi.WithMetadataAvailableIndicator(c.metadataAvailable), ) if err != nil { return nil, err @@ -134,7 +138,7 @@ func New(ctx context.Context, opts ...ConfigOption) (*App, error) { apmproxy.WithLogger(app.logger), apmproxy.WithAPIKey(apmServerAPIKey), apmproxy.WithSecretToken(apmServerSecretToken), - apmproxy.WithMetadataAvailableIndicator(metadataAvailable), + apmproxy.WithMetadataAvailableIndicator(c.metadataAvailable), ) ac, err := apmproxy.NewClient(apmOpts...) diff --git a/app/config.go b/app/config.go index 4311de5a..c34bcf61 100644 --- a/app/config.go +++ b/app/config.go @@ -27,6 +27,7 @@ type appConfig struct { enableFunctionLogSubscription bool logLevel string logsapiAddr string + metadataAvailable chan struct{} } // ConfigOption is used to configure the lambda extension @@ -85,3 +86,11 @@ func WithAWSConfig(awsConfig aws.Config) ConfigOption { c.awsConfig = awsConfig } } + +// WithMetadataAvailableIndicator configures a channel which should +// be closed when metadata is available after parsing agent data. 
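// The channel is created once and handed to both clients built in app.New:
// the apmproxy client closes it after extracting metadata from the first
// agent payload, and the logs API handler waits on it before accepting log
// events. A hedged wiring sketch, mirroring what main.go does later in this
// patch (lambdaApp is an illustrative name):
//
//	metadataAvailable := make(chan struct{})
//	lambdaApp, err := app.New(ctx,
//		app.WithMetadataAvailableIndicator(metadataAvailable),
//		// ...remaining options as in main.go...
//	)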
+func WithMetadataAvailableIndicator(ch chan struct{}) ConfigOption { + return func(c *appConfig) { + c.metadataAvailable = ch + } +} diff --git a/logsapi/client_test.go b/logsapi/client_test.go index b29d8cb8..175f16f4 100644 --- a/logsapi/client_test.go +++ b/logsapi/client_test.go @@ -123,13 +123,32 @@ func TestSubscribeAWSRequest(t *testing.T) { addr := "localhost:8080" testCases := map[string]struct { - opts []logsapi.ClientOption + getOpts func() []logsapi.ClientOption + expectedRespCode int }{ - "valid response": { - opts: []logsapi.ClientOption{ - logsapi.WithListenerAddress(addr), - logsapi.WithLogger(zaptest.NewLogger(t).Sugar()), + "before_metadata_available": { + getOpts: func() []logsapi.ClientOption { + metaAvailable := make(chan struct{}) + return []logsapi.ClientOption{ + logsapi.WithListenerAddress(addr), + logsapi.WithLogger(zaptest.NewLogger(t).Sugar()), + logsapi.WithMetadataAvailableIndicator(metaAvailable), + } + }, + expectedRespCode: http.StatusInternalServerError, + }, + "after_metadata_available": { + getOpts: func() []logsapi.ClientOption { + metaAvailable := make(chan struct{}) + // Close the channel to signal metadata is available + defer close(metaAvailable) + return []logsapi.ClientOption{ + logsapi.WithListenerAddress(addr), + logsapi.WithLogger(zaptest.NewLogger(t).Sugar()), + logsapi.WithMetadataAvailableIndicator(metaAvailable), + } }, + expectedRespCode: http.StatusOK, }, } for name, tc := range testCases { @@ -144,7 +163,7 @@ func TestSubscribeAWSRequest(t *testing.T) { defer s.Close() cOpts := append( - tc.opts, + tc.getOpts(), logsapi.WithLogsAPIBaseURL(s.URL), logsapi.WithLogBuffer(1), logsapi.WithSubscriptionTypes(logsapi.Platform, logsapi.Function), @@ -169,7 +188,7 @@ func TestSubscribeAWSRequest(t *testing.T) { // Send the request to the logs listener rsp, err := http.DefaultClient.Do(req) require.NoError(t, err) - require.Equal(t, http.StatusOK, rsp.StatusCode) + require.Equal(t, tc.expectedRespCode, rsp.StatusCode) require.NoError(t, rsp.Body.Close()) require.NoError(t, c.Shutdown()) }) diff --git a/logsapi/event.go b/logsapi/event.go index 0a23e316..c961982e 100644 --- a/logsapi/event.go +++ b/logsapi/event.go @@ -33,6 +33,7 @@ const ( PlatformRuntimeDone LogEventType = "platform.runtimeDone" PlatformFault LogEventType = "platform.fault" PlatformReport LogEventType = "platform.report" + PlatformLogsDropped LogEventType = "platform.logsDropped" PlatformStart LogEventType = "platform.start" PlatformEnd LogEventType = "platform.end" FunctionLog LogEventType = "function" @@ -90,7 +91,7 @@ func (lc *Client) ProcessLogs( lc.logger.Debug("Received platform report for the previous function invocation") processedMetrics, err := ProcessPlatformReport(prevEvent, logEvent) if err != nil { - lc.logger.Errorf("Error processing Lambda platform metrics : %v", err) + lc.logger.Errorf("Error processing Lambda platform metrics: %v", err) } else { apmClient.EnqueueAPMData(processedMetrics) } @@ -98,6 +99,8 @@ func (lc *Client) ProcessLogs( lc.logger.Warn("report event request id didn't match the previous event id") lc.logger.Debug("Log API runtimeDone event request id didn't match") } + case PlatformLogsDropped: + lc.logger.Warn("Logs dropped due to extension falling behind: %v", logEvent.Record) case FunctionLog: lc.logger.Debug("Received function log") processedLog, err := ProcessFunctionLog( diff --git a/main.go b/main.go index 6e43626e..0a53c2c3 100644 --- a/main.go +++ b/main.go @@ -45,6 +45,7 @@ func main() { 
app.WithLambdaRuntimeAPI(os.Getenv("AWS_LAMBDA_RUNTIME_API")), app.WithLogLevel(os.Getenv("ELASTIC_APM_LOG_LEVEL")), app.WithAWSConfig(cfg), + app.WithMetadataAvailableIndicator(make(chan struct{})), } captureLogs, err := strconv.ParseBool(os.Getenv("ELASTIC_APM_LAMBDA_CAPTURE_LOGS")) diff --git a/main_test.go b/main_test.go index 72609e82..e31579c3 100644 --- a/main_test.go +++ b/main_test.go @@ -513,6 +513,8 @@ func TestAPMServerHangs(t *testing.T) { newMockLambdaServer(t, logsapiAddr, eventsChannel, l) eventsChain := []MockEvent{ + // The first response sets the metadata so that the lambda logs handler is accepting requests + {Type: InvokeStandard, APMServerBehavior: TimelyResponse, ExecutionDuration: 1, Timeout: 5}, {Type: InvokeStandard, APMServerBehavior: Hangs, ExecutionDuration: 1, Timeout: 500}, } eventQueueGenerator(eventsChain, eventsChannel) @@ -772,12 +774,14 @@ func TestMetricsWithMetadata(t *testing.T) { func runApp(t *testing.T, logsapiAddr string) <-chan struct{} { ctx, cancel := context.WithCancel(context.Background()) + metadataIndicator := make(chan struct{}) app, err := app.New(ctx, app.WithExtensionName("apm-lambda-extension"), app.WithLambdaRuntimeAPI(os.Getenv("AWS_LAMBDA_RUNTIME_API")), app.WithLogLevel("debug"), app.WithLogsapiAddress(logsapiAddr), + app.WithMetadataAvailableIndicator(metadataIndicator), ) require.NoError(t, err) From e42a3df3c60b85519101d094ceaa3b543fbfe315 Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Thu, 29 Sep 2022 10:57:42 +0800 Subject: [PATCH 06/15] Fix batch creation and main_test logic --- apmproxy/apmserver.go | 2 +- apmproxy/batch.go | 6 +++--- apmproxy/batch_test.go | 12 ++++-------- main_test.go | 2 +- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index 40bf5cd6..dbbbb794 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -319,7 +319,7 @@ func (c *Client) forwardAPMDataByType(ctx context.Context, apmData APMData) erro if err != nil { return fmt.Errorf("failed to extract metadata from agent payload %w", err) } - c.batch = NewBatch(c.maxBatchSize, metadata) + c.batch = NewBatch(c.maxBatchSize, c.maxBatchAge, metadata) // broadcast that metadata is available close(c.metadataAvailable) } diff --git a/apmproxy/batch.go b/apmproxy/batch.go index 7d2400a8..37860340 100644 --- a/apmproxy/batch.go +++ b/apmproxy/batch.go @@ -25,7 +25,7 @@ import ( var ( // ErrBatchFull signfies that the batch has reached full capacity - // and cannot accept more entires. + // and cannot accept more entries. ErrBatchFull = errors.New("batch is full") // ErrInvalidType is returned for any APMData that is not Lambda type ErrInvalidType = errors.New("only accepts lambda type data") @@ -62,9 +62,9 @@ func NewBatch(maxSize int, maxAgeInSeconds int, metadata []byte) *BatchData { } // Add adds a new entry to the batch. Returns ErrBatchFull -// if batch has reached it's maximum size. +// if batch has reached its maximum size. 
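// The batch's sentinel errors are meant to be matched with errors.Is, as the
// tests do via assert.ErrorIs. A hedged caller-side sketch (shipAndReset is a
// hypothetical helper, not part of this change):
//
//	if err := b.Add(d); errors.Is(err, ErrBatchFull) {
//		shipAndReset(b) // send the current batch, then retry the Add
//	}
//
// Non-Lambda payloads and already-encoded payloads are rejected the same way,
// with ErrInvalidType and ErrInvalidEncoding respectively.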
func (b *BatchData) Add(d APMData) error { - if b.count >= b.maxSize { + if b.count == b.maxSize { return ErrBatchFull } if d.Type != Lambda { diff --git a/apmproxy/batch_test.go b/apmproxy/batch_test.go index b58f5787..b8e6d7c3 100644 --- a/apmproxy/batch_test.go +++ b/apmproxy/batch_test.go @@ -48,7 +48,7 @@ func TestReset(t *testing.T) { b.Reset() assert.Equal(t, 0, b.Count()) - assert.Equal(t, 0, b.age) + assert.Equal(t, int64(0), b.age) } func TestShouldShip_ReasonSize(t *testing.T) { @@ -65,18 +65,14 @@ func TestShouldShip_ReasonSize(t *testing.T) { } func TestShouldShip_ReasonAge(t *testing.T) { - if testing.Short() { - t.Skip("skipping testing in short mode") - } - - b := NewBatch(10, 10, []byte(metadata)) + b := NewBatch(10, 2, []byte(metadata)) assert.False(t, b.ShouldShip()) require.NoError(t, b.Add(APMData{Type: Lambda})) - <-time.After(11 * time.Second) + <-time.After(3 * time.Second) - // Should flush after 10 seconds + // Should be ready for send after 3 seconds require.Equal(t, 1, b.Count()) assert.True(t, b.ShouldShip()) } diff --git a/main_test.go b/main_test.go index e31579c3..385441d6 100644 --- a/main_test.go +++ b/main_test.go @@ -694,7 +694,7 @@ func TestInfoRequestHangs(t *testing.T) { lambdaServerInternals := newMockLambdaServer(t, logsapiAddr, eventsChannel, l) eventsChain := []MockEvent{ - {Type: InvokeStandardInfo, APMServerBehavior: Hangs, ExecutionDuration: 1, Timeout: 500}, + {Type: InvokeStandardInfo, APMServerBehavior: Hangs, ExecutionDuration: 1, Timeout: 5}, } eventQueueGenerator(eventsChain, eventsChannel) select { From 632d3953ab63074a31e6d34748c658cf44c68ee9 Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Thu, 29 Sep 2022 15:21:18 +0800 Subject: [PATCH 07/15] Address review comments --- apmproxy/apmserver.go | 3 ++- apmproxy/apmserver_test.go | 13 +++++++++---- apmproxy/batch.go | 27 ++++++++++++++------------- apmproxy/batch_test.go | 14 +++++++------- apmproxy/client.go | 4 ++-- apmproxy/option.go | 24 ++++++++++++++++++++++++ logsapi/event.go | 2 +- 7 files changed, 59 insertions(+), 28 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index dbbbb794..d32ba97f 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -51,7 +51,8 @@ func (c *Client) ForwardApmData(ctx context.Context) error { select { case <-ctx.Done(): c.logger.Debug("Invocation context cancelled, not processing any more agent data") - return c.sendBatch(ctx) + // Any remaining data in the batch will be handled by FlushAPMData + return nil case apmData := <-c.DataChannel: if err := c.forwardAPMDataByType(ctx, apmData); err != nil { return fmt.Errorf("error sending to APM server, skipping: %v", err) diff --git a/apmproxy/apmserver_test.go b/apmproxy/apmserver_test.go index 604faaab..f040afa3 100644 --- a/apmproxy/apmserver_test.go +++ b/apmproxy/apmserver_test.go @@ -630,12 +630,14 @@ func TestForwardApmData(t *testing.T) { agentData := fmt.Sprintf("%s\n%s", metadata, `{"log": {"message": "test"}}`) lambdaData := `{"log": {"message": "test"}}` metaAvailable := make(chan struct{}) + maxBatchAge := 500 * time.Millisecond apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), apmproxy.WithMetadataAvailableIndicator(metaAvailable), - // Use unbuffered channel for ease of testing - apmproxy.WithAgentDataBufferSize(0), + apmproxy.WithAgentDataBufferSize(10), + // Configure a small batch age for ease of testing + apmproxy.WithMaxBatchAge(maxBatchAge), ) require.NoError(t, err) @@ -668,6 +670,10 
@@ func TestForwardApmData(t *testing.T) { expected.WriteString(metadata) // Send multiple lambda logs to batch data for i := 0; i < 5; i++ { + if i == 4 { + // Wait for batch age to make sure the batch is mature to be sent + <-time.After(maxBatchAge + time.Millisecond) + } apmClient.DataChannel <- apmproxy.APMData{ Data: []byte(lambdaData), Type: apmproxy.Lambda, @@ -676,10 +682,9 @@ func TestForwardApmData(t *testing.T) { expected.WriteString(lambdaData) } - // Trigger a batch send by cancelling context - cancel() assertGzipBody(expected.String()) // Wait for ForwardApmData to exit + cancel() wg.Wait() } diff --git a/apmproxy/batch.go b/apmproxy/batch.go index 37860340..ba64340f 100644 --- a/apmproxy/batch.go +++ b/apmproxy/batch.go @@ -36,26 +36,27 @@ var ( var ( maxSizeThreshold = 0.9 + zeroTime = time.Time{} ) // BatchData represents a batch of data without metadata // that will be sent to APMServer. BatchData is not safe // concurrent access. type BatchData struct { - metadataBytes int - buf bytes.Buffer - count int - maxSize int - maxAgeInSeconds int - age int64 + metadataBytes int + buf bytes.Buffer + count int + age time.Time + maxSize int + maxAge time.Duration } // NewBatch creates a new BatchData which can accept a // maximum number of entires as specified by the argument -func NewBatch(maxSize int, maxAgeInSeconds int, metadata []byte) *BatchData { +func NewBatch(maxSize int, maxAge time.Duration, metadata []byte) *BatchData { b := &BatchData{ - maxSize: maxSize, - maxAgeInSeconds: maxAgeInSeconds, + maxSize: maxSize, + maxAge: maxAge, } b.metadataBytes, _ = b.buf.Write(metadata) return b @@ -81,7 +82,7 @@ func (b *BatchData) Add(d APMData) error { } if b.count == 0 { // For first entry, set the age of the batch - b.age = time.Now().Unix() + b.age = time.Now() } b.count++ return nil @@ -98,13 +99,13 @@ func (b *BatchData) Count() int { // 1. max size is greater than threshold (90% of maxSize) // 2. 
batch is older than maturity age func (b *BatchData) ShouldShip() bool { - return b.count >= int(float64(b.maxSize)*maxSizeThreshold) || - (b.age > 0 && time.Now().Unix()-b.age > int64(b.maxAgeInSeconds)) + return (b.count >= int(float64(b.maxSize)*maxSizeThreshold)) || + (!b.age.IsZero() && time.Since(b.age) > b.maxAge) } // Reset resets the batch to prepare for new set of data func (b *BatchData) Reset() { - b.count, b.age = 0, 0 + b.count, b.age = 0, zeroTime b.buf.Truncate(b.metadataBytes) } diff --git a/apmproxy/batch_test.go b/apmproxy/batch_test.go index b8e6d7c3..22a5de08 100644 --- a/apmproxy/batch_test.go +++ b/apmproxy/batch_test.go @@ -29,12 +29,12 @@ const metadata = `{"metadata":{"service":{"agent":{"name":"apm-lambda-extension" func TestAdd(t *testing.T) { t.Run("empty", func(t *testing.T) { - b := NewBatch(1, 100, []byte(metadata)) + b := NewBatch(1, time.Hour, []byte(metadata)) assert.NoError(t, b.Add(APMData{Type: Lambda})) }) t.Run("full", func(t *testing.T) { - b := NewBatch(1, 100, []byte(metadata)) + b := NewBatch(1, time.Hour, []byte(metadata)) require.NoError(t, b.Add(APMData{Type: Lambda})) assert.ErrorIs(t, ErrBatchFull, b.Add(APMData{Type: Lambda})) @@ -42,17 +42,17 @@ func TestAdd(t *testing.T) { } func TestReset(t *testing.T) { - b := NewBatch(1, 100, []byte(metadata)) + b := NewBatch(1, time.Hour, []byte(metadata)) require.NoError(t, b.Add(APMData{Type: Lambda})) require.Equal(t, 1, b.Count()) b.Reset() assert.Equal(t, 0, b.Count()) - assert.Equal(t, int64(0), b.age) + assert.True(t, b.age.IsZero()) } func TestShouldShip_ReasonSize(t *testing.T) { - b := NewBatch(10, 100, []byte(metadata)) + b := NewBatch(10, time.Hour, []byte(metadata)) // Should flush at 90% full for i := 0; i < 9; i++ { @@ -65,12 +65,12 @@ func TestShouldShip_ReasonSize(t *testing.T) { } func TestShouldShip_ReasonAge(t *testing.T) { - b := NewBatch(10, 2, []byte(metadata)) + b := NewBatch(10, time.Second, []byte(metadata)) assert.False(t, b.ShouldShip()) require.NoError(t, b.Add(APMData{Type: Lambda})) - <-time.After(3 * time.Second) + <-time.After(2 * time.Second) // Should be ready for send after 3 seconds require.Equal(t, 1, b.Count()) diff --git a/apmproxy/client.go b/apmproxy/client.go index b5a44fcc..ab5a93a5 100644 --- a/apmproxy/client.go +++ b/apmproxy/client.go @@ -47,7 +47,7 @@ const ( defaultReceiverAddr = ":8200" defaultAgentBufferSize int = 100 defaultMaxBatchSize int = 50 - defaultMaxBatchAge int = 10 + defaultMaxBatchAge time.Duration = 10 * time.Second ) // Client is the client used to communicate with the apm server. @@ -70,7 +70,7 @@ type Client struct { batch *BatchData maxBatchSize int - maxBatchAge int + maxBatchAge time.Duration metadataAvailable chan<- struct{} } diff --git a/apmproxy/option.go b/apmproxy/option.go index 8ec4ba75..48cde77a 100644 --- a/apmproxy/option.go +++ b/apmproxy/option.go @@ -78,6 +78,8 @@ func WithAgentDataBufferSize(size int) Option { } } +// WithLogger configures a custom zap logger to be used by +// the client. func WithLogger(logger *zap.SugaredLogger) Option { return func(c *Client) { c.logger = logger @@ -91,3 +93,25 @@ func WithMetadataAvailableIndicator(ch chan<- struct{}) Option { c.metadataAvailable = ch } } + +// WithMaxBatchSize configures the maximum batch size for +// the payload sent to the APMServer +func WithMaxBatchSize(size int) Option { + return func(c *Client) { + c.maxBatchSize = size + } +} + +// WithMaxBatchAge configures the maximum age of the batch +// before it is sent to APMServer. 
Age is measured from the +// time the first entry is added in the batch. +// +// It is possible for batch age to be greater than the +// configured max batch age when sending since a send is +// triggered by a new log event and log events can be delayed +// due to various factors. +func WithMaxBatchAge(age time.Duration) Option { + return func(c *Client) { + c.maxBatchAge = age + } +} diff --git a/logsapi/event.go b/logsapi/event.go index c961982e..fa6f62c8 100644 --- a/logsapi/event.go +++ b/logsapi/event.go @@ -100,7 +100,7 @@ func (lc *Client) ProcessLogs( lc.logger.Debug("Log API runtimeDone event request id didn't match") } case PlatformLogsDropped: - lc.logger.Warn("Logs dropped due to extension falling behind: %v", logEvent.Record) + lc.logger.Warnf("Logs dropped due to extension falling behind: %v", logEvent.Record) case FunctionLog: lc.logger.Debug("Received function log") processedLog, err := ProcessFunctionLog( From 889885446f0f40e7570379177e7e120e2dab7498 Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Tue, 18 Oct 2022 11:17:57 +0800 Subject: [PATCH 08/15] Refactor logsapi to be independent of metadata status --- apmproxy/apmserver.go | 72 ++++++++++++++++++++++++++++---------- apmproxy/apmserver_test.go | 10 +++--- apmproxy/client.go | 9 +++-- apmproxy/option.go | 4 +-- apmproxy/receiver.go | 6 +++- apmproxy/receiver_test.go | 2 +- app/app.go | 3 +- logsapi/client.go | 3 +- logsapi/client_test.go | 33 ++++------------- logsapi/event.go | 4 +-- logsapi/option.go | 8 ----- logsapi/route_handlers.go | 24 +++++-------- 12 files changed, 92 insertions(+), 86 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index d32ba97f..66656338 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -47,15 +47,33 @@ func (c *Client) ForwardApmData(ctx context.Context) error { if c.IsUnhealthy() { return nil } + + // Wait for metadata to be available, metadata will be available as soon as + // the first agent data is processed. 
+ select { + case <-ctx.Done(): + c.logger.Debug("Invocation context cancelled, not processing any more agent data") + return nil + case data := <-c.AgentDataChannel: + if err := c.forwardAPMDataByType(ctx, data); err != nil { + return err + } + case <-c.metadataAvailable: + } + + // Process lambda and agent data after metadata is available for { select { case <-ctx.Done(): c.logger.Debug("Invocation context cancelled, not processing any more agent data") - // Any remaining data in the batch will be handled by FlushAPMData return nil - case apmData := <-c.DataChannel: - if err := c.forwardAPMDataByType(ctx, apmData); err != nil { - return fmt.Errorf("error sending to APM server, skipping: %v", err) + case data := <-c.AgentDataChannel: + if err := c.forwardAPMDataByType(ctx, data); err != nil { + return err + } + case data := <-c.LambdaDataChannel: + if err := c.forwardAPMDataByType(ctx, data); err != nil { + return err } } } @@ -68,19 +86,48 @@ func (c *Client) FlushAPMData(ctx context.Context) { return } c.logger.Debug("Flush started - Checking for agent data") + + // Flush agent data first to make sure metadata is available if possible + func() { + for { + select { + case <-ctx.Done(): + c.logger.Debugf("Failed to flush completely, may result in data drop") + case data := <-c.AgentDataChannel: + if err := c.forwardAPMDataByType(ctx, data); err != nil { + c.logger.Errorf("Error sending to APM Server, skipping: %v", err) + } + default: + c.logger.Debug("Flush ended for agent data - no data in buffer") + return + } + } + }() + + // If metadata still not available then fail fast + select { + case <-c.metadataAvailable: + default: + c.logger.Warnf("Metadata not available at flush, skipping sending lambda data to APM Server") + return + } + + // Flush lambda data for { select { - case apmData := <-c.DataChannel: - c.logger.Debug("Flush in progress - Processing agent data") + case <-ctx.Done(): + c.logger.Debugf("Failed to flush completely, may result in data drop") + case apmData := <-c.LambdaDataChannel: + c.logger.Debug("Flush in progress - Processing lambda data") if err := c.forwardAPMDataByType(ctx, apmData); err != nil { c.logger.Errorf("Error sending to APM server, skipping: %v", err) } default: - c.logger.Debug("Flush ended - No agent data on buffer") // Flush any remaining data in batch if err := c.sendBatch(ctx); err != nil { c.logger.Errorf("Error sending to APM server, skipping: %v", err) } + c.logger.Debug("Flush ended for lambda data - no data in buffer") return } } @@ -279,17 +326,6 @@ func (c *Client) ComputeGracePeriod() time.Duration { return time.Duration((gracePeriodWithoutJitter + jitter*gracePeriodWithoutJitter) * float64(time.Second)) } -// EnqueueAPMData adds a apm data struct to the agent data channel, effectively queueing for a send -// to the APM server. -func (c *Client) EnqueueAPMData(apmData APMData) { - select { - case c.DataChannel <- apmData: - c.logger.Debug("Adding APM data to buffer to be sent to apm server") - default: - c.logger.Warn("Channel full: dropping a subset of agent data") - } -} - // ShouldFlush returns true if the client should flush APM data after processing the event. 
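// A note on FlushAPMData above: each loop drains its channel without blocking
// by pairing the receive with a default case, so a flush never waits for new
// data to arrive and simply stops once the buffer is empty. The underlying
// pattern, as a minimal sketch (drain and handle are illustrative names):
//
//	func drain(ch <-chan APMData, handle func(APMData)) {
//		for {
//			select {
//			case d := <-ch:
//				handle(d)
//			default:
//				return // nothing buffered, stop draining
//			}
//		}
//	}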
func (c *Client) ShouldFlush() bool { return c.sendStrategy == SyncFlush diff --git a/apmproxy/apmserver_test.go b/apmproxy/apmserver_test.go index f040afa3..ea149317 100644 --- a/apmproxy/apmserver_test.go +++ b/apmproxy/apmserver_test.go @@ -630,7 +630,7 @@ func TestForwardApmData(t *testing.T) { agentData := fmt.Sprintf("%s\n%s", metadata, `{"log": {"message": "test"}}`) lambdaData := `{"log": {"message": "test"}}` metaAvailable := make(chan struct{}) - maxBatchAge := 500 * time.Millisecond + maxBatchAge := 1 * time.Second apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), @@ -651,7 +651,7 @@ func TestForwardApmData(t *testing.T) { }() // Populate metadata by sending agent data - apmClient.DataChannel <- apmproxy.APMData{ + apmClient.AgentDataChannel <- apmproxy.APMData{ Data: []byte(agentData), Type: apmproxy.Agent, } @@ -674,7 +674,7 @@ func TestForwardApmData(t *testing.T) { // Wait for batch age to make sure the batch is mature to be sent <-time.After(maxBatchAge + time.Millisecond) } - apmClient.DataChannel <- apmproxy.APMData{ + apmClient.LambdaDataChannel <- apmproxy.APMData{ Data: []byte(lambdaData), Type: apmproxy.Lambda, } @@ -724,9 +724,9 @@ func BenchmarkFlushAPMData(b *testing.B) { b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { - apmClient.DataChannel <- agentAPMData + apmClient.AgentDataChannel <- agentAPMData for j := 0; j < 99; j++ { - apmClient.DataChannel <- apmproxy.APMData{ + apmClient.LambdaDataChannel <- apmproxy.APMData{ Data: []byte("this is test log"), Type: apmproxy.Lambda, } diff --git a/apmproxy/client.go b/apmproxy/client.go index ab5a93a5..4699441c 100644 --- a/apmproxy/client.go +++ b/apmproxy/client.go @@ -46,6 +46,7 @@ const ( defaultDataForwarderTimeout time.Duration = 3 * time.Second defaultReceiverAddr = ":8200" defaultAgentBufferSize int = 100 + defaultLambdaBufferSize int = 100 defaultMaxBatchSize int = 50 defaultMaxBatchAge time.Duration = 10 * time.Second ) @@ -54,7 +55,8 @@ const ( type Client struct { mu sync.RWMutex bufferPool sync.Pool - DataChannel chan APMData + AgentDataChannel chan APMData + LambdaDataChannel chan APMData client *http.Client Status Status ReconnectionCount int @@ -72,7 +74,7 @@ type Client struct { maxBatchSize int maxBatchAge time.Duration - metadataAvailable chan<- struct{} + metadataAvailable chan struct{} } func NewClient(opts ...Option) (*Client, error) { @@ -80,7 +82,8 @@ func NewClient(opts ...Option) (*Client, error) { bufferPool: sync.Pool{New: func() interface{} { return &bytes.Buffer{} }}, - DataChannel: make(chan APMData, defaultAgentBufferSize), + AgentDataChannel: make(chan APMData, defaultAgentBufferSize), + LambdaDataChannel: make(chan APMData, defaultLambdaBufferSize), client: &http.Client{ Transport: http.DefaultTransport.(*http.Transport).Clone(), }, diff --git a/apmproxy/option.go b/apmproxy/option.go index 48cde77a..34622fb1 100644 --- a/apmproxy/option.go +++ b/apmproxy/option.go @@ -74,7 +74,7 @@ func WithSendStrategy(strategy SendStrategy) Option { // WithAgentDataBufferSize sets the agent data buffer size. 
func WithAgentDataBufferSize(size int) Option { return func(c *Client) { - c.DataChannel = make(chan APMData, size) + c.AgentDataChannel = make(chan APMData, size) } } @@ -88,7 +88,7 @@ func WithLogger(logger *zap.SugaredLogger) Option { // WithMetadataAvailableIndicator configures a channel // which will broadcast metadata available event on close -func WithMetadataAvailableIndicator(ch chan<- struct{}) Option { +func WithMetadataAvailableIndicator(ch chan struct{}) Option { return func(c *Client) { c.metadataAvailable = ch } diff --git a/apmproxy/receiver.go b/apmproxy/receiver.go index 74843ad8..3361b8a4 100644 --- a/apmproxy/receiver.go +++ b/apmproxy/receiver.go @@ -144,7 +144,11 @@ func (c *Client) handleIntakeV2Events() func(w http.ResponseWriter, r *http.Requ } if len(agentData.Data) != 0 { - c.EnqueueAPMData(agentData) + select { + case c.AgentDataChannel <- agentData: + default: + c.logger.Warnf("Channel full: dropping a subset of agent data") + } } if agentFlushed { diff --git a/apmproxy/receiver_test.go b/apmproxy/receiver_test.go index 2179a1cc..e908eed6 100644 --- a/apmproxy/receiver_test.go +++ b/apmproxy/receiver_test.go @@ -249,7 +249,7 @@ func Test_handleIntakeV2EventsNoQueryParam(t *testing.T) { resp, err := client.Do(req) require.NoError(t, err) select { - case <-apmClient.DataChannel: + case <-apmClient.AgentDataChannel: case <-time.After(1 * time.Second): t.Log("Timed out waiting for server to send FuncDone signal") t.Fail() diff --git a/app/app.go b/app/app.go index 9d39bffb..3d0416d7 100644 --- a/app/app.go +++ b/app/app.go @@ -88,10 +88,9 @@ func New(ctx context.Context, opts ...ConfigOption) (*App, error) { lc, err := logsapi.NewClient( logsapi.WithLogsAPIBaseURL(fmt.Sprintf("http://%s", c.awsLambdaRuntimeAPI)), logsapi.WithListenerAddress(addr), - logsapi.WithLogBuffer(100), + logsapi.WithLogBuffer(0), logsapi.WithLogger(app.logger), logsapi.WithSubscriptionTypes(subscriptionLogStreams...), - logsapi.WithMetadataAvailableIndicator(c.metadataAvailable), ) if err != nil { return nil, err diff --git a/logsapi/client.go b/logsapi/client.go index b0d7bcf6..b7d18ac0 100644 --- a/logsapi/client.go +++ b/logsapi/client.go @@ -52,7 +52,6 @@ type Client struct { logsAPIBaseURL string logsAPISubscriptionTypes []SubscriptionType logsChannel chan LogEvent - metadataAvailable <-chan struct{} listenerAddr string server *http.Server logger *zap.SugaredLogger @@ -70,7 +69,7 @@ func NewClient(opts ...ClientOption) (*Client, error) { } mux := http.NewServeMux() - mux.HandleFunc("/", handleLogEventsRequest(c.logger, c.logsChannel, c.metadataAvailable)) + mux.HandleFunc("/", handleLogEventsRequest(c.logger, c.logsChannel)) c.server.Handler = mux diff --git a/logsapi/client_test.go b/logsapi/client_test.go index 175f16f4..b29d8cb8 100644 --- a/logsapi/client_test.go +++ b/logsapi/client_test.go @@ -123,32 +123,13 @@ func TestSubscribeAWSRequest(t *testing.T) { addr := "localhost:8080" testCases := map[string]struct { - getOpts func() []logsapi.ClientOption - expectedRespCode int + opts []logsapi.ClientOption }{ - "before_metadata_available": { - getOpts: func() []logsapi.ClientOption { - metaAvailable := make(chan struct{}) - return []logsapi.ClientOption{ - logsapi.WithListenerAddress(addr), - logsapi.WithLogger(zaptest.NewLogger(t).Sugar()), - logsapi.WithMetadataAvailableIndicator(metaAvailable), - } - }, - expectedRespCode: http.StatusInternalServerError, - }, - "after_metadata_available": { - getOpts: func() []logsapi.ClientOption { - metaAvailable := make(chan struct{}) - // 
Close the channel to signal metadata is available - defer close(metaAvailable) - return []logsapi.ClientOption{ - logsapi.WithListenerAddress(addr), - logsapi.WithLogger(zaptest.NewLogger(t).Sugar()), - logsapi.WithMetadataAvailableIndicator(metaAvailable), - } + "valid response": { + opts: []logsapi.ClientOption{ + logsapi.WithListenerAddress(addr), + logsapi.WithLogger(zaptest.NewLogger(t).Sugar()), }, - expectedRespCode: http.StatusOK, }, } for name, tc := range testCases { @@ -163,7 +144,7 @@ func TestSubscribeAWSRequest(t *testing.T) { defer s.Close() cOpts := append( - tc.getOpts(), + tc.opts, logsapi.WithLogsAPIBaseURL(s.URL), logsapi.WithLogBuffer(1), logsapi.WithSubscriptionTypes(logsapi.Platform, logsapi.Function), @@ -188,7 +169,7 @@ func TestSubscribeAWSRequest(t *testing.T) { // Send the request to the logs listener rsp, err := http.DefaultClient.Do(req) require.NoError(t, err) - require.Equal(t, tc.expectedRespCode, rsp.StatusCode) + require.Equal(t, http.StatusOK, rsp.StatusCode) require.NoError(t, rsp.Body.Close()) require.NoError(t, c.Shutdown()) }) diff --git a/logsapi/event.go b/logsapi/event.go index fa6f62c8..9c891512 100644 --- a/logsapi/event.go +++ b/logsapi/event.go @@ -93,7 +93,7 @@ func (lc *Client) ProcessLogs( if err != nil { lc.logger.Errorf("Error processing Lambda platform metrics: %v", err) } else { - apmClient.EnqueueAPMData(processedMetrics) + apmClient.LambdaDataChannel <- processedMetrics } } else { lc.logger.Warn("report event request id didn't match the previous event id") @@ -111,7 +111,7 @@ func (lc *Client) ProcessLogs( if err != nil { lc.logger.Errorf("Error processing function log : %v", err) } else { - apmClient.EnqueueAPMData(processedLog) + apmClient.LambdaDataChannel <- processedLog } } case <-ctx.Done(): diff --git a/logsapi/option.go b/logsapi/option.go index 677cbf4d..e431477e 100644 --- a/logsapi/option.go +++ b/logsapi/option.go @@ -55,11 +55,3 @@ func WithSubscriptionTypes(types ...SubscriptionType) ClientOption { c.logsAPISubscriptionTypes = types } } - -// WithMetadataAvailableIndicator configures a channel -// which will broadcast metadata available event on close -func WithMetadataAvailableIndicator(ch <-chan struct{}) ClientOption { - return func(c *Client) { - c.metadataAvailable = ch - } -} diff --git a/logsapi/route_handlers.go b/logsapi/route_handlers.go index 7c10a956..6dbca1b1 100644 --- a/logsapi/route_handlers.go +++ b/logsapi/route_handlers.go @@ -25,22 +25,8 @@ import ( "go.uber.org/zap" ) -func handleLogEventsRequest( - logger *zap.SugaredLogger, - logsChannel chan LogEvent, - metadataAvailable <-chan struct{}, -) func(w http.ResponseWriter, r *http.Request) { +func handleLogEventsRequest(logger *zap.SugaredLogger, logsChannel chan LogEvent) func(w http.ResponseWriter, r *http.Request) { return func(w http.ResponseWriter, r *http.Request) { - select { - case <-metadataAvailable: - case <-time.After(100 * time.Millisecond): - // Without metadata the events cannot be processed - // thus we indicate lambda to keep the buffer - logger.Warnf("Metadata unavailable, signaling lambda to buffer log events") - w.WriteHeader(http.StatusInternalServerError) - return - } - var logEvents []LogEvent if err := json.NewDecoder(r.Body).Decode(&logEvents); err != nil { logger.Errorf("Error unmarshalling log events: %+v", err) @@ -54,7 +40,13 @@ func handleLogEventsRequest( w.WriteHeader(http.StatusInternalServerError) continue } - logsChannel <- logEvents[idx] + select { + case logsChannel <- logEvents[idx]: + case <-r.Context().Done(): + 
logger.Warnf("Failed to enqueue event, signaling lambda to retry") + w.WriteHeader(http.StatusInternalServerError) + return + } } } } From 2d7eeb76523e654b3b3199bd14cf67a496e443c1 Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Tue, 18 Oct 2022 11:20:56 +0800 Subject: [PATCH 09/15] Address review comments --- apmproxy/apmserver.go | 10 +++++----- apmproxy/batch.go | 2 +- apmproxy/client.go | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index 66656338..275f10df 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -138,7 +138,7 @@ func (c *Client) FlushAPMData(ctx context.Context) { // The function compresses the APM agent data, if it's not already compressed. // It sets the APM transport status to failing upon errors, as part of the backoff // strategy. -func (c *Client) PostToApmServer(ctx context.Context, agentData APMData) error { +func (c *Client) PostToApmServer(ctx context.Context, apmData APMData) error { // todo: can this be a streaming or streaming style call that keeps the // connection open across invocations? if c.IsUnhealthy() { @@ -146,11 +146,11 @@ func (c *Client) PostToApmServer(ctx context.Context, agentData APMData) error { } endpointURI := "intake/v2/events" - encoding := agentData.ContentEncoding + encoding := apmData.ContentEncoding var r io.Reader - if agentData.ContentEncoding != "" { - r = bytes.NewReader(agentData.Data) + if apmData.ContentEncoding != "" { + r = bytes.NewReader(apmData.Data) } else { encoding = "gzip" buf := c.bufferPool.Get().(*bytes.Buffer) @@ -162,7 +162,7 @@ func (c *Client) PostToApmServer(ctx context.Context, agentData APMData) error { if err != nil { return err } - if _, err := gw.Write(agentData.Data); err != nil { + if _, err := gw.Write(apmData.Data); err != nil { return fmt.Errorf("failed to compress data: %w", err) } if err := gw.Close(); err != nil { diff --git a/apmproxy/batch.go b/apmproxy/batch.go index ba64340f..d39ba3b2 100644 --- a/apmproxy/batch.go +++ b/apmproxy/batch.go @@ -52,7 +52,7 @@ type BatchData struct { } // NewBatch creates a new BatchData which can accept a -// maximum number of entires as specified by the argument +// maximum number of entries as specified by the argument func NewBatch(maxSize int, maxAge time.Duration, metadata []byte) *BatchData { b := &BatchData{ maxSize: maxSize, diff --git a/apmproxy/client.go b/apmproxy/client.go index 4699441c..6f9d62e5 100644 --- a/apmproxy/client.go +++ b/apmproxy/client.go @@ -48,7 +48,7 @@ const ( defaultAgentBufferSize int = 100 defaultLambdaBufferSize int = 100 defaultMaxBatchSize int = 50 - defaultMaxBatchAge time.Duration = 10 * time.Second + defaultMaxBatchAge time.Duration = 2 * time.Second ) // Client is the client used to communicate with the apm server. 
From 62415bb44d834ee16049d0d3ad27be4e85beb74f Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Tue, 18 Oct 2022 18:34:12 +0800 Subject: [PATCH 10/15] Add buffering behavior for main_test --- app/app.go | 2 +- main_test.go | 113 +++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 83 insertions(+), 32 deletions(-) diff --git a/app/app.go b/app/app.go index 3d0416d7..3f928b35 100644 --- a/app/app.go +++ b/app/app.go @@ -88,7 +88,7 @@ func New(ctx context.Context, opts ...ConfigOption) (*App, error) { lc, err := logsapi.NewClient( logsapi.WithLogsAPIBaseURL(fmt.Sprintf("http://%s", c.awsLambdaRuntimeAPI)), logsapi.WithListenerAddress(addr), - logsapi.WithLogBuffer(0), + logsapi.WithLogBuffer(100), logsapi.WithLogger(app.logger), logsapi.WithSubscriptionTypes(subscriptionLogStreams...), ) diff --git a/main_test.go b/main_test.go index 385441d6..cf5ee376 100644 --- a/main_test.go +++ b/main_test.go @@ -156,6 +156,21 @@ func newMockApmServer(t *testing.T, l *zap.SugaredLogger) (*MockServerInternals, func newMockLambdaServer(t *testing.T, logsapiAddr string, eventsChannel chan MockEvent, l *zap.SugaredLogger) *MockServerInternals { var lambdaServerInternals MockServerInternals + // A big queue that can hold all the events required for a test + mockLogEventQ := make(chan logsapi.LogEvent, 100) + ctx, cancel := context.WithCancel(context.Background()) + + var wg sync.WaitGroup + wg.Add(1) + go func() { + defer wg.Done() + startLogSender(ctx, mockLogEventQ, logsapiAddr, l) + }() + t.Cleanup(func() { + cancel() + wg.Wait() + }) + lambdaServer := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { switch r.RequestURI { // Extension registration request @@ -171,19 +186,19 @@ func newMockLambdaServer(t *testing.T, logsapiAddr string, eventsChannel chan Mo } case "/2020-01-01/extension/event/next": lambdaServerInternals.WaitGroup.Wait() - currId := uuid.New().String() + currID := uuid.New().String() select { case nextEvent := <-eventsChannel: - sendNextEventInfo(w, currId, nextEvent, l) - go processMockEvent(currId, nextEvent, os.Getenv("ELASTIC_APM_DATA_RECEIVER_SERVER_PORT"), logsapiAddr, &lambdaServerInternals, l) + sendNextEventInfo(w, currID, nextEvent.Timeout, nextEvent.Type == Shutdown, l) + go processMockEvent(mockLogEventQ, currID, nextEvent, os.Getenv("ELASTIC_APM_DATA_RECEIVER_SERVER_PORT"), &lambdaServerInternals, l) default: finalShutDown := MockEvent{ Type: Shutdown, ExecutionDuration: 0, Timeout: 0, } - sendNextEventInfo(w, currId, finalShutDown, l) - go processMockEvent(currId, finalShutDown, os.Getenv("ELASTIC_APM_DATA_RECEIVER_SERVER_PORT"), logsapiAddr, &lambdaServerInternals, l) + sendNextEventInfo(w, currID, finalShutDown.Timeout, true, l) + go processMockEvent(mockLogEventQ, currID, finalShutDown, os.Getenv("ELASTIC_APM_DATA_RECEIVER_SERVER_PORT"), &lambdaServerInternals, l) } // Logs API subscription request case "/2020-08-15/logs": @@ -209,14 +224,12 @@ func newMockLambdaServer(t *testing.T, logsapiAddr string, eventsChannel chan Mo func newTestStructs(t *testing.T) chan MockEvent { http.DefaultServeMux = new(http.ServeMux) - _, cancel := context.WithCancel(context.Background()) - t.Cleanup(func() { cancel() }) eventsChannel := make(chan MockEvent, 100) return eventsChannel } -func processMockEvent(currId string, event MockEvent, extensionPort string, logsapiAddr string, internals *MockServerInternals, l *zap.SugaredLogger) { - sendLogEvent(logsapiAddr, currId, logsapi.PlatformStart, l) +func processMockEvent(q chan<- logsapi.LogEvent, 
currID string, event MockEvent, extensionPort string, internals *MockServerInternals, l *zap.SugaredLogger) { + queueLogEvent(q, currID, logsapi.PlatformStart, l) client := http.Client{} // Use a custom transport with a low timeout @@ -311,22 +324,23 @@ func processMockEvent(currId string, event MockEvent, extensionPort string, logs case Shutdown: } if sendRuntimeDone { - sendLogEvent(logsapiAddr, currId, logsapi.PlatformRuntimeDone, l) + l.Info("Queueuing runtime done event") + queueLogEvent(q, currID, logsapi.PlatformRuntimeDone, l) } if sendMetrics { - sendLogEvent(logsapiAddr, currId, logsapi.PlatformReport, l) + queueLogEvent(q, currID, logsapi.PlatformReport, l) } } -func sendNextEventInfo(w http.ResponseWriter, id string, event MockEvent, l *zap.SugaredLogger) { +func sendNextEventInfo(w http.ResponseWriter, id string, timeoutSec float64, shutdown bool, l *zap.SugaredLogger) { nextEventInfo := extension.NextEventResponse{ EventType: "INVOKE", - DeadlineMs: time.Now().UnixNano()/int64(time.Millisecond) + int64(event.Timeout*1000), + DeadlineMs: time.Now().UnixNano()/int64(time.Millisecond) + int64(timeoutSec*1000), RequestID: id, InvokedFunctionArn: "arn:aws:lambda:eu-central-1:627286350134:function:main_unit_test", Tracing: extension.Tracing{}, } - if event.Type == Shutdown { + if shutdown { nextEventInfo.EventType = "SHUTDOWN" } @@ -335,9 +349,9 @@ func sendNextEventInfo(w http.ResponseWriter, id string, event MockEvent, l *zap } } -func sendLogEvent(logsapiAddr string, requestId string, logEventType logsapi.LogEventType, l *zap.SugaredLogger) { +func queueLogEvent(q chan<- logsapi.LogEvent, requestID string, logEventType logsapi.LogEventType, l *zap.SugaredLogger) { record := logsapi.LogEventRecord{ - RequestID: requestId, + RequestID: requestID, } if logEventType == logsapi.PlatformReport { record.Metrics = logsapi.PlatformMetrics{ @@ -359,27 +373,64 @@ func sendLogEvent(logsapiAddr string, requestId string, logEventType logsapi.Log bufRecord := new(bytes.Buffer) if err := json.NewEncoder(bufRecord).Encode(record); err != nil { l.Errorf("Could not encode record : %v", err) - return } logEvent.StringRecord = bufRecord.String() + q <- logEvent +} - // Convert full log event to JSON - bufLogEvent := new(bytes.Buffer) - if err := json.NewEncoder(bufLogEvent).Encode([]logsapi.LogEvent{logEvent}); err != nil { - l.Errorf("Could not encode record : %v", err) - return +func startLogSender(ctx context.Context, q <-chan logsapi.LogEvent, logsapiAddr string, l *zap.SugaredLogger) { + client := http.Client{ + Timeout: 10 * time.Millisecond, } + doSend := func(events []logsapi.LogEvent) error { + var buf bytes.Buffer + if err := json.NewEncoder(&buf).Encode(events); err != nil { + return err + } - req, err := http.NewRequest(http.MethodPost, "http://"+logsapiAddr, bufLogEvent) - if err != nil { - l.Errorf("Could not create logs api request: %v", err) - return - } + req, err := http.NewRequest(http.MethodPost, fmt.Sprintf("http://%s", logsapiAddr), &buf) + if err != nil { + return err + } - client := http.Client{} - if _, err := client.Do(req); err != nil { - l.Errorf("Could not send log event : %v", err) - return + resp, err := client.Do(req) + if err != nil { + return err + } + if resp.StatusCode/100 != 2 { + return fmt.Errorf("received a non 2xx status code: %d", resp.StatusCode) + } + return nil + } + + var batch []logsapi.LogEvent + flushInterval := time.NewTicker(100 * time.Millisecond) + defer flushInterval.Stop() + for { + select { + case <-flushInterval.C: + var trySend bool + for 
!trySend { + // TODO: @lahsivjar mock dropping of logs, batch age and batch size + // TODO: @lahsivjar is it possible for one batch to have platform.runtimeDone + // event in middle of the batch? + select { + case e := <-q: + batch = append(batch, e) + default: + trySend = true + if len(batch) > 0 { + if err := doSend(batch); err != nil { + l.Warnf("mock lambda API failed to send logs to the extension: %v", err) + } else { + batch = batch[:0] + } + } + } + } + case <-ctx.Done(): + return + } } } From 7b8afdcc54ce93f316f82c200a934b2160231b87 Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Tue, 18 Oct 2022 18:56:54 +0800 Subject: [PATCH 11/15] Remove Type field from APMData --- apmproxy/apmserver.go | 61 +++++++++++++++++------------------- apmproxy/apmserver_test.go | 5 +-- apmproxy/batch.go | 6 ---- apmproxy/batch_test.go | 12 +++---- apmproxy/receiver.go | 12 ------- logsapi/functionlogs.go | 1 - logsapi/functionlogs_test.go | 2 -- logsapi/metrics.go | 1 - logsapi/metrics_test.go | 2 -- 9 files changed, 36 insertions(+), 66 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index 275f10df..0be3e8ad 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -55,7 +55,7 @@ func (c *Client) ForwardApmData(ctx context.Context) error { c.logger.Debug("Invocation context cancelled, not processing any more agent data") return nil case data := <-c.AgentDataChannel: - if err := c.forwardAPMDataByType(ctx, data); err != nil { + if err := c.forwardAgentData(ctx, data); err != nil { return err } case <-c.metadataAvailable: @@ -68,11 +68,11 @@ func (c *Client) ForwardApmData(ctx context.Context) error { c.logger.Debug("Invocation context cancelled, not processing any more agent data") return nil case data := <-c.AgentDataChannel: - if err := c.forwardAPMDataByType(ctx, data); err != nil { + if err := c.forwardAgentData(ctx, data); err != nil { return err } case data := <-c.LambdaDataChannel: - if err := c.forwardAPMDataByType(ctx, data); err != nil { + if err := c.forwardLambdaData(ctx, data); err != nil { return err } } @@ -94,7 +94,7 @@ func (c *Client) FlushAPMData(ctx context.Context) { case <-ctx.Done(): c.logger.Debugf("Failed to flush completely, may result in data drop") case data := <-c.AgentDataChannel: - if err := c.forwardAPMDataByType(ctx, data); err != nil { + if err := c.forwardAgentData(ctx, data); err != nil { c.logger.Errorf("Error sending to APM Server, skipping: %v", err) } default: @@ -119,7 +119,7 @@ func (c *Client) FlushAPMData(ctx context.Context) { c.logger.Debugf("Failed to flush completely, may result in data drop") case apmData := <-c.LambdaDataChannel: c.logger.Debug("Flush in progress - Processing lambda data") - if err := c.forwardAPMDataByType(ctx, apmData); err != nil { + if err := c.forwardLambdaData(ctx, apmData); err != nil { c.logger.Errorf("Error sending to APM server, skipping: %v", err) } default: @@ -348,35 +348,32 @@ func (c *Client) WaitForFlush() <-chan struct{} { return c.flushCh } -func (c *Client) forwardAPMDataByType(ctx context.Context, apmData APMData) error { - switch apmData.Type { - case Agent: - if c.batch == nil { - metadata, err := ProcessMetadata(apmData) - if err != nil { - return fmt.Errorf("failed to extract metadata from agent payload %w", err) - } - c.batch = NewBatch(c.maxBatchSize, c.maxBatchAge, metadata) - // broadcast that metadata is available - close(c.metadataAvailable) - } - return c.PostToApmServer(ctx, apmData) - case Lambda: - if c.batch == nil { - // This state is not possible since we are pushing 
back on - // lambda logs API until metadata is available. - return errors.New("unexpected state, metadata not yet set") - } - if err := c.batch.Add(apmData); err != nil { - c.logger.Warnf("Dropping data due to error: %v", err) - } - if c.batch.ShouldShip() { - return c.sendBatch(ctx) +func (c *Client) forwardAgentData(ctx context.Context, apmData APMData) error { + if c.batch == nil { + metadata, err := ProcessMetadata(apmData) + if err != nil { + return fmt.Errorf("failed to extract metadata from agent payload %w", err) } - return nil - default: - return errors.New("invalid apm data type") + c.batch = NewBatch(c.maxBatchSize, c.maxBatchAge, metadata) + // broadcast that metadata is available + close(c.metadataAvailable) } + return c.PostToApmServer(ctx, apmData) +} + +func (c *Client) forwardLambdaData(ctx context.Context, apmData APMData) error { + if c.batch == nil { + // This state is not possible since we are pushing back on + // lambda logs API until metadata is available. + return errors.New("unexpected state, metadata not yet set") + } + if err := c.batch.Add(apmData); err != nil { + c.logger.Warnf("Dropping data due to error: %v", err) + } + if c.batch.ShouldShip() { + return c.sendBatch(ctx) + } + return nil } func (c *Client) sendBatch(ctx context.Context) error { diff --git a/apmproxy/apmserver_test.go b/apmproxy/apmserver_test.go index ea149317..cfa21356 100644 --- a/apmproxy/apmserver_test.go +++ b/apmproxy/apmserver_test.go @@ -653,7 +653,6 @@ func TestForwardApmData(t *testing.T) { // Populate metadata by sending agent data apmClient.AgentDataChannel <- apmproxy.APMData{ Data: []byte(agentData), - Type: apmproxy.Agent, } assertGzipBody(agentData) // After reading agent data with metadata the metadata indicator @@ -676,7 +675,6 @@ func TestForwardApmData(t *testing.T) { } apmClient.LambdaDataChannel <- apmproxy.APMData{ Data: []byte(lambdaData), - Type: apmproxy.Lambda, } expected.WriteByte('\n') expected.WriteString(lambdaData) @@ -719,7 +717,7 @@ func BenchmarkFlushAPMData(b *testing.B) { {"transaction": { "id": "00xxxxFFaaaa1234", "trace_id": "0123456789abcdef0123456789abcdef", "name": "amqp receive", "parent_id": "abcdefabcdef01234567", "type": "messaging", "duration": 3, "span_count": { "started": 1 }, "context": {"message": {"queue": { "name": "new_users"}, "age":{ "ms": 1577958057123}, "headers": {"user_id": "1ax3", "involved_services": ["user", "auth"]}, "body": "user created", "routing_key": "user-created-transaction"}},"session":{"id":"sunday","sequence":123}}} {"transaction": { "name": "july-2021-delete-after-july-31", "type": "lambda", "result": "success", "id": "142e61450efb8574", "trace_id": "eb56529a1f461c5e7e2f66ecb075e983", "subtype": null, "action": null, "duration": 38.853, "timestamp": 1631736666365048, "sampled": true, "context": { "cloud": { "origin": { "account": { "id": "abc123" }, "provider": "aws", "region": "us-east-1", "service": { "name": "serviceName" } } }, "service": { "origin": { "id": "abc123", "name": "service-name", "version": "1.0" } }, "user": {}, "tags": {}, "custom": { } }, "sync": true, "span_count": { "started": 0 }, "outcome": "unknown", "faas": { "coldstart": false, "execution": "2e13b309-23e1-417f-8bf7-074fc96bc683", "trigger": { "request_id": "FuH2Cir_vHcEMUA=", "type": "http" } }, "sample_rate": 1 } } `) - agentAPMData := apmproxy.APMData{Data: agentData, Type: apmproxy.Agent} + agentAPMData := apmproxy.APMData{Data: agentData} b.ReportAllocs() b.ResetTimer() @@ -728,7 +726,6 @@ func BenchmarkFlushAPMData(b *testing.B) { for j 
:= 0; j < 99; j++ { apmClient.LambdaDataChannel <- apmproxy.APMData{ Data: []byte("this is test log"), - Type: apmproxy.Lambda, } } apmClient.FlushAPMData(context.Background()) diff --git a/apmproxy/batch.go b/apmproxy/batch.go index d39ba3b2..d139c259 100644 --- a/apmproxy/batch.go +++ b/apmproxy/batch.go @@ -27,8 +27,6 @@ var ( // ErrBatchFull signfies that the batch has reached full capacity // and cannot accept more entries. ErrBatchFull = errors.New("batch is full") - // ErrInvalidType is returned for any APMData that is not Lambda type - ErrInvalidType = errors.New("only accepts lambda type data") // ErrInvalidEncoding is returned for any APMData that is encoded // with any encoding format ErrInvalidEncoding = errors.New("encoded data not supported") @@ -68,9 +66,6 @@ func (b *BatchData) Add(d APMData) error { if b.count == b.maxSize { return ErrBatchFull } - if d.Type != Lambda { - return ErrInvalidType - } if d.ContentEncoding != "" { return ErrInvalidEncoding } @@ -113,6 +108,5 @@ func (b *BatchData) Reset() { func (b *BatchData) ToAPMData() APMData { return APMData{ Data: b.buf.Bytes(), - Type: Lambda, } } diff --git a/apmproxy/batch_test.go b/apmproxy/batch_test.go index 22a5de08..bf0182ee 100644 --- a/apmproxy/batch_test.go +++ b/apmproxy/batch_test.go @@ -31,19 +31,19 @@ func TestAdd(t *testing.T) { t.Run("empty", func(t *testing.T) { b := NewBatch(1, time.Hour, []byte(metadata)) - assert.NoError(t, b.Add(APMData{Type: Lambda})) + assert.NoError(t, b.Add(APMData{})) }) t.Run("full", func(t *testing.T) { b := NewBatch(1, time.Hour, []byte(metadata)) - require.NoError(t, b.Add(APMData{Type: Lambda})) + require.NoError(t, b.Add(APMData{})) - assert.ErrorIs(t, ErrBatchFull, b.Add(APMData{Type: Lambda})) + assert.ErrorIs(t, ErrBatchFull, b.Add(APMData{})) }) } func TestReset(t *testing.T) { b := NewBatch(1, time.Hour, []byte(metadata)) - require.NoError(t, b.Add(APMData{Type: Lambda})) + require.NoError(t, b.Add(APMData{})) require.Equal(t, 1, b.Count()) b.Reset() @@ -57,7 +57,7 @@ func TestShouldShip_ReasonSize(t *testing.T) { // Should flush at 90% full for i := 0; i < 9; i++ { assert.False(t, b.ShouldShip()) - require.NoError(t, b.Add(APMData{Type: Lambda})) + require.NoError(t, b.Add(APMData{})) } require.Equal(t, 9, b.Count()) @@ -68,7 +68,7 @@ func TestShouldShip_ReasonAge(t *testing.T) { b := NewBatch(10, time.Second, []byte(metadata)) assert.False(t, b.ShouldShip()) - require.NoError(t, b.Add(APMData{Type: Lambda})) + require.NoError(t, b.Add(APMData{})) <-time.After(2 * time.Second) diff --git a/apmproxy/receiver.go b/apmproxy/receiver.go index 3361b8a4..d4c40951 100644 --- a/apmproxy/receiver.go +++ b/apmproxy/receiver.go @@ -29,22 +29,11 @@ import ( "time" ) -// APMDataType represents source of APMData -type APMDataType string - -const ( - // Agent data type represents APMData collected from APM agents - Agent APMDataType = "agent" - // Lambda data type represents APMData collected from logs API - Lambda APMDataType = "lambda" -) - // APMData represents data to be sent to APMServer. `Agent` type // data will have `metadata` as ndjson whereas `lambda` type data // will be without metadata. 
type APMData struct { Data []byte - Type APMDataType ContentEncoding string } @@ -139,7 +128,6 @@ func (c *Client) handleIntakeV2Events() func(w http.ResponseWriter, r *http.Requ agentData := APMData{ Data: rawBytes, - Type: Agent, ContentEncoding: r.Header.Get("Content-Encoding"), } diff --git a/logsapi/functionlogs.go b/logsapi/functionlogs.go index d32219fd..19b67b72 100644 --- a/logsapi/functionlogs.go +++ b/logsapi/functionlogs.go @@ -108,7 +108,6 @@ func ProcessFunctionLog( } return apmproxy.APMData{ - Type: apmproxy.Lambda, Data: jsonWriter.Bytes(), }, nil } diff --git a/logsapi/functionlogs_test.go b/logsapi/functionlogs_test.go index 7952bf04..7ebf6dee 100644 --- a/logsapi/functionlogs_test.go +++ b/logsapi/functionlogs_test.go @@ -22,7 +22,6 @@ import ( "testing" "time" - "github.com/elastic/apm-aws-lambda/apmproxy" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -46,6 +45,5 @@ func TestProcessFunctionLog(t *testing.T) { apmData, err := ProcessFunctionLog(reqID, invokedFnArn, event) require.NoError(t, err) - assert.Equal(t, apmproxy.Lambda, apmData.Type) assert.Equal(t, expectedData, string(apmData.Data)) } diff --git a/logsapi/metrics.go b/logsapi/metrics.go index dc670047..c7ebf92a 100644 --- a/logsapi/metrics.go +++ b/logsapi/metrics.go @@ -101,7 +101,6 @@ func ProcessPlatformReport(functionData *extension.NextEventResponse, platformRe } return apmproxy.APMData{ - Type: apmproxy.Lambda, Data: jsonWriter.Bytes(), }, nil } diff --git a/logsapi/metrics_test.go b/logsapi/metrics_test.go index 837276d7..e1e53941 100644 --- a/logsapi/metrics_test.go +++ b/logsapi/metrics_test.go @@ -70,7 +70,6 @@ func TestProcessPlatformReport_Coldstart(t *testing.T) { apmData, err := ProcessPlatformReport(&event, logEvent) require.NoError(t, err) - assert.Equal(t, apmproxy.Lambda, apmData.Type) requestBytes, err := apmproxy.GetUncompressedBytes(apmData.Data, "") require.NoError(t, err) @@ -121,7 +120,6 @@ func TestProcessPlatformReport_NoColdstart(t *testing.T) { apmData, err := ProcessPlatformReport(&event, logEvent) require.NoError(t, err) - assert.Equal(t, apmproxy.Lambda, apmData.Type) requestBytes, err := apmproxy.GetUncompressedBytes(apmData.Data, "") require.NoError(t, err) From add41785a6792b1b439469d68bbee70643e2a10f Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Tue, 18 Oct 2022 20:48:50 +0800 Subject: [PATCH 12/15] Minor refactor --- apmproxy/apmserver.go | 4 ++-- main_test.go | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index 0be3e8ad..9075d55c 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -363,8 +363,8 @@ func (c *Client) forwardAgentData(ctx context.Context, apmData APMData) error { func (c *Client) forwardLambdaData(ctx context.Context, apmData APMData) error { if c.batch == nil { - // This state is not possible since we are pushing back on - // lambda logs API until metadata is available. + // This state is not possible since we start processing lambda + // logs only after metadata is available and batch is created. 
return errors.New("unexpected state, metadata not yet set") } if err := c.batch.Add(apmData); err != nil { diff --git a/main_test.go b/main_test.go index cf5ee376..95ae534f 100644 --- a/main_test.go +++ b/main_test.go @@ -324,7 +324,6 @@ func processMockEvent(q chan<- logsapi.LogEvent, currID string, event MockEvent, case Shutdown: } if sendRuntimeDone { - l.Info("Queueuing runtime done event") queueLogEvent(q, currID, logsapi.PlatformRuntimeDone, l) } if sendMetrics { From ff98be99cdbd4805a1237af5a39c56b4e0135f8f Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Wed, 19 Oct 2022 13:05:43 +0800 Subject: [PATCH 13/15] Remove metadata available channel and other minor issues --- apmproxy/apmserver.go | 45 ++++++++++---------------------------- apmproxy/apmserver_test.go | 29 +----------------------- apmproxy/batch_test.go | 4 ++-- apmproxy/client.go | 6 ----- apmproxy/client_test.go | 8 ------- apmproxy/option.go | 8 ------- apmproxy/receiver_test.go | 6 ----- app/app.go | 6 ----- app/config.go | 9 -------- logsapi/event.go | 10 +++++++-- main.go | 1 - main_test.go | 3 --- 12 files changed, 22 insertions(+), 113 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index 9075d55c..f21b9bdb 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -48,20 +48,7 @@ func (c *Client) ForwardApmData(ctx context.Context) error { return nil } - // Wait for metadata to be available, metadata will be available as soon as - // the first agent data is processed. - select { - case <-ctx.Done(): - c.logger.Debug("Invocation context cancelled, not processing any more agent data") - return nil - case data := <-c.AgentDataChannel: - if err := c.forwardAgentData(ctx, data); err != nil { - return err - } - case <-c.metadataAvailable: - } - - // Process lambda and agent data after metadata is available + var lambdaDataChan chan APMData for { select { case <-ctx.Done(): @@ -71,7 +58,10 @@ func (c *Client) ForwardApmData(ctx context.Context) error { if err := c.forwardAgentData(ctx, data); err != nil { return err } - case data := <-c.LambdaDataChannel: + // Wait for metadata to be available, metadata will be available as soon as + // the first agent data is processed. 
+ lambdaDataChan = c.LambdaDataChannel + case data := <-lambdaDataChan: if err := c.forwardLambdaData(ctx, data); err != nil { return err } @@ -88,26 +78,15 @@ func (c *Client) FlushAPMData(ctx context.Context) { c.logger.Debug("Flush started - Checking for agent data") // Flush agent data first to make sure metadata is available if possible - func() { - for { - select { - case <-ctx.Done(): - c.logger.Debugf("Failed to flush completely, may result in data drop") - case data := <-c.AgentDataChannel: - if err := c.forwardAgentData(ctx, data); err != nil { - c.logger.Errorf("Error sending to APM Server, skipping: %v", err) - } - default: - c.logger.Debug("Flush ended for agent data - no data in buffer") - return - } + for i := len(c.AgentDataChannel); i > 0; i-- { + data := <-c.AgentDataChannel + if err := c.forwardAgentData(ctx, data); err != nil { + c.logger.Errorf("Error sending to APM Server, skipping: %v", err) } - }() + } // If metadata still not available then fail fast - select { - case <-c.metadataAvailable: - default: + if c.batch == nil { c.logger.Warnf("Metadata not available at flush, skipping sending lambda data to APM Server") return } @@ -355,8 +334,6 @@ func (c *Client) forwardAgentData(ctx context.Context, apmData APMData) error { return fmt.Errorf("failed to extract metadata from agent payload %w", err) } c.batch = NewBatch(c.maxBatchSize, c.maxBatchAge, metadata) - // broadcast that metadata is available - close(c.metadataAvailable) } return c.PostToApmServer(ctx, apmData) } diff --git a/apmproxy/apmserver_test.go b/apmproxy/apmserver_test.go index cfa21356..5bba30c3 100644 --- a/apmproxy/apmserver_test.go +++ b/apmproxy/apmserver_test.go @@ -78,7 +78,6 @@ func TestPostToApmServerDataCompressed(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) require.NoError(t, apmClient.PostToApmServer(context.Background(), agentData)) @@ -125,7 +124,6 @@ func TestPostToApmServerDataNotCompressed(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) require.NoError(t, apmClient.PostToApmServer(context.Background(), agentData)) @@ -135,7 +133,6 @@ func TestGracePeriod(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -176,7 +173,6 @@ func TestSetHealthyTransport(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -190,7 +186,6 @@ func TestSetFailingTransport(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.ReconnectionCount = 0 @@ -203,7 +198,6 @@ func TestSetPendingTransport(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), 
- apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -219,7 +213,6 @@ func TestSetPendingTransportExplicitly(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -232,7 +225,6 @@ func TestSetInvalidTransport(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -277,7 +269,6 @@ func TestEnterBackoffFromHealthy(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -332,7 +323,6 @@ func TestEnterBackoffFromFailing(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -387,7 +377,6 @@ func TestAPMServerRecovery(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -434,7 +423,6 @@ func TestAPMServerAuthFails(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -486,7 +474,6 @@ func TestAPMServerRatelimit(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) assert.Equal(t, apmClient.Status, apmproxy.Started) @@ -540,7 +527,6 @@ func TestAPMServerClientFail(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) assert.Equal(t, apmClient.Status, apmproxy.Started) @@ -591,7 +577,6 @@ func TestContinuedAPMServerFailure(t *testing.T) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) apmClient.UpdateStatus(context.Background(), apmproxy.Healthy) @@ -629,12 +614,10 @@ func TestForwardApmData(t *testing.T) { } agentData := fmt.Sprintf("%s\n%s", metadata, `{"log": {"message": "test"}}`) lambdaData := `{"log": {"message": "test"}}` - metaAvailable := make(chan struct{}) maxBatchAge := 1 * time.Second apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(metaAvailable), 
apmproxy.WithAgentDataBufferSize(10), // Configure a small batch age for ease of testing apmproxy.WithMaxBatchAge(maxBatchAge), @@ -655,14 +638,6 @@ func TestForwardApmData(t *testing.T) { Data: []byte(agentData), } assertGzipBody(agentData) - // After reading agent data with metadata the metadata indicator - // channel should be closed - select { - case _, ok := <-metaAvailable: - require.False(t, ok) - default: - require.Fail(t, "meta channel should be closed after reading agent data") - } // Send lambda logs API data var expected bytes.Buffer @@ -671,7 +646,7 @@ func TestForwardApmData(t *testing.T) { for i := 0; i < 5; i++ { if i == 4 { // Wait for batch age to make sure the batch is mature to be sent - <-time.After(maxBatchAge + time.Millisecond) + time.Sleep(maxBatchAge + time.Millisecond) } apmClient.LambdaDataChannel <- apmproxy.APMData{ Data: []byte(lambdaData), @@ -705,7 +680,6 @@ func BenchmarkFlushAPMData(b *testing.B) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(b).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(b, err) @@ -751,7 +725,6 @@ func BenchmarkPostToAPM(b *testing.B) { apmClient, err := apmproxy.NewClient( apmproxy.WithURL(apmServer.URL), apmproxy.WithLogger(zaptest.NewLogger(b).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(b, err) diff --git a/apmproxy/batch_test.go b/apmproxy/batch_test.go index bf0182ee..2d93d122 100644 --- a/apmproxy/batch_test.go +++ b/apmproxy/batch_test.go @@ -70,9 +70,9 @@ func TestShouldShip_ReasonAge(t *testing.T) { assert.False(t, b.ShouldShip()) require.NoError(t, b.Add(APMData{})) - <-time.After(2 * time.Second) + time.Sleep(time.Second + time.Millisecond) - // Should be ready for send after 3 seconds + // Should be ready to send now require.Equal(t, 1, b.Count()) assert.True(t, b.ShouldShip()) } diff --git a/apmproxy/client.go b/apmproxy/client.go index 6f9d62e5..feccd734 100644 --- a/apmproxy/client.go +++ b/apmproxy/client.go @@ -73,8 +73,6 @@ type Client struct { batch *BatchData maxBatchSize int maxBatchAge time.Duration - - metadataAvailable chan struct{} } func NewClient(opts ...Option) (*Client, error) { @@ -115,10 +113,6 @@ func NewClient(opts ...Option) (*Client, error) { return nil, errors.New("logger cannot be empty") } - if c.metadataAvailable == nil { - return nil, errors.New("metadata available indicator is required") - } - // normalize server URL if !strings.HasSuffix(c.serverURL, "/") { c.serverURL = c.serverURL + "/" diff --git a/apmproxy/client_test.go b/apmproxy/client_test.go index 3acab6db..3f57c73b 100644 --- a/apmproxy/client_test.go +++ b/apmproxy/client_test.go @@ -44,18 +44,10 @@ func TestClient(t *testing.T) { }, expectedErr: true, }, - "missing metadata indicator": { - opts: []apmproxy.Option{ - apmproxy.WithURL("https://example.com"), - apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - }, - expectedErr: true, - }, "valid": { opts: []apmproxy.Option{ apmproxy.WithURL("https://example.com"), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), }, }, } diff --git a/apmproxy/option.go b/apmproxy/option.go index 34622fb1..f1862532 100644 --- a/apmproxy/option.go +++ b/apmproxy/option.go @@ -86,14 +86,6 @@ func WithLogger(logger *zap.SugaredLogger) Option { } } -// WithMetadataAvailableIndicator configures a channel -// which will broadcast metadata available event on close -func 
WithMetadataAvailableIndicator(ch chan struct{}) Option { - return func(c *Client) { - c.metadataAvailable = ch - } -} - // WithMaxBatchSize configures the maximum batch size for // the payload sent to the APMServer func WithMaxBatchSize(size int) Option { diff --git a/apmproxy/receiver_test.go b/apmproxy/receiver_test.go index e908eed6..155ae79a 100644 --- a/apmproxy/receiver_test.go +++ b/apmproxy/receiver_test.go @@ -58,7 +58,6 @@ func TestInfoProxy(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -104,7 +103,6 @@ func TestInfoProxyErrorStatusCode(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -143,7 +141,6 @@ func Test_handleInfoRequest(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) @@ -184,7 +181,6 @@ func Test_handleIntakeV2EventsQueryParam(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) require.NoError(t, apmClient.StartReceiver()) @@ -227,7 +223,6 @@ func Test_handleIntakeV2EventsNoQueryParam(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) require.NoError(t, apmClient.StartReceiver()) @@ -271,7 +266,6 @@ func Test_handleIntakeV2EventsQueryParamEmptyData(t *testing.T) { apmproxy.WithReceiverAddress(":1234"), apmproxy.WithReceiverTimeout(15*time.Second), apmproxy.WithLogger(zaptest.NewLogger(t).Sugar()), - apmproxy.WithMetadataAvailableIndicator(make(chan struct{})), ) require.NoError(t, err) require.NoError(t, apmClient.StartReceiver()) diff --git a/app/app.go b/app/app.go index 3f928b35..d2b9f808 100644 --- a/app/app.go +++ b/app/app.go @@ -19,7 +19,6 @@ package app import ( "context" - "errors" "fmt" "os" "strconv" @@ -53,10 +52,6 @@ func New(ctx context.Context, opts ...ConfigOption) (*App, error) { opt(&c) } - if c.metadataAvailable == nil { - return nil, errors.New("metadata available indicator is required") - } - app := &App{ extensionName: c.extensionName, } @@ -137,7 +132,6 @@ func New(ctx context.Context, opts ...ConfigOption) (*App, error) { apmproxy.WithLogger(app.logger), apmproxy.WithAPIKey(apmServerAPIKey), apmproxy.WithSecretToken(apmServerSecretToken), - apmproxy.WithMetadataAvailableIndicator(c.metadataAvailable), ) ac, err := apmproxy.NewClient(apmOpts...) 
diff --git a/app/config.go b/app/config.go index c34bcf61..4311de5a 100644 --- a/app/config.go +++ b/app/config.go @@ -27,7 +27,6 @@ type appConfig struct { enableFunctionLogSubscription bool logLevel string logsapiAddr string - metadataAvailable chan struct{} } // ConfigOption is used to configure the lambda extension @@ -86,11 +85,3 @@ func WithAWSConfig(awsConfig aws.Config) ConfigOption { c.awsConfig = awsConfig } } - -// WithMetadataAvailableIndicator configures a channel which should -// be closed when metadata is available after parsing agent data. -func WithMetadataAvailableIndicator(ch chan struct{}) ConfigOption { - return func(c *appConfig) { - c.metadataAvailable = ch - } -} diff --git a/logsapi/event.go b/logsapi/event.go index 9c891512..414bbff5 100644 --- a/logsapi/event.go +++ b/logsapi/event.go @@ -93,7 +93,10 @@ func (lc *Client) ProcessLogs( if err != nil { lc.logger.Errorf("Error processing Lambda platform metrics: %v", err) } else { - apmClient.LambdaDataChannel <- processedMetrics + select { + case apmClient.LambdaDataChannel <- processedMetrics: + case <-ctx.Done(): + } } } else { lc.logger.Warn("report event request id didn't match the previous event id") @@ -111,7 +114,10 @@ func (lc *Client) ProcessLogs( if err != nil { lc.logger.Errorf("Error processing function log : %v", err) } else { - apmClient.LambdaDataChannel <- processedLog + select { + case apmClient.LambdaDataChannel <- processedLog: + case <-ctx.Done(): + } } } case <-ctx.Done(): diff --git a/main.go b/main.go index 0a53c2c3..6e43626e 100644 --- a/main.go +++ b/main.go @@ -45,7 +45,6 @@ func main() { app.WithLambdaRuntimeAPI(os.Getenv("AWS_LAMBDA_RUNTIME_API")), app.WithLogLevel(os.Getenv("ELASTIC_APM_LOG_LEVEL")), app.WithAWSConfig(cfg), - app.WithMetadataAvailableIndicator(make(chan struct{})), } captureLogs, err := strconv.ParseBool(os.Getenv("ELASTIC_APM_LAMBDA_CAPTURE_LOGS")) diff --git a/main_test.go b/main_test.go index 95ae534f..774d759a 100644 --- a/main_test.go +++ b/main_test.go @@ -824,14 +824,11 @@ func TestMetricsWithMetadata(t *testing.T) { func runApp(t *testing.T, logsapiAddr string) <-chan struct{} { ctx, cancel := context.WithCancel(context.Background()) - metadataIndicator := make(chan struct{}) - app, err := app.New(ctx, app.WithExtensionName("apm-lambda-extension"), app.WithLambdaRuntimeAPI(os.Getenv("AWS_LAMBDA_RUNTIME_API")), app.WithLogLevel("debug"), app.WithLogsapiAddress(logsapiAddr), - app.WithMetadataAvailableIndicator(metadataIndicator), ) require.NoError(t, err) From 8892605073ea24ec18efc3a1d93ffab12c9fd9f1 Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Wed, 19 Oct 2022 15:04:38 +0800 Subject: [PATCH 14/15] Fix context done handling --- apmproxy/apmserver.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/apmproxy/apmserver.go b/apmproxy/apmserver.go index f21b9bdb..634e6a2f 100644 --- a/apmproxy/apmserver.go +++ b/apmproxy/apmserver.go @@ -94,13 +94,14 @@ func (c *Client) FlushAPMData(ctx context.Context) { // Flush lambda data for { select { - case <-ctx.Done(): - c.logger.Debugf("Failed to flush completely, may result in data drop") case apmData := <-c.LambdaDataChannel: c.logger.Debug("Flush in progress - Processing lambda data") if err := c.forwardLambdaData(ctx, apmData); err != nil { c.logger.Errorf("Error sending to APM server, skipping: %v", err) } + case <-ctx.Done(): + c.logger.Debugf("Failed to flush completely, may result in data drop") + return default: // Flush any remaining data in batch if err := c.sendBatch(ctx); err != nil { 
From fa41c113909e639b8d2bc9f7f9e23dba8689e44a Mon Sep 17 00:00:00 2001 From: Vishal Raj Date: Wed, 19 Oct 2022 15:17:52 +0800 Subject: [PATCH 15/15] Add changelog --- CHANGELOG.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.asciidoc b/CHANGELOG.asciidoc index 69e7ee54..41c18298 100644 --- a/CHANGELOG.asciidoc +++ b/CHANGELOG.asciidoc @@ -30,6 +30,7 @@ https://github.com/elastic/apm-aws-lambda/compare/v1.1.0...main[View commits] ===== Features - Disable CGO to prevent libc/ABI compatibility issues {lambda-pull}292[292] - Add support for collecting and shipping function logs to APM Server {lambda-pull}303[303] +- Batch data collected from lambda logs API before sending to APM Server {lambda-pull}314[314] [float] ===== Bug fixes
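
One detail worth calling out for readers of patch 13: the ForwardApmData change gates lambda log data on metadata by leaving the local channel variable nil until the first agent payload arrives, relying on the Go rule that a receive from a nil channel blocks forever and therefore never wins a select. A standalone sketch of that idiom (names and data here are illustrative only, not the extension's code):

package main

import "fmt"

func main() {
	agentCh := make(chan string, 1)
	lambdaCh := make(chan string, 1)
	lambdaCh <- "lambda log event"
	agentCh <- "agent payload carrying metadata"

	// gated starts nil, so its select case can never fire until it is assigned.
	var gated chan string
	for i := 0; i < 2; i++ {
		select {
		case msg := <-agentCh:
			fmt.Println("forwarding:", msg)
			// Metadata is now available; start draining lambda data too.
			gated = lambdaCh
		case msg := <-gated:
			fmt.Println("forwarding:", msg)
		}
	}
}
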