From a28cab6a8799c083e4ff5d144ccf819dc5f741e6 Mon Sep 17 00:00:00 2001 From: Daniel Franz Date: Wed, 16 Jul 2025 14:59:27 +0900 Subject: [PATCH] Metrics Summary Adds a util to the e2e suite which queries prometheus at the end of the test run for alerts and metrics data. This data is then processed into markdown which is displayed to the contributor at the end of their test runs. Extra: Tuned prometheus alerts to be less sensitive to memory growth. The tests will naturally cause an additional memory footprint at the beginning of the e2e, so we need to account for that somehow. Also tagged a couple of images we were implicitly using 'latest' versions of so nodes won't have to pull them on every test run. Signed-off-by: Daniel Franz --- Makefile | 10 +- .../overlays/prometheus/prometheus_rule.yaml | 4 +- go.mod | 2 +- go.sum | 4 + test/e2e/e2e_suite_test.go | 13 +- test/e2e/metrics_test.go | 4 +- test/utils/summary.go | 199 ++++++++++++++++++ test/utils/templates/alert.md.tmpl | 16 ++ test/utils/templates/mermaid_chart.md.tmpl | 17 ++ test/utils/templates/summary.md.tmpl | 22 ++ .../testoperator.clusterserviceversion.yaml | 2 +- 11 files changed, 275 insertions(+), 18 deletions(-) create mode 100644 test/utils/summary.go create mode 100644 test/utils/templates/alert.md.tmpl create mode 100644 test/utils/templates/mermaid_chart.md.tmpl create mode 100644 test/utils/templates/summary.md.tmpl diff --git a/Makefile b/Makefile index 67c876a59a..8596edee97 100644 --- a/Makefile +++ b/Makefile @@ -272,14 +272,14 @@ test-e2e: SOURCE_MANIFEST := $(STANDARD_E2E_MANIFEST) test-e2e: KIND_CLUSTER_NAME := operator-controller-e2e test-e2e: GO_BUILD_EXTRA_FLAGS := -cover test-e2e: COVERAGE_NAME := e2e -test-e2e: run image-registry prometheus e2e e2e-metrics e2e-coverage kind-clean #HELP Run e2e test suite on local kind cluster +test-e2e: run image-registry prometheus e2e e2e-coverage kind-clean #HELP Run e2e test suite on local kind cluster .PHONY: test-experimental-e2e test-experimental-e2e: SOURCE_MANIFEST := $(EXPERIMENTAL_E2E_MANIFEST) test-experimental-e2e: KIND_CLUSTER_NAME := operator-controller-e2e test-experimental-e2e: GO_BUILD_EXTRA_FLAGS := -cover test-experimental-e2e: COVERAGE_NAME := experimental-e2e -test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-metrics e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster +test-experimental-e2e: run image-registry prometheus experimental-e2e e2e e2e-coverage kind-clean #HELP Run experimental e2e test suite on local kind cluster .PHONY: prometheus prometheus: PROMETHEUS_NAMESPACE := olmv1-system @@ -287,12 +287,6 @@ prometheus: PROMETHEUS_VERSION := v0.83.0 prometheus: #EXHELP Deploy Prometheus into specified namespace ./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(KUSTOMIZE) $(VERSION) -# The output alerts.out file contains any alerts, pending or firing, collected during a test run in json format. -.PHONY: e2e-metrics -e2e-metrics: ALERTS_FILE_PATH := $(if $(ARTIFACT_PATH),$(ARTIFACT_PATH),.)/alerts.out -e2e-metrics: #EXHELP Request metrics from prometheus; place in ARTIFACT_PATH if set - curl -X GET http://localhost:30900/api/v1/alerts | jq 'if (.data.alerts | length) > 0 then .data.alerts.[] else empty end' > $(ALERTS_FILE_PATH) - .PHONY: extension-developer-e2e extension-developer-e2e: KIND_CLUSTER_NAME := operator-controller-ext-dev-e2e extension-developer-e2e: export INSTALL_DEFAULT_CATALOGS := false diff --git a/config/overlays/prometheus/prometheus_rule.yaml b/config/overlays/prometheus/prometheus_rule.yaml index 16e4bfd1ac..5bd7e120b8 100644 --- a/config/overlays/prometheus/prometheus_rule.yaml +++ b/config/overlays/prometheus/prometheus_rule.yaml @@ -22,13 +22,13 @@ spec: annotations: description: "container {{ $labels.container }} of pod {{ $labels.pod }} experienced OOM event(s); count={{ $value }}" - alert: operator-controller-memory-growth - expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 50_000 + expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 100_000 for: 5m keep_firing_for: 1d annotations: description: "operator-controller pod memory usage growing at a high rate for 5 minutes: {{ $value | humanize }}B/sec" - alert: catalogd-memory-growth - expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 50_000 + expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 100_000 for: 5m keep_firing_for: 1d annotations: diff --git a/go.mod b/go.mod index 7f2e8a9e39..02c7162300 100644 --- a/go.mod +++ b/go.mod @@ -23,6 +23,7 @@ require ( github.com/operator-framework/helm-operator-plugins v0.8.0 github.com/operator-framework/operator-registry v1.56.0 github.com/prometheus/client_golang v1.23.0 + github.com/prometheus/common v0.65.0 github.com/spf13/cobra v1.9.1 github.com/stretchr/testify v1.10.0 golang.org/x/exp v0.0.0-20250620022241-b7579e27df2b @@ -177,7 +178,6 @@ require ( github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/proglottis/gpgme v0.1.4 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/common v0.65.0 // indirect github.com/prometheus/procfs v0.16.1 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/rubenv/sql-migrate v1.8.0 // indirect diff --git a/go.sum b/go.sum index 897a5ba135..dedd5be964 100644 --- a/go.sum +++ b/go.sum @@ -279,6 +279,8 @@ github.com/joelanford/ignore v0.1.1 h1:vKky5RDoPT+WbONrbQBgOn95VV/UPh4ejlyAbbzgn github.com/joelanford/ignore v0.1.1/go.mod h1:8eho/D8fwQ3rIXrLwE23AaeaGDNXqLE9QJ3zJ4LIPCw= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= +github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= @@ -352,6 +354,8 @@ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/ github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY= diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go index 354ef75f42..dabfb48cac 100644 --- a/test/e2e/e2e_suite_test.go +++ b/test/e2e/e2e_suite_test.go @@ -15,6 +15,7 @@ import ( ocv1 "github.com/operator-framework/operator-controller/api/v1" "github.com/operator-framework/operator-controller/internal/operator-controller/scheme" + utils "github.com/operator-framework/operator-controller/test/utils" ) var ( @@ -23,9 +24,10 @@ var ( ) const ( - testCatalogRefEnvVar = "CATALOG_IMG" - testCatalogName = "test-catalog" - latestImageTag = "latest" + testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY" + testCatalogRefEnvVar = "CATALOG_IMG" + testCatalogName = "test-catalog" + latestImageTag = "latest" ) func TestMain(m *testing.M) { @@ -36,7 +38,10 @@ func TestMain(m *testing.M) { c, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) utilruntime.Must(err) - os.Exit(m.Run()) + res := m.Run() + err = utils.PrintSummary(testSummaryOutputEnvVar) + utilruntime.Must(err) + os.Exit(res) } // createTestCatalog will create a new catalog on the test cluster, provided diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 4a88c3dca7..85908f4d56 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -129,7 +129,7 @@ func (c *MetricsTestConfig) getServiceAccountToken(t *testing.T) string { func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) { t.Logf("Creating curl pod (%s/%s) to validate the metrics endpoint", c.namespace, c.curlPodName) cmd := exec.Command(c.client, "run", c.curlPodName, - "--image=curlimages/curl", + "--image=curlimages/curl:8.15.0", "--namespace", c.namespace, "--restart=Never", "--overrides", `{ @@ -137,7 +137,7 @@ func (c *MetricsTestConfig) createCurlMetricsPod(t *testing.T) { "terminationGradePeriodSeconds": 0, "containers": [{ "name": "curl", - "image": "curlimages/curl", + "image": "curlimages/curl:8.15.0", "command": ["sh", "-c", "sleep 3600"], "securityContext": { "allowPrivilegeEscalation": false, diff --git a/test/utils/summary.go b/test/utils/summary.go new file mode 100644 index 0000000000..d91ae32391 --- /dev/null +++ b/test/utils/summary.go @@ -0,0 +1,199 @@ +package utils + +import ( + "context" + "fmt" + "math" + "os" + "path/filepath" + "strings" + "text/template" + "time" + + "github.com/prometheus/client_golang/api" + v1 "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" +) + +var ( + summaryTemplate = "summary.md.tmpl" + alertsTemplate = "alert.md.tmpl" + chartTemplate = "mermaid_chart.md.tmpl" + defaultPromUrl = "http://localhost:30900" +) + +type summaryAlerts struct { + FiringAlerts []summaryAlert + PendingAlerts []summaryAlert +} + +type summaryAlert struct { + v1.Alert + Name string + Description string +} + +type xychart struct { + Title string + YMax float64 + YMin float64 + YLabel string + Data string +} + +type githubSummary struct { + client api.Client + Pods []string +} + +func NewSummary(c api.Client, pods ...string) githubSummary { + return githubSummary{ + client: c, + Pods: pods, + } +} + +// PerformanceQuery queries the prometheus server and generates a mermaid xychart with the data. +// title - Display name of the xychart +// pod - Pod name with which to filter results from prometheus +// query - Prometheus query +// yLabel - Label of the Y axis i.e. "KB/s", "MB", etc. +// scaler - Constant by which to scale the results. For instance, cpu usage is more human-readable +// as "mCPU" vs "CPU", so we scale the results by a factor of 1,000. +func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) { + v1api := v1.NewAPI(s.client) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + fullQuery := fmt.Sprintf(query, pod) + result, warnings, err := v1api.Query(ctx, fullQuery, time.Now()) + if err != nil { + return "", err + } else if len(warnings) > 0 { + fmt.Printf("warnings returned from performance query; query=%s, warnings=%v", fullQuery, warnings) + } else if result.Type() != model.ValMatrix { + return "", fmt.Errorf("incompatible result type; need: %s, got: %s", model.ValMatrix, result.Type().String()) + } + + matrix, ok := result.(model.Matrix) + if !ok { + return "", fmt.Errorf("typecast for metrics samples failed; aborting") + } else if len(matrix) > 1 { + return "", fmt.Errorf("expected 1 set of results; got: %d", len(matrix)) + } + chart := xychart{ + Title: title, + YLabel: yLabel, + YMax: math.SmallestNonzeroFloat64, + YMin: math.MaxFloat64, + } + formattedData := make([]string, 0) + // matrix does not allow [] access, so we just do one iteration for the single result + for _, metric := range matrix { + if len(metric.Values) < 1 { + return "", fmt.Errorf("expected at least one data point; got: %d", len(metric.Values)) + } + for _, sample := range metric.Values { + floatSample := float64(sample.Value) * scaler + formattedData = append(formattedData, fmt.Sprintf("%f", floatSample)) + if floatSample > chart.YMax { + chart.YMax = floatSample + } + if floatSample < chart.YMin { + chart.YMin = floatSample + } + } + } + // Add some padding + chart.YMax = (chart.YMax + (math.Abs(chart.YMax) * 0.05)) + chart.YMin = (chart.YMin - (math.Abs(chart.YMin) * 0.05)) + // Pretty print the values, ex: [1,2,3,4] + chart.Data = strings.ReplaceAll(fmt.Sprintf("%v", formattedData), " ", ",") + + return executeTemplate(chartTemplate, chart) +} + +// Alerts queries the prometheus server for alerts and generates markdown output for anything found. +// If no alerts are found, the alerts section will contain only "None." in the final output. +func (s githubSummary) Alerts() (string, error) { + v1api := v1.NewAPI(s.client) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + result, err := v1api.Alerts(ctx) + if err != nil { + return "", err + } + + firingAlerts := make([]summaryAlert, 0) + pendingAlerts := make([]summaryAlert, 0) + if len(result.Alerts) > 0 { + for _, a := range result.Alerts { + aConv := summaryAlert{ + Alert: a, + Name: string(a.Labels["alertname"]), + Description: string(a.Annotations["description"]), + } + switch a.State { + case v1.AlertStateFiring: + firingAlerts = append(firingAlerts, aConv) + case v1.AlertStatePending: + pendingAlerts = append(pendingAlerts, aConv) + // Ignore AlertStateInactive; the alerts endpoint doesn't return them + } + } + } else { + return "None.", nil + } + + return executeTemplate(alertsTemplate, summaryAlerts{ + FiringAlerts: firingAlerts, + PendingAlerts: pendingAlerts, + }) +} + +func executeTemplate(templateFile string, obj any) (string, error) { + wd, err := os.Getwd() + if err != nil { + return "", fmt.Errorf("failed to get working directory: %w", err) + } + tmpl, err := template.New(templateFile).ParseGlob(filepath.Join(wd, "../utils/templates", templateFile)) + if err != nil { + return "", err + } + buffer := new(strings.Builder) + err = tmpl.Execute(buffer, obj) + if err != nil { + return "", err + } + return buffer.String(), nil +} + +// PrintSummary executes the main summary template, generating the full test report. +// The markdown is template-driven; the summary methods are called from within the +// template. This allows us to add or change queries (hopefully) without needing to +// touch code. The summary will be output to a file supplied by the env target. +func PrintSummary(envTarget string) error { + client, err := api.NewClient(api.Config{ + Address: defaultPromUrl, + }) + if err != nil { + fmt.Printf("Error creating prometheus client: %v\n", err) + os.Exit(1) + } + + summary := NewSummary(client, "operator-controller", "catalogd") + summaryMarkdown, err := executeTemplate(summaryTemplate, summary) + if err != nil { + return err + } + if path := os.Getenv(envTarget); path != "" { + err = os.WriteFile(path, []byte(summaryMarkdown), 0o600) + if err != nil { + return err + } + fmt.Printf("Test summary output to %s successful\n", envTarget) + } else { + fmt.Printf("No summary output specified; skipping") + } + return nil +} diff --git a/test/utils/templates/alert.md.tmpl b/test/utils/templates/alert.md.tmpl new file mode 100644 index 0000000000..39f3e42879 --- /dev/null +++ b/test/utils/templates/alert.md.tmpl @@ -0,0 +1,16 @@ +{{- /* -------------------- Alert Template --------------------- */ -}} +{{define "alert"}} +| {{ .Name }} | {{ .Description }} | +| -------- | ------- | +| ActiveAt | {{ .ActiveAt }} | +| State | {{ .State }} | +{{- end}} + +### Firing Alerts +{{ range .FiringAlerts }} +{{ template "alert" .}} +{{ end }} +### Pending Alerts +{{ range .PendingAlerts }} +{{ template "alert" .}} +{{ end }} diff --git a/test/utils/templates/mermaid_chart.md.tmpl b/test/utils/templates/mermaid_chart.md.tmpl new file mode 100644 index 0000000000..0a8ed1135b --- /dev/null +++ b/test/utils/templates/mermaid_chart.md.tmpl @@ -0,0 +1,17 @@ +
+ +```mermaid +--- +config: + xyChart: + showDataLabel: true + xAxis: + showLabel: false +--- +xychart-beta +title "{{ .Title }}" +y-axis "{{ .YLabel }}" {{printf "%f" .YMin}} --> {{printf "%f" .YMax}} +x-axis "time (start of test to end)" +line {{.Data}} +``` +
diff --git a/test/utils/templates/summary.md.tmpl b/test/utils/templates/summary.md.tmpl new file mode 100644 index 0000000000..c094d49f38 --- /dev/null +++ b/test/utils/templates/summary.md.tmpl @@ -0,0 +1,22 @@ + +{{- /* ------------ Performance Statistics Template ------------ */ -}} +{{define "performanceStatistics" -}} +{{ range $index, $pod := .Pods }} +### {{$pod}} +#### Memory Usage +{{$.PerformanceQuery "Memory Usage" $pod `container_memory_working_set_bytes{pod=~"%s.*",container="manager"}[5m]` "MB" .000001}} + +#### Memory Growth Rate +{{$.PerformanceQuery "Memory Growth Rate" $pod `deriv(sum(container_memory_working_set_bytes{pod=~"%s.*",container="manager"})[5m:])[5m:]` "KB/s" .001}} + +#### CPU Usage +{{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}} +{{end}} +{{- end}} + +{{- /* ----------------- E2E Summary Markdown ------------------ */ -}} +# E2E Summary +## Alerts +{{.Alerts}} +## Performance +{{ template "performanceStatistics" . -}} diff --git a/testdata/images/bundles/test-operator/v1.0.0/manifests/testoperator.clusterserviceversion.yaml b/testdata/images/bundles/test-operator/v1.0.0/manifests/testoperator.clusterserviceversion.yaml index a566e3595e..3520f53dbd 100644 --- a/testdata/images/bundles/test-operator/v1.0.0/manifests/testoperator.clusterserviceversion.yaml +++ b/testdata/images/bundles/test-operator/v1.0.0/manifests/testoperator.clusterserviceversion.yaml @@ -58,7 +58,7 @@ spec: terminationGracePeriodSeconds: 0 containers: - name: busybox - image: busybox + image: busybox:1.36 command: - 'sleep' - '1000'