From d790ce5bf773ff624bd3cd525e0c460cc9f3fadd Mon Sep 17 00:00:00 2001
From: Daniel Franz <dfranz@redhat.com>
Date: Tue, 5 Aug 2025 17:55:00 +0900
Subject: [PATCH] API Call Alerts

Do not fail e2e run when issues with summary generation are encountered. Fail the run if alerts are encountered.

Add prometheus alerts for excessive API calls from operator-controller or catalogd, as well as summary graphs to match.

Signed-off-by: Daniel Franz <dfranz@redhat.com>
---
 .github/workflows/e2e.yaml                    | 19 +------
 .../overlays/prometheus/prometheus_rule.yaml  | 12 +++++
 test/e2e/e2e_suite_test.go                    | 15 ++++--
 test/utils/summary.go                         | 52 +++++++++++--------
 test/utils/templates/summary.md.tmpl          |  7 +++
 5 files changed, 63 insertions(+), 42 deletions(-)

diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index 97e0a21815..8e7d8d511e 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -33,22 +33,7 @@ jobs:
           go-version-file: go.mod
 
       - name: Run e2e tests
-        run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
-
-      - name: alerts-check
-        # Grab all current alerts, filtering out pending, and print the GH actions warning string
-        # containing the alert name and description.
-        #
-        # NOTE: Leaving this as annotating-only instead of failing the run until we have some more
-        # finely-tuned alerts.
-        run: |
-          if [[ -s /tmp/artifacts/alerts.out ]]; then \
-            jq -r 'if .state=="firing" then
-              "::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
-            elif .state=="pending" then
-              "::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
-            end' /tmp/artifacts/alerts.out
-          fi
+        run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-e2e
 
       - uses: actions/upload-artifact@v4
         if: failure()
@@ -75,7 +60,7 @@ jobs:
           go-version-file: go.mod
 
       - name: Run e2e tests
-        run: ARTIFACT_PATH=/tmp/artifacts make test-experimental-e2e
+        run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-experimental-e2e
 
       - uses: actions/upload-artifact@v4
         if: failure()
diff --git a/config/overlays/prometheus/prometheus_rule.yaml b/config/overlays/prometheus/prometheus_rule.yaml
index 5bd7e120b8..b7e3fcdafe 100644
--- a/config/overlays/prometheus/prometheus_rule.yaml
+++ b/config/overlays/prometheus/prometheus_rule.yaml
@@ -57,3 +57,15 @@ spec:
       keep_firing_for: 1d
       annotations:
         description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%"
+    - alert: operator-controller-api-call-rate
+      expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "operator-controller making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"
+    - alert: catalogd-api-call-rate
+      expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
+      for: 5m
+      keep_firing_for: 1d
+      annotations:
+        description: "catalogd making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"
diff --git a/test/e2e/e2e_suite_test.go b/test/e2e/e2e_suite_test.go
index cf4f474eba..5fa87d6c1b 100644
--- a/test/e2e/e2e_suite_test.go
+++ b/test/e2e/e2e_suite_test.go
@@ -25,7 +25,7 @@ var (
 )
 
 const (
-	testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY"
+	testSummaryOutputEnvVar = "E2E_SUMMARY_OUTPUT"
 	testCatalogRefEnvVar    = "CATALOG_IMG"
 	testCatalogName         = "test-catalog"
 	latestImageTag          = "latest"
@@ -40,9 +40,16 @@ func TestMain(m *testing.M) {
 	utilruntime.Must(err)
 
 	res := m.Run()
-	err = utils.PrintSummary(testSummaryOutputEnvVar)
-	if err != nil {
-		fmt.Println("PrintSummary error", err)
+	path := os.Getenv(testSummaryOutputEnvVar)
+	if path == "" {
+		fmt.Printf("Note: E2E_SUMMARY_OUTPUT is unset; skipping summary generation")
+	} else {
+		err = utils.PrintSummary(path)
+		if err != nil {
+			// Fail the run if alerts are found
+			fmt.Printf("%v", err)
+			os.Exit(1)
+		}
 	}
 	os.Exit(res)
 }
diff --git a/test/utils/summary.go b/test/utils/summary.go
index d91ae32391..f3830d30e3 100644
--- a/test/utils/summary.go
+++ b/test/utils/summary.go
@@ -42,14 +42,16 @@ type xychart struct {
 }
 
 type githubSummary struct {
-	client api.Client
-	Pods   []string
+	client       api.Client
+	Pods         []string
+	alertsFiring bool
 }
 
 func NewSummary(c api.Client, pods ...string) githubSummary {
 	return githubSummary{
-		client: c,
-		Pods:   pods,
+		client:       c,
+		Pods:         pods,
+		alertsFiring: false,
 	}
 }
 
@@ -60,7 +62,7 @@ func NewSummary(c api.Client, pods ...string) githubSummary {
 // yLabel - Label of the Y axis i.e. "KB/s", "MB", etc.
 // scaler - Constant by which to scale the results. For instance, cpu usage is more human-readable
 // as "mCPU" vs "CPU", so we scale the results by a factor of 1,000.
-func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) {
+func (s *githubSummary) PerformanceQuery(title, pod, query, yLabel string, scaler float64) (string, error) {
 	v1api := v1.NewAPI(s.client)
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
@@ -90,8 +92,9 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string,
 	formattedData := make([]string, 0)
 	// matrix does not allow [] access, so we just do one iteration for the single result
 	for _, metric := range matrix {
-		if len(metric.Values) < 1 {
-			return "", fmt.Errorf("expected at least one data point; got: %d", len(metric.Values))
+		if len(metric.Values) < 2 {
+			// A graph with one data point means something with the collection was wrong
+			return "", fmt.Errorf("expected at least two data points; got: %d", len(metric.Values))
 		}
 		for _, sample := range metric.Values {
 			floatSample := float64(sample.Value) * scaler
@@ -115,7 +118,7 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string,
 
 // Alerts queries the prometheus server for alerts and generates markdown output for anything found.
 // If no alerts are found, the alerts section will contain only "None." in the final output.
-func (s githubSummary) Alerts() (string, error) {
+func (s *githubSummary) Alerts() (string, error) {
 	v1api := v1.NewAPI(s.client)
 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
@@ -136,6 +139,7 @@ func (s githubSummary) Alerts() (string, error) {
 			switch a.State {
 			case v1.AlertStateFiring:
 				firingAlerts = append(firingAlerts, aConv)
+				s.alertsFiring = true
 			case v1.AlertStatePending:
 				pendingAlerts = append(pendingAlerts, aConv)
 				// Ignore AlertStateInactive; the alerts endpoint doesn't return them
@@ -172,28 +176,34 @@ func executeTemplate(templateFile string, obj any) (string, error) {
 // The markdown is template-driven; the summary methods are called from within the
 // template. This allows us to add or change queries (hopefully) without needing to
 // touch code. The summary will be output to a file supplied by the env target.
-func PrintSummary(envTarget string) error {
+func PrintSummary(path string) error {
+	if path == "" {
+		fmt.Printf("No summary output path specified; skipping")
+		return nil
+	}
+
 	client, err := api.NewClient(api.Config{
 		Address: defaultPromUrl,
 	})
 	if err != nil {
-		fmt.Printf("Error creating prometheus client: %v\n", err)
-		os.Exit(1)
+		fmt.Printf("warning: failed to initialize promQL client: %v", err)
+		return nil
 	}
 
 	summary := NewSummary(client, "operator-controller", "catalogd")
-	summaryMarkdown, err := executeTemplate(summaryTemplate, summary)
+	summaryMarkdown, err := executeTemplate(summaryTemplate, &summary)
 	if err != nil {
-		return err
+		fmt.Printf("warning: failed to generate e2e test summary: %v", err)
+		return nil
 	}
-	if path := os.Getenv(envTarget); path != "" {
-		err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
-		if err != nil {
-			return err
-		}
-		fmt.Printf("Test summary output to %s successful\n", envTarget)
-	} else {
-		fmt.Printf("No summary output specified; skipping")
+	err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
+	if err != nil {
+		fmt.Printf("warning: failed to write e2e test summary output to %s: %v", path, err)
+		return nil
+	}
+	fmt.Printf("Test summary output to %s successful\n", path)
+	if summary.alertsFiring {
+		return fmt.Errorf("performance alerts encountered during test run; please check e2e test summary for details")
 	}
 	return nil
 }
diff --git a/test/utils/templates/summary.md.tmpl b/test/utils/templates/summary.md.tmpl
index c094d49f38..b1372b8740 100644
--- a/test/utils/templates/summary.md.tmpl
+++ b/test/utils/templates/summary.md.tmpl
@@ -11,6 +11,13 @@
 
 #### CPU Usage
 {{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}}
+
+#### API Queries Total
+{{$.PerformanceQuery "API Queries Total" $pod `sum(rest_client_requests_total{job=~"%s.*"})[5m:]` "# queries" 1}}
+
+#### API Query Rate
+{{$.PerformanceQuery "API Queries/sec" $pod `sum(rate(rest_client_requests_total{job=~"%s.*"}[5m]))[5m:]` "per sec" 1}}
+
 {{end}}
 {{- end}}