Skip to content

Commit ad199f1

Browse files
authored
API Call Alerts (#2139)
Do not fail e2e run when issues with summary generation are encountered. Fail the run if alerts are encountered. Add prometheus alerts for excessive API calls from operator-controller or catalogd, as well as summary graphs to match. Signed-off-by: Daniel Franz <[email protected]>
1 parent 3d6a33b commit ad199f1

File tree

5 files changed

+63
-42
lines changed

5 files changed

+63
-42
lines changed

.github/workflows/e2e.yaml

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,22 +33,7 @@ jobs:
3333
go-version-file: go.mod
3434

3535
- name: Run e2e tests
36-
run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
37-
38-
- name: alerts-check
39-
# Grab all current alerts, filtering out pending, and print the GH actions warning string
40-
# containing the alert name and description.
41-
#
42-
# NOTE: Leaving this as annotating-only instead of failing the run until we have some more
43-
# finely-tuned alerts.
44-
run: |
45-
if [[ -s /tmp/artifacts/alerts.out ]]; then \
46-
jq -r 'if .state=="firing" then
47-
"::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
48-
elif .state=="pending" then
49-
"::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
50-
end' /tmp/artifacts/alerts.out
51-
fi
36+
run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-e2e
5237

5338
- uses: actions/upload-artifact@v4
5439
if: failure()
@@ -75,7 +60,7 @@ jobs:
7560
go-version-file: go.mod
7661

7762
- name: Run e2e tests
78-
run: ARTIFACT_PATH=/tmp/artifacts make test-experimental-e2e
63+
run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-experimental-e2e
7964

8065
- uses: actions/upload-artifact@v4
8166
if: failure()

config/overlays/prometheus/prometheus_rule.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,15 @@ spec:
5757
keep_firing_for: 1d
5858
annotations:
5959
description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%"
60+
- alert: operator-controller-api-call-rate
61+
expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
62+
for: 5m
63+
keep_firing_for: 1d
64+
annotations:
65+
description: "operator-controller making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"
66+
- alert: catalogd-api-call-rate
67+
expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
68+
for: 5m
69+
keep_firing_for: 1d
70+
annotations:
71+
description: "catalogd making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"

test/e2e/e2e_suite_test.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ var (
2525
)
2626

2727
const (
28-
testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY"
28+
testSummaryOutputEnvVar = "E2E_SUMMARY_OUTPUT"
2929
testCatalogRefEnvVar = "CATALOG_IMG"
3030
testCatalogName = "test-catalog"
3131
latestImageTag = "latest"
@@ -40,9 +40,16 @@ func TestMain(m *testing.M) {
4040
utilruntime.Must(err)
4141

4242
res := m.Run()
43-
err = utils.PrintSummary(testSummaryOutputEnvVar)
44-
if err != nil {
45-
fmt.Println("PrintSummary error", err)
43+
path := os.Getenv(testSummaryOutputEnvVar)
44+
if path == "" {
45+
fmt.Printf("Note: E2E_SUMMARY_OUTPUT is unset; skipping summary generation")
46+
} else {
47+
err = utils.PrintSummary(path)
48+
if err != nil {
49+
// Fail the run if alerts are found
50+
fmt.Printf("%v", err)
51+
os.Exit(1)
52+
}
4653
}
4754
os.Exit(res)
4855
}

test/utils/summary.go

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,16 @@ type xychart struct {
4242
}
4343

4444
type githubSummary struct {
45-
client api.Client
46-
Pods []string
45+
client api.Client
46+
Pods []string
47+
alertsFiring bool
4748
}
4849

4950
func NewSummary(c api.Client, pods ...string) githubSummary {
5051
return githubSummary{
51-
client: c,
52-
Pods: pods,
52+
client: c,
53+
Pods: pods,
54+
alertsFiring: false,
5355
}
5456
}
5557

@@ -60,7 +62,7 @@ func NewSummary(c api.Client, pods ...string) githubSummary {
6062
// yLabel - Label of the Y axis i.e. "KB/s", "MB", etc.
6163
// scaler - Constant by which to scale the results. For instance, cpu usage is more human-readable
6264
// as "mCPU" vs "CPU", so we scale the results by a factor of 1,000.
63-
func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) {
65+
func (s *githubSummary) PerformanceQuery(title, pod, query, yLabel string, scaler float64) (string, error) {
6466
v1api := v1.NewAPI(s.client)
6567
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
6668
defer cancel()
@@ -90,8 +92,9 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string,
9092
formattedData := make([]string, 0)
9193
// matrix does not allow [] access, so we just do one iteration for the single result
9294
for _, metric := range matrix {
93-
if len(metric.Values) < 1 {
94-
return "", fmt.Errorf("expected at least one data point; got: %d", len(metric.Values))
95+
if len(metric.Values) < 2 {
96+
// A graph with one data point means something with the collection was wrong
97+
return "", fmt.Errorf("expected at least two data points; got: %d", len(metric.Values))
9598
}
9699
for _, sample := range metric.Values {
97100
floatSample := float64(sample.Value) * scaler
@@ -115,7 +118,7 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string,
115118

116119
// Alerts queries the prometheus server for alerts and generates markdown output for anything found.
117120
// If no alerts are found, the alerts section will contain only "None." in the final output.
118-
func (s githubSummary) Alerts() (string, error) {
121+
func (s *githubSummary) Alerts() (string, error) {
119122
v1api := v1.NewAPI(s.client)
120123
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
121124
defer cancel()
@@ -136,6 +139,7 @@ func (s githubSummary) Alerts() (string, error) {
136139
switch a.State {
137140
case v1.AlertStateFiring:
138141
firingAlerts = append(firingAlerts, aConv)
142+
s.alertsFiring = true
139143
case v1.AlertStatePending:
140144
pendingAlerts = append(pendingAlerts, aConv)
141145
// Ignore AlertStateInactive; the alerts endpoint doesn't return them
@@ -172,28 +176,34 @@ func executeTemplate(templateFile string, obj any) (string, error) {
172176
// The markdown is template-driven; the summary methods are called from within the
173177
// template. This allows us to add or change queries (hopefully) without needing to
174178
// touch code. The summary will be output to a file supplied by the env target.
175-
func PrintSummary(envTarget string) error {
179+
func PrintSummary(path string) error {
180+
if path == "" {
181+
fmt.Printf("No summary output path specified; skipping")
182+
return nil
183+
}
184+
176185
client, err := api.NewClient(api.Config{
177186
Address: defaultPromUrl,
178187
})
179188
if err != nil {
180-
fmt.Printf("Error creating prometheus client: %v\n", err)
181-
os.Exit(1)
189+
fmt.Printf("warning: failed to initialize promQL client: %v", err)
190+
return nil
182191
}
183192

184193
summary := NewSummary(client, "operator-controller", "catalogd")
185-
summaryMarkdown, err := executeTemplate(summaryTemplate, summary)
194+
summaryMarkdown, err := executeTemplate(summaryTemplate, &summary)
186195
if err != nil {
187-
return err
196+
fmt.Printf("warning: failed to generate e2e test summary: %v", err)
197+
return nil
188198
}
189-
if path := os.Getenv(envTarget); path != "" {
190-
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
191-
if err != nil {
192-
return err
193-
}
194-
fmt.Printf("Test summary output to %s successful\n", envTarget)
195-
} else {
196-
fmt.Printf("No summary output specified; skipping")
199+
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
200+
if err != nil {
201+
fmt.Printf("warning: failed to write e2e test summary output to %s: %v", path, err)
202+
return nil
203+
}
204+
fmt.Printf("Test summary output to %s successful\n", path)
205+
if summary.alertsFiring {
206+
return fmt.Errorf("performance alerts encountered during test run; please check e2e test summary for details")
197207
}
198208
return nil
199209
}

test/utils/templates/summary.md.tmpl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111

1212
#### CPU Usage
1313
{{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}}
14+
15+
#### API Queries Total
16+
{{$.PerformanceQuery "API Queries Total" $pod `sum(rest_client_requests_total{job=~"%s.*"})[5m:]` "# queries" 1}}
17+
18+
#### API Query Rate
19+
{{$.PerformanceQuery "API Queries/sec" $pod `sum(rate(rest_client_requests_total{job=~"%s.*"}[5m]))[5m:]` "per sec" 1}}
20+
1421
{{end}}
1522
{{- end}}
1623

0 commit comments

Comments
 (0)