Skip to content

Commit d790ce5

Browse files
committed
API Call Alerts
Do not fail e2e run when issues with summary generation are encountered. Fail the run if alerts are encountered. Add prometheus alerts for excessive API calls from operator-controller or catalogd, as well as summary graphs to match. Signed-off-by: Daniel Franz <[email protected]>
1 parent 3d6a33b commit d790ce5

File tree

5 files changed

+63
-42
lines changed

5 files changed

+63
-42
lines changed

.github/workflows/e2e.yaml

Lines changed: 2 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -33,22 +33,7 @@ jobs:
3333
go-version-file: go.mod
3434

3535
- name: Run e2e tests
36-
run: ARTIFACT_PATH=/tmp/artifacts make test-e2e
37-
38-
- name: alerts-check
39-
# Grab all current alerts, filtering out pending, and print the GH actions warning string
40-
# containing the alert name and description.
41-
#
42-
# NOTE: Leaving this as annotating-only instead of failing the run until we have some more
43-
# finely-tuned alerts.
44-
run: |
45-
if [[ -s /tmp/artifacts/alerts.out ]]; then \
46-
jq -r 'if .state=="firing" then
47-
"::error title=Prometheus Alert Firing::\(.labels.alertname): \(.annotations.description)"
48-
elif .state=="pending" then
49-
"::warning title=Prometheus Alert Pending::\(.labels.alertname): \(.annotations.description)"
50-
end' /tmp/artifacts/alerts.out
51-
fi
36+
run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-e2e
5237

5338
- uses: actions/upload-artifact@v4
5439
if: failure()
@@ -75,7 +60,7 @@ jobs:
7560
go-version-file: go.mod
7661

7762
- name: Run e2e tests
78-
run: ARTIFACT_PATH=/tmp/artifacts make test-experimental-e2e
63+
run: ARTIFACT_PATH=/tmp/artifacts E2E_SUMMARY_OUTPUT=$GITHUB_STEP_SUMMARY make test-experimental-e2e
7964

8065
- uses: actions/upload-artifact@v4
8166
if: failure()

config/overlays/prometheus/prometheus_rule.yaml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,15 @@ spec:
5757
keep_firing_for: 1d
5858
annotations:
5959
description: "catalogd using high cpu resources for 5 minutes: {{ $value | printf \"%.2f\" }}%"
60+
- alert: operator-controller-api-call-rate
61+
expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
62+
for: 5m
63+
keep_firing_for: 1d
64+
annotations:
65+
description: "operator-controller making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"
66+
- alert: catalogd-api-call-rate
67+
expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
68+
for: 5m
69+
keep_firing_for: 1d
70+
annotations:
71+
description: "catalogd making excessive API calls for 5 minutes: {{ $value | printf \"%.2f\" }}/sec"

test/e2e/e2e_suite_test.go

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ var (
2525
)
2626

2727
const (
28-
testSummaryOutputEnvVar = "GITHUB_STEP_SUMMARY"
28+
testSummaryOutputEnvVar = "E2E_SUMMARY_OUTPUT"
2929
testCatalogRefEnvVar = "CATALOG_IMG"
3030
testCatalogName = "test-catalog"
3131
latestImageTag = "latest"
@@ -40,9 +40,16 @@ func TestMain(m *testing.M) {
4040
utilruntime.Must(err)
4141

4242
res := m.Run()
43-
err = utils.PrintSummary(testSummaryOutputEnvVar)
44-
if err != nil {
45-
fmt.Println("PrintSummary error", err)
43+
path := os.Getenv(testSummaryOutputEnvVar)
44+
if path == "" {
45+
fmt.Printf("Note: E2E_SUMMARY_OUTPUT is unset; skipping summary generation")
46+
} else {
47+
err = utils.PrintSummary(path)
48+
if err != nil {
49+
// Fail the run if alerts are found
50+
fmt.Printf("%v", err)
51+
os.Exit(1)
52+
}
4653
}
4754
os.Exit(res)
4855
}

test/utils/summary.go

Lines changed: 31 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -42,14 +42,16 @@ type xychart struct {
4242
}
4343

4444
type githubSummary struct {
45-
client api.Client
46-
Pods []string
45+
client api.Client
46+
Pods []string
47+
alertsFiring bool
4748
}
4849

4950
func NewSummary(c api.Client, pods ...string) githubSummary {
5051
return githubSummary{
51-
client: c,
52-
Pods: pods,
52+
client: c,
53+
Pods: pods,
54+
alertsFiring: false,
5355
}
5456
}
5557

@@ -60,7 +62,7 @@ func NewSummary(c api.Client, pods ...string) githubSummary {
6062
// yLabel - Label of the Y axis i.e. "KB/s", "MB", etc.
6163
// scaler - Constant by which to scale the results. For instance, cpu usage is more human-readable
6264
// as "mCPU" vs "CPU", so we scale the results by a factor of 1,000.
63-
func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string, scaler float64) (string, error) {
65+
func (s *githubSummary) PerformanceQuery(title, pod, query, yLabel string, scaler float64) (string, error) {
6466
v1api := v1.NewAPI(s.client)
6567
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
6668
defer cancel()
@@ -90,8 +92,9 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string,
9092
formattedData := make([]string, 0)
9193
// matrix does not allow [] access, so we just do one iteration for the single result
9294
for _, metric := range matrix {
93-
if len(metric.Values) < 1 {
94-
return "", fmt.Errorf("expected at least one data point; got: %d", len(metric.Values))
95+
if len(metric.Values) < 2 {
96+
// A graph with one data point means something with the collection was wrong
97+
return "", fmt.Errorf("expected at least two data points; got: %d", len(metric.Values))
9598
}
9699
for _, sample := range metric.Values {
97100
floatSample := float64(sample.Value) * scaler
@@ -115,7 +118,7 @@ func (s githubSummary) PerformanceQuery(title, pod, query string, yLabel string,
115118

116119
// Alerts queries the prometheus server for alerts and generates markdown output for anything found.
117120
// If no alerts are found, the alerts section will contain only "None." in the final output.
118-
func (s githubSummary) Alerts() (string, error) {
121+
func (s *githubSummary) Alerts() (string, error) {
119122
v1api := v1.NewAPI(s.client)
120123
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
121124
defer cancel()
@@ -136,6 +139,7 @@ func (s githubSummary) Alerts() (string, error) {
136139
switch a.State {
137140
case v1.AlertStateFiring:
138141
firingAlerts = append(firingAlerts, aConv)
142+
s.alertsFiring = true
139143
case v1.AlertStatePending:
140144
pendingAlerts = append(pendingAlerts, aConv)
141145
// Ignore AlertStateInactive; the alerts endpoint doesn't return them
@@ -172,28 +176,34 @@ func executeTemplate(templateFile string, obj any) (string, error) {
172176
// The markdown is template-driven; the summary methods are called from within the
173177
// template. This allows us to add or change queries (hopefully) without needing to
174178
// touch code. The summary will be output to a file supplied by the env target.
175-
func PrintSummary(envTarget string) error {
179+
func PrintSummary(path string) error {
180+
if path == "" {
181+
fmt.Printf("No summary output path specified; skipping")
182+
return nil
183+
}
184+
176185
client, err := api.NewClient(api.Config{
177186
Address: defaultPromUrl,
178187
})
179188
if err != nil {
180-
fmt.Printf("Error creating prometheus client: %v\n", err)
181-
os.Exit(1)
189+
fmt.Printf("warning: failed to initialize promQL client: %v", err)
190+
return nil
182191
}
183192

184193
summary := NewSummary(client, "operator-controller", "catalogd")
185-
summaryMarkdown, err := executeTemplate(summaryTemplate, summary)
194+
summaryMarkdown, err := executeTemplate(summaryTemplate, &summary)
186195
if err != nil {
187-
return err
196+
fmt.Printf("warning: failed to generate e2e test summary: %v", err)
197+
return nil
188198
}
189-
if path := os.Getenv(envTarget); path != "" {
190-
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
191-
if err != nil {
192-
return err
193-
}
194-
fmt.Printf("Test summary output to %s successful\n", envTarget)
195-
} else {
196-
fmt.Printf("No summary output specified; skipping")
199+
err = os.WriteFile(path, []byte(summaryMarkdown), 0o600)
200+
if err != nil {
201+
fmt.Printf("warning: failed to write e2e test summary output to %s: %v", path, err)
202+
return nil
203+
}
204+
fmt.Printf("Test summary output to %s successful\n", path)
205+
if summary.alertsFiring {
206+
return fmt.Errorf("performance alerts encountered during test run; please check e2e test summary for details")
197207
}
198208
return nil
199209
}

test/utils/templates/summary.md.tmpl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111

1212
#### CPU Usage
1313
{{$.PerformanceQuery "CPU Usage" $pod `rate(container_cpu_usage_seconds_total{pod=~"%s.*",container="manager"}[5m])[5m:]` "mCPU" 1000}}
14+
15+
#### API Queries Total
16+
{{$.PerformanceQuery "API Queries Total" $pod `sum(rest_client_requests_total{job=~"%s.*"})[5m:]` "# queries" 1}}
17+
18+
#### API Query Rate
19+
{{$.PerformanceQuery "API Queries/sec" $pod `sum(rate(rest_client_requests_total{job=~"%s.*"}[5m]))[5m:]` "per sec" 1}}
20+
1421
{{end}}
1522
{{- end}}
1623

0 commit comments

Comments
 (0)