Skip to content

Commit 88790f9

Browse files
authored
Add config option to omit exporting rule_group label on ruler metrics (#4571)
* add config option to omit exporting rule_group label on ruler metrics
  Signed-off-by: Zahari Dichev <[email protected]>
* add changelog entry
  Signed-off-by: Zahari Dichev <[email protected]>
1 parent b525af5 commit 88790f9

File tree

6 files changed

+141
-27
lines changed

6 files changed

+141
-27
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
## master / unreleased
44

55
* [CHANGE] Changed default for `-ingester.min-ready-duration` from 1 minute to 15 seconds. #4539
6+
* [ENHANCEMENT] Ruler: Add `-ruler.disable-rule-group-label` to disable the `rule_group` label on exported metrics. #4571
67
* [ENHANCEMENT] Query federation: improve performance in MergeQueryable by memoizing labels. #4502
78
* [ENHANCEMENT] Added new ring related config `-ingester.readiness-check-ring-health` when enabled the readiness probe will succeed only after all instances are ACTIVE and healthy in the ring, this is enabled by default. #4539
89
* [CHANGE] query-frontend: Do not print anything in the logs of `query-frontend` if a in-progress query has been canceled (context canceled). #4562

docs/configuration/config-file-reference.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1650,6 +1650,10 @@ ring:
16501650
# an info level log message.
16511651
# CLI flag: -ruler.query-stats-enabled
16521652
[query_stats_enabled: <boolean> | default = false]
1653+
1654+
# Disable the rule_group label on exported metrics
1655+
# CLI flag: -ruler.disable-rule-group-label
1656+
[disable_rule_group_label: <boolean> | default = false]
16531657
```
16541658
16551659
### `ruler_storage_config`

pkg/ruler/manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg
5353
return nil, err
5454
}
5555

56-
userManagerMetrics := NewManagerMetrics()
56+
userManagerMetrics := NewManagerMetrics(cfg.DisableRuleGroupLabel)
5757
if reg != nil {
5858
reg.MustRegister(userManagerMetrics)
5959
}

pkg/ruler/manager_metrics.go

Lines changed: 31 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ import (
99
// ManagerMetrics aggregates metrics exported by the Prometheus
1010
// rules package and returns them as Cortex metrics
1111
type ManagerMetrics struct {
12-
regs *util.UserRegistries
12+
regs *util.UserRegistries
13+
disableRuleGroupLabel bool
1314

1415
EvalDuration *prometheus.Desc
1516
IterationDuration *prometheus.Desc
@@ -25,9 +26,14 @@ type ManagerMetrics struct {
2526
}
2627

2728
// NewManagerMetrics returns a ManagerMetrics struct
28-
func NewManagerMetrics() *ManagerMetrics {
29+
func NewManagerMetrics(disableRuleGroupLabel bool) *ManagerMetrics {
30+
commonLabels := []string{"user"}
31+
if !disableRuleGroupLabel {
32+
commonLabels = append(commonLabels, "rule_group")
33+
}
2934
return &ManagerMetrics{
30-
regs: util.NewUserRegistries(),
35+
regs: util.NewUserRegistries(),
36+
disableRuleGroupLabel: disableRuleGroupLabel,
3137

3238
EvalDuration: prometheus.NewDesc(
3339
"cortex_prometheus_rule_evaluation_duration_seconds",
@@ -44,55 +50,55 @@ func NewManagerMetrics() *ManagerMetrics {
4450
IterationsMissed: prometheus.NewDesc(
4551
"cortex_prometheus_rule_group_iterations_missed_total",
4652
"The total number of rule group evaluations missed due to slow rule group evaluation.",
47-
[]string{"user", "rule_group"},
53+
commonLabels,
4854
nil,
4955
),
5056
IterationsScheduled: prometheus.NewDesc(
5157
"cortex_prometheus_rule_group_iterations_total",
5258
"The total number of scheduled rule group evaluations, whether executed or missed.",
53-
[]string{"user", "rule_group"},
59+
commonLabels,
5460
nil,
5561
),
5662
EvalTotal: prometheus.NewDesc(
5763
"cortex_prometheus_rule_evaluations_total",
5864
"The total number of rule evaluations.",
59-
[]string{"user", "rule_group"},
65+
commonLabels,
6066
nil,
6167
),
6268
EvalFailures: prometheus.NewDesc(
6369
"cortex_prometheus_rule_evaluation_failures_total",
6470
"The total number of rule evaluation failures.",
65-
[]string{"user", "rule_group"},
71+
commonLabels,
6672
nil,
6773
),
6874
GroupInterval: prometheus.NewDesc(
6975
"cortex_prometheus_rule_group_interval_seconds",
7076
"The interval of a rule group.",
71-
[]string{"user", "rule_group"},
77+
commonLabels,
7278
nil,
7379
),
7480
GroupLastEvalTime: prometheus.NewDesc(
7581
"cortex_prometheus_rule_group_last_evaluation_timestamp_seconds",
7682
"The timestamp of the last rule group evaluation in seconds.",
77-
[]string{"user", "rule_group"},
83+
commonLabels,
7884
nil,
7985
),
8086
GroupLastDuration: prometheus.NewDesc(
8187
"cortex_prometheus_rule_group_last_duration_seconds",
8288
"The duration of the last rule group evaluation.",
83-
[]string{"user", "rule_group"},
89+
commonLabels,
8490
nil,
8591
),
8692
GroupRules: prometheus.NewDesc(
8793
"cortex_prometheus_rule_group_rules",
8894
"The number of rules.",
89-
[]string{"user", "rule_group"},
95+
commonLabels,
9096
nil,
9197
),
9298
GroupLastEvalSamples: prometheus.NewDesc(
9399
"cortex_prometheus_last_evaluation_samples",
94100
"The number of samples returned during the last rule group evaluation.",
95-
[]string{"user", "rule_group"},
101+
commonLabels,
96102
nil,
97103
),
98104
}
@@ -126,21 +132,24 @@ func (m *ManagerMetrics) Describe(out chan<- *prometheus.Desc) {
126132
// Collect implements the Collector interface
127133
func (m *ManagerMetrics) Collect(out chan<- prometheus.Metric) {
128134
data := m.regs.BuildMetricFamiliesPerUser()
129-
135+
labels := []string{}
136+
if !m.disableRuleGroupLabel {
137+
labels = append(labels, "rule_group")
138+
}
130139
// WARNING: It is important that all metrics generated in this method are "Per User".
131140
// Thanks to that we can actually *remove* metrics for given user (see RemoveUserRegistry).
132141
// If same user is later re-added, all metrics will start from 0, which is fine.
133142

134143
data.SendSumOfSummariesPerUser(out, m.EvalDuration, "prometheus_rule_evaluation_duration_seconds")
135144
data.SendSumOfSummariesPerUser(out, m.IterationDuration, "prometheus_rule_group_duration_seconds")
136145

137-
data.SendSumOfCountersPerUserWithLabels(out, m.IterationsMissed, "prometheus_rule_group_iterations_missed_total", "rule_group")
138-
data.SendSumOfCountersPerUserWithLabels(out, m.IterationsScheduled, "prometheus_rule_group_iterations_total", "rule_group")
139-
data.SendSumOfCountersPerUserWithLabels(out, m.EvalTotal, "prometheus_rule_evaluations_total", "rule_group")
140-
data.SendSumOfCountersPerUserWithLabels(out, m.EvalFailures, "prometheus_rule_evaluation_failures_total", "rule_group")
141-
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupInterval, "prometheus_rule_group_interval_seconds", "rule_group")
142-
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastEvalTime, "prometheus_rule_group_last_evaluation_timestamp_seconds", "rule_group")
143-
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", "rule_group")
144-
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupRules, "prometheus_rule_group_rules", "rule_group")
145-
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", "rule_group")
146+
data.SendSumOfCountersPerUserWithLabels(out, m.IterationsMissed, "prometheus_rule_group_iterations_missed_total", labels...)
147+
data.SendSumOfCountersPerUserWithLabels(out, m.IterationsScheduled, "prometheus_rule_group_iterations_total", labels...)
148+
data.SendSumOfCountersPerUserWithLabels(out, m.EvalTotal, "prometheus_rule_evaluations_total", labels...)
149+
data.SendSumOfCountersPerUserWithLabels(out, m.EvalFailures, "prometheus_rule_evaluation_failures_total", labels...)
150+
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupInterval, "prometheus_rule_group_interval_seconds", labels...)
151+
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastEvalTime, "prometheus_rule_group_last_evaluation_timestamp_seconds", labels...)
152+
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastDuration, "prometheus_rule_group_last_duration_seconds", labels...)
153+
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupRules, "prometheus_rule_group_rules", labels...)
154+
data.SendSumOfGaugesPerUserWithLabels(out, m.GroupLastEvalSamples, "prometheus_rule_group_last_evaluation_samples", labels...)
146155
}

pkg/ruler/manager_metrics_test.go

Lines changed: 101 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ import (
1212
"github.com/stretchr/testify/require"
1313
)
1414

15-
func TestManagerMetrics(t *testing.T) {
15+
func TestManagerMetricsWithRuleGroupLabel(t *testing.T) {
1616
mainReg := prometheus.NewPedanticRegistry()
1717

18-
managerMetrics := NewManagerMetrics()
18+
managerMetrics := NewManagerMetrics(false)
1919
mainReg.MustRegister(managerMetrics)
2020
managerMetrics.AddUserRegistry("user1", populateManager(1))
2121
managerMetrics.AddUserRegistry("user2", populateManager(10))
@@ -134,6 +134,104 @@ cortex_prometheus_rule_group_rules{rule_group="group_two",user="user3"} 100000
134134
require.NoError(t, err)
135135
}
136136

137+
func TestManagerMetricsWithoutRuleGroupLabel(t *testing.T) {
138+
mainReg := prometheus.NewPedanticRegistry()
139+
140+
managerMetrics := NewManagerMetrics(true)
141+
mainReg.MustRegister(managerMetrics)
142+
managerMetrics.AddUserRegistry("user1", populateManager(1))
143+
managerMetrics.AddUserRegistry("user2", populateManager(10))
144+
managerMetrics.AddUserRegistry("user3", populateManager(100))
145+
146+
managerMetrics.AddUserRegistry("user4", populateManager(1000))
147+
managerMetrics.RemoveUserRegistry("user4")
148+
149+
//noinspection ALL
150+
err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(`
151+
# HELP cortex_prometheus_last_evaluation_samples The number of samples returned during the last rule group evaluation.
152+
# TYPE cortex_prometheus_last_evaluation_samples gauge
153+
cortex_prometheus_last_evaluation_samples{user="user1"} 2000
154+
cortex_prometheus_last_evaluation_samples{user="user2"} 20000
155+
cortex_prometheus_last_evaluation_samples{user="user3"} 200000
156+
# HELP cortex_prometheus_rule_evaluation_duration_seconds The duration for a rule to execute.
157+
# TYPE cortex_prometheus_rule_evaluation_duration_seconds summary
158+
cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.5"} 1
159+
cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.9"} 1
160+
cortex_prometheus_rule_evaluation_duration_seconds{user="user1",quantile="0.99"} 1
161+
cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user1"} 1
162+
cortex_prometheus_rule_evaluation_duration_seconds_count{user="user1"} 1
163+
cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.5"} 10
164+
cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.9"} 10
165+
cortex_prometheus_rule_evaluation_duration_seconds{user="user2",quantile="0.99"} 10
166+
cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user2"} 10
167+
cortex_prometheus_rule_evaluation_duration_seconds_count{user="user2"} 1
168+
cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.5"} 100
169+
cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.9"} 100
170+
cortex_prometheus_rule_evaluation_duration_seconds{user="user3",quantile="0.99"} 100
171+
cortex_prometheus_rule_evaluation_duration_seconds_sum{user="user3"} 100
172+
cortex_prometheus_rule_evaluation_duration_seconds_count{user="user3"} 1
173+
# HELP cortex_prometheus_rule_evaluation_failures_total The total number of rule evaluation failures.
174+
# TYPE cortex_prometheus_rule_evaluation_failures_total counter
175+
cortex_prometheus_rule_evaluation_failures_total{user="user1"} 2
176+
cortex_prometheus_rule_evaluation_failures_total{user="user2"} 20
177+
cortex_prometheus_rule_evaluation_failures_total{user="user3"} 200
178+
# HELP cortex_prometheus_rule_evaluations_total The total number of rule evaluations.
179+
# TYPE cortex_prometheus_rule_evaluations_total counter
180+
cortex_prometheus_rule_evaluations_total{user="user1"} 2
181+
cortex_prometheus_rule_evaluations_total{user="user2"} 20
182+
cortex_prometheus_rule_evaluations_total{user="user3"} 200
183+
# HELP cortex_prometheus_rule_group_duration_seconds The duration of rule group evaluations.
184+
# TYPE cortex_prometheus_rule_group_duration_seconds summary
185+
cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.01"} 1
186+
cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.05"} 1
187+
cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.5"} 1
188+
cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.9"} 1
189+
cortex_prometheus_rule_group_duration_seconds{user="user1",quantile="0.99"} 1
190+
cortex_prometheus_rule_group_duration_seconds_sum{user="user1"} 1
191+
cortex_prometheus_rule_group_duration_seconds_count{user="user1"} 1
192+
cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.01"} 10
193+
cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.05"} 10
194+
cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.5"} 10
195+
cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.9"} 10
196+
cortex_prometheus_rule_group_duration_seconds{user="user2",quantile="0.99"} 10
197+
cortex_prometheus_rule_group_duration_seconds_sum{user="user2"} 10
198+
cortex_prometheus_rule_group_duration_seconds_count{user="user2"} 1
199+
cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.01"} 100
200+
cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.05"} 100
201+
cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.5"} 100
202+
cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.9"} 100
203+
cortex_prometheus_rule_group_duration_seconds{user="user3",quantile="0.99"} 100
204+
cortex_prometheus_rule_group_duration_seconds_sum{user="user3"} 100
205+
cortex_prometheus_rule_group_duration_seconds_count{user="user3"} 1
206+
# HELP cortex_prometheus_rule_group_iterations_missed_total The total number of rule group evaluations missed due to slow rule group evaluation.
207+
# TYPE cortex_prometheus_rule_group_iterations_missed_total counter
208+
cortex_prometheus_rule_group_iterations_missed_total{user="user1"} 2
209+
cortex_prometheus_rule_group_iterations_missed_total{user="user2"} 20
210+
cortex_prometheus_rule_group_iterations_missed_total{user="user3"} 200
211+
# HELP cortex_prometheus_rule_group_iterations_total The total number of scheduled rule group evaluations, whether executed or missed.
212+
# TYPE cortex_prometheus_rule_group_iterations_total counter
213+
cortex_prometheus_rule_group_iterations_total{user="user1"} 2
214+
cortex_prometheus_rule_group_iterations_total{user="user2"} 20
215+
cortex_prometheus_rule_group_iterations_total{user="user3"} 200
216+
# HELP cortex_prometheus_rule_group_last_duration_seconds The duration of the last rule group evaluation.
217+
# TYPE cortex_prometheus_rule_group_last_duration_seconds gauge
218+
cortex_prometheus_rule_group_last_duration_seconds{user="user1"} 2000
219+
cortex_prometheus_rule_group_last_duration_seconds{user="user2"} 20000
220+
cortex_prometheus_rule_group_last_duration_seconds{user="user3"} 200000
221+
# HELP cortex_prometheus_rule_group_last_evaluation_timestamp_seconds The timestamp of the last rule group evaluation in seconds.
222+
# TYPE cortex_prometheus_rule_group_last_evaluation_timestamp_seconds gauge
223+
cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{user="user1"} 2000
224+
cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{user="user2"} 20000
225+
cortex_prometheus_rule_group_last_evaluation_timestamp_seconds{user="user3"} 200000
226+
# HELP cortex_prometheus_rule_group_rules The number of rules.
227+
# TYPE cortex_prometheus_rule_group_rules gauge
228+
cortex_prometheus_rule_group_rules{user="user1"} 2000
229+
cortex_prometheus_rule_group_rules{user="user2"} 20000
230+
cortex_prometheus_rule_group_rules{user="user3"} 200000
231+
`))
232+
require.NoError(t, err)
233+
}
234+
137235
func populateManager(base float64) *prometheus.Registry {
138236
r := prometheus.NewRegistry()
139237

@@ -265,7 +363,7 @@ func newGroupMetrics(r prometheus.Registerer) *groupMetrics {
265363
func TestMetricsArePerUser(t *testing.T) {
266364
mainReg := prometheus.NewPedanticRegistry()
267365

268-
managerMetrics := NewManagerMetrics()
366+
managerMetrics := NewManagerMetrics(true)
269367
mainReg.MustRegister(managerMetrics)
270368
managerMetrics.AddUserRegistry("user1", populateManager(1))
271369
managerMetrics.AddUserRegistry("user2", populateManager(10))

pkg/ruler/ruler.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,8 @@ type Config struct {
115115

116116
RingCheckPeriod time.Duration `yaml:"-"`
117117

118-
EnableQueryStats bool `yaml:"query_stats_enabled"`
118+
EnableQueryStats bool `yaml:"query_stats_enabled"`
119+
DisableRuleGroupLabel bool `yaml:"disable_rule_group_label"`
119120
}
120121

121122
// Validate config and returns error on failure
@@ -179,6 +180,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
179180
f.Var(&cfg.DisabledTenants, "ruler.disabled-tenants", "Comma separated list of tenants whose rules this ruler cannot evaluate. If specified, a ruler that would normally pick the specified tenant(s) for processing will ignore them instead. Subject to sharding.")
180181

181182
f.BoolVar(&cfg.EnableQueryStats, "ruler.query-stats-enabled", false, "Report the wall time for ruler queries to complete as a per user metric and as an info level log message.")
183+
f.BoolVar(&cfg.DisableRuleGroupLabel, "ruler.disable-rule-group-label", false, "Disable the rule_group label on exported metrics")
182184

183185
cfg.RingCheckPeriod = 5 * time.Second
184186
}

0 commit comments

Comments
 (0)