Skip to content

Commit a0c465d

Browse files
authored
feat: grafana modular lib Couchbase (#1454)
* grafana modular lib Couchbase * re-add rows based off new guidance * make fmt * use signals API; wip * fix queries from last commit * clean up config object after recent changes to signals * make fmt * PR feedback: increase usage, sentence casing, using more of the groupLabels/instance labels, rename logs dashboard * make fmt
1 parent 7a55aea commit a0c465d

23 files changed

+2501
-5067
lines changed

couchbase-mixin/alerts/alerts.libsonnet renamed to couchbase-mixin/alerts.libsonnet

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
{
2-
prometheusAlerts+:: {
3-
groups+: [
2+
new(this): {
3+
groups: [
44
{
55
name: 'couchbase',
66
rules: [
77
{
88
alert: 'CouchbaseHighCPUUsage',
99
expr: |||
1010
(sys_cpu_utilization_rate) > %(alertsCriticalCPUUsage)s
11-
||| % $._config,
11+
||| % this.config,
1212
'for': '5m',
1313
labels: {
1414
severity: 'critical',
@@ -19,14 +19,14 @@
1919
(
2020
'{{ printf "%%.0f" $value }} percent CPU usage on node {{$labels.instance}} and on cluster {{$labels.couchbase_cluster}}, ' +
2121
'which is above the threshold of %(alertsCriticalCPUUsage)s.'
22-
) % $._config,
22+
) % this.config,
2323
},
2424
},
2525
{
2626
alert: 'CouchbaseHighMemoryUsage',
2727
expr: |||
2828
100 * (sys_mem_actual_used / clamp_min(sys_mem_actual_used + sys_mem_actual_free, 1)) > %(alertsCriticalMemoryUsage)s
29-
||| % $._config,
29+
||| % this.config,
3030
'for': '5m',
3131
labels: {
3232
severity: 'critical',
@@ -37,14 +37,14 @@
3737
(
3838
'{{ printf "%%.0f" $value }} percent memory usage on node {{$labels.instance}} and on cluster {{$labels.couchbase_cluster}}, ' +
3939
'which is above the threshold of %(alertsCriticalMemoryUsage)s.'
40-
) % $._config,
40+
) % this.config,
4141
},
4242
},
4343
{
4444
alert: 'CouchbaseMemoryEvictionRate',
4545
expr: |||
4646
(kv_ep_num_value_ejects) > %(alertsWarningMemoryEvictionRate)s
47-
||| % $._config,
47+
||| % this.config,
4848
'for': '5m',
4949
labels: {
5050
severity: 'warning',
@@ -55,14 +55,14 @@
5555
(
5656
'{{ printf "%%.0f" $value }} evictions in bucket {{$labels.bucket}}, on node {{$labels.instance}}, and on cluster {{$labels.couchbase_cluster}}, ' +
5757
'which is above the threshold of %(alertsWarningMemoryEvictionRate)s.'
58-
) % $._config,
58+
) % this.config,
5959
},
6060
},
6161
{
6262
alert: 'CouchbaseInvalidRequestVolume',
6363
expr: |||
6464
sum without(instance, job) (rate(n1ql_invalid_requests[2m])) > %(alertsWarningInvalidRequestVolume)s
65-
||| % $._config,
65+
||| % this.config,
6666
'for': '2m',
6767
labels: {
6868
severity: 'warning',
@@ -73,7 +73,7 @@
7373
(
7474
'{{ printf "%%.0f" $value }} invalid requests to {{$labels.couchbase_cluster}}, ' +
7575
'which is above the threshold of %(alertsWarningInvalidRequestVolume)s.'
76-
) % $._config,
76+
) % this.config,
7777
},
7878
},
7979
],

couchbase-mixin/config.libsonnet

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,40 @@
11
{
2-
_config+:: {
3-
enableMultiCluster: false,
4-
couchbaseSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
5-
multiclusterSelector: 'job=~"$job"',
6-
dashboardTags: ['couchbase-mixin'],
7-
dashboardPeriod: 'now-1h',
8-
dashboardTimezone: 'default',
9-
dashboardRefresh: '1m',
2+
local this = self,
3+
enableMultiCluster: false,
4+
filteringSelector: 'job=~"integrations/couchbase"',
5+
groupLabels: if self.enableMultiCluster then ['job', 'cluster', 'couchbase_cluster'] else ['job', 'couchbase_cluster'],
6+
instanceLabels: ['instance'],
7+
dashboardTags: ['couchbase-mixin'],
8+
uid: 'couchbase',
9+
dashboardNamePrefix: 'Couchbase',
1010

11-
// alerts thresholds
12-
alertsCriticalCPUUsage: 85, // %
13-
alertsCriticalMemoryUsage: 85, // %
14-
alertsWarningMemoryEvictionRate: 10, // count
15-
alertsWarningInvalidRequestVolume: 1000, // count
1611

17-
enableLokiLogs: true,
12+
// additional params
13+
dashboardPeriod: 'now-1h',
14+
dashboardTimezone: 'default',
15+
dashboardRefresh: '1m',
16+
17+
// logs lib related
18+
enableLokiLogs: true,
19+
logLabels: if self.enableMultiCluster then ['job', 'instance', 'cluster', 'level'] else ['job', 'instance', 'level'],
20+
extraLogLabels: [], // Required by logs-lib
21+
logsVolumeGroupBy: 'level',
22+
showLogsVolume: true,
23+
24+
// alerts thresholds
25+
alertsCriticalCPUUsage: 85, // %
26+
alertsCriticalMemoryUsage: 85, // %
27+
alertsWarningMemoryEvictionRate: 10, // count
28+
alertsWarningInvalidRequestVolume: 1000, // count
29+
30+
// metrics source for signals library
31+
metricsSource: 'prometheus',
32+
33+
signals+: {
34+
cluster: (import './signals/cluster.libsonnet')(this),
35+
node: (import './signals/node.libsonnet')(this),
36+
query: (import './signals/query.libsonnet')(this),
37+
bucket: (import './signals/bucket.libsonnet')(this),
38+
index: (import './signals/index.libsonnet')(this),
1839
},
1940
}
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
local g = import './g.libsonnet';
2+
local logslib = import 'logs-lib/logs/main.libsonnet';
3+
{
4+
local root = self,
5+
new(this)::
6+
local prefix = this.config.dashboardNamePrefix;
7+
local links = this.grafana.links;
8+
local tags = this.config.dashboardTags;
9+
local uid = g.util.string.slugify(this.config.uid);
10+
local vars = this.grafana.variables;
11+
local annotations = this.grafana.annotations;
12+
local refresh = this.config.dashboardRefresh;
13+
local period = this.config.dashboardPeriod;
14+
local timezone = this.config.dashboardTimezone;
15+
local panels = this.grafana.panels;
16+
17+
{
18+
'couchbase-bucket-overview.json':
19+
g.dashboard.new(prefix + ' bucket overview')
20+
+ g.dashboard.withPanels(
21+
g.util.grid.wrapPanels(
22+
[
23+
panels.bucket_topBucketsByMemoryUsedPanel { gridPos+: { w: 12 } },
24+
panels.bucket_topBucketsByDiskUsedPanel { gridPos+: { w: 12 } },
25+
panels.bucket_topBucketsByCurrentItemsPanel { gridPos+: { w: 8 } },
26+
panels.bucket_topBucketsByOperationsPanel { gridPos+: { w: 8 } },
27+
panels.bucket_topBucketsByOperationsFailedPanel { gridPos+: { w: 8 } },
28+
panels.bucket_topBucketsByHighPriorityRequestsPanel { gridPos+: { w: 12 } },
29+
panels.bucket_bottomBucketsByCacheHitRatioPanel { gridPos+: { w: 12 } },
30+
panels.bucket_topBucketsByVBucketsCountPanel { gridPos+: { w: 12 } },
31+
panels.bucket_topBucketsByVBucketQueueMemoryPanel { gridPos+: { w: 12 } },
32+
],
33+
)
34+
)
35+
+ root.applyCommon(
36+
vars.multiInstance,
37+
uid + '_couchbase_bucket_overview',
38+
tags,
39+
links { couchbaseBucketOverview+:: {} },
40+
annotations,
41+
timezone,
42+
refresh,
43+
period
44+
),
45+
46+
'couchbase-node-overview.json':
47+
g.dashboard.new(prefix + ' node overview')
48+
+ g.dashboard.withPanels(
49+
g.util.panel.resolveCollapsedFlagOnRows(
50+
g.util.grid.wrapPanels(
51+
[
52+
panels.node_memoryUtilizationPanel { gridPos+: { w: 12 } },
53+
panels.node_cpuUtilizationPanel { gridPos+: { w: 12 } },
54+
panels.node_totalMemoryUsedByServicePanel { gridPos+: { w: 8 } },
55+
panels.node_backupSizePanel { gridPos+: { w: 8 } },
56+
panels.node_currentConnectionsPanel { gridPos+: { w: 8 } },
57+
panels.node_httpResponseCodesPanel { gridPos+: { w: 12 } },
58+
panels.node_httpRequestMethodsPanel { gridPos+: { w: 12 } },
59+
panels.node_queryServiceRequestsPanel { gridPos+: { w: 12 } },
60+
panels.node_queryServiceRequestProcessingTimePanel { gridPos+: { w: 12 } },
61+
panels.node_indexServiceRequestsPanel { gridPos+: { w: 8 } },
62+
panels.node_indexCacheHitRatioPanel { gridPos+: { w: 8 } },
63+
panels.node_averageScanLatencyPanel { gridPos+: { w: 8 } },
64+
]
65+
)
66+
)
67+
)
68+
+ root.applyCommon(
69+
vars.multiInstance,
70+
uid + '_couchbase_node_overview',
71+
tags,
72+
links { couchbaseNodeOverview+:: {} },
73+
annotations,
74+
timezone,
75+
refresh,
76+
period
77+
),
78+
79+
'couchbase-cluster-overview.json':
80+
g.dashboard.new(prefix + ' cluster overview')
81+
+ g.dashboard.withPanels(
82+
g.util.panel.resolveCollapsedFlagOnRows(
83+
g.util.grid.wrapPanels(
84+
[
85+
panels.cluster_topNodesByMemoryUsagePanel { gridPos+: { w: 12 } },
86+
panels.cluster_topNodesByHTTPRequestsPanel { gridPos+: { w: 12 } },
87+
panels.cluster_topNodesByQueryServiceRequestsPanel { gridPos+: { w: 12 } },
88+
panels.cluster_topNodesByIndexAverageScanLatencyPanel { gridPos+: { w: 12 } },
89+
panels.cluster_xdcrReplicationRatePanel { gridPos+: { w: 8 } },
90+
panels.cluster_xdcrDocsReceivedPanel { gridPos+: { w: 8 } },
91+
panels.cluster_localBackupSizePanel { gridPos+: { w: 8 } },
92+
] + this.grafana.rows.clusterOverviewBucket,
93+
)
94+
)
95+
)
96+
+ root.applyCommon(
97+
vars.multiInstance,
98+
uid + '_couchbase_cluster_overview',
99+
tags,
100+
links { couchbaseClusterOverview+:: {} },
101+
annotations,
102+
timezone,
103+
refresh,
104+
period
105+
),
106+
107+
}
108+
+
109+
if this.config.enableLokiLogs then
110+
{
111+
'couchbase-logs.json':
112+
logslib.new(
113+
prefix + ' logs',
114+
datasourceName=this.grafana.variables.datasources.loki.name,
115+
datasourceRegex=this.grafana.variables.datasources.loki.regex,
116+
filterSelector=this.config.filteringSelector,
117+
labels=this.config.groupLabels + this.config.extraLogLabels,
118+
formatParser=null,
119+
showLogsVolume=this.config.showLogsVolume,
120+
)
121+
{
122+
dashboards+:
123+
{
124+
logs+:
125+
// reference to self, already generated variables, to keep them, but apply other common data in applyCommon
126+
root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { logs+:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period),
127+
},
128+
panels+:
129+
{
130+
// modify log panel
131+
logs+:
132+
g.panel.logs.options.withEnableLogDetails(true)
133+
+ g.panel.logs.options.withShowTime(false)
134+
+ g.panel.logs.options.withWrapLogMessage(false),
135+
},
136+
variables+: {
137+
// add prometheus datasource for annotations processing
138+
toArray+: [
139+
this.grafana.variables.datasources.prometheus { hide: 2 },
140+
],
141+
},
142+
}.dashboards.logs,
143+
}
144+
else {},
145+
146+
applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
147+
g.dashboard.withTags(tags)
148+
+ g.dashboard.withUid(uid)
149+
+ g.dashboard.withLinks(std.objectValues(links))
150+
+ g.dashboard.withTimezone(timezone)
151+
+ g.dashboard.withRefresh(refresh)
152+
+ g.dashboard.time.withFrom(period)
153+
+ g.dashboard.withVariables(vars)
154+
+ g.dashboard.withAnnotations(std.objectValues(annotations)),
155+
}

0 commit comments

Comments
 (0)