Skip to content
Merged
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
new(this): {
groups+: [
{
name: 'ApacheTomcatAlerts',
rules: [
{
alert: 'ApacheTomcatAlertsHighCpuUsage',
expr: |||
sum by (%(agg)s) (jvm_process_cpu_load{%(filteringSelector)s}) > %(ApacheTomcatAlertsCriticalCpuUsage)s
||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) },
sum by (%(agg)s) (jvm_process_cpu_load{%(filteringSelector)s}) > %(alertsCriticalCpuUsage)s
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -18,15 +18,15 @@
description:
(
'The CPU usage has been at {{ printf "%%.0f" $value }} percent over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(ApacheTomcatAlertsCriticalCpuUsage)s percent.'
) % $._config,
'which is above the threshold of %(alertsCriticalCpuUsage)s percent.'
) % this.config,
},
},
{
alert: 'ApacheTomcatAlertsHighMemoryUsage',
expr: |||
sum(jvm_memory_usage_used_bytes{%(filteringSelector)s}) by (%(agg)s) / sum(jvm_physical_memory_bytes{%(filteringSelector)s}) by (%(agg)s) * 100 > %(ApacheTomcatAlertsCriticalMemoryUsage)s
||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) },
sum(jvm_memory_usage_used_bytes{%(filteringSelector)s}) by (%(agg)s) / sum(jvm_physical_memory_bytes{%(filteringSelector)s}) by (%(agg)s) * 100 > %(alertsCriticalMemoryUsage)s
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -36,15 +36,15 @@
description:
(
'The memory usage has been at {{ printf "%%.0f" $value }} percent over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(ApacheTomcatAlertsCriticalMemoryUsage)s percent.'
) % $._config,
'which is above the threshold of %(alertsCriticalMemoryUsage)s percent.'
) % this.config,
},
},
{
alert: 'ApacheTomcatAlertsHighRequestErrorPercent',
alert: 'ApacheTomcatAlertsRequestErrors',
expr: |||
sum by (%(agg)s) (increase(tomcat_errorcount_total{%(filteringSelector)s}[5m]) / increase(tomcat_requestcount_total{%(filteringSelector)s}[5m]) * 100) > %(ApacheTomcatAlertsCriticalRequestErrorPercentage)s
||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) },
sum by (%(agg)s) (increase(tomcat_errorcount_total{%(filteringSelector)s}[5m]) / increase(tomcat_requestcount_total{%(filteringSelector)s}[5m]) * 100) > %(alertsCriticalRequestErrorPercentage)s
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -54,15 +54,15 @@
description:
(
'The percentage of request errors has been at {{ printf "%%.0f" $value }} percent over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(ApacheTomcatAlertsCriticalRequestErrorPercentage)s percent.'
) % $._config,
'which is above the threshold of %(alertsCriticalRequestErrorPercentage)s percent.'
) % this.config,
},
},
{
alert: 'ApacheTomcatAlertsModeratelyHighProcessingTime',
alert: 'ApacheTomcatAlertsHighProcessingTime',
expr: |||
sum by (%(agg)s) (increase(tomcat_processingtime_total{%(filteringSelector)s}[5m]) / increase(tomcat_requestcount_total{%(filteringSelector)s}[5m])) > %(ApacheTomcatAlertsWarningProcessingTime)s
||| % $._config { agg: std.join(',', $._config.groupLabels + $._config.instanceLabels) },
sum by (%(agg)s) (increase(tomcat_processingtime_total{%(filteringSelector)s}[5m]) / increase(tomcat_requestcount_total{%(filteringSelector)s}[5m])) > %(alertsWarningProcessingTime)s
||| % this.config { agg: std.join(',', this.config.groupLabels + this.config.instanceLabels) },
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -72,8 +72,8 @@
description:
(
'The processing time has been at {{ printf "%%.0f" $value }}ms over the last 5 minutes on {{$labels.instance}}, ' +
'which is above the threshold of %(ApacheTomcatAlertsWarningProcessingTime)sms.'
) % $._config,
'which is above the threshold of %(alertsWarningProcessingTime)sms.'
) % this.config,
},
},
],
Expand Down
45 changes: 27 additions & 18 deletions apache-tomcat-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,24 +1,33 @@
{
  // Mixin-wide configuration, consumed by dashboards, alerts, and signals.
  local this = self,
  uid: 'apache-tomcat',
  dashboardTags: [self.uid + '-mixin'],
  dashboardNamePrefix: 'Apache Tomcat',
  dashboardPeriod: 'now-1h',
  dashboardTimezone: 'default',
  dashboardRefresh: '1m',
  metricsSource: ['prometheus'],  // metrics source for signals

  // Used in alerts and dashboard queries:
  filteringSelector: 'job="integrations/tomcat"',
  // Include the cluster label only when multi-cluster support is enabled.
  groupLabels: if self.enableMultiCluster then ['job', 'cluster'] else ['job'],
  instanceLabels: ['instance'],

  enableMultiCluster: false,

  // Logging configuration
  enableLokiLogs: true,
  extraLogLabels: ['level'],  // Required by logs-lib
  logsVolumeGroupBy: 'level',
  showLogsVolume: true,

  // alert thresholds
  alertsCriticalCpuUsage: 80,  //%
  alertsCriticalMemoryUsage: 80,  //%
  alertsCriticalRequestErrorPercentage: 5,  //%
  alertsWarningProcessingTime: 300,  //ms

  // Signal definitions consumed by the dashboards (see ./signals/).
  signals+: {
    overview: (import './signals/overview.libsonnet')(this),
    hosts: (import './signals/hosts.libsonnet')(this),
  },
}
125 changes: 125 additions & 0 deletions apache-tomcat-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
local g = import './g.libsonnet';
local commonlib = import 'common-lib/common/main.libsonnet';
local logslib = import 'logs-lib/logs/main.libsonnet';

{
  local root = self,

  // Builds the mixin's dashboards (overview, hosts, and — when Loki logs are
  // enabled — logs) from `this`, the top-level mixin object carrying
  // `this.config` and `this.grafana` (rows, links, variables, annotations).
  new(this)::
    local prefix = this.config.dashboardNamePrefix;
    local links = this.grafana.links;
    local tags = this.config.dashboardTags;
    local uid = g.util.string.slugify(this.config.uid);
    local vars = this.grafana.variables;
    local annotations = this.grafana.annotations;
    local refresh = this.config.dashboardRefresh;
    local period = this.config.dashboardPeriod;
    local timezone = this.config.dashboardTimezone;
    {
      'apache-tomcat-overview.json':
        g.dashboard.new(prefix + ' overview')
        + g.dashboard.withPanels(
          g.util.panel.resolveCollapsedFlagOnRows(
            g.util.grid.wrapPanels(
              [
                this.grafana.rows.overview,
              ],
            ),
          ),
        ) + root.applyCommon(
          // Dashboard-local template variables on top of the shared ones.
          vars.multiInstance + [
            g.dashboard.variable.query.new('protocol', query='label_values(tomcat_bytesreceived_total{%(queriesSelector)s}, protocol)' % vars)
            + g.dashboard.variable.custom.selectionOptions.withMulti(true)
            + g.dashboard.variable.query.queryTypes.withLabelValues(label='protocol', metric='tomcat_bytesreceived_total{%(queriesSelector)s}' % vars)
            + g.dashboard.variable.query.withDatasourceFromVariable(variable=vars.datasources.prometheus),

            g.dashboard.variable.query.new('port', query='label_values(tomcat_bytesreceived_total{%(queriesSelector)s}, port)' % vars)
            + g.dashboard.variable.custom.selectionOptions.withMulti(true)
            + g.dashboard.variable.query.queryTypes.withLabelValues(label='port', metric='tomcat_bytesreceived_total{%(queriesSelector)s}' % vars)
            + g.dashboard.variable.query.withDatasourceFromVariable(variable=vars.datasources.prometheus),
          ],
          uid + '_overview',
          tags,
          // Hide the dashboard's link to itself.
          links { apacheTomcatOverview:: {} },
          annotations,
          timezone,
          refresh,
          period
        ),

      'apache-tomcat-hosts.json':
        g.dashboard.new(prefix + ' hosts')
        + g.dashboard.withPanels(
          g.util.panel.resolveCollapsedFlagOnRows(
            g.util.grid.wrapPanels(
              [
                this.grafana.rows.hosts,
                this.grafana.rows.hostServlets,
              ],
            ),
          ),
        ) + root.applyCommon(
          vars.multiInstance + [
            g.dashboard.variable.query.new('host')
            + g.dashboard.variable.custom.selectionOptions.withMulti(true)
            + g.dashboard.variable.query.queryTypes.withLabelValues(label='host', metric='tomcat_session_sessioncounter_total{%(queriesSelector)s}' % vars)
            + g.dashboard.variable.query.withDatasourceFromVariable(variable=vars.datasources.prometheus),

            g.dashboard.variable.query.new('context')
            + g.dashboard.variable.custom.selectionOptions.withMulti(true)
            + g.dashboard.variable.query.queryTypes.withLabelValues(label='context', metric='tomcat_session_sessioncounter_total{%(queriesSelector)s}' % vars)
            + g.dashboard.variable.query.withDatasourceFromVariable(variable=vars.datasources.prometheus),

            g.dashboard.variable.query.new('servlet')
            + g.dashboard.variable.custom.selectionOptions.withMulti(true)
            + g.dashboard.variable.query.queryTypes.withLabelValues(label='servlet', metric='tomcat_servlet_requestcount_total{%(queriesSelector)s}' % vars)
            + g.dashboard.variable.query.withDatasourceFromVariable(variable=vars.datasources.prometheus),
          ],
          uid + '_hosts',
          tags,
          links { apacheTomcatHosts:: {} },
          annotations,
          timezone,
          refresh,
          period
        ),
      // `else {}` is required: a jsonnet `if` without `else` yields null,
      // and `{...} + null` fails when enableLokiLogs is false.
    } + if this.config.enableLokiLogs then {
      'apache-tomcat-logs.json':
        logslib.new(
          prefix + ' logs',
          datasourceName=this.grafana.variables.datasources.loki.name,
          datasourceRegex=this.grafana.variables.datasources.loki.regex,
          filterSelector=this.config.filteringSelector,
          labels=this.config.groupLabels + this.config.extraLogLabels,
          formatParser=null,
          showLogsVolume=this.config.showLogsVolume,
        )
        {
          dashboards+:
            {
              logs+:
                root.applyCommon(super.logs.templating.list, uid=uid + '-logs', tags=tags, links=links { apacheTomcatLogs:: {} }, annotations=annotations, timezone=timezone, refresh=refresh, period=period),
            },
          panels+:
            {
              logs+:
                g.panel.logs.options.withEnableLogDetails(true)
                + g.panel.logs.options.withShowTime(false)
                + g.panel.logs.options.withWrapLogMessage(false),
            },
          variables+: {
            toArray+: [
              // Keep the prometheus datasource variable present but hidden.
              this.grafana.variables.datasources.prometheus { hide: 2 },
            ],
          },
        }.dashboards.logs,
    } else {},

  // Applies the shared dashboard settings (tags, uid, links, time window,
  // variables, annotations) to a dashboard under construction.
  applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
    g.dashboard.withTags(tags)
    + g.dashboard.withUid(uid)
    + g.dashboard.withLinks(std.objectValues(links))
    + g.dashboard.withTimezone(timezone)
    + g.dashboard.withRefresh(refresh)
    + g.dashboard.time.withFrom(period)
    + g.dashboard.withVariables(vars)
    + g.dashboard.withAnnotations(std.objectValues(annotations)),
}
Loading
Loading