Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
groups+: [
new(this): {
groups: [
{
name: 'aerospike',
name: 'aerospike.rules',
rules: [
{
alert: 'AerospikeNodeHighMemoryUsage',
expr: |||
100 - sum without (service) (aerospike_node_stats_system_free_mem_pct) >= %(alertsCriticalNodeHighMemoryUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -19,14 +19,14 @@
(
'{{ printf "%%.0f" $value }} percent of system memory used on node {{$labels.instance}} on cluster {{$labels.aerospike_cluster}}, ' +
'which is above the threshold of %(alertsCriticalNodeHighMemoryUsage)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'AerospikeNamespaceHighDiskUsage',
expr: |||
100 - sum without (service) (aerospike_namespace_device_free_pct) >= %(alertsCriticalNamespaceHighDiskUsage)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -37,14 +37,14 @@
(
'{{ printf "%%.0f" $value }} percent of disk space available for namespace {{$labels.ns}} on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, ' +
'which is above the threshold of %(alertsCriticalNamespaceHighDiskUsage)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'AerospikeUnavailablePartitions',
expr: |||
sum without(service) (aerospike_namespace_unavailable_partitions) > %(alertsCriticalUnavailablePartitions)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -55,14 +55,14 @@
(
'{{ printf "%%.0f" $value }} unavailable partition(s) in namespace {{$labels.ns}}, on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, ' +
'which is above the threshold of %(alertsCriticalUnavailablePartitions)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'AerospikeDeadPartitions',
expr: |||
sum without(service) (aerospike_namespace_dead_partitions) > %(alertsCriticalDeadPartitions)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -73,14 +73,14 @@
(
'{{ printf "%%.0f" $value }} dead partition(s) in namespace {{$labels.ns}}, on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, ' +
'which is above the threshold of %(alertsCriticalDeadPartitions)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'AerospikeNamespaceRejectingWrites',
expr: |||
sum without(service) (aerospike_namespace_stop_writes + aerospike_namespace_clock_skew_stop_writes) > %(alertsCriticalSystemRejectingWrites)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -90,14 +90,14 @@
description:
(
'Namespace {{$labels.ns}} on node {{$labels.instance}} on cluster {{$labels.aerospike_cluster}} is currently rejecting all client-originated writes.'
) % $._config,
) % this.config,
},
},
{
alert: 'AerospikeHighClientReadErrorRate',
expr: |||
sum without(service) (rate(aerospike_namespace_client_read_error[5m])) / (clamp_min(sum without(service) (rate(aerospike_namespace_client_read_error[5m])) + sum without(service) (rate(aerospike_namespace_client_read_success[5m])), 1)) > %(alertsWarningHighClientReadErrorRate)s
||| % $._config,
sum by(instance, aerospike_cluster, ns) (rate(aerospike_namespace_client_read_error[5m])) / (clamp_min(sum by(instance, aerospike_cluster, ns) (rate(aerospike_namespace_client_read_error[5m])) + sum by(instance, aerospike_cluster, ns) (rate(aerospike_namespace_client_read_success[5m])), 1)) > %(alertsWarningHighClientReadErrorRate)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -108,14 +108,14 @@
(
'{{ printf "%%.0f" $value }} percent of client read transactions are resulting in errors for namespace {{$labels.ns}}, on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, ' +
'which is above the threshold of %(alertsWarningHighClientReadErrorRate)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'AerospikeHighClientWriteErrorRate',
expr: |||
sum without(service) (rate(aerospike_namespace_client_write_error[5m])) / (clamp_min(sum without(service) (rate(aerospike_namespace_client_write_error[5m])) + sum without(service) (rate(aerospike_namespace_client_write_success[5m])), 1)) > %(alertsWarningHighClientWriteErrorRate)s
||| % $._config,
sum by(instance, aerospike_cluster, ns) (rate(aerospike_namespace_client_write_error[5m])) / (clamp_min(sum by(instance, aerospike_cluster, ns) (rate(aerospike_namespace_client_write_error[5m])) + sum by(instance, aerospike_cluster, ns) (rate(aerospike_namespace_client_write_success[5m])), 1)) > %(alertsWarningHighClientWriteErrorRate)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -126,14 +126,14 @@
(
'{{ printf "%%.0f" $value }} percent of client write transactions are resulting in errors for namespace {{$labels.ns}}, on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, ' +
'which is above the threshold of %(alertsWarningHighClientWriteErrorRate)s.'
) % $._config,
) % this.config,
},
},
{
alert: 'AerospikeHighClientUDFErrorRate',
expr: |||
sum without(service) (rate(aerospike_namespace_client_udf_error[5m])) / (clamp_min(sum without(service) (rate(aerospike_namespace_client_udf_error[5m])) + sum without(service) (rate(aerospike_namespace_client_udf_complete[5m])), 1)) > %(alertsWarningHighClientUDFErrorRate)s
||| % $._config,
sum by(instance, aerospike_cluster, ns) (rate(aerospike_namespace_client_udf_error[5m])) / (clamp_min(sum by(instance, aerospike_cluster, ns) (rate(aerospike_namespace_client_udf_error[5m])) + sum by(instance, aerospike_cluster, ns) (rate(aerospike_namespace_client_udf_complete[5m])), 1)) > %(alertsWarningHighClientUDFErrorRate)s
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -144,7 +144,7 @@
(
'{{ printf "%%.0f" $value }} percent of client UDF transactions are resulting in errors for namespace {{$labels.ns}}, on node {{$labels.instance}}, on cluster {{$labels.aerospike_cluster}}, ' +
'which is above the threshold of %(alertsWarningHighClientUDFErrorRate)s.'
) % $._config,
) % this.config,
},
},
],
Expand Down
54 changes: 35 additions & 19 deletions aerospike-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,25 +1,41 @@
{
_config+:: {
enableMultiCluster: false,
aerospikeSelector: if self.enableMultiCluster then 'job=~"$job", cluster=~"$cluster"' else 'job=~"$job"',
multiclusterSelector: 'job=~"$job"',
filterSelector: 'job=~"integrations/aerospike"',
local this = self,
filteringSelector: 'job="integrations/aerospike"',
groupLabels: ['job', 'aerospike_cluster', 'cluster'],
logLabels: ['job', 'cluster', 'instance'],
instanceLabels: ['instance', 'ns'], // ns == namespace

dashboardTags: ['aerospike-mixin'],
dashboardPeriod: 'now-30m',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
dashboardTags: [self.uid],
uid: 'aerospike',
dashboardNamePrefix: 'Aerospike',
dashboardPeriod: 'now-30m',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
metricsSource: [
'prometheusAerospike7', // For queries that are required for Aerospike 7.0+ with metric changes
'prometheus', // For Aerospike < 7.0
],

// alerts thresholds
alertsCriticalNodeHighMemoryUsage: 80, // %
alertsCriticalNamespaceHighDiskUsage: 80, // %
alertsCriticalUnavailablePartitions: 0, // count
alertsCriticalDeadPartitions: 0, // count
alertsCriticalSystemRejectingWrites: 0, // count
alertsWarningHighClientReadErrorRate: 25, // %
alertsWarningHighClientWriteErrorRate: 25, // %
alertsWarningHighClientUDFErrorRate: 25, // %
// Logging configuration
enableLokiLogs: true,
extraLogLabels: ['level'], // Required by logs-lib
logsVolumeGroupBy: 'level',
showLogsVolume: true,

enableLokiLogs: true,
// Alerts thresholds
alertsCriticalNodeHighMemoryUsage: 80, // %
alertsCriticalNamespaceHighDiskUsage: 80, // %
alertsCriticalUnavailablePartitions: 0, // count
alertsCriticalDeadPartitions: 0, // count
alertsCriticalSystemRejectingWrites: 0, // count
alertsWarningHighClientReadErrorRate: 25, // %
alertsWarningHighClientWriteErrorRate: 25, // %
alertsWarningHighClientUDFErrorRate: 25, // %

// Signals configuration
signals+: {
overview: (import './signals/overview.libsonnet')(this),
namespace: (import './signals/namespace.libsonnet')(this),
instance: (import './signals/instance.libsonnet')(this),
},
}
135 changes: 135 additions & 0 deletions aerospike-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
local g = import './g.libsonnet';
local commonlib = import 'common-lib/common/main.libsonnet';
local logslib = import 'logs-lib/logs/main.libsonnet';

{
local root = self,
new(this)::
  // Builds the Aerospike dashboard set as an object keyed by output file name.
  // `this` is expected to expose `config` (dashboard settings) and `grafana`
  // (links, variables, annotations, rows). Three metrics dashboards are always
  // produced; the logs dashboard is appended only when config.enableLokiLogs
  // is true.
  local cfg = this.config;
  local grafana = this.grafana;

  local titlePrefix = cfg.dashboardNamePrefix;
  local navLinks = grafana.links;
  local dashTags = cfg.dashboardTags;
  local baseUid = g.util.string.slugify(cfg.uid);
  local dashVars = grafana.variables;
  local dashAnnotations = grafana.annotations;
  local refreshInterval = cfg.dashboardRefresh;
  local timeFrom = cfg.dashboardPeriod;
  local tz = cfg.dashboardTimezone;

  // Wraps a single row into the dashboard's panel list.
  local rowPanels(row) =
    g.dashboard.withPanels(
      g.util.panel.resolveCollapsedFlagOnRows(
        g.util.grid.wrapPanels([row])
      )
    );

  // Single-select custom variable `k` ("Top node count"), defaulting to '2'.
  // Only the cluster overview dashboard adds it.
  local topNodeCountVar =
    g.dashboard.variable.custom.new(
      'k',
      values=['2', '4', '6', '8', '10'],
    )
    + g.dashboard.variable.custom.generalOptions.withCurrent('2')
    + g.dashboard.variable.custom.generalOptions.withLabel('Top node count')
    + g.dashboard.variable.custom.selectionOptions.withMulti(false)
    + g.dashboard.variable.custom.selectionOptions.withIncludeAll(false);

  // Each dashboard overrides its own entry in the links object with a hidden
  // (`::`) field before handing the links to applyCommon, which reads them
  // through std.objectValues.
  local metricsDashboards = {
    'aerospike-overview.json':
      g.dashboard.new(titlePrefix + ' overview')
      + rowPanels(grafana.rows.clusterOverview)
      + root.applyCommon(
        dashVars.multiInstance + [topNodeCountVar],
        baseUid + '_overview',
        dashTags,
        navLinks { aerospikeOverview:: {} },
        dashAnnotations,
        tz,
        refreshInterval,
        timeFrom,
      ),

    'aerospike-instance-overview.json':
      g.dashboard.new(titlePrefix + ' instance overview')
      + rowPanels(grafana.rows.instanceOverview)
      + root.applyCommon(
        dashVars.multiInstance,
        baseUid + '_instance_overview',
        dashTags,
        navLinks { aerospikeInstanceOverview:: {} },
        dashAnnotations,
        tz,
        refreshInterval,
        timeFrom,
      ),

    'aerospike-namespace-overview.json':
      g.dashboard.new(titlePrefix + ' namespace overview')
      + rowPanels(grafana.rows.namespaceOverview)
      + root.applyCommon(
        dashVars.multiInstance,
        baseUid + '_namespace_overview',
        dashTags,
        navLinks { aerospikeNamespaceOverview:: {} },
        dashAnnotations,
        tz,
        refreshInterval,
        timeFrom,
      ),
  };

  // Logs dashboard built with logs-lib, then customised by overriding its
  // `dashboards`, `panels` and `variables` fields; only the resulting
  // `dashboards.logs` value is kept.
  local logsDashboards =
    if cfg.enableLokiLogs then
      {
        'aerospike-logs-overview.json':
          (
            logslib.new(
              titlePrefix + ' logs',
              datasourceName=dashVars.datasources.loki.name,
              datasourceRegex=dashVars.datasources.loki.regex,
              filterSelector=cfg.filteringSelector,
              labels=cfg.groupLabels + cfg.extraLogLabels,
              formatParser=null,
              showLogsVolume=cfg.showLogsVolume,
            )
            {
              dashboards+: {
                // `super` here is the `dashboards` object produced by logs-lib,
                // so the common settings reuse the variables it generated.
                logs+:
                  root.applyCommon(
                    super.logs.templating.list,
                    uid=baseUid + '-logs',
                    tags=dashTags,
                    links=navLinks { logs+:: {} },
                    annotations=dashAnnotations,
                    timezone=tz,
                    refresh=refreshInterval,
                    period=timeFrom,
                  ),
              },
              panels+: {
                logs+:
                  g.panel.logs.options.withEnableLogDetails(true)
                  + g.panel.logs.options.withShowTime(false)
                  + g.panel.logs.options.withWrapLogMessage(false),
              },
              variables+: {
                // NOTE(review): `hide: 2` presumably hides the Prometheus
                // datasource variable in the UI — confirm against Grafana docs.
                toArray+: [
                  dashVars.datasources.prometheus { hide: 2 },
                ],
              },
            }
          ).dashboards.logs,
      }
    else
      {};

  metricsDashboards + logsDashboards,


applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
  // Returns the dashboard mixin shared by every dashboard in this file.
  //   vars        - list of template variables for the dashboard
  //   uid, tags   - dashboard identity
  //   links       - object of links; only its visible field values are used
  //   annotations - object of annotations; only its visible field values are used
  //   timezone, refresh, period - time settings (`period` is the "from" bound)
  local identity =
    g.dashboard.withTags(tags)
    + g.dashboard.withUid(uid);

  // std.objectValues skips hidden fields, which is how callers drop a
  // dashboard's own link by overriding it with `::`.
  local navigation =
    g.dashboard.withLinks(std.objectValues(links));

  local timeSettings =
    g.dashboard.withTimezone(timezone)
    + g.dashboard.withRefresh(refresh)
    + g.dashboard.time.withFrom(period);

  local templating =
    g.dashboard.withVariables(vars)
    + g.dashboard.withAnnotations(std.objectValues(annotations));

  identity + navigation + timeSettings + templating,
}
Loading
Loading