From 8d6b6f71a6fa2501fb76ee72b650168b2549dc53 Mon Sep 17 00:00:00 2001 From: Florence Morris Date: Wed, 29 Oct 2025 15:08:26 -0400 Subject: [PATCH] In available-metrics-in-metrics-list.csv, added 8 sql.routine.* metrics. Updated metrics.yaml, ran `./cockroach gen metric-list > metrics.yaml` with v25.4.0-rc.1 binary. In v25.3/essential-metrics.md and v25.4/essential-metrics.md, fixed bug where it mentioned self-hosted on Advanced page. In v25.4/essential-metrics.md, added sections for Physical Replication and Logical Replication. --- .../available-metrics-in-metrics-list.csv | 8 + src/current/_data/v25.4/metrics/metrics.yaml | 1630 +++++++++++------ .../_includes/v25.3/essential-metrics.md | 6 +- .../_includes/v25.4/essential-metrics.md | 14 +- 4 files changed, 1047 insertions(+), 611 deletions(-) diff --git a/src/current/_data/v25.4/metrics/available-metrics-in-metrics-list.csv b/src/current/_data/v25.4/metrics/available-metrics-in-metrics-list.csv index c9f2c0e5c52..9d3adad338f 100644 --- a/src/current/_data/v25.4/metrics/available-metrics-in-metrics-list.csv +++ b/src/current/_data/v25.4/metrics/available-metrics-in-metrics-list.csv @@ -483,3 +483,11 @@ rebalancing.range.rebalances rebalancing.replicas.cpunanospersecond rebalancing.replicas.queriespersecond rebalancing.state.imbalanced_overfull_options_exhausted +sql.routine.delete.count +sql.routine.delete.started.count +sql.routine.insert.count +sql.routine.insert.started.count +sql.routine.select.count +sql.routine.select.started.count +sql.routine.update.count +sql.routine.update.started.count diff --git a/src/current/_data/v25.4/metrics/metrics.yaml b/src/current/_data/v25.4/metrics/metrics.yaml index 7fe4416b4ae..fb0a9bb3ae6 100644 --- a/src/current/_data/v25.4/metrics/metrics.yaml +++ b/src/current/_data/v25.4/metrics/metrics.yaml @@ -85,6 +85,28 @@ layers: derivative: NONE how_to_use: Changefeeds use protected timestamps to protect the data from being garbage collected. 
Ensure the protected timestamp age does not significantly exceed the GC TTL zone configuration. Alert on this metric if the protected timestamp age is greater than 3 times the GC TTL. essential: true + - name: CROSS_CLUSTER_REPLICATION + metrics: + - name: physical_replication.logical_bytes + exported_name: physical_replication_logical_bytes + description: Logical bytes (sum of keys + values) ingested by all replication jobs + y_axis_label: Bytes + type: COUNTER + unit: BYTES + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: Track PCR throughput + essential: true + - name: physical_replication.replicated_time_seconds + exported_name: physical_replication_replicated_time_seconds + description: The replicated time of the physical replication stream in seconds since the unix epoch. + y_axis_label: Seconds + type: GAUGE + unit: SECONDS + aggregation: AVG + derivative: NONE + how_to_use: Track replication lag via current time - physical_replication.replicated_time_seconds + essential: true - name: DISTRIBUTED metrics: - name: distsender.errors.notleaseholder @@ -107,6 +129,58 @@ layers: derivative: NON_NEGATIVE_DERIVATIVE how_to_use: RPC errors do not necessarily indicate a problem. This metric tracks remote procedure calls that return a status value other than "success". A non-success status of an RPC should not be misconstrued as a network transport issue. It is database code logic executed on another cluster node. The non-success status is a result of an orderly execution of an RPC that reports a specific logical condition. essential: true + - name: LOGICAL_DATA_REPLICATION + metrics: + - name: logical_replication.commit_latency + exported_name: logical_replication_commit_latency + description: 'Event commit latency: a difference between event MVCC timestamp and the time it was flushed into disk. 
If we batch events, then the difference between the oldest event in the batch and flush is recorded' + y_axis_label: Nanoseconds + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE + how_to_use: track the latency of of applying events from source to destination + essential: true + - name: logical_replication.events_dlqed + exported_name: logical_replication_events_dlqed + description: Row update events sent to DLQ + y_axis_label: Failures + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: track events sent to the dead letter queue + essential: true + - name: logical_replication.events_ingested + exported_name: logical_replication_events_ingested + description: Events ingested by all replication jobs + y_axis_label: Events + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: track events (e.g. updates, deletes, inserts) ingested + essential: true + - name: logical_replication.logical_bytes + exported_name: logical_replication_logical_bytes + description: Logical bytes (sum of keys + values) received by all replication jobs + y_axis_label: Bytes + type: COUNTER + unit: BYTES + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: track logical data replication throughput + essential: true + - name: logical_replication.replicated_time_seconds + exported_name: logical_replication_replicated_time_seconds + description: The replicated time of the logical replication stream in seconds since the unix epoch. + y_axis_label: Seconds + type: GAUGE + unit: SECONDS + aggregation: AVG + derivative: NONE + how_to_use: Track replication lag via current time - logical_replication.replicated_time_seconds + essential: true - name: NETWORKING metrics: - name: clock-offset.meannanos @@ -124,6 +198,10 @@ layers: description: | Sum of exponentially weighted moving average of round-trip latencies, as measured through a gRPC RPC. 
+ Since this metric is based on gRPC RPCs, it is affected by application-level + processing delays and CPU overload effects. See rpc.connection.tcp_rtt for a + metric that is obtained from the kernel's TCP stack. + Dividing this Gauge by rpc.connection.healthy gives an approximation of average latency, but the top-level round-trip-latency histogram is more useful. Instead, users should consult the label families of this metric if they are available @@ -191,6 +269,40 @@ layers: derivative: NON_NEGATIVE_DERIVATIVE how_to_use: See Description. essential: true + - name: rpc.connection.tcp_rtt + exported_name: rpc_connection_tcp_rtt + description: | + Kernel-level TCP round-trip time as measured by the Linux TCP stack. + + This metric reports the smoothed round-trip time (SRTT) as maintained by the + kernel's TCP implementation. Unlike application-level RPC latency measurements, + this reflects pure network latency and is less affected by CPU overload effects. + + This metric is only available on Linux. + y_axis_label: Latency + type: GAUGE + unit: NANOSECONDS + aggregation: AVG + derivative: NONE + how_to_use: High TCP RTT values indicate network issues outside of CockroachDB that could be impacting the user's workload. + essential: true + - name: rpc.connection.tcp_rtt_var + exported_name: rpc_connection_tcp_rtt_var + description: | + Kernel-level TCP round-trip time variance as measured by the Linux TCP stack. + + This metric reports the smoothed round-trip time variance (RTTVAR) as maintained + by the kernel's TCP implementation. This measures the stability of the + connection latency. + + This metric is only available on Linux. + y_axis_label: Latency Variance + type: GAUGE + unit: NANOSECONDS + aggregation: AVG + derivative: NONE + how_to_use: High TCP RTT variance values indicate network stability issues outside of CockroachDB that could be impacting the user's workload. 
+ essential: true - name: rpc.connection.unhealthy exported_name: rpc_connection_unhealthy description: Gauge of current connections in an unhealthy state (not bidirectionally connected or heartbeating) @@ -218,6 +330,39 @@ layers: essential: true - name: SQL metrics: + - name: jobs.auto_create_partial_stats.currently_paused + exported_name: jobs_auto_create_partial_stats_currently_paused + labeled_name: 'jobs{name: auto_create_partial_stats, status: currently_paused}' + description: Number of auto_create_partial_stats jobs currently considered Paused + y_axis_label: jobs + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: NONE + how_to_use: This metric is a high-level indicator that automatically generated partial statistics jobs are paused which can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance. + essential: true + - name: jobs.auto_create_partial_stats.currently_running + exported_name: jobs_auto_create_partial_stats_currently_running + labeled_name: 'jobs{type: auto_create_partial_stats, status: currently_running}' + description: Number of auto_create_partial_stats jobs currently running in Resume or OnFailOrCancel state + y_axis_label: jobs + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: NONE + how_to_use: This metric tracks the number of active automatically generated partial statistics jobs that could also be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics. 
+ essential: true + - name: jobs.auto_create_partial_stats.resume_failed + exported_name: jobs_auto_create_partial_stats_resume_failed + labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: failed}' + description: Number of auto_create_partial_stats jobs which failed with a non-retriable error + y_axis_label: jobs + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This metric is a high-level indicator that automatically generated partial table statistics is failing. Failed statistic creation can lead to the query optimizer running with stale statistics. Stale statistics can cause suboptimal query plans to be selected leading to poor query performance. + essential: true - name: jobs.auto_create_stats.currently_paused exported_name: jobs_auto_create_stats_currently_paused labeled_name: 'jobs{name: auto_create_stats, status: currently_paused}' @@ -282,7 +427,7 @@ layers: unit: COUNT aggregation: AVG derivative: NONE - how_to_use: This metric tracks the number of active create statistics jobs that may be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics. + how_to_use: This metric tracks the number of active create statistics jobs that could also be consuming resources. Ensure that foreground SQL traffic is not impacted by correlating this metric with SQL latency and query volume metrics. 
essential: true - name: schedules.BACKUP.failed exported_name: schedules_BACKUP_failed @@ -373,6 +518,25 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.delete.started.count + exported_name: sql_delete_started_count + labeled_name: 'sql.started.count{query_type: delete}' + description: Number of SQL DELETE statements started + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ - name: sql.delete.started.count.internal + exported_name: sql_delete_started_count_internal + labeled_name: 'sql.started.count{query_type: delete, query_internal: true}' + description: Number of SQL DELETE statements started (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: sql.distsql.contended_queries.count exported_name: sql_distsql_contended_queries_count description: Number of SQL queries that experienced contention @@ -439,6 +603,25 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.insert.started.count + exported_name: sql_insert_started_count + labeled_name: 'sql.started.count{query_type: insert}' + description: Number of SQL INSERT statements started + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ - name: sql.insert.started.count.internal + exported_name: sql_insert_started_count_internal + labeled_name: 'sql.started.count{query_type: insert, query_internal: true}' + description: Number of SQL INSERT statements started (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: sql.mem.root.current exported_name: sql_mem_root_current description: Current sql statement memory usage for root @@ -459,6 +642,162 @@ layers: derivative: NON_NEGATIVE_DERIVATIVE how_to_use: The rate of this metric shows how frequently new connections are being established. This can be useful in determining if a high rate of incoming new connections is causing additional load on the server due to a misconfigured application. essential: true + - name: sql.routine.delete.count + exported_name: sql_routine_delete_count + labeled_name: 'sql.count{query_type: routine-delete}' + description: Number of SQL DELETE statements successfully executed within routine invocation + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ essential: true + - name: sql.routine.delete.count.internal + exported_name: sql_routine_delete_count_internal + labeled_name: 'sql.count{query_type: routine-delete, query_internal: true}' + description: Number of SQL DELETE statements successfully executed within routine invocation (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.routine.delete.started.count + exported_name: sql_routine_delete_started_count + labeled_name: 'sql.started.count{query_type: routine-started-delete}' + description: Number of SQL DELETE statements started within routine invocation + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ - name: sql.routine.delete.started.count.internal + exported_name: sql_routine_delete_started_count_internal + labeled_name: 'sql.started.count{query_type: routine-started-delete, query_internal: true}' + description: Number of SQL DELETE statements started within routine invocation (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.routine.insert.count + exported_name: sql_routine_insert_count + labeled_name: 'sql.count{query_type: routine-insert}' + description: Number of SQL INSERT statements successfully executed within routine invocation + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ essential: true + - name: sql.routine.insert.count.internal + exported_name: sql_routine_insert_count_internal + labeled_name: 'sql.count{query_type: routine-insert, query_internal: true}' + description: Number of SQL INSERT statements successfully executed within routine invocation (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.routine.insert.started.count + exported_name: sql_routine_insert_started_count + labeled_name: 'sql.started.count{query_type: routine-started-insert}' + description: Number of SQL INSERT statements started within routine invocation + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ - name: sql.routine.insert.started.count.internal + exported_name: sql_routine_insert_started_count_internal + labeled_name: 'sql.started.count{query_type: routine-started-insert, query_internal: true}' + description: Number of SQL INSERT statements started within routine invocation (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.routine.select.count + exported_name: sql_routine_select_count + labeled_name: 'sql.count{query_type: routine-select}' + description: Number of SQL SELECT statements successfully executed within routine invocation + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ essential: true + - name: sql.routine.select.count.internal + exported_name: sql_routine_select_count_internal + labeled_name: 'sql.count{query_type: routine-select, query_internal: true}' + description: Number of SQL SELECT statements successfully executed within routine invocation (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.routine.select.started.count + exported_name: sql_routine_select_started_count + labeled_name: 'sql.started.count{query_type: routine-started-select}' + description: Number of SQL SELECT statements started within routine invocation + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ - name: sql.routine.select.started.count.internal + exported_name: sql_routine_select_started_count_internal + labeled_name: 'sql.started.count{query_type: routine-started-select, query_internal: true}' + description: Number of SQL SELECT statements started within routine invocation (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.routine.update.count + exported_name: sql_routine_update_count + labeled_name: 'sql.count{query_type: routine-update}' + description: Number of SQL UPDATE statements successfully executed within routine invocation + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ essential: true + - name: sql.routine.update.count.internal + exported_name: sql_routine_update_count_internal + labeled_name: 'sql.count{query_type: routine-update, query_internal: true}' + description: Number of SQL UPDATE statements successfully executed within routine invocation (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.routine.update.started.count + exported_name: sql_routine_update_started_count + labeled_name: 'sql.started.count{query_type: routine-started-update}' + description: Number of SQL UPDATE statements started within routine invocation + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ - name: sql.routine.update.started.count.internal + exported_name: sql_routine_update_started_count_internal + labeled_name: 'sql.started.count{query_type: routine-started-update, query_internal: true}' + description: Number of SQL UPDATE statements started within routine invocation (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: sql.select.count exported_name: sql_select_count labeled_name: 'sql.count{query_type: select}' @@ -479,6 +818,25 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.select.started.count + exported_name: sql_select_started_count + labeled_name: 'sql.started.count{query_type: select}' + description: Number of SQL SELECT statements started + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ - name: sql.select.started.count.internal + exported_name: sql_select_started_count_internal + labeled_name: 'sql.started.count{query_type: select, query_internal: true}' + description: Number of SQL SELECT statements started (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: sql.service.latency exported_name: sql_service_latency description: Latency of SQL request execution @@ -643,6 +1001,25 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.update.started.count + exported_name: sql_update_started_count + labeled_name: 'sql.started.count{query_type: update}' + description: Number of SQL UPDATE statements started + y_axis_label: SQL Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: This high-level metric reflects workload volume. Monitor this metric to identify abnormal application behavior or patterns over time. If abnormal patterns emerge, apply the metric's time range to the SQL Activity pages to investigate interesting outliers or patterns. For example, on the Transactions page and the Statements page, sort on the Execution Count column. To find problematic sessions, on the Sessions page, sort on the Transaction Count column. Find the sessions with high transaction counts and trace back to a user or application. 
+ - name: sql.update.started.count.internal + exported_name: sql_update_started_count_internal + labeled_name: 'sql.started.count{query_type: update, query_internal: true}' + description: Number of SQL UPDATE statements started (internal queries) + y_axis_label: SQL Internal Statements + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: txn.restarts.serializable exported_name: txn_restarts_serializable description: Number of restarts due to a forwarded commit timestamp and isolation=SERIALIZABLE @@ -864,6 +1241,14 @@ layers: unit: NANOSECONDS aggregation: AVG derivative: NONE + - name: auth.ldap.conn.latency.internal + exported_name: auth_ldap_conn_latency_internal + description: Internal Auth Latency to establish and authenticate a SQL connection using LDAP(excludes external LDAP calls) + y_axis_label: Nanoseconds + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE - name: auth.password.conn.latency exported_name: auth_password_conn_latency description: Latency to establish and authenticate a SQL connection using password @@ -1211,7 +1596,7 @@ layers: - name: changefeed.checkpoint_hist_nanos exported_name: changefeed_checkpoint_hist_nanos description: Time spent checkpointing changefeed progress - y_axis_label: Changefeeds + y_axis_label: Nanoseconds type: HISTOGRAM unit: NANOSECONDS aggregation: AVG @@ -1416,6 +1801,22 @@ layers: unit: NANOSECONDS aggregation: AVG derivative: NONE + - name: changefeed.progress_skew.span + exported_name: changefeed_progress_skew_span + description: The time difference between the fastest and slowest span's resolved timestamp + y_axis_label: Nanoseconds + type: GAUGE + unit: NANOSECONDS + aggregation: AVG + derivative: NONE + - name: changefeed.progress_skew.table + exported_name: changefeed_progress_skew_table + description: The time difference between the fastest and slowest table's resolved timestamp + y_axis_label: Nanoseconds + type: GAUGE + unit: NANOSECONDS + 
aggregation: AVG + derivative: NONE - name: changefeed.queue_time_nanos exported_name: changefeed_queue_time_nanos description: Time KV event spent waiting to be processed @@ -1456,6 +1857,14 @@ layers: unit: NANOSECONDS aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: changefeed.sink_backpressure_nanos + exported_name: changefeed_sink_backpressure_nanos + description: Time spent waiting for quota when emitting to the sink (back-pressure). Only populated for sinks using the batching_sink wrapper. As of writing, this includes Kafka (v2), Pub/Sub (v2), and Webhook (v2). + y_axis_label: Nanoseconds + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE - name: changefeed.sink_batch_hist_nanos exported_name: changefeed_sink_batch_hist_nanos description: Time spent batched in the sink buffer before being flushed and acknowledged @@ -1520,6 +1929,14 @@ layers: unit: NANOSECONDS aggregation: AVG derivative: NONE + - name: changefeed.stage.frontier_persistence.latency + exported_name: changefeed_stage_frontier_persistence_latency + description: 'Latency of the changefeed stage: persisting frontier to job info' + y_axis_label: Latency + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE - name: changefeed.stage.kv_feed_buffer.latency exported_name: changefeed_stage_kv_feed_buffer_latency description: 'Latency of the changefeed stage: waiting to buffer kv events' @@ -1536,6 +1953,30 @@ layers: unit: NANOSECONDS aggregation: AVG derivative: NONE + - name: changefeed.stage.pts.create.latency + exported_name: changefeed_stage_pts_create_latency + description: 'Latency of the changefeed stage: Time spent creating protected timestamp records on changefeed creation' + y_axis_label: Latency + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE + - name: changefeed.stage.pts.manage.latency + exported_name: changefeed_stage_pts_manage_latency + description: 'Latency of the changefeed stage: Time spent 
successfully managing protected timestamp records on highwater advance, including time spent creating new protected timestamps when needed' + y_axis_label: Latency + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE + - name: changefeed.stage.pts.manage_error.latency + exported_name: changefeed_stage_pts_manage_error_latency + description: 'Latency of the changefeed stage: Time spent managing protected timestamp when we eventually error' + y_axis_label: Latency + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE - name: changefeed.stage.rangefeed_buffer_checkpoint.latency exported_name: changefeed_stage_rangefeed_buffer_checkpoint_latency description: 'Latency of the changefeed stage: buffering rangefeed checkpoint events' @@ -2956,6 +3397,18 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: distsender.rpc.flushlocktable.sent + exported_name: distsender_rpc_flushlocktable_sent + description: |- + Number of FlushLockTable requests processed. + + This counts the requests in batches handed to DistSender, not the RPCs + sent to individual Ranges as a result. 
+ y_axis_label: RPCs + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: distsender.rpc.gc.sent exported_name: distsender_rpc_gc_sent description: |- @@ -3443,15 +3896,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_config_env_runner.fail_or_cancel_failed - exported_name: jobs_auto_config_env_runner_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: auto_config_env_runner, status: failed}' - description: Number of auto_config_env_runner jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_config_env_runner.fail_or_cancel_retry_error exported_name: jobs_auto_config_env_runner_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: auto_config_env_runner, status: retry_error}' @@ -3551,15 +3995,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_config_runner.fail_or_cancel_failed - exported_name: jobs_auto_config_runner_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: auto_config_runner, status: failed}' - description: Number of auto_config_runner jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_config_runner.fail_or_cancel_retry_error exported_name: jobs_auto_config_runner_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: auto_config_runner, status: retry_error}' @@ -3659,15 +4094,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_config_task.fail_or_cancel_failed - exported_name: jobs_auto_config_task_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: auto_config_task, status: failed}' - 
description: Number of auto_config_task jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_config_task.fail_or_cancel_retry_error exported_name: jobs_auto_config_task_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: auto_config_task, status: retry_error}' @@ -3731,33 +4157,6 @@ layers: unit: COUNT aggregation: AVG derivative: NONE - - name: jobs.auto_create_partial_stats.currently_paused - exported_name: jobs_auto_create_partial_stats_currently_paused - labeled_name: 'jobs{name: auto_create_partial_stats, status: currently_paused}' - description: Number of auto_create_partial_stats jobs currently considered Paused - y_axis_label: jobs - type: GAUGE - unit: COUNT - aggregation: AVG - derivative: NONE - - name: jobs.auto_create_partial_stats.currently_running - exported_name: jobs_auto_create_partial_stats_currently_running - labeled_name: 'jobs{type: auto_create_partial_stats, status: currently_running}' - description: Number of auto_create_partial_stats jobs currently running in Resume or OnFailOrCancel state - y_axis_label: jobs - type: GAUGE - unit: COUNT - aggregation: AVG - derivative: NONE - - name: jobs.auto_create_partial_stats.expired_pts_records - exported_name: jobs_auto_create_partial_stats_expired_pts_records - labeled_name: 'jobs.expired_pts_records{type: auto_create_partial_stats}' - description: Number of expired protected timestamp records owned by auto_create_partial_stats jobs - y_axis_label: records - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_create_partial_stats.fail_or_cancel_completed exported_name: jobs_auto_create_partial_stats_fail_or_cancel_completed labeled_name: 'jobs.fail_or_cancel{name: auto_create_partial_stats, status: completed}' @@ -3767,15 +4166,6 @@ layers: unit: COUNT aggregation: AVG 
derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_create_partial_stats.fail_or_cancel_failed - exported_name: jobs_auto_create_partial_stats_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: auto_create_partial_stats, status: failed}' - description: Number of auto_create_partial_stats jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_create_partial_stats.fail_or_cancel_retry_error exported_name: jobs_auto_create_partial_stats_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: auto_create_partial_stats, status: retry_error}' @@ -3785,24 +4175,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_create_partial_stats.protected_age_sec - exported_name: jobs_auto_create_partial_stats_protected_age_sec - labeled_name: 'jobs.protected_age_sec{type: auto_create_partial_stats}' - description: The age of the oldest PTS record protected by auto_create_partial_stats jobs - y_axis_label: seconds - type: GAUGE - unit: SECONDS - aggregation: AVG - derivative: NONE - - name: jobs.auto_create_partial_stats.protected_record_count - exported_name: jobs_auto_create_partial_stats_protected_record_count - labeled_name: 'jobs.protected_record_count{type: auto_create_partial_stats}' - description: Number of protected timestamp records held by auto_create_partial_stats jobs - y_axis_label: records - type: GAUGE - unit: COUNT - aggregation: AVG - derivative: NONE - name: jobs.auto_create_partial_stats.resume_completed exported_name: jobs_auto_create_partial_stats_resume_completed labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: completed}' @@ -3812,15 +4184,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_create_partial_stats.resume_failed - exported_name: 
jobs_auto_create_partial_stats_resume_failed - labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: failed}' - description: Number of auto_create_partial_stats jobs which failed with a non-retriable error - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_create_partial_stats.resume_retry_error exported_name: jobs_auto_create_partial_stats_resume_retry_error labeled_name: 'jobs.resume{name: auto_create_partial_stats, status: retry_error}' @@ -3839,15 +4202,6 @@ layers: unit: COUNT aggregation: AVG derivative: NONE - - name: jobs.auto_create_stats.expired_pts_records - exported_name: jobs_auto_create_stats_expired_pts_records - labeled_name: 'jobs.expired_pts_records{type: auto_create_stats}' - description: Number of expired protected timestamp records owned by auto_create_stats jobs - y_axis_label: records - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_create_stats.fail_or_cancel_completed exported_name: jobs_auto_create_stats_fail_or_cancel_completed labeled_name: 'jobs.fail_or_cancel{name: auto_create_stats, status: completed}' @@ -3857,15 +4211,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_create_stats.fail_or_cancel_failed - exported_name: jobs_auto_create_stats_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: auto_create_stats, status: failed}' - description: Number of auto_create_stats jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_create_stats.fail_or_cancel_retry_error exported_name: jobs_auto_create_stats_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: auto_create_stats, status: retry_error}' @@ -3875,24 +4220,6 @@ layers: unit: COUNT aggregation: AVG derivative: 
NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_create_stats.protected_age_sec - exported_name: jobs_auto_create_stats_protected_age_sec - labeled_name: 'jobs.protected_age_sec{type: auto_create_stats}' - description: The age of the oldest PTS record protected by auto_create_stats jobs - y_axis_label: seconds - type: GAUGE - unit: SECONDS - aggregation: AVG - derivative: NONE - - name: jobs.auto_create_stats.protected_record_count - exported_name: jobs_auto_create_stats_protected_record_count - labeled_name: 'jobs.protected_record_count{type: auto_create_stats}' - description: Number of protected timestamp records held by auto_create_stats jobs - y_axis_label: records - type: GAUGE - unit: COUNT - aggregation: AVG - derivative: NONE - name: jobs.auto_create_stats.resume_completed exported_name: jobs_auto_create_stats_resume_completed labeled_name: 'jobs.resume{name: auto_create_stats, status: completed}' @@ -3956,15 +4283,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_schema_telemetry.fail_or_cancel_failed - exported_name: jobs_auto_schema_telemetry_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: auto_schema_telemetry, status: failed}' - description: Number of auto_schema_telemetry jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_schema_telemetry.fail_or_cancel_retry_error exported_name: jobs_auto_schema_telemetry_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: auto_schema_telemetry, status: retry_error}' @@ -4064,15 +4382,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_span_config_reconciliation.fail_or_cancel_failed - exported_name: jobs_auto_span_config_reconciliation_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: auto_span_config_reconciliation, status: 
failed}' - description: Number of auto_span_config_reconciliation jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_span_config_reconciliation.fail_or_cancel_retry_error exported_name: jobs_auto_span_config_reconciliation_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: auto_span_config_reconciliation, status: retry_error}' @@ -4172,15 +4481,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_sql_stats_compaction.fail_or_cancel_failed - exported_name: jobs_auto_sql_stats_compaction_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: auto_sql_stats_compaction, status: failed}' - description: Number of auto_sql_stats_compaction jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_sql_stats_compaction.fail_or_cancel_retry_error exported_name: jobs_auto_sql_stats_compaction_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: auto_sql_stats_compaction, status: retry_error}' @@ -4280,15 +4580,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.auto_update_sql_activity.fail_or_cancel_failed - exported_name: jobs_auto_update_sql_activity_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: auto_update_sql_activity, status: failed}' - description: Number of auto_update_sql_activity jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.auto_update_sql_activity.fail_or_cancel_retry_error exported_name: jobs_auto_update_sql_activity_fail_or_cancel_retry_error 
labeled_name: 'jobs.fail_or_cancel{name: auto_update_sql_activity, status: retry_error}' @@ -4370,15 +4661,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.backup.fail_or_cancel_failed - exported_name: jobs_backup_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: backup, status: failed}' - description: Number of backup jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.backup.fail_or_cancel_retry_error exported_name: jobs_backup_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: backup, status: retry_error}' @@ -4469,15 +4751,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.changefeed.fail_or_cancel_failed - exported_name: jobs_changefeed_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: changefeed, status: failed}' - description: Number of changefeed jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.changefeed.fail_or_cancel_retry_error exported_name: jobs_changefeed_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: changefeed, status: retry_error}' @@ -4549,15 +4822,6 @@ layers: unit: COUNT aggregation: AVG derivative: NONE - - name: jobs.create_stats.expired_pts_records - exported_name: jobs_create_stats_expired_pts_records - labeled_name: 'jobs.expired_pts_records{type: create_stats}' - description: Number of expired protected timestamp records owned by create_stats jobs - y_axis_label: records - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.create_stats.fail_or_cancel_completed exported_name: jobs_create_stats_fail_or_cancel_completed labeled_name: 
'jobs.fail_or_cancel{name: create_stats, status: completed}' @@ -4567,15 +4831,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.create_stats.fail_or_cancel_failed - exported_name: jobs_create_stats_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: create_stats, status: failed}' - description: Number of create_stats jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.create_stats.fail_or_cancel_retry_error exported_name: jobs_create_stats_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: create_stats, status: retry_error}' @@ -4585,24 +4840,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.create_stats.protected_age_sec - exported_name: jobs_create_stats_protected_age_sec - labeled_name: 'jobs.protected_age_sec{type: create_stats}' - description: The age of the oldest PTS record protected by create_stats jobs - y_axis_label: seconds - type: GAUGE - unit: SECONDS - aggregation: AVG - derivative: NONE - - name: jobs.create_stats.protected_record_count - exported_name: jobs_create_stats_protected_record_count - labeled_name: 'jobs.protected_record_count{type: create_stats}' - description: Number of protected timestamp records held by create_stats jobs - y_axis_label: records - type: GAUGE - unit: COUNT - aggregation: AVG - derivative: NONE - name: jobs.create_stats.resume_completed exported_name: jobs_create_stats_resume_completed labeled_name: 'jobs.resume{name: create_stats, status: completed}' @@ -4675,15 +4912,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.history_retention.fail_or_cancel_failed - exported_name: jobs_history_retention_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: history_retention, status: failed}' - description: 
Number of history_retention jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.history_retention.fail_or_cancel_retry_error exported_name: jobs_history_retention_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: history_retention, status: retry_error}' @@ -4783,15 +5011,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.hot_ranges_logger.fail_or_cancel_failed - exported_name: jobs_hot_ranges_logger_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: hot_ranges_logger, status: failed}' - description: Number of hot_ranges_logger jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.hot_ranges_logger.fail_or_cancel_retry_error exported_name: jobs_hot_ranges_logger_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: hot_ranges_logger, status: retry_error}' @@ -4891,15 +5110,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.import.fail_or_cancel_failed - exported_name: jobs_import_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: import, status: failed}' - description: Number of import jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.import.fail_or_cancel_retry_error exported_name: jobs_import_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: import, status: retry_error}' @@ -4981,15 +5191,6 @@ layers: unit: COUNT aggregation: AVG derivative: NONE - - name: jobs.import_rollback.expired_pts_records - exported_name: 
jobs_import_rollback_expired_pts_records - labeled_name: 'jobs.expired_pts_records{type: import_rollback}' - description: Number of expired protected timestamp records owned by import_rollback jobs - y_axis_label: records - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.import_rollback.fail_or_cancel_completed exported_name: jobs_import_rollback_fail_or_cancel_completed labeled_name: 'jobs.fail_or_cancel{name: import_rollback, status: completed}' @@ -4999,69 +5200,165 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.import_rollback.fail_or_cancel_failed - exported_name: jobs_import_rollback_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: import_rollback, status: failed}' - description: Number of import_rollback jobs which failed with a non-retriable error on their failure or cancelation process + - name: jobs.import_rollback.fail_or_cancel_retry_error + exported_name: jobs_import_rollback_fail_or_cancel_retry_error + labeled_name: 'jobs.fail_or_cancel{name: import_rollback, status: retry_error}' + description: Number of import_rollback jobs which failed with a retriable error on their failure or cancelation process + y_axis_label: jobs + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: jobs.import_rollback.resume_completed + exported_name: jobs_import_rollback_resume_completed + labeled_name: 'jobs.resume{name: import_rollback, status: completed}' + description: Number of import_rollback jobs which successfully resumed to completion + y_axis_label: jobs + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: jobs.import_rollback.resume_failed + exported_name: jobs_import_rollback_resume_failed + labeled_name: 'jobs.resume{name: import_rollback, status: failed}' + description: Number of import_rollback jobs which failed with a non-retriable error + y_axis_label: jobs + 
type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: jobs.import_rollback.resume_retry_error + exported_name: jobs_import_rollback_resume_retry_error + labeled_name: 'jobs.resume{name: import_rollback, status: retry_error}' + description: Number of import_rollback jobs which failed with a retriable error + y_axis_label: jobs + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: jobs.inspect.currently_idle + exported_name: jobs_inspect_currently_idle + labeled_name: 'jobs{type: inspect, status: currently_idle}' + description: Number of inspect jobs currently considered Idle and can be freely shut down + y_axis_label: jobs + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: NONE + - name: jobs.inspect.currently_paused + exported_name: jobs_inspect_currently_paused + labeled_name: 'jobs{name: inspect, status: currently_paused}' + description: Number of inspect jobs currently considered Paused + y_axis_label: jobs + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: NONE + - name: jobs.inspect.currently_running + exported_name: jobs_inspect_currently_running + labeled_name: 'jobs{type: inspect, status: currently_running}' + description: Number of inspect jobs currently running in Resume or OnFailOrCancel state + y_axis_label: jobs + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: NONE + - name: jobs.inspect.expired_pts_records + exported_name: jobs_inspect_expired_pts_records + labeled_name: 'jobs.expired_pts_records{type: inspect}' + description: Number of expired protected timestamp records owned by inspect jobs + y_axis_label: records + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: jobs.inspect.fail_or_cancel_completed + exported_name: jobs_inspect_fail_or_cancel_completed + labeled_name: 'jobs.fail_or_cancel{name: inspect, status: completed}' + description: Number of inspect jobs which successfully 
completed their failure or cancelation process y_axis_label: jobs type: COUNTER unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.import_rollback.fail_or_cancel_retry_error - exported_name: jobs_import_rollback_fail_or_cancel_retry_error - labeled_name: 'jobs.fail_or_cancel{name: import_rollback, status: retry_error}' - description: Number of import_rollback jobs which failed with a retriable error on their failure or cancelation process + - name: jobs.inspect.fail_or_cancel_retry_error + exported_name: jobs_inspect_fail_or_cancel_retry_error + labeled_name: 'jobs.fail_or_cancel{name: inspect, status: retry_error}' + description: Number of inspect jobs which failed with a retriable error on their failure or cancelation process y_axis_label: jobs type: COUNTER unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.import_rollback.protected_age_sec - exported_name: jobs_import_rollback_protected_age_sec - labeled_name: 'jobs.protected_age_sec{type: import_rollback}' - description: The age of the oldest PTS record protected by import_rollback jobs + - name: jobs.inspect.issues_found + exported_name: jobs_inspect_issues_found + description: Total count of issues found by INSPECT jobs + y_axis_label: Issues + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: jobs.inspect.protected_age_sec + exported_name: jobs_inspect_protected_age_sec + labeled_name: 'jobs.protected_age_sec{type: inspect}' + description: The age of the oldest PTS record protected by inspect jobs y_axis_label: seconds type: GAUGE unit: SECONDS aggregation: AVG derivative: NONE - - name: jobs.import_rollback.protected_record_count - exported_name: jobs_import_rollback_protected_record_count - labeled_name: 'jobs.protected_record_count{type: import_rollback}' - description: Number of protected timestamp records held by import_rollback jobs + - name: jobs.inspect.protected_record_count + exported_name: 
jobs_inspect_protected_record_count + labeled_name: 'jobs.protected_record_count{type: inspect}' + description: Number of protected timestamp records held by inspect jobs y_axis_label: records type: GAUGE unit: COUNT aggregation: AVG derivative: NONE - - name: jobs.import_rollback.resume_completed - exported_name: jobs_import_rollback_resume_completed - labeled_name: 'jobs.resume{name: import_rollback, status: completed}' - description: Number of import_rollback jobs which successfully resumed to completion + - name: jobs.inspect.resume_completed + exported_name: jobs_inspect_resume_completed + labeled_name: 'jobs.resume{name: inspect, status: completed}' + description: Number of inspect jobs which successfully resumed to completion y_axis_label: jobs type: COUNTER unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.import_rollback.resume_failed - exported_name: jobs_import_rollback_resume_failed - labeled_name: 'jobs.resume{name: import_rollback, status: failed}' - description: Number of import_rollback jobs which failed with a non-retriable error + - name: jobs.inspect.resume_failed + exported_name: jobs_inspect_resume_failed + labeled_name: 'jobs.resume{name: inspect, status: failed}' + description: Number of inspect jobs which failed with a non-retriable error y_axis_label: jobs type: COUNTER unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.import_rollback.resume_retry_error - exported_name: jobs_import_rollback_resume_retry_error - labeled_name: 'jobs.resume{name: import_rollback, status: retry_error}' - description: Number of import_rollback jobs which failed with a retriable error + - name: jobs.inspect.resume_retry_error + exported_name: jobs_inspect_resume_retry_error + labeled_name: 'jobs.resume{name: inspect, status: retry_error}' + description: Number of inspect jobs which failed with a retriable error y_axis_label: jobs type: COUNTER unit: COUNT aggregation: AVG derivative: 
NON_NEGATIVE_DERIVATIVE + - name: jobs.inspect.runs + exported_name: jobs_inspect_runs + description: Number of INSPECT jobs executed + y_axis_label: Jobs + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: jobs.inspect.runs_with_issues + exported_name: jobs_inspect_runs_with_issues + description: Number of INSPECT jobs that found at least one issue + y_axis_label: Jobs + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.key_visualizer.currently_idle exported_name: jobs_key_visualizer_currently_idle labeled_name: 'jobs{type: key_visualizer, status: currently_idle}' @@ -5107,15 +5404,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.key_visualizer.fail_or_cancel_failed - exported_name: jobs_key_visualizer_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: key_visualizer, status: failed}' - description: Number of key_visualizer jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.key_visualizer.fail_or_cancel_retry_error exported_name: jobs_key_visualizer_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: key_visualizer, status: retry_error}' @@ -5215,15 +5503,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.logical_replication.fail_or_cancel_failed - exported_name: jobs_logical_replication_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: logical_replication, status: failed}' - description: Number of logical_replication jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.logical_replication.fail_or_cancel_retry_error exported_name: 
jobs_logical_replication_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: logical_replication, status: retry_error}' @@ -5331,15 +5610,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.migration.fail_or_cancel_failed - exported_name: jobs_migration_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: migration, status: failed}' - description: Number of migration jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.migration.fail_or_cancel_retry_error exported_name: jobs_migration_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: migration, status: retry_error}' @@ -5439,15 +5709,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.mvcc_statistics_update.fail_or_cancel_failed - exported_name: jobs_mvcc_statistics_update_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: mvcc_statistics_update, status: failed}' - description: Number of mvcc_statistics_update jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.mvcc_statistics_update.fail_or_cancel_retry_error exported_name: jobs_mvcc_statistics_update_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: mvcc_statistics_update, status: retry_error}' @@ -5547,15 +5808,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.new_schema_change.fail_or_cancel_failed - exported_name: jobs_new_schema_change_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: new_schema_change, status: failed}' - description: Number of new_schema_change jobs which failed with a non-retriable error on their failure or cancelation 
process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.new_schema_change.fail_or_cancel_retry_error exported_name: jobs_new_schema_change_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: new_schema_change, status: retry_error}' @@ -5655,15 +5907,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.poll_jobs_stats.fail_or_cancel_failed - exported_name: jobs_poll_jobs_stats_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: poll_jobs_stats, status: failed}' - description: Number of poll_jobs_stats jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.poll_jobs_stats.fail_or_cancel_retry_error exported_name: jobs_poll_jobs_stats_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: poll_jobs_stats, status: retry_error}' @@ -5763,15 +6006,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.replication_stream_ingestion.fail_or_cancel_failed - exported_name: jobs_replication_stream_ingestion_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: replication_stream_ingestion, status: failed}' - description: Number of replication_stream_ingestion jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.replication_stream_ingestion.fail_or_cancel_retry_error exported_name: jobs_replication_stream_ingestion_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: replication_stream_ingestion, status: retry_error}' @@ -5871,15 +6105,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: 
jobs.replication_stream_producer.fail_or_cancel_failed - exported_name: jobs_replication_stream_producer_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: replication_stream_producer, status: failed}' - description: Number of replication_stream_producer jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.replication_stream_producer.fail_or_cancel_retry_error exported_name: jobs_replication_stream_producer_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: replication_stream_producer, status: retry_error}' @@ -5979,15 +6204,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.restore.fail_or_cancel_failed - exported_name: jobs_restore_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: restore, status: failed}' - description: Number of restore jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.restore.fail_or_cancel_retry_error exported_name: jobs_restore_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: restore, status: retry_error}' @@ -6077,15 +6293,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.row_level_ttl.fail_or_cancel_failed - exported_name: jobs_row_level_ttl_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: row_level_ttl, status: failed}' - description: Number of row_level_ttl jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.row_level_ttl.fail_or_cancel_retry_error exported_name: jobs_row_level_ttl_fail_or_cancel_retry_error 
labeled_name: 'jobs.fail_or_cancel{name: row_level_ttl, status: retry_error}' @@ -6183,15 +6390,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.schema_change.fail_or_cancel_failed - exported_name: jobs_schema_change_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: schema_change, status: failed}' - description: Number of schema_change jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.schema_change.fail_or_cancel_retry_error exported_name: jobs_schema_change_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: schema_change, status: retry_error}' @@ -6291,15 +6489,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.schema_change_gc.fail_or_cancel_failed - exported_name: jobs_schema_change_gc_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: schema_change_gc, status: failed}' - description: Number of schema_change_gc jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.schema_change_gc.fail_or_cancel_retry_error exported_name: jobs_schema_change_gc_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: schema_change_gc, status: retry_error}' @@ -6399,15 +6588,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.sql_activity_flush.fail_or_cancel_failed - exported_name: jobs_sql_activity_flush_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: sql_activity_flush, status: failed}' - description: Number of sql_activity_flush jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: 
AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.sql_activity_flush.fail_or_cancel_retry_error exported_name: jobs_sql_activity_flush_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: sql_activity_flush, status: retry_error}' @@ -6507,15 +6687,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.standby_read_ts_poller.fail_or_cancel_failed - exported_name: jobs_standby_read_ts_poller_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: standby_read_ts_poller, status: failed}' - description: Number of standby_read_ts_poller jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.standby_read_ts_poller.fail_or_cancel_retry_error exported_name: jobs_standby_read_ts_poller_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: standby_read_ts_poller, status: retry_error}' @@ -6615,15 +6786,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.typedesc_schema_change.fail_or_cancel_failed - exported_name: jobs_typedesc_schema_change_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: typedesc_schema_change, status: failed}' - description: Number of typedesc_schema_change jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.typedesc_schema_change.fail_or_cancel_retry_error exported_name: jobs_typedesc_schema_change_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: typedesc_schema_change, status: retry_error}' @@ -6723,15 +6885,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: jobs.update_table_metadata_cache.fail_or_cancel_failed - exported_name: 
jobs_update_table_metadata_cache_fail_or_cancel_failed - labeled_name: 'jobs.fail_or_cancel{name: update_table_metadata_cache, status: failed}' - description: Number of update_table_metadata_cache jobs which failed with a non-retriable error on their failure or cancelation process - y_axis_label: jobs - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: jobs.update_table_metadata_cache.fail_or_cancel_retry_error exported_name: jobs_update_table_metadata_cache_fail_or_cancel_retry_error labeled_name: 'jobs.fail_or_cancel{name: update_table_metadata_cache, status: retry_error}' @@ -6882,24 +7035,9 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: logical_replication.commit_latency - exported_name: logical_replication_commit_latency - description: 'Event commit latency: a difference between event MVCC timestamp and the time it was flushed into disk. If we batch events, then the difference between the oldest event in the batch and flush is recorded' - y_axis_label: Nanoseconds - type: HISTOGRAM - unit: NANOSECONDS - aggregation: AVG - derivative: NONE - - name: logical_replication.events_dlqed - exported_name: logical_replication_events_dlqed - description: Row update events sent to DLQ - y_axis_label: Failures - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: logical_replication.events_dlqed_age exported_name: logical_replication_events_dlqed_age + labeled_name: 'logical_replication.events{type: dlqed_age}' description: Row update events sent to DLQ due to reaching the maximum time allowed in the retry queue y_axis_label: Failures type: COUNTER @@ -6916,6 +7054,7 @@ layers: derivative: NON_NEGATIVE_DERIVATIVE - name: logical_replication.events_dlqed_errtype exported_name: logical_replication_events_dlqed_errtype + labeled_name: 'logical_replication.events{type: dlqed_errtype}' description: Row update events sent to DLQ due to an error not 
considered retryable y_axis_label: Failures type: COUNTER @@ -6924,20 +7063,13 @@ layers: derivative: NON_NEGATIVE_DERIVATIVE - name: logical_replication.events_dlqed_space exported_name: logical_replication_events_dlqed_space + labeled_name: 'logical_replication.events{type: dlqed_space}' description: Row update events sent to DLQ due to capacity of the retry queue y_axis_label: Failures type: COUNTER unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: logical_replication.events_ingested - exported_name: logical_replication_events_ingested - description: Events ingested by all replication jobs - y_axis_label: Events - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: logical_replication.events_ingested_by_label exported_name: logical_replication_events_ingested_by_label description: Events ingested by all replication jobs by label @@ -6948,6 +7080,7 @@ layers: derivative: NON_NEGATIVE_DERIVATIVE - name: logical_replication.events_initial_failure exported_name: logical_replication_events_initial_failure + labeled_name: 'logical_replication.events{type: initial_failure}' description: Failed attempts to apply an incoming row update y_axis_label: Failures type: COUNTER @@ -6956,14 +7089,16 @@ layers: derivative: NON_NEGATIVE_DERIVATIVE - name: logical_replication.events_initial_success exported_name: logical_replication_events_initial_success + labeled_name: 'logical_replication.events{type: initial_success}' description: Successful applications of an incoming row update - y_axis_label: Failures + y_axis_label: Successes type: COUNTER unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - name: logical_replication.events_retry_failure exported_name: logical_replication_events_retry_failure + labeled_name: 'logical_replication.events{type: retry_failure}' description: Failed re-attempts to apply a row update y_axis_label: Failures type: COUNTER @@ -6972,8 +7107,9 @@ layers: derivative: 
NON_NEGATIVE_DERIVATIVE - name: logical_replication.events_retry_success exported_name: logical_replication_events_retry_success + labeled_name: 'logical_replication.events{type: retry_success}' description: Row update events applied after one or more retries - y_axis_label: Failures + y_axis_label: Successes type: COUNTER unit: COUNT aggregation: AVG @@ -6994,14 +7130,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: logical_replication.logical_bytes - exported_name: logical_replication_logical_bytes - description: Logical bytes (sum of keys + values) received by all replication jobs - y_axis_label: Bytes - type: COUNTER - unit: BYTES - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: logical_replication.replan_count exported_name: logical_replication_replan_count description: Total number of dist sql replanning events @@ -7018,14 +7146,6 @@ layers: unit: SECONDS aggregation: AVG derivative: NONE - - name: logical_replication.replicated_time_seconds - exported_name: logical_replication_replicated_time_seconds - description: The replicated time of the logical replication stream in seconds since the unix epoch. 
- y_axis_label: Seconds - type: GAUGE - unit: SECONDS - aggregation: AVG - derivative: NONE - name: logical_replication.retry_queue_bytes exported_name: logical_replication_retry_queue_bytes description: Logical bytes (sum of keys+values) in the retry queue @@ -7146,22 +7266,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: physical_replication.logical_bytes - exported_name: physical_replication_logical_bytes - description: Logical bytes (sum of keys + values) ingested by all replication jobs - y_axis_label: Bytes - type: COUNTER - unit: BYTES - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - - name: physical_replication.replicated_time_seconds - exported_name: physical_replication_replicated_time_seconds - description: The replicated time of the physical replication stream in seconds since the unix epoch. - y_axis_label: Seconds - type: GAUGE - unit: SECONDS - aggregation: AVG - derivative: NONE - name: physical_replication.resolved_events_ingested exported_name: physical_replication_resolved_events_ingested description: Resolved events ingested by all replication jobs @@ -7192,6 +7296,17 @@ layers: unit: COUNT aggregation: AVG derivative: NONE + - name: round-trip-default-class-latency + exported_name: round_trip_default_class_latency + description: | + Distribution of round-trip latencies with other nodes. + + Similar to round-trip-latency, but only for default class connections. + y_axis_label: Round-trip time + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE - name: round-trip-latency exported_name: round_trip_latency description: | @@ -7209,6 +7324,39 @@ layers: unit: NANOSECONDS aggregation: AVG derivative: NONE + - name: round-trip-raft-class-latency + exported_name: round_trip_raft_class_latency + description: | + Distribution of round-trip latencies with other nodes. + + Similar to round-trip-latency, but only for raft class connections. 
+ y_axis_label: Round-trip time + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE + - name: round-trip-rangefeed-class-latency + exported_name: round_trip_rangefeed_class_latency + description: | + Distribution of round-trip latencies with other nodes. + + Similar to round-trip-latency, but only for rangefeed class connections. + y_axis_label: Round-trip time + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE + - name: round-trip-system-class-latency + exported_name: round_trip_system_class_latency + description: | + Distribution of round-trip latencies with other nodes. + + Similar to round-trip-latency, but only for system class connections. + y_axis_label: Round-trip time + type: HISTOGRAM + unit: NANOSECONDS + aggregation: AVG + derivative: NONE - name: rpc.client.bytes.egress exported_name: rpc_client_bytes_egress description: Counter of TCP bytes sent via gRPC on connections we initiated. @@ -7653,22 +7801,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: sql.delete.started.count - exported_name: sql_delete_started_count - description: Number of SQL DELETE statements started - y_axis_label: SQL Statements - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - - name: sql.delete.started.count.internal - exported_name: sql_delete_started_count_internal - description: Number of SQL DELETE statements started (internal queries) - y_axis_label: SQL Internal Statements - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: sql.disk.distsql.current exported_name: sql_disk_distsql_current description: Current sql statement disk usage for distsql @@ -8093,22 +8225,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: sql.insert.started.count - exported_name: sql_insert_started_count - description: Number of SQL INSERT statements started - y_axis_label: SQL Statements - type: 
COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - - name: sql.insert.started.count.internal - exported_name: sql_insert_started_count_internal - description: Number of SQL INSERT statements started (internal queries) - y_axis_label: SQL Internal Statements - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: sql.insights.anomaly_detection.evictions exported_name: sql_insights_anomaly_detection_evictions description: Evictions of fingerprint latency summaries due to memory pressure @@ -8773,22 +8889,6 @@ layers: unit: COUNT aggregation: AVG derivative: NONE - - name: sql.select.started.count - exported_name: sql_select_started_count - description: Number of SQL SELECT statements started - y_axis_label: SQL Statements - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - - name: sql.select.started.count.internal - exported_name: sql_select_started_count_internal - description: Number of SQL SELECT statements started (internal queries) - y_axis_label: SQL Internal Statements - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: sql.service.latency.consistent exported_name: sql_service_latency_consistent description: Latency of SQL request execution of non-historical queries @@ -8932,7 +9032,23 @@ layers: type: COUNTER unit: COUNT aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE + derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.stats.ingester.num_processed + exported_name: sql_stats_ingester_num_processed + description: Number of items processed by the SQL stats ingester + y_axis_label: Items + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sql.stats.ingester.queue_size + exported_name: sql_stats_ingester_queue_size + description: Current number of items queued in the SQL stats ingester + y_axis_label: Items + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: 
NONE - name: sql.stats.mem.current exported_name: sql_stats_mem_current description: Current memory usage for fingerprint storage @@ -9213,22 +9329,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: sql.update.started.count - exported_name: sql_update_started_count - description: Number of SQL UPDATE statements started - y_axis_label: SQL Statements - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - - name: sql.update.started.count.internal - exported_name: sql_update_started_count_internal - description: Number of SQL UPDATE statements started (internal queries) - y_axis_label: SQL Internal Statements - type: COUNTER - unit: COUNT - aggregation: AVG - derivative: NON_NEGATIVE_DERIVATIVE - name: sql.vecindex.pending_splits_merges exported_name: sql_vecindex_pending_splits_merges description: Total number of vector index splits and merges waiting to be processed @@ -9837,6 +9937,86 @@ layers: derivative: NON_NEGATIVE_DERIVATIVE how_to_use: This metric measures the length of time, in seconds, that the CockroachDB process has been running. Monitor this metric to detect events such as node restarts, which may require investigation or intervention. essential: true + - name: NETWORKING + metrics: + - name: sys.host.net.send.tcp.fast_retrans_segs + exported_name: sys_host_net_send_tcp_fast_retrans_segs + description: |- + Segments retransmitted due to the fast retransmission mechanism in TCP. + Fast retransmissions occur when the sender learns that intermediate segments have been lost. + y_axis_label: Segments + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sys.host.net.send.tcp.loss_probes + exported_name: sys_host_net_send_tcp_loss_probes + description: |2- + + Number of TCP tail loss probes sent. Loss probes are an optimization to detect + loss of the last packet earlier than the retransmission timer, and can indicate + network issues. 
Tail loss probes are aggressive, so the base rate is often nonzero + even in healthy networks. + y_axis_label: Probes + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sys.host.net.send.tcp.retrans_segs + exported_name: sys_host_net_send_tcp_retrans_segs + description: |2 + + The number of TCP segments retransmitted across all network interfaces. + This can indicate packet loss occurring in the network. However, it can + also be caused by recipient nodes not consuming packets in a timely manner, + or the local node overflowing its outgoing buffers, for example due to overload. + + Retransmissions also occur in the absence of problems, as modern TCP stacks + err on the side of aggressively retransmitting segments. + + The linux tool 'ss -i' can show the Linux kernel's smoothed view of round-trip + latency and variance on a per-connection basis. Additionally, 'netstat -s' + shows all TCP counters maintained by the kernel. + y_axis_label: Segments + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: |2 + + Phase changes, especially when occurring on groups of nodes, can indicate packet + loss in the network or a slow consumer of packets. On slow consumers, the + 'sys.host.net.rcvd.drop' metric may be elevated; on overloaded senders, it + is worth checking the 'sys.host.net.send.drop' metric. + Additionally, the 'sys.host.net.send.tcp.*' may provide more insight into the + specific type of retransmission. + essential: true + - name: sys.host.net.send.tcp.slow_start_retrans + exported_name: sys_host_net_send_tcp_slow_start_retrans + description: |2 + + Number of TCP retransmissions in slow start. This can indicate that the network + is unable to support the initial fast ramp-up in window size, and can be a sign + of packet loss or congestion. 
+ y_axis_label: Segments + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: sys.host.net.send.tcp_timeouts + exported_name: sys_host_net_send_tcp_timeouts + description: |2 + + Number of TCP retransmission timeouts. These typically imply that a packet has + not been acknowledged within at least 200ms. Modern TCP stacks use + optimizations such as fast retransmissions and loss probes to avoid hitting + retransmission timeouts. Anecdotally, they still occasionally present themselves + even in supposedly healthy cloud environments. + y_axis_label: Timeouts + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: UNSET metrics: - name: build.timestamp @@ -9903,6 +10083,30 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: log.otlp.sink.grpc.transparent_retries + exported_name: log_otlp_sink_grpc_transparent_retries + description: Number of transparent retries done by otlp-server logging sinks when using GRPC + y_axis_label: Retries + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: log.otlp.sink.write.attempts + exported_name: log_otlp_sink_write_attempts + description: Number of write attempts experienced by otlp-server logging sinks + y_axis_label: Attempts + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: log.otlp.sink.write.errors + exported_name: log_otlp_sink_write_errors + description: Number of write errors experienced by otlp-server logging sinks + y_axis_label: Errors + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: sys.cgo.allocbytes exported_name: sys_cgo_allocbytes description: Current bytes of memory allocated by cgo @@ -10047,6 +10251,14 @@ layers: unit: BYTES aggregation: AVG derivative: NONE + - name: sys.go.limitbytes + exported_name: sys_go_limitbytes + description: Go soft memory limit + 
y_axis_label: Memory + type: GAUGE + unit: BYTES + aggregation: AVG + derivative: NONE - name: sys.go.pause.other.ns exported_name: sys_go_pause_other_ns description: Estimated non-GC-related total pause time @@ -10457,13 +10669,13 @@ layers: essential: true - name: storage.wal.fsync.latency exported_name: storage_wal_fsync_latency - description: The write ahead log fsync latency + description: The fsync latency to the Write-Ahead Log device. y_axis_label: Fsync Latency type: HISTOGRAM unit: NANOSECONDS aggregation: AVG derivative: NONE - how_to_use: If this value is greater than `100ms`, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured. + how_to_use: If this value is greater than 100ms, it is an indication of a disk stall. To mitigate the effects of disk stalls, consider deploying your cluster with WAL failover configured. When WAL failover is configured, the more relevant metric is storage.wal.failover_write_and_sync.latency, as this metric reflects the fsync latency of the primary and/or the secondary WAL device. 
essential: true - name: storage.write-stalls exported_name: storage_write_stalls @@ -12216,6 +12428,14 @@ layers: unit: COUNT aggregation: AVG derivative: NONE + - name: kv.concurrency.locks_shed_due_to_memory_limit + exported_name: kv_concurrency_locks_shed_due_to_memory_limit + description: The number of locks that were shed because the lock table ran into memory limits + y_axis_label: Locks + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: kv.concurrency.locks_with_wait_queues exported_name: kv_concurrency_locks_with_wait_queues description: Number of active locks held in lock tables with active wait-queues @@ -12248,6 +12468,14 @@ layers: unit: COUNT aggregation: AVG derivative: NONE + - name: kv.concurrency.num_lock_shed_due_to_memory_limit_events + exported_name: kv_concurrency_num_lock_shed_due_to_memory_limit_events + description: The number of times locks that were shed by the lock table because it ran into memory limits + y_axis_label: Lock Shed Events + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: kv.loadsplitter.cleardirection exported_name: kv_loadsplitter_cleardirection description: Load-based splitter observed an access direction greater than 80% left or right in the samples. 
@@ -12424,14 +12652,6 @@ layers: unit: BYTES aggregation: AVG derivative: NONE - - name: kv.rangefeed.mux_stream_send.latency - exported_name: kv_rangefeed_mux_stream_send_latency - description: Latency of sending RangeFeed events to the client - y_axis_label: Latency - type: HISTOGRAM - unit: NANOSECONDS - aggregation: AVG - derivative: NONE - name: kv.rangefeed.mux_stream_send.slow_events exported_name: kv_rangefeed_mux_stream_send_slow_events description: Number of RangeFeed events that took longer than 10s to send to the client @@ -13651,6 +13871,38 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: queue.replicate.enqueue.add + exported_name: queue_replicate_enqueue_add + description: Number of replicas successfully added to the replicate queue + y_axis_label: Replicas + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: queue.replicate.enqueue.failedprecondition + exported_name: queue_replicate_enqueue_failedprecondition + description: Number of replicas that failed the precondition checks and were therefore not added to the replicate queue + y_axis_label: Replicas + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: queue.replicate.enqueue.noaction + exported_name: queue_replicate_enqueue_noaction + description: Number of replicas for which ShouldQueue determined no action was needed and were therefore not added to the replicate queue + y_axis_label: Replicas + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: queue.replicate.enqueue.unexpectederror + exported_name: queue_replicate_enqueue_unexpectederror + description: Number of replicas that were expected to be enqueued (ShouldQueue returned true or the caller decided to add to the replicate queue directly), but failed to be enqueued due to unexpected errors + y_axis_label: Replicas + type: COUNTER + unit: COUNT + aggregation: AVG + 
derivative: NON_NEGATIVE_DERIVATIVE - name: queue.replicate.nonvoterpromotions exported_name: queue_replicate_nonvoterpromotions description: Number of non-voters promoted to voters by the replicate queue @@ -13667,6 +13919,22 @@ layers: unit: COUNT aggregation: AVG derivative: NONE + - name: queue.replicate.priority_inversion.requeue + exported_name: queue_replicate_priority_inversion_requeue + description: Number of priority inversions in the replicate queue that resulted in requeuing of the replicas. A priority inversion occurs when the priority at processing time ends up being lower than at enqueue time. When the priority has changed from a high priority repair action to rebalance, the change is requeued to avoid unfairness. + y_axis_label: Replicas + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: queue.replicate.priority_inversion.total + exported_name: queue_replicate_priority_inversion_total + description: Total number of priority inversions in the replicate queue. 
A priority inversion occurs when the priority at processing time ends up being lower than at enqueue time + y_axis_label: Replicas + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: queue.replicate.process.failure exported_name: queue_replicate_process_failure description: Number of replicas which failed processing in the replicate queue @@ -13699,6 +13967,14 @@ layers: unit: COUNT aggregation: AVG derivative: NONE + - name: queue.replicate.queue_full + exported_name: queue_replicate_queue_full + description: Number of times a replica was dropped from the queue due to queue fullness + y_axis_label: Replicas + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: queue.replicate.rebalancenonvoterreplica exported_name: queue_replicate_rebalancenonvoterreplica description: Number of non-voter replica rebalancer-initiated additions attempted by the replicate queue @@ -15073,6 +15349,60 @@ layers: unit: COUNT aggregation: AVG derivative: NONE + - name: ranges.decommissioning.nudger.enqueue + exported_name: ranges_decommissioning_nudger_enqueue + labeled_name: 'ranges.decommissioning.nudger.enqueue{status: enqueue}' + description: 'Number of enqueued enqueues of a range for decommissioning by the decommissioning nudger. Note: This metric tracks when the nudger attempts to enqueue, but the replica might not end up being enqueued by the priority queue due to various filtering or failure conditions.' 
+ y_axis_label: Ranges + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: ranges.decommissioning.nudger.enqueue.failure + exported_name: ranges_decommissioning_nudger_enqueue_failure + labeled_name: ranges.decommissioning.nudger.enqueue.failure + description: Number of ranges that failed to enqueue at the replicate queue + y_axis_label: Ranges + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: ranges.decommissioning.nudger.enqueue.success + exported_name: ranges_decommissioning_nudger_enqueue_success + labeled_name: ranges.decommissioning.nudger.enqueue.success + description: Number of ranges that were successfully enqueued by the decommisioning nudger + y_axis_label: Ranges + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease + exported_name: ranges_decommissioning_nudger_not_leaseholder_or_invalid_lease + labeled_name: ranges.decommissioning.nudger.not_leaseholder_or_invalid_lease + description: Number of ranges that were not the leaseholder or had an invalid lease at the decommissioning nudger + y_axis_label: Ranges + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: ranges.decommissioning.nudger.process.failure + exported_name: ranges_decommissioning_nudger_process_failure + labeled_name: ranges.decommissioning.nudger.process.failure + description: Number of ranges enqueued by the decommissioning nudger that failed to process by the replicate queue + y_axis_label: Ranges + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE + - name: ranges.decommissioning.nudger.process.success + exported_name: ranges_decommissioning_nudger_process_success + labeled_name: ranges.decommissioning.nudger.process.success + description: Number of ranges enqueued by the decommissioning nudger that were 
successfully processed by the replicate queue + y_axis_label: Ranges + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: ranges.overreplicated exported_name: ranges_overreplicated description: Number of ranges with more live replicas than the replication target @@ -15381,7 +15711,7 @@ layers: description: Number of disk reads per query y_axis_label: Disk Reads per Query type: GAUGE - unit: COUNT + unit: CONST aggregation: AVG derivative: NONE - name: rocksdb.table-readers-mem-estimate @@ -15544,6 +15874,14 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: rpc.method.flushlocktable.recv + exported_name: rpc_method_flushlocktable_recv + description: Number of FlushLockTable requests processed + y_axis_label: RPCs + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: rpc.method.gc.recv exported_name: rpc_method_gc_recv description: Number of GC requests processed @@ -15993,6 +16331,78 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: storage.compression.cr + exported_name: storage_compression_cr + description: Average compression ratio of sstable and blob value data. + y_axis_label: Ratio + type: GAUGE + unit: CONST + aggregation: AVG + derivative: NONE + - name: storage.compression.minlz.bytes + exported_name: storage_compression_minlz_bytes + description: Total on disk size of sstable and blob value data that is compressed with the MinLZ algorithm. + y_axis_label: Bytes + type: GAUGE + unit: CONST + aggregation: AVG + derivative: NONE + - name: storage.compression.minlz.cr + exported_name: storage_compression_minlz_cr + description: Average compression ratio of sstable and blob value data that is compressed with the MinLZ algorithm. 
+ y_axis_label: Ratio + type: GAUGE + unit: CONST + aggregation: AVG + derivative: NONE + - name: storage.compression.none.bytes + exported_name: storage_compression_none_bytes + description: Total on disk size of sstable and blob value data that is not compressed. + y_axis_label: Bytes + type: GAUGE + unit: BYTES + aggregation: AVG + derivative: NONE + - name: storage.compression.snappy.bytes + exported_name: storage_compression_snappy_bytes + description: Total on disk size of sstable and blob value data that is compressed with the Snappy algorithm. + y_axis_label: Bytes + type: GAUGE + unit: BYTES + aggregation: AVG + derivative: NONE + - name: storage.compression.snappy.cr + exported_name: storage_compression_snappy_cr + description: Average compression ratio of sstable and blob value data that is compressed with the snappy algorithm. + y_axis_label: Ratio + type: GAUGE + unit: CONST + aggregation: AVG + derivative: NONE + - name: storage.compression.unknown.bytes + exported_name: storage_compression_unknown_bytes + description: Total on disk size of sstable and blob value data that is compressed but for which we have no compression statistics. + y_axis_label: Bytes + type: GAUGE + unit: BYTES + aggregation: AVG + derivative: NONE + - name: storage.compression.zstd.bytes + exported_name: storage_compression_zstd_bytes + description: Total on disk size of sstable and blob value data that is compressed with the Zstd algorithm. + y_axis_label: Bytes + type: GAUGE + unit: BYTES + aggregation: AVG + derivative: NONE + - name: storage.compression.zstd.cr + exported_name: storage_compression_zstd_cr + description: Average compression ratio of sstable and blob value data that is compressed with the Zstd algorithm. 
+ y_axis_label: Ratio + type: GAUGE + unit: CONST + aggregation: AVG + derivative: NONE - name: storage.disk-slow exported_name: storage_disk_slow description: Number of instances of disk operations taking longer than 10s @@ -16009,6 +16419,14 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: storage.disk-unhealthy.duration + exported_name: storage_disk_unhealthy_duration + description: Total disk unhealthy duration in nanos + y_axis_label: Nanoseconds + type: COUNTER + unit: NANOSECONDS + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: storage.disk.io.time exported_name: storage_disk_io_time description: Time spent reading from or writing to the store's disk since this process started (as reported by the OS) @@ -16033,6 +16451,14 @@ layers: unit: BYTES aggregation: AVG derivative: NONE + - name: storage.disk.read-max.iops + exported_name: storage_disk_read_max_iops + description: Maximum rate of read operations performed on the disk (as reported by the OS) + y_axis_label: Operations + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: NONE - name: storage.disk.read.bytes exported_name: storage_disk_read_bytes description: Bytes read from the store's disk since this process started (as reported by the OS) @@ -16073,6 +16499,14 @@ layers: unit: BYTES aggregation: AVG derivative: NONE + - name: storage.disk.write-max.iops + exported_name: storage_disk_write_max_iops + description: Maximum rate of write operations performed on the disk (as reported by the OS) + y_axis_label: Operations + type: GAUGE + unit: COUNT + aggregation: AVG + derivative: NONE - name: storage.disk.write.bytes exported_name: storage_disk_write_bytes description: Bytes written to the store's disk since this process started (as reported by the OS) @@ -16588,7 +17022,7 @@ layers: description: Compaction score of level 0 y_axis_label: Score type: GAUGE - unit: COUNT + unit: CONST aggregation: AVG derivative: NONE - name: 
storage.l0-level-size @@ -16628,7 +17062,7 @@ layers: description: Compaction score of level 1 y_axis_label: Score type: GAUGE - unit: COUNT + unit: CONST aggregation: AVG derivative: NONE - name: storage.l1-level-size @@ -16652,7 +17086,7 @@ layers: description: Compaction score of level 2 y_axis_label: Score type: GAUGE - unit: COUNT + unit: CONST aggregation: AVG derivative: NONE - name: storage.l2-level-size @@ -16676,7 +17110,7 @@ layers: description: Compaction score of level 3 y_axis_label: Score type: GAUGE - unit: COUNT + unit: CONST aggregation: AVG derivative: NONE - name: storage.l3-level-size @@ -16700,7 +17134,7 @@ layers: description: Compaction score of level 4 y_axis_label: Score type: GAUGE - unit: COUNT + unit: CONST aggregation: AVG derivative: NONE - name: storage.l4-level-size @@ -16724,7 +17158,7 @@ layers: description: Compaction score of level 5 y_axis_label: Score type: GAUGE - unit: COUNT + unit: CONST aggregation: AVG derivative: NONE - name: storage.l5-level-size @@ -16748,7 +17182,7 @@ layers: description: Compaction score of level 6 y_axis_label: Score type: GAUGE - unit: COUNT + unit: CONST aggregation: AVG derivative: NONE - name: storage.l6-level-size @@ -16903,38 +17337,6 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE - - name: storage.sstable.compression.none.count - exported_name: storage_sstable_compression_none_count - description: Count of SSTables that are uncompressed. - y_axis_label: SSTables - type: GAUGE - unit: COUNT - aggregation: AVG - derivative: NONE - - name: storage.sstable.compression.snappy.count - exported_name: storage_sstable_compression_snappy_count - description: Count of SSTables that have been compressed with the snappy compression algorithm. 
- y_axis_label: SSTables - type: GAUGE - unit: COUNT - aggregation: AVG - derivative: NONE - - name: storage.sstable.compression.unknown.count - exported_name: storage_sstable_compression_unknown_count - description: Count of SSTables that have an unknown compression algorithm. - y_axis_label: SSTables - type: GAUGE - unit: COUNT - aggregation: AVG - derivative: NONE - - name: storage.sstable.compression.zstd.count - exported_name: storage_sstable_compression_zstd_count - description: Count of SSTables that have been compressed with the zstd compression algorithm. - y_axis_label: SSTables - type: GAUGE - unit: COUNT - aggregation: AVG - derivative: NONE - name: storage.sstable.remote.bytes exported_name: storage_sstable_remote_bytes description: Bytes in SSTables that are stored off-disk (remotely) in object storage. @@ -16991,9 +17393,17 @@ layers: unit: BYTES aggregation: AVG derivative: NONE + - name: storage.value_separation.value_retrieval.count + exported_name: storage_value_separation_value_retrieval_count + description: The number of value retrievals of values separated into blob files. + y_axis_label: Events + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: storage.wal.bytes_in exported_name: storage_wal_bytes_in - description: The number of logical bytes the storage engine has written to the WAL + description: The number of logical bytes the storage engine has written to the Write-Ahead Log. y_axis_label: Events type: COUNTER unit: COUNT @@ -17001,7 +17411,7 @@ layers: derivative: NON_NEGATIVE_DERIVATIVE - name: storage.wal.bytes_written exported_name: storage_wal_bytes_written - description: The number of bytes the storage engine has written to the WAL + description: The number of bytes the storage engine has written to the Write-Ahead Log. 
y_axis_label: Events type: COUNTER unit: COUNT @@ -17009,20 +17419,22 @@ layers: derivative: NON_NEGATIVE_DERIVATIVE - name: storage.wal.failover.primary.duration exported_name: storage_wal_failover_primary_duration - description: Cumulative time spent writing to the primary WAL directory. Only populated when WAL failover is configured + description: Cumulative time spent writing to the primary WAL directory. y_axis_label: Nanoseconds type: COUNTER unit: NANOSECONDS aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: Only populated when WAL failover is configured. - name: storage.wal.failover.secondary.duration exported_name: storage_wal_failover_secondary_duration - description: Cumulative time spent writing to the secondary WAL directory. Only populated when WAL failover is configured + description: Cumulative time spent writing to the secondary WAL directory. y_axis_label: Nanoseconds type: COUNTER unit: NANOSECONDS aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: Only populated when WAL failover is configured. - name: storage.wal.failover.switch.count exported_name: storage_wal_failover_switch_count description: Count of the number of times WAL writing has switched from primary to secondary and vice versa. @@ -17031,14 +17443,16 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + how_to_use: Only populated when WAL failover is configured. A high switch count indicates that many disk stalls were encountered. - name: storage.wal.failover.write_and_sync.latency exported_name: storage_wal_failover_write_and_sync_latency - description: The observed latency for writing and syncing to the write ahead log. Only populated when WAL failover is configured + description: The observed latency for writing and syncing to the logical Write-Ahead Log. y_axis_label: Nanoseconds type: HISTOGRAM unit: NANOSECONDS aggregation: AVG derivative: NONE + how_to_use: Only populated when WAL failover is configured. 
Without WAL failover, the relevant metric is storage.wal.fsync.latency. - name: storage.write-amplification exported_name: storage_write_amplification description: |- @@ -17204,6 +17618,14 @@ layers: unit: COUNT aggregation: AVG derivative: NON_NEGATIVE_DERIVATIVE + - name: subsume.locks_written + exported_name: subsume_locks_written + description: Number of locks written to storage during subsume (range merge) + y_axis_label: Locks Written + type: COUNTER + unit: COUNT + aggregation: AVG + derivative: NON_NEGATIVE_DERIVATIVE - name: sysbytes exported_name: sysbytes description: Number of bytes in system KV pairs diff --git a/src/current/_includes/v25.3/essential-metrics.md b/src/current/_includes/v25.3/essential-metrics.md index 6f02ba25f05..d9a445fca22 100644 --- a/src/current/_includes/v25.3/essential-metrics.md +++ b/src/current/_includes/v25.3/essential-metrics.md @@ -1,8 +1,6 @@ {% assign version = page.version.version | replace: ".", "" %} {% comment %}DEBUG: {{ version }}{% endcomment %} -These essential CockroachDB metrics let you monitor your CockroachDB {{ site.data.products.core }} cluster. Use them to build custom dashboards with the following tools: - {% comment %} STEP 1. Assign variables specific to deployment {% endcomment %} {% if include.deployment == 'self-hosted' %} {% assign metrics_datadog = site.data[version].metrics.datadog-cockroachdb %} @@ -10,6 +8,8 @@ These essential CockroachDB metrics let you monitor your CockroachDB {{ site.dat {% assign datadog_prefix = "cockroachdb" %} {% assign category_order = "HARDWARE,STORAGE,OVERLOAD,NETWORKING,DISTRIBUTED,REPLICATION,SQL,CHANGEFEEDS,TTL,UNSET," %} +These essential CockroachDB metrics let you monitor your CockroachDB {{ site.data.products.core }} cluster. 
Use them to build custom dashboards with the following tools: + - [Grafana]({% link {{ page.version.version }}/monitor-cockroachdb-with-prometheus.md %}#step-5-visualize-metrics-in-grafana) - [Datadog Integration]({% link {{ page.version.version }}/datadog.md %}): The [**Datadog Integration Metric Name**]({{ datadog_link }}) column lists the corresponding Datadog metric which requires the `{{ datadog_prefix }}.` prefix. @@ -20,6 +20,8 @@ These essential CockroachDB metrics let you monitor your CockroachDB {{ site.dat {% comment %} Removed NETWORKING category for advanced deployment {% endcomment %} {% assign category_order = "HARDWARE,STORAGE,OVERLOAD,DISTRIBUTED,REPLICATION,SQL,CHANGEFEEDS,TTL,UNSET," %} +These essential CockroachDB metrics let you monitor your CockroachDB {{ site.data.products.advanced }} cluster. Use them to build custom dashboards with the following tools: + - [Datadog integration]({% link cockroachcloud/tools-page.md %}#monitor-cockroachdb-cloud-with-datadog) - The [**Datadog Integration Metric Name**]({{ datadog_link }}) column lists the corresponding Datadog metric which requires the `{{ datadog_prefix }}` prefix. - [Metrics export]({% link cockroachcloud/export-metrics-advanced.md %}) diff --git a/src/current/_includes/v25.4/essential-metrics.md b/src/current/_includes/v25.4/essential-metrics.md index 6f02ba25f05..4f4627a9dcb 100644 --- a/src/current/_includes/v25.4/essential-metrics.md +++ b/src/current/_includes/v25.4/essential-metrics.md @@ -1,14 +1,14 @@ {% assign version = page.version.version | replace: ".", "" %} {% comment %}DEBUG: {{ version }}{% endcomment %} -These essential CockroachDB metrics let you monitor your CockroachDB {{ site.data.products.core }} cluster. Use them to build custom dashboards with the following tools: - {% comment %} STEP 1. 
Assign variables specific to deployment {% endcomment %} {% if include.deployment == 'self-hosted' %} {% assign metrics_datadog = site.data[version].metrics.datadog-cockroachdb %} {% assign datadog_link = "https://docs.datadoghq.com/integrations/cockroachdb/?tab=host#metrics" %} {% assign datadog_prefix = "cockroachdb" %} - {% assign category_order = "HARDWARE,STORAGE,OVERLOAD,NETWORKING,DISTRIBUTED,REPLICATION,SQL,CHANGEFEEDS,TTL,UNSET," %} + {% assign category_order = "HARDWARE,STORAGE,OVERLOAD,NETWORKING,DISTRIBUTED,REPLICATION,SQL,CHANGEFEEDS,TTL,CROSS_CLUSTER_REPLICATION,LOGICAL_DATA_REPLICATION,UNSET," %} + +These essential CockroachDB metrics let you monitor your CockroachDB {{ site.data.products.core }} cluster. Use them to build custom dashboards with the following tools: - [Grafana]({% link {{ page.version.version }}/monitor-cockroachdb-with-prometheus.md %}#step-5-visualize-metrics-in-grafana) - [Datadog Integration]({% link {{ page.version.version }}/datadog.md %}): The [**Datadog Integration Metric Name**]({{ datadog_link }}) column lists the corresponding Datadog metric which requires the `{{ datadog_prefix }}.` prefix. @@ -18,7 +18,9 @@ These essential CockroachDB metrics let you monitor your CockroachDB {{ site.dat {% assign datadog_link = "https://docs.datadoghq.com/integrations/cockroach-cloud/#metrics" %} {% assign datadog_prefix = "crdb_dedicated" %} {% comment %} Removed NETWORKING category for advanced deployment {% endcomment %} - {% assign category_order = "HARDWARE,STORAGE,OVERLOAD,DISTRIBUTED,REPLICATION,SQL,CHANGEFEEDS,TTL,UNSET," %} + {% assign category_order = "HARDWARE,STORAGE,OVERLOAD,DISTRIBUTED,REPLICATION,SQL,CHANGEFEEDS,TTL,CROSS_CLUSTER_REPLICATION,LOGICAL_DATA_REPLICATION,UNSET," %} + +These essential CockroachDB metrics let you monitor your CockroachDB {{ site.data.products.advanced }} cluster. 
Use them to build custom dashboards with the following tools: - [Datadog integration]({% link cockroachcloud/tools-page.md %}#monitor-cockroachdb-cloud-with-datadog) - The [**Datadog Integration Metric Name**]({{ datadog_link }}) column lists the corresponding Datadog metric which requires the `{{ datadog_prefix }}` prefix. - [Metrics export]({% link cockroachcloud/export-metrics-advanced.md %}) @@ -56,7 +58,7 @@ The **Usage** column explains why each metric is important to visualize and how {% comment %} Order categories, NOTE: new categories may break this order, however all relevant categories will be displayed though not in the desired order{% endcomment %} {% comment %}DEBUG: category_names_string = {{ category_names_string }}{% endcomment %} -{% assign category_names_string_ordered = category_names_string | replace: "CHANGEFEEDS,DISTRIBUTED,NETWORKING,SQL,TTL,UNSET,HARDWARE,OVERLOAD,REPLICATION,STORAGE,", category_order %} +{% assign category_names_string_ordered = category_names_string | replace: "CHANGEFEEDS,CROSS_CLUSTER_REPLICATION,DISTRIBUTED,LOGICAL_DATA_REPLICATION,NETWORKING,SQL,TTL,UNSET,HARDWARE,OVERLOAD,REPLICATION,STORAGE,", category_order %} {% comment %}DEBUG: category_names_string_ordered = {{ category_names_string_ordered }}{% endcomment %} {% assign category_names_array = category_names_string_ordered | split: "," %} @@ -90,6 +92,8 @@ The **Usage** column explains why each metric is important to visualize and how {% elsif category_name == "REPLICATION" %}{% assign category_display_name = "KV Replication" %} {% elsif category_name == "CHANGEFEEDS" %}{% assign category_display_name = "Changefeeds" %} {% elsif category_name == "TTL" %}{% assign category_display_name = "Row-level TTL" %} + {% elsif category_name == "CROSS_CLUSTER_REPLICATION" %}{% assign category_display_name = "Physical Replication" %} + {% elsif category_name == "LOGICAL_DATA_REPLICATION" %}{% assign category_display_name = "Logical Replication" %} {% else %}{% assign category_display_name
= category_name %}{% comment %} For example, SQL {% endcomment %} {% endif %}