From 68c4f31bc76c479501dac384831bd5441366000b Mon Sep 17 00:00:00 2001 From: Max Inden Date: Mon, 22 Jun 2020 15:57:04 +0200 Subject: [PATCH 1/6] client/network/service: Add primary dimension to connection metrics Two nodes can be interconnected via one or more connections. The first of those connections is called the primary connection. This commit adds another dimension to the `sub_libp2p_connections_{closed,opened}_total` metrics to differentiate primary and non-primary connections being opened / closed. By intuition more than one connection between two nodes is rare. Tracking the fact whether a connection is primary or not will help prove or disprove this intuition. --- client/network/src/service.rs | 45 +++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/client/network/src/service.rs b/client/network/src/service.rs index 93560a6c0b968..1724796352883 100644 --- a/client/network/src/service.rs +++ b/client/network/src/service.rs @@ -889,16 +889,16 @@ impl Metrics { connections_closed_total: register(CounterVec::new( Opts::new( "sub_libp2p_connections_closed_total", - "Total number of connections closed, by reason and direction" + "Total number of connections closed, by direction, reason and by being primary or not" ), - &["direction", "reason"] + &["direction", "reason", "was_primary"] )?, registry)?, connections_opened_total: register(CounterVec::new( Opts::new( "sub_libp2p_connections_opened_total", - "Total number of connections opened" + "Total number of connections opened by direction and by being primary or not" ), - &["direction"] + &["direction", "is_primary"] )?, registry)?, import_queue_blocks_submitted: register(Counter::new( "import_queue_blocks_submitted", @@ -1214,41 +1214,44 @@ impl Future for NetworkWorker { } this.event_streams.send(ev); }, - Poll::Ready(SwarmEvent::ConnectionEstablished { peer_id, endpoint, .. }) => { + Poll::Ready(SwarmEvent::ConnectionEstablished { peer_id, endpoint, num_established }) => { trace!(target: "sub-libp2p", "Libp2p => Connected({:?})", peer_id); + + let is_primary = if num_established.get() == 1 { "true" } else { "false" }; + if let Some(metrics) = this.metrics.as_ref() { match endpoint { ConnectedPoint::Dialer { .. } => - metrics.connections_opened_total.with_label_values(&["out"]).inc(), + metrics.connections_opened_total.with_label_values(&["out", is_primary]).inc(), ConnectedPoint::Listener { .. } => - metrics.connections_opened_total.with_label_values(&["in"]).inc(), + metrics.connections_opened_total.with_label_values(&["in", is_primary]).inc(), } } }, - Poll::Ready(SwarmEvent::ConnectionClosed { peer_id, cause, endpoint, .. }) => { + Poll::Ready(SwarmEvent::ConnectionClosed { peer_id, cause, endpoint, num_established }) => { trace!(target: "sub-libp2p", "Libp2p => Disconnected({:?}, {:?})", peer_id, cause); if let Some(metrics) = this.metrics.as_ref() { - let dir = match endpoint { + let direction = match endpoint { ConnectedPoint::Dialer { .. } => "out", ConnectedPoint::Listener { .. } => "in", }; - match cause { - ConnectionError::IO(_) => - metrics.connections_closed_total.with_label_values(&[dir, "transport-error"]).inc(), + let reason = match cause { + ConnectionError::IO(_) => "transport-error", ConnectionError::Handler(NodeHandlerWrapperError::Handler(EitherError::A(EitherError::A( EitherError::A(EitherError::A(EitherError::B( - EitherError::A(PingFailure::Timeout)))))))) => - metrics.connections_closed_total.with_label_values(&[dir, "ping-timeout"]).inc(), + EitherError::A(PingFailure::Timeout)))))))) => "ping-timeout", ConnectionError::Handler(NodeHandlerWrapperError::Handler(EitherError::A(EitherError::A( EitherError::A(EitherError::A(EitherError::A( - EitherError::B(LegacyConnectionKillError)))))))) => - metrics.connections_closed_total.with_label_values(&[dir, "force-closed"]).inc(), - ConnectionError::Handler(NodeHandlerWrapperError::Handler(_)) => - metrics.connections_closed_total.with_label_values(&[dir, "protocol-error"]).inc(), - ConnectionError::Handler(NodeHandlerWrapperError::KeepAliveTimeout) => - metrics.connections_closed_total.with_label_values(&[dir, "keep-alive-timeout"]).inc(), - } + EitherError::B(LegacyConnectionKillError)))))))) => "force-closed", + ConnectionError::Handler(NodeHandlerWrapperError::Handler(_)) => "protocol-error", + ConnectionError::Handler(NodeHandlerWrapperError::KeepAliveTimeout) => "keep-alive-timeout", + }; + + // `num_established` represents the number of *remaining* connections. + let was_primary = if num_established == 0 { "true" } else { "false" }; + + metrics.connections_closed_total.with_label_values(&[direction, reason, was_primary]).inc(); } }, Poll::Ready(SwarmEvent::NewListenAddr(addr)) => { From 2d2f93e414440b9fc9e8f7fae6fe48bd95af6b8f Mon Sep 17 00:00:00 2001 From: Max Inden Date: Mon, 22 Jun 2020 19:26:14 +0200 Subject: [PATCH 2/6] .maintain/monitoring: Ensure to sum over all connections_closed variants --- .../monitoring/grafana-dashboards/substrate-networking.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.maintain/monitoring/grafana-dashboards/substrate-networking.json b/.maintain/monitoring/grafana-dashboards/substrate-networking.json index f18ca66c13a07..b374a692b9c67 100644 --- a/.maintain/monitoring/grafana-dashboards/substrate-networking.json +++ b/.maintain/monitoring/grafana-dashboards/substrate-networking.json @@ -1790,7 +1790,7 @@ "steppedLine": false, "targets": [ { - "expr": "avg(increase(${metric_namespace}_sub_libp2p_connections_closed_total{instance=~\"${nodename}\"}[$__interval])) by (reason)", + "expr": "avg(sum(rate(${metric_namespace}_sub_libp2p_connections_closed_total{instance=~\"${nodename}\"}[$__interval])) by (instance, reason)) by (reason)", "interval": "", "legendFormat": "{{reason}}", "refId": "A" @@ -2719,4 +2719,4 @@ "list": [] }, "version": 103 -} \ No newline at end of file +} From 2ca29a8b23d81dd6ecce7b2cb0b8d74d1b2e97ee Mon Sep 17 00:00:00 2001 From: Max Inden Date: Tue, 23 Jun 2020 10:53:23 +0200 Subject: [PATCH 3/6] client/network/service: Rename is_primary to is_first --- client/network/src/service.rs | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/client/network/src/service.rs b/client/network/src/service.rs index 1724796352883..621fe60936df8 100644 --- a/client/network/src/service.rs +++ b/client/network/src/service.rs @@ -889,16 +889,16 @@ impl Metrics { connections_closed_total: register(CounterVec::new( Opts::new( "sub_libp2p_connections_closed_total", - "Total number of connections closed, by direction, reason and by being primary or not" + "Total number of connections closed, by direction, reason and by being the first or not" ), - &["direction", "reason", "was_primary"] + &["direction", "reason", "was_first"] )?, registry)?, connections_opened_total: register(CounterVec::new( Opts::new( "sub_libp2p_connections_opened_total", - "Total number of connections opened by direction and by being primary or not" + "Total number of connections opened by direction and by being the first or not" ), - &["direction", "is_primary"] + &["direction", "is_first"] )?, registry)?, import_queue_blocks_submitted: register(Counter::new( "import_queue_blocks_submitted", @@ -1217,15 +1217,14 @@ impl Future for NetworkWorker { Poll::Ready(SwarmEvent::ConnectionEstablished { peer_id, endpoint, num_established }) => { trace!(target: "sub-libp2p", "Libp2p => Connected({:?})", peer_id); - let is_primary = if num_established.get() == 1 { "true" } else { "false" }; - if let Some(metrics) = this.metrics.as_ref() { - match endpoint { - ConnectedPoint::Dialer { .. } => - metrics.connections_opened_total.with_label_values(&["out", is_primary]).inc(), - ConnectedPoint::Listener { .. } => - metrics.connections_opened_total.with_label_values(&["in", is_primary]).inc(), - } + let direction = match endpoint { + ConnectedPoint::Dialer { .. } => "out", + ConnectedPoint::Listener { .. } => "in", + }; + let is_first = if num_established.get() == 1 { "true" } else { "false" }; + + metrics.connections_opened_total.with_label_values(&[direction, is_first]).inc(); } }, Poll::Ready(SwarmEvent::ConnectionClosed { peer_id, cause, endpoint, num_established }) => { @@ -1249,9 +1248,9 @@ impl Future for NetworkWorker { }; // `num_established` represents the number of *remaining* connections. - let was_primary = if num_established == 0 { "true" } else { "false" }; + let was_first = if num_established == 0 { "true" } else { "false" }; - metrics.connections_closed_total.with_label_values(&[direction, reason, was_primary]).inc(); + metrics.connections_closed_total.with_label_values(&[direction, reason, was_first]).inc(); } }, Poll::Ready(SwarmEvent::NewListenAddr(addr)) => { From b03b12dce27db5af886bd712cdb1a764b0e56d24 Mon Sep 17 00:00:00 2001 From: Max Inden Date: Tue, 23 Jun 2020 11:56:13 +0200 Subject: [PATCH 4/6] client/network/service: Split by metric name with two additional metrics --- client/network/src/service.rs | 39 +++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/client/network/src/service.rs b/client/network/src/service.rs index 621fe60936df8..d7ea780b7167c 100644 --- a/client/network/src/service.rs +++ b/client/network/src/service.rs @@ -854,6 +854,8 @@ struct Metrics { // This list is ordered alphabetically connections_closed_total: CounterVec, connections_opened_total: CounterVec, + distinct_peers_connections_closed_total: CounterVec, + distinct_peers_connections_opened_total: CounterVec, import_queue_blocks_submitted: Counter, import_queue_finality_proofs_submitted: Counter, import_queue_justifications_submitted: Counter, @@ -889,16 +891,30 @@ impl Metrics { connections_closed_total: register(CounterVec::new( Opts::new( "sub_libp2p_connections_closed_total", - "Total number of connections closed, by direction, reason and by being the first or not" + "Total number of connections closed, by direction and reason" ), - &["direction", "reason", "was_first"] + &["direction", "reason"] )?, registry)?, connections_opened_total: register(CounterVec::new( Opts::new( "sub_libp2p_connections_opened_total", - "Total number of connections opened by direction and by being the first or not" + "Total number of connections opened by direction" ), - &["direction", "is_first"] + &["direction"] + )?, registry)?, + distinct_peers_connections_closed_total: register(CounterVec::new( + Opts::new( + "sub_libp2p_distinct_peers_connections_closed_total", + "Total number of connections closed with distinct peers, by direction and reason" + ), + &["direction", "reason"] + )?, registry)?, + distinct_peers_connections_opened_total: register(CounterVec::new( + Opts::new( + "sub_libp2p_distinct_peers_connections_opened_total", + "Total number of connections opened with distinct peers by direction" + ), + &["direction"] )?, registry)?, import_queue_blocks_submitted: register(Counter::new( "import_queue_blocks_submitted", @@ -1222,9 +1238,12 @@ impl Future for NetworkWorker { ConnectedPoint::Dialer { .. } => "out", ConnectedPoint::Listener { .. } => "in", }; - let is_first = if num_established.get() == 1 { "true" } else { "false" }; - metrics.connections_opened_total.with_label_values(&[direction, is_first]).inc(); + metrics.connections_opened_total.with_label_values(&[direction]).inc(); + + if num_established.get() == 1 { + metrics.distinct_peers_connections_opened_total.with_label_values(&[direction]).inc(); + } } }, Poll::Ready(SwarmEvent::ConnectionClosed { peer_id, cause, endpoint, num_established }) => { @@ -1247,10 +1266,12 @@ impl Future for NetworkWorker { ConnectionError::Handler(NodeHandlerWrapperError::KeepAliveTimeout) => "keep-alive-timeout", }; - // `num_established` represents the number of *remaining* connections. - let was_first = if num_established == 0 { "true" } else { "false" }; + metrics.connections_closed_total.with_label_values(&[direction, reason]).inc(); - metrics.connections_closed_total.with_label_values(&[direction, reason, was_first]).inc(); + // `num_established` represents the number of *remaining* connections. + if num_established == 0 { + metrics.distinct_peers_connections_closed_total.with_label_values(&[direction, reason]).inc(); + } } }, Poll::Ready(SwarmEvent::NewListenAddr(addr)) => { From 9f3f33e0240c08224cc4bad0dde8091116864e1e Mon Sep 17 00:00:00 2001 From: Max Inden Date: Tue, 23 Jun 2020 12:02:56 +0200 Subject: [PATCH 5/6] Revert ".maintain/monitoring: Ensure to sum over all connections_closed variants" This reverts commit 2d2f93e414440b9fc9e8f7fae6fe48bd95af6b8f. --- .../monitoring/grafana-dashboards/substrate-networking.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.maintain/monitoring/grafana-dashboards/substrate-networking.json b/.maintain/monitoring/grafana-dashboards/substrate-networking.json index b374a692b9c67..f18ca66c13a07 100644 --- a/.maintain/monitoring/grafana-dashboards/substrate-networking.json +++ b/.maintain/monitoring/grafana-dashboards/substrate-networking.json @@ -1790,7 +1790,7 @@ "steppedLine": false, "targets": [ { - "expr": "avg(sum(rate(${metric_namespace}_sub_libp2p_connections_closed_total{instance=~\"${nodename}\"}[$__interval])) by (instance, reason)) by (reason)", + "expr": "avg(increase(${metric_namespace}_sub_libp2p_connections_closed_total{instance=~\"${nodename}\"}[$__interval])) by (reason)", "interval": "", "legendFormat": "{{reason}}", "refId": "A" @@ -2719,4 +2719,4 @@ "list": [] }, "version": 103 -} +} \ No newline at end of file From 779a06ebf1af7727925690e77fbef40b3ce1118c Mon Sep 17 00:00:00 2001 From: Max Inden Date: Tue, 23 Jun 2020 14:13:34 +0200 Subject: [PATCH 6/6] client/network/service: Remove labels from distinct metrics --- client/network/src/service.rs | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/client/network/src/service.rs b/client/network/src/service.rs index d7ea780b7167c..8cce3367f79ec 100644 --- a/client/network/src/service.rs +++ b/client/network/src/service.rs @@ -854,8 +854,8 @@ struct Metrics { // This list is ordered alphabetically connections_closed_total: CounterVec, connections_opened_total: CounterVec, - distinct_peers_connections_closed_total: CounterVec, - distinct_peers_connections_opened_total: CounterVec, + distinct_peers_connections_closed_total: Counter, + distinct_peers_connections_opened_total: Counter, import_queue_blocks_submitted: Counter, import_queue_finality_proofs_submitted: Counter, import_queue_justifications_submitted: Counter, @@ -902,19 +902,13 @@ impl Metrics { ), &["direction"] )?, registry)?, - distinct_peers_connections_closed_total: register(CounterVec::new( - Opts::new( + distinct_peers_connections_closed_total: register(Counter::new( "sub_libp2p_distinct_peers_connections_closed_total", - "Total number of connections closed with distinct peers, by direction and reason" - ), - &["direction", "reason"] + "Total number of connections closed with distinct peers" )?, registry)?, - distinct_peers_connections_opened_total: register(CounterVec::new( - Opts::new( + distinct_peers_connections_opened_total: register(Counter::new( "sub_libp2p_distinct_peers_connections_opened_total", - "Total number of connections opened with distinct peers by direction" - ), - &["direction"] + "Total number of connections opened with distinct peers" )?, registry)?, import_queue_blocks_submitted: register(Counter::new( "import_queue_blocks_submitted", @@ -1238,11 +1232,10 @@ impl Future for NetworkWorker { ConnectedPoint::Dialer { .. } => "out", ConnectedPoint::Listener { .. } => "in", }; - metrics.connections_opened_total.with_label_values(&[direction]).inc(); if num_established.get() == 1 { - metrics.distinct_peers_connections_opened_total.with_label_values(&[direction]).inc(); + metrics.distinct_peers_connections_opened_total.inc(); } } }, @@ -1253,7 +1246,6 @@ impl Future for NetworkWorker { ConnectedPoint::Dialer { .. } => "out", ConnectedPoint::Listener { .. } => "in", }; - let reason = match cause { ConnectionError::IO(_) => "transport-error", ConnectionError::Handler(NodeHandlerWrapperError::Handler(EitherError::A(EitherError::A( @@ -1265,12 +1257,11 @@ impl Future for NetworkWorker { ConnectionError::Handler(NodeHandlerWrapperError::Handler(_)) => "protocol-error", ConnectionError::Handler(NodeHandlerWrapperError::KeepAliveTimeout) => "keep-alive-timeout", }; - metrics.connections_closed_total.with_label_values(&[direction, reason]).inc(); // `num_established` represents the number of *remaining* connections. if num_established == 0 { - metrics.distinct_peers_connections_closed_total.with_label_values(&[direction, reason]).inc(); + metrics.distinct_peers_connections_closed_total.inc(); } } },