Skip to content

Commit 54e0370

Browse files
authored
Track histogram of transport handling times (#80581)
Adds to the transport node stats a record of the distribution of the times for which a transport thread was handling a message, represented as a histogram. Closes #80428
1 parent 3d0c9ef commit 54e0370

File tree

15 files changed

+393
-19
lines changed

15 files changed

+393
-19
lines changed

docs/reference/cluster/nodes-stats.asciidoc

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1899,6 +1899,54 @@ Size of TX packets sent by the node during internal cluster communication.
18991899
(integer)
19001900
Size, in bytes, of TX packets sent by the node during internal cluster
19011901
communication.
1902+
1903+
`inbound_handling_time_histogram`::
1904+
(array)
1905+
The distribution of the time spent handling each inbound message on a transport
1906+
thread, represented as a histogram.
1907+
+
1908+
.Properties of `inbound_handling_time_histogram`
1909+
[%collapsible]
1910+
=======
1911+
`ge_millis`::
1912+
(integer)
1913+
The inclusive lower bound of the bucket in milliseconds. Omitted on the first
1914+
bucket since this bucket has no lower bound.
1915+
1916+
`lt_millis`::
1917+
(integer)
1918+
The exclusive upper bound of the bucket in milliseconds. Omitted on the last
1919+
bucket since this bucket has no upper bound.
1920+
1921+
`count`::
1922+
(integer)
1923+
The number of times a transport thread took a period of time within the bounds
1924+
of this bucket to handle an inbound message.
1925+
=======
1926+
1927+
`outbound_handling_time_histogram`::
1928+
(array)
1929+
The distribution of the time spent sending each outbound transport message on a
1930+
transport thread, represented as a histogram.
1931+
+
1932+
.Properties of `outbound_handling_time_histogram`
1933+
[%collapsible]
1934+
=======
1935+
`ge_millis`::
1936+
(integer)
1937+
The inclusive lower bound of the bucket in milliseconds. Omitted on the first
1938+
bucket since this bucket has no lower bound.
1939+
1940+
`lt_millis`::
1941+
(integer)
1942+
The exclusive upper bound of the bucket in milliseconds. Omitted on the last
1943+
bucket since this bucket has no upper bound.
1944+
1945+
`count`::
1946+
(integer)
1947+
The number of times a transport thread took a period of time within the bounds
1948+
of this bucket to send a transport message.
1949+
=======
19021950
======
19031951

19041952
[[cluster-nodes-stats-api-response-body-http]]

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/nodes.stats/60_transport_stats.yml

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,48 @@
2020
- gte: { nodes.$node_id.transport.tx_count: 0 }
2121
- gte: { nodes.$node_id.transport.rx_size_in_bytes: 0 }
2222
- gte: { nodes.$node_id.transport.tx_size_in_bytes: 0 }
23+
24+
---
25+
"Transport handling time histogram":
26+
- skip:
27+
version: " - 8.0.99"
28+
reason: "handling_time_histograms were added in 8.1"
29+
features: [arbitrary_key]
30+
31+
- do:
32+
nodes.info: {}
33+
- set:
34+
nodes._arbitrary_key_: node_id
35+
36+
- do:
37+
nodes.stats:
38+
metric: [ transport ]
39+
40+
- length: { nodes.$node_id.transport.inbound_handling_time_histogram: 18 }
41+
42+
- gte: { nodes.$node_id.transport.inbound_handling_time_histogram.0.count: 0 }
43+
- is_false: nodes.$node_id.transport.inbound_handling_time_histogram.0.ge_millis
44+
- match: { nodes.$node_id.transport.inbound_handling_time_histogram.0.lt_millis: 1 }
45+
46+
- gte: { nodes.$node_id.transport.inbound_handling_time_histogram.1.count: 0 }
47+
- match: { nodes.$node_id.transport.inbound_handling_time_histogram.1.ge_millis: 1 }
48+
- match: { nodes.$node_id.transport.inbound_handling_time_histogram.1.lt_millis: 2 }
49+
50+
- gte: { nodes.$node_id.transport.inbound_handling_time_histogram.17.count: 0 }
51+
- match: { nodes.$node_id.transport.inbound_handling_time_histogram.17.ge_millis: 65536 }
52+
- is_false: nodes.$node_id.transport.inbound_handling_time_histogram.17.lt_millis
53+
54+
55+
- length: { nodes.$node_id.transport.outbound_handling_time_histogram: 18 }
56+
57+
- gte: { nodes.$node_id.transport.outbound_handling_time_histogram.0.count: 0 }
58+
- is_false: nodes.$node_id.transport.outbound_handling_time_histogram.0.ge_millis
59+
- match: { nodes.$node_id.transport.outbound_handling_time_histogram.0.lt_millis: 1 }
60+
61+
- gte: { nodes.$node_id.transport.outbound_handling_time_histogram.1.count: 0 }
62+
- match: { nodes.$node_id.transport.outbound_handling_time_histogram.1.ge_millis: 1 }
63+
- match: { nodes.$node_id.transport.outbound_handling_time_histogram.1.lt_millis: 2 }
64+
65+
- gte: { nodes.$node_id.transport.outbound_handling_time_histogram.17.count: 0 }
66+
- match: { nodes.$node_id.transport.outbound_handling_time_histogram.17.ge_millis: 65536 }
67+
- is_false: nodes.$node_id.transport.outbound_handling_time_histogram.17.lt_millis

server/src/main/java/org/elasticsearch/common/network/HandlingTimeTracker.java

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0 and the Server Side Public License, v 1; you may not use this file except
5+
* in compliance with, at your election, the Elastic License 2.0 or the Server
6+
* Side Public License, v 1.
7+
*/
8+
9+
package org.elasticsearch.common.network;
10+
11+
import java.util.concurrent.atomic.LongAdder;
12+
13+
/**
14+
* Tracks how long message handling takes on a transport thread as a histogram with fixed buckets.
15+
*/
16+
public class HandlingTimeTracker {

    /**
     * Total number of histogram buckets: one for each upper bound returned by {@link #getBucketUpperBounds()} plus one trailing bucket
     * for handling times at or above the largest bound.
     */
    public static final int BUCKET_COUNT = getBucketUpperBounds().length + 1;

    /**
     * The largest bucket upper bound; any handling time greater than or equal to this value is counted in the last bucket.
     */
    private static final long LAST_BUCKET_LOWER_BOUND = getBucketUpperBounds()[BUCKET_COUNT - 2];

    /** One counter per bucket, indexed by the value computed in {@link #getBucket(long)}. */
    private final LongAdder[] buckets = new LongAdder[BUCKET_COUNT];

    /**
     * Creates a tracker whose buckets all start with a count of zero.
     */
    public HandlingTimeTracker() {
        int slot = 0;
        while (slot < buckets.length) {
            buckets[slot++] = new LongAdder();
        }
    }

    /**
     * @return A freshly-allocated array holding the exclusive upper bound, in milliseconds, of every bucket except the last one. The
     *         bounds are the successive powers of two {@code 1, 2, 4, ..., 65536}.
     */
    public static int[] getBucketUpperBounds() {
        final int[] upperBounds = new int[17];
        int bound = 1;
        for (int idx = 0; idx < upperBounds.length; idx++) {
            upperBounds[idx] = bound;
            bound <<= 1;
        }
        return upperBounds;
    }

    /**
     * Maps a handling time onto the index of the bucket that counts it.
     *
     * @param handlingTimeMillis the observed handling time in milliseconds
     * @return the bucket index, between {@code 0} and {@code BUCKET_COUNT - 1} inclusive
     */
    private static int getBucket(long handlingTimeMillis) {
        // Zero and negative durations all land in the first bucket.
        if (handlingTimeMillis <= 0) {
            return 0;
        }
        // Anything at or beyond the largest bound lands in the final bucket.
        if (handlingTimeMillis >= LAST_BUCKET_LOWER_BOUND) {
            return BUCKET_COUNT - 1;
        }
        // Otherwise the index is the position of the highest set bit: 1 -> 1, 2..3 -> 2, 4..7 -> 3, and so on.
        final int significantBits = Long.SIZE - Long.numberOfLeadingZeros(handlingTimeMillis);
        return significantBits;
    }

    /**
     * Records one observation by incrementing the counter of the bucket that contains the given handling time.
     *
     * @param handlingTimeMillis the observed handling time in milliseconds
     */
    public void addHandlingTime(long handlingTimeMillis) {
        final int bucketIndex = getBucket(handlingTimeMillis);
        buckets[bucketIndex].increment();
    }

    /**
     * @return An array of frequencies of handling times in buckets with upper bounds as returned by {@link #getBucketUpperBounds()}, plus
     *         an extra bucket for handling times longer than the longest upper bound.
     */
    public long[] getHistogram() {
        final long[] counts = new long[buckets.length];
        int position = 0;
        for (final LongAdder bucket : buckets) {
            counts[position++] = bucket.sum();
        }
        return counts;
    }

}

server/src/main/java/org/elasticsearch/common/network/NetworkService.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,16 @@ public interface CustomNameResolver {
9090
}
9191

9292
private final List<CustomNameResolver> customNameResolvers;
93+
private final HandlingTimeTracker handlingTimeTracker = new HandlingTimeTracker();
9394

9495
public NetworkService(List<CustomNameResolver> customNameResolvers) {
9596
this.customNameResolvers = Objects.requireNonNull(customNameResolvers, "customNameResolvers must be non null");
9697
}
9798

99+
public HandlingTimeTracker getHandlingTimeTracker() {
100+
return handlingTimeTracker;
101+
}
102+
98103
/**
99104
* Resolves {@code bindHosts} to a list of internet addresses. The list will
100105
* not contain duplicate addresses.

server/src/main/java/org/elasticsearch/http/AbstractHttpServerTransport.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -355,11 +355,12 @@ protected void serverAcceptedChannel(HttpChannel httpChannel) {
355355
*/
356356
public void incomingRequest(final HttpRequest httpRequest, final HttpChannel httpChannel) {
357357
httpClientStatsTracker.updateClientStats(httpRequest, httpChannel);
358-
final long startTime = threadPool.relativeTimeInMillis();
358+
final long startTime = threadPool.rawRelativeTimeInMillis();
359359
try {
360360
handleIncomingRequest(httpRequest, httpChannel, httpRequest.getInboundException());
361361
} finally {
362-
final long took = threadPool.relativeTimeInMillis() - startTime;
362+
final long took = threadPool.rawRelativeTimeInMillis() - startTime;
363+
networkService.getHandlingTimeTracker().addHandlingTime(took);
363364
final long logThreshold = slowLogThresholdMs;
364365
if (logThreshold > 0 && took > logThreshold) {
365366
logger.warn(

server/src/main/java/org/elasticsearch/transport/InboundHandler.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
import org.elasticsearch.common.io.stream.NamedWriteableAwareStreamInput;
1818
import org.elasticsearch.common.io.stream.NamedWriteableRegistry;
1919
import org.elasticsearch.common.io.stream.StreamInput;
20+
import org.elasticsearch.common.network.HandlingTimeTracker;
2021
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
2122
import org.elasticsearch.common.util.concurrent.ThreadContext;
2223
import org.elasticsearch.core.TimeValue;
@@ -40,6 +41,7 @@ public class InboundHandler {
4041
private final TransportHandshaker handshaker;
4142
private final TransportKeepAlive keepAlive;
4243
private final Transport.ResponseHandlers responseHandlers;
44+
private final HandlingTimeTracker handlingTimeTracker;
4345
private final Transport.RequestHandlers requestHandlers;
4446

4547
private volatile TransportMessageListener messageListener = TransportMessageListener.NOOP_LISTENER;
@@ -53,7 +55,8 @@ public class InboundHandler {
5355
TransportHandshaker handshaker,
5456
TransportKeepAlive keepAlive,
5557
Transport.RequestHandlers requestHandlers,
56-
Transport.ResponseHandlers responseHandlers
58+
Transport.ResponseHandlers responseHandlers,
59+
HandlingTimeTracker handlingTimeTracker
5760
) {
5861
this.threadPool = threadPool;
5962
this.outboundHandler = outboundHandler;
@@ -62,6 +65,7 @@ public class InboundHandler {
6265
this.keepAlive = keepAlive;
6366
this.requestHandlers = requestHandlers;
6467
this.responseHandlers = responseHandlers;
68+
this.handlingTimeTracker = handlingTimeTracker;
6569
}
6670

6771
void setMessageListener(TransportMessageListener listener) {
@@ -77,7 +81,7 @@ void setSlowLogThreshold(TimeValue slowLogThreshold) {
7781
}
7882

7983
void inboundMessage(TcpChannel channel, InboundMessage message) throws Exception {
80-
final long startTime = threadPool.relativeTimeInMillis();
84+
final long startTime = threadPool.rawRelativeTimeInMillis();
8185
channel.getChannelStats().markAccessed(startTime);
8286
TransportLogger.logInboundMessage(channel, message);
8387

@@ -155,7 +159,8 @@ private void messageReceived(TcpChannel channel, InboundMessage message, long st
155159
}
156160
}
157161
} finally {
158-
final long took = threadPool.relativeTimeInMillis() - startTime;
162+
final long took = threadPool.rawRelativeTimeInMillis() - startTime;
163+
handlingTimeTracker.addHandlingTime(took);
159164
final long logThreshold = slowLogThresholdMs;
160165
if (logThreshold > 0 && took > logThreshold) {
161166
if (isRequest) {

server/src/main/java/org/elasticsearch/transport/OutboundHandler.java

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import org.elasticsearch.common.bytes.BytesReference;
2020
import org.elasticsearch.common.io.stream.RecyclerBytesStreamOutput;
2121
import org.elasticsearch.common.network.CloseableChannel;
22+
import org.elasticsearch.common.network.HandlingTimeTracker;
2223
import org.elasticsearch.common.recycler.Recycler;
2324
import org.elasticsearch.common.transport.NetworkExceptionHelper;
2425
import org.elasticsearch.common.util.concurrent.ThreadContext;
@@ -37,17 +38,26 @@ final class OutboundHandler {
3738
private final StatsTracker statsTracker;
3839
private final ThreadPool threadPool;
3940
private final Recycler<BytesRef> recycler;
41+
private final HandlingTimeTracker handlingTimeTracker;
4042

4143
private volatile long slowLogThresholdMs = Long.MAX_VALUE;
4244

4345
private volatile TransportMessageListener messageListener = TransportMessageListener.NOOP_LISTENER;
4446

45-
OutboundHandler(String nodeName, Version version, StatsTracker statsTracker, ThreadPool threadPool, Recycler<BytesRef> recycler) {
47+
OutboundHandler(
48+
String nodeName,
49+
Version version,
50+
StatsTracker statsTracker,
51+
ThreadPool threadPool,
52+
Recycler<BytesRef> recycler,
53+
HandlingTimeTracker handlingTimeTracker
54+
) {
4655
this.nodeName = nodeName;
4756
this.version = version;
4857
this.statsTracker = statsTracker;
4958
this.threadPool = threadPool;
5059
this.recycler = recycler;
60+
this.handlingTimeTracker = handlingTimeTracker;
5161
}
5262

5363
void setSlowLogThreshold(TimeValue slowLogThreshold) {
@@ -168,7 +178,7 @@ private void internalSend(
168178
@Nullable OutboundMessage message,
169179
ActionListener<Void> listener
170180
) {
171-
final long startTime = threadPool.relativeTimeInMillis();
181+
final long startTime = threadPool.rawRelativeTimeInMillis();
172182
channel.getChannelStats().markAccessed(startTime);
173183
final long messageSize = reference.length();
174184
TransportLogger.logOutboundMessage(channel, reference);
@@ -196,7 +206,8 @@ public void onFailure(Exception e) {
196206
private void maybeLogSlowMessage(boolean success) {
197207
final long logThreshold = slowLogThresholdMs;
198208
if (logThreshold > 0) {
199-
final long took = threadPool.relativeTimeInMillis() - startTime;
209+
final long took = threadPool.rawRelativeTimeInMillis() - startTime;
210+
handlingTimeTracker.addHandlingTime(took);
200211
if (took > logThreshold) {
201212
logger.warn(
202213
"sending transport message [{}] of size [{}] on [{}] took [{}ms] which is above the warn "

server/src/main/java/org/elasticsearch/transport/TcpTransport.java

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import org.elasticsearch.common.io.stream.RecyclerBytesStreamOutput;
3030
import org.elasticsearch.common.io.stream.StreamInput;
3131
import org.elasticsearch.common.network.CloseableChannel;
32+
import org.elasticsearch.common.network.HandlingTimeTracker;
3233
import org.elasticsearch.common.network.NetworkAddress;
3334
import org.elasticsearch.common.network.NetworkService;
3435
import org.elasticsearch.common.network.NetworkUtils;
@@ -116,6 +117,7 @@ public abstract class TcpTransport extends AbstractLifecycleComponent implements
116117

117118
private final TransportHandshaker handshaker;
118119
private final TransportKeepAlive keepAlive;
120+
private final HandlingTimeTracker outboundHandlingTimeTracker = new HandlingTimeTracker();
119121
private final OutboundHandler outboundHandler;
120122
private final InboundHandler inboundHandler;
121123
private final ResponseHandlers responseHandlers = new ResponseHandlers();
@@ -141,7 +143,7 @@ public TcpTransport(
141143
String nodeName = Node.NODE_NAME_SETTING.get(settings);
142144

143145
this.recycler = createRecycler(settings, pageCacheRecycler);
144-
this.outboundHandler = new OutboundHandler(nodeName, version, statsTracker, threadPool, recycler);
146+
this.outboundHandler = new OutboundHandler(nodeName, version, statsTracker, threadPool, recycler, outboundHandlingTimeTracker);
145147
this.handshaker = new TransportHandshaker(
146148
version,
147149
threadPool,
@@ -165,7 +167,8 @@ public TcpTransport(
165167
handshaker,
166168
keepAlive,
167169
requestHandlers,
168-
responseHandlers
170+
responseHandlers,
171+
networkService.getHandlingTimeTracker()
169172
);
170173
}
171174

@@ -918,7 +921,9 @@ public final TransportStats getStats() {
918921
messagesReceived,
919922
bytesRead,
920923
messagesSent,
921-
bytesWritten
924+
bytesWritten,
925+
networkService.getHandlingTimeTracker().getHistogram(),
926+
outboundHandlingTimeTracker.getHistogram()
922927
);
923928
}
924929

0 commit comments

Comments
 (0)