From 3b1f84e8f2f6aa736f3c625e42926cb1e0c25381 Mon Sep 17 00:00:00 2001 From: Aaron Davidson Date: Tue, 16 Dec 2014 11:53:05 -0800 Subject: [PATCH 1/2] [SPARK-4864] Add documentation to Netty-based configs --- docs/configuration.md | 53 +++++++++++++++++++ .../spark/network/util/TransportConf.java | 2 +- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/docs/configuration.md b/docs/configuration.md index 64aa94f622af..0c6159407cbf 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -852,6 +852,59 @@ Apart from these, the following properties are also available, and may be useful between nodes leading to flooding the network with those. + + spark.shuffle.io.preferDirectBufs + true + + (Netty only) Off-heap buffers are used to reduce garbage collection during shuffle and cache + block transfer. For environments where off-heap memory is tightly limited, users may wish to + turn this off to force all allocations from Netty to be on-heap. + + + + spark.shuffle.io.numConnectionsPerPeer + 1 + + (Netty only) Connections between hosts are reused in order to reduce connection buildup for + large clusters. For small clusters with many hard disks, this may result in insufficient + concurrency to saturate all disks, and so users may consider increasing this value. + + + + spark.shuffle.io.serverThreads + min(num-cores, 8) + + (Netty only) Size of fixed pool of threads to make requests. As each thread gets 16 MB of + off-heap buffer space initially, the default is limited to 8 cores to avoid excessive off-heap + allocation. It has been experimentally determined that this default should be sufficient to + saturate a 10Gb/s network, but in some cases this may have to be adjusted. + + + + spark.shuffle.io.clientThreads + min(num-cores, 8) + + (Netty only) Size of fixed pool of threads to make requests. This has the same properties as + spark.shuffle.io.serverThreads, above. 
+ + + + spark.shuffle.io.maxRetries + 3 + + (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is + set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC + pauses or transient network connectivity issues. + + + + spark.shuffle.io.retryWait + 5 + + (Netty only) Seconds to wait between retries of fetches. The maximum delay caused by retrying + is simply maxRetries * retryWait, by default 15 seconds. + + #### Scheduling diff --git a/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java b/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java index 13b37f96f8ce..7c9adf52af0f 100644 --- a/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java +++ b/network/common/src/main/java/org/apache/spark/network/util/TransportConf.java @@ -40,7 +40,7 @@ public int connectionTimeoutMs() { return conf.getInt("spark.shuffle.io.connectionTimeout", 120) * 1000; } - /** Number of concurrent connections between two nodes for fetching data. **/ + /** Number of concurrent connections between two nodes for fetching data. */ public int numConnectionsPerPeer() { return conf.getInt("spark.shuffle.io.numConnectionsPerPeer", 1); } From 8a8b3739d1b357fc2f993c43d6cbdba9d306d27c Mon Sep 17 00:00:00 2001 From: Aaron Davidson Date: Wed, 17 Dec 2014 14:35:08 -0800 Subject: [PATCH 2/2] Address Patrick's comments --- docs/configuration.md | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 0c6159407cbf..110fdb90ab42 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -866,33 +866,15 @@ Apart from these, the following properties are also available, and may be useful 1 (Netty only) Connections between hosts are reused in order to reduce connection buildup for - large clusters. 
For small clusters with many hard disks, this may result in insufficient
+ large clusters. For clusters with many hard disks and few hosts, this may result in insufficient
  concurrency to saturate all disks, and so users may consider increasing this value.

-
- spark.shuffle.io.serverThreads
- min(num-cores, 8)
-
- (Netty only) Size of fixed pool of threads to make requests. As each thread gets 16 MB of
- off-heap buffer space initially, the default is limited to 8 cores to avoid excessive off-heap
- allocation. It has been experimentally determined that this default should be sufficient to
- saturate a 10Gb/s network, but in some cases this may have to be adjusted.
-
-
-
- spark.shuffle.io.clientThreads
- min(num-cores, 8)
-
- (Netty only) Size of fixed pool of threads to make requests. This has the same properties as
- spark.shuffle.io.serverThreads, above.
-
-
  spark.shuffle.io.maxRetries
  3

- (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is
+ (Netty only) Fetches that fail due to IO-related exceptions are automatically retried if this is
  set to a non-zero value. This retry logic helps stabilize large shuffles in the face of long GC
  pauses or transient network connectivity issues.