From 04291808875013f63037767c52e52dcfd250fd63 Mon Sep 17 00:00:00 2001 From: Ankit Saurabh Date: Fri, 4 Nov 2022 15:24:44 +0000 Subject: [PATCH 1/6] Made the lower limit to 1 --- .../src/main/java/org/apache/hadoop/fs/s3a/Constants.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index e5b0a9b5aa163..aba1c1893d5bc 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -1248,7 +1248,7 @@ private Constants() { * The size of a single prefetched block in number of bytes. */ public static final String PREFETCH_BLOCK_SIZE_KEY = "fs.s3a.prefetch.block.size"; - public static final int PREFETCH_BLOCK_DEFAULT_SIZE = 8 * 1024 * 1024; + public static final int PREFETCH_BLOCK_DEFAULT_SIZE = 1; /** * Maximum number of blocks prefetched at any given time. From cdbed32f24ceb70295fd01452f3c6648779a698b Mon Sep 17 00:00:00 2001 From: Ankit Saurabh Date: Tue, 8 Nov 2022 10:45:20 +0000 Subject: [PATCH 2/6] Lowered the Prefetch Limit, unchanged the Default size --- .../src/main/java/org/apache/hadoop/fs/s3a/Constants.java | 2 +- .../src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java index aba1c1893d5bc..e5b0a9b5aa163 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java @@ -1248,7 +1248,7 @@ private Constants() { * The size of a single prefetched block in number of bytes. */ public static final String PREFETCH_BLOCK_SIZE_KEY = "fs.s3a.prefetch.block.size"; - public static final int PREFETCH_BLOCK_DEFAULT_SIZE = 1; + public static final int PREFETCH_BLOCK_DEFAULT_SIZE = 8 * 1024 * 1024; /** * Maximum number of blocks prefetched at any given time. diff --git a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java index 3e6f2322d3b00..30abaa6211282 100644 --- a/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java +++ b/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java @@ -525,8 +525,7 @@ public void initialize(URI name, Configuration originalConf) this.prefetchEnabled = conf.getBoolean(PREFETCH_ENABLED_KEY, PREFETCH_ENABLED_DEFAULT); long prefetchBlockSizeLong = - longBytesOption(conf, PREFETCH_BLOCK_SIZE_KEY, PREFETCH_BLOCK_DEFAULT_SIZE, - PREFETCH_BLOCK_DEFAULT_SIZE); + longBytesOption(conf, PREFETCH_BLOCK_SIZE_KEY, PREFETCH_BLOCK_DEFAULT_SIZE, 1); if (prefetchBlockSizeLong > (long) Integer.MAX_VALUE) { throw new IOException("S3A prefatch block size exceeds int limit"); } From 563f0ff20f8da74346f3129811ccc75768149e87 Mon Sep 17 00:00:00 2001 From: Ankit Saurabh Date: Tue, 8 Nov 2022 16:43:21 +0000 Subject: [PATCH 3/6] Added the lower limit information in the Prefetch Readme --- .../src/site/markdown/tools/hadoop-aws/prefetching.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md index e966c2dce4cb3..28fce819e00b9 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md @@ -43,6 +43,11 @@ Multiple blocks may be read in parallel. |`fs.s3a.prefetch.block.size` |Size of a block |`8M` | |`fs.s3a.prefetch.block.count` |Number of blocks to prefetch |`8` | +Although, default size of the block for prefetching the input stream is 8 MB, minimum size allowed to set +is 1 byte for a block. User should set the block size as per the use knowing that very low block size increases +the number of blocks. Thus, it affects the performance by increasing the overhead for reading and prefetching +each block. + ### Key Components `S3PrefetchingInputStream` - When prefetching is enabled, S3AFileSystem will return an instance of From ddba5cdc5162812e6e1f29b7a13cae4af6d8bed8 Mon Sep 17 00:00:00 2001 From: Ankit Saurabh Date: Tue, 8 Nov 2022 17:33:12 +0000 Subject: [PATCH 4/6] Added new line after period in readme. --- .../src/site/markdown/tools/hadoop-aws/prefetching.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md index 28fce819e00b9..027a590c8ea48 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md @@ -43,9 +43,10 @@ Multiple blocks may be read in parallel. |`fs.s3a.prefetch.block.size` |Size of a block |`8M` | |`fs.s3a.prefetch.block.count` |Number of blocks to prefetch |`8` | -Although, default size of the block for prefetching the input stream is 8 MB, minimum size allowed to set -is 1 byte for a block. User should set the block size as per the use knowing that very low block size increases -the number of blocks. Thus, it affects the performance by increasing the overhead for reading and prefetching +Although, default size of the block for prefetching the input stream is 8 MB, +minimum size allowed to set is 1 byte for a block. +User should set the block size with the understanding that smaller block sizes increases the number of blocks. +Thus, smaller block size affects the performance by increasing the overhead for reading and prefetching each block. ### Key Components From bf73b01d524d539bdff52cc11daf6dec0c40abc3 Mon Sep 17 00:00:00 2001 From: Ankit Saurabh Date: Wed, 9 Nov 2022 16:31:02 +0000 Subject: [PATCH 5/6] Updated the documentation of general s3a client configuration. --- .../src/site/markdown/tools/hadoop-aws/index.md | 4 +++- .../src/site/markdown/tools/hadoop-aws/prefetching.md | 8 +++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md index cd7793bfa92d3..c68836f06bb4d 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md @@ -1107,7 +1107,9 @@ options are covered in [Testing](./testing.md). fs.s3a.prefetch.block.size 8MB - The size of a single prefetched block of data. + The size of a single prefetched block of data. + Default value is 8 MB. + Lower limit for the block size is 1 byte. diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md index 027a590c8ea48..8bb85008e3624 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/prefetching.md @@ -43,11 +43,9 @@ Multiple blocks may be read in parallel. |`fs.s3a.prefetch.block.size` |Size of a block |`8M` | |`fs.s3a.prefetch.block.count` |Number of blocks to prefetch |`8` | -Although, default size of the block for prefetching the input stream is 8 MB, -minimum size allowed to set is 1 byte for a block. -User should set the block size with the understanding that smaller block sizes increases the number of blocks. -Thus, smaller block size affects the performance by increasing the overhead for reading and prefetching -each block. +The default size of a block is 8MB, and the minimum allowed block size is 1 byte. +Decreasing block size will increase the number of blocks to be read for a file. +A smaller block size may negatively impact performance as the number of prefetches required will increase. ### Key Components From 5705e996bf260b1f75cf21858701db1cf640c6b1 Mon Sep 17 00:00:00 2001 From: Ankit Saurabh Date: Thu, 10 Nov 2022 11:38:30 +0000 Subject: [PATCH 6/6] Improved block description in s3a client configuration --- .../hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md index c68836f06bb4d..0faf7ceebf8e2 100644 --- a/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md +++ b/hadoop-tools/hadoop-aws/src/site/markdown/tools/hadoop-aws/index.md @@ -1107,9 +1107,8 @@ options are covered in [Testing](./testing.md). fs.s3a.prefetch.block.size 8MB - The size of a single prefetched block of data. - Default value is 8 MB. - Lower limit for the block size is 1 byte. + The size of a single prefetched block of data. + Decreasing this will increase the number of prefetches required, and may negatively impact performance.