Commit f2202ac

Avoid a copy during decompression with new BoundedDelegatingInputStream.
1 parent a0a3384 commit f2202ac

3 files changed: +122 -22 lines changed
hbase-common/src/main/java/org/apache/hadoop/hbase/io/BoundedDelegatingInputStream.java

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.hbase.io;

import java.io.IOException;
import java.io.InputStream;

import org.apache.yetus.audience.InterfaceAudience;

/**
 * This is a stream that will only supply bytes from its delegate up to a certain limit.
 * When there is an attempt to set the position beyond that it will signal that the input
 * is finished.
 */
@InterfaceAudience.Private
public class BoundedDelegatingInputStream extends DelegatingInputStream {

  protected long limit;
  protected long pos;

  public BoundedDelegatingInputStream(InputStream in, long limit) {
    super(in);
    this.limit = limit;
    this.pos = 0;
  }

  public void setDelegate(InputStream in, long limit) {
    this.in = in;
    this.limit = limit;
    this.pos = 0;
  }

  /**
   * Call the delegate's {@code read()} method if the current position is less than the limit.
   * @return the byte read or -1 if the end of stream or the limit has been reached.
   */
  @Override
  public int read() throws IOException {
    if (pos >= limit) {
      return -1;
    }
    int result = in.read();
    pos++;
    return result;
  }

  /**
   * Call the delegate's {@code read(byte[], int, int)} method if the current position is less
   * than the limit.
   * @param b read buffer
   * @param off Start offset
   * @param len The number of bytes to read
   * @return the number of bytes read or -1 if the end of stream or the limit has been reached.
   */
  @Override
  public int read(final byte[] b, final int off, final int len) throws IOException {
    if (pos >= limit) {
      return -1;
    }
    long readLen = Math.min(len, limit - pos);
    int read = in.read(b, off, (int) readLen);
    if (read < 0) {
      return -1;
    }
    pos += read;
    return read;
  }

  /**
   * Call the delegate's {@code skip(long)} method.
   * @param len the number of bytes to skip
   * @return the actual number of bytes skipped
   */
  @Override
  public long skip(final long len) throws IOException {
    long skipped = in.skip(Math.min(len, limit - pos));
    pos += skipped;
    return skipped;
  }

  /**
   * Call the delegate's {@code available()} method.
   * @return the delegate's available bytes if the current position is less than the limit,
   *   or 0 otherwise
   */
  @Override
  public int available() throws IOException {
    if (pos >= limit) {
      return 0;
    }
    int available = in.available();
    return (int) Math.min(available, limit - pos);
  }

}
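
A quick usage sketch of the new class, for readers of the diff (not part of the commit; the BoundedReadSketch class name and sample bytes are made up): reads through the wrapper stop at the configured limit even though the delegate still has data, and setDelegate() re-points the same instance at the next bounded range without allocating a new stream.

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.hadoop.hbase.io.BoundedDelegatingInputStream;

public class BoundedReadSketch {
  public static void main(String[] args) throws IOException {
    // Two 4-byte "segments" concatenated in one underlying stream.
    byte[] data = { 1, 2, 3, 4, 5, 6, 7, 8 };
    ByteArrayInputStream underlying = new ByteArrayInputStream(data);

    // Bound reads to the first segment.
    BoundedDelegatingInputStream bounded = new BoundedDelegatingInputStream(underlying, 4);
    byte[] buf = new byte[8];
    int n = bounded.read(buf, 0, buf.length);          // returns 4: len is clamped to the limit
    System.out.println("first segment bytes read: " + n);
    System.out.println("read at the limit: " + bounded.read());  // -1, limit reached

    // Re-point the same wrapper at the second segment; the underlying stream was only
    // consumed up to the first limit, so reading resumes at byte 5.
    bounded.setDelegate(underlying, 4);
    n = bounded.read(buf, 0, buf.length);              // returns 4 again
    System.out.println("second segment bytes read: " + n);
  }
}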

hbase-common/src/main/java/org/apache/hadoop/hbase/io/DelegatingInputStream.java

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-/**
+/*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements. See the NOTICE file
  * distributed with this work for additional information

hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/CompressionContext.java

Lines changed: 10 additions & 21 deletions
@@ -18,7 +18,6 @@
 
 package org.apache.hadoop.hbase.regionserver.wal;
 
-import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
@@ -27,10 +26,9 @@
 import java.lang.reflect.InvocationTargetException;
 import java.util.EnumMap;
 import java.util.Map;
-import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.HBaseInterfaceAudience;
-import org.apache.hadoop.hbase.io.DelegatingInputStream;
+import org.apache.hadoop.hbase.io.BoundedDelegatingInputStream;
 import org.apache.hadoop.hbase.io.TagCompressionContext;
 import org.apache.hadoop.hbase.io.compress.Compression;
 import org.apache.hadoop.hbase.io.util.Dictionary;
@@ -71,7 +69,7 @@ static class ValueCompressor {
     static final int IO_BUFFER_SIZE = 4096;
 
     private final Compression.Algorithm algorithm;
-    private DelegatingInputStream lowerIn;
+    private BoundedDelegatingInputStream lowerIn;
     private ByteArrayOutputStream lowerOut;
     private InputStream compressedIn;
     private OutputStream compressedOut;
@@ -102,31 +100,22 @@ public byte[] compress(byte[] valueArray, int valueOffset, int valueLength)
     public int decompress(InputStream in, int inLength, byte[] outArray, int outOffset,
         int outLength) throws IOException {
 
-      // We handle input as a sequence of byte[] arrays (call them segments), with
-      // DelegatingInputStream providing a way to switch in a new segment, wrapped in a
-      // ByteArrayInputStream, when the old segment has been fully consumed.
-
-      // Originally I looked at using BoundedInputStream but you can't reuse/reset the
-      // BIS instance, and we can't just create new streams each time around because
-      // that would reset compression codec state, which must accumulate over all values
-      // in the file in order to build the dictionary in the same way as the compressor
-      // did.
-
-      // Read in all of the next segment of compressed bytes to process.
-      byte[] inBuffer = new byte[inLength];
-      IOUtils.readFully(in, inBuffer);
+      // Our input is a sequence of bounded byte ranges (call them segments), with
+      // BoundedDelegatingInputStream providing a way to switch in a new segment when the
+      // previous segment has been fully consumed.
 
      // Create the input streams here the first time around.
      if (compressedIn == null) {
-        lowerIn = new DelegatingInputStream(new ByteArrayInputStream(inBuffer));
+        lowerIn = new BoundedDelegatingInputStream(in, inLength);
        compressedIn = algorithm.createDecompressionStream(lowerIn, algorithm.getDecompressor(),
          IO_BUFFER_SIZE);
      } else {
-        lowerIn.setDelegate(new ByteArrayInputStream(inBuffer));
+        lowerIn.setDelegate(in, inLength);
      }
 
-      // Caller must handle short reads. With current Hadoop compression codecs all 'outLength'
-      // bytes are read in here, so not an issue now.
+      // Caller must handle short reads.
+      // With current Hadoop compression codecs all 'outLength' bytes are read in here, so not
+      // an issue for now.
      return compressedIn.read(outArray, outOffset, outLength);
    }
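
To see why decompress() keeps a single decompression stream alive and only re-points lowerIn at each segment, here is a self-contained sketch (not part of the commit). It stands in java.util.zip's Deflater and InflaterInputStream for the Hadoop codec streams the real code obtains from algorithm.createDecompressionStream(), and the names SegmentedDecompressSketch, walIn, seg1, and seg2 are invented for illustration. The compressor's history window carries over from one value to the next, so the decompressor must accumulate the same state; the bounded wrapper is what scopes each read to the current segment of the shared input stream.

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.zip.Deflater;
import java.util.zip.InflaterInputStream;

import org.apache.hadoop.hbase.io.BoundedDelegatingInputStream;

public class SegmentedDecompressSketch {

  public static void main(String[] args) throws IOException {
    byte[] value1 = "row-0001/family:qualifier/some-repetitive-value-payload"
        .getBytes(StandardCharsets.UTF_8);
    byte[] value2 = "row-0002/family:qualifier/some-repetitive-value-payload"
        .getBytes(StandardCharsets.UTF_8);

    // Compress both values with a single deflater, sync-flushing after each one so that
    // every value's compressed bytes form a self-contained segment while the codec's
    // history window carries over from value1 to value2.
    Deflater deflater = new Deflater();
    byte[] seg1 = compressSegment(deflater, value1);
    byte[] seg2 = compressSegment(deflater, value2);

    // The "WAL stream": both segments back to back in one input stream.
    byte[] concatenated = new byte[seg1.length + seg2.length];
    System.arraycopy(seg1, 0, concatenated, 0, seg1.length);
    System.arraycopy(seg2, 0, concatenated, seg1.length, seg2.length);
    ByteArrayInputStream walIn = new ByteArrayInputStream(concatenated);

    // One bounded wrapper and ONE long-lived decompression stream, reused across segments
    // by re-pointing the wrapper, mirroring the decompress() logic above.
    BoundedDelegatingInputStream lowerIn = new BoundedDelegatingInputStream(walIn, seg1.length);
    InflaterInputStream compressedIn = new InflaterInputStream(lowerIn);

    byte[] out1 = readFully(compressedIn, value1.length);
    lowerIn.setDelegate(walIn, seg2.length);   // switch to the next segment, no copy
    byte[] out2 = readFully(compressedIn, value2.length);

    System.out.println(new String(out1, StandardCharsets.UTF_8));
    System.out.println(new String(out2, StandardCharsets.UTF_8));
  }

  // Compress one value and return the bytes emitted up to the sync flush point.
  private static byte[] compressSegment(Deflater deflater, byte[] value) {
    deflater.setInput(value);
    byte[] buf = new byte[4096];
    int n = deflater.deflate(buf, 0, buf.length, Deflater.SYNC_FLUSH);
    byte[] segment = new byte[n];
    System.arraycopy(buf, 0, segment, 0, n);
    return segment;
  }

  // Decompression streams may return short reads, so accumulate until 'len' bytes arrive.
  private static byte[] readFully(InflaterInputStream in, int len) throws IOException {
    byte[] out = new byte[len];
    int off = 0;
    while (off < len) {
      int n = in.read(out, off, len - off);
      if (n < 0) {
        throw new IOException("Unexpected end of stream");
      }
      off += n;
    }
    return out;
  }
}

Creating a fresh InflaterInputStream for seg2 instead would typically fail or mis-decode: seg2 carries no stream header of its own and may back-reference bytes of value1. That is the same constraint the removed comment described when it ruled out constructing new streams on every call.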
