From 954d1582bea4a083e031239864f5092fbb2291c7 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Sat, 19 Nov 2016 15:04:10 +0100 Subject: [PATCH 1/5] Use a buffer to do character to byte conversion in StreamOutput#writeString Today we call `writeByte` up to 3x per character in each string written via `StreamOutput#writeString` this can have quite some overhead when strings are long or many strings are written. This change adds a local buffer to convert chars to bytes into the local buffer. Converted bytes are then written via `writeBytes` instead reducing the overhead of this opertion. Closes #21660 --- .../common/io/stream/StreamOutput.java | 37 ++++++++++++++----- .../common/io/stream/BytesStreamsTests.java | 21 +++++++++++ 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java b/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java index 3ba911ef9eebd..3faf3e6361ada 100644 --- a/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java +++ b/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.IndexFormatTooOldException; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; @@ -298,23 +299,41 @@ public void writeText(Text text) throws IOException { } } + // we use a small buffer to convert strings to bytes since we want to prevent calling writeByte + // for every byte in the string (see #21660 for details). + // This buffer will never be the oversized limit of 1024 bytes and will not be shared across streams + private byte[] convertStringBuffer = BytesRef.EMPTY_BYTES; // TODO should we reduce it to 0 bytes once the stream is closed? + public void writeString(String str) throws IOException { - int charCount = str.length(); + final int charCount = str.length(); + final int bufferSize = Math.min(3 * charCount, 1024); // at most 3 bytes per character is needed here + if (convertStringBuffer.length < bufferSize) { + convertStringBuffer = new byte[ArrayUtil.oversize(bufferSize, Byte.BYTES)]; + } + byte[] buffer = convertStringBuffer; + int offset = 0; writeVInt(charCount); - int c; for (int i = 0; i < charCount; i++) { - c = str.charAt(i); + final int c = str.charAt(i); if (c <= 0x007F) { - writeByte((byte) c); + buffer[offset++] = ((byte) c); } else if (c > 0x07FF) { - writeByte((byte) (0xE0 | c >> 12 & 0x0F)); - writeByte((byte) (0x80 | c >> 6 & 0x3F)); - writeByte((byte) (0x80 | c >> 0 & 0x3F)); + buffer[offset++] = ((byte) (0xE0 | c >> 12 & 0x0F)); + buffer[offset++] = ((byte) (0x80 | c >> 6 & 0x3F)); + buffer[offset++] = ((byte) (0x80 | c >> 0 & 0x3F)); } else { - writeByte((byte) (0xC0 | c >> 6 & 0x1F)); - writeByte((byte) (0x80 | c >> 0 & 0x3F)); + buffer[offset++] = ((byte) (0xC0 | c >> 6 & 0x1F)); + buffer[offset++] = ((byte) (0x80 | c >> 0 & 0x3F)); + } + // make sure any possible char can fit into the buffer in any possible iteration + // we need at most 3 bytes so we flush the buffer once we have less than 3 bytes + // left before we start another iteration + if (offset >= buffer.length-3) { + writeBytes(buffer, offset); + offset = 0; } } + writeBytes(buffer, offset); } public void writeFloat(float v) throws IOException { diff --git a/core/src/test/java/org/elasticsearch/common/io/stream/BytesStreamsTests.java b/core/src/test/java/org/elasticsearch/common/io/stream/BytesStreamsTests.java index f51a85b2f9af2..366b0a1f56f83 100644 --- a/core/src/test/java/org/elasticsearch/common/io/stream/BytesStreamsTests.java +++ b/core/src/test/java/org/elasticsearch/common/io/stream/BytesStreamsTests.java @@ -657,4 +657,25 @@ private static Map randomMap(Map map, int size, Supplier k IntStream.range(0, size).forEach(i -> map.put(keyGenerator.get(), valueGenerator.get())); return map; } + + public void testWriteRandomStrings() throws IOException { + final int iters = scaledRandomIntBetween(5, 20); + for (int iter = 0; iter < iters; iter++) { + List strings = new ArrayList<>(); + int numStrings = randomIntBetween(100, 1000); + BytesStreamOutput output = new BytesStreamOutput(0); + for (int i = 0; i < numStrings; i++) { + String s = randomRealisticUnicodeOfLengthBetween(0, 2048); + strings.add(s); + output.writeString(s); + } + + try (StreamInput streamInput = output.bytes().streamInput()) { + for (int i = 0; i < numStrings; i++) { + String s = streamInput.readString(); + assertEquals(strings.get(i), s); + } + } + } + } } From a65de33e3039342368f157ae14f412984d62715b Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 21 Nov 2016 09:21:48 +0100 Subject: [PATCH 2/5] apply feedback --- .../common/io/stream/StreamOutput.java | 2 +- .../common/io/stream/BytesStreamsTests.java | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java b/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java index 3faf3e6361ada..cade955fe2c37 100644 --- a/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java +++ b/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java @@ -328,7 +328,7 @@ public void writeString(String str) throws IOException { // make sure any possible char can fit into the buffer in any possible iteration // we need at most 3 bytes so we flush the buffer once we have less than 3 bytes // left before we start another iteration - if (offset >= buffer.length-3) { + if (offset > buffer.length-3) { writeBytes(buffer, offset); offset = 0; } diff --git a/core/src/test/java/org/elasticsearch/common/io/stream/BytesStreamsTests.java b/core/src/test/java/org/elasticsearch/common/io/stream/BytesStreamsTests.java index 366b0a1f56f83..e9958c1c5165f 100644 --- a/core/src/test/java/org/elasticsearch/common/io/stream/BytesStreamsTests.java +++ b/core/src/test/java/org/elasticsearch/common/io/stream/BytesStreamsTests.java @@ -19,7 +19,9 @@ package org.elasticsearch.common.io.stream; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Constants; +import org.apache.lucene.util.UnicodeUtil; import org.elasticsearch.common.bytes.BytesArray; import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.geo.GeoPoint; @@ -678,4 +680,20 @@ public void testWriteRandomStrings() throws IOException { } } } + + /* + * tests the extreme case where characters use more than 2 bytes + */ + public void testWriteLargeSurrogateOnlyString() throws IOException { + String deseretLetter = "\uD801\uDC00"; + assertEquals(2, deseretLetter.length()); + String largeString = IntStream.range(0, 2048).mapToObj(s -> deseretLetter).collect(Collectors.joining("")).trim(); + assertEquals("expands to 4 bytes", 4, new BytesRef(deseretLetter).length); + try (BytesStreamOutput output = new BytesStreamOutput(0)) { + output.writeString(largeString); + try (StreamInput streamInput = output.bytes().streamInput()) { + assertEquals(largeString, streamInput.readString()); + } + } + } } From 665feec0da877b61e753269d464c25e66c791b4b Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 21 Nov 2016 09:22:20 +0100 Subject: [PATCH 3/5] add comment --- .../java/org/elasticsearch/common/io/stream/StreamOutput.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java b/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java index cade955fe2c37..788d8dfb925ab 100644 --- a/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java +++ b/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java @@ -307,7 +307,7 @@ public void writeText(Text text) throws IOException { public void writeString(String str) throws IOException { final int charCount = str.length(); final int bufferSize = Math.min(3 * charCount, 1024); // at most 3 bytes per character is needed here - if (convertStringBuffer.length < bufferSize) { + if (convertStringBuffer.length < bufferSize) { // we don't use ArrayUtils.grow since copying the bytes is unnecessary convertStringBuffer = new byte[ArrayUtil.oversize(bufferSize, Byte.BYTES)]; } byte[] buffer = convertStringBuffer; From cad5308f572513987b33b8a96e61f1cdc3c7fdd5 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 21 Nov 2016 09:33:00 +0100 Subject: [PATCH 4/5] simplify loop in StreamInput#readString --- .../common/io/stream/StreamInput.java | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java b/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java index 794ed6f36fac7..724f79166996b 100644 --- a/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java +++ b/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java @@ -24,8 +24,10 @@ import org.apache.lucene.index.IndexFormatTooOldException; import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.LockObtainFailedException; +import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BitUtil; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.CharsRef; import org.apache.lucene.util.CharsRefBuilder; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.Version; @@ -323,15 +325,16 @@ public Integer readOptionalVInt() throws IOException { return null; } - private final CharsRefBuilder spare = new CharsRefBuilder(); + // we don't use a CharsRefBuilder since we exactly know the size of the character array up front + // this prevents calling grow for every character since we don't need this + private final CharsRef spare = new CharsRef(); public String readString() throws IOException { final int charCount = readVInt(); - spare.clear(); - spare.grow(charCount); - int c; - while (spare.length() < charCount) { - c = readByte() & 0xff; + spare.length = charCount; + spare.chars = ArrayUtil.grow(spare.chars, charCount); + for (int i = 0; i < charCount; i++) { + final int c = readByte() & 0xff; switch (c >> 4) { case 0: case 1: @@ -341,15 +344,17 @@ public String readString() throws IOException { case 5: case 6: case 7: - spare.append((char) c); + spare.chars[i] = (char) c; break; case 12: case 13: - spare.append((char) ((c & 0x1F) << 6 | readByte() & 0x3F)); + spare.chars[i] = ((char) ((c & 0x1F) << 6 | readByte() & 0x3F)); break; case 14: - spare.append((char) ((c & 0x0F) << 12 | (readByte() & 0x3F) << 6 | (readByte() & 0x3F) << 0)); + spare.chars[i] = ((char) ((c & 0x0F) << 12 | (readByte() & 0x3F) << 6 | (readByte() & 0x3F) << 0)); break; + default: + new AssertionError("unexpected character: " + c + " hex: " + Integer.toHexString(c)); } } return spare.toString(); From b5d94e35b22a912f7c72f789d691d7c569a4ab88 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 21 Nov 2016 09:45:03 +0100 Subject: [PATCH 5/5] make readString consistent with writeString and add comments --- .../common/io/stream/StreamInput.java | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java b/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java index 724f79166996b..899779eee43cb 100644 --- a/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java +++ b/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java @@ -330,9 +330,15 @@ public Integer readOptionalVInt() throws IOException { private final CharsRef spare = new CharsRef(); public String readString() throws IOException { + // TODO it would be nice to not call readByte() for every character but we don't know how much to read up-front + // we can make the loop much more complicated but that won't buy us much compared to the bounds checks in readByte() final int charCount = readVInt(); + if (spare.chars.length < charCount) { + // we don't use ArrayUtils.grow since there is no need to copy the array + spare.chars = new char[ArrayUtil.oversize(charCount, Character.BYTES)]; + } spare.length = charCount; - spare.chars = ArrayUtil.grow(spare.chars, charCount); + final char[] buffer = spare.chars; for (int i = 0; i < charCount; i++) { final int c = readByte() & 0xff; switch (c >> 4) { @@ -344,14 +350,14 @@ public String readString() throws IOException { case 5: case 6: case 7: - spare.chars[i] = (char) c; + buffer[i] = (char) c; break; case 12: case 13: - spare.chars[i] = ((char) ((c & 0x1F) << 6 | readByte() & 0x3F)); + buffer[i] = ((char) ((c & 0x1F) << 6 | readByte() & 0x3F)); break; case 14: - spare.chars[i] = ((char) ((c & 0x0F) << 12 | (readByte() & 0x3F) << 6 | (readByte() & 0x3F) << 0)); + buffer[i] = ((char) ((c & 0x0F) << 12 | (readByte() & 0x3F) << 6 | (readByte() & 0x3F) << 0)); break; default: new AssertionError("unexpected character: " + c + " hex: " + Integer.toHexString(c));