@@ -24,8 +24,10 @@
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
@@ -323,15 +325,22 @@ public Integer readOptionalVInt() throws IOException {
return null;
}

private final CharsRefBuilder spare = new CharsRefBuilder();
// we don't use a CharsRefBuilder since we know the exact size of the character array up front
// this avoids calling grow for every character, which isn't needed here
private final CharsRef spare = new CharsRef();

public String readString() throws IOException {
// TODO it would be nice to not call readByte() for every character but we don't know how much to read up-front
// we can make the loop much more complicated but that won't buy us much compared to the bounds checks in readByte()
final int charCount = readVInt();
spare.clear();
spare.grow(charCount);
int c;
while (spare.length() < charCount) {
c = readByte() & 0xff;
if (spare.chars.length < charCount) {
// we don't use ArrayUtil.grow since there is no need to copy the array
spare.chars = new char[ArrayUtil.oversize(charCount, Character.BYTES)];
}
spare.length = charCount;
final char[] buffer = spare.chars;
for (int i = 0; i < charCount; i++) {
final int c = readByte() & 0xff;
switch (c >> 4) {
case 0:
case 1:
Expand All @@ -341,15 +350,17 @@ public String readString() throws IOException {
case 5:
case 6:
case 7:
spare.append((char) c);
buffer[i] = (char) c;
break;
case 12:
case 13:
spare.append((char) ((c & 0x1F) << 6 | readByte() & 0x3F));
buffer[i] = ((char) ((c & 0x1F) << 6 | readByte() & 0x3F));
break;
case 14:
spare.append((char) ((c & 0x0F) << 12 | (readByte() & 0x3F) << 6 | (readByte() & 0x3F) << 0));
buffer[i] = ((char) ((c & 0x0F) << 12 | (readByte() & 0x3F) << 6 | (readByte() & 0x3F) << 0));
break;
default:
throw new AssertionError("unexpected character: " + c + " hex: " + Integer.toHexString(c));
}
}
return spare.toString();
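Side note on the decoding branches above: a worked example of the two-byte case may help follow the bit arithmetic. This is a minimal sketch, not part of this PR; the class name and the choice of 'é' (U+00E9) are illustrative, but the encode/decode expressions mirror the writeString/readString branches shown in this diff.

public class TwoByteCaseSketch {
    public static void main(String[] args) {
        final char in = '\u00E9';                        // 'é', in the 0x0080..0x07FF range -> two bytes
        // encode (same expressions as the 2-byte branch in writeString)
        final int b0 = 0xC0 | in >> 6 & 0x1F;            // 0xC3
        final int b1 = 0x80 | in & 0x3F;                 // 0xA9
        // decode (same expressions as case 12/13 in readString; b0 >> 4 == 12 selects this branch)
        final char out = (char) ((b0 & 0x1F) << 6 | b1 & 0x3F);
        System.out.println(Integer.toHexString(b0) + " " + Integer.toHexString(b1) + " -> " + out); // c3 a9 -> é
    }
}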
@@ -24,6 +24,7 @@
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@@ -298,23 +299,41 @@ public void writeText(Text text) throws IOException {
}
}

// we use a small buffer to convert strings to bytes since we want to prevent calling writeByte
// for every byte in the string (see #21660 for details).
// the requested buffer size is capped at 1024 bytes (ArrayUtil.oversize may pad this slightly) and the buffer is not shared across streams
private byte[] convertStringBuffer = BytesRef.EMPTY_BYTES; // TODO should we reduce it to 0 bytes once the stream is closed?

public void writeString(String str) throws IOException {
int charCount = str.length();
final int charCount = str.length();
final int bufferSize = Math.min(3 * charCount, 1024); // at most 3 bytes per character is needed here
if (convertStringBuffer.length < bufferSize) { // we don't use ArrayUtil.grow since copying the bytes is unnecessary
convertStringBuffer = new byte[ArrayUtil.oversize(bufferSize, Byte.BYTES)];
}
Contributor: I think you can replace the three lines above with just convertStringBuffer = ArrayUtil.grow(convertStringBuffer, bufferSize);

Contributor (author): I had this before, but there is no need to copy the array since we are trashing it anyway; that's why I used oversize.
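For readers following the exchange above, a minimal sketch of the difference being discussed (the GrowVsOversize wrapper is illustrative; ArrayUtil.grow and ArrayUtil.oversize are the Lucene utilities referenced here): grow allocates a bigger array and copies the old contents over, while oversize only computes a padded target size and leaves the allocation, without any copy, to the caller.

import org.apache.lucene.util.ArrayUtil;

class GrowVsOversize {
    private byte[] buffer = new byte[0];

    // ArrayUtil.grow: resizes if needed and copies the existing bytes into the new array
    void ensureCapacityWithCopy(int minSize) {
        buffer = ArrayUtil.grow(buffer, minSize);
    }

    // ArrayUtil.oversize + new byte[]: same padded sizing, but no copy; fine when the
    // old contents are about to be overwritten anyway (as in writeString above)
    void ensureCapacityNoCopy(int minSize) {
        if (buffer.length < minSize) {
            buffer = new byte[ArrayUtil.oversize(minSize, Byte.BYTES)];
        }
    }
}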

byte[] buffer = convertStringBuffer;
int offset = 0;
writeVInt(charCount);
int c;
for (int i = 0; i < charCount; i++) {
c = str.charAt(i);
final int c = str.charAt(i);
if (c <= 0x007F) {
writeByte((byte) c);
buffer[offset++] = ((byte) c);
} else if (c > 0x07FF) {
writeByte((byte) (0xE0 | c >> 12 & 0x0F));
writeByte((byte) (0x80 | c >> 6 & 0x3F));
writeByte((byte) (0x80 | c >> 0 & 0x3F));
buffer[offset++] = ((byte) (0xE0 | c >> 12 & 0x0F));
buffer[offset++] = ((byte) (0x80 | c >> 6 & 0x3F));
buffer[offset++] = ((byte) (0x80 | c >> 0 & 0x3F));
} else {
writeByte((byte) (0xC0 | c >> 6 & 0x1F));
writeByte((byte) (0x80 | c >> 0 & 0x3F));
buffer[offset++] = ((byte) (0xC0 | c >> 6 & 0x1F));
buffer[offset++] = ((byte) (0x80 | c >> 0 & 0x3F));
}
// make sure any possible char can fit into the buffer in any possible iteration
// we need at most 3 bytes so we flush the buffer once we have less than 3 bytes
// left before we start another iteration
if (offset > buffer.length - 3) {
writeBytes(buffer, offset);
offset = 0;
}
}
writeBytes(buffer, offset);
}

public void writeFloat(float v) throws IOException {
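As a self-contained illustration of the buffering pattern writeString now uses: the loop encodes into a reusable byte array and flushes whenever fewer than 3 bytes remain, so the next char, which needs at most 3 bytes in this per-char encoding, always fits. This is a minimal sketch against a plain OutputStream, not the Elasticsearch StreamOutput API, and it omits the leading vint length prefix; class and method names are illustrative.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;

class ChunkedStringWriterSketch {
    private final byte[] buffer = new byte[1024]; // reused across calls, mirrors convertStringBuffer

    void writeChars(OutputStream out, String str) throws IOException {
        int offset = 0;
        for (int i = 0; i < str.length(); i++) {
            final int c = str.charAt(i);
            if (c <= 0x007F) {                               // 1 byte
                buffer[offset++] = (byte) c;
            } else if (c > 0x07FF) {                         // 3 bytes
                buffer[offset++] = (byte) (0xE0 | c >> 12 & 0x0F);
                buffer[offset++] = (byte) (0x80 | c >> 6 & 0x3F);
                buffer[offset++] = (byte) (0x80 | c & 0x3F);
            } else {                                         // 2 bytes
                buffer[offset++] = (byte) (0xC0 | c >> 6 & 0x1F);
                buffer[offset++] = (byte) (0x80 | c & 0x3F);
            }
            if (offset > buffer.length - 3) {                // fewer than 3 bytes left: flush now
                out.write(buffer, 0, offset);
                offset = 0;
            }
        }
        out.write(buffer, 0, offset);                        // flush whatever is left
    }

    public static void main(String[] args) throws IOException {
        final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        new ChunkedStringWriterSketch().writeChars(bytes, "héllo");
        System.out.println(bytes.size()); // 6: 'é' takes two bytes, the rest one each
    }
}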
@@ -19,7 +19,9 @@

package org.elasticsearch.common.io.stream;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.UnicodeUtil;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.geo.GeoPoint;
@@ -657,4 +659,41 @@ private static <K, V> Map<K, V> randomMap(Map<K, V> map, int size, Supplier<K> k
IntStream.range(0, size).forEach(i -> map.put(keyGenerator.get(), valueGenerator.get()));
return map;
}

public void testWriteRandomStrings() throws IOException {
final int iters = scaledRandomIntBetween(5, 20);
for (int iter = 0; iter < iters; iter++) {
List<String> strings = new ArrayList<>();
int numStrings = randomIntBetween(100, 1000);
BytesStreamOutput output = new BytesStreamOutput(0);
for (int i = 0; i < numStrings; i++) {
String s = randomRealisticUnicodeOfLengthBetween(0, 2048);
strings.add(s);
output.writeString(s);
}

try (StreamInput streamInput = output.bytes().streamInput()) {
for (int i = 0; i < numStrings; i++) {
String s = streamInput.readString();
assertEquals(strings.get(i), s);
}
}
}
}
Contributor: maybe also test an explicit big string that only contains chars that are stored on 3 bytes?

Contributor (author): done

/*
* tests the extreme case where characters use more than 2 bytes
*/
public void testWriteLargeSurrogateOnlyString() throws IOException {
String deseretLetter = "\uD801\uDC00";
assertEquals(2, deseretLetter.length());
String largeString = IntStream.range(0, 2048).mapToObj(s -> deseretLetter).collect(Collectors.joining("")).trim();
assertEquals("expands to 4 bytes", 4, new BytesRef(deseretLetter).length);
try (BytesStreamOutput output = new BytesStreamOutput(0)) {
output.writeString(largeString);
try (StreamInput streamInput = output.bytes().streamInput()) {
assertEquals(largeString, streamInput.readString());
}
}
}
}
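A closing note on the surrogate test above: the assertion checks the standard UTF-8 length of the Deseret letter (4 bytes via BytesRef), while the stream encoding works per Java char, so each half of the surrogate pair goes through the 3-byte branch and the pair occupies 6 bytes on the wire. The sketch below is not part of the PR and its class name is illustrative; it only reproduces that byte-count arithmetic.

import org.apache.lucene.util.BytesRef;

public class SurrogateByteCountSketch {
    public static void main(String[] args) {
        final String deseretLetter = "\uD801\uDC00";            // U+10400, one code point, two Java chars
        System.out.println(new BytesRef(deseretLetter).length); // 4: standard UTF-8 for a supplementary code point

        int streamBytes = 0;                                    // bytes produced by the per-char encoding used in writeString
        for (int i = 0; i < deseretLetter.length(); i++) {
            final char c = deseretLetter.charAt(i);
            streamBytes += c <= 0x007F ? 1 : c > 0x07FF ? 3 : 2;
        }
        System.out.println(streamBytes);                        // 6: each surrogate char takes 3 bytes
    }
}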