@@ -24,8 +24,10 @@
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
@@ -323,15 +325,22 @@ public Integer readOptionalVInt() throws IOException {
return null;
}

private final CharsRefBuilder spare = new CharsRefBuilder();
// we don't use a CharsRefBuilder since we know the exact size of the character array up front
// this avoids calling grow for every character, which isn't needed here
private final CharsRef spare = new CharsRef();

public String readString() throws IOException {
// TODO it would be nice to not call readByte() for every character but we don't know how much to read up-front
// we can make the loop much more complicated but that won't buy us much compared to the bounds checks in readByte()
final int charCount = readVInt();
spare.clear();
spare.grow(charCount);
int c;
while (spare.length() < charCount) {
c = readByte() & 0xff;
if (spare.chars.length < charCount) {
// we don't use ArrayUtil.grow since there is no need to copy the array
spare.chars = new char[ArrayUtil.oversize(charCount, Character.BYTES)];
}
spare.length = charCount;
final char[] buffer = spare.chars;
for (int i = 0; i < charCount; i++) {
final int c = readByte() & 0xff;
switch (c >> 4) {
case 0:
case 1:
Expand All @@ -341,15 +350,17 @@ public String readString() throws IOException {
case 5:
case 6:
case 7:
spare.append((char) c);
buffer[i] = (char) c;
break;
case 12:
case 13:
spare.append((char) ((c & 0x1F) << 6 | readByte() & 0x3F));
buffer[i] = ((char) ((c & 0x1F) << 6 | readByte() & 0x3F));
break;
case 14:
spare.append((char) ((c & 0x0F) << 12 | (readByte() & 0x3F) << 6 | (readByte() & 0x3F) << 0));
buffer[i] = ((char) ((c & 0x0F) << 12 | (readByte() & 0x3F) << 6 | (readByte() & 0x3F) << 0));
break;
default:
throw new AssertionError("unexpected character: " + c + " hex: " + Integer.toHexString(c));
}
}
return spare.toString();
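Side note on the decoding branches above: a worked example of the two-byte case may help follow the bit arithmetic. This is a minimal sketch, not part of this PR; the class name and the choice of 'é' (U+00E9) are illustrative, but the encode/decode expressions mirror the writeString/readString branches shown in this diff.

public class TwoByteCaseSketch {
    public static void main(String[] args) {
        final char in = '\u00E9';                        // 'é', in the 0x0080..0x07FF range -> two bytes
        // encode (same expressions as the 2-byte branch in writeString)
        final int b0 = 0xC0 | in >> 6 & 0x1F;            // 0xC3
        final int b1 = 0x80 | in & 0x3F;                 // 0xA9
        // decode (same expressions as case 12/13 in readString; b0 >> 4 == 12 selects this branch)
        final char out = (char) ((b0 & 0x1F) << 6 | b1 & 0x3F);
        System.out.println(Integer.toHexString(b0) + " " + Integer.toHexString(b1) + " -> " + out); // c3 a9 -> é
    }
}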
@@ -24,6 +24,7 @@
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
@@ -298,23 +299,41 @@ public void writeText(Text text) throws IOException {
}
}

// we use a small buffer to convert strings to bytes since we want to prevent calling writeByte
// for every byte in the string (see #21660 for details).
// the requested buffer size is capped at 1024 bytes (ArrayUtil.oversize may pad this slightly) and the buffer is not shared across streams
private byte[] convertStringBuffer = BytesRef.EMPTY_BYTES; // TODO should we reduce it to 0 bytes once the stream is closed?

public void writeString(String str) throws IOException {
int charCount = str.length();
final int charCount = str.length();
final int bufferSize = Math.min(3 * charCount, 1024); // at most 3 bytes per character is needed here
if (convertStringBuffer.length < bufferSize) { // we don't use ArrayUtil.grow since copying the bytes is unnecessary
convertStringBuffer = new byte[ArrayUtil.oversize(bufferSize, Byte.BYTES)];
}
Contributor: I think you can replace the three lines above with just convertStringBuffer = ArrayUtil.grow(convertStringBuffer, bufferSize);

Contributor (author): I had this before, but there is no need to copy the array since we are trashing it anyway; that's why I used oversize.
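For readers following the exchange above, a minimal sketch of the difference being discussed (the GrowVsOversize wrapper is illustrative; ArrayUtil.grow and ArrayUtil.oversize are the Lucene utilities referenced here): grow allocates a bigger array and copies the old contents over, while oversize only computes a padded target size and leaves the allocation, without any copy, to the caller.

import org.apache.lucene.util.ArrayUtil;

class GrowVsOversize {
    private byte[] buffer = new byte[0];

    // ArrayUtil.grow: resizes if needed and copies the existing bytes into the new array
    void ensureCapacityWithCopy(int minSize) {
        buffer = ArrayUtil.grow(buffer, minSize);
    }

    // ArrayUtil.oversize + new byte[]: same padded sizing, but no copy; fine when the
    // old contents are about to be overwritten anyway (as in writeString above)
    void ensureCapacityNoCopy(int minSize) {
        if (buffer.length < minSize) {
            buffer = new byte[ArrayUtil.oversize(minSize, Byte.BYTES)];
        }
    }
}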

byte[] buffer = convertStringBuffer;
int offset = 0;
writeVInt(charCount);
int c;
for (int i = 0; i < charCount; i++) {
c = str.charAt(i);
final int c = str.charAt(i);
if (c <= 0x007F) {
writeByte((byte) c);
buffer[offset++] = ((byte) c);
} else if (c > 0x07FF) {
writeByte((byte) (0xE0 | c >> 12 & 0x0F));
writeByte((byte) (0x80 | c >> 6 & 0x3F));
writeByte((byte) (0x80 | c >> 0 & 0x3F));
buffer[offset++] = ((byte) (0xE0 | c >> 12 & 0x0F));
buffer[offset++] = ((byte) (0x80 | c >> 6 & 0x3F));
buffer[offset++] = ((byte) (0x80 | c >> 0 & 0x3F));
} else {
writeByte((byte) (0xC0 | c >> 6 & 0x1F));
writeByte((byte) (0x80 | c >> 0 & 0x3F));
buffer[offset++] = ((byte) (0xC0 | c >> 6 & 0x1F));
buffer[offset++] = ((byte) (0x80 | c >> 0 & 0x3F));
}
// make sure any possible char can fit into the buffer in any possible iteration
// we need at most 3 bytes so we flush the buffer once we have less than 3 bytes
// left before we start another iteration
if (offset > buffer.length - 3) {
writeBytes(buffer, offset);
offset = 0;
}
}
writeBytes(buffer, offset);
}

public void writeFloat(float v) throws IOException {
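As a self-contained illustration of the buffering pattern writeString now uses: the loop encodes into a reusable byte array and flushes whenever fewer than 3 bytes remain, so the next char, which needs at most 3 bytes in this per-char encoding, always fits. This is a minimal sketch against a plain OutputStream, not the Elasticsearch StreamOutput API, and it omits the leading vint length prefix; class and method names are illustrative.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;

class ChunkedStringWriterSketch {
    private final byte[] buffer = new byte[1024]; // reused across calls, mirrors convertStringBuffer

    void writeChars(OutputStream out, String str) throws IOException {
        int offset = 0;
        for (int i = 0; i < str.length(); i++) {
            final int c = str.charAt(i);
            if (c <= 0x007F) {                               // 1 byte
                buffer[offset++] = (byte) c;
            } else if (c > 0x07FF) {                         // 3 bytes
                buffer[offset++] = (byte) (0xE0 | c >> 12 & 0x0F);
                buffer[offset++] = (byte) (0x80 | c >> 6 & 0x3F);
                buffer[offset++] = (byte) (0x80 | c & 0x3F);
            } else {                                         // 2 bytes
                buffer[offset++] = (byte) (0xC0 | c >> 6 & 0x1F);
                buffer[offset++] = (byte) (0x80 | c & 0x3F);
            }
            if (offset > buffer.length - 3) {                // fewer than 3 bytes left: flush now
                out.write(buffer, 0, offset);
                offset = 0;
            }
        }
        out.write(buffer, 0, offset);                        // flush whatever is left
    }

    public static void main(String[] args) throws IOException {
        final ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        new ChunkedStringWriterSketch().writeChars(bytes, "héllo");
        System.out.println(bytes.size()); // 6: 'é' takes two bytes, the rest one each
    }
}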
@@ -19,7 +19,9 @@

package org.elasticsearch.common.io.stream;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Constants;
import org.apache.lucene.util.UnicodeUtil;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.geo.GeoPoint;
@@ -657,4 +659,41 @@ private static <K, V> Map<K, V> randomMap(Map<K, V> map, int size, Supplier<K> k
IntStream.range(0, size).forEach(i -> map.put(keyGenerator.get(), valueGenerator.get()));
return map;
}

public void testWriteRandomStrings() throws IOException {
final int iters = scaledRandomIntBetween(5, 20);
for (int iter = 0; iter < iters; iter++) {
List<String> strings = new ArrayList<>();
int numStrings = randomIntBetween(100, 1000);
BytesStreamOutput output = new BytesStreamOutput(0);
for (int i = 0; i < numStrings; i++) {
String s = randomRealisticUnicodeOfLengthBetween(0, 2048);
strings.add(s);
output.writeString(s);
}

try (StreamInput streamInput = output.bytes().streamInput()) {
for (int i = 0; i < numStrings; i++) {
String s = streamInput.readString();
assertEquals(strings.get(i), s);
}
}
}
}
Contributor: maybe also test an explicit big string that only contains chars that are stored on 3 bytes?

Contributor (author): done

/*
* tests the extreme case where characters use more than 2 bytes
*/
public void testWriteLargeSurrogateOnlyString() throws IOException {
String deseretLetter = "\uD801\uDC00";
assertEquals(2, deseretLetter.length());
String largeString = IntStream.range(0, 2048).mapToObj(s -> deseretLetter).collect(Collectors.joining("")).trim();
assertEquals("expands to 4 bytes", 4, new BytesRef(deseretLetter).length);
try (BytesStreamOutput output = new BytesStreamOutput(0)) {
output.writeString(largeString);
try (StreamInput streamInput = output.bytes().streamInput()) {
assertEquals(largeString, streamInput.readString());
}
}
}
}
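A closing note on the surrogate test above: the assertion checks the standard UTF-8 length of the Deseret letter (4 bytes via BytesRef), while the stream encoding works per Java char, so each half of the surrogate pair goes through the 3-byte branch and the pair occupies 6 bytes on the wire. The sketch below is not part of the PR and its class name is illustrative; it only reproduces that byte-count arithmetic.

import org.apache.lucene.util.BytesRef;

public class SurrogateByteCountSketch {
    public static void main(String[] args) {
        final String deseretLetter = "\uD801\uDC00";            // U+10400, one code point, two Java chars
        System.out.println(new BytesRef(deseretLetter).length); // 4: standard UTF-8 for a supplementary code point

        int streamBytes = 0;                                    // bytes produced by the per-char encoding used in writeString
        for (int i = 0; i < deseretLetter.length(); i++) {
            final char c = deseretLetter.charAt(i);
            streamBytes += c <= 0x007F ? 1 : c > 0x07FF ? 3 : 2;
        }
        System.out.println(streamBytes);                        // 6: each surrogate char takes 3 bytes
    }
}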