Skip to content

Commit 454cfc2

Browse files
authored
More efficient encoding of range fields. (#26470)
This PR removes the vInt length prefix that used to precede every encoded value. Instead, the query takes an enum that tells it how to compute the length of each value: for fixed-length data (IP addresses, doubles, floats) the length is a constant, while longs and integers use a variable-length representation whose length can be computed from the encoded value itself. The encoding of ints/longs was also made a bit more efficient so that it no longer wastes 3 bits in the header. As a consequence, values between -8 and 7 can now be encoded in 1 byte, and values between -2048 and 2047 can now be encoded in at most 2 bytes. Closes #26443
1 parent 6708498 commit 454cfc2

File tree

4 files changed

+273
-131
lines changed

4 files changed

+273
-131
lines changed

core/src/main/java/org/apache/lucene/queries/BinaryDocValuesRangeQuery.java

Lines changed: 59 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,15 +37,18 @@ public final class BinaryDocValuesRangeQuery extends Query {
3737

3838
private final String fieldName;
3939
private final QueryType queryType;
40+
private final LengthType lengthType;
4041
private final BytesRef from;
4142
private final BytesRef to;
4243
private final Object originalFrom;
4344
private final Object originalTo;
4445

45-
public BinaryDocValuesRangeQuery(String fieldName, QueryType queryType, BytesRef from, BytesRef to,
46+
public BinaryDocValuesRangeQuery(String fieldName, QueryType queryType, LengthType lengthType,
47+
BytesRef from, BytesRef to,
4648
Object originalFrom, Object originalTo) {
4749
this.fieldName = fieldName;
4850
this.queryType = queryType;
51+
this.lengthType = lengthType;
4952
this.from = from;
5053
this.to = to;
5154
this.originalFrom = originalFrom;
@@ -66,29 +69,34 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
6669
final TwoPhaseIterator iterator = new TwoPhaseIterator(values) {
6770

6871
ByteArrayDataInput in = new ByteArrayDataInput();
69-
BytesRef otherFrom = new BytesRef(16);
70-
BytesRef otherTo = new BytesRef(16);
72+
BytesRef otherFrom = new BytesRef();
73+
BytesRef otherTo = new BytesRef();
7174

7275
@Override
7376
public boolean matches() throws IOException {
7477
BytesRef encodedRanges = values.binaryValue();
7578
in.reset(encodedRanges.bytes, encodedRanges.offset, encodedRanges.length);
7679
int numRanges = in.readVInt();
80+
final byte[] bytes = encodedRanges.bytes;
81+
otherFrom.bytes = bytes;
82+
otherTo.bytes = bytes;
83+
int offset = in.getPosition();
7784
for (int i = 0; i < numRanges; i++) {
78-
otherFrom.length = in.readVInt();
79-
otherFrom.bytes = encodedRanges.bytes;
80-
otherFrom.offset = in.getPosition();
81-
in.skipBytes(otherFrom.length);
85+
int length = lengthType.readLength(bytes, offset);
86+
otherFrom.offset = offset;
87+
otherFrom.length = length;
88+
offset += length;
8289

83-
otherTo.length = in.readVInt();
84-
otherTo.bytes = encodedRanges.bytes;
85-
otherTo.offset = in.getPosition();
86-
in.skipBytes(otherTo.length);
90+
length = lengthType.readLength(bytes, offset);
91+
otherTo.offset = offset;
92+
otherTo.length = length;
93+
offset += length;
8794

8895
if (queryType.matches(from, to, otherFrom, otherTo)) {
8996
return true;
9097
}
9198
}
99+
assert offset == encodedRanges.offset + encodedRanges.length;
92100
return false;
93101
}
94102

@@ -114,13 +122,14 @@ public boolean equals(Object o) {
114122
BinaryDocValuesRangeQuery that = (BinaryDocValuesRangeQuery) o;
115123
return Objects.equals(fieldName, that.fieldName) &&
116124
queryType == that.queryType &&
125+
lengthType == that.lengthType &&
117126
Objects.equals(from, that.from) &&
118127
Objects.equals(to, that.to);
119128
}
120129

121130
@Override
122131
public int hashCode() {
123-
return Objects.hash(getClass(), fieldName, queryType, from, to);
132+
return Objects.hash(getClass(), fieldName, queryType, lengthType, from, to);
124133
}
125134

126135
public enum QueryType {
@@ -161,4 +170,42 @@ boolean matches(BytesRef from, BytesRef to, BytesRef otherFrom, BytesRef otherTo
161170

162171
}
163172

173+
public enum LengthType {
174+
FIXED_4 {
175+
@Override
176+
int readLength(byte[] bytes, int offset) {
177+
return 4;
178+
}
179+
},
180+
FIXED_8 {
181+
@Override
182+
int readLength(byte[] bytes, int offset) {
183+
return 8;
184+
}
185+
},
186+
FIXED_16 {
187+
@Override
188+
int readLength(byte[] bytes, int offset) {
189+
return 16;
190+
}
191+
},
192+
VARIABLE {
193+
@Override
194+
int readLength(byte[] bytes, int offset) {
195+
// the first bit encodes the sign and the next 4 bits encode the number
196+
// of additional bytes
197+
int token = Byte.toUnsignedInt(bytes[offset]);
198+
int length = (token >>> 3) & 0x0f;
199+
if ((token & 0x80) == 0) {
200+
length = 0x0f - length;
201+
}
202+
return 1 + length;
203+
}
204+
};
205+
206+
/**
207+
* Return the length of the value that starts at {@code offset} in {@code bytes}.
208+
*/
209+
abstract int readLength(byte[] bytes, int offset);
210+
}
164211
}

core/src/main/java/org/elasticsearch/index/mapper/BinaryRangeUtil.java

Lines changed: 84 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,14 @@
1818
*/
1919
package org.elasticsearch.index.mapper;
2020

21+
import org.apache.lucene.document.HalfFloatPoint;
2122
import org.apache.lucene.store.ByteArrayDataOutput;
2223
import org.apache.lucene.util.BytesRef;
24+
import org.apache.lucene.util.NumericUtils;
2325

2426
import java.io.IOException;
2527
import java.util.ArrayList;
28+
import java.util.Comparator;
2629
import java.util.List;
2730
import java.util.Set;
2831

@@ -32,67 +35,77 @@ enum BinaryRangeUtil {
3235

3336
static BytesRef encodeLongRanges(Set<RangeFieldMapper.Range> ranges) throws IOException {
3437
List<RangeFieldMapper.Range> sortedRanges = new ArrayList<>(ranges);
35-
sortedRanges.sort((r1, r2) -> {
36-
long r1From = ((Number) r1.from).longValue();
37-
long r2From = ((Number) r2.from).longValue();
38-
int cmp = Long.compare(r1From, r2From);
39-
if (cmp != 0) {
40-
return cmp;
41-
} else {
42-
long r1To = ((Number) r1.from).longValue();
43-
long r2To = ((Number) r2.from).longValue();
44-
return Long.compare(r1To, r2To);
45-
}
46-
});
38+
Comparator<RangeFieldMapper.Range> fromComparator = Comparator.comparingLong(range -> ((Number) range.from).longValue());
39+
Comparator<RangeFieldMapper.Range> toComparator = Comparator.comparingLong(range -> ((Number) range.to).longValue());
40+
sortedRanges.sort(fromComparator.thenComparing(toComparator));
4741

48-
final byte[] encoded = new byte[5 + ((5 + 9) * 2) * sortedRanges.size()];
42+
final byte[] encoded = new byte[5 + (9 * 2) * sortedRanges.size()];
4943
ByteArrayDataOutput out = new ByteArrayDataOutput(encoded);
5044
out.writeVInt(sortedRanges.size());
5145
for (RangeFieldMapper.Range range : sortedRanges) {
52-
byte[] encodedFrom = encode(((Number) range.from).longValue());
53-
out.writeVInt(encodedFrom.length);
46+
byte[] encodedFrom = encodeLong(((Number) range.from).longValue());
5447
out.writeBytes(encodedFrom, encodedFrom.length);
55-
byte[] encodedTo = encode(((Number) range.to).longValue());
56-
out.writeVInt(encodedTo.length);
48+
byte[] encodedTo = encodeLong(((Number) range.to).longValue());
5749
out.writeBytes(encodedTo, encodedTo.length);
5850
}
5951
return new BytesRef(encoded, 0, out.getPosition());
6052
}
6153

6254
static BytesRef encodeDoubleRanges(Set<RangeFieldMapper.Range> ranges) throws IOException {
6355
List<RangeFieldMapper.Range> sortedRanges = new ArrayList<>(ranges);
64-
sortedRanges.sort((r1, r2) -> {
65-
double r1From = ((Number) r1.from).doubleValue();
66-
double r2From = ((Number) r2.from).doubleValue();
67-
int cmp = Double.compare(r1From, r2From);
68-
if (cmp != 0) {
69-
return cmp;
70-
} else {
71-
double r1To = ((Number) r1.from).doubleValue();
72-
double r2To = ((Number) r2.from).doubleValue();
73-
return Double.compare(r1To, r2To);
74-
}
75-
});
56+
Comparator<RangeFieldMapper.Range> fromComparator = Comparator.comparingDouble(range -> ((Number) range.from).doubleValue());
57+
Comparator<RangeFieldMapper.Range> toComparator = Comparator.comparingDouble(range -> ((Number) range.to).doubleValue());
58+
sortedRanges.sort(fromComparator.thenComparing(toComparator));
7659

77-
final byte[] encoded = new byte[5 + ((5 + 9) * 2) * sortedRanges.size()];
60+
final byte[] encoded = new byte[5 + (8 * 2) * sortedRanges.size()];
7861
ByteArrayDataOutput out = new ByteArrayDataOutput(encoded);
7962
out.writeVInt(sortedRanges.size());
8063
for (RangeFieldMapper.Range range : sortedRanges) {
81-
byte[] encodedFrom = BinaryRangeUtil.encode(((Number) range.from).doubleValue());
82-
out.writeVInt(encodedFrom.length);
64+
byte[] encodedFrom = encodeDouble(((Number) range.from).doubleValue());
8365
out.writeBytes(encodedFrom, encodedFrom.length);
84-
byte[] encodedTo = BinaryRangeUtil.encode(((Number) range.to).doubleValue());
85-
out.writeVInt(encodedTo.length);
66+
byte[] encodedTo = encodeDouble(((Number) range.to).doubleValue());
8667
out.writeBytes(encodedTo, encodedTo.length);
8768
}
8869
return new BytesRef(encoded, 0, out.getPosition());
8970
}
9071

72+
static BytesRef encodeFloatRanges(Set<RangeFieldMapper.Range> ranges) throws IOException {
73+
List<RangeFieldMapper.Range> sortedRanges = new ArrayList<>(ranges);
74+
Comparator<RangeFieldMapper.Range> fromComparator = Comparator.comparingDouble(range -> ((Number) range.from).floatValue());
75+
Comparator<RangeFieldMapper.Range> toComparator = Comparator.comparingDouble(range -> ((Number) range.to).floatValue());
76+
sortedRanges.sort(fromComparator.thenComparing(toComparator));
77+
78+
final byte[] encoded = new byte[5 + (4 * 2) * sortedRanges.size()];
79+
ByteArrayDataOutput out = new ByteArrayDataOutput(encoded);
80+
out.writeVInt(sortedRanges.size());
81+
for (RangeFieldMapper.Range range : sortedRanges) {
82+
byte[] encodedFrom = encodeFloat(((Number) range.from).floatValue());
83+
out.writeBytes(encodedFrom, encodedFrom.length);
84+
byte[] encodedTo = encodeFloat(((Number) range.to).floatValue());
85+
out.writeBytes(encodedTo, encodedTo.length);
86+
}
87+
return new BytesRef(encoded, 0, out.getPosition());
88+
}
89+
90+
static byte[] encodeDouble(double number) {
91+
byte[] encoded = new byte[8];
92+
NumericUtils.longToSortableBytes(NumericUtils.doubleToSortableLong(number), encoded, 0);
93+
return encoded;
94+
}
95+
96+
static byte[] encodeFloat(float number) {
97+
byte[] encoded = new byte[4];
98+
NumericUtils.intToSortableBytes(NumericUtils.floatToSortableInt(number), encoded, 0);
99+
return encoded;
100+
}
101+
91102
/**
92103
* Encodes the specified number of type long in a variable-length byte format.
93104
* The byte format preserves ordering, which means the returned byte array can be used for comparing as is.
105+
* The first bit stores the sign and the 4 subsequent bits encode the number of bytes that are used to
106+
* represent the long value, in addition to the first one.
94107
*/
95-
static byte[] encode(long number) {
108+
static byte[] encodeLong(long number) {
96109
int sign = 1; // means positive
97110
if (number < 0) {
98111
number = -1 - number;
@@ -101,46 +114,48 @@ static byte[] encode(long number) {
101114
return encode(number, sign);
102115
}
103116

104-
/**
105-
* Encodes the specified number of type double in a variable-length byte format.
106-
* The byte format preserves ordering, which means the returned byte array can be used for comparing as is.
107-
*/
108-
static byte[] encode(double number) {
109-
long l;
110-
int sign;
111-
if (number < 0.0) {
112-
l = Double.doubleToRawLongBits(-0d - number);
113-
sign = 0;
114-
} else {
115-
l = Double.doubleToRawLongBits(number);
116-
sign = 1; // means positive
117-
}
118-
return encode(l, sign);
119-
}
120-
121117
private static byte[] encode(long l, int sign) {
122118
assert l >= 0;
123-
int bits = 64 - Long.numberOfLeadingZeros(l);
124119

125-
int numBytes = (bits + 7) / 8; // between 0 and 8
126-
byte[] encoded = new byte[1 + numBytes];
127-
// encode the sign first to make sure positive values compare greater than negative values
128-
// and then the number of bytes, to make sure that large values compare greater than low values
129-
if (sign > 0) {
130-
encoded[0] = (byte) ((sign << 4) | numBytes);
131-
} else {
132-
encoded[0] = (byte) ((sign << 4) | (8 - numBytes));
120+
// the header is formed of:
121+
// - 1 bit for the sign
122+
// - 4 bits for the number of additional bytes
123+
// - up to 3 bits of the value
124+
// additional bytes are data bytes
125+
126+
int numBits = 64 - Long.numberOfLeadingZeros(l);
127+
int numAdditionalBytes = (numBits + 7 - 3) / 8;
128+
129+
byte[] encoded = new byte[1 + numAdditionalBytes];
130+
131+
// write data bytes
132+
int i = encoded.length;
133+
while (numBits > 0) {
134+
int index = --i;
135+
assert index > 0 || numBits <= 3; // byte 0 can't encode more than 3 bits
136+
encoded[index] = (byte) l;
137+
l >>>= 8;
138+
numBits -= 8;
133139
}
134-
for (int b = 0; b < numBytes; ++b) {
135-
if (sign == 1) {
136-
encoded[encoded.length - 1 - b] = (byte) (l >>> (8 * b));
137-
} else if (sign == 0) {
138-
encoded[encoded.length - 1 - b] = (byte) (0xFF - ((l >>> (8 * b)) & 0xFF));
139-
} else {
140-
throw new AssertionError();
140+
assert Byte.toUnsignedInt(encoded[0]) <= 0x07;
141+
assert encoded.length == 1 || encoded[0] != 0 || Byte.toUnsignedInt(encoded[1]) > 0x07;
142+
143+
if (sign == 0) {
144+
// reverse the order
145+
for (int j = 0; j < encoded.length; ++j) {
146+
encoded[j] = (byte) ~Byte.toUnsignedInt(encoded[j]);
141147
}
148+
// the first byte only uses 3 bits, we need the 5 upper bits for the header
149+
encoded[0] &= 0x07;
150+
}
151+
152+
// write the header
153+
encoded[0] |= sign << 7;
154+
if (sign > 0) {
155+
encoded[0] |= numAdditionalBytes << 3;
156+
} else {
157+
encoded[0] |= (15 - numAdditionalBytes) << 3;
142158
}
143159
return encoded;
144160
}
145-
146161
}

0 commit comments

Comments
 (0)