Skip to content

Commit 40bb166

Browse files
authored
Index ids in binary form. (#25352)
Indexing ids in binary form should help with indexing speed since we would have to compare fewer bytes upon sorting, should help with memory usage of the live version map since keys will be shorter, and might help with disk usage depending on how efficient the terms dictionary is at compressing terms. Since we can only expect base64 ids in the auto-generated case, this PR tries to use an encoding that makes the binary id equal to the base64-decoded id in the majority of cases (253 out of 256). It also specializes numeric ids, since this seems to be common when content that is stored in Elasticsearch comes from another database that uses e.g. auto-increment ids. Another option could be to require base64 ids all the time. It would make things simpler, but I'm not sure users would welcome this requirement. This PR should bring some benefits, but I expect it to be mostly useful when coupled with something like #24615. Closes #18154
1 parent 17a587e commit 40bb166

File tree

19 files changed

+513
-129
lines changed

19 files changed

+513
-129
lines changed

core/src/main/java/org/elasticsearch/index/fielddata/UidIndexFieldData.java

Lines changed: 20 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -19,17 +19,13 @@
1919

2020
package org.elasticsearch.index.fielddata;
2121

22-
import org.apache.lucene.index.DirectoryReader;
2322
import org.apache.lucene.index.LeafReaderContext;
24-
import org.apache.lucene.index.MultiDocValues;
25-
import org.apache.lucene.index.SortedSetDocValues;
2623
import org.apache.lucene.search.SortField;
2724
import org.apache.lucene.util.BytesRef;
2825
import org.apache.lucene.util.BytesRefBuilder;
2926
import org.elasticsearch.index.Index;
3027
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
3128
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
32-
import org.elasticsearch.index.fielddata.plain.AbstractAtomicOrdinalsFieldData;
3329
import org.elasticsearch.index.mapper.UidFieldMapper;
3430
import org.elasticsearch.search.MultiValueMode;
3531

@@ -42,16 +38,14 @@
4238
* already using: this is just a view.
4339
* TODO: Remove fielddata access on _uid and _id, or add doc values to _id.
4440
*/
45-
public final class UidIndexFieldData implements IndexOrdinalsFieldData {
41+
public final class UidIndexFieldData implements IndexFieldData<AtomicFieldData> {
4642

4743
private final Index index;
48-
private final String type;
4944
private final BytesRef prefix;
50-
private final IndexOrdinalsFieldData idFieldData;
45+
private final IndexFieldData<?> idFieldData;
5146

52-
public UidIndexFieldData(Index index, String type, IndexOrdinalsFieldData idFieldData) {
47+
public UidIndexFieldData(Index index, String type, IndexFieldData<?> idFieldData) {
5348
this.index = index;
54-
this.type = type;
5549
BytesRefBuilder prefix = new BytesRefBuilder();
5650
prefix.append(new BytesRef(type));
5751
prefix.append((byte) '#');
@@ -76,12 +70,12 @@ public SortField sortField(Object missingValue, MultiValueMode sortMode, Nested
7670
}
7771

7872
@Override
79-
public AtomicOrdinalsFieldData load(LeafReaderContext context) {
73+
public AtomicFieldData load(LeafReaderContext context) {
8074
return new UidAtomicFieldData(prefix, idFieldData.load(context));
8175
}
8276

8377
@Override
84-
public AtomicOrdinalsFieldData loadDirect(LeafReaderContext context) throws Exception {
78+
public AtomicFieldData loadDirect(LeafReaderContext context) throws Exception {
8579
return new UidAtomicFieldData(prefix, idFieldData.loadDirect(context));
8680
}
8781

@@ -90,39 +84,19 @@ public void clear() {
9084
idFieldData.clear();
9185
}
9286

93-
@Override
94-
public IndexOrdinalsFieldData loadGlobal(DirectoryReader indexReader) {
95-
return new UidIndexFieldData(index, type, idFieldData.loadGlobal(indexReader));
96-
}
97-
98-
@Override
99-
public IndexOrdinalsFieldData localGlobalDirect(DirectoryReader indexReader) throws Exception {
100-
return new UidIndexFieldData(index, type, idFieldData.localGlobalDirect(indexReader));
101-
}
102-
103-
@Override
104-
public MultiDocValues.OrdinalMap getOrdinalMap() {
105-
return idFieldData.getOrdinalMap();
106-
}
107-
108-
static final class UidAtomicFieldData implements AtomicOrdinalsFieldData {
87+
static final class UidAtomicFieldData implements AtomicFieldData {
10988

11089
private final BytesRef prefix;
111-
private final AtomicOrdinalsFieldData idFieldData;
90+
private final AtomicFieldData idFieldData;
11291

113-
UidAtomicFieldData(BytesRef prefix, AtomicOrdinalsFieldData idFieldData) {
92+
UidAtomicFieldData(BytesRef prefix, AtomicFieldData idFieldData) {
11493
this.prefix = prefix;
11594
this.idFieldData = idFieldData;
11695
}
11796

11897
@Override
11998
public ScriptDocValues<?> getScriptValues() {
120-
return AbstractAtomicOrdinalsFieldData.DEFAULT_SCRIPT_FUNCTION.apply(getOrdinalsValues());
121-
}
122-
123-
@Override
124-
public SortedBinaryDocValues getBytesValues() {
125-
return FieldData.toString(getOrdinalsValues());
99+
return new ScriptDocValues.Strings(getBytesValues());
126100
}
127101

128102
@Override
@@ -136,54 +110,30 @@ public void close() {
136110
}
137111

138112
@Override
139-
public SortedSetDocValues getOrdinalsValues() {
140-
SortedSetDocValues idValues = idFieldData.getOrdinalsValues();
141-
return new SortedSetDocValues() {
113+
public SortedBinaryDocValues getBytesValues() {
114+
SortedBinaryDocValues idValues = idFieldData.getBytesValues();
115+
return new SortedBinaryDocValues() {
142116

143117
private final BytesRefBuilder scratch = new BytesRefBuilder();
144118

145119
@Override
146-
public int nextDoc() throws IOException {
147-
return idValues.nextDoc();
120+
public boolean advanceExact(int doc) throws IOException {
121+
return idValues.advanceExact(doc);
148122
}
149123

150124
@Override
151-
public int docID() {
152-
return idValues.docID();
125+
public int docValueCount() {
126+
return idValues.docValueCount();
153127
}
154128

155129
@Override
156-
public long cost() {
157-
return idValues.cost();
158-
}
159-
160-
@Override
161-
public int advance(int target) throws IOException {
162-
return idValues.advance(target);
163-
}
164-
165-
@Override
166-
public boolean advanceExact(int target) throws IOException {
167-
return idValues.advanceExact(target);
168-
}
169-
170-
@Override
171-
public long nextOrd() throws IOException {
172-
return idValues.nextOrd();
173-
}
174-
175-
@Override
176-
public BytesRef lookupOrd(long ord) throws IOException {
177-
scratch.setLength(0);
178-
scratch.append(prefix);
179-
scratch.append(idValues.lookupOrd(ord));
130+
public BytesRef nextValue() throws IOException {
131+
BytesRef nextID = idValues.nextValue();
132+
scratch.copyBytes(prefix);
133+
scratch.append(nextID);
180134
return scratch.get();
181135
}
182136

183-
@Override
184-
public long getValueCount() {
185-
return idValues.getValueCount();
186-
}
187137
};
188138
}
189139

core/src/main/java/org/elasticsearch/index/fieldvisitor/FieldsVisitor.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ public void postProcess(MapperService mapperService) {
105105
public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
106106
if (SourceFieldMapper.NAME.equals(fieldInfo.name)) {
107107
source = new BytesArray(value);
108+
} else if (IdFieldMapper.NAME.equals(fieldInfo.name)) {
109+
id = Uid.decodeId(value);
108110
} else {
109111
addValue(fieldInfo.name, new BytesRef(value));
110112
}
@@ -114,10 +116,14 @@ public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
114116
public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
115117
final String value = new String(bytes, StandardCharsets.UTF_8);
116118
if (UidFieldMapper.NAME.equals(fieldInfo.name)) {
119+
// 5.x-only
120+
// TODO: Remove when we are on 7.x
117121
Uid uid = Uid.createUid(value);
118122
type = uid.type();
119123
id = uid.id();
120124
} else if (IdFieldMapper.NAME.equals(fieldInfo.name)) {
125+
// only applies to 5.x indices that have single_type = true
126+
// TODO: Remove when we are on 7.x
121127
id = value;
122128
} else {
123129
addValue(fieldInfo.name, value);

core/src/main/java/org/elasticsearch/index/mapper/DocumentParser.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -436,7 +436,13 @@ private static ParseContext nestedContext(ParseContext context, ObjectMapper map
436436
if (idField != null) {
437437
// We just need to store the id as indexed field, so that IndexWriter#deleteDocuments(term) can then
438438
// delete it when the root document is deleted too.
439-
nestedDoc.add(new Field(IdFieldMapper.NAME, idField.stringValue(), IdFieldMapper.Defaults.NESTED_FIELD_TYPE));
439+
if (idField.stringValue() != null) {
440+
// backward compat with 5.x
441+
// TODO: Remove on 7.0
442+
nestedDoc.add(new Field(IdFieldMapper.NAME, idField.stringValue(), IdFieldMapper.Defaults.NESTED_FIELD_TYPE));
443+
} else {
444+
nestedDoc.add(new Field(IdFieldMapper.NAME, idField.binaryValue(), IdFieldMapper.Defaults.NESTED_FIELD_TYPE));
445+
}
440446
} else {
441447
throw new IllegalStateException("The root document of a nested document should have an id field");
442448
}

core/src/main/java/org/elasticsearch/index/mapper/IdFieldMapper.java

Lines changed: 134 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,28 @@
2222
import org.apache.lucene.document.Field;
2323
import org.apache.lucene.index.IndexOptions;
2424
import org.apache.lucene.index.IndexableField;
25+
import org.apache.lucene.index.LeafReaderContext;
2526
import org.apache.lucene.search.Query;
27+
import org.apache.lucene.search.SortField;
2628
import org.apache.lucene.search.TermInSetQuery;
27-
import org.elasticsearch.common.Nullable;
29+
import org.apache.lucene.util.BytesRef;
30+
import org.elasticsearch.Version;
31+
import org.elasticsearch.common.lucene.BytesRefs;
2832
import org.elasticsearch.common.lucene.Lucene;
2933
import org.elasticsearch.common.xcontent.XContentBuilder;
34+
import org.elasticsearch.index.Index;
3035
import org.elasticsearch.index.IndexSettings;
36+
import org.elasticsearch.index.fielddata.AtomicFieldData;
3137
import org.elasticsearch.index.fielddata.IndexFieldData;
38+
import org.elasticsearch.index.fielddata.IndexFieldData.XFieldComparatorSource.Nested;
39+
import org.elasticsearch.index.fielddata.fieldcomparator.BytesRefFieldComparatorSource;
40+
import org.elasticsearch.index.fielddata.IndexFieldDataCache;
41+
import org.elasticsearch.index.fielddata.ScriptDocValues;
42+
import org.elasticsearch.index.fielddata.SortedBinaryDocValues;
3243
import org.elasticsearch.index.fielddata.plain.PagedBytesIndexFieldData;
3344
import org.elasticsearch.index.query.QueryShardContext;
45+
import org.elasticsearch.indices.breaker.CircuitBreakerService;
46+
import org.elasticsearch.search.MultiValueMode;
3447

3548
import java.io.IOException;
3649
import java.util.Arrays;
@@ -109,15 +122,31 @@ public boolean isSearchable() {
109122
}
110123

111124
@Override
112-
public Query termQuery(Object value, @Nullable QueryShardContext context) {
125+
public Query termQuery(Object value, QueryShardContext context) {
113126
return termsQuery(Arrays.asList(value), context);
114127
}
115128

116129
@Override
117-
public Query termsQuery(List<?> values, @Nullable QueryShardContext context) {
130+
public Query termsQuery(List<?> values, QueryShardContext context) {
118131
if (indexOptions() != IndexOptions.NONE) {
119-
// 6.x index, _id is indexed
120-
return super.termsQuery(values, context);
132+
failIfNotIndexed();
133+
BytesRef[] bytesRefs = new BytesRef[values.size()];
134+
final boolean is5xIndex = context.indexVersionCreated().before(Version.V_6_0_0_alpha3);
135+
for (int i = 0; i < bytesRefs.length; i++) {
136+
BytesRef id;
137+
if (is5xIndex) {
138+
// 5.x index with index.mapping.single_type = true
139+
id = BytesRefs.toBytesRef(values.get(i));
140+
} else {
141+
Object idObject = values.get(i);
142+
if (idObject instanceof BytesRef) {
143+
idObject = ((BytesRef) idObject).utf8ToString();
144+
}
145+
id = Uid.encodeId(idObject.toString());
146+
}
147+
bytesRefs[i] = id;
148+
}
149+
return new TermInSetQuery(name(), bytesRefs);
121150
}
122151
// 5.x index, _uid is indexed
123152
return new TermInSetQuery(UidFieldMapper.NAME, Uid.createUidsForTypesAndIds(context.queryTypes(), values));
@@ -128,13 +157,106 @@ public IndexFieldData.Builder fielddataBuilder() {
128157
if (indexOptions() == IndexOptions.NONE) {
129158
throw new IllegalArgumentException("Fielddata access on the _uid field is disallowed");
130159
}
131-
return new PagedBytesIndexFieldData.Builder(
160+
final IndexFieldData.Builder fieldDataBuilder = new PagedBytesIndexFieldData.Builder(
132161
TextFieldMapper.Defaults.FIELDDATA_MIN_FREQUENCY,
133162
TextFieldMapper.Defaults.FIELDDATA_MAX_FREQUENCY,
134163
TextFieldMapper.Defaults.FIELDDATA_MIN_SEGMENT_SIZE);
164+
return new IndexFieldData.Builder() {
165+
@Override
166+
public IndexFieldData<?> build(IndexSettings indexSettings, MappedFieldType fieldType, IndexFieldDataCache cache,
167+
CircuitBreakerService breakerService, MapperService mapperService) {
168+
final IndexFieldData<?> fieldData = fieldDataBuilder.build(indexSettings, fieldType, cache, breakerService, mapperService);
169+
if (indexSettings.getIndexVersionCreated().before(Version.V_6_0_0_alpha3)) {
170+
// ids were indexed as utf-8
171+
return fieldData;
172+
}
173+
return new IndexFieldData<AtomicFieldData>() {
174+
175+
@Override
176+
public Index index() {
177+
return fieldData.index();
178+
}
179+
180+
@Override
181+
public String getFieldName() {
182+
return fieldData.getFieldName();
183+
}
184+
185+
@Override
186+
public AtomicFieldData load(LeafReaderContext context) {
187+
return wrap(fieldData.load(context));
188+
}
189+
190+
@Override
191+
public AtomicFieldData loadDirect(LeafReaderContext context) throws Exception {
192+
return wrap(fieldData.loadDirect(context));
193+
}
194+
195+
@Override
196+
public SortField sortField(Object missingValue, MultiValueMode sortMode, Nested nested, boolean reverse) {
197+
XFieldComparatorSource source = new BytesRefFieldComparatorSource(this, missingValue, sortMode, nested);
198+
return new SortField(getFieldName(), source, reverse);
199+
}
200+
201+
@Override
202+
public void clear() {
203+
fieldData.clear();
204+
}
205+
206+
};
207+
}
208+
};
135209
}
136210
}
137211

212+
private static AtomicFieldData wrap(AtomicFieldData in) {
213+
return new AtomicFieldData() {
214+
215+
@Override
216+
public void close() {
217+
in.close();
218+
}
219+
220+
@Override
221+
public long ramBytesUsed() {
222+
return in.ramBytesUsed();
223+
}
224+
225+
@Override
226+
public ScriptDocValues<?> getScriptValues() {
227+
return new ScriptDocValues.Strings(getBytesValues());
228+
}
229+
230+
@Override
231+
public SortedBinaryDocValues getBytesValues() {
232+
SortedBinaryDocValues inValues = in.getBytesValues();
233+
return new SortedBinaryDocValues() {
234+
235+
@Override
236+
public BytesRef nextValue() throws IOException {
237+
BytesRef encoded = inValues.nextValue();
238+
return new BytesRef(Uid.decodeId(
239+
Arrays.copyOfRange(encoded.bytes, encoded.offset, encoded.offset + encoded.length)));
240+
}
241+
242+
@Override
243+
public int docValueCount() {
244+
final int count = inValues.docValueCount();
245+
// If the count is not 1 then the impl is not correct as the binary representation
246+
// does not preserve order. But id fields only have one value per doc so we are good.
247+
assert count == 1;
248+
return inValues.docValueCount();
249+
}
250+
251+
@Override
252+
public boolean advanceExact(int doc) throws IOException {
253+
return inValues.advanceExact(doc);
254+
}
255+
};
256+
}
257+
};
258+
}
259+
138260
static MappedFieldType defaultFieldType(IndexSettings indexSettings) {
139261
MappedFieldType defaultFieldType = Defaults.FIELD_TYPE.clone();
140262
if (indexSettings.isSingleType()) {
@@ -166,8 +288,12 @@ public void postParse(ParseContext context) throws IOException {}
166288
@Override
167289
protected void parseCreateField(ParseContext context, List<IndexableField> fields) throws IOException {
168290
if (fieldType.indexOptions() != IndexOptions.NONE || fieldType.stored()) {
169-
Field id = new Field(NAME, context.sourceToParse().id(), fieldType);
170-
fields.add(id);
291+
if (context.mapperService().getIndexSettings().getIndexVersionCreated().onOrAfter(Version.V_6_0_0_alpha3)) {
292+
BytesRef id = Uid.encodeId(context.sourceToParse().id());
293+
fields.add(new Field(NAME, id, fieldType));
294+
} else {
295+
fields.add(new Field(NAME, context.sourceToParse().id(), fieldType));
296+
}
171297
}
172298
}
173299

0 commit comments

Comments
 (0)