
Commit 5870fd7

Rewrite indexFieldHasDuplicateData
Rewrite how duplicate data in the index is detected so that fewer iterations are needed. On every iteration, check whether the number of documents in the current half of the value range is more than half of the documents in the segment.
1 parent 738c785 commit 5870fd7
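
In outline, the rewritten per-segment check repeatedly halves the field's value range and keeps narrowing only as long as the larger half still holds more than half of the segment's documents; if the range shrinks to a single value without that condition failing, the segment is counted as containing duplicate data. A condensed sketch of that loop (the names pointValues, docCount and estimatePointCount refer to the QueryPhase.java diff below):

// Condensed sketch of the per-segment check (see the QueryPhase.java diff below).
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
boolean hasDuplicateData = true;
while (minValue < maxValue && hasDuplicateData) {
    long mid = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // overflow-safe midpoint
    long countLeft = estimatePointCount(pointValues, minValue, mid);
    long countRight = estimatePointCount(pointValues, mid + 1, maxValue);
    if (countLeft >= countRight && countLeft > docCount / 2) {
        maxValue = mid;                // keep narrowing into the heavier half
    } else if (countRight > countLeft && countRight > docCount / 2) {
        minValue = mid + 1;
    } else {
        hasDuplicateData = false;      // neither half holds a majority of the docs
    }
}

Because every step also requires a majority in the chosen half, the loop typically stops after a few estimatePointCount calls instead of always bisecting the full value range down to a single median value as the old estimateMedianValue did.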

File tree

2 files changed: 46 additions, 90 deletions


server/src/main/java/org/elasticsearch/search/query/QueryPhase.java

Lines changed: 24 additions & 22 deletions
@@ -556,14 +556,14 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
         return true;
     }

+
     /**
      * Returns true if more than 50% of data in the index have the same value
      * The evaluation is approximation based on finding the median value and estimating its count
-     * Returns true if the total count of median values is greater or equal to half of the total count of documents
      */
     static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
-        long globalDocCount = 0;
-        long globalMedianCount = 0;
+        long docsOpt = 0; // number of docs in segments that would benefit optimization
+        long docsNoOpt = 0; // number of docs in segments that would NOT benefit optimization, e.g. docs in segments with duplicate data
         for (LeafReaderContext lrc : reader.leaves()) {
             PointValues pointValues = lrc.reader().getPointValues(field);
             if (pointValues == null) continue;
@@ -572,31 +572,33 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
                 continue;
             }
             assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
-            globalDocCount += docCount;
-            long medianValue = estimateMedianValue(pointValues);
-            long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
-            globalMedianCount += medianCount;
-        }
-        return (globalMedianCount >= globalDocCount/2);
-    }

-    static long estimateMedianValue(PointValues pointValues) throws IOException {
-        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
-        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
-        while (minValue < maxValue) {
-            long avgValue = Math.floorDiv(minValue + maxValue, 2);
-            long countLeft = estimatePointCount(pointValues, minValue, avgValue);
-            long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
-            if (countLeft >= countRight) {
-                maxValue = avgValue;
+            int duplicateDocCount = docCount/2; // expected doc count of duplicate data
+            long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+            long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
+            boolean hasDuplicateData = true;
+            while ((minValue < maxValue) && hasDuplicateData) {
+                long avgValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
+                long countLeft = estimatePointCount(pointValues, minValue, avgValue);
+                long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+                if ((countLeft >= countRight) && (countLeft > duplicateDocCount)) {
+                    maxValue = avgValue;
+                } else if ((countRight > countLeft) && (countRight > duplicateDocCount)) {
+                    minValue = avgValue + 1;
+                } else {
+                    hasDuplicateData = false;
+                }
+            }
+            if (hasDuplicateData) {
+                docsNoOpt += docCount;
             } else {
-                minValue = avgValue + 1;
+                docsOpt += docCount;
             }
         }
-        return maxValue;
+        return (docsNoOpt > docsOpt);
     }

-    static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+    private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
         LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
         final byte[] maxValueAsBytes = new byte[Long.BYTES];
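
One detail worth calling out in the new loop: the old code computed the midpoint as Math.floorDiv(minValue + maxValue, 2), which can overflow when both bounds are large longs. A minimal standalone illustration (not part of the commit; the class name is made up for the example, and the "safe" form may differ from the exact midpoint by one when both bounds are odd):

// Standalone illustration of why the midpoint is computed as
// Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2) in the new loop.
public class MidpointOverflowDemo {
    public static void main(String[] args) {
        long minValue = Long.MAX_VALUE - 10;
        long maxValue = Long.MAX_VALUE;

        // Old style: minValue + maxValue wraps around, giving a negative "midpoint" (-6 here).
        long naive = Math.floorDiv(minValue + maxValue, 2);

        // New style: divide each bound first; result is Long.MAX_VALUE - 6,
        // within 1 of the true midpoint Long.MAX_VALUE - 5.
        long safe = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2);

        System.out.println(naive + " vs " + safe);
    }
}

An off-by-one midpoint is harmless for this range-splitting check, so the overflow-free form is the simpler choice.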

server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java

Lines changed: 22 additions & 68 deletions
@@ -67,13 +67,8 @@
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.bkd.BKDReader;
-import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -96,15 +91,12 @@
 import java.util.Collections;
 import java.util.List;

-import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
-import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
+import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
 import static org.elasticsearch.search.query.TopDocsCollectorContext.hasInfMaxScore;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.instanceOf;
-import static org.hamcrest.Matchers.lessThan;
-import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 import static org.mockito.Mockito.spy;
@@ -712,66 +704,28 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
     }

     public void testIndexHasDuplicateData() throws IOException {
-        int valuesCount = 5000;
-        int maxPointsInLeafNode = 40;
-        long expectedMedianCount = (long)(valuesCount * 0.6);
-        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
-
-        try (Directory dir = newDirectory()) {
-            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
-            byte[] longBytes = new byte[8];
-            for (int docId = 0; docId < valuesCount; docId++) {
-                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
-                LongPoint.encodeDimension(value, longBytes, 0);
-                w.add(longBytes, docId);
-            }
-            long indexFP;
-            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
-                indexFP = w.finish(out);
-            }
-            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
-                in.seek(indexFP);
-                BKDReader r = new BKDReader(in);
-                long medianValue = estimateMedianValue(r);
-                long medianCount = estimatePointCount(r, medianValue, medianValue);
-
-                assertEquals(expectedMedianValue, medianValue);
-                assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); //assert that Index has duplicate data
-                assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
-                assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
-            }
-        }
-    }
-
-    public void testIndexHasNotDuplicateData() throws IOException {
-        int valuesCount = 5000;
-        int maxPointsInLeafNode = 40;
-        long expectedMedianCount = (long)(valuesCount * 0.35);
-        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
-
-        try (Directory dir = newDirectory()) {
-            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
-            byte[] longBytes = new byte[8];
-            for (int docId = 0; docId < valuesCount; docId++) {
-                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
-                LongPoint.encodeDimension(value, longBytes, 0);
-                w.add(longBytes, docId);
-            }
-            long indexFP;
-            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
-                indexFP = w.finish(out);
-            }
-            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
-                in.seek(indexFP);
-                BKDReader r = new BKDReader(in);
-                long medianValue = estimateMedianValue(r);
-                long medianCount = estimatePointCount(r, medianValue, medianValue);
-
-                // can't make any assertion about the values of medianValue and medianCount
-                // as BKDReader::estimatePointCount can be really off for non-duplicate data
-                assertThat(medianCount, lessThan((long) (valuesCount/2))); //assert that Index does NOT have duplicate data
-            }
+        int docsCount = 7000;
+        float duplicateRatio1 = 0.6f;
+        float duplicateRatio2 = 0.35f;
+        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
+        Directory dir = newDirectory();
+        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
+        for (int docId = 0; docId < docsCount; docId++) {
+            Document doc = new Document();
+            long value = (randomFloat() < duplicateRatio1) ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
+            long value2 = (randomFloat() < duplicateRatio2) ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
+            doc.add(new LongPoint("duplicateField", value));
+            doc.add(new LongPoint("notDuplicateField", value2));
+            writer.addDocument(doc);
         }
+        writer.close();
+        final IndexReader reader = DirectoryReader.open(dir);
+        boolean hasDuplicateData = indexFieldHasDuplicateData(reader, "duplicateField");
+        boolean hasDuplicateData2 = indexFieldHasDuplicateData(reader, "notDuplicateField");
+        reader.close();
+        dir.close();
+        assertTrue(hasDuplicateData);
+        assertFalse(hasDuplicateData2);
     }

     public void testMaxScoreQueryVisitor() {
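
For intuition on why the rewritten test expects duplicateField to be flagged and notDuplicateField not to be (a rough back-of-the-envelope, assuming the BKD point-count estimates are reasonably accurate and ignoring how the docs split across segments):

// Rough expected counts for the test above; the real counts are random, so these are approximate.
int docsCount = 7000;
long duplicates1 = (long) (docsCount * 0.6f);  // ~4200 docs share duplicateValue: 4200 > 7000/2 -> duplicate data
long duplicates2 = (long) (docsCount * 0.35f); // ~2450 docs share duplicateValue: 2450 < 7000/2 -> no duplicate data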
