
Commit 5870fd7

Rewrite indexFieldHasDuplicateData
Rewrite how duplicate data in the index is detected so that fewer iterations are needed. On every iteration, check whether the number of documents in the current half of the value range is more than half of the documents in the segment.
1 parent 738c785 commit 5870fd7
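
In outline, the rewritten per-segment check repeatedly halves the field's value range and keeps narrowing only as long as the larger half still holds more than half of the segment's documents; if the range shrinks to a single value without that condition failing, the segment is counted as containing duplicate data. A condensed sketch of that loop (the names pointValues, docCount and estimatePointCount refer to the QueryPhase.java diff below):

// Condensed sketch of the per-segment check (see the QueryPhase.java diff below).
long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
boolean hasDuplicateData = true;
while (minValue < maxValue && hasDuplicateData) {
    long mid = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // overflow-safe midpoint
    long countLeft = estimatePointCount(pointValues, minValue, mid);
    long countRight = estimatePointCount(pointValues, mid + 1, maxValue);
    if (countLeft >= countRight && countLeft > docCount / 2) {
        maxValue = mid;                // keep narrowing into the heavier half
    } else if (countRight > countLeft && countRight > docCount / 2) {
        minValue = mid + 1;
    } else {
        hasDuplicateData = false;      // neither half holds a majority of the docs
    }
}

Because every step also requires a majority in the chosen half, the loop typically stops after a few estimatePointCount calls instead of always bisecting the full value range down to a single median value as the old estimateMedianValue did.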

File tree

2 files changed: 46 additions, 90 deletions


server/src/main/java/org/elasticsearch/search/query/QueryPhase.java

Lines changed: 24 additions & 22 deletions
@@ -556,14 +556,14 @@ private static boolean canEarlyTerminate(IndexReader reader, SortAndFormats sort
         return true;
     }

+
     /**
      * Returns true if more than 50% of data in the index have the same value
      * The evaluation is approximation based on finding the median value and estimating its count
-     * Returns true if the total count of median values is greater or equal to half of the total count of documents
      */
     static boolean indexFieldHasDuplicateData(IndexReader reader, String field) throws IOException {
-        long globalDocCount = 0;
-        long globalMedianCount = 0;
+        long docsOpt = 0; // number of docs in segments that would benefit optimization
+        long docsNoOpt = 0; // number of docs in segments that would NOT benefit optimization, e.g. docs in segments with duplicate data
         for (LeafReaderContext lrc : reader.leaves()) {
             PointValues pointValues = lrc.reader().getPointValues(field);
             if (pointValues == null) continue;
@@ -572,31 +572,33 @@ static boolean indexFieldHasDuplicateData(IndexReader reader, String field) thro
                 continue;
             }
             assert(pointValues.size() == docCount); // TODO: modify the code to handle multiple values
-            globalDocCount += docCount;
-            long medianValue = estimateMedianValue(pointValues);
-            long medianCount = estimatePointCount(pointValues, medianValue, medianValue);
-            globalMedianCount += medianCount;
-        }
-        return (globalMedianCount >= globalDocCount/2);
-    }

-    static long estimateMedianValue(PointValues pointValues) throws IOException {
-        long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
-        long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
-        while (minValue < maxValue) {
-            long avgValue = Math.floorDiv(minValue + maxValue, 2);
-            long countLeft = estimatePointCount(pointValues, minValue, avgValue);
-            long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
-            if (countLeft >= countRight) {
-                maxValue = avgValue;
+            int duplicateDocCount = docCount/2; // expected doc count of duplicate data
+            long minValue = LongPoint.decodeDimension(pointValues.getMinPackedValue(), 0);
+            long maxValue = LongPoint.decodeDimension(pointValues.getMaxPackedValue(), 0);
+            boolean hasDuplicateData = true;
+            while ((minValue < maxValue) && hasDuplicateData) {
+                long avgValue = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2); // to avoid overflow first divide each value by 2
+                long countLeft = estimatePointCount(pointValues, minValue, avgValue);
+                long countRight = estimatePointCount(pointValues, avgValue + 1, maxValue);
+                if ((countLeft >= countRight) && (countLeft > duplicateDocCount)) {
+                    maxValue = avgValue;
+                } else if ((countRight > countLeft) && (countRight > duplicateDocCount)) {
+                    minValue = avgValue + 1;
+                } else {
+                    hasDuplicateData = false;
+                }
+            }
+            if (hasDuplicateData) {
+                docsNoOpt += docCount;
             } else {
-                minValue = avgValue + 1;
+                docsOpt += docCount;
             }
         }
-        return maxValue;
+        return (docsNoOpt > docsOpt);
     }

-    static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
+    private static long estimatePointCount(PointValues pointValues, long minValue, long maxValue) {
         final byte[] minValueAsBytes = new byte[Long.BYTES];
         LongPoint.encodeDimension(minValue, minValueAsBytes, 0);
         final byte[] maxValueAsBytes = new byte[Long.BYTES];
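
One detail worth calling out in the new loop: the old code computed the midpoint as Math.floorDiv(minValue + maxValue, 2), which can overflow when both bounds are large longs. A minimal standalone illustration (not part of the commit; the class name is made up for the example, and the "safe" form may differ from the exact midpoint by one when both bounds are odd):

// Standalone illustration of why the midpoint is computed as
// Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2) in the new loop.
public class MidpointOverflowDemo {
    public static void main(String[] args) {
        long minValue = Long.MAX_VALUE - 10;
        long maxValue = Long.MAX_VALUE;

        // Old style: minValue + maxValue wraps around, giving a negative "midpoint" (-6 here).
        long naive = Math.floorDiv(minValue + maxValue, 2);

        // New style: divide each bound first; result is Long.MAX_VALUE - 6,
        // within 1 of the true midpoint Long.MAX_VALUE - 5.
        long safe = Math.floorDiv(minValue, 2) + Math.floorDiv(maxValue, 2);

        System.out.println(naive + " vs " + safe);
    }
}

An off-by-one midpoint is harmless for this range-splitting check, so the overflow-free form is the simpler choice.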

server/src/test/java/org/elasticsearch/search/query/QueryPhaseTests.java

Lines changed: 22 additions & 68 deletions
@@ -67,13 +67,8 @@
 import org.apache.lucene.search.spans.SpanNearQuery;
 import org.apache.lucene.search.spans.SpanTermQuery;
 import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IOContext;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.FixedBitSet;
-import org.apache.lucene.util.bkd.BKDReader;
-import org.apache.lucene.util.bkd.BKDWriter;
 import org.elasticsearch.action.search.SearchTask;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.index.mapper.DateFieldMapper;
@@ -96,15 +91,12 @@
 import java.util.Collections;
 import java.util.List;

-import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue;
-import static org.elasticsearch.search.query.QueryPhase.estimatePointCount;
+import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData;
 import static org.elasticsearch.search.query.TopDocsCollectorContext.hasInfMaxScore;
 import static org.hamcrest.Matchers.anyOf;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 import static org.hamcrest.Matchers.instanceOf;
-import static org.hamcrest.Matchers.lessThan;
-import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
 import static org.mockito.Mockito.spy;
@@ -712,66 +704,28 @@ public void testNumericLongOrDateSortOptimization() throws Exception {
     }

     public void testIndexHasDuplicateData() throws IOException {
-        int valuesCount = 5000;
-        int maxPointsInLeafNode = 40;
-        long expectedMedianCount = (long)(valuesCount * 0.6);
-        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
-
-        try (Directory dir = newDirectory()) {
-            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
-            byte[] longBytes = new byte[8];
-            for (int docId = 0; docId < valuesCount; docId++) {
-                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
-                LongPoint.encodeDimension(value, longBytes, 0);
-                w.add(longBytes, docId);
-            }
-            long indexFP;
-            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
-                indexFP = w.finish(out);
-            }
-            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
-                in.seek(indexFP);
-                BKDReader r = new BKDReader(in);
-                long medianValue = estimateMedianValue(r);
-                long medianCount = estimatePointCount(r, medianValue, medianValue);
-
-                assertEquals(expectedMedianValue, medianValue);
-                assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); //assert that Index has duplicate data
-                assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount)));
-                assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount)));
-            }
-        }
-    }
-
-    public void testIndexHasNotDuplicateData() throws IOException {
-        int valuesCount = 5000;
-        int maxPointsInLeafNode = 40;
-        long expectedMedianCount = (long)(valuesCount * 0.35);
-        long expectedMedianValue = randomLongBetween(-10000000L, 10000000L);
-
-        try (Directory dir = newDirectory()) {
-            BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount);
-            byte[] longBytes = new byte[8];
-            for (int docId = 0; docId < valuesCount; docId++) {
-                long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L);
-                LongPoint.encodeDimension(value, longBytes, 0);
-                w.add(longBytes, docId);
-            }
-            long indexFP;
-            try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) {
-                indexFP = w.finish(out);
-            }
-            try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) {
-                in.seek(indexFP);
-                BKDReader r = new BKDReader(in);
-                long medianValue = estimateMedianValue(r);
-                long medianCount = estimatePointCount(r, medianValue, medianValue);
-
-                // can't make any assertion about the values of medianValue and medianCount
-                // as BKDReader::estimatePointCount can be really off for non-duplicate data
-                assertThat(medianCount, lessThan((long) (valuesCount/2))); //assert that Index does NOT have duplicate data
-            }
+        int docsCount = 7000;
+        float duplicateRatio1 = 0.6f;
+        float duplicateRatio2 = 0.35f;
+        long duplicateValue = randomLongBetween(-10000000L, 10000000L);
+        Directory dir = newDirectory();
+        IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null));
+        for (int docId = 0; docId < docsCount; docId++) {
+            Document doc = new Document();
+            long value = (randomFloat() < duplicateRatio1) ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
+            long value2 = (randomFloat() < duplicateRatio2) ? duplicateValue : randomLongBetween(-10000000L, 10000000L);
+            doc.add(new LongPoint("duplicateField", value));
+            doc.add(new LongPoint("notDuplicateField", value2));
+            writer.addDocument(doc);
         }
+        writer.close();
+        final IndexReader reader = DirectoryReader.open(dir);
+        boolean hasDuplicateData = indexFieldHasDuplicateData(reader, "duplicateField");
+        boolean hasDuplicateData2 = indexFieldHasDuplicateData(reader, "notDuplicateField");
+        reader.close();
+        dir.close();
+        assertTrue(hasDuplicateData);
+        assertFalse(hasDuplicateData2);
     }

     public void testMaxScoreQueryVisitor() {
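
For intuition on why the rewritten test expects duplicateField to be flagged and notDuplicateField not to be (a rough back-of-the-envelope, assuming the BKD point-count estimates are reasonably accurate and ignoring how the docs split across segments):

// Rough expected counts for the test above; the real counts are random, so these are approximate.
int docsCount = 7000;
long duplicates1 = (long) (docsCount * 0.6f);  // ~4200 docs share duplicateValue: 4200 > 7000/2 -> duplicate data
long duplicates2 = (long) (docsCount * 0.35f); // ~2450 docs share duplicateValue: 2450 < 7000/2 -> no duplicate data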
