67 | 67 | import org.apache.lucene.search.spans.SpanNearQuery; |
68 | 68 | import org.apache.lucene.search.spans.SpanTermQuery; |
69 | 69 | import org.apache.lucene.store.Directory; |
70 | | -import org.apache.lucene.store.IOContext; |
71 | | -import org.apache.lucene.store.IndexInput; |
72 | | -import org.apache.lucene.store.IndexOutput; |
73 | 70 | import org.apache.lucene.util.BytesRef; |
74 | 71 | import org.apache.lucene.util.FixedBitSet; |
75 | | -import org.apache.lucene.util.bkd.BKDReader; |
76 | | -import org.apache.lucene.util.bkd.BKDWriter; |
77 | 72 | import org.elasticsearch.action.search.SearchTask; |
78 | 73 | import org.elasticsearch.common.settings.Settings; |
79 | 74 | import org.elasticsearch.index.mapper.DateFieldMapper; |
|
96 | 91 | import java.util.Collections; |
97 | 92 | import java.util.List; |
98 | 93 |
99 | | -import static org.elasticsearch.search.query.QueryPhase.estimateMedianValue; |
100 | | -import static org.elasticsearch.search.query.QueryPhase.estimatePointCount; |
| 94 | +import static org.elasticsearch.search.query.QueryPhase.indexFieldHasDuplicateData; |
101 | 95 | import static org.elasticsearch.search.query.TopDocsCollectorContext.hasInfMaxScore; |
102 | 96 | import static org.hamcrest.Matchers.anyOf; |
103 | 97 | import static org.hamcrest.Matchers.equalTo; |
104 | 98 | import static org.hamcrest.Matchers.greaterThanOrEqualTo; |
105 | 99 | import static org.hamcrest.Matchers.instanceOf; |
106 | | -import static org.hamcrest.Matchers.lessThan; |
107 | | -import static org.hamcrest.Matchers.lessThanOrEqualTo; |
108 | 100 | import static org.mockito.Mockito.mock; |
109 | 101 | import static org.mockito.Mockito.when; |
110 | 102 | import static org.mockito.Mockito.spy; |
@@ -712,66 +704,28 @@ public void testNumericLongOrDateSortOptimization() throws Exception { |
712 | 704 | } |
713 | 705 |
714 | 706 | public void testIndexHasDuplicateData() throws IOException { |
715 | | - int valuesCount = 5000; |
716 | | - int maxPointsInLeafNode = 40; |
717 | | - long expectedMedianCount = (long)(valuesCount * 0.6); |
718 | | - long expectedMedianValue = randomLongBetween(-10000000L, 10000000L); |
719 | | - |
720 | | - try (Directory dir = newDirectory()) { |
721 | | - BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount); |
722 | | - byte[] longBytes = new byte[8]; |
723 | | - for (int docId = 0; docId < valuesCount; docId++) { |
724 | | - long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L); |
725 | | - LongPoint.encodeDimension(value, longBytes, 0); |
726 | | - w.add(longBytes, docId); |
727 | | - } |
728 | | - long indexFP; |
729 | | - try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) { |
730 | | - indexFP = w.finish(out); |
731 | | - } |
732 | | - try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) { |
733 | | - in.seek(indexFP); |
734 | | - BKDReader r = new BKDReader(in); |
735 | | - long medianValue = estimateMedianValue(r); |
736 | | - long medianCount = estimatePointCount(r, medianValue, medianValue); |
737 | | - |
738 | | - assertEquals(expectedMedianValue, medianValue); |
739 | | - assertThat(medianCount, greaterThanOrEqualTo((long) (valuesCount/2))); //assert that Index has duplicate data |
740 | | - assertThat(medianCount, greaterThanOrEqualTo((long) (0.75 * expectedMedianCount))); |
741 | | - assertThat(medianCount, lessThanOrEqualTo((long) (1.25 * expectedMedianCount))); |
742 | | - } |
743 | | - } |
744 | | - } |
745 | | - |
746 | | - public void testIndexHasNotDuplicateData() throws IOException { |
747 | | - int valuesCount = 5000; |
748 | | - int maxPointsInLeafNode = 40; |
749 | | - long expectedMedianCount = (long)(valuesCount * 0.35); |
750 | | - long expectedMedianValue = randomLongBetween(-10000000L, 10000000L); |
751 | | - |
752 | | - try (Directory dir = newDirectory()) { |
753 | | - BKDWriter w = new BKDWriter(valuesCount, dir, "tmp", 1, 1, 8, maxPointsInLeafNode, 1, valuesCount); |
754 | | - byte[] longBytes = new byte[8]; |
755 | | - for (int docId = 0; docId < valuesCount; docId++) { |
756 | | - long value = docId < expectedMedianCount ? expectedMedianValue : randomLongBetween(-10000000L, 10000000L); |
757 | | - LongPoint.encodeDimension(value, longBytes, 0); |
758 | | - w.add(longBytes, docId); |
759 | | - } |
760 | | - long indexFP; |
761 | | - try (IndexOutput out = dir.createOutput("bkd", IOContext.DEFAULT)) { |
762 | | - indexFP = w.finish(out); |
763 | | - } |
764 | | - try (IndexInput in = dir.openInput("bkd", IOContext.DEFAULT)) { |
765 | | - in.seek(indexFP); |
766 | | - BKDReader r = new BKDReader(in); |
767 | | - long medianValue = estimateMedianValue(r); |
768 | | - long medianCount = estimatePointCount(r, medianValue, medianValue); |
769 | | - |
770 | | - // can't make any assertion about the values of medianValue and medianCount |
771 | | - // as BKDReader::estimatePointCount can be really off for non-duplicate data |
772 | | - assertThat(medianCount, lessThan((long) (valuesCount/2))); //assert that Index does NOT have duplicate data |
773 | | - } |
| 707 | + int docsCount = 7000; |
| 708 | + float duplicateRatio1 = 0.6f; |
| 709 | + float duplicateRatio2 = 0.35f; |
| 710 | + long duplicateValue = randomLongBetween(-10000000L, 10000000L); |
| 711 | + Directory dir = newDirectory(); |
| 712 | + IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(null)); |
| 713 | + for (int docId = 0; docId < docsCount; docId++) { |
| 714 | + Document doc = new Document(); |
| 715 | + long value = (randomFloat() < duplicateRatio1) ? duplicateValue : randomLongBetween(-10000000L, 10000000L); |
| 716 | + long value2 = (randomFloat() < duplicateRatio2) ? duplicateValue : randomLongBetween(-10000000L, 10000000L); |
| 717 | + doc.add(new LongPoint("duplicateField", value)); |
| 718 | + doc.add(new LongPoint("notDuplicateField", value2)); |
| 719 | + writer.addDocument(doc); |
774 | 720 | } |
| 721 | + writer.close(); |
| 722 | + final IndexReader reader = DirectoryReader.open(dir); |
| 723 | + boolean hasDuplicateData = indexFieldHasDuplicateData(reader, "duplicateField"); |
| 724 | + boolean hasDuplicateData2 = indexFieldHasDuplicateData(reader, "notDuplicateField"); |
| 725 | + reader.close(); |
| 726 | + dir.close(); |
| 727 | + assertTrue(hasDuplicateData); |
| 728 | + assertFalse(hasDuplicateData2); |
775 | 729 | } |
776 | 730 |
777 | 731 | public void testMaxScoreQueryVisitor() { |
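For context on what the rewritten test asserts: the removed BKD-based tests treated a field as having "duplicate data" when the estimated median value accounted for at least half of all indexed points. A naive, exact version of that predicate can be written against Lucene's public PointValues API, as sketched below. This is only an illustration: the actual QueryPhase.indexFieldHasDuplicateData helper is package-private in org.elasticsearch.search.query and presumably works from index statistics rather than the exhaustive scan used here, so the class and method names in this sketch are assumptions.

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.document.LongPoint;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PointValues;

// Hypothetical stand-in for QueryPhase.indexFieldHasDuplicateData, which is
// package-private and not reproduced here.
final class DuplicateDataSketch {

    // Exact check: does a single long value account for more than half of all
    // points indexed for `field`? The removed tests used the same ">= 50%"
    // notion of duplicate data, but estimated it from the BKD tree instead of
    // visiting every point as this sketch does.
    static boolean fieldHasDuplicateData(IndexReader reader, String field) throws IOException {
        Map<Long, Long> counts = new HashMap<>();
        long totalPoints = 0;
        for (LeafReaderContext ctx : reader.leaves()) {
            PointValues values = ctx.reader().getPointValues(field);
            if (values == null) {
                continue; // this segment has no points for the field
            }
            totalPoints += values.size();
            values.intersect(new PointValues.IntersectVisitor() {
                @Override
                public void visit(int docID) {
                    // Not reached: compare() below always reports CELL_CROSSES_QUERY,
                    // so Lucene hands us every point together with its packed value.
                }

                @Override
                public void visit(int docID, byte[] packedValue) {
                    counts.merge(LongPoint.decodeDimension(packedValue, 0), 1L, Long::sum);
                }

                @Override
                public PointValues.Relation compare(byte[] minPackedValue, byte[] maxPackedValue) {
                    return PointValues.Relation.CELL_CROSSES_QUERY;
                }
            });
        }
        long mostFrequent = counts.values().stream().mapToLong(Long::longValue).max().orElse(0L);
        return totalPoints > 0 && mostFrequent > totalPoints / 2;
    }
}

With an index built as in the new test (7000 docs, roughly 60% of "duplicateField" values sharing one long versus roughly 35% for "notDuplicateField"), this predicate returns true for the first field and false for the second in expectation, mirroring the assertTrue/assertFalse pair at the end of the diff.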