From a9d6c0d276bbb010563668dab8cb18fd9b75db8a Mon Sep 17 00:00:00 2001 From: markharwood Date: Fri, 2 Jun 2017 17:53:40 +0100 Subject: [PATCH 1/3] Aggregations bug: Significant_text fails on arrays of text. The set of previously-seen tokens in a doc was allocated per-JSON-field string value rather than once per JSON document, meaning the number of docs containing a term could be over-counted, leading to exceptions from the checks in significance heuristics. Added a unit test for this scenario. Closes #25029 --- .../SignificantTextAggregator.java | 87 +++++++++---------- .../SignificantTextAggregatorTests.java | 31 +++++++ 2 files changed, 74 insertions(+), 44 deletions(-) diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java index c7539a4ca0216..6ec62df3831ac 100644 --- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java +++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java @@ -113,45 +113,39 @@ public void collect(int doc, long bucket) throws IOException { } } - private void processTokenStream(int doc, long bucket, TokenStream ts, String fieldText) throws IOException{ + private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms, String fieldText) throws IOException{ if (dupSequenceSpotter != null) { ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter); } CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); try { - //Assume tokens will average 5 bytes in length to size number of tokens - BytesRefHash inDocTerms = new BytesRefHash(1+(fieldText.length()/5), context.bigArrays()); - - try{ - while (ts.incrementToken()) { - if (dupSequenceSpotter != null) { - long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes(); - long growth = newTrieSize - lastTrieSize; - // Only update the circuitbreaker after a significant amount of growth - if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) { - addRequestCircuitBreakerBytes(growth); - lastTrieSize = newTrieSize; - } + while (ts.incrementToken()) { + if (dupSequenceSpotter != null) { + long newTrieSize = dupSequenceSpotter.getEstimatedSizeInBytes(); + long growth = newTrieSize - lastTrieSize; + // Only update the circuitbreaker after a significant amount of growth + if (growth > MEMORY_GROWTH_REPORTING_INTERVAL_BYTES) { + addRequestCircuitBreakerBytes(growth); + lastTrieSize = newTrieSize; } - previous.clear(); - previous.copyChars(termAtt); - BytesRef bytes = previous.get(); - if (inDocTerms.add(bytes) >= 0) { - if (includeExclude == null || includeExclude.accept(bytes)) { - long bucketOrdinal = bucketOrds.add(bytes); - if (bucketOrdinal < 0) { // already seen - bucketOrdinal = -1 - bucketOrdinal; - collectExistingBucket(sub, doc, bucketOrdinal); - } else { - collectBucket(sub, doc, bucketOrdinal); - } + } + previous.clear(); + previous.copyChars(termAtt); + BytesRef bytes = previous.get(); + if (inDocTerms.add(bytes) >= 0) { + if (includeExclude == null || includeExclude.accept(bytes)) { + long bucketOrdinal = bucketOrds.add(bytes); + if (bucketOrdinal < 0) { // already seen + bucketOrdinal = -1 - bucketOrdinal; + collectExistingBucket(sub, doc, bucketOrdinal); + } else { + collectBucket(sub, doc, bucketOrdinal); } } } - } finally{ - Releasables.close(inDocTerms); } + } finally{ ts.close(); } @@ -166,23 +160,28 @@ private void collectFromSource(int doc,
long bucket, String indexedFieldName, St SourceLookup sourceLookup = context.lookup().source(); sourceLookup.setSegmentAndDocument(ctx, doc); + BytesRefHash inDocTerms = new BytesRefHash(256, context.bigArrays()); - for (String sourceField : sourceFieldNames) { - List textsToHighlight = sourceLookup.extractRawValues(sourceField); - textsToHighlight = textsToHighlight.stream().map(obj -> { - if (obj instanceof BytesRef) { - return fieldType.valueForDisplay(obj).toString(); - } else { - return obj; - } - }).collect(Collectors.toList()); - - Analyzer analyzer = fieldType.indexAnalyzer(); - for (Object fieldValue : textsToHighlight) { - String fieldText = fieldValue.toString(); - TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText); - processTokenStream(doc, bucket, ts, fieldText); - } + try { + for (String sourceField : sourceFieldNames) { + List textsToHighlight = sourceLookup.extractRawValues(sourceField); + textsToHighlight = textsToHighlight.stream().map(obj -> { + if (obj instanceof BytesRef) { + return fieldType.valueForDisplay(obj).toString(); + } else { + return obj; + } + }).collect(Collectors.toList()); + + Analyzer analyzer = fieldType.indexAnalyzer(); + for (Object fieldValue : textsToHighlight) { + String fieldText = fieldValue.toString(); + TokenStream ts = analyzer.tokenStream(indexedFieldName, fieldText); + processTokenStream(doc, bucket, ts, inDocTerms, fieldText); + } + } + } finally{ + Releasables.close(inDocTerms); } } }; diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java index c8d8b6d59798a..b3f99543fc6df 100644 --- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java +++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java @@ -123,4 +123,35 @@ public void testSignificance() throws IOException { } } } + + /** + * Test documents with arrays of text + */ + public void testSignificanceOnTextArrays() throws IOException { + TextFieldType textFieldType = new TextFieldType(); + textFieldType.setName("text"); + textFieldType.setIndexAnalyzer(new NamedAnalyzer("my_analyzer", AnalyzerScope.GLOBAL, new StandardAnalyzer())); + + IndexWriterConfig indexWriterConfig = newIndexWriterConfig(); + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, indexWriterConfig)) { + for (int i = 0; i < 10; i++) { + Document doc = new Document(); + doc.add(new Field("text", "foo", textFieldType)); + String json ="{ \"text\" : [\"foo\",\"foo\"]}"; + doc.add(new StoredField("_source", new BytesRef(json))); + w.addDocument(doc); + } + + SignificantTextAggregationBuilder sigAgg = new SignificantTextAggregationBuilder("sig_text", "text"); + try (IndexReader reader = DirectoryReader.open(w)) { + assertEquals("test expects a single segment", 1, reader.leaves().size()); + IndexSearcher searcher = new IndexSearcher(reader); + searchAndReduce(searcher, new TermQuery(new Term("text", "foo")), sigAgg, textFieldType); + // No significant results to be found in this test - only checking we don't end up + // with the internal exception discovered in issue https://github.com/elastic/elasticsearch/issues/25029 + } + } + } + + } From 2c982c0d5a0c724c77c2ac2e3f52df7b5ca0e7d2 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 12 Jun 2017 12:01:51 +0100 Subject: [PATCH 2/3] Added 
multi-field test --- .../bucket/significant/SignificantTextAggregatorTests.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java index b3f99543fc6df..8376d8c57a10b 100644 --- a/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java +++ b/core/src/test/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregatorTests.java @@ -137,12 +137,13 @@ public void testSignificanceOnTextArrays() throws IOException { for (int i = 0; i < 10; i++) { Document doc = new Document(); doc.add(new Field("text", "foo", textFieldType)); - String json ="{ \"text\" : [\"foo\",\"foo\"]}"; + String json ="{ \"text\" : [\"foo\",\"foo\"], \"title\" : [\"foo\", \"foo\"]}"; doc.add(new StoredField("_source", new BytesRef(json))); w.addDocument(doc); } SignificantTextAggregationBuilder sigAgg = new SignificantTextAggregationBuilder("sig_text", "text"); + sigAgg.sourceFieldNames(Arrays.asList(new String [] {"title", "text"})); try (IndexReader reader = DirectoryReader.open(w)) { assertEquals("test expects a single segment", 1, reader.leaves().size()); IndexSearcher searcher = new IndexSearcher(reader); From a430e3967ed2efb97fb4a7772da4ae50e2e36607 Mon Sep 17 00:00:00 2001 From: markharwood Date: Mon, 12 Jun 2017 12:51:53 +0100 Subject: [PATCH 3/3] Checkstyle violation fix --- .../bucket/significant/SignificantTextAggregator.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java index 6ec62df3831ac..8b58678b67652 100644 --- a/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java +++ b/core/src/main/java/org/elasticsearch/search/aggregations/bucket/significant/SignificantTextAggregator.java @@ -113,7 +113,8 @@ public void collect(int doc, long bucket) throws IOException { } } - private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms, String fieldText) throws IOException{ + private void processTokenStream(int doc, long bucket, TokenStream ts, BytesRefHash inDocTerms, String fieldText) + throws IOException{ if (dupSequenceSpotter != null) { ts = new DeDuplicatingTokenFilter(ts, dupSequenceSpotter); }
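
The bug fixed by the first commit is easy to reproduce outside Elasticsearch. The following standalone Java sketch (not code from the patch; the class and method names are hypothetical) shows how resetting the seen-tokens set per field value over-counts document frequency when a field holds an array of values, and how sharing one set per document, as the patch now does with a single per-document BytesRefHash, corrects it:

    // Standalone illustration of the over-counting bug: document frequencies
    // are inflated when the set of previously-seen tokens is reset per field
    // value instead of once per document.
    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    public class PerDocDedupSketch {

        // Buggy variant: a fresh "seen" set per string value, so a term that
        // appears in two values of the same array is counted twice for one doc.
        static Map<String, Integer> docFreqPerValue(List<List<String>> docs) {
            Map<String, Integer> docFreq = new HashMap<>();
            for (List<String> doc : docs) {
                for (String value : doc) {
                    Set<String> seen = new HashSet<>(); // reset per value: wrong
                    for (String token : value.split(" ")) {
                        if (seen.add(token)) {
                            docFreq.merge(token, 1, Integer::sum);
                        }
                    }
                }
            }
            return docFreq;
        }

        // Fixed variant, mirroring the patch: one "seen" set per document,
        // shared across all values in the array.
        static Map<String, Integer> docFreqPerDoc(List<List<String>> docs) {
            Map<String, Integer> docFreq = new HashMap<>();
            for (List<String> doc : docs) {
                Set<String> seen = new HashSet<>(); // one set per document
                for (String value : doc) {
                    for (String token : value.split(" ")) {
                        if (seen.add(token)) {
                            docFreq.merge(token, 1, Integer::sum);
                        }
                    }
                }
            }
            return docFreq;
        }

        public static void main(String[] args) {
            // One document whose field is the array ["foo", "foo"], as in the test.
            List<List<String>> docs = List.of(Arrays.asList("foo", "foo"));
            System.out.println(docFreqPerValue(docs)); // {foo=2}: more docs than exist
            System.out.println(docFreqPerDoc(docs));   // {foo=1}: correct
        }
    }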
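
Why the over-count surfaces as an exception rather than a merely skewed score: significance heuristics validate their input frequencies before scoring, and a term reported in more foreground documents than the foreground set contains fails that validation. A simplified, hypothetical version of the check (the real validation lives in Elasticsearch's SignificanceHeuristic classes) makes the failure mode concrete:

    // Simplified sketch of the kind of frequency sanity check that the
    // over-counting trips; an illustration, not the exact Elasticsearch code.
    public class FrequencyCheckSketch {

        static void checkFrequencies(long subsetFreq, long subsetSize,
                                     long supersetFreq, long supersetSize) {
            // A term cannot be present in more documents than the set contains.
            if (subsetFreq > subsetSize || supersetFreq > supersetSize) {
                throw new IllegalArgumentException(
                        "term reported in " + subsetFreq + " of " + subsetSize + " docs");
            }
        }

        public static void main(String[] args) {
            // One matching doc whose array field yields "foo" twice: the buggy
            // per-value dedup reports a subsetFreq of 2 for a subsetSize of 1.
            checkFrequencies(2, 1, 2, 10); // throws IllegalArgumentException
        }
    }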