From 37c9264db678378d8eecf6d20d2e37ea64dc341a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christoph=20B=C3=BCscher?= Date: Mon, 26 Apr 2021 13:09:03 +0200 Subject: [PATCH] Fix case sensitivity rules for wildcard queries on text fields (#71751) (#72214) Wildcard queries on text fields should not apply the fields analyzer to the search query. However, we accidentally enabled this in #53127 by moving the query normalization to the StringFieldType super type. This change fixes this by separating the notion of normalization and case insensitivity (as implemented in the `case_insensitive` flag). This is done because we still need to maintain normalization of the query sting when the wildcard query method on the field type is requested from the `query_string` query parser. Wildcard queries on keyword fields should also continue to apply the fields normalizer, regardless of whether the `case_insensitive` is set, because normalization could involve something else than lowercasing (e.g. substituting umlauts like in the GermanNormalizationFilter). Closes #71403 --- .../ICUCollationKeywordFieldMapper.java | 1 + .../search/query/SearchQueryIT.java | 7 ++- .../index/mapper/KeywordFieldMapper.java | 16 +++++++ .../index/mapper/MappedFieldType.java | 7 ++- .../index/mapper/StringFieldType.java | 18 +++++++- .../index/search/QueryStringQueryParser.java | 2 +- .../index/mapper/IndexFieldTypeTests.java | 2 +- .../index/mapper/TextFieldTypeTests.java | 43 +++++++++++++++++++ .../wildcard/mapper/WildcardFieldMapper.java | 5 +++ 9 files changed, 96 insertions(+), 5 deletions(-) diff --git a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java index 608b374e4d76d..e34e7bfc175ab 100644 --- a/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java +++ b/plugins/analysis-icu/src/main/java/org/elasticsearch/index/mapper/ICUCollationKeywordFieldMapper.java @@ -12,6 +12,7 @@ import com.ibm.icu.text.RawCollationKey; import com.ibm.icu.text.RuleBasedCollator; import com.ibm.icu.util.ULocale; + import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.SortedSetDocValuesField; diff --git a/server/src/internalClusterTest/java/org/elasticsearch/search/query/SearchQueryIT.java b/server/src/internalClusterTest/java/org/elasticsearch/search/query/SearchQueryIT.java index 5e818e4aa461f..5d962f26eea71 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/search/query/SearchQueryIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/search/query/SearchQueryIT.java @@ -1924,7 +1924,7 @@ public void testWildcardQueryNormalizationOnKeywordField() { } /** - * Test that wildcard queries on text fields get normalized + * Test that wildcard queries on text fields don't get normalized */ public void testWildcardQueryNormalizationOnTextField() { assertAcked(prepareCreate("test") @@ -1940,6 +1940,11 @@ public void testWildcardQueryNormalizationOnTextField() { { WildcardQueryBuilder wildCardQuery = wildcardQuery("field1", "Bb*"); SearchResponse searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); + assertHitCount(searchResponse, 0L); + + // the following works not because of normalization but because of the `case_insensitive` parameter + wildCardQuery = wildcardQuery("field1", "Bb*").caseInsensitive(true); + searchResponse = client().prepareSearch().setQuery(wildCardQuery).get(); assertHitCount(searchResponse, 1L); wildCardQuery = wildcardQuery("field1", "bb*"); diff --git a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java index 261c0f369ca6b..bb9a0fbfc8a02 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/KeywordFieldMapper.java @@ -15,6 +15,8 @@ import org.apache.lucene.document.SortedSetDocValuesField; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.MultiTermQuery; +import org.apache.lucene.search.Query; import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.lucene.Lucene; import org.elasticsearch.common.xcontent.XContentParser; @@ -325,6 +327,19 @@ protected BytesRef indexedValueForSearch(Object value) { return getTextSearchInfo().getSearchAnalyzer().normalize(name(), value.toString()); } + /** + * Wildcard queries on keyword fields use the normalizer of the underlying field, regardless of their case sensitivity option + */ + @Override + public Query wildcardQuery( + String value, + MultiTermQuery.RewriteMethod method, + boolean caseInsensitive, + SearchExecutionContext context + ) { + return super.wildcardQuery(value, method, caseInsensitive, true, context); + } + @Override public CollapseType collapseType() { return CollapseType.KEYWORD; @@ -335,6 +350,7 @@ public CollapseType collapseType() { public int ignoreAbove() { return ignoreAbove; } + } private final boolean indexed; diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java index 9a53790093578..2bab53a5bea32 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java @@ -36,8 +36,8 @@ import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.query.DistanceFeatureQueryBuilder; import org.elasticsearch.index.query.QueryRewriteContext; -import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.index.query.QueryShardException; +import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.search.DocValueFormat; import org.elasticsearch.search.fetch.subphase.FetchFieldsPhase; import org.elasticsearch.search.lookup.SearchLookup; @@ -248,6 +248,11 @@ public Query wildcardQuery(String value, + "] which is of type [" + typeName() + "]"); } + public Query normalizedWildcardQuery(String value, @Nullable MultiTermQuery.RewriteMethod method, SearchExecutionContext context) { + throw new QueryShardException(context, "Can only use wildcard queries on keyword, text and wildcard fields - not on [" + name + + "] which is of type [" + typeName() + "]"); + } + public Query regexpQuery(String value, int syntaxFlags, int matchFlags, int maxDeterminizedStates, @Nullable MultiTermQuery.RewriteMethod method, SearchExecutionContext context) { throw new QueryShardException(context, "Can only use regexp queries on keyword and text fields - not on [" + name diff --git a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java index 24a963ac02acb..c85e8f19d9d7e 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/StringFieldType.java @@ -113,6 +113,22 @@ public static final String normalizeWildcardPattern(String fieldname, String val @Override public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, boolean caseInsensitive, SearchExecutionContext context) { + return wildcardQuery(value, method, caseInsensitive, false, context); + } + + + @Override + public Query normalizedWildcardQuery(String value, MultiTermQuery.RewriteMethod method, SearchExecutionContext context) { + return wildcardQuery(value, method, false, true, context); + } + + protected Query wildcardQuery( + String value, + MultiTermQuery.RewriteMethod method, + boolean caseInsensitive, + boolean shouldNormalize, + SearchExecutionContext context + ) { failIfNotIndexed(); if (context.allowExpensiveQueries() == false) { throw new ElasticsearchException("[wildcard] queries cannot be executed when '" + @@ -120,7 +136,7 @@ public Query wildcardQuery(String value, MultiTermQuery.RewriteMethod method, bo } Term term; - if (getTextSearchInfo().getSearchAnalyzer() != null) { + if (getTextSearchInfo().getSearchAnalyzer() != null && shouldNormalize) { value = normalizeWildcardPattern(name(), value, getTextSearchInfo().getSearchAnalyzer()); term = new Term(name(), value); } else { diff --git a/server/src/main/java/org/elasticsearch/index/search/QueryStringQueryParser.java b/server/src/main/java/org/elasticsearch/index/search/QueryStringQueryParser.java index d8f168447b9bc..4e53c3f697ed9 100644 --- a/server/src/main/java/org/elasticsearch/index/search/QueryStringQueryParser.java +++ b/server/src/main/java/org/elasticsearch/index/search/QueryStringQueryParser.java @@ -682,7 +682,7 @@ private Query getWildcardQuerySingle(String field, String termStr) throws ParseE if (getAllowLeadingWildcard() == false && (termStr.startsWith("*") || termStr.startsWith("?"))) { throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery"); } - return currentFieldType.wildcardQuery(termStr, getMultiTermRewriteMethod(), context); + return currentFieldType.normalizedWildcardQuery(termStr, getMultiTermRewriteMethod(), context); } catch (RuntimeException e) { if (lenient) { return newLenientFieldQuery(field, e); diff --git a/server/src/test/java/org/elasticsearch/index/mapper/IndexFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/IndexFieldTypeTests.java index ee951ffad4e03..c7b17bcbd1ba5 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/IndexFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/IndexFieldTypeTests.java @@ -14,8 +14,8 @@ import org.elasticsearch.common.regex.Regex; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.IndexSettings; -import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.index.query.QueryShardException; +import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.test.ESTestCase; import java.util.function.Predicate; diff --git a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java index 84edce76762c4..2d3b6d683f698 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/TextFieldTypeTests.java @@ -19,9 +19,11 @@ import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; +import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.Automata; import org.apache.lucene.util.automaton.Automaton; +import org.apache.lucene.util.automaton.CharacterRunAutomaton; import org.apache.lucene.util.automaton.Operations; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.lucene.BytesRefs; @@ -157,4 +159,45 @@ public void testFetchSourceValue() throws IOException { assertEquals(Collections.singletonList("42"), fetchSourceValue(fieldType, 42L)); assertEquals(Collections.singletonList("true"), fetchSourceValue(fieldType, true)); } + + public void testWildcardQuery() { + TextFieldType ft = createFieldType(); + + // case sensitive + AutomatonQuery actual = (AutomatonQuery) ft.wildcardQuery("*Butterflies*", null, false, MOCK_CONTEXT); + AutomatonQuery expected = new WildcardQuery(new Term("field", new BytesRef("*Butterflies*"))); + assertEquals(expected, actual); + assertFalse(new CharacterRunAutomaton(actual.getAutomaton()).run("some butterflies somewhere")); + + // case insensitive + actual = (AutomatonQuery) ft.wildcardQuery("*Butterflies*", null, true, MOCK_CONTEXT); + expected = AutomatonQueries.caseInsensitiveWildcardQuery(new Term("field", new BytesRef("*Butterflies*"))); + assertEquals(expected, actual); + assertTrue(new CharacterRunAutomaton(actual.getAutomaton()).run("some butterflies somewhere")); + assertTrue(new CharacterRunAutomaton(actual.getAutomaton()).run("some Butterflies somewhere")); + + ElasticsearchException ee = expectThrows(ElasticsearchException.class, + () -> ft.wildcardQuery("valu*", null, MOCK_CONTEXT_DISALLOW_EXPENSIVE)); + assertEquals("[wildcard] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", + ee.getMessage()); + } + + /** + * we use this e.g. in query string query parser to normalize terms on text fields + */ + public void testNormalizedWildcardQuery() { + TextFieldType ft = createFieldType(); + + AutomatonQuery actual = (AutomatonQuery) ft.normalizedWildcardQuery("*Butterflies*", null, MOCK_CONTEXT); + AutomatonQuery expected = new WildcardQuery(new Term("field", new BytesRef("*butterflies*"))); + assertEquals(expected, actual); + assertTrue(new CharacterRunAutomaton(actual.getAutomaton()).run("some butterflies somewhere")); + assertFalse(new CharacterRunAutomaton(actual.getAutomaton()).run("some Butterflies somewhere")); + + ElasticsearchException ee = expectThrows(ElasticsearchException.class, + () -> ft.wildcardQuery("valu*", null, MOCK_CONTEXT_DISALLOW_EXPENSIVE)); + assertEquals("[wildcard] queries cannot be executed when 'search.allow_expensive_queries' is set to false.", + ee.getMessage()); + } + } diff --git a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java index 05dcee3d9675e..69cf1acb611ba 100644 --- a/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java +++ b/x-pack/plugin/wildcard/src/main/java/org/elasticsearch/xpack/wildcard/mapper/WildcardFieldMapper.java @@ -265,6 +265,11 @@ private WildcardFieldType(String name, String nullValue, int ignoreAbove, this.ignoreAbove = ignoreAbove; } + @Override + public Query normalizedWildcardQuery(String value, MultiTermQuery.RewriteMethod method, SearchExecutionContext context) { + return wildcardQuery(value, method, false, context); + } + @Override public Query wildcardQuery(String wildcardPattern, RewriteMethod method, boolean caseInsensitive, SearchExecutionContext context) {