diff --git a/core/src/main/java/org/elasticsearch/index/similarity/ClassicSimilarityProvider.java b/core/src/main/java/org/elasticsearch/index/similarity/ClassicSimilarityProvider.java index e031c5b3dac1c..1bff743de0076 100644 --- a/core/src/main/java/org/elasticsearch/index/similarity/ClassicSimilarityProvider.java +++ b/core/src/main/java/org/elasticsearch/index/similarity/ClassicSimilarityProvider.java @@ -36,12 +36,19 @@ */ public class ClassicSimilarityProvider extends AbstractSimilarityProvider { + private final Version indexCreatedVersion; private final ClassicSimilarity similarity = new ClassicSimilarity(); public ClassicSimilarityProvider(String name, Settings settings, Settings indexSettings) { super(name); + indexCreatedVersion = Version.indexCreated(indexSettings); + if (indexCreatedVersion.onOrAfter(Version.V_6_0_0_beta2)) { + throw new IllegalArgumentException("The [classic] similarity is disallowed as of 6.0. It is advised that you use the [BM25] " + + "similarity instead which usually provides better scores. In case you really need to keep the same scores as the " + + "[classic] similarity, it is possible to reimplement it using the [scripted] similarity."); + } boolean discountOverlaps = settings.getAsBooleanLenientForPreEs6Indices( - Version.indexCreated(indexSettings), "discount_overlaps", true, new DeprecationLogger(ESLoggerFactory.getLogger(getClass()))); + indexCreatedVersion, "discount_overlaps", true, new DeprecationLogger(ESLoggerFactory.getLogger(getClass()))); this.similarity.setDiscountOverlaps(discountOverlaps); } diff --git a/core/src/main/java/org/elasticsearch/index/similarity/SimilarityService.java b/core/src/main/java/org/elasticsearch/index/similarity/SimilarityService.java index 4d8c9359a8172..9c1051fb417a9 100644 --- a/core/src/main/java/org/elasticsearch/index/similarity/SimilarityService.java +++ b/core/src/main/java/org/elasticsearch/index/similarity/SimilarityService.java @@ -43,16 +43,17 @@ public final class SimilarityService extends AbstractIndexComponent { private final Similarity defaultSimilarity; private final Map similarities; private static final Map DEFAULTS; + private static final Map V5X_DEFAULTS; public static final Map BUILT_IN; static { Map defaults = new HashMap<>(); Map buildIn = new HashMap<>(); - defaults.put("classic", - (name, settings, indexSettings, scriptService) -> new ClassicSimilarityProvider(name, settings, indexSettings)); defaults.put("BM25", (name, settings, indexSettings, scriptService) -> new BM25SimilarityProvider(name, settings, indexSettings)); defaults.put("boolean", (name, settings, indexSettings, scriptService) -> new BooleanSimilarityProvider(name, settings, indexSettings)); + // We keep the definition of the classic similarity on 6.x indices so that users get get a nicer exception + // if they try to define a custom classic similarity buildIn.put("classic", (name, settings, indexSettings, scriptService) -> new ClassicSimilarityProvider(name, settings, indexSettings)); buildIn.put("BM25", @@ -70,6 +71,11 @@ public final class SimilarityService extends AbstractIndexComponent { buildIn.put("scripted", ScriptedSimilarityProvider::new); DEFAULTS = Collections.unmodifiableMap(defaults); BUILT_IN = Collections.unmodifiableMap(buildIn); + + Map v5xDefaults = new HashMap<>(defaults); + v5xDefaults.put("classic", + (name, settings, indexSettings, scriptService) -> new ClassicSimilarityProvider(name, settings, indexSettings)); + V5X_DEFAULTS = Collections.unmodifiableMap(v5xDefaults); } public SimilarityService(IndexSettings indexSettings, ScriptService scriptService, @@ -94,8 +100,11 @@ public SimilarityService(IndexSettings indexSettings, ScriptService scriptServic SimilarityProvider.Factory factory = similarities.getOrDefault(typeName, defaultFactory); providers.put(name, factory.create(name, providerSettings, indexSettings.getSettings(), scriptService)); } + Map defaults = indexSettings.getIndexVersionCreated().onOrAfter(Version.V_6_0_0_beta2) + ? DEFAULTS + : V5X_DEFAULTS; Map providerMapping = addSimilarities(similaritySettings, indexSettings.getSettings(), scriptService, - DEFAULTS); + defaults); for (Map.Entry entry : providerMapping.entrySet()) { // Avoid overwriting custom providers for indices older that v5.0 if (providers.containsKey(entry.getKey()) && indexSettings.getIndexVersionCreated().before(Version.V_5_0_0_alpha1)) { @@ -133,7 +142,14 @@ private Map addSimilarities(Map s } public SimilarityProvider getSimilarity(String name) { - return similarities.get(name); + SimilarityProvider provider = similarities.get(name); + if (provider == null && "classic".equals(name)) { + // TODO: remove this special case on 7.x + throw new IllegalArgumentException("The [classic] similarity is disallowed as of 6.0. It is advised that you use the [BM25] " + + "similarity instead which usually provides better scores. In case you really need to keep the same scores as the " + + "[classic] similarity, it is possible to reimplement it using the [scripted] similarity."); + } + return provider; } Similarity getDefaultSimilarity() { diff --git a/core/src/test/java/org/elasticsearch/index/similarity/SimilarityServiceTests.java b/core/src/test/java/org/elasticsearch/index/similarity/SimilarityServiceTests.java index ed219c972b614..3804ac5ec4847 100644 --- a/core/src/test/java/org/elasticsearch/index/similarity/SimilarityServiceTests.java +++ b/core/src/test/java/org/elasticsearch/index/similarity/SimilarityServiceTests.java @@ -19,7 +19,7 @@ package org.elasticsearch.index.similarity; import org.apache.lucene.search.similarities.BM25Similarity; -import org.apache.lucene.search.similarities.ClassicSimilarity; +import org.apache.lucene.search.similarities.LMDirichletSimilarity; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.IndexSettings; import org.elasticsearch.test.ESTestCase; @@ -50,10 +50,10 @@ public void testOverrideBuiltInSimilarity() { } public void testOverrideDefaultSimilarity() { - Settings settings = Settings.builder().put("index.similarity.default.type", "classic") + Settings settings = Settings.builder().put("index.similarity.default.type", "LMDirichlet") .build(); IndexSettings indexSettings = IndexSettingsModule.newIndexSettings("test", settings); SimilarityService service = new SimilarityService(indexSettings, null, Collections.emptyMap()); - assertTrue(service.getDefaultSimilarity() instanceof ClassicSimilarity); + assertTrue(service.getDefaultSimilarity() instanceof LMDirichletSimilarity); } } diff --git a/core/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java b/core/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java index 3e7f4650c3e6d..22cbde3491f54 100644 --- a/core/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java +++ b/core/src/test/java/org/elasticsearch/index/similarity/SimilarityTests.java @@ -33,6 +33,8 @@ import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity; import org.apache.lucene.search.similarities.LambdaTTF; import org.apache.lucene.search.similarities.NormalizationH2; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.compress.CompressedXContent; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentFactory; @@ -43,6 +45,7 @@ import org.elasticsearch.plugins.Plugin; import org.elasticsearch.test.ESSingleNodeTestCase; import org.elasticsearch.test.InternalSettingsPlugin; +import org.hamcrest.Matchers; import java.io.IOException; import java.util.Collection; @@ -59,10 +62,16 @@ protected Collection> getPlugins() { public void testResolveDefaultSimilarities() { SimilarityService similarityService = createIndex("foo").similarityService(); - assertThat(similarityService.getSimilarity("classic").get(), instanceOf(ClassicSimilarity.class)); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> similarityService.getSimilarity("classic").get()); + assertThat(e.getMessage(), Matchers.containsString("The [classic] similarity is disallowed")); assertThat(similarityService.getSimilarity("BM25").get(), instanceOf(BM25Similarity.class)); assertThat(similarityService.getSimilarity("boolean").get(), instanceOf(BooleanSimilarity.class)); assertThat(similarityService.getSimilarity("default"), equalTo(null)); + + SimilarityService legacySimilarityService = createIndex("bar", Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_5_6_0) + .build()).similarityService(); + assertThat(legacySimilarityService.getSimilarity("classic").get(), instanceOf(ClassicSimilarity.class)); } public void testResolveSimilaritiesFromMapping_classic() throws IOException { @@ -76,7 +85,13 @@ public void testResolveSimilaritiesFromMapping_classic() throws IOException { .put("index.similarity.my_similarity.type", "classic") .put("index.similarity.my_similarity.discount_overlaps", false) .build(); - IndexService indexService = createIndex("foo", indexSettings); + IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> createIndex("foo", indexSettings)); + assertThat(e.getMessage(), Matchers.containsString("The [classic] similarity is disallowed")); + + Settings legacyIndexSettings = Settings.builder() + .put(indexSettings) + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.V_5_6_0).build(); + IndexService indexService = createIndex("foo", legacyIndexSettings); DocumentMapper documentMapper = indexService.mapperService().documentMapperParser().parse("type", new CompressedXContent(mapping)); assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(ClassicSimilarityProvider.class)); diff --git a/core/src/test/java/org/elasticsearch/similarity/SimilarityIT.java b/core/src/test/java/org/elasticsearch/similarity/SimilarityIT.java index c925e46cfa048..c9dedd342dac5 100644 --- a/core/src/test/java/org/elasticsearch/similarity/SimilarityIT.java +++ b/core/src/test/java/org/elasticsearch/similarity/SimilarityIT.java @@ -46,7 +46,7 @@ public void testCustomBM25Similarity() throws Exception { .field("type", "text") .endObject() .startObject("field2") - .field("similarity", "classic") + .field("similarity", "boolean") .field("type", "text") .endObject() .endObject() diff --git a/docs/reference/index-modules/similarity.asciidoc b/docs/reference/index-modules/similarity.asciidoc index 5be6fa2ae7246..dc6369d6cb29c 100644 --- a/docs/reference/index-modules/similarity.asciidoc +++ b/docs/reference/index-modules/similarity.asciidoc @@ -72,20 +72,6 @@ This similarity has the following options: Type name: `BM25` -[float] -[[classic-similarity]] -==== Classic similarity - -The classic similarity that is based on the TF/IDF model. This -similarity has the following option: - -`discount_overlaps`:: - Determines whether overlap tokens (Tokens with - 0 position increment) are ignored when computing norm. By default this - is true, meaning overlap tokens do not count when computing norms. - -Type name: `classic` - [float] [[drf]] ==== DFR similarity diff --git a/docs/reference/mapping/params/similarity.asciidoc b/docs/reference/mapping/params/similarity.asciidoc index 0a5979c9d3272..436609b2f25f6 100644 --- a/docs/reference/mapping/params/similarity.asciidoc +++ b/docs/reference/mapping/params/similarity.asciidoc @@ -20,11 +20,6 @@ configuration are: See {defguide}/pluggable-similarites.html[Pluggable Similarity Algorithms] for more information. -`classic`:: - The TF/IDF algorithm which used to be the default in Elasticsearch and - Lucene. See {defguide}/practical-scoring-function.html[Lucene’s Practical Scoring Function] - for more information. - `boolean`:: A simple boolean similarity, which is used when full-text ranking is not needed and the score should only be based on whether the query terms match or not. @@ -44,13 +39,9 @@ PUT my_index "default_field": { <1> "type": "text" }, - "classic_field": { - "type": "text", - "similarity": "classic" <2> - }, "boolean_sim_field": { "type": "text", - "similarity": "boolean" <3> + "similarity": "boolean" <2> } } } @@ -59,5 +50,4 @@ PUT my_index -------------------------------------------------- // CONSOLE <1> The `default_field` uses the `BM25` similarity. -<2> The `classic_field` uses the `classic` similarity (ie TF/IDF). -<3> The `boolean_sim_field` uses the `boolean` similarity. +<2> The `boolean_sim_field` uses the `boolean` similarity. diff --git a/docs/reference/migration/migrate_6_0/settings.asciidoc b/docs/reference/migration/migrate_6_0/settings.asciidoc index 289e1903fb81e..683da5062aa98 100644 --- a/docs/reference/migration/migrate_6_0/settings.asciidoc +++ b/docs/reference/migration/migrate_6_0/settings.asciidoc @@ -86,3 +86,10 @@ they are replaced with `script.allowed_types` and `script.allowed_contexts`. The `discovery.type` settings no longer supports the values `gce`, `aws` and `ec2`. Integration with these platforms should be done by setting the `discovery.zen.hosts_provider` setting to one of those values. + +==== Similarity settings + +The `classic` similarity has been disallowed on 6.0+ indices. We advise to +use <> instead, which usually produces better scores. In case you +absolutely need to keep the same scores, you can reimplement the `classic` +similarity using the new <> similarity.