elastic
diff --git a/‎docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc‎
Lines changed: 12 additions & 0 deletions b/‎docs/reference/analysis/tokenfilters/synonym-graph-tokenfilter.asciidoc‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc‎
Lines changed: 13 additions & 0 deletions b/‎docs/reference/analysis/tokenfilters/synonym-tokenfilter.asciidoc‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java‎
Lines changed: 8 additions & 2 deletions b/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/ASCIIFoldingTokenFilterFactory.java‎
Lines changed: 8 additions & 2 deletions
diff --git a/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java‎
Lines changed: 6 additions & 0 deletions b/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/AbstractCompoundWordTokenFilterFactory.java‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java‎
Lines changed: 14 additions & 0 deletions b/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CJKBigramFilterFactory.java‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java‎
Lines changed: 1 addition & 1 deletion b/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonAnalysisPlugin.java‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java‎
Lines changed: 13 additions & 0 deletions b/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/CommonGramsTokenFilterFactory.java‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java‎
Lines changed: 13 additions & 0 deletions b/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/EdgeNGramTokenFilterFactory.java‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java‎
Lines changed: 13 additions & 0 deletions b/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/FingerprintTokenFilterFactory.java‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java‎
Lines changed: 23 additions & 1 deletion b/‎modules/analysis-common/src/main/java/org/elasticsearch/analysis/common/MultiplexerTokenFilterFactory.java‎
Lines changed: 23 additions & 1 deletion
@@ -175,3 +175,15 @@ PUT /test_index
 
 Using `synonyms_path` to define WordNet synonyms in a file is supported
 as well.
+
+=== Parsing synonym files
+
+Elasticsearch will use the token filters preceding the synonym filter
+in a tokenizer chain to parse the entries in a synonym file.  So, for example, if a
+synonym filter is placed after a stemmer, then the stemmer will also be applied
+to the synonym entries.  Because entries in the synonym map cannot have stacked
+positions, some token filters may cause issues here.  Token filters that produce
+multiple versions of a token may choose which version of the token to emit when
+parsing synonyms; for example, `asciifolding` will only produce the folded version
+of the token.  Others, such as `multiplexer`, `word_delimiter_graph` or `ngram`,
+will throw an error.
@@ -163,3 +163,16 @@ PUT /test_index
 
 Using `synonyms_path` to define WordNet synonyms in a file is supported
 as well.
+
+
+=== Parsing synonym files
+
+Elasticsearch will use the token filters preceding the synonym filter
+in a tokenizer chain to parse the entries in a synonym file.  So, for example, if a
+synonym filter is placed after a stemmer, then the stemmer will also be applied
+to the synonym entries.  Because entries in the synonym map cannot have stacked
+positions, some token filters may cause issues here.  Token filters that produce
+multiple versions of a token may choose which version of the token to emit when
+parsing synonyms; for example, `asciifolding` will only produce the folded version
+of the token.  Others, such as `multiplexer`, `word_delimiter_graph` or `ngram`,
+will throw an error.
@@ -33,7 +33,8 @@
  * Factory for ASCIIFoldingFilter.
  */
 public class ASCIIFoldingTokenFilterFactory extends AbstractTokenFilterFactory
-        implements MultiTermAwareComponent {
+    implements MultiTermAwareComponent {
+
     public static final ParseField PRESERVE_ORIGINAL = new ParseField("preserve_original");
     public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;
 
@@ -53,7 +54,7 @@ public TokenStream create(TokenStream tokenStream) {
     }
 
     @Override
-    public Object getMultiTermComponent() {
+    public TokenFilterFactory getSynonymFilter() {
         if (preserveOriginal == false) {
             return this;
         } else {
@@ -70,4 +71,9 @@ public TokenStream create(TokenStream tokenStream) {
             };
         }
     }
+
+    @Override
+    public Object getMultiTermComponent() {
+        return getSynonymFilter();
+    }
 }
@@ -26,6 +26,7 @@
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.elasticsearch.index.analysis.Analysis;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 /**
  * Contains the common configuration settings between subclasses of this class.
@@ -51,4 +52,9 @@ protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, En
             throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");
         }
     }
+
+    @Override
+    public TokenFilterFactory getSynonymFilter() {
+        return IDENTITY_FILTER;     // don't decompound synonym file
+    }
 }
@@ -19,13 +19,16 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.cjk.CJKBigramFilter;
 import org.apache.lucene.analysis.miscellaneous.DisableGraphAttribute;
+import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 import java.util.Arrays;
 import java.util.HashSet;
@@ -48,6 +51,9 @@
  */
 public final class CJKBigramFilterFactory extends AbstractTokenFilterFactory {
 
+    private static final DeprecationLogger DEPRECATION_LOGGER
+        = new DeprecationLogger(LogManager.getLogger(CJKBigramFilterFactory.class));
+
     private final int flags;
     private final boolean outputUnigrams;
 
@@ -90,4 +96,12 @@ public TokenStream create(TokenStream tokenStream) {
         return filter;
     }
 
+    @Override
+    public TokenFilterFactory getSynonymFilter() {
+        if (outputUnigrams) {
+            DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+                + "] will not be usable to parse synonyms after v7.0");
+        }
+        return this;
+    }
 }
@@ -425,7 +425,7 @@ public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
         filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
-        filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
+        filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, false, KeywordRepeatFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
         filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
                 new LengthFilter(input, 0, Integer.MAX_VALUE)));  // TODO this one seems useless
 
@@ -19,18 +19,24 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.commongrams.CommonGramsFilter;
 import org.apache.lucene.analysis.commongrams.CommonGramsQueryFilter;
+import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
 import org.elasticsearch.index.analysis.Analysis;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 public class CommonGramsTokenFilterFactory extends AbstractTokenFilterFactory {
 
+    private static final DeprecationLogger DEPRECATION_LOGGER
+        = new DeprecationLogger(LogManager.getLogger(CommonGramsTokenFilterFactory.class));
+
     private final CharArraySet words;
 
     private final boolean ignoreCase;
@@ -60,5 +66,12 @@ public TokenStream create(TokenStream tokenStream) {
             return filter;
         }
     }
+
+    @Override
+    public TokenFilterFactory getSynonymFilter() {
+        DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+            + "] will not be usable to parse synonyms after v7.0");
+        return this;
+    }
 }
 
@@ -19,18 +19,24 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
 import org.apache.lucene.analysis.ngram.NGramTokenFilter;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
+import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 
 public class EdgeNGramTokenFilterFactory extends AbstractTokenFilterFactory {
 
+    private static final DeprecationLogger DEPRECATION_LOGGER
+        = new DeprecationLogger(LogManager.getLogger(EdgeNGramTokenFilterFactory.class));
+
     private final int minGram;
 
     private final int maxGram;
@@ -77,4 +83,11 @@ public TokenStream create(TokenStream tokenStream) {
     public boolean breaksFastVectorHighlighter() {
         return true;
     }
+
+    @Override
+    public TokenFilterFactory getSynonymFilter() {
+        DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+            + "] will not be usable to parse synonyms after v7.0");
+        return this;
+    }
 }
@@ -19,18 +19,24 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.FingerprintFilter;
+import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
+import org.elasticsearch.index.analysis.TokenFilterFactory;
 
 import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.DEFAULT_MAX_OUTPUT_SIZE;
 import static org.elasticsearch.analysis.common.FingerprintAnalyzerProvider.MAX_OUTPUT_SIZE;
 
 public class FingerprintTokenFilterFactory extends AbstractTokenFilterFactory {
 
+    private static final DeprecationLogger DEPRECATION_LOGGER
+        = new DeprecationLogger(LogManager.getLogger(FingerprintTokenFilterFactory.class));
+
     private final char separator;
     private final int maxOutputSize;
 
@@ -47,4 +53,11 @@ public TokenStream create(TokenStream tokenStream) {
         return result;
     }
 
+    @Override
+    public TokenFilterFactory getSynonymFilter() {
+        DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+            + "] will not be usable to parse synonyms after v7.0");
+        return this;
+    }
+
 }
@@ -19,12 +19,14 @@
 
 package org.elasticsearch.analysis.common;
 
+import org.apache.logging.log4j.LogManager;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.ConditionalTokenFilter;
 import org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.logging.DeprecationLogger;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.IndexSettings;
@@ -40,6 +42,9 @@
 
 public class MultiplexerTokenFilterFactory extends AbstractTokenFilterFactory {
 
+    private static final DeprecationLogger DEPRECATION_LOGGER
+        = new DeprecationLogger(LogManager.getLogger(MultiplexerTokenFilterFactory.class));
+
     private List<String> filterNames;
     private final boolean preserveOriginal;
 
@@ -54,6 +59,17 @@ public TokenStream create(TokenStream tokenStream) {
         throw new UnsupportedOperationException("TokenFilterFactory.getChainAwareTokenFilterFactory() must be called first");
     }
 
+    @Override
+    public TokenFilterFactory getSynonymFilter() {
+        if (preserveOriginal) {
+            DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+                + "] will not be usable to parse synonyms after v7.0");
+            return IDENTITY_FILTER;
+        }
+        throw new IllegalArgumentException("Token filter [" + name()
+            + "] cannot be used to parse synonyms unless [preserve_original] is [true]");
+    }
+
     @Override
     public TokenFilterFactory getChainAwareTokenFilterFactory(TokenizerFactory tokenizer, List<CharFilterFactory> charFilters,
                                                               List<TokenFilterFactory> previousTokenFilters,
@@ -98,7 +114,13 @@ public TokenStream create(TokenStream tokenStream) {
 
             @Override
             public TokenFilterFactory getSynonymFilter() {
-                return IDENTITY_FILTER;
+                if (preserveOriginal) {
+                    DEPRECATION_LOGGER.deprecatedAndMaybeLog("synonym_tokenfilters", "Token filter [" + name()
+                        + "] will not be usable to parse synonyms after v7.0");
+                    return IDENTITY_FILTER;
+                }
+                throw new IllegalArgumentException("Token filter [" + name()
+                    + "] cannot be used to parse synonyms unless [preserve_original] is [true]");
             }
         };
     }
Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@`
`26`	`26`	`import org.elasticsearch.index.IndexSettings;`
`27`	`27`	`import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;`
`28`	`28`	`import org.elasticsearch.index.analysis.Analysis;`
	`29`	`+import org.elasticsearch.index.analysis.TokenFilterFactory;`
`29`	`30`
`30`	`31`	`/**`
`31`	`32`	`* Contains the common configuration settings between subclasses of this class.`
`@@ -51,4 +52,9 @@ protected AbstractCompoundWordTokenFilterFactory(IndexSettings indexSettings, En`
`51`	`52`	`throw new IllegalArgumentException("word_list must be provided for [" + name + "], either as a path to a file, or directly");`
`52`	`53`	`}`
`53`	`54`	`}`
	`55`	`+`
	`56`	`+ @Override`
	`57`	`+ public TokenFilterFactory getSynonymFilter() {`
	`58`	`+ return IDENTITY_FILTER; // don't decompound synonym file`
	`59`	`+ }`
`54`	`60`	`}`