-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Start building analysis-common module #23614
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
1869db8
a4ed494
1e4a2d9
e0cef65
2310f3c
ed8d9be
dcbc7cf
c20aca4
a09c37f
c92a6a8
39df4c9
1ba49a6
1caf03f
16102ff
9ee485f
c98bcc4
04512d2
efb6392
42338e9
5b555a7
929b83f
7301a6a
047173b
5fb1442
5d45b69
fced812
eaafe40
f6b907c
d4bf67c
96531c7
dfdc53c
51609e1
8608652
fac537e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,10 +20,20 @@ | |
| package org.elasticsearch.index.analysis; | ||
|
|
||
| import org.apache.lucene.analysis.TokenStream; | ||
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | ||
| import org.elasticsearch.search.fetch.subphase.highlight.FastVectorHighlighter; | ||
|
|
||
| public interface TokenFilterFactory { | ||
|
|
||
| String name(); | ||
|
|
||
| TokenStream create(TokenStream tokenStream); | ||
|
|
||
| /** | ||
| * Does this analyzer mess up the {@link OffsetAttribute}s in such a way as to break the | ||
| * {@link FastVectorHighlighter}? If this is {@code true} then the | ||
| * {@linkplain FastVectorHighlighter} will attempt to work around the broken offsets. | ||
| */ | ||
| default boolean breaksFastVectorHighlighter() { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this something we should really "allow"? Perhaps the hack could continue to exist as it did before, but by checking the name of the class instead of using instanceof?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm not sure. I kind of like that the hack is at least more visible this way. For now I think we should keep it. Maybe we can pitch it if we ever go to 100% unified highlighter.... |
||
| return false; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,8 @@ | |
| */ | ||
| package org.elasticsearch.action.admin.indices; | ||
|
|
||
| import org.apache.lucene.analysis.MockTokenFilter; | ||
| import org.apache.lucene.analysis.TokenStream; | ||
| import org.elasticsearch.Version; | ||
| import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest; | ||
| import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse; | ||
|
|
@@ -27,18 +29,28 @@ | |
| import org.elasticsearch.common.settings.Settings; | ||
| import org.elasticsearch.env.Environment; | ||
| import org.elasticsearch.index.IndexSettings; | ||
| import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; | ||
| import org.elasticsearch.index.analysis.AnalysisRegistry; | ||
| import org.elasticsearch.index.analysis.IndexAnalyzers; | ||
| import org.elasticsearch.index.analysis.TokenFilterFactory; | ||
| import org.elasticsearch.index.mapper.AllFieldMapper; | ||
| import org.elasticsearch.indices.analysis.AnalysisModule; | ||
| import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; | ||
| import org.elasticsearch.plugins.AnalysisPlugin; | ||
| import org.elasticsearch.test.ESTestCase; | ||
| import org.elasticsearch.test.IndexSettingsModule; | ||
|
|
||
| import java.io.IOException; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
|
|
||
| import static java.util.Collections.emptyList; | ||
| import static java.util.Collections.singletonList; | ||
| import static java.util.Collections.singletonMap; | ||
|
|
||
| /** | ||
| * Tests for {@link TransportAnalyzeAction}. See the more "intense" version of this test in the | ||
| * {@code common-analysis} module. | ||
| */ | ||
| public class TransportAnalyzeActionTests extends ESTestCase { | ||
|
|
||
| private IndexAnalyzers indexAnalyzers; | ||
|
|
@@ -53,23 +65,28 @@ public void setUp() throws Exception { | |
| Settings indexSettings = Settings.builder() | ||
| .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) | ||
| .put(IndexMetaData.SETTING_INDEX_UUID, UUIDs.randomBase64UUID()) | ||
| .put("index.analysis.filter.wordDelimiter.type", "word_delimiter") | ||
| .put("index.analysis.filter.wordDelimiter.split_on_numerics", false) | ||
| .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace") | ||
| .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter") | ||
| .put("index.analysis.analyzer.custom_analyzer.tokenizer", "whitespace") | ||
| .putArray("index.analysis.analyzer.custom_analyzer.filter", "lowercase", "wordDelimiter") | ||
| .put("index.analysis.tokenizer.trigram.type", "ngram") | ||
| .put("index.analysis.tokenizer.trigram.min_gram", 3) | ||
| .put("index.analysis.tokenizer.trigram.max_gram", 3) | ||
| .put("index.analysis.filter.synonym.type", "synonym") | ||
| .putArray("index.analysis.filter.synonym.synonyms", "kimchy => shay") | ||
| .put("index.analysis.filter.synonym.tokenizer", "trigram") | ||
| .put("index.analysis.filter.synonym.min_gram", 3) | ||
| .put("index.analysis.filter.synonym.max_gram", 3).build(); | ||
| .put("index.analysis.analyzer.custom_analyzer.tokenizer", "standard") | ||
| .put("index.analysis.analyzer.custom_analyzer.filter", "mock").build(); | ||
| IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("index", indexSettings); | ||
| environment = new Environment(settings); | ||
| registry = new AnalysisModule(environment, emptyList()).getAnalysisRegistry(); | ||
| AnalysisPlugin plugin = new AnalysisPlugin() { | ||
| class MockFactory extends AbstractTokenFilterFactory { | ||
| MockFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) { | ||
| super(indexSettings, name, settings); | ||
| } | ||
|
|
||
| @Override | ||
| public TokenStream create(TokenStream tokenStream) { | ||
| return new MockTokenFilter(tokenStream, MockTokenFilter.ENGLISH_STOPSET); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() { | ||
| return singletonMap("mock", MockFactory::new); | ||
| } | ||
| }; | ||
| registry = new AnalysisModule(environment, singletonList(plugin)).getAnalysisRegistry(); | ||
| indexAnalyzers = registry.build(idxSettings); | ||
| } | ||
|
|
||
|
|
@@ -143,51 +160,44 @@ public void testFillsAttributes() throws IOException { | |
| } | ||
|
|
||
| public void testWithIndexAnalyzers() throws IOException { | ||
|
|
||
| AnalyzeRequest request = new AnalyzeRequest(); | ||
| request.analyzer("standard"); | ||
| request.text("the quick brown fox"); | ||
| request.analyzer("custom_analyzer"); | ||
| request.text("the qu1ck brown fox"); | ||
| AnalyzeResponse analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); | ||
| List<AnalyzeResponse.AnalyzeToken> tokens = analyze.getTokens(); | ||
| assertEquals(4, tokens.size()); | ||
| assertEquals(3, tokens.size()); | ||
| assertEquals("quick", tokens.get(0).getTerm()); | ||
| assertEquals("brown", tokens.get(1).getTerm()); | ||
| assertEquals("fox", tokens.get(2).getTerm()); | ||
|
|
||
| request.analyzer("whitespace"); | ||
| request.text("the qu1ck brown fox-dog"); | ||
| request.analyzer("standard"); | ||
| analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); | ||
| tokens = analyze.getTokens(); | ||
| assertEquals(4, tokens.size()); | ||
| assertEquals("the", tokens.get(0).getTerm()); | ||
| assertEquals("quick", tokens.get(1).getTerm()); | ||
| assertEquals("brown", tokens.get(2).getTerm()); | ||
| assertEquals("fox", tokens.get(3).getTerm()); | ||
|
|
||
| request.analyzer("custom_analyzer"); | ||
| request.text("the qu1ck brown fox-dog"); | ||
| analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); | ||
| tokens = analyze.getTokens(); | ||
| assertEquals(5, tokens.size()); | ||
|
|
||
| // Switch the analyzer out for just a tokenizer | ||
| request.analyzer(null); | ||
| request.tokenizer("whitespace"); | ||
| request.addTokenFilter("lowercase"); | ||
| request.addTokenFilter("wordDelimiter"); | ||
| request.text("the qu1ck brown fox-dog"); | ||
| request.tokenizer("standard"); | ||
| analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); | ||
| tokens = analyze.getTokens(); | ||
| assertEquals(5, tokens.size()); | ||
| assertEquals(4, tokens.size()); | ||
| assertEquals("the", tokens.get(0).getTerm()); | ||
| assertEquals("qu1ck", tokens.get(1).getTerm()); | ||
| assertEquals("quick", tokens.get(1).getTerm()); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This block of the test now seems redundant; it duplicates the one above it.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This one tests using a tokenizer instead of a fully built analyzer. I'll push a comment explaining |
||
| assertEquals("brown", tokens.get(2).getTerm()); | ||
| assertEquals("fox", tokens.get(3).getTerm()); | ||
| assertEquals("dog", tokens.get(4).getTerm()); | ||
|
|
||
| request.analyzer(null); | ||
| request.tokenizer("trigram"); | ||
| request.addTokenFilter("synonym"); | ||
| request.text("kimchy"); | ||
| // Now try applying our token filter | ||
| request.addTokenFilter("mock"); | ||
| analyze = TransportAnalyzeAction.analyze(request, AllFieldMapper.NAME, null, indexAnalyzers, registry, environment); | ||
| tokens = analyze.getTokens(); | ||
| assertEquals(2, tokens.size()); | ||
| assertEquals("sha", tokens.get(0).getTerm()); | ||
| assertEquals("hay", tokens.get(1).getTerm()); | ||
| assertEquals(3, tokens.size()); | ||
| assertEquals("quick", tokens.get(0).getTerm()); | ||
| assertEquals("brown", tokens.get(1).getTerm()); | ||
| assertEquals("fox", tokens.get(2).getTerm()); | ||
| } | ||
|
|
||
| public void testGetIndexAnalyserWithoutIndexAnalyzers() throws IOException { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I should put javadoc on this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Er, on the interface method, rather.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
++, it would help as it's not clear to me what the method's purpose is, since it's currently used in the
containsBrokenAnalysis check