Upgrade to lucene-6.5.0-snapshot-f919485. #23087
Fifteen files were deleted; each contained a single SHA1 checksum line:

- 886c1da9adc3347f61ab95ecbf4dbeeaa0e7acb2
- df9e94f63ad7d9188f14820c435ea1dc3c28d87a
- 3539f8dc9c3ed8ebe90afcb3daa2e9afcf5108d1
- da76338e4f299963da9d7ab33dae7586dfc902c2
- f6318d120236c7ac03fca6bf98825b4cb4347fc8
- 68f045ff272e10c307fe25a1867c2948b614b57c
- b58a7a15267614a9a14f7cf6257454e0c24b146d
- d5f00fcd00fee6906b563d201bc00bdea7a92baa
- 2664901a494d87e9f4cef65be14cca918da7c4f5
- 476a79293f9a15ea1ee5f93684587205d03480d1
- f4dd70223178cca067b0cade4e58c4d82bec87d6
- 72c4ec5d811480164db556b54c7a76bd3ea16bd6
- f7af3755fdd09df7c258c655aff03ddef9536a04
- 2bf820109203b990e93a05dade8dcebec6aeb71a
- fc1f32923ee68761ee05051f4ef6f4a4ab3acdec
One file was added, `WordDelimiterGraphTokenFilterFactory.java` (101 lines):

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;

import java.util.List;
import java.util.Set;

import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_ALL;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_NUMBERS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.CATENATE_WORDS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_NUMBER_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_WORD_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.PRESERVE_ORIGINAL;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_NUMERICS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
import static org.elasticsearch.index.analysis.WordDelimiterTokenFilterFactory.parseTypes;

public class WordDelimiterGraphTokenFilterFactory extends AbstractTokenFilterFactory {

    private final byte[] charTypeTable;
    private final int flags;
    private final CharArraySet protoWords;

    public WordDelimiterGraphTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
        super(indexSettings, name, settings);

        // Sample format for the type table:
        // $ => DIGIT
        // % => DIGIT
        // . => DIGIT
        // \u002C => DIGIT
        // \u200D => ALPHANUM
        List<String> charTypeTableValues = Analysis.getWordList(env, settings, "type_table");
        if (charTypeTableValues == null) {
            this.charTypeTable = WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE;
        } else {
            this.charTypeTable = parseTypes(charTypeTableValues);
        }
        int flags = 0;
        // If set, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
        flags |= getFlag(GENERATE_WORD_PARTS, settings, "generate_word_parts", true);
        // If set, causes number subwords to be generated: "500-42" => "500" "42"
        flags |= getFlag(GENERATE_NUMBER_PARTS, settings, "generate_number_parts", true);
        // If set, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
        flags |= getFlag(CATENATE_WORDS, settings, "catenate_words", false);
        // If set, causes maximum runs of number parts to be catenated: "500-42" => "50042"
        flags |= getFlag(CATENATE_NUMBERS, settings, "catenate_numbers", false);
        // If set, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
        flags |= getFlag(CATENATE_ALL, settings, "catenate_all", false);
        // If set, causes "PowerShot" to be two tokens ("Power-Shot" remains two parts regardless)
        flags |= getFlag(SPLIT_ON_CASE_CHANGE, settings, "split_on_case_change", true);
        // If set, includes original words in subwords: "500-42" => "500" "42" "500-42"
        flags |= getFlag(PRESERVE_ORIGINAL, settings, "preserve_original", false);
        // If set, causes "j2se" to be three tokens: "j" "2" "se"
        flags |= getFlag(SPLIT_ON_NUMERICS, settings, "split_on_numerics", true);
        // If set, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
        flags |= getFlag(STEM_ENGLISH_POSSESSIVE, settings, "stem_english_possessive", true);
        // If not null, the set of tokens to protect from being delimited
        Set<?> protectedWords = Analysis.getWordSet(env, indexSettings.getIndexVersionCreated(), settings, "protected_words");
        this.protoWords = protectedWords == null ? null : CharArraySet.copy(protectedWords);
        this.flags = flags;
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new WordDelimiterGraphFilter(tokenStream, charTypeTable, flags, protoWords);
    }

    private int getFlag(int flag, Settings settings, String key, boolean defaultValue) {
        if (settings.getAsBoolean(key, defaultValue)) {
            return flag;
        }
        return 0;
    }
}
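For reference, here is a minimal, hypothetical sketch (not part of this PR) of how the flag bitmask assembled by the factory above drives Lucene's `WordDelimiterGraphFilter` directly. The Lucene classes and flag constants are real API; the `WordDelimiterGraphDemo` class name, the whitespace pre-tokenization, and the sample input are invented for illustration, and the expected output assumes the factory's default settings.

```java
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_NUMBER_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.GENERATE_WORD_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.SPLIT_ON_NUMERICS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;

// Hypothetical demo class, not part of this PR.
public class WordDelimiterGraphDemo {
    public static void main(String[] args) throws Exception {
        // Build the same bitmask the factory produces when no settings override the defaults.
        int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS
                | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;

        // Pre-tokenize a small sample input on whitespace.
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot 500-42"));

        // Passing null for protected words means nothing is shielded from delimiting.
        TokenStream stream = new WordDelimiterGraphFilter(tokenizer, flags, null);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // expected: Power, Shot, 500, 42
        }
        stream.end();
        stream.close();
    }
}
```

Running this should print the split subwords (`Power`, `Shot`, `500`, `42`), mirroring the examples given in the comments of the factory above.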
I think this deserves a separate PR, WDYT? We need to document how this works and make sure that we add all the warnings regarding the restrictions of using this filter in conjunction with others.
I can do that. I added it because the test that checks whether all analysis components are exposed failed otherwise.
Ok, I created #23104 to track the inclusion of this new filter. We can add the documentation and tests in a follow-up.