|
1 | 1 | [[analysis-common-grams-tokenfilter]]
2 | | -=== Common Grams Token Filter
| 2 | +=== Common grams token filter |
| 3 | +++++ |
| 4 | +<titleabbrev>Common grams</titleabbrev> |
| 5 | +++++ |
3 | 6 |
|
4 | | -Token filter that generates bigrams for frequently occurring terms. |
5 | | -Single terms are still indexed. It can be used as an alternative to the |
6 | | -<<analysis-stop-tokenfilter,Stop |
7 | | -Token Filter>> when we don't want to completely ignore common terms. |
| 7 | +Generates https://en.wikipedia.org/wiki/Bigram[bigrams] for a specified set of |
| 8 | +common words. |
8 | 9 |
|
9 | | -For example, the text "the quick brown is a fox" will be tokenized as |
10 | | -"the", "the_quick", "quick", "brown", "brown_is", "is", "is_a", "a", |
11 | | -"a_fox", "fox". Assuming "the", "is" and "a" are common words. |
| 10 | +For example, you can specify `is` and `the` as common words. This filter then |
| 11 | +converts the tokens `[the, quick, fox, is, brown]` to `[the, the_quick, quick, |
| 12 | +fox, fox_is, is, is_brown, brown]`. |
12 | 13 |
|
13 | | -When `query_mode` is enabled, the token filter removes common words and |
14 | | -single terms followed by a common word. This parameter should be enabled |
15 | | -in the search analyzer. |
| 14 | +You can use the `common_grams` filter in place of the |
| 15 | +<<analysis-stop-tokenfilter,stop token filter>> when you don't want to |
| 16 | +completely ignore common words. |
16 | 17 |
|
17 | | -For example, the query "the quick brown is a fox" will be tokenized as |
18 | | -"the_quick", "quick", "brown_is", "is_a", "a_fox", "fox". |
| 18 | +This filter uses Lucene's |
| 19 | +https://lucene.apache.org/core/{lucene_version_path}/analyzers-common/org/apache/lucene/analysis/commongrams/CommonGramsFilter.html[CommonGramsFilter]. |
19 | 20 |
|
20 | | -The following are settings that can be set: |
| 21 | +[[analysis-common-grams-analyze-ex]] |
| 22 | +==== Example |
21 | 23 |
|
22 | | -[cols="<,<",options="header",] |
23 | | -|======================================================================= |
24 | | -|Setting |Description |
25 | | -|`common_words` |A list of common words to use. |
26 | | - |
27 | | -|`common_words_path` |A path (either relative to `config` location, or |
28 | | -absolute) to a list of common words. Each word should be in its own |
29 | | -"line" (separated by a line break). The file must be UTF-8 encoded. |
30 | | - |
31 | | -|`ignore_case` |If true, common words matching will be case insensitive |
32 | | -(defaults to `false`). |
33 | | - |
34 | | -|`query_mode` |Generates bigrams then removes common words and single |
35 | | -terms followed by a common word (defaults to `false`). |
36 | | -|======================================================================= |
37 | | - |
38 | | -Note, `common_words` or `common_words_path` field is required. |
39 | | - |
40 | | -Here is an example: |
| 24 | +The following <<indices-analyze,analyze API>> request creates bigrams for `is` |
| 25 | +and `the`: |
41 | 26 |
|
42 | 27 | [source,console] |
43 | 28 | -------------------------------------------------- |
44 | | -PUT /common_grams_example |
| 29 | +GET /_analyze |
45 | 30 | { |
46 | | - "settings": { |
47 | | - "analysis": { |
48 | | - "analyzer": { |
49 | | - "index_grams": { |
50 | | - "tokenizer": "whitespace", |
51 | | - "filter": ["common_grams"] |
52 | | - }, |
53 | | - "search_grams": { |
54 | | - "tokenizer": "whitespace", |
55 | | - "filter": ["common_grams_query"] |
56 | | - } |
57 | | - }, |
58 | | - "filter": { |
59 | | - "common_grams": { |
60 | | - "type": "common_grams", |
61 | | - "common_words": ["the", "is", "a"] |
62 | | - }, |
63 | | - "common_grams_query": { |
64 | | - "type": "common_grams", |
65 | | - "query_mode": true, |
66 | | - "common_words": ["the", "is", "a"] |
67 | | - } |
68 | | - } |
69 | | - } |
| 31 | + "tokenizer" : "whitespace", |
| 32 | + "filter" : [ |
| 33 | +    {
| 34 | + "type": "common_grams", |
| 35 | + "common_words": ["is", "the"] |
70 | 36 | } |
| 37 | + ], |
| 38 | + "text" : "the quick fox is brown" |
71 | 39 | } |
72 | 40 | -------------------------------------------------- |
73 | 41 |
|
74 | | -You can see the output by using e.g. the `_analyze` endpoint: |
| 42 | +The filter produces the following tokens: |
75 | 43 |
|
76 | | -[source,console] |
| 44 | +[source,text] |
77 | 45 | -------------------------------------------------- |
78 | | -POST /common_grams_example/_analyze |
79 | | -{ |
80 | | - "analyzer" : "index_grams", |
81 | | - "text" : "the quick brown is a fox" |
82 | | -} |
| 46 | +[ the, the_quick, quick, fox, fox_is, is, is_brown, brown ] |
83 | 47 | -------------------------------------------------- |
84 | | -// TEST[continued] |
85 | | - |
86 | | -And the response will be: |
87 | 48 |
|
| 49 | +///////////////////// |
88 | 50 | [source,console-result] |
89 | 51 | -------------------------------------------------- |
90 | 52 | { |
@@ -112,57 +74,155 @@ And the response will be: |
112 | 74 | "position" : 1 |
113 | 75 | }, |
114 | 76 | { |
115 | | - "token" : "brown", |
| 77 | + "token" : "fox", |
116 | 78 | "start_offset" : 10, |
117 | | - "end_offset" : 15, |
| 79 | + "end_offset" : 13, |
118 | 80 | "type" : "word", |
119 | 81 | "position" : 2 |
120 | 82 | }, |
121 | 83 | { |
122 | | - "token" : "brown_is", |
| 84 | + "token" : "fox_is", |
123 | 85 | "start_offset" : 10, |
124 | | - "end_offset" : 18, |
| 86 | + "end_offset" : 16, |
125 | 87 | "type" : "gram", |
126 | 88 | "position" : 2, |
127 | 89 | "positionLength" : 2 |
128 | 90 | }, |
129 | 91 | { |
130 | 92 | "token" : "is", |
131 | | - "start_offset" : 16, |
132 | | - "end_offset" : 18, |
| 93 | + "start_offset" : 14, |
| 94 | + "end_offset" : 16, |
133 | 95 | "type" : "word", |
134 | 96 | "position" : 3 |
135 | 97 | }, |
136 | 98 | { |
137 | | - "token" : "is_a", |
138 | | - "start_offset" : 16, |
139 | | - "end_offset" : 20, |
| 99 | + "token" : "is_brown", |
| 100 | + "start_offset" : 14, |
| 101 | + "end_offset" : 22, |
140 | 102 | "type" : "gram", |
141 | 103 | "position" : 3, |
142 | 104 | "positionLength" : 2 |
143 | 105 | }, |
144 | 106 | { |
145 | | - "token" : "a", |
146 | | - "start_offset" : 19, |
147 | | - "end_offset" : 20, |
| 107 | + "token" : "brown", |
| 108 | + "start_offset" : 17, |
| 109 | + "end_offset" : 22, |
148 | 110 | "type" : "word", |
149 | 111 | "position" : 4 |
150 | | - }, |
151 | | - { |
152 | | - "token" : "a_fox", |
153 | | - "start_offset" : 19, |
154 | | - "end_offset" : 24, |
155 | | - "type" : "gram", |
156 | | - "position" : 4, |
157 | | - "positionLength" : 2 |
158 | | - }, |
159 | | - { |
160 | | - "token" : "fox", |
161 | | - "start_offset" : 21, |
162 | | - "end_offset" : 24, |
163 | | - "type" : "word", |
164 | | - "position" : 5 |
165 | 112 | } |
166 | 113 | ] |
167 | 114 | } |
168 | 115 | -------------------------------------------------- |
| 116 | +///////////////////// |
| 117 | + |
| 118 | +[[analysis-common-grams-tokenfilter-analyzer-ex]] |
| 119 | +==== Add to an analyzer |
| 120 | + |
| 121 | +The following <<indices-create-index,create index API>> request uses the |
| 122 | +`common_grams` filter to configure a new |
| 123 | +<<analysis-custom-analyzer,custom analyzer>>: |
| 124 | + |
| 125 | +[source,console] |
| 126 | +-------------------------------------------------- |
| 127 | +PUT /common_grams_example |
| 128 | +{ |
| 129 | + "settings": { |
| 130 | + "analysis": { |
| 131 | + "analyzer": { |
| 132 | + "index_grams": { |
| 133 | + "tokenizer": "whitespace", |
| 134 | + "filter": ["common_grams"] |
| 135 | + } |
| 136 | + }, |
| 137 | + "filter": { |
| 138 | + "common_grams": { |
| 139 | + "type": "common_grams", |
| 140 | + "common_words": ["a", "is", "the"] |
| 141 | + } |
| 142 | + } |
| 143 | + } |
| 144 | + } |
| 145 | +} |
| 146 | +-------------------------------------------------- |
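| | +
| | +To verify the analyzer, you can run the example text through it with the
| | +<<indices-analyze,analyze API>>. This request is a quick check; the expected
| | +output assumes the `common_words` list above:
| | +
| | +[source,console]
| | +--------------------------------------------------
| | +GET /common_grams_example/_analyze
| | +{
| | +  "analyzer" : "index_grams",
| | +  "text" : "the quick fox is brown"
| | +}
| | +--------------------------------------------------
| | +// TEST[continued]
| | +
| | +Because the text contains no `a`, the output matches the earlier example:
| | +`[ the, the_quick, quick, fox, fox_is, is, is_brown, brown ]`.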
| 147 | + |
| 148 | +[[analysis-common-grams-tokenfilter-configure-parms]] |
| 149 | +==== Configurable parameters |
| 150 | + |
| 151 | +`common_words`:: |
| 152 | ++ |
| 153 | +-- |
| 154 | +(Required+++*+++, array of strings) |
| 155 | +A list of tokens. The filter generates bigrams for these tokens. |
| 156 | + |
| 157 | +Either this or the `common_words_path` parameter is required. |
| 158 | +-- |
| 159 | + |
| 160 | +`common_words_path`:: |
| 161 | ++ |
| 162 | +-- |
| 163 | +(Required+++*+++, string) |
| 164 | +Path to a file containing a list of tokens. The filter generates bigrams for |
| 165 | +these tokens. |
| 166 | + |
| 167 | +This path must be absolute or relative to the `config` location. The file must |
| 168 | +be UTF-8 encoded. Each token in the file must be separated by a line break. |
| 169 | + |
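| | +For example, a common words file (the filename `common_words.txt` is only
| | +illustrative here) would contain one word per line:
| | +
| | +[source,text]
| | +--------------------------------------------------
| | +a
| | +is
| | +the
| | +--------------------------------------------------
| | +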
| 170 | +Either this or the `common_words` parameter is required. |
| 171 | +-- |
| 172 | + |
| 173 | +`ignore_case`:: |
| 174 | +(Optional, boolean) |
| 175 | +If `true`, matches for common words are case-insensitive.
| 176 | +Defaults to `false`. |
| 177 | + |
| 178 | +`query_mode`:: |
| 179 | ++ |
| 180 | +-- |
| 181 | +(Optional, boolean) |
| 182 | +If `true`, the filter excludes the following tokens from the output: |
| 183 | + |
| 184 | +* Unigrams for common words
| 185 | +* Unigrams for terms followed by common words
| | +* The unigram for the last term, if the filter formed a bigram with it
| 186 | + |
| 187 | +Defaults to `false`. We recommend enabling this parameter for |
| 188 | +<<search-analyzer,search analyzers>>. |
| 189 | + |
| 190 | +For example, you can enable this parameter and specify `is` and `the` as |
| 191 | +common words. This filter converts the tokens `[the, quick, fox, is, brown]` to |
| 192 | +`[the_quick, quick, fox_is, is_brown]`.
| 193 | +-- |
| 194 | + |
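| | +You can reproduce this output with the <<indices-analyze,analyze API>> and an
| | +inline filter definition. The following sketch enables `query_mode` on the
| | +filter from the first example:
| | +
| | +[source,console]
| | +--------------------------------------------------
| | +GET /_analyze
| | +{
| | +  "tokenizer" : "whitespace",
| | +  "filter" : [
| | +    {
| | +      "type": "common_grams",
| | +      "common_words": ["is", "the"],
| | +      "query_mode": true
| | +    }
| | +  ],
| | +  "text" : "the quick fox is brown"
| | +}
| | +--------------------------------------------------
| | +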
| 195 | +[[analysis-common-grams-tokenfilter-customize]] |
| 196 | +==== Customize |
| 197 | + |
| 198 | +To customize the `common_grams` filter, duplicate it to create the basis |
| 199 | +for a new custom token filter. You can modify the filter using its configurable |
| 200 | +parameters. |
| 201 | + |
| 202 | +For example, the following request creates a custom `common_grams` filter with |
| 203 | +`ignore_case` and `query_mode` set to `true`: |
| 204 | + |
| 205 | +[source,console] |
| 206 | +-------------------------------------------------- |
| 207 | +PUT /common_grams_example |
| 208 | +{ |
| 209 | + "settings": { |
| 210 | + "analysis": { |
| 211 | + "analyzer": { |
| 212 | +        "search_grams": {
| 213 | + "tokenizer": "whitespace", |
| 214 | + "filter": ["common_grams_query"] |
| 215 | + } |
| 216 | + }, |
| 217 | + "filter": { |
| 218 | + "common_grams_query": { |
| 219 | + "type": "common_grams", |
| 220 | + "common_words": ["a", "is", "the"], |
| 221 | + "ignore_case": true, |
| 222 | + "query_mode": true |
| 223 | + } |
| 224 | + } |
| 225 | + } |
| 226 | + } |
| 227 | +} |
| 228 | +-------------------------------------------------- |
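| | +
| | +As a quick check, the following request runs the example text through the new
| | +analyzer. With `query_mode` enabled, the output drops to
| | +`[ the_quick, quick, fox_is, is_brown ]`; because `ignore_case` is `true`,
| | +capitalized variants such as `The` would match as well:
| | +
| | +[source,console]
| | +--------------------------------------------------
| | +GET /common_grams_example/_analyze
| | +{
| | +  "analyzer" : "search_grams",
| | +  "text" : "the quick fox is brown"
| | +}
| | +--------------------------------------------------
| | +// TEST[continued]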