Skip to content

Commit 2df907c

Browse files
authored
Add ingest-attachment support for per document indexed_chars limit (#31352)
Today we support a global `indexed_chars` processor parameter, but in some cases users would like to set this limit depending on the document itself. This used to be supported in the mapper-attachments plugin by extracting the limit value from a meta field in the document sent to the indexing process. We add a setting named `indexed_chars_field` which reads this limit value from the document itself. This allows running: ``` PUT _ingest/pipeline/attachment { "description" : "Extract attachment information. Used to parse pdf and office files", "processors" : [ { "attachment" : { "field" : "data", "indexed_chars_field" : "size" } } ] } ``` Then index either: ``` PUT index/doc/1?pipeline=attachment { "data": "BASE64" } ``` which will use the default value (or the one defined by `indexed_chars`), or: ``` PUT index/doc/2?pipeline=attachment { "data": "BASE64", "size": 1000 } ``` which will use the limit read from the document's `size` field. Backport of #28977 to the 6.x branch (6.4.0)
1 parent b2ed885 commit 2df907c

File tree

4 files changed

+264
-21
lines changed

4 files changed

+264
-21
lines changed

docs/plugins/ingest-attachment.asciidoc

Lines changed: 116 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ include::install_remove.asciidoc[]
2525
| `field` | yes | - | The field to get the base64 encoded field from
2626
| `target_field` | no | attachment | The field that will hold the attachment information
2727
| `indexed_chars` | no | 100000 | The number of chars being used for extraction to prevent huge fields. Use `-1` for no limit.
28+
| `indexed_chars_field` | no | `null` | Name of the document field whose value overrides the number of chars being used for extraction. See `indexed_chars`.
2829
| `properties` | no | all properties | Array of properties to select to be stored. Can be `content`, `title`, `name`, `author`, `keywords`, `date`, `content_type`, `content_length`, `language`
2930
| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document
3031
|======
@@ -44,11 +45,11 @@ PUT _ingest/pipeline/attachment
4445
}
4546
]
4647
}
47-
PUT my_index/my_type/my_id?pipeline=attachment
48+
PUT my_index/_doc/my_id?pipeline=attachment
4849
{
4950
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0="
5051
}
51-
GET my_index/my_type/my_id
52+
GET my_index/_doc/my_id
5253
--------------------------------------------------
5354
// CONSOLE
5455

@@ -59,7 +60,7 @@ Returns this:
5960
{
6061
"found": true,
6162
"_index": "my_index",
62-
"_type": "my_type",
63+
"_type": "_doc",
6364
"_id": "my_id",
6465
"_version": 1,
6566
"_source": {
@@ -99,6 +100,115 @@ NOTE: Extracting contents from binary data is a resource intensive operation and
99100
consumes a lot of resources. It is highly recommended to run pipelines
100101
using this processor in a dedicated ingest node.
101102

103+
[[ingest-attachment-extracted-chars]]
104+
==== Limit the number of extracted chars
105+
106+
To prevent extracting too many chars and overloading the node memory, the number of chars being used for extraction
107+
is limited by default to `100000`. You can change this value by setting `indexed_chars`. Use `-1` for no limit but
108+
ensure when setting this that your node has enough heap to extract the content of very big documents.
109+
110+
You can also define this limit per document by reading the limit from a given field of the document. If the document
111+
has that field, its value overrides the `indexed_chars` setting. To choose which field is read, define the `indexed_chars_field`
112+
setting.
113+
114+
For example:
115+
116+
[source,js]
117+
--------------------------------------------------
118+
PUT _ingest/pipeline/attachment
119+
{
120+
"description" : "Extract attachment information",
121+
"processors" : [
122+
{
123+
"attachment" : {
124+
"field" : "data",
125+
"indexed_chars" : 11,
126+
"indexed_chars_field" : "max_size"
127+
}
128+
}
129+
]
130+
}
131+
PUT my_index/_doc/my_id?pipeline=attachment
132+
{
133+
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0="
134+
}
135+
GET my_index/_doc/my_id
136+
--------------------------------------------------
137+
// CONSOLE
138+
139+
Returns this:
140+
141+
[source,js]
142+
--------------------------------------------------
143+
{
144+
"found": true,
145+
"_index": "my_index",
146+
"_type": "_doc",
147+
"_id": "my_id",
148+
"_version": 1,
149+
"_source": {
150+
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=",
151+
"attachment": {
152+
"content_type": "application/rtf",
153+
"language": "sl",
154+
"content": "Lorem ipsum",
155+
"content_length": 11
156+
}
157+
}
158+
}
159+
--------------------------------------------------
160+
// TESTRESPONSE
161+
162+
163+
[source,js]
164+
--------------------------------------------------
165+
PUT _ingest/pipeline/attachment
166+
{
167+
"description" : "Extract attachment information",
168+
"processors" : [
169+
{
170+
"attachment" : {
171+
"field" : "data",
172+
"indexed_chars" : 11,
173+
"indexed_chars_field" : "max_size"
174+
}
175+
}
176+
]
177+
}
178+
PUT my_index/_doc/my_id_2?pipeline=attachment
179+
{
180+
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=",
181+
"max_size": 5
182+
}
183+
GET my_index/_doc/my_id_2
184+
--------------------------------------------------
185+
// CONSOLE
186+
187+
Returns this:
188+
189+
[source,js]
190+
--------------------------------------------------
191+
{
192+
"found": true,
193+
"_index": "my_index",
194+
"_type": "_doc",
195+
"_id": "my_id_2",
196+
"_version": 1,
197+
"_source": {
198+
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=",
199+
"max_size": 5,
200+
"attachment": {
201+
"content_type": "application/rtf",
202+
"language": "ro",
203+
"content": "Lorem",
204+
"content_length": 5
205+
}
206+
}
207+
}
208+
--------------------------------------------------
209+
// TESTRESPONSE
210+
211+
102212
[[ingest-attachment-with-arrays]]
103213
==== Using the Attachment Processor with arrays
104214

@@ -150,7 +260,7 @@ PUT _ingest/pipeline/attachment
150260
}
151261
]
152262
}
153-
PUT my_index/my_type/my_id?pipeline=attachment
263+
PUT my_index/_doc/my_id?pipeline=attachment
154264
{
155265
"attachments" : [
156266
{
@@ -163,7 +273,7 @@ PUT my_index/my_type/my_id?pipeline=attachment
163273
}
164274
]
165275
}
166-
GET my_index/my_type/my_id
276+
GET my_index/_doc/my_id
167277
--------------------------------------------------
168278
// CONSOLE
169279

@@ -172,7 +282,7 @@ Returns this:
172282
--------------------------------------------------
173283
{
174284
"_index" : "my_index",
175-
"_type" : "my_type",
285+
"_type" : "_doc",
176286
"_id" : "my_id",
177287
"_version" : 1,
178288
"found" : true,

plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
import org.elasticsearch.ingest.IngestDocument;
3030
import org.elasticsearch.ingest.Processor;
3131

32-
import java.io.IOException;
3332
import java.util.Arrays;
3433
import java.util.EnumSet;
3534
import java.util.HashMap;
@@ -42,6 +41,7 @@
4241
import static org.elasticsearch.ingest.ConfigurationUtils.readBooleanProperty;
4342
import static org.elasticsearch.ingest.ConfigurationUtils.readIntProperty;
4443
import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalList;
44+
import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalStringProperty;
4545
import static org.elasticsearch.ingest.ConfigurationUtils.readStringProperty;
4646

4747
public final class AttachmentProcessor extends AbstractProcessor {
@@ -55,15 +55,17 @@ public final class AttachmentProcessor extends AbstractProcessor {
5555
private final Set<Property> properties;
5656
private final int indexedChars;
5757
private final boolean ignoreMissing;
58+
private final String indexedCharsField;
5859

5960
AttachmentProcessor(String tag, String field, String targetField, Set<Property> properties,
60-
int indexedChars, boolean ignoreMissing) throws IOException {
61+
int indexedChars, boolean ignoreMissing, String indexedCharsField) {
6162
super(tag);
6263
this.field = field;
6364
this.targetField = targetField;
6465
this.properties = properties;
6566
this.indexedChars = indexedChars;
6667
this.ignoreMissing = ignoreMissing;
68+
this.indexedCharsField = indexedCharsField;
6769
}
6870

6971
boolean isIgnoreMissing() {
@@ -82,6 +84,17 @@ public void execute(IngestDocument ingestDocument) {
8284
throw new IllegalArgumentException("field [" + field + "] is null, cannot parse.");
8385
}
8486

87+
Integer indexedChars = this.indexedChars;
88+
89+
if (indexedCharsField != null) {
90+
// If the user provided the number of characters to be extracted as part of the document, we use it
91+
indexedChars = ingestDocument.getFieldValue(indexedCharsField, Integer.class, true);
92+
if (indexedChars == null) {
93+
// If the field does not exist we fall back to the global limit
94+
indexedChars = this.indexedChars;
95+
}
96+
}
97+
8598
Metadata metadata = new Metadata();
8699
String parsedContent = "";
87100
try {
@@ -183,14 +196,15 @@ public AttachmentProcessor create(Map<String, Processor.Factory> registry, Strin
183196
Map<String, Object> config) throws Exception {
184197
String field = readStringProperty(TYPE, processorTag, config, "field");
185198
String targetField = readStringProperty(TYPE, processorTag, config, "target_field", "attachment");
186-
List<String> properyNames = readOptionalList(TYPE, processorTag, config, "properties");
199+
List<String> propertyNames = readOptionalList(TYPE, processorTag, config, "properties");
187200
int indexedChars = readIntProperty(TYPE, processorTag, config, "indexed_chars", NUMBER_OF_CHARS_INDEXED);
188201
boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false);
202+
String indexedCharsField = readOptionalStringProperty(TYPE, processorTag, config, "indexed_chars_field");
189203

190204
final Set<Property> properties;
191-
if (properyNames != null) {
205+
if (propertyNames != null) {
192206
properties = EnumSet.noneOf(Property.class);
193-
for (String fieldName : properyNames) {
207+
for (String fieldName : propertyNames) {
194208
try {
195209
properties.add(Property.parse(fieldName));
196210
} catch (Exception e) {
@@ -202,7 +216,7 @@ public AttachmentProcessor create(Map<String, Processor.Factory> registry, Strin
202216
properties = DEFAULT_PROPERTIES;
203217
}
204218

205-
return new AttachmentProcessor(processorTag, field, targetField, properties, indexedChars, ignoreMissing);
219+
return new AttachmentProcessor(processorTag, field, targetField, properties, indexedChars, ignoreMissing, indexedCharsField);
206220
}
207221
}
208222

plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java

Lines changed: 54 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@
2727
import org.elasticsearch.test.ESTestCase;
2828
import org.junit.Before;
2929

30-
import java.io.IOException;
3130
import java.io.InputStream;
3231
import java.util.ArrayList;
3332
import java.util.Base64;
@@ -54,9 +53,9 @@ public class AttachmentProcessorTests extends ESTestCase {
5453
private AttachmentProcessor processor;
5554

5655
@Before
57-
public void createStandardProcessor() throws IOException {
56+
public void createStandardProcessor() {
5857
processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field",
59-
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false);
58+
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null);
6059
}
6160

6261
public void testEnglishTextDocument() throws Exception {
@@ -89,7 +88,7 @@ public void testHtmlDocumentWithRandomFields() throws Exception {
8988
selectedProperties.add(AttachmentProcessor.Property.DATE);
9089
}
9190
processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field",
92-
"target_field", selectedProperties, 10000, false);
91+
"target_field", selectedProperties, 10000, false, null);
9392

9493
Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);
9594
assertThat(attachmentData.keySet(), hasSize(selectedFieldNames.length));
@@ -242,15 +241,15 @@ public void testNullValueWithIgnoreMissing() throws Exception {
242241
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(),
243242
Collections.singletonMap("source_field", null));
244243
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
245-
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, true);
244+
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, true, null);
246245
processor.execute(ingestDocument);
247246
assertIngestDocument(originalIngestDocument, ingestDocument);
248247
}
249248

250249
public void testNonExistentWithIgnoreMissing() throws Exception {
251250
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap());
252251
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
253-
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, true);
252+
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, true, null);
254253
processor.execute(ingestDocument);
255254
assertIngestDocument(originalIngestDocument, ingestDocument);
256255
}
@@ -259,22 +258,28 @@ public void testNullWithoutIgnoreMissing() throws Exception {
259258
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(),
260259
Collections.singletonMap("source_field", null));
261260
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
262-
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, false);
261+
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, false, null);
263262
Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument));
264263
assertThat(exception.getMessage(), equalTo("field [source_field] is null, cannot parse."));
265264
}
266265

267266
public void testNonExistentWithoutIgnoreMissing() throws Exception {
268267
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap());
269268
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
270-
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, false);
269+
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, false, null);
271270
Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument));
272271
assertThat(exception.getMessage(), equalTo("field [source_field] not present as part of path [source_field]"));
273272
}
274273

275274
private Map<String, Object> parseDocument(String file, AttachmentProcessor processor) throws Exception {
275+
return parseDocument(file, processor, new HashMap<>());
276+
}
277+
278+
private Map<String, Object> parseDocument(String file, AttachmentProcessor processor, Map<String, Object> optionalFields)
279+
throws Exception {
276280
Map<String, Object> document = new HashMap<>();
277281
document.put("source_field", getAsBase64(file));
282+
document.putAll(optionalFields);
278283

279284
IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document);
280285
processor.execute(ingestDocument);
@@ -284,7 +289,47 @@ private Map<String, Object> parseDocument(String file, AttachmentProcessor proce
284289
return attachmentData;
285290
}
286291

287-
protected String getAsBase64(String filename) throws Exception {
292+
public void testIndexedChars() throws Exception {
293+
processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field",
294+
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, null);
295+
296+
Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor);
297+
298+
assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
299+
assertThat(attachmentData.get("language"), is("en"));
300+
assertThat(attachmentData.get("content"), is("\"God Save the Queen"));
301+
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
302+
assertThat(attachmentData.get("content_length"), is(19L));
303+
304+
processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field",
305+
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, "max_length");
306+
307+
attachmentData = parseDocument("text-in-english.txt", processor);
308+
309+
assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
310+
assertThat(attachmentData.get("language"), is("en"));
311+
assertThat(attachmentData.get("content"), is("\"God Save the Queen"));
312+
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
313+
assertThat(attachmentData.get("content_length"), is(19L));
314+
315+
attachmentData = parseDocument("text-in-english.txt", processor, Collections.singletonMap("max_length", 10));
316+
317+
assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
318+
assertThat(attachmentData.get("language"), is("sk"));
319+
assertThat(attachmentData.get("content"), is("\"God Save"));
320+
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
321+
assertThat(attachmentData.get("content_length"), is(10L));
322+
323+
attachmentData = parseDocument("text-in-english.txt", processor, Collections.singletonMap("max_length", 100));
324+
325+
assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
326+
assertThat(attachmentData.get("language"), is("en"));
327+
assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\""));
328+
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
329+
assertThat(attachmentData.get("content_length"), is(56L));
330+
}
331+
332+
private String getAsBase64(String filename) throws Exception {
288333
String path = "/org/elasticsearch/ingest/attachment/test/sample-files/" + filename;
289334
try (InputStream is = AttachmentProcessorTests.class.getResourceAsStream(path)) {
290335
byte bytes[] = IOUtils.toByteArray(is);

0 commit comments

Comments
 (0)