Skip to content

Commit 87553bb

Browse files
authored
Add ingest-attachment support for per document indexed_chars limit (#28977)
Today we support a global `indexed_chars` processor parameter, but in some cases users would like to set this limit depending on the document itself. This used to be supported in the mapper-attachments plugin by extracting the limit value from a meta field in the document sent for indexing. We add a setting named `indexed_chars_field`, which reads this limit value from a field of the document itself. This allows running: ``` PUT _ingest/pipeline/attachment { "description" : "Extract attachment information. Used to parse pdf and office files", "processors" : [ { "attachment" : { "field" : "data", "indexed_chars_field" : "size" } } ] } ``` Then index either: ``` PUT index/doc/1?pipeline=attachment { "data": "BASE64" } ``` which will use the default value (or the one defined by `indexed_chars`), or: ``` PUT index/doc/2?pipeline=attachment { "data": "BASE64", "size": 1000 } ``` Closes #28942
1 parent 29a7285 commit 87553bb

File tree

4 files changed

+264
-19
lines changed

4 files changed

+264
-19
lines changed

docs/plugins/ingest-attachment.asciidoc

Lines changed: 116 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ include::install_remove.asciidoc[]
2525
| `field` | yes | - | The field to get the base64 encoded field from
2626
| `target_field` | no | attachment | The field that will hold the attachment information
2727
| `indexed_chars` | no | 100000 | The number of chars being used for extraction to prevent huge fields. Use `-1` for no limit.
28+
| `indexed_chars_field` | no | `null` | Name of a document field whose value overrides the number of chars being used for extraction. See `indexed_chars`.
2829
| `properties` | no | all properties | Array of properties to select to be stored. Can be `content`, `title`, `name`, `author`, `keywords`, `date`, `content_type`, `content_length`, `language`
2930
| `ignore_missing` | no | `false` | If `true` and `field` does not exist, the processor quietly exits without modifying the document
3031
|======
@@ -44,11 +45,11 @@ PUT _ingest/pipeline/attachment
4445
}
4546
]
4647
}
47-
PUT my_index/my_type/my_id?pipeline=attachment
48+
PUT my_index/_doc/my_id?pipeline=attachment
4849
{
4950
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0="
5051
}
51-
GET my_index/my_type/my_id
52+
GET my_index/_doc/my_id
5253
--------------------------------------------------
5354
// CONSOLE
5455

@@ -59,7 +60,7 @@ Returns this:
5960
{
6061
"found": true,
6162
"_index": "my_index",
62-
"_type": "my_type",
63+
"_type": "_doc",
6364
"_id": "my_id",
6465
"_version": 1,
6566
"_source": {
@@ -99,6 +100,115 @@ NOTE: Extracting contents from binary data is a resource intensive operation and
99100
consumes a lot of resources. It is highly recommended to run pipelines
100101
using this processor in a dedicated ingest node.
101102

103+
[[ingest-attachment-extracted-chars]]
104+
==== Limit the number of extracted chars
105+
106+
To prevent extracting too many chars and overloading the node memory, the number of chars being used for extraction
107+
is limited by default to `100000`. You can change this value by setting `indexed_chars`. Use `-1` for no limit but
108+
ensure when setting this that your node has enough heap to extract the content of very big documents.
109+
110+
You can also define this limit per document by reading it from a given field of the document. If the document
111+
has that field, its value overrides the `indexed_chars` setting. To specify the field name, define the `indexed_chars_field`
112+
setting.
113+
114+
For example:
115+
116+
[source,js]
117+
--------------------------------------------------
118+
PUT _ingest/pipeline/attachment
119+
{
120+
"description" : "Extract attachment information",
121+
"processors" : [
122+
{
123+
"attachment" : {
124+
"field" : "data",
125+
"indexed_chars" : 11,
126+
"indexed_chars_field" : "max_size"
127+
}
128+
}
129+
]
130+
}
131+
PUT my_index/_doc/my_id?pipeline=attachment
132+
{
133+
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0="
134+
}
135+
GET my_index/_doc/my_id
136+
--------------------------------------------------
137+
// CONSOLE
138+
139+
Returns this:
140+
141+
[source,js]
142+
--------------------------------------------------
143+
{
144+
"found": true,
145+
"_index": "my_index",
146+
"_type": "_doc",
147+
"_id": "my_id",
148+
"_version": 1,
149+
"_source": {
150+
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=",
151+
"attachment": {
152+
"content_type": "application/rtf",
153+
"language": "sl",
154+
"content": "Lorem ipsum",
155+
"content_length": 11
156+
}
157+
}
158+
}
159+
--------------------------------------------------
160+
// TESTRESPONSE
161+
162+
163+
[source,js]
164+
--------------------------------------------------
165+
PUT _ingest/pipeline/attachment
166+
{
167+
"description" : "Extract attachment information",
168+
"processors" : [
169+
{
170+
"attachment" : {
171+
"field" : "data",
172+
"indexed_chars" : 11,
173+
"indexed_chars_field" : "max_size"
174+
}
175+
}
176+
]
177+
}
178+
PUT my_index/_doc/my_id_2?pipeline=attachment
179+
{
180+
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=",
181+
"max_size": 5
182+
}
183+
GET my_index/_doc/my_id_2
184+
--------------------------------------------------
185+
// CONSOLE
186+
187+
Returns this:
188+
189+
[source,js]
190+
--------------------------------------------------
191+
{
192+
"found": true,
193+
"_index": "my_index",
194+
"_type": "_doc",
195+
"_id": "my_id_2",
196+
"_version": 1,
197+
"_source": {
198+
"data": "e1xydGYxXGFuc2kNCkxvcmVtIGlwc3VtIGRvbG9yIHNpdCBhbWV0DQpccGFyIH0=",
199+
"max_size": 5,
200+
"attachment": {
201+
"content_type": "application/rtf",
202+
"language": "ro",
203+
"content": "Lorem",
204+
"content_length": 5
205+
}
206+
}
207+
}
208+
--------------------------------------------------
209+
// TESTRESPONSE
210+
211+
102212
[[ingest-attachment-with-arrays]]
103213
==== Using the Attachment Processor with arrays
104214

@@ -150,7 +260,7 @@ PUT _ingest/pipeline/attachment
150260
}
151261
]
152262
}
153-
PUT my_index/my_type/my_id?pipeline=attachment
263+
PUT my_index/_doc/my_id?pipeline=attachment
154264
{
155265
"attachments" : [
156266
{
@@ -163,7 +273,7 @@ PUT my_index/my_type/my_id?pipeline=attachment
163273
}
164274
]
165275
}
166-
GET my_index/my_type/my_id
276+
GET my_index/_doc/my_id
167277
--------------------------------------------------
168278
// CONSOLE
169279

@@ -172,7 +282,7 @@ Returns this:
172282
--------------------------------------------------
173283
{
174284
"_index" : "my_index",
175-
"_type" : "my_type",
285+
"_type" : "_doc",
176286
"_id" : "my_id",
177287
"_version" : 1,
178288
"found" : true,

plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/AttachmentProcessor.java

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
import static org.elasticsearch.ingest.ConfigurationUtils.readBooleanProperty;
4343
import static org.elasticsearch.ingest.ConfigurationUtils.readIntProperty;
4444
import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalList;
45+
import static org.elasticsearch.ingest.ConfigurationUtils.readOptionalStringProperty;
4546
import static org.elasticsearch.ingest.ConfigurationUtils.readStringProperty;
4647

4748
public final class AttachmentProcessor extends AbstractProcessor {
@@ -55,15 +56,17 @@ public final class AttachmentProcessor extends AbstractProcessor {
5556
private final Set<Property> properties;
5657
private final int indexedChars;
5758
private final boolean ignoreMissing;
59+
private final String indexedCharsField;
5860

5961
AttachmentProcessor(String tag, String field, String targetField, Set<Property> properties,
60-
int indexedChars, boolean ignoreMissing) throws IOException {
62+
int indexedChars, boolean ignoreMissing, String indexedCharsField) {
6163
super(tag);
6264
this.field = field;
6365
this.targetField = targetField;
6466
this.properties = properties;
6567
this.indexedChars = indexedChars;
6668
this.ignoreMissing = ignoreMissing;
69+
this.indexedCharsField = indexedCharsField;
6770
}
6871

6972
boolean isIgnoreMissing() {
@@ -82,6 +85,17 @@ public void execute(IngestDocument ingestDocument) {
8285
throw new IllegalArgumentException("field [" + field + "] is null, cannot parse.");
8386
}
8487

88+
Integer indexedChars = this.indexedChars;
89+
90+
if (indexedCharsField != null) {
91+
// If the user provided the number of characters to be extracted as part of the document, we use it
92+
indexedChars = ingestDocument.getFieldValue(indexedCharsField, Integer.class, true);
93+
if (indexedChars == null) {
94+
// If the field does not exist we fall back to the global limit
95+
indexedChars = this.indexedChars;
96+
}
97+
}
98+
8599
Metadata metadata = new Metadata();
86100
String parsedContent = "";
87101
try {
@@ -183,14 +197,15 @@ public AttachmentProcessor create(Map<String, Processor.Factory> registry, Strin
183197
Map<String, Object> config) throws Exception {
184198
String field = readStringProperty(TYPE, processorTag, config, "field");
185199
String targetField = readStringProperty(TYPE, processorTag, config, "target_field", "attachment");
186-
List<String> properyNames = readOptionalList(TYPE, processorTag, config, "properties");
200+
List<String> propertyNames = readOptionalList(TYPE, processorTag, config, "properties");
187201
int indexedChars = readIntProperty(TYPE, processorTag, config, "indexed_chars", NUMBER_OF_CHARS_INDEXED);
188202
boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, "ignore_missing", false);
203+
String indexedCharsField = readOptionalStringProperty(TYPE, processorTag, config, "indexed_chars_field");
189204

190205
final Set<Property> properties;
191-
if (properyNames != null) {
206+
if (propertyNames != null) {
192207
properties = EnumSet.noneOf(Property.class);
193-
for (String fieldName : properyNames) {
208+
for (String fieldName : propertyNames) {
194209
try {
195210
properties.add(Property.parse(fieldName));
196211
} catch (Exception e) {
@@ -202,7 +217,7 @@ public AttachmentProcessor create(Map<String, Processor.Factory> registry, Strin
202217
properties = DEFAULT_PROPERTIES;
203218
}
204219

205-
return new AttachmentProcessor(processorTag, field, targetField, properties, indexedChars, ignoreMissing);
220+
return new AttachmentProcessor(processorTag, field, targetField, properties, indexedChars, ignoreMissing, indexedCharsField);
206221
}
207222
}
208223

plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java

Lines changed: 54 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ public class AttachmentProcessorTests extends ESTestCase {
5454
private AttachmentProcessor processor;
5555

5656
@Before
57-
public void createStandardProcessor() throws IOException {
57+
public void createStandardProcessor() {
5858
processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field",
59-
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false);
59+
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 10000, false, null);
6060
}
6161

6262
public void testEnglishTextDocument() throws Exception {
@@ -89,7 +89,7 @@ public void testHtmlDocumentWithRandomFields() throws Exception {
8989
selectedProperties.add(AttachmentProcessor.Property.DATE);
9090
}
9191
processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field",
92-
"target_field", selectedProperties, 10000, false);
92+
"target_field", selectedProperties, 10000, false, null);
9393

9494
Map<String, Object> attachmentData = parseDocument("htmlWithEmptyDateMeta.html", processor);
9595
assertThat(attachmentData.keySet(), hasSize(selectedFieldNames.length));
@@ -242,15 +242,15 @@ public void testNullValueWithIgnoreMissing() throws Exception {
242242
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(),
243243
Collections.singletonMap("source_field", null));
244244
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
245-
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, true);
245+
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, true, null);
246246
processor.execute(ingestDocument);
247247
assertIngestDocument(originalIngestDocument, ingestDocument);
248248
}
249249

250250
public void testNonExistentWithIgnoreMissing() throws Exception {
251251
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap());
252252
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
253-
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, true);
253+
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, true, null);
254254
processor.execute(ingestDocument);
255255
assertIngestDocument(originalIngestDocument, ingestDocument);
256256
}
@@ -259,22 +259,28 @@ public void testNullWithoutIgnoreMissing() throws Exception {
259259
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(),
260260
Collections.singletonMap("source_field", null));
261261
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
262-
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, false);
262+
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, false, null);
263263
Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument));
264264
assertThat(exception.getMessage(), equalTo("field [source_field] is null, cannot parse."));
265265
}
266266

267267
public void testNonExistentWithoutIgnoreMissing() throws Exception {
268268
IngestDocument originalIngestDocument = RandomDocumentPicks.randomIngestDocument(random(), Collections.emptyMap());
269269
IngestDocument ingestDocument = new IngestDocument(originalIngestDocument);
270-
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, false);
270+
Processor processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field", "randomTarget", null, 10, false, null);
271271
Exception exception = expectThrows(Exception.class, () -> processor.execute(ingestDocument));
272272
assertThat(exception.getMessage(), equalTo("field [source_field] not present as part of path [source_field]"));
273273
}
274274

275275
private Map<String, Object> parseDocument(String file, AttachmentProcessor processor) throws Exception {
276+
return parseDocument(file, processor, new HashMap<>());
277+
}
278+
279+
private Map<String, Object> parseDocument(String file, AttachmentProcessor processor, Map<String, Object> optionalFields)
280+
throws Exception {
276281
Map<String, Object> document = new HashMap<>();
277282
document.put("source_field", getAsBase64(file));
283+
document.putAll(optionalFields);
278284

279285
IngestDocument ingestDocument = RandomDocumentPicks.randomIngestDocument(random(), document);
280286
processor.execute(ingestDocument);
@@ -284,7 +290,47 @@ private Map<String, Object> parseDocument(String file, AttachmentProcessor proce
284290
return attachmentData;
285291
}
286292

287-
protected String getAsBase64(String filename) throws Exception {
293+
public void testIndexedChars() throws Exception {
294+
processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field",
295+
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, null);
296+
297+
Map<String, Object> attachmentData = parseDocument("text-in-english.txt", processor);
298+
299+
assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
300+
assertThat(attachmentData.get("language"), is("en"));
301+
assertThat(attachmentData.get("content"), is("\"God Save the Queen"));
302+
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
303+
assertThat(attachmentData.get("content_length"), is(19L));
304+
305+
processor = new AttachmentProcessor(randomAlphaOfLength(10), "source_field",
306+
"target_field", EnumSet.allOf(AttachmentProcessor.Property.class), 19, false, "max_length");
307+
308+
attachmentData = parseDocument("text-in-english.txt", processor);
309+
310+
assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
311+
assertThat(attachmentData.get("language"), is("en"));
312+
assertThat(attachmentData.get("content"), is("\"God Save the Queen"));
313+
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
314+
assertThat(attachmentData.get("content_length"), is(19L));
315+
316+
attachmentData = parseDocument("text-in-english.txt", processor, Collections.singletonMap("max_length", 10));
317+
318+
assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
319+
assertThat(attachmentData.get("language"), is("sk"));
320+
assertThat(attachmentData.get("content"), is("\"God Save"));
321+
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
322+
assertThat(attachmentData.get("content_length"), is(10L));
323+
324+
attachmentData = parseDocument("text-in-english.txt", processor, Collections.singletonMap("max_length", 100));
325+
326+
assertThat(attachmentData.keySet(), containsInAnyOrder("language", "content", "content_type", "content_length"));
327+
assertThat(attachmentData.get("language"), is("en"));
328+
assertThat(attachmentData.get("content"), is("\"God Save the Queen\" (alternatively \"God Save the King\""));
329+
assertThat(attachmentData.get("content_type").toString(), containsString("text/plain"));
330+
assertThat(attachmentData.get("content_length"), is(56L));
331+
}
332+
333+
private String getAsBase64(String filename) throws Exception {
288334
String path = "/org/elasticsearch/ingest/attachment/test/sample-files/" + filename;
289335
try (InputStream is = AttachmentProcessorTests.class.getResourceAsStream(path)) {
290336
byte bytes[] = IOUtils.toByteArray(is);

0 commit comments

Comments
 (0)