Skip to content

Commit 76a977a

Browse files
committed
Remove support for Visio and potm files
* Send an unsupported document to an ingest pipeline using `ingest-attachment` * If Tika is not able to parse the document because of a missing class (we are not importing all jars needed by Tika), Tika throws a Throwable which is not caught. This commit removes support for Visio and POTM office files, so Elasticsearch is no longer killed when you run a command like: ``` GET _ingest/pipeline/_simulate { "pipeline" : { "processors" : [ { "attachment" : { "field" : "file" } } ] }, "docs" : [ { "_source" : { "file" : "BASE64CONTENT" } } ] } ``` The good news is that this no longer kills the node, and the text in the Office document can still be extracted even if the document embeds Visio content (which is no longer extracted). Related to #22077 Backport of #23214 in 5.2 branch
1 parent 0561d1b commit 76a977a

File tree

7 files changed

+46
-1
lines changed

7 files changed

+46
-1
lines changed

plugins/ingest-attachment/build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,11 @@ dependencyLicenses {
7474
}
7575

7676
forbiddenPatterns {
77+
exclude '**/*.doc'
7778
exclude '**/*.docx'
7879
exclude '**/*.pdf'
7980
exclude '**/*.epub'
81+
exclude '**/*.vsdx'
8082
}
8183

8284
thirdPartyAudit.excludes = [

plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@
2222
import org.apache.tika.Tika;
2323
import org.apache.tika.exception.TikaException;
2424
import org.apache.tika.metadata.Metadata;
25+
import org.apache.tika.mime.MediaType;
2526
import org.apache.tika.parser.AutoDetectParser;
2627
import org.apache.tika.parser.Parser;
28+
import org.apache.tika.parser.ParserDecorator;
2729
import org.elasticsearch.SpecialPermission;
2830
import org.elasticsearch.bootstrap.JarHell;
2931
import org.elasticsearch.common.SuppressForbidden;
@@ -45,7 +47,9 @@
4547
import java.security.PrivilegedExceptionAction;
4648
import java.security.ProtectionDomain;
4749
import java.security.SecurityPermission;
50+
import java.util.Collections;
4851
import java.util.PropertyPermission;
52+
import java.util.Set;
4953

5054
/**
5155
* Runs tika with limited parsers and limited permissions.
@@ -54,6 +58,9 @@
5458
*/
5559
final class TikaImpl {
5660

61+
/** Exclude some formats */
62+
private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml"));
63+
5764
/** subset of parsers for types we support */
5865
private static final Parser PARSERS[] = new Parser[] {
5966
// documents
@@ -63,7 +70,7 @@ final class TikaImpl {
6370
new org.apache.tika.parser.txt.TXTParser(),
6471
new org.apache.tika.parser.microsoft.OfficeParser(),
6572
new org.apache.tika.parser.microsoft.OldExcelParser(),
66-
new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
73+
ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
6774
new org.apache.tika.parser.odf.OpenDocumentParser(),
6875
new org.apache.tika.parser.iwork.IWorkPackageParser(),
6976
new org.apache.tika.parser.xml.DcXMLParser(),

plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
import static org.hamcrest.Matchers.is;
4848
import static org.hamcrest.Matchers.not;
4949
import static org.hamcrest.Matchers.notNullValue;
50+
import static org.hamcrest.Matchers.nullValue;
5051
import static org.hamcrest.core.IsCollectionContaining.hasItem;
5152

5253
public class AttachmentProcessorTests extends ESTestCase {
@@ -130,6 +131,34 @@ public void testWordDocument() throws Exception {
130131
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
131132
}
132133

134+
public void testWordDocumentWithVisioSchema() throws Exception {
135+
Map<String, Object> attachmentData = parseDocument("issue-22077.docx", processor);
136+
137+
assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
138+
"content_length"));
139+
assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
140+
assertThat(attachmentData.get("language"), is("en"));
141+
assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z"));
142+
assertThat(attachmentData.get("author"), is(notNullValue()));
143+
assertThat(attachmentData.get("content_length"), is(notNullValue()));
144+
assertThat(attachmentData.get("content_type").toString(),
145+
is("application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
146+
}
147+
148+
public void testLegacyWordDocumentWithVisioSchema() throws Exception {
149+
Map<String, Object> attachmentData = parseDocument("issue-22077.doc", processor);
150+
151+
assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type",
152+
"content_length"));
153+
assertThat(attachmentData.get("content").toString(), containsString("Table of Contents"));
154+
assertThat(attachmentData.get("language"), is("en"));
155+
assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z"));
156+
assertThat(attachmentData.get("author"), is(notNullValue()));
157+
assertThat(attachmentData.get("content_length"), is(notNullValue()));
158+
assertThat(attachmentData.get("content_type").toString(),
159+
is("application/msword"));
160+
}
161+
133162
public void testPdf() throws Exception {
134163
Map<String, Object> attachmentData = parseDocument("test.pdf", processor);
135164
assertThat(attachmentData.get("content"),
@@ -138,6 +167,13 @@ public void testPdf() throws Exception {
138167
assertThat(attachmentData.get("content_length"), is(notNullValue()));
139168
}
140169

170+
public void testVisioIsExcluded() throws Exception {
171+
Map<String, Object> attachmentData = parseDocument("issue-22077.vsdx", processor);
172+
assertThat(attachmentData.get("content"), nullValue());
173+
assertThat(attachmentData.get("content_type"), is("application/vnd.ms-visio.drawing"));
174+
assertThat(attachmentData.get("content_length"), is(0L));
175+
}
176+
141177
public void testEncryptedPdf() throws Exception {
142178
ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor));
143179
assertThat(e.getDetailedMessage(), containsString("document is encrypted"));

0 commit comments

Comments
 (0)