diff --git a/plugins/ingest-attachment/build.gradle b/plugins/ingest-attachment/build.gradle index 97b5a23f1165f..8a4f038b4c36a 100644 --- a/plugins/ingest-attachment/build.gradle +++ b/plugins/ingest-attachment/build.gradle @@ -74,9 +74,11 @@ dependencyLicenses { } forbiddenPatterns { + exclude '**/*.doc' exclude '**/*.docx' exclude '**/*.pdf' exclude '**/*.epub' + exclude '**/*.vsdx' } thirdPartyAudit.excludes = [ diff --git a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java index 27a4cfebbcd2f..b5e2266950780 100644 --- a/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java +++ b/plugins/ingest-attachment/src/main/java/org/elasticsearch/ingest/attachment/TikaImpl.java @@ -22,8 +22,10 @@ import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; import org.elasticsearch.SpecialPermission; import org.elasticsearch.bootstrap.JarHell; import org.elasticsearch.common.SuppressForbidden; @@ -45,7 +47,9 @@ import java.security.PrivilegedExceptionAction; import java.security.ProtectionDomain; import java.security.SecurityPermission; +import java.util.Collections; import java.util.PropertyPermission; +import java.util.Set; /** * Runs tika with limited parsers and limited permissions. @@ -54,6 +58,9 @@ */ final class TikaImpl { + /** Exclude some formats */ + private static final Set EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml")); + /** subset of parsers for types we support */ private static final Parser PARSERS[] = new Parser[] { // documents @@ -63,7 +70,7 @@ final class TikaImpl { new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.OfficeParser(), new org.apache.tika.parser.microsoft.OldExcelParser(), - new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), + ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES), new org.apache.tika.parser.odf.OpenDocumentParser(), new org.apache.tika.parser.iwork.IWorkPackageParser(), new org.apache.tika.parser.xml.DcXMLParser(), diff --git a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java index b59457b5b0119..e5b9d72017fd3 100644 --- a/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java +++ b/plugins/ingest-attachment/src/test/java/org/elasticsearch/ingest/attachment/AttachmentProcessorTests.java @@ -47,6 +47,7 @@ import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.notNullValue; +import static org.hamcrest.Matchers.nullValue; import static org.hamcrest.core.IsCollectionContaining.hasItem; public class AttachmentProcessorTests extends ESTestCase { @@ -130,6 +131,34 @@ public void testWordDocument() throws Exception { is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")); } + public void testWordDocumentWithVisioSchema() throws Exception { + Map attachmentData = parseDocument("issue-22077.docx", processor); + + assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", + "content_length")); + assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); + assertThat(attachmentData.get("language"), is("en")); + assertThat(attachmentData.get("date"), is("2015-01-06T18:07:00Z")); + assertThat(attachmentData.get("author"), is(notNullValue())); + assertThat(attachmentData.get("content_length"), is(notNullValue())); + assertThat(attachmentData.get("content_type").toString(), + is("application/vnd.openxmlformats-officedocument.wordprocessingml.document")); + } + + public void testLegacyWordDocumentWithVisioSchema() throws Exception { + Map attachmentData = parseDocument("issue-22077.doc", processor); + + assertThat(attachmentData.keySet(), containsInAnyOrder("content", "language", "date", "author", "content_type", + "content_length")); + assertThat(attachmentData.get("content").toString(), containsString("Table of Contents")); + assertThat(attachmentData.get("language"), is("en")); + assertThat(attachmentData.get("date"), is("2016-12-16T15:04:00Z")); + assertThat(attachmentData.get("author"), is(notNullValue())); + assertThat(attachmentData.get("content_length"), is(notNullValue())); + assertThat(attachmentData.get("content_type").toString(), + is("application/msword")); + } + public void testPdf() throws Exception { Map attachmentData = parseDocument("test.pdf", processor); assertThat(attachmentData.get("content"), @@ -138,6 +167,13 @@ public void testPdf() throws Exception { assertThat(attachmentData.get("content_length"), is(notNullValue())); } + public void testVisioIsExcluded() throws Exception { + Map attachmentData = parseDocument("issue-22077.vsdx", processor); + assertThat(attachmentData.get("content"), nullValue()); + assertThat(attachmentData.get("content_type"), is("application/vnd.ms-visio.drawing")); + assertThat(attachmentData.get("content_length"), is(0L)); + } + public void testEncryptedPdf() throws Exception { ElasticsearchParseException e = expectThrows(ElasticsearchParseException.class, () -> parseDocument("encrypted.pdf", processor)); assertThat(e.getDetailedMessage(), containsString("document is encrypted")); diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc new file mode 100644 index 0000000000000..10badd5809be6 Binary files /dev/null and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.doc differ diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx new file mode 100644 index 0000000000000..bab550607a915 Binary files /dev/null and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.docx differ diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx new file mode 100644 index 0000000000000..fb9cde51b4b5b Binary files /dev/null and b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/sample-files/issue-22077.vsdx differ diff --git a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files/testPPT.potm.zip b/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files/testPPT.potm.zip deleted file mode 100644 index 0f87b774a2a67..0000000000000 Binary files a/plugins/ingest-attachment/src/test/resources/org/elasticsearch/ingest/attachment/test/tika-files/testPPT.potm.zip and /dev/null differ diff --git a/plugins/mapper-attachments/build.gradle b/plugins/mapper-attachments/build.gradle index 495e49b42c16d..a99fbd4d4e60e 100644 --- a/plugins/mapper-attachments/build.gradle +++ b/plugins/mapper-attachments/build.gradle @@ -74,9 +74,11 @@ dependencyLicenses { } forbiddenPatterns { + exclude '**/*.doc' exclude '**/*.docx' exclude '**/*.pdf' exclude '**/*.epub' + exclude '**/*.vsdx' } thirdPartyAudit.excludes = [ diff --git a/plugins/mapper-attachments/src/main/java/org/elasticsearch/mapper/attachments/TikaImpl.java b/plugins/mapper-attachments/src/main/java/org/elasticsearch/mapper/attachments/TikaImpl.java index 2babda8ad00ff..de964062d2ec4 100644 --- a/plugins/mapper-attachments/src/main/java/org/elasticsearch/mapper/attachments/TikaImpl.java +++ b/plugins/mapper-attachments/src/main/java/org/elasticsearch/mapper/attachments/TikaImpl.java @@ -22,8 +22,10 @@ import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; import org.elasticsearch.SpecialPermission; import org.elasticsearch.bootstrap.JarHell; import org.elasticsearch.common.SuppressForbidden; @@ -45,7 +47,9 @@ import java.security.PrivilegedExceptionAction; import java.security.ProtectionDomain; import java.security.SecurityPermission; +import java.util.Collections; import java.util.PropertyPermission; +import java.util.Set; /** * Runs tika with limited parsers and limited permissions. @@ -54,6 +58,9 @@ */ final class TikaImpl { + /** Exclude some formats */ + private static final Set EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml")); + /** subset of parsers for types we support */ private static final Parser PARSERS[] = new Parser[] { // documents @@ -63,7 +70,7 @@ final class TikaImpl { new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.OfficeParser(), new org.apache.tika.parser.microsoft.OldExcelParser(), - new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), + ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES), new org.apache.tika.parser.odf.OpenDocumentParser(), new org.apache.tika.parser.iwork.IWorkPackageParser(), new org.apache.tika.parser.xml.DcXMLParser(), diff --git a/plugins/mapper-attachments/src/test/java/org/elasticsearch/mapper/attachments/VariousDocTests.java b/plugins/mapper-attachments/src/test/java/org/elasticsearch/mapper/attachments/VariousDocTests.java index b9359d46f5ecd..1fcbb86aeb01e 100644 --- a/plugins/mapper-attachments/src/test/java/org/elasticsearch/mapper/attachments/VariousDocTests.java +++ b/plugins/mapper-attachments/src/test/java/org/elasticsearch/mapper/attachments/VariousDocTests.java @@ -44,7 +44,9 @@ import static org.elasticsearch.mapper.attachments.AttachmentMapper.FieldNames.TITLE; import static org.elasticsearch.test.StreamsUtils.copyToBytesFromClasspath; import static org.elasticsearch.test.StreamsUtils.copyToStringFromClasspath; +import static org.hamcrest.Matchers.is; import static org.hamcrest.Matchers.isEmptyOrNullString; +import static org.hamcrest.Matchers.isEmptyString; import static org.hamcrest.Matchers.not; /** @@ -121,6 +123,40 @@ public void testAsciidocDocument() throws Exception { testMapper("asciidoc.asciidoc", false); } + public void testWordDocumentWithVisioSchema() throws Exception { + assertParseable("issue-22077.docx"); + testMapper("issue-22077.docx", false); + } + + public void testLegacyWordDocumentWithVisioSchema() throws Exception { + assertParseable("issue-22077.doc"); + testMapper("issue-22077.doc", false); + } + + public void testVisioIsExcluded() throws Exception { + String filename = "issue-22077.vsdx"; + try (InputStream is = VariousDocTests.class.getResourceAsStream("/org/elasticsearch/index/mapper/attachment/test/sample-files/" + + filename)) { + byte bytes[] = IOUtils.toByteArray(is); + String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1); + assertThat(parsedContent, isEmptyString()); + } + + byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/attachment/test/sample-files/" + filename); + BytesReference json = jsonBuilder() + .startObject() + .startObject("file") + .field("_name", filename) + .field("_content", html) + .endObject() + .endObject().bytes(); + + ParseContext.Document doc = docMapper.parse("person", "person", "1", json).rootDoc(); + assertThat(doc.get(docMapper.mappers().getMapper("file.content").fieldType().name()), isEmptyString()); + assertThat(doc.get(docMapper.mappers().getMapper("file.content_type").fieldType().name()), is("application/vnd.ms-visio.drawing")); + assertThat(doc.get(docMapper.mappers().getMapper("file.content_length").fieldType().name()), is("210451")); + } + void assertException(String filename, String expectedMessage) throws Exception { try (InputStream is = VariousDocTests.class.getResourceAsStream("/org/elasticsearch/index/mapper/attachment/test/sample-files/" + filename)) { byte bytes[] = IOUtils.toByteArray(is); diff --git a/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/sample-files/issue-22077.doc b/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/sample-files/issue-22077.doc new file mode 100644 index 0000000000000..10badd5809be6 Binary files /dev/null and b/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/sample-files/issue-22077.doc differ diff --git a/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/sample-files/issue-22077.docx b/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/sample-files/issue-22077.docx new file mode 100644 index 0000000000000..bab550607a915 Binary files /dev/null and b/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/sample-files/issue-22077.docx differ diff --git a/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/sample-files/issue-22077.vsdx b/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/sample-files/issue-22077.vsdx new file mode 100644 index 0000000000000..fb9cde51b4b5b Binary files /dev/null and b/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/sample-files/issue-22077.vsdx differ diff --git a/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/tika-files/testPPT.potm.zip b/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/tika-files/testPPT.potm.zip deleted file mode 100644 index 0f87b774a2a67..0000000000000 Binary files a/plugins/mapper-attachments/src/test/resources/org/elasticsearch/index/mapper/attachment/test/tika-files/testPPT.potm.zip and /dev/null differ