Skip to content

Commit 07a9f29

Browse files
committed
Remove support for Visio and potm files
* Parse an unsupported document using `mapper-attachments` * If Tika is not able to parse the document because of a missing class (we are not importing all jars needed by Tika), Tika throws a Throwable which is not caught. This commit removes support for Visio and POTM office files. The good news is that this no longer kills the node and still allows extracting the text contained in the Office document even if it embeds Visio content (which is no longer extracted). Related to #22077 and #22079 for the mapper-attachments plugin. Backport of #23214 to the 5.2 branch.
1 parent 76a977a commit 07a9f29

File tree

7 files changed

+46
-1
lines changed

7 files changed

+46
-1
lines changed

plugins/mapper-attachments/build.gradle

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,11 @@ dependencyLicenses {
7474
}
7575

7676
forbiddenPatterns {
77+
exclude '**/*.doc'
7778
exclude '**/*.docx'
7879
exclude '**/*.pdf'
7980
exclude '**/*.epub'
81+
exclude '**/*.vsdx'
8082
}
8183

8284
thirdPartyAudit.excludes = [

plugins/mapper-attachments/src/main/java/org/elasticsearch/mapper/attachments/TikaImpl.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,10 @@
2222
import org.apache.tika.Tika;
2323
import org.apache.tika.exception.TikaException;
2424
import org.apache.tika.metadata.Metadata;
25+
import org.apache.tika.mime.MediaType;
2526
import org.apache.tika.parser.AutoDetectParser;
2627
import org.apache.tika.parser.Parser;
28+
import org.apache.tika.parser.ParserDecorator;
2729
import org.elasticsearch.SpecialPermission;
2830
import org.elasticsearch.bootstrap.JarHell;
2931
import org.elasticsearch.common.SuppressForbidden;
@@ -45,7 +47,9 @@
4547
import java.security.PrivilegedExceptionAction;
4648
import java.security.ProtectionDomain;
4749
import java.security.SecurityPermission;
50+
import java.util.Collections;
4851
import java.util.PropertyPermission;
52+
import java.util.Set;
4953

5054
/**
5155
* Runs tika with limited parsers and limited permissions.
@@ -54,6 +58,9 @@
5458
*/
5559
final class TikaImpl {
5660

61+
/** Exclude some formats */
62+
private static final Set<MediaType> EXCLUDES = Collections.singleton(MediaType.application("x-tika-ooxml"));
63+
5764
/** subset of parsers for types we support */
5865
private static final Parser PARSERS[] = new Parser[] {
5966
// documents
@@ -63,7 +70,7 @@ final class TikaImpl {
6370
new org.apache.tika.parser.txt.TXTParser(),
6471
new org.apache.tika.parser.microsoft.OfficeParser(),
6572
new org.apache.tika.parser.microsoft.OldExcelParser(),
66-
new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(),
73+
ParserDecorator.withoutTypes(new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), EXCLUDES),
6774
new org.apache.tika.parser.odf.OpenDocumentParser(),
6875
new org.apache.tika.parser.iwork.IWorkPackageParser(),
6976
new org.apache.tika.parser.xml.DcXMLParser(),

plugins/mapper-attachments/src/test/java/org/elasticsearch/mapper/attachments/VariousDocTests.java

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@
4444
import static org.elasticsearch.mapper.attachments.AttachmentMapper.FieldNames.TITLE;
4545
import static org.elasticsearch.test.StreamsUtils.copyToBytesFromClasspath;
4646
import static org.elasticsearch.test.StreamsUtils.copyToStringFromClasspath;
47+
import static org.hamcrest.Matchers.is;
4748
import static org.hamcrest.Matchers.isEmptyOrNullString;
49+
import static org.hamcrest.Matchers.isEmptyString;
4850
import static org.hamcrest.Matchers.not;
4951

5052
/**
@@ -121,6 +123,40 @@ public void testAsciidocDocument() throws Exception {
121123
testMapper("asciidoc.asciidoc", false);
122124
}
123125

126+
public void testWordDocumentWithVisioSchema() throws Exception {
127+
assertParseable("issue-22077.docx");
128+
testMapper("issue-22077.docx", false);
129+
}
130+
131+
public void testLegacyWordDocumentWithVisioSchema() throws Exception {
132+
assertParseable("issue-22077.doc");
133+
testMapper("issue-22077.doc", false);
134+
}
135+
136+
public void testVisioIsExcluded() throws Exception {
137+
String filename = "issue-22077.vsdx";
138+
try (InputStream is = VariousDocTests.class.getResourceAsStream("/org/elasticsearch/index/mapper/attachment/test/sample-files/" +
139+
filename)) {
140+
byte bytes[] = IOUtils.toByteArray(is);
141+
String parsedContent = TikaImpl.parse(bytes, new Metadata(), -1);
142+
assertThat(parsedContent, isEmptyString());
143+
}
144+
145+
byte[] html = copyToBytesFromClasspath("/org/elasticsearch/index/mapper/attachment/test/sample-files/" + filename);
146+
BytesReference json = jsonBuilder()
147+
.startObject()
148+
.startObject("file")
149+
.field("_name", filename)
150+
.field("_content", html)
151+
.endObject()
152+
.endObject().bytes();
153+
154+
ParseContext.Document doc = docMapper.parse("person", "person", "1", json).rootDoc();
155+
assertThat(doc.get(docMapper.mappers().getMapper("file.content").fieldType().name()), isEmptyString());
156+
assertThat(doc.get(docMapper.mappers().getMapper("file.content_type").fieldType().name()), is("application/vnd.ms-visio.drawing"));
157+
assertThat(doc.get(docMapper.mappers().getMapper("file.content_length").fieldType().name()), is("210451"));
158+
}
159+
124160
void assertException(String filename, String expectedMessage) throws Exception {
125161
try (InputStream is = VariousDocTests.class.getResourceAsStream("/org/elasticsearch/index/mapper/attachment/test/sample-files/" + filename)) {
126162
byte bytes[] = IOUtils.toByteArray(is);

0 commit comments

Comments
 (0)