From 6485d175f1fef0301ecf97e714a6ae61f19d3f7d Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Thu, 2 Mar 2023 14:26:08 +0100 Subject: [PATCH 1/2] Emit combined text for mixed content in HtmlDecoder Was only emitting the children's text before --- .../main/java/org/metafacture/html/HtmlDecoder.java | 10 ++++------ .../java/org/metafacture/html/HtmlDecoderTest.java | 11 +++++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java b/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java index eeb1e4397..8438842c1 100644 --- a/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java +++ b/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java @@ -98,12 +98,10 @@ private void process(final Element parent, final StreamReceiver receiver) { addedValueAsSubfield = handleAttributeValuesAsSubfields(receiver, element, attributes, attribute); receiver.literal(attribute.getKey(), attribute.getValue()); } - if (element.children().isEmpty()) { - final String text = element.text().trim(); - final String value = text.isEmpty() ? element.data() : text; - if (!value.isEmpty() && !addedValueAsSubfield) { - receiver.literal("value", value); - } + final String text = element.text().trim(); + final String value = text.isEmpty() ? element.data() : text; + if (!value.isEmpty() && !addedValueAsSubfield) { + receiver.literal("value", value); } process(element, receiver); receiver.endEntity(); diff --git a/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java b/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java index 66d737875..fa70245dd 100644 --- a/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java +++ b/metafacture-html/src/test/java/org/metafacture/html/HtmlDecoderTest.java @@ -77,6 +77,17 @@ public void nestedEntities() { } + @Test + public void mixedContent() { + htmlDecoder.process(new StringReader("

This is the full text

")); + final InOrder ordered = inOrder(receiver); + ordered.verify(receiver).startEntity("p"); + ordered.verify(receiver).literal("value", "This is the full text"); + // elements above plus body, html + ordered.verify(receiver, times(4)).endEntity(); + + } + @Test public void htmlAttributesAsLiterals() { htmlDecoder.process(new StringReader("

Text")); From fb23d42b108076da9d292ad1582ec4111109578c Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Thu, 2 Mar 2023 15:32:49 +0100 Subject: [PATCH 2/2] Replace type specification with diamond operator --- .../src/main/java/org/metafacture/html/HtmlDecoder.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java b/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java index 8438842c1..f80ac407f 100644 --- a/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java +++ b/metafacture-html/src/main/java/org/metafacture/html/HtmlDecoder.java @@ -131,7 +131,7 @@ private boolean handleAttributeValuesAsSubfields(final StreamReceiver receiver, * @param mapString the attributes to be added as subfields */ public void setAttrValsAsSubfields(final String mapString) { - this.attrValsAsSubfields = new HashMap(); + this.attrValsAsSubfields = new HashMap<>(); final String input = mapString.startsWith("&") ? DEFAULT_ATTR_VALS_AS_SUBFIELDS + mapString : mapString; for (final String nameValuePair : input.split("&")) { final String[] nameValue = nameValuePair.split("=");