From b32b900f390d96ac80c08bacc2d7161bccdafd73 Mon Sep 17 00:00:00 2001
From: Buck Evan
Date: Wed, 22 Oct 2025 09:28:13 -0500
Subject: [PATCH] Fix #160: Add proper text escaping in FormatHtml
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

HTML text nodes containing &, <, > were output without escaping, causing
xq's output to be unparseable when piped back through xq -j.

This commit adds:
- New escapeTextContent() function for minimal entity escaping
- Modified FormatHtml to escape text nodes with &, <, >
- Tests verifying the output is valid XML

Example issue:
  echo '<html>1 &amp; 2</html>' | xq | xq -j
  # Before: Error - bare & in output
  # After: Success - properly escaped as &amp;

This is a critical fix preventing data corruption when round-tripping HTML
through xq.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude
---
 cmd/root_test.go        | 52 +++++++++++++++++++++++++++++++++++++++++
 internal/utils/utils.go | 12 ++++++++++
 2 files changed, 64 insertions(+)

diff --git a/cmd/root_test.go b/cmd/root_test.go
index 52b4866..2351e5e 100644
--- a/cmd/root_test.go
+++ b/cmd/root_test.go
@@ -8,6 +8,7 @@ import (
 	"strings"
 	"testing"
 
+	"github.com/antchfx/xmlquery"
 	"github.com/sibprogrammer/xq/internal/utils"
 	"github.com/spf13/cobra"
 	"github.com/spf13/pflag"
@@ -102,6 +103,57 @@ func TestRootCmd(t *testing.T) {
 	assert.ErrorContains(t, err, "invalid argument")
 }
 
+func TestEscapedTextNodes(t *testing.T) {
+	// Test case 1: ampersand entity - reproduce issue #160
+	// xq outputs bare & which fails when parsed as XML (used by -j flag)
+	t.Run("ampersand entity output is valid XML", func(t *testing.T) {
+		input := "<html>1 &amp; 2</html>"
+
+		// First pass: format the HTML
+		reader1 := strings.NewReader(input)
+		var output1 bytes.Buffer
+		err := utils.FormatHtml(reader1, &output1, "", utils.ColorsDisabled)
+		assert.Nil(t, err)
+
+		result1 := strings.TrimSpace(output1.String())
+		t.Logf("First pass output: %q", result1)
+
+		// Second pass: try to parse as XML (this is what `xq -j` does)
+		reader2 := strings.NewReader(result1)
+		_, err = xmlquery.Parse(reader2)
+		assert.Nil(t, err, "xq output should be parseable as XML (for -j flag)")
+	})
+
+	// Test case 2: less-than and greater-than entities - reproduce issue #160
+	// xq outputs bare < and > which are parsed as tags
+	t.Run("less-than and greater-than entities output is valid XML", func(t *testing.T) {
+		input := "<html>is &lt;bold&gt; a valid tag?</html>"
+
+		// First pass: format the HTML
+		reader1 := strings.NewReader(input)
+		var output1 bytes.Buffer
+		err := utils.FormatHtml(reader1, &output1, "", utils.ColorsDisabled)
+		assert.Nil(t, err)
+
+		result1 := strings.TrimSpace(output1.String())
+		t.Logf("First pass output: %q", result1)
+
+		// Second pass: try to parse as XML (this is what `xq -j` does)
+		reader2 := strings.NewReader(result1)
+		doc, err := xmlquery.Parse(reader2)
+		assert.Nil(t, err, "xq output should be parseable as XML (for -j flag)")
+
+		// Verify the text content is preserved correctly
+		if doc != nil {
+			textNode := xmlquery.FindOne(doc, "//html")
+			if textNode != nil {
+				assert.Equal(t, "is <bold> a valid tag?", textNode.InnerText(),
+					"Text content should preserve the literal < and > characters")
+			}
+		}
+	})
+}
+
 func TestProcessAsJSON(t *testing.T) {
 	tests := []struct {
 		name string
diff --git a/internal/utils/utils.go b/internal/utils/utils.go
index 2597b2a..a253be6 100644
--- a/internal/utils/utils.go
+++ b/internal/utils/utils.go
@@ -338,6 +338,9 @@ func FormatHtml(reader io.Reader, writer io.Writer, indent string, colors int) e
 		case html.TextToken:
 			str := normalizeSpaces(string(tokenizer.Text()), indent, level)
 			hasContent = str != ""
+			if hasContent {
+				str = escapeTextContent(str)
+			}
 			_, _ = fmt.Fprint(writer, str)
 		case html.StartTagToken, html.SelfClosingTagToken:
 			if level > 0 {
@@ -585,6 +588,15 @@ func escapeText(input string) (string, error) {
 	return result, nil
 }
 
+func escapeTextContent(input string) string {
+	// Only escape the minimal set of characters needed for text content
+	// to avoid XML parsing errors: & < >
+	result := strings.ReplaceAll(input, "&", "&amp;")
+	result = strings.ReplaceAll(result, "<", "&lt;")
+	result = strings.ReplaceAll(result, ">", "&gt;")
+	return result
+}
+
 func normalizeSpaces(input string, indent string, level int) string {
 	if strings.TrimSpace(input) == "" {
 		input = ""