From 1af5b50872adddc237206a98cac11e367336eae2 Mon Sep 17 00:00:00 2001 From: Matt Heffron Date: Sat, 1 Nov 2025 22:31:55 -0700 Subject: [PATCH] Fix a few issues: Accented characters mis-encoded, HTMLish "tags" improperly removed, display dates as originally set in Zotero. --- layouts/bibliography/list.html | 17 +++++----- layouts/bibliography/single.html | 11 +++--- scripts/bib-fns.jq | 8 +++++ scripts/bibSplit.pl | 58 +++++++++++++++++++++----------- scripts/update_bibliography.sh | 15 +++++---- 5 files changed, 70 insertions(+), 39 deletions(-) diff --git a/layouts/bibliography/list.html b/layouts/bibliography/list.html index 46abbca3..0cc8e81b 100644 --- a/layouts/bibliography/list.html +++ b/layouts/bibliography/list.html @@ -62,8 +62,7 @@

{{ or .Params.heading .Title }}

{{- $rendered = $rendered | htmlEscape -}} {{- $rendered | safeHTML -}}
- {{- /* Safe date: use front matter date only if non-empty & matches basic pattern */ -}} - {{- $authors := .Params.authors -}} + {{- $authors := .Params.authors -}} {{- $editors := .Params.editors -}} {{- $isPatent := eq .Params.item_type "patent" -}} {{- /* warnf "authors: %v" $authors */ -}} @@ -91,17 +90,19 @@

{{ or .Params.heading .Title }}


{{- end -}} + {{- /* Safe date: use front matter date only if non-empty & matches basic pattern */ -}} {{- $d := .Date -}} + {{- $fmtDate := .Params.readabledate -}} {{- with .Params.date -}} - {{- $datestr := trim . " " -}} - {{- if and (ne $datestr "") (findRE `^\d{4}-\d{2}-\d{2}` $datestr) -}} - {{- $d = time $datestr -}} + {{- $isoDate := trim . " " -}} + {{- if and (ne $isoDate "") (findRE `^\d{4}-\d{2}-\d{2}` $isoDate) -}} + {{- $d = time $isoDate -}} {{- end -}} - {{- end -}} + {{- end }} {{- $d = $d.Format "2006-01-02" -}} {{- /* Don't display bogus date */ -}} {{- if (ne $d "0001-01-01") }} -
+ {{ end -}} {{ with .Params.abstract -}} @@ -114,7 +115,7 @@

{{ or .Params.heading .Title }}

{{- $previewWords := split $plain " " | first $previewWordLimit -}} {{- $preview = delimit $previewWords " " -}} {{- else -}} - {{- $preview = replace . "\n" "
" -}} + {{- $preview = replace (. | htmlEscape) "\n" "
" -}} {{- end -}}
{{ $preview | safeHTML }}{{ if $previewing }}…{{ end }}
diff --git a/layouts/bibliography/single.html b/layouts/bibliography/single.html index 23e07f76..fed28338 100644 --- a/layouts/bibliography/single.html +++ b/layouts/bibliography/single.html @@ -32,16 +32,17 @@

{{ .Title }}

{{- $d := .Date -}} + {{- $fmtDate := .Params.readabledate -}} {{- with .Params.date -}} - {{- $datestr := trim . " " -}} - {{- if and (ne $datestr "") (findRE `^\d{4}-\d{2}-\d{2}` $datestr) -}} - {{- $d = time $datestr -}} + {{- $isoDate := trim . " " -}} + {{- if and (ne $isoDate "") (findRE `^\d{4}-\d{2}-\d{2}` $isoDate) -}} + {{- $d = time $isoDate -}} {{- end -}} {{- end }} {{- $d = $d.Format "2006-01-02" -}} {{- /* Don't display bogus date */ -}} {{- if (ne $d "0001-01-01") }} - + {{ end -}}

@@ -56,7 +57,7 @@

{{ .Title }}

Abstract
{{ with .Params.abstract }} - {{ . | markdownify }} + {{ . | htmlEscape | markdownify }} {{ else }} No abstract available. {{ end }} diff --git a/scripts/bib-fns.jq b/scripts/bib-fns.jq index 4fc0151b..1ba4d41d 100644 --- a/scripts/bib-fns.jq +++ b/scripts/bib-fns.jq @@ -30,6 +30,14 @@ def issued_iso_string: . end; +def issued_date_readable: + if nonBlankKey("issued") and (.issued | nonBlankKey("date-parts")) then + setpath(["readableDateString"]; + (.issued["date-parts"][0]) as $p | $p | map(pad2) | join("-")) + else + . + end; + def format_person_name: if (has("family") and .family != null and (.family|tostring|length)>0) then .family diff --git a/scripts/bibSplit.pl b/scripts/bibSplit.pl index d383c400..044a1738 100755 --- a/scripts/bibSplit.pl +++ b/scripts/bibSplit.pl @@ -2,6 +2,18 @@ use JSON::PP qw(decode_json encode_json); use Encode qw(decode encode is_utf8); use Unicode::Normalize qw(NFC); +use utf8; +BEGIN +{ + $bibDir = $ENV{'BIBLIOGRAPHY_DIR'}; + $bibItemsDir = $ENV{'BIBITEMS_DIR'}; +} + +# Handy when using the perl debugger... +# sub is8 { +# my ($s) = @_; +# return is_utf8($s) ? "is UTF8" : "is not UTF8"; +# } # Cleanup text fields sub sanitize_text { @@ -9,6 +21,7 @@ sub sanitize_text { return '' unless defined $s; # Ensure decoded characters + # NOTE: It SEEMS is_utf8() checking may not correctly indicate the correct state of the content!! $s = decode('UTF-8', $s, Encode::WARN) unless is_utf8($s); # Repair mojibake (e.g., "’" -> "’") if available @@ -19,32 +32,28 @@ sub sanitize_text { # Normalize and clean $s = NFC($s); # normalize accents/combining marks - $s =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x{80}-\x{9F}]//g; # drop C0/C1 controls - $s =~ s/\x{00A0}/ /g; # NBSP -> space - $s =~ s/\r\n?/ /g; # normalize newlines + $s =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x{80}-\x{9F}]//gu; # drop C0/C1 controls + $s =~ s/\x{00A0}/ /gu; # NBSP -> space + $s =~ s/\r\n?/ /gu; # normalize newlines return $s; } -BEGIN -{ - $bibDir = $ENV{'BIBLIOGRAPHY_DIR'}; - $bibItemsDir = $ENV{'BIBITEMS_DIR'}; -} my $item = $_; -my $json = eval { decode_json($item) } or do { warn "Bad JSON line: $_\n"; next; }; +my $obj = eval { decode_json($item) } or do { warn "Bad JSON line: $_\n"; next; }; -my $key = $json->{key}; -my $target = $json->{target} || print STDERR "Cannot find target for key \"$key\" in line: $_\n"; +my $key = $obj->{key}; +my $target = $obj->{target} || print STDERR "Cannot find target for key \"$key\" in line: $_\n"; if ($key eq $target) { # only top level entries #my $handle = undef; #my $itemjson = "$bibItemsDir/$key.json"; #open($handle, ">:encoding(UTF-8)", $itemjson) || die "$0: cannot open $itemjson in write-open mode: $!"; - #print $handle $item; + #print $handle (is_utf8($item) ? $item : decode('UTF-8', $item, Encode::WARN)); #close $handle || die "$0: close of file $itemjson failed: $!"; - my $obj = eval { decode_json($item) } or do { warn "Bad JSON for $key\n"; next; }; + # No need to decode the $item a second time. Just changed the $json above to be $obj. + #my $obj = eval { decode_json($item) } or do { warn "Bad JSON for $key\n"; next; }; delete $obj->{children}; my $type = $obj->{type} // ''; @@ -59,34 +68,42 @@ BEGIN my $abs = sanitize_text($obj->{abstract} // ''); # a hack for bulleted lists in the abstracts (use markdown there) # won't work for nested lists. - $abs =~ s/\n?\n\N{U+2022}/\n*/g; + $abs =~ s/\n?\n\N{U+2022}/\n*/gu; my $indented = join('', map { " $_\n" } split(/\n/, $abs)); my $abstract = $indented eq '' ? "abstract: ''" : "abstract: |\n$indented"; my $itemDate = defined $obj->{isoDateString} ? $obj->{isoDateString} : ''; + my $itemReadableDate = defined $obj->{readableDateString} ? $obj->{readableDateString} : ''; my $itemAuthors = ''; if (ref($obj->{authorsFormatted}) eq 'ARRAY' && @{$obj->{authorsFormatted}}) { $itemAuthors = "\n"; for my $a (@{$obj->{authorsFormatted}}) { - my $quoted = encode_json($a // ''); - $itemAuthors .= " - $quoted\n"; + # The encode_json is where extended unicode chars get corrupted, e.g., "Emanuelson, Pär" + # There may be other things that now don't work!! + # my $quoted = encode_json($a // ''); + # sanitize_text seems to handle them correctly + my $san = sanitize_text($a // ''); + $itemAuthors .= " - \"$san\"\n"; } - $itemAuthors =~ s/\n$//; # strip trailing newline + $itemAuthors =~ s/\n$//u; # strip trailing newline } my $itemEditors = ''; if (ref($obj->{editorsFormatted}) eq 'ARRAY' && @{$obj->{editorsFormatted}}) { $itemEditors = "\n"; for my $a (@{$obj->{editorsFormatted}}) { - my $quoted = encode_json($a // ''); - $itemEditors .= " - $quoted\n"; + # as above... + # my $quoted = encode_json($a // ''); + my $san = sanitize_text($a // ''); + $itemEditors .= " - \"$san\"\n"; } - $itemEditors =~ s/\n$//; # strip trailing newline + $itemEditors =~ s/\n$//u; # strip trailing newline } my $urlSource = defined $obj->{url} ? $obj->{url} : ''; + # Some/most/all of these *may* need sanitize_text # optional fields - ones used vary by value of type my $applicationNumber = defined $obj->{applicationNumber} ? qq{"$obj->{applicationNumber}"} : '""'; my $assignee = defined $obj->{assignee} ? qq{"$obj->{assignees}"} : '""'; @@ -157,6 +174,7 @@ BEGIN --- $title date: $itemDate +readabledate: $itemReadableDate type: bibliography item_type: $type authors: $itemAuthors diff --git a/scripts/update_bibliography.sh b/scripts/update_bibliography.sh index 8a827414..2229ae5f 100755 --- a/scripts/update_bibliography.sh +++ b/scripts/update_bibliography.sh @@ -7,13 +7,16 @@ function usage () { echo -e "Usage: $0 [ options ] [ input_items_file ]\n" echo -e "Options:" echo -e " -h | --help \tDisplay this message and exit." - echo -e " -r | --rawitems \tSave the complete downloaded JSON as '00-rawItems.json' (See 'input_items_file' below.)." + echo -e " -r | --rawitems \tSave the complete downloaded JSON as '00-rawItems.json'" + echo -e " \t(See 'input_items_file' below.)." echo -e " -g | --tagsfile \tGenerate 'tags.json' containing all tags on the 'cleaned-up' set of entries." echo -e " -y | --typefiles \tGenerate item type information JSON files. (See below.)" echo -e " -c | --collectionsfiles \tGenerate two JSON files containing info about each of the Zotero collections." - echo -e " -u | --curlfiles \tGenerate files in the 'curl/' directory with the output of each call to curl. Very low level debugging." + echo -e " -u | --curlfiles \tGenerate files in the 'curl/' directory with the output of each call to curl." + echo -e " \t(Very low level debugging.)" echo -e " -d | --debugfiles \tGenerate numbered files with the intermediate processing step output JASON." - echo -e " -i N | --infolevel N \tSet the level of display of informational messages. N is 0-10 (Default = 2.). (See below.)" + echo -e " -i N | --infolevel N \tSet the level of display of informational messages. N is 0-10 (Default = 2.)." + echo -e " \t(See below.)" echo -e "\n typefiles: These 3 files contain the type and itemType information for the entries with different level of details." echo -e "\n infolevel: The infolevel controls how much detail is presented during processing." echo -e " 0: NO info messages." @@ -374,11 +377,11 @@ if $debugFiles ; then fi finalCount=$(jq '. | length' <<< "$items") -items=$(jq 'include "./bib-fns";map(issued_iso_string)' <<< "$items") +items=$(jq 'include "./bib-fns";map(issued_iso_string | issued_date_readable | add_author_string | add_editor_string)' <<< "$items") -items=$(jq 'include "./bib-fns";map(add_author_string)' <<< "$items") +#items=$(jq 'include "./bib-fns";map(add_author_string)' <<< "$items") -items=$(jq 'include "./bib-fns";map(add_editor_string)' <<< "$items") +#items=$(jq 'include "./bib-fns";map(add_editor_string)' <<< "$items") # if $removeChildrenFromFinalFile; then # # Remove .children arrays, if any. Save space. # items=$(jq 'map(del(.children))' <<< "$items")