From 1af5b50872adddc237206a98cac11e367336eae2 Mon Sep 17 00:00:00 2001
From: Matt Heffron <heffron@alumni.caltech.edu>
Date: Sat, 1 Nov 2025 22:31:55 -0700
Subject: [PATCH] Fix a few issues: Accented characters mis-encoded, HTMLish
 "tags" improperly removed, display dates as originally set in Zotero.

---
 layouts/bibliography/list.html   | 17 +++++-----
 layouts/bibliography/single.html | 11 +++---
 scripts/bib-fns.jq               |  8 +++++
 scripts/bibSplit.pl              | 58 +++++++++++++++++++++-----------
 scripts/update_bibliography.sh   | 15 +++++----
 5 files changed, 70 insertions(+), 39 deletions(-)
diff --git a/layouts/bibliography/list.html b/layouts/bibliography/list.html
index 46abbca3..0cc8e81b 100644
--- a/layouts/bibliography/list.html
+++ b/layouts/bibliography/list.html
@@ -62,8 +62,7 @@ <h1 class="bib-title">{{ or .Params.heading .Title }}</h1>
       {{- $rendered = $rendered | htmlEscape -}}
       <a href="{{ .RelPermalink }}"><span><strong>{{- $rendered | safeHTML -}}</strong></span></a><br>
 
-      {{- /* Safe date: use front matter date only if non-empty & matches basic pattern */ -}}
-            {{- $authors := .Params.authors -}}
+      {{- $authors := .Params.authors -}}
       {{- $editors := .Params.editors -}}
       {{- $isPatent := eq .Params.item_type "patent" -}}
       {{- /* warnf "authors: %v" $authors */ -}}
@@ -91,17 +90,19 @@ <h1 class="bib-title">{{ or .Params.heading .Title }}</h1>
       <br>
       {{- end -}}
 
+      {{- /* Safe date: use front matter date only if non-empty & matches basic pattern */ -}}
       {{- $d := .Date -}}
+      {{- $fmtDate := .Params.readabledate -}}
       {{- with .Params.date -}}
-        {{- $datestr := trim . " " -}}
-        {{- if and (ne $datestr "") (findRE `^\d{4}-\d{2}-\d{2}` $datestr) -}}
-          {{- $d = time $datestr -}}
+        {{- $isoDate := trim . " " -}}
+        {{- if and (ne $isoDate "") (findRE `^\d{4}-\d{2}-\d{2}` $isoDate) -}}
+          {{- $d = time $isoDate -}}
         {{- end -}}
-      {{- end -}}
+      {{- end }}
       {{- $d = $d.Format "2006-01-02" -}}
       {{- /* Don't display bogus date */ -}}
       {{- if (ne $d "0001-01-01") }}
-        <time datetime="{{ $d }}">{{ $d }}</time><br>
+        <time datetime="{{ $d }}">{{ or $fmtDate $d }}</time>{{/* ISO date when readabledate is unset */}}
       {{ end -}}
 
       {{ with .Params.abstract -}}
@@ -114,7 +115,7 @@ <h1 class="bib-title">{{ or .Params.heading .Title }}</h1>
           {{- $previewWords := split $plain " " | first $previewWordLimit -}}
           {{- $preview = delimit $previewWords " " -}}
         {{- else -}}
-          {{- $preview = replace . "\n" "<br>" -}}
+          {{- $preview = replace (. | htmlEscape) "\n" "<br>" -}}
         {{- end -}}
         <br>
         <div>{{ $preview | safeHTML }}{{ if $previewing }}…{{ end }}</div>
diff --git a/layouts/bibliography/single.html b/layouts/bibliography/single.html
index 23e07f76..fed28338 100644
--- a/layouts/bibliography/single.html
+++ b/layouts/bibliography/single.html
@@ -32,16 +32,17 @@ <h1>{{ .Title }}</h1>
 
 <p>
   {{- $d := .Date -}}
+  {{- $fmtDate := .Params.readabledate -}}
   {{- with .Params.date -}}
-    {{- $datestr := trim . " " -}}
-    {{- if and (ne $datestr "") (findRE `^\d{4}-\d{2}-\d{2}` $datestr) -}}
-      {{- $d = time $datestr -}}
+    {{- $isoDate := trim . " " -}}
+    {{- if and (ne $isoDate "") (findRE `^\d{4}-\d{2}-\d{2}` $isoDate) -}}
+      {{- $d = time $isoDate -}}
     {{- end -}}
   {{- end }}
   {{- $d = $d.Format "2006-01-02" -}}
   {{- /* Don't display bogus date */ -}}
   {{- if (ne $d "0001-01-01") }}
-    <time datetime="{{ $d }}">{{ $d }}</time>
+    <time datetime="{{ $d }}">{{ or $fmtDate $d }}</time>{{/* ISO date when readabledate is unset */}}
   {{ end -}}
 </p>
 <p>
@@ -56,7 +57,7 @@ <h1>{{ .Title }}</h1>
     <strong>Abstract</strong>
     <br>
     {{ with .Params.abstract }}
-      {{ . | markdownify }}
+      {{ . | htmlEscape | markdownify }}
     {{ else }}
       No abstract available.
     {{ end }}
diff --git a/scripts/bib-fns.jq b/scripts/bib-fns.jq
index 4fc0151b..1ba4d41d 100644
--- a/scripts/bib-fns.jq
+++ b/scripts/bib-fns.jq
@@ -30,6 +30,14 @@ def issued_iso_string:
     . 
   end;
 
+def issued_date_readable:
+  # Adds .readableDateString: the first issued date-parts entry, each part run
+  # through pad2 and joined with "-". Inputs without issued date-parts pass through.
+  if nonBlankKey("issued") and (.issued | nonBlankKey("date-parts")) then
+    setpath(["readableDateString"]; .issued["date-parts"][0] | map(pad2) | join("-"))
+  else .
+  end;
+  
 def format_person_name:
       if (has("family") and .family != null and (.family|tostring|length)>0) then
         .family
diff --git a/scripts/bibSplit.pl b/scripts/bibSplit.pl
index d383c400..044a1738 100755
--- a/scripts/bibSplit.pl
+++ b/scripts/bibSplit.pl
@@ -2,6 +2,18 @@
 use JSON::PP qw(decode_json encode_json);
 use Encode qw(decode encode is_utf8);  
 use Unicode::Normalize qw(NFC);
+use utf8;
+BEGIN  # compile-time setup: copy the output directory settings out of the environment
+{ 
+  $bibDir = $ENV{'BIBLIOGRAPHY_DIR'};      # package global (no `my`); undef when the variable is unset
+  $bibItemsDir = $ENV{'BIBITEMS_DIR'};     # package global; only referenced from commented-out code in this diff
+}
+
+# Handy when using the perl debugger...
+# sub is8 {
+#     my ($s) = @_;
+#     return is_utf8($s) ? "is UTF8" : "is not UTF8";
+# }
 
 # Cleanup text fields
 sub sanitize_text {
@@ -9,6 +21,7 @@ sub sanitize_text {
   return '' unless defined $s;
 
   # Ensure decoded characters
+  # NOTE: is_utf8() only reports Perl's internal UTF8 flag; it is NOT a reliable "already decoded" test!
   $s = decode('UTF-8', $s, Encode::WARN) unless is_utf8($s);
 
   # Repair mojibake (e.g., "â" -> "’") if available
@@ -19,32 +32,28 @@ sub sanitize_text {
 
   # Normalize and clean
   $s = NFC($s);                       # normalize accents/combining marks
-  $s =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x{80}-\x{9F}]//g;  # drop C0/C1 controls
-  $s =~ s/\x{00A0}/ /g;               # NBSP -> space
-  $s =~ s/\r\n?/ /g;                 # normalize newlines
+  $s =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x{80}-\x{9F}]//gu;  # drop C0/C1 controls
+  $s =~ s/\x{00A0}/ /gu;              # NBSP -> space
+  $s =~ s/\r\n?/ /gu;                 # normalize newlines
 
   return $s;
 }
 
-BEGIN 
-{ 
-  $bibDir = $ENV{'BIBLIOGRAPHY_DIR'};
-  $bibItemsDir = $ENV{'BIBITEMS_DIR'};
-}
 my $item = $_;
-my $json = eval { decode_json($item) } or do { warn "Bad JSON line: $_\n"; next; };
+my $obj = eval { decode_json($item) } or do { warn "Bad JSON line: $_\n"; next; };
 
-my $key = $json->{key};
-my $target = $json->{target} || print STDERR "Cannot find target for key \"$key\" in line: $_\n";
+my $key = $obj->{key};
+my $target = $obj->{target} || print STDERR "Cannot find target for key \"$key\" in line: $_\n";
 
 if ($key eq $target) {  # only top level entries
   #my $handle = undef;
   #my $itemjson = "$bibItemsDir/$key.json";
   #open($handle, ">:encoding(UTF-8)", $itemjson) || die "$0: cannot open $itemjson in write-open mode: $!";
-  #print $handle $item;
+  #print $handle (is_utf8($item) ? $item : decode('UTF-8', $item, Encode::WARN));
   #close $handle || die "$0: close of file $itemjson failed: $!";
 
-  my $obj = eval { decode_json($item) } or do { warn "Bad JSON for $key\n"; next; };
+  # No need to decode the $item a second time. Just changed the $json above to be $obj.
+  #my $obj = eval { decode_json($item) } or do { warn "Bad JSON for $key\n"; next; };
   delete $obj->{children};
 
   my $type = $obj->{type} // '';
@@ -59,34 +68,42 @@ BEGIN
   my $abs = sanitize_text($obj->{abstract} // '');
   # a hack for bulleted lists in the abstracts (use markdown there)
   # won't work for nested lists.
-  $abs =~ s/\n?\n\N{U+2022}/\n*/g;
+  $abs =~ s/\n?\n\N{U+2022}/\n*/gu;
   my $indented = join('', map { "  $_\n" } split(/\n/, $abs));
   my $abstract = $indented eq '' ? "abstract: ''" : "abstract: |\n$indented";
 
   my $itemDate = defined $obj->{isoDateString} ? $obj->{isoDateString} : '';
+  my $itemReadableDate = defined $obj->{readableDateString} ? $obj->{readableDateString} : '';
 
   my $itemAuthors = '';
   if (ref($obj->{authorsFormatted}) eq 'ARRAY' && @{$obj->{authorsFormatted}}) {
     $itemAuthors = "\n";
     for my $a (@{$obj->{authorsFormatted}}) {
-      my $quoted = encode_json($a // '');
-      $itemAuthors .= "  - $quoted\n";
+      # encode_json() returns UTF-8 *bytes*; that is where names with accented
+      # characters, e.g., "Emanuelson, Pär", got corrupted. Keep the name as characters
+      # and escape backslash and double quote by hand so the YAML scalar stays valid.
+      my $san = sanitize_text($a // '');
+      $san =~ s/(["\\])/\\$1/g;
+      $itemAuthors .= "  - \"$san\"\n";
     }
-    $itemAuthors =~ s/\n$//;  # strip trailing newline
+    $itemAuthors =~ s/\n$//u;  # strip trailing newline
   }
 
   my $itemEditors = '';
   if (ref($obj->{editorsFormatted}) eq 'ARRAY' && @{$obj->{editorsFormatted}}) {
     $itemEditors = "\n";
     for my $a (@{$obj->{editorsFormatted}}) {
-      my $quoted = encode_json($a // '');
-      $itemEditors .= "  - $quoted\n";
+      # encode_json() corrupted non-ASCII names; sanitize, then escape \ and " for YAML.
+      my $san = sanitize_text($a // '');
+      $san =~ s/(["\\])/\\$1/g;
+      $itemEditors .= "  - \"$san\"\n";
     }
-    $itemEditors =~ s/\n$//;  # strip trailing newline
+    $itemEditors =~ s/\n$//u;  # strip trailing newline
   }
   
   my $urlSource = defined $obj->{url} ? $obj->{url} : '';
 
+  # Some/most/all of these *may* need sanitize_text
   # optional fields - ones used vary by value of type
   my $applicationNumber = defined $obj->{applicationNumber} ? qq{"$obj->{applicationNumber}"} : '""';
   my $assignee = defined $obj->{assignee} ? qq{"$obj->{assignees}"} : '""';
@@ -157,6 +174,7 @@ BEGIN
 ---
 $title
 date: $itemDate
+readabledate: $itemReadableDate
 type: bibliography
 item_type: $type
 authors: $itemAuthors
diff --git a/scripts/update_bibliography.sh b/scripts/update_bibliography.sh
index 8a827414..2229ae5f 100755
--- a/scripts/update_bibliography.sh
+++ b/scripts/update_bibliography.sh
@@ -7,13 +7,16 @@ function usage () {
   echo -e "Usage: $0 [ options ] [ input_items_file ]\n"
   echo -e "Options:"
   echo -e "  -h | --help               \tDisplay this message and exit."
-  echo -e "  -r | --rawitems           \tSave the complete downloaded JSON as '00-rawItems.json' (See 'input_items_file' below.)."
+  echo -e "  -r | --rawitems           \tSave the complete downloaded JSON as '00-rawItems.json'"
+  echo -e "                            \t(See 'input_items_file' below.)."
   echo -e "  -g | --tagsfile           \tGenerate 'tags.json' containing all tags on the 'cleaned-up' set of entries."
   echo -e "  -y | --typefiles          \tGenerate item type information JSON files. (See below.)"
   echo -e "  -c | --collectionsfiles   \tGenerate two JSON files containing info about each of the Zotero collections."
-  echo -e "  -u | --curlfiles          \tGenerate files in the 'curl/' directory with the output of each call to curl. Very low level debugging."
+  echo -e "  -u | --curlfiles          \tGenerate files in the 'curl/' directory with the output of each call to curl."
+  echo -e "                            \t(Very low level debugging.)"
   echo -e "  -d | --debugfiles         \tGenerate numbered files with the intermediate processing step output JASON."
-  echo -e "  -i N | --infolevel N      \tSet the level of display of informational messages. N is 0-10 (Default = 2.). (See below.)"
+  echo -e "  -i N | --infolevel N      \tSet the level of display of informational messages. N is 0-10 (Default = 2.)."
+  echo -e "                            \t(See below.)"
   echo -e "\n typefiles: These 3 files contain the type and itemType information for the entries with different level of details."
   echo -e "\n infolevel: The infolevel controls how much detail is presented during processing."
   echo -e "   0: NO info messages."
@@ -374,11 +377,11 @@ if $debugFiles ; then
 fi
 finalCount=$(jq '. | length' <<< "$items")
 
-items=$(jq 'include "./bib-fns";map(issued_iso_string)' <<< "$items")
+items=$(jq 'include "./bib-fns";map(issued_iso_string | issued_date_readable | add_author_string | add_editor_string)' <<< "$items")
 
-items=$(jq 'include "./bib-fns";map(add_author_string)' <<< "$items")
+#items=$(jq 'include "./bib-fns";map(add_author_string)' <<< "$items")
 
-items=$(jq 'include "./bib-fns";map(add_editor_string)' <<< "$items")
+#items=$(jq 'include "./bib-fns";map(add_editor_string)' <<< "$items")
 # if $removeChildrenFromFinalFile; then
 #   # Remove .children arrays, if any. Save space.
 #   items=$(jq 'map(del(.children))' <<< "$items")