 
 from __future__ import print_function
 
+import bisect
 import re
 import sys
 
@@ -279,7 +280,7 @@ def translate(translation_file, i18ns):
 
 class TextSplitter:
     def __init__(self, words):
-        words.sort(key=lambda x: len(x), reverse=True)
+        words = sorted(words, key=lambda x: len(x), reverse=True)
         self.words = set(words)
         if words:
             pat = "|".join(re.escape(w) for w in words) + "|."
@@ -330,6 +331,7 @@ def compute_huffman_coding(translations, compression_filename):
                 end_unused = min(ord_c, end_unused)
     max_words = end_unused - 0x80
 
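+    # Storage cost, in bits, of each codepoint in a dictionary entry; used
+    # below when estimating the net savings of adding a word to the dictionary.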
+    bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
     max_words_len = 160 if max_ord > 255 else 255
 
@@ -344,23 +346,67 @@ def compute_huffman_coding(translations, compression_filename):
         # again, neither will "there" or "wither", since they have "the"
         # as substrings.
         extractor = TextSplitter(words)
+        counter = collections.Counter()
+        for t in texts:
+            for atom in extractor.iter(t):
+                counter[atom] += 1
+        cb = huffman.codebook(counter.items())
+        lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
+
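+        # Total number of bits string s takes when encoded character by
+        # character with the current codebook.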
+        def bit_length(s):
+            return sum(len(cb[c]) for c in s)
+
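+        # Estimate the codeword length a new dictionary word would get if it
+        # occurred `occ` times: `lengths` is a sorted list of
+        # (occurrence count, codeword length) pairs for the current atoms, so
+        # the bisect finds an existing atom of comparable frequency; one extra
+        # bit is added as a margin.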
+        def est_len(occ):
+            idx = bisect.bisect_left(lengths, (occ, 0))
+            return lengths[idx][1] + 1
+
+        # The cost of adding a dictionary word is just its storage size
+        # while its savings is close to the difference between the original
+        # huffman bit-length of the string and the estimated bit-length
+        # of the dictionary word, times the number of times the word appears.
+        #
+        # The savings is not strictly accurate because including a word into
+        # the Huffman tree bumps up the encoding lengths of all words in the
+        # same subtree. In the extreme case when the new word is so frequent
+        # that it gets a one-bit encoding, all other words will cost an extra
+        # bit each.
+        #
+        # Another source of inaccuracy is that compressed strings end up
+        # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
+        # might not save a byte.
+        #
+        # In fact, when this change was first made, some translations (luckily,
+        # ones on boards not at all close to full) wasted up to 40 bytes,
+        # while the most constrained boards typically gained 100 bytes or
+        # more.
+        #
+        # The difference between the two is the estimated net savings, in bits.
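+        #
+        # For example, with bits_per_codepoint = 8, a 5-character candidate
+        # seen 10 times that costs 30 bits to encode character by character,
+        # and that would get an estimated 6-bit codeword, scores roughly
+        # 10 * (30 - 6) - 5 * 8 = 200 bits of net savings.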
+        def est_net_savings(s, occ):
+            savings = occ * (bit_length(s) - est_len(occ))
+            cost = len(s) * bits_per_codepoint
+            return savings - cost
+
         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
                 if not found:
                     for substr in iter_substrings(word, minlen=3, maxlen=9):
                         counter[substr] += 1
 
-        # Score the candidates we found. This is an empirical formula only,
-        # chosen for its effectiveness.
+        # Score the candidates we found. This is a semi-empirical formula that
+        # attempts to model the number of bits saved as closely as possible.
+        #
+        # It attempts to compare the codeword length of the original word
+        # to the codeword length the dictionary entry would get, times
+        # the number of occurrences, less the overhead of the entries in the
+        # words[] array.
+
         scores = sorted(
-            ((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
-            key=lambda x: x[1],
-            reverse=True,
+            ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
         )
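+        # `scores` pairs each candidate with its negated net savings, so after
+        # the ascending sort the best candidate comes first, and a non-negative
+        # best score means no remaining candidate is expected to save space.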
 
-        # Pick the one with the highest score.
-        if not scores:
+        # Pick the one with the highest score. The score must be negative.
+        if not scores or scores[0][-1] >= 0:
             break
 
         word = scores[0][0]