
Commit 22e8a50

Merge pull request #4978 from jepler/dictionary-better-heuristic

makeqstrdata: use an extremely accurate dictionary heuristic

2 parents: 7bee37e + 0b8b16f

File tree: 2 files changed (+92, -8 lines)

py/makeqstrdata.py

Lines changed: 54 additions & 8 deletions
@@ -9,6 +9,7 @@
 
 from __future__ import print_function
 
+import bisect
 import re
 import sys
 
@@ -279,7 +280,7 @@ def translate(translation_file, i18ns):
 
 class TextSplitter:
     def __init__(self, words):
-        words.sort(key=lambda x: len(x), reverse=True)
+        words = sorted(words, key=lambda x: len(x), reverse=True)
         self.words = set(words)
         if words:
             pat = "|".join(re.escape(w) for w in words) + "|."
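This one-line change swaps an in-place list.sort() for sorted(), so constructing a TextSplitter no longer reorders the caller's words list as a side effect. The longest-first ordering itself is kept: Python's re alternation takes the first alternative that matches, so listing longer words first makes the splitter greedy. A minimal sketch of the difference, with a hypothetical word list:

# Minimal sketch of the side effect the change avoids (hypothetical data):
# list.sort() mutates the caller's list; sorted() returns a new list.
words = ["in", "the", "tion"]

def init_old(ws):
    ws.sort(key=lambda x: len(x), reverse=True)  # reorders the caller's list

def init_new(ws):
    ws = sorted(ws, key=lambda x: len(x), reverse=True)  # local copy only
    return ws

init_new(words)
print(words)  # ['in', 'the', 'tion'] - caller's order preserved
init_old(words)
print(words)  # ['tion', 'the', 'in'] - caller's list reordered in place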
@@ -330,6 +331,7 @@ def compute_huffman_coding(translations, compression_filename):
                 end_unused = min(ord_c, end_unused)
     max_words = end_unused - 0x80
 
+    bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
     max_words_len = 160 if max_ord > 255 else 255
 
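The new bits_per_codepoint mirrors values_type: each character of a dictionary entry occupies one array element, so an entry costs 8 bits per character normally and 16 once any codepoint exceeds 255. It becomes the cost term of the heuristic in the next hunk; a quick worked example with assumed values:

# Worked example with assumed values: the storage cost of one entry.
max_ord = 0x2026                                 # say the texts contain "…" (U+2026)
bits_per_codepoint = 16 if max_ord > 255 else 8  # -> 16
cost = len("version") * bits_per_codepoint       # 7 chars * 16 bits
print(cost)  # 112: the word must save more than 112 encoded bits to pay off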
@@ -344,23 +346,67 @@ def compute_huffman_coding(translations, compression_filename):
         # again, neither will "there" or "wither", since they have "the"
         # as substrings.
         extractor = TextSplitter(words)
+        counter = collections.Counter()
+        for t in texts:
+            for atom in extractor.iter(t):
+                counter[atom] += 1
+        cb = huffman.codebook(counter.items())
+        lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
+
+        def bit_length(s):
+            return sum(len(cb[c]) for c in s)
+
+        def est_len(occ):
+            idx = bisect.bisect_left(lengths, (occ, 0))
+            return lengths[idx][1] + 1
+
+        # The cost of adding a dictionary word is just its storage size,
+        # while its savings is close to the difference between the original
+        # Huffman bit-length of the string and the estimated bit-length
+        # of the dictionary word, times the number of times the word appears.
+        #
+        # The savings is not strictly accurate because including a word in
+        # the Huffman tree bumps up the encoding lengths of all words in the
+        # same subtree. In the extreme case when the new word is so frequent
+        # that it gets a one-bit encoding, all other words will cost an extra
+        # bit each.
+        #
+        # Another source of inaccuracy is that compressed strings end up
+        # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
+        # might not save a byte.
+        #
+        # In fact, when this change was first made, some translations (luckily,
+        # ones on boards not at all close to full) wasted up to 40 bytes,
+        # while the most constrained boards typically gained 100 bytes or
+        # more.
+        #
+        # The difference between the two is the estimated net savings, in bits.
+        def est_net_savings(s, occ):
+            savings = occ * (bit_length(s) - est_len(occ))
+            cost = len(s) * bits_per_codepoint
+            return savings - cost
+
         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
                 if not found:
                     for substr in iter_substrings(word, minlen=3, maxlen=9):
                         counter[substr] += 1
 
-        # Score the candidates we found. This is an empirical formula only,
-        # chosen for its effectiveness.
+        # Score the candidates we found. This is a semi-empirical formula that
+        # attempts to model the number of bits saved as closely as possible.
+        #
+        # It attempts to compare the codeword length of the original word
+        # to the codeword length the dictionary entry would get, times
+        # the number of occurrences, less the overhead of the entries in the
+        # words[] array.
+
         scores = sorted(
-            ((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
-            key=lambda x: x[1],
-            reverse=True,
+            ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
         )
 
-        # Pick the one with the highest score.
-        if not scores:
+        # Pick the one with the highest score. The score must be negative.
+        if not scores or scores[0][-1] >= 0:
             break
 
         word = scores[0][0]
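Pulled out of the hunk above for illustration: the estimate builds a Huffman codebook over the atoms the current splitter emits, flattens it into a sorted table of (occurrence count, codeword length) pairs, and uses bisect to guess the codeword length a new word with a given occurrence count would receive, plus one bit of margin. A standalone sketch with made-up frequencies, using the same third-party huffman module the script already imports:

# Standalone sketch of the new estimate (made-up frequencies; uses the same
# third-party "huffman" module that makeqstrdata.py imports).
import bisect
import collections

import huffman

counter = collections.Counter(
    {"e": 120, "t": 90, "a": 70, "o": 55, "n": 40, "i": 30, "s": 12, "h": 5}
)
cb = huffman.codebook(counter.items())  # symbol -> bit-string code

# (occurrence count, codeword length) pairs, sorted by occurrence count;
# more frequent symbols sit later in the list and have shorter codes.
lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())

def bit_length(s):
    # Exact encoded size of s under the current codebook.
    return sum(len(cb[c]) for c in s)

def est_len(occ):
    # A word seen occ times would get roughly the codeword length of the
    # least frequent existing symbol seen at least occ times, plus one bit
    # of safety margin for the lengths it pushes around.
    idx = bisect.bisect_left(lengths, (occ, 0))
    return lengths[idx][1] + 1

def est_net_savings(s, occ, bits_per_codepoint=8):
    savings = occ * (bit_length(s) - est_len(occ))  # bits saved per use, times uses
    cost = len(s) * bits_per_codepoint              # bits to store the entry itself
    return savings - cost

# "the" seen 50 times: each use shrinks from 9 bits (t=2, h=5, e=2 with these
# frequencies) to an estimated 4 bits, less 24 bits to store the 3-byte entry.
print(est_net_savings("the", 50))  # 50 * (9 - 4) - 24 = 226 bits saved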

tools/fwsizes.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# Run this from within an unzipped directory of build logs from github
+# to get a CSV file full of firmware sizes. Super useful to compare sizes
+# from various builds.
+# Taken from https://github.com/adafruit/circuitpython/pull/4564#issuecomment-816655101
+import os, re
+
+for fn in os.listdir():
+    if os.path.isfile(fn) and ("build-arm " in fn or "build-riscv " in fn):
+        board = re.split("[()]", fn)[1]
+        if board in (
+            "spresense",
+            "teensy40",
+            "teensy41",
+            "feather_m7_1011",
+            "feather_mimxrt1011",
+            "feather_mimxrt1062",
+            "imxrt1010_evk",
+            "imxrt1020_evk",
+            "imxrt1060_evk",
+            "metro_m7_1011",
+        ):
+            continue
+        with open(fn, "r") as f:
+            head = "Build " + board + " for "
+            lines = iter(f)
+            for line in lines:
+                if head in line:
+                    tr = line.split(head)[1].split()[0]
+                    assert "make: Entering directory" in next(lines)
+                    assert "Use make V=1, make V=2" in next(lines)
+                    while re.search(
+                        r"\{\}|QSTR updated|FREEZE|\{'sku':|hex\tfilename|boot2.elf|Including User C Module from|Font missing|section `.bss' type changed to PROGBITS",
+                        next(lines),
+                    ):
+                        pass
+                    free = next(lines).split("bytes used, ")[1].split()[0]
+                    print(board + "," + tr + "," + free)
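A note on the parsing loop above: the first line that fails the noise-skipping re.search test is consumed and discarded, so the "bytes used" line is expected two lines after the last noise line. The fabricated log fragment below (board name, noise line, and any log wording beyond the matched substrings are assumptions) walks the same steps:

# Illustrative only: a fabricated log fragment shaped like what the script
# expects, pushed through the same parsing steps as above.
import io
import re

log = io.StringIO(
    "Build feather_m4_express for en_US\n"         # hypothetical board/locale
    "make: Entering directory '/build'\n"
    "Use make V=1, make V=2 for more verbosity\n"  # wording past the matched part is assumed
    "QSTR updated\n"                               # matches the skip pattern
    "   text    data     bss\n"                    # first non-matching line; consumed and dropped
    "123456 bytes used, 7890 bytes free in flash firmware space\n"
)
board, head = "feather_m4_express", "Build feather_m4_express for "
lines = iter(log)
for line in lines:
    if head in line:
        tr = line.split(head)[1].split()[0]        # "en_US"
        assert "make: Entering directory" in next(lines)
        assert "Use make V=1, make V=2" in next(lines)
        while re.search(r"QSTR updated|FREEZE", next(lines)):
            pass                                   # skip noise until one line fails the test
        free = next(lines).split("bytes used, ")[1].split()[0]
        print(board + "," + tr + "," + free)       # feather_m4_express,en_US,7890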
