62 changes: 54 additions & 8 deletions py/makeqstrdata.py
@@ -9,6 +9,7 @@

from __future__ import print_function

import bisect
import re
import sys

@@ -279,7 +280,7 @@ def translate(translation_file, i18ns):

class TextSplitter:
def __init__(self, words):
words.sort(key=lambda x: len(x), reverse=True)
words = sorted(words, key=lambda x: len(x), reverse=True)
self.words = set(words)
if words:
pat = "|".join(re.escape(w) for w in words) + "|."
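A standalone sketch (outside the diff) of why the descending length sort matters: Python's re module tries alternatives left to right, so listing longer words first makes the splitter take the longest dictionary match at each position, with the trailing "." catching any single leftover character. The word list here is invented for illustration:

import re

words = sorted(["in", "str", "string"], key=len, reverse=True)
pat = re.compile("|".join(re.escape(w) for w in words) + "|.")

# Longest alternative wins: "string" is preferred over "str" and "in".
print(pat.findall("substring"))  # ['s', 'u', 'b', 'string']

# Without the descending sort, "str" would shadow "string":
unsorted_pat = re.compile("in|str|string|.")
print(unsorted_pat.findall("substring"))  # ['s', 'u', 'b', 'str', 'in', 'g']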
@@ -330,6 +331,7 @@ def compute_huffman_coding(translations, compression_filename):
end_unused = min(ord_c, end_unused)
max_words = end_unused - 0x80

bits_per_codepoint = 16 if max_ord > 255 else 8
values_type = "uint16_t" if max_ord > 255 else "uint8_t"
max_words_len = 160 if max_ord > 255 else 255

@@ -344,23 +346,67 @@ def compute_huffman_coding(translations, compression_filename):
# again, neither will "there" or "wither", since they have "the"
# as substrings.
extractor = TextSplitter(words)
counter = collections.Counter()
for t in texts:
for atom in extractor.iter(t):
counter[atom] += 1
cb = huffman.codebook(counter.items())
lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())

def bit_length(s):
return sum(len(cb[c]) for c in s)

def est_len(occ):
idx = bisect.bisect_left(lengths, (occ, 0))
return lengths[idx][1] + 1

# The cost of adding a dictionary word is just its storage size
# while its savings is close to the difference between the original
# huffman bit-length of the string and the estimated bit-length
# of the dictionary word, times the number of times the word appears.
#
# The savings is not strictly accurate because including a word into
# the Huffman tree bumps up the encoding lengths of all words in the
# same subtree. In the extreme case when the new word is so frequent
# that it gets a one-bit encoding, all other words will cost an extra
# bit each.
#
# Another source of inaccuracy is that compressed strings end up
# on byte boundaries, not bit boundaries, so saving 1 bit somewhere
# might not save a byte.
#
# In fact, when this change was first made, some translations (luckily,
# ones on boards not at all close to full) wasted up to 40 bytes,
# while the most constrained boards typically gained 100 bytes or
# more.
#
# The difference between the two is the estimated net savings, in bits.
def est_net_savings(s, occ):
savings = occ * (bit_length(s) - est_len(occ))
cost = len(s) * bits_per_codepoint
return savings - cost

counter = collections.Counter()
for t in texts:
for (found, word) in extractor.iter_words(t):
if not found:
for substr in iter_substrings(word, minlen=3, maxlen=9):
counter[substr] += 1

# Score the candidates we found. This is an empirical formula only,
# chosen for its effectiveness.
# Score the candidates we found. This is a semi-empirical formula that
# attempts to model the number of bits saved as closely as possible.
#
# It computes the difference between the Huffman-coded length of
# the original word and the codeword length the dictionary entry
# would get, times the number of occurrences, less the overhead of
# the entry in the words[] array.

scores = sorted(
((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
key=lambda x: x[1],
reverse=True,
((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
)

# Pick the one with the highest score.
if not scores:
# Pick the entry with the greatest net savings, i.e. the most negative score.
if not scores or scores[0][-1] >= 0:
break

word = scores[0][0]
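The estimate at the heart of the new scoring can be made concrete with toy numbers. The sketch below is standalone and illustrative only: every codeword length and occurrence count in it is invented, whereas the real code derives them from huffman.codebook over the measured atom frequencies.

import bisect

# Invented per-atom codeword bit-lengths (the real cb maps atoms to
# Huffman code strings; here we store the lengths directly).
code_len = {"t": 3, "h": 4, "e": 3}

# (occurrences, codeword length) pairs for existing atoms, sorted by
# occurrence count: more frequent atoms get shorter codewords.
lengths = sorted([(10, 5), (40, 4), (90, 3), (200, 2)])

def bit_length(s):
    # Bits currently needed to encode s atom by atom.
    return sum(code_len[c] for c in s)

def est_len(occ):
    # Estimated codeword length for a new word seen `occ` times,
    # looked up from atoms of similar frequency, plus one bit of slack.
    idx = bisect.bisect_left(lengths, (occ, 0))
    return lengths[idx][1] + 1

def est_net_savings(s, occ, bits_per_codepoint=8):
    savings = occ * (bit_length(s) - est_len(occ))
    cost = len(s) * bits_per_codepoint  # storage for the words[] entry
    return savings - cost

# "the" costs 3+4+3 = 10 bits per occurrence today; as a dictionary word
# seen 50 times it would get a ~4-bit codeword, so:
print(est_net_savings("the", 50))  # 50 * (10 - 4) - 3 * 8 = 276 bits saved

The loop in the diff then sorts candidates by the negated estimate so the biggest saver comes first, and stops as soon as even the best candidate no longer has positive net savings.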
38 changes: 38 additions & 0 deletions tools/fwsizes.py
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
# Run this from within an unzipped directory of build logs from github
# to get a CSV file full of firmware sizes. Super useful to compare sizes
# from various builds.
# Taken from https://github.com/adafruit/circuitpython/pull/4564#issuecomment-816655101
import os, re

for fn in os.listdir():
if os.path.isfile(fn) and ("build-arm " in fn or "build-riscv " in fn):
board = re.split("[()]", fn)[1]
if board in (
"spresense",
"teensy40",
"teensy41",
"feather_m7_1011",
"feather_mimxrt1011",
"feather_mimxrt1062",
"imxrt1010_evk",
"imxrt1020_evk",
"imxrt1060_evk",
"metro_m7_1011",
):
continue
with open(fn, "r") as f:
head = "Build " + board + " for "
lines = iter(f)
for line in lines:
if head in line:
tr = line.split(head)[1].split()[0]
assert "make: Entering directory" in next(lines)
assert "Use make V=1, make V=2" in next(lines)
while re.search(
r"\{\}|QSTR updated|FREEZE|\{'sku':|hex\tfilename|boot2.elf|Including User C Module from|Font missing|section `.bss' type changed to PROGBITS",
next(lines),
):
pass
free = next(lines).split("bytes used, ")[1].split()[0]
print(board + "," + tr + "," + free)
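The board name comes from the parenthesized job name embedded in each downloaded log's filename. A hypothetical example of that extraction (the filename shown is illustrative, not a real artifact name):

import re

fn = "3_build-arm (pyportal).txt"  # hypothetical log filename from the zip
board = re.split("[()]", fn)[1]
print(board)  # pyportal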