 
 from __future__ import print_function
 
+import bisect
 import re
 import sys
 
@@ -279,7 +280,7 @@ def translate(translation_file, i18ns):
 
 class TextSplitter:
     def __init__(self, words):
-        words.sort(key=lambda x: len(x), reverse=True)
+        words = sorted(words, key=lambda x: len(x), reverse=True)
         self.words = set(words)
         if words:
             pat = "|".join(re.escape(w) for w in words) + "|."
@@ -330,6 +331,7 @@ def compute_huffman_coding(translations, compression_filename):
                 end_unused = min(ord_c, end_unused)
     max_words = end_unused - 0x80
 
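+    # Storage cost, in bits, of each codepoint in a dictionary entry; used
+    # below when estimating the net savings of adding a word to the dictionary.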
+    bits_per_codepoint = 16 if max_ord > 255 else 8
     values_type = "uint16_t" if max_ord > 255 else "uint8_t"
     max_words_len = 160 if max_ord > 255 else 255
 
@@ -344,23 +346,67 @@ def compute_huffman_coding(translations, compression_filename):
         # again, neither will "there" or "wither", since they have "the"
         # as substrings.
         extractor = TextSplitter(words)
+        counter = collections.Counter()
+        for t in texts:
+            for atom in extractor.iter(t):
+                counter[atom] += 1
+        cb = huffman.codebook(counter.items())
+        lengths = sorted(dict((v, len(cb[k])) for k, v in counter.items()).items())
+
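+        # Total number of bits string s takes when encoded character by
+        # character with the current codebook.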
+        def bit_length(s):
+            return sum(len(cb[c]) for c in s)
+
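+        # Estimate the codeword length a new dictionary word would get if it
+        # occurred `occ` times: `lengths` is a sorted list of
+        # (occurrence count, codeword length) pairs for the current atoms, so
+        # the bisect finds an existing atom of comparable frequency; one extra
+        # bit is added as a margin.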
+        def est_len(occ):
+            idx = bisect.bisect_left(lengths, (occ, 0))
+            return lengths[idx][1] + 1
+
+        # The cost of adding a dictionary word is just its storage size
+        # while its savings is close to the difference between the original
+        # huffman bit-length of the string and the estimated bit-length
+        # of the dictionary word, times the number of times the word appears.
+        #
+        # The savings is not strictly accurate because including a word into
+        # the Huffman tree bumps up the encoding lengths of all words in the
+        # same subtree. In the extreme case when the new word is so frequent
+        # that it gets a one-bit encoding, all other words will cost an extra
+        # bit each.
+        #
+        # Another source of inaccuracy is that compressed strings end up
+        # on byte boundaries, not bit boundaries, so saving 1 bit somewhere
+        # might not save a byte.
+        #
+        # In fact, when this change was first made, some translations (luckily,
+        # ones on boards not at all close to full) wasted up to 40 bytes,
+        # while the most constrained boards typically gained 100 bytes or
+        # more.
+        #
+        # The difference between the two is the estimated net savings, in bits.
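+        #
+        # For example, with bits_per_codepoint = 8, a 5-character candidate
+        # seen 10 times that costs 30 bits to encode character by character,
+        # and that would get an estimated 6-bit codeword, scores roughly
+        # 10 * (30 - 6) - 5 * 8 = 200 bits of net savings.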
+        def est_net_savings(s, occ):
+            savings = occ * (bit_length(s) - est_len(occ))
+            cost = len(s) * bits_per_codepoint
+            return savings - cost
+
         counter = collections.Counter()
         for t in texts:
             for (found, word) in extractor.iter_words(t):
                 if not found:
                     for substr in iter_substrings(word, minlen=3, maxlen=9):
                         counter[substr] += 1
 
-        # Score the candidates we found. This is an empirical formula only,
-        # chosen for its effectiveness.
+        # Score the candidates we found. This is a semi-empirical formula that
+        # attempts to model the number of bits saved as closely as possible.
+        #
+        # It attempts to compare the codeword length of the original word
+        # to the codeword length the dictionary entry would get, times
+        # the number of occurrences, less the overhead of the entries in the
+        # words[] array.
+
         scores = sorted(
-            ((s, (len(s) - 1) ** (occ + 4)) for (s, occ) in counter.items() if occ > 4),
-            key=lambda x: x[1],
-            reverse=True,
+            ((s, -est_net_savings(s, occ)) for (s, occ) in counter.items()), key=lambda x: x[1]
         )
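+        # `scores` pairs each candidate with its negated net savings, so after
+        # the ascending sort the best candidate comes first, and a non-negative
+        # best score means no remaining candidate is expected to save space.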
 
-        # Pick the one with the highest score.
-        if not scores:
+        # Pick the one with the highest score. The score must be negative.
+        if not scores or scores[0][-1] >= 0:
             break
 
         word = scores[0][0]