From 243300aaf15c589ab3d1860bad377a5c0ad09446 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Tue, 25 Mar 2025 17:34:26 +0000 Subject: [PATCH 01/10] initial --- Tools/i18n/msgfmt.py | 90 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 10 deletions(-) diff --git a/Tools/i18n/msgfmt.py b/Tools/i18n/msgfmt.py index f005c4e7b5b79e..88b24fa3e14ee4 100755 --- a/Tools/i18n/msgfmt.py +++ b/Tools/i18n/msgfmt.py @@ -34,7 +34,7 @@ from email.parser import HeaderParser import codecs -__version__ = "1.2" +__version__ = "1.3" MESSAGES = {} @@ -60,21 +60,57 @@ def add(ctxt, id, str, fuzzy): def generate(): "Return the generated output." global MESSAGES + + def hash_insert_entry(string, i): + hash_val = hashpjw(string) + hash_cursor = hash_val % hash_tab_size + inc = 1 + (hash_val % (hash_tab_size - 2)) + while hash_table[hash_cursor] != 0: + hash_cursor += inc + hash_cursor %= hash_tab_size + hash_table[hash_cursor] = i + 1 + + # From [gettext.git]/gettext-tools/src/write-mo.c: + # Each string has an associate hashing value V, computed by a fixed + # function. To locate the string we use open addressing with double + # hashing. The first index will be V % M, where M is the size of the + # hashing table. If no entry is found, iterating with a second, + # independent hashing function takes place. This second value will + # be 1 + V % (M - 2). + # The approximate number of probes will be + # + # for unsuccessful search: (1 - N / M) ^ -1 + # for successful search: - (N / M) ^ -1 * ln (1 - N / M) + # + # where N is the number of keys. + # + # If we now choose M to be the next prime bigger than 4 / 3 * N, + # we get the values + # 4 and 1.85 resp. + # Because unsuccessful searches are unlikely this is a good value. + # Formulas: [Knuth, The Art of Computer Programming, Volume 3, + # 766 Sorting and Searching, 1973, Addison Wesley] + hash_tab_size = next_prime((len(sorted(MESSAGES.keys())) * 4) // 3) + if hash_tab_size <= 2: + hash_tab_size = 3 + hash_table = array.array("I", [0] * hash_tab_size) + # the keys are sorted in the .mo file keys = sorted(MESSAGES.keys()) offsets = [] ids = strs = b'' - for id in keys: + for i, id in enumerate(keys): # For each string, we need size and file offset. Each string is NUL # terminated; the NUL does not count into the size. + hash_insert_entry(id, i) offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id]))) ids += id + b'\0' strs += MESSAGES[id] + b'\0' output = '' - # The header is 7 32-bit unsigned integers. We don't use hash tables, so + ## FIX ME The header is 7 32-bit unsigned integers. We use hash tables, so # the keys start right after the index tables. # translated string. - keystart = 7*4+16*len(keys) + keystart = 7*4+16*len(keys)+hash_tab_size*4 # and the values start after the keys valuestart = keystart + len(ids) koffsets = [] @@ -86,13 +122,15 @@ def generate(): voffsets += [l2, o2+valuestart] offsets = koffsets + voffsets output = struct.pack("Iiiiiii", - 0x950412de, # Magic - 0, # Version - len(keys), # # of entries - 7*4, # start of key index - 7*4+len(keys)*8, # start of value index - 0, 0) # size and offset of hash table + 0x950412de, # Magic + 0, # Version + len(keys), # # of entries + 7*4, # start of key index + 7*4+len(keys)*8, # start of value index + hash_tab_size, # size of hash table + 7 * 4 + 2 * (len(keys) * 8)) # offset of hash table output += array.array("i", offsets).tobytes() + output += hash_table.tobytes() output += ids output += strs return output @@ -252,5 +290,37 @@ def main(): make(filename, outfile) +# Utilities for writing hash table + +def hashpjw(str_param): + hval = 0 + for s in str_param: + if not s: + break + hval <<= 4 + hval += s + g = hval & 0xF << 28 + if g != 0: + hval ^= g >> 24 + hval ^= g + return hval + + +def next_prime(start): + def is_prime(num): + divn = 3 + sq = divn * divn + while sq < num and num % divn != 0: + divn += 1 + sq += 4 * divn + divn += 1 + + return num % divn != 0 + + candidate = start | 1 + while not is_prime(candidate): + candidate += 2 + return candidate + if __name__ == '__main__': main() From a58a559b8c9d3e1bef92b99b3c905cdfd72b994c Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Thu, 27 Mar 2025 20:44:17 +0000 Subject: [PATCH 02/10] Update stuff --- Lib/test/test_tools/msgfmt_data/fuzzy.mo | Bin 28 -> 40 bytes Lib/test/test_tools/msgfmt_data/general.mo | Bin 728 -> 780 bytes Lib/test/test_tools/test_msgfmt.py | 5 ++++- ...5-03-25-18-00-00.gh-issue-131725.qwfh321.rst | 1 + 4 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 Misc/NEWS.d/next/Tools-Demos/2025-03-25-18-00-00.gh-issue-131725.qwfh321.rst diff --git a/Lib/test/test_tools/msgfmt_data/fuzzy.mo b/Lib/test/test_tools/msgfmt_data/fuzzy.mo index 4b144831cf5f75c62f6e9446b91ce164bb3c8dee..273edbbbd7cbd799fe1bf1497157bacf5f3a271e 100644 GIT binary patch delta 25 Ucmb2qm>?p;%)r1P1H{Mx02ti?8~^|S delta 13 OcmdOpnIOW!00jUFr2z2& diff --git a/Lib/test/test_tools/msgfmt_data/general.mo b/Lib/test/test_tools/msgfmt_data/general.mo index bc0683a62d0ddaecbc751b9a59eb1fe19bb7619c..af660003141d892235906d5b302696aa041684fa 100644 GIT binary patch delta 214 zcmXwzF$#k~5Jkt;H3~5(S|^}=$RX@H|LF kuaUcC97YN{@Y@;#w(^Dqpz*+hekKGTbm1PxNY`A=A78Ey1^@s6 delta 162 zcmeBSyTLj^q@Do@tO2r>fp{$tvjg!(AZ7;Qn?Ni8#CL(17l_{hu`m$-0b-DP0Y->= z6Cf=I#5O=I4-}7Pg6OY?()B=^4=COYq Date: Thu, 27 Mar 2025 20:53:17 +0000 Subject: [PATCH 03/10] Regenerate files after merge --- Lib/test/test_tools/msgfmt_data/general.mo | Bin 780 -> 764 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/Lib/test/test_tools/msgfmt_data/general.mo b/Lib/test/test_tools/msgfmt_data/general.mo index af660003141d892235906d5b302696aa041684fa..49228dde71b664ff475856dec9672190bd12e4a2 100644 GIT binary patch delta 260 zcmeBS`@=dxrJjd@f#D$#a{}=@Am#_+A3!V*#2kza49Y-S14y$2X&)fX45UM#{0JZ| z2gC_L%m);&XJTLgnb!xTMS%2dAPq8S1C+i1q(S;WK?|B_J&fqz!;HNWBY?2C2^g z(sDpt0L1b@em@gL|9U9B8A$U1#kT`#9w2=TNQ(gJ%Rm~W{sj|*6WBrypsheN7&w3! z1XzHW6^KEO;RIqfAO=|o13>){keHOrGI;}&!ek#t0b?$OywviX%)HcM1ulk)$|@G6 v)Z*mCg4ALqFweI%rz8_BT~d^pmkt&K>D?T`sLnX~0+Yt%dZxh1ADCDGrDrHh From bbbf84d1cae017a1f3f042ffb5fe9d3afc54271d Mon Sep 17 00:00:00 2001 From: Stan Ulbrych <89152624+StanFromIreland@users.noreply.github.com> Date: Sat, 29 Mar 2025 13:31:48 +0000 Subject: [PATCH 04/10] Update Tools/i18n/msgfmt.py Co-authored-by: Serhiy Storchaka --- Tools/i18n/msgfmt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/i18n/msgfmt.py b/Tools/i18n/msgfmt.py index af29442c21d024..d9aaa864d93d2b 100755 --- a/Tools/i18n/msgfmt.py +++ b/Tools/i18n/msgfmt.py @@ -90,7 +90,7 @@ def hash_insert_entry(string, i): # Because unsuccessful searches are unlikely this is a good value. # Formulas: [Knuth, The Art of Computer Programming, Volume 3, # 766 Sorting and Searching, 1973, Addison Wesley] - hash_tab_size = next_prime((len(sorted(MESSAGES.keys())) * 4) // 3) + hash_tab_size = next_prime((len(MESSAGES) * 4) // 3) if hash_tab_size <= 2: hash_tab_size = 3 hash_table = array.array("I", [0] * hash_tab_size) From 7b0685d4955861d02667803a5d8796eee4988058 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 29 Mar 2025 13:38:29 +0000 Subject: [PATCH 05/10] Apply Serhiys suggestions --- Tools/i18n/msgfmt.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/Tools/i18n/msgfmt.py b/Tools/i18n/msgfmt.py index d9aaa864d93d2b..457574a964e971 100755 --- a/Tools/i18n/msgfmt.py +++ b/Tools/i18n/msgfmt.py @@ -65,7 +65,7 @@ def hash_insert_entry(string, i): hash_val = hashpjw(string) hash_cursor = hash_val % hash_tab_size inc = 1 + (hash_val % (hash_tab_size - 2)) - while hash_table[hash_cursor] != 0: + while hash_table[hash_cursor]: hash_cursor += inc hash_cursor %= hash_tab_size hash_table[hash_cursor] = i + 1 @@ -106,10 +106,9 @@ def hash_insert_entry(string, i): offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id]))) ids += id + b'\0' strs += MESSAGES[id] + b'\0' - output = '' - ## FIX ME The header is 7 32-bit unsigned integers. We use hash tables, so - # the keys start right after the index tables. - # translated string. + + # The header is 7 32-bit unsigned integers, and we have an index table and + # hash table. keystart = 7*4+16*len(keys)+hash_tab_size*4 # and the values start after the keys valuestart = keystart + len(ids) @@ -301,7 +300,7 @@ def hashpjw(str_param): hval <<= 4 hval += s g = hval & 0xF << 28 - if g != 0: + if g: hval ^= g >> 24 hval ^= g return hval From a16a6c3755eb69c8d11439c6063a3085f3082f6a Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sun, 30 Mar 2025 09:34:49 +0100 Subject: [PATCH 06/10] update tests --- Lib/test/test_tools/test_msgfmt.py | 4 ++-- Tools/i18n/msgfmt.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_tools/test_msgfmt.py b/Lib/test/test_tools/test_msgfmt.py index c45a23a69ee482..f128c70bb31ad2 100644 --- a/Lib/test/test_tools/test_msgfmt.py +++ b/Lib/test/test_tools/test_msgfmt.py @@ -66,8 +66,8 @@ def test_binary_header(self): self.assertEqual(num_strings, 9) self.assertEqual(orig_table_offset, 28) self.assertEqual(trans_table_offset, 100) - self.assertEqual(hash_table_size, 0) - self.assertEqual(hash_table_offset, 0) + self.assertEqual(hash_table_size, 13) + self.assertEqual(hash_table_offset, 172) def test_translations(self): with open(data_dir / 'general.mo', 'rb') as f: diff --git a/Tools/i18n/msgfmt.py b/Tools/i18n/msgfmt.py index 457574a964e971..a86bea83e2722a 100755 --- a/Tools/i18n/msgfmt.py +++ b/Tools/i18n/msgfmt.py @@ -5,8 +5,8 @@ This program converts a textual Uniforum-style message catalog (.po file) into a binary GNU catalog (.mo file). This is essentially the same function as the -GNU msgfmt program, however, it is a simpler implementation. Currently it -does not handle plural forms but it does handle message contexts. +GNU msgfmt program. Currently it does not handle plural forms but it does +handle message contexts. Usage: msgfmt.py [OPTIONS] filename.po From a92e6f1fc4e680c234eaf94644a70dcfee7eca37 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sun, 30 Mar 2025 12:49:27 +0100 Subject: [PATCH 07/10] Address feedback --- Tools/i18n/msgfmt.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/Tools/i18n/msgfmt.py b/Tools/i18n/msgfmt.py index a86bea83e2722a..d0ae433842fe9c 100755 --- a/Tools/i18n/msgfmt.py +++ b/Tools/i18n/msgfmt.py @@ -34,7 +34,7 @@ from email.parser import HeaderParser import codecs -__version__ = "1.3" +__version__ = "1.2" MESSAGES = {} @@ -121,7 +121,7 @@ def hash_insert_entry(string, i): voffsets += [l2, o2+valuestart] offsets = koffsets + voffsets output = struct.pack("Iiiiiii", - 0x950412de, # Magic + 0x950412de, # Magic 0, # Version len(keys), # # of entries 7*4, # start of key index @@ -292,14 +292,16 @@ def main(): # Utilities for writing hash table -def hashpjw(str_param): +# Peter J. Weinberger hash function +# See: https://www.drdobbs.com/database/hashing-rehashed/184409859 +def hashpjw(strs): hval = 0 - for s in str_param: + for s in strs: if not s: break hval <<= 4 hval += s - g = hval & 0xF << 28 + g = hval & (0xF << 28) if g: hval ^= g >> 24 hval ^= g From d65f9589ca0de27b5139c6ef3e9ec07d5c483f67 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sun, 30 Mar 2025 12:49:49 +0100 Subject: [PATCH 08/10] Address feedback (test) --- Lib/test/test_tools/test_msgfmt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_tools/test_msgfmt.py b/Lib/test/test_tools/test_msgfmt.py index f128c70bb31ad2..d9a332df07731d 100644 --- a/Lib/test/test_tools/test_msgfmt.py +++ b/Lib/test/test_tools/test_msgfmt.py @@ -155,7 +155,7 @@ def test_version(self): for option in ('--version', '-V'): res = assert_python_ok(msgfmt, option) out = res.out.decode('utf-8').strip() - self.assertEqual('msgfmt.py 1.3', out) + self.assertEqual('msgfmt.py 1.2', out) def test_invalid_option(self): res = assert_python_failure(msgfmt, '--invalid-option') From ba324cbcf45ba9691b7e7cf5529ecaceb22ee9b4 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sun, 30 Mar 2025 13:41:47 +0100 Subject: [PATCH 09/10] Add test --- Lib/test/test_tools/test_msgfmt.py | 25 +++++++++++++++++++++++-- Tools/i18n/msgfmt.py | 8 ++++---- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_tools/test_msgfmt.py b/Lib/test/test_tools/test_msgfmt.py index d9a332df07731d..5635d1e9e458f9 100644 --- a/Lib/test/test_tools/test_msgfmt.py +++ b/Lib/test/test_tools/test_msgfmt.py @@ -9,7 +9,7 @@ from test.support.os_helper import temp_cwd from test.support.script_helper import assert_python_failure, assert_python_ok -from test.test_tools import skip_if_missing, toolsdir +from test.test_tools import imports_under_tool, skip_if_missing, toolsdir skip_if_missing('i18n') @@ -18,6 +18,9 @@ script_dir = Path(toolsdir) / 'i18n' msgfmt = script_dir / 'msgfmt.py' +with imports_under_tool("i18n"): + from msgfmt import _hashpjw + def compile_messages(po_file, mo_file): assert_python_ok(msgfmt, '-o', mo_file, po_file) @@ -42,7 +45,25 @@ def test_compilation(self): self.assertDictEqual(actual._catalog, expected._catalog) def test_hash_table(self): - pass + # Check _hashpjw generates correct hash values + self.assertEqual(_hashpjw(b"stan"), 502398) + self.assertEqual(_hashpjw(b"foo"), 27999) + + # Check hash table is generated correctly for general.po + with temp_cwd(): + tmp_mo_file = "messages.mo" + compile_messages(data_dir / "general.po", tmp_mo_file) + with open(tmp_mo_file, "rb") as f: + mo_data = f.read() + + header = struct.unpack("=7I", mo_data[:28]) + hash_table_size, hash_table_offset = header[5:7] + + hash_tab = struct.unpack(f"={hash_table_size}I", + mo_data[hash_table_offset : hash_table_offset + (hash_table_size * 4)]) + + self.assertEqual(hash_tab, (1, 3, 0, 8, 9, 7, 2, 0, 4, 5, 0, 6, 0)) + def test_binary_header(self): with temp_cwd(): diff --git a/Tools/i18n/msgfmt.py b/Tools/i18n/msgfmt.py index d0ae433842fe9c..f69931baa225a2 100755 --- a/Tools/i18n/msgfmt.py +++ b/Tools/i18n/msgfmt.py @@ -62,7 +62,7 @@ def generate(): global MESSAGES def hash_insert_entry(string, i): - hash_val = hashpjw(string) + hash_val = _hashpjw(string) hash_cursor = hash_val % hash_tab_size inc = 1 + (hash_val % (hash_tab_size - 2)) while hash_table[hash_cursor]: @@ -90,7 +90,7 @@ def hash_insert_entry(string, i): # Because unsuccessful searches are unlikely this is a good value. # Formulas: [Knuth, The Art of Computer Programming, Volume 3, # 766 Sorting and Searching, 1973, Addison Wesley] - hash_tab_size = next_prime((len(MESSAGES) * 4) // 3) + hash_tab_size = _next_prime((len(MESSAGES) * 4) // 3) if hash_tab_size <= 2: hash_tab_size = 3 hash_table = array.array("I", [0] * hash_tab_size) @@ -294,7 +294,7 @@ def main(): # Peter J. Weinberger hash function # See: https://www.drdobbs.com/database/hashing-rehashed/184409859 -def hashpjw(strs): +def _hashpjw(strs): hval = 0 for s in strs: if not s: @@ -308,7 +308,7 @@ def hashpjw(strs): return hval -def next_prime(start): +def _next_prime(start): def is_prime(num): divn = 3 sq = divn * divn From 4cd916e382bab6e317bb4d9bc76fcf194bf5db42 Mon Sep 17 00:00:00 2001 From: Stan Ulbrych Date: Sat, 24 May 2025 17:40:01 +0100 Subject: [PATCH 10/10] Regen --- Lib/test/test_tools/msgfmt_data/general.mo | Bin 764 -> 723 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/Lib/test/test_tools/msgfmt_data/general.mo b/Lib/test/test_tools/msgfmt_data/general.mo index 49228dde71b664ff475856dec9672190bd12e4a2..3107f6711bdd10bbdde590bddaf308b78a58914c 100644 GIT binary patch delta 84 zcmeyvdYN@XitASf28IMi1_nML9md4KzyqXHfwTyaE&|f>K)M}DF9*`hK>l$keHu!i NpSbew=Jkxei~u5X4R8Pe delta 123 zcmcc2`iFHwimMzW149B3^D!_m)H5+K@BryPAT0ui0P{RK+@ zn7Hz;T8OT5QEFmIW`3ToOJYf?m4cChk%_LMfv%C6f}w?#ftj{}k%7VHX2t+U0Q`{` An*aa+