@@ -3462,7 +3462,7 @@ Python code to generate the values:
34623462 reduced_fac_odd_part = fac_odd_part % (2**64)
34633463 print(f"{reduced_fac_odd_part:#018x}u")
34643464*/
3465- static uint64_t reduced_factorial_odd_part [] = {
3465+ static const uint64_t reduced_factorial_odd_part [] = {
34663466 0x0000000000000001u , 0x0000000000000001u , 0x0000000000000001u , 0x0000000000000003u ,
34673467 0x0000000000000003u , 0x000000000000000fu , 0x000000000000002du , 0x000000000000013bu ,
34683468 0x000000000000013bu , 0x0000000000000b13u , 0x000000000000375fu , 0x0000000000026115u ,
@@ -3494,7 +3494,7 @@ Python code to generate the values:
34943494 inverted_fac_odd_part = pow(fac_odd_part, -1, 2**64)
34953495 print(f"{inverted_fac_odd_part:#018x}u")
34963496*/
3497- static uint64_t inverted_factorial_odd_part [] = {
3497+ static const uint64_t inverted_factorial_odd_part [] = {
34983498 0x0000000000000001u , 0x0000000000000001u , 0x0000000000000001u , 0xaaaaaaaaaaaaaaabu ,
34993499 0xaaaaaaaaaaaaaaabu , 0xeeeeeeeeeeeeeeefu , 0x4fa4fa4fa4fa4fa5u , 0x2ff2ff2ff2ff2ff3u ,
35003500 0x2ff2ff2ff2ff2ff3u , 0x938cc70553e3771bu , 0xb71c27cddd93e49fu , 0xb38e3229fcdee63du ,
@@ -3514,6 +3514,25 @@ static uint64_t inverted_factorial_odd_part[] = {
35143514 0x547fb1b8ab9d0ba3u , 0x8f15a826498852e3u , 0x32e1a03f38880283u , 0x3de4cce63283f0c1u ,
35153515};
35163516
3517+ /* exponent of the largest power of 2 dividing factorial(n), for n in range(68)
3518+
3519+ Python code to generate the values:
3520+
3521+ import math
3522+
3523+ for n in range(68):
3524+ fac = math.factorial(n)
3525+ fac_trailing_zeros = (fac & -fac).bit_length() - 1
3526+ print(fac_trailing_zeros)
3527+ */
3528+
3529+ static const uint8_t factorial_trailing_zeros [] = {
3530+ 0 , 0 , 1 , 1 , 3 , 3 , 4 , 4 , 7 , 7 , 8 , 8 , 10 , 10 , 11 , 11 , // 0-15
3531+ 15 , 15 , 16 , 16 , 18 , 18 , 19 , 19 , 22 , 22 , 23 , 23 , 25 , 25 , 26 , 26 , // 16-31
3532+ 31 , 31 , 32 , 32 , 34 , 34 , 35 , 35 , 38 , 38 , 39 , 39 , 41 , 41 , 42 , 42 , // 32-47
3533+ 46 , 46 , 47 , 47 , 49 , 49 , 50 , 50 , 53 , 53 , 54 , 54 , 56 , 56 , 57 , 57 , // 48-63
3534+ 63 , 63 , 64 , 64 , // 64-67
3535+ };
35173536
35183537/*[clinic input]
35193538math.comb
@@ -3588,15 +3607,14 @@ math_comb_impl(PyObject *module, PyObject *n, PyObject *k)
35883607 where 2**shift is the largest power of two dividing comb(n, k)
35893608 and comb_odd_part is comb(n, k) >> shift. comb_odd_part can be
35903609 calculated efficiently via arithmetic modulo 2**64, using three
3591- lookups and two uint64_t multiplications, while the necessary
3592- shift can be computed via Kummer's theorem: it's the number of
3593- carries when adding k to n - k in binary, which in turn is the
3594- number of set bits of n ^ k ^ (n - k).
3610+ lookups and two uint64_t multiplications.
35953611 */
35963612 uint64_t comb_odd_part = reduced_factorial_odd_part [ni ]
35973613 * inverted_factorial_odd_part [ki ]
35983614 * inverted_factorial_odd_part [ni - ki ];
3599- int shift = _Py_popcount32 ((uint32_t )(ni ^ ki ^ (ni - ki )));
3615+ int shift = factorial_trailing_zeros [ni ]
3616+ - factorial_trailing_zeros [ki ]
3617+ - factorial_trailing_zeros [ni - ki ];
36003618 result = PyLong_FromUnsignedLongLong (comb_odd_part << shift );
36013619 goto done ;
36023620 }
0 commit comments