@@ -130,37 +130,56 @@ static void _rlrshift(uint64_t *v)
130130 * sense to define this much smaller special case here to avoid
131131 * including it just for printf.
132132 *
133- * It works by iteratively dividing the most significant 32 bits of
134- * the 64 bit value by 5. This will leave a remainder of 0-4
135- * (i.e. three significant bits), ensuring that the top 29 bits of the
136- * remainder are zero for the next iteration. Thus in the second
137- * iteration only 35 significant bits remain, and in the third only
138- * six. This was tested exhaustively through the first ~10B values in
139- * the input space, and for ~2e12 (4 hours runtime) random inputs
140- * taken from the full 64 bit space.
133+ * It works by multiplying v by the reciprocal of 5 i.e.:
134+ *
135+ * result = v * ((1 << 64) / 5) / (1 << 64)
136+ *
137+ * This produces a 128-bit result, but we drop the bottom 64 bits which
138+ * accounts for the division by (1 << 64). The product is kept to 64 bits
139+ * by summing partial multiplications and shifting right by 32 which on
140+ * most 32-bit architectures means only a register drop.
141+ *
142+ * Here the multiplier is: (1 << 64) / 5 = 0x3333333333333333
143+ * i.e. a 62 bits value. To compensate for the reduced precision, we
144+ * add an initial bias of 1 to v. Enlarging the multiplier to 64 bits
145+ * would also work but a final right shift would be needed, and carry
146+ * handling on the summing of partial mults would be necessary, requiring
147+ * more instructions. Given that we already want to add bias of 2 for
148+ * the result to be rounded to nearest and not truncated, we might as well
149+ * combine those together into a bias of 3. This also conveniently allows
150+ * for keeping the multiplier in a single 32-bit register given its pattern.
141151 */
142152static void _ldiv5 (uint64_t * v )
143153{
144- uint32_t hi ;
145- uint64_t rem = * v , quot = 0U , q ;
146- int i ;
154+ uint32_t v_lo = * v ;
155+ uint32_t v_hi = * v >> 32 ;
156+ uint32_t m = 0x33333333 ;
157+ uint64_t result ;
147158
148- static const char shifts [] = { 32 , 3 , 0 };
159+ /*
160+ * Force the multiplier constant into a register and make it
161+ * opaque to the compiler, otherwise gcc tries to be too smart
162+ * for its own good with a large expansion of adds and shifts.
163+ */
164+ __asm__ ("" : "+r" (m ));
149165
150166 /*
151- * Usage in this file wants rounded behavior, not truncation. So add
152- * two to get the threshold right.
167+ * Apply the bias of 3. We can't add it to v as this would overflow
168+ * it when at max range. Factor it out with the multiplier upfront.
169+ * Here we multiply the low and high parts separately to avoid an
170+ * unnecessary 64-bit add-with-carry.
153171 */
154- rem += 2U ;
172+ result = (( uint64_t )( m * 3U ) << 32 ) | ( m * 3U ) ;
155173
156- for (i = 0 ; i < 3 ; i ++ ) {
157- hi = rem >> shifts [i ];
158- q = (uint64_t )(hi / 5U ) << shifts [i ];
159- rem -= q * 5U ;
160- quot += q ;
161- }
174+ /* The actual multiplication. */
175+ result += (uint64_t )v_lo * m ;
176+ result >>= 32 ;
177+ result += (uint64_t )v_lo * m ;
178+ result += (uint64_t )v_hi * m ;
179+ result >>= 32 ;
180+ result += (uint64_t )v_hi * m ;
162181
163- * v = quot ;
182+ * v = result ;
164183}
165184
166185static char _get_digit (uint64_t * fr , int * digit_count )
0 commit comments