From 91b93433bc8e20f655bd307bfe0664876d9672a3 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 13 May 2024 17:40:23 +0900
Subject: [PATCH 1/9] Changed the bcmul calculation method

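Multiplication is now performed on groups of decimal digits converted to unsigned long (4 digits per group on 32-bit, 8 on 64-bit) instead of digit by digit, resulting in faster calculations. This replaces the recursive divide-and-conquer implementation.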
---
 ext/bcmath/libbcmath/src/private.h |   4 +
 ext/bcmath/libbcmath/src/recmul.c  | 313 +++++++++++------------------
 2 files changed, 120 insertions(+), 197 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/private.h b/ext/bcmath/libbcmath/src/private.h
index 1403baad0c257..fa538150ef73a 100644
--- a/ext/bcmath/libbcmath/src/private.h
+++ b/ext/bcmath/libbcmath/src/private.h
@@ -84,9 +84,13 @@ static inline uint64_t BC_BSWAP64(uint64_t u)
 #if SIZEOF_SIZE_T >= 8
 #  define BC_BSWAP(u) BC_BSWAP64(u)
 #  define BC_UINT_T uint64_t
+#  define BC_LONGABLE_DIGITS 8
+#  define BC_LONGABLE_OVERFLOW 100000000
 #else
 #  define BC_BSWAP(u) BC_BSWAP32(u)
 #  define BC_UINT_T uint32_t
+#  define BC_LONGABLE_DIGITS 4
+#  define BC_LONGABLE_OVERFLOW 10000
 #endif
 
 #ifdef WORDS_BIGENDIAN
diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c
index 3b3b696f99d46..75ace8a88355a 100644
--- a/ext/bcmath/libbcmath/src/recmul.c
+++ b/ext/bcmath/libbcmath/src/recmul.c
@@ -36,217 +36,134 @@
 #include "private.h" /* For _bc_rm_leading_zeros() */
 #include "zend_alloc.h"
 
-/* Recursive vs non-recursive multiply crossover ranges. */
-#if defined(MULDIGITS)
-#include "muldigits.h"
-#else
-#define MUL_BASE_DIGITS 80
-#endif
-
-int mul_base_digits = MUL_BASE_DIGITS;
-#define MUL_SMALL_DIGITS mul_base_digits/4
 
 /* Multiply utility routines */
 
-static bc_num new_sub_num(size_t length, size_t scale, char *value)
+/*
+ * Converts BCD to long, going backwards from pointer n by the number of
+ * characters specified by len.
+ */
+static inline unsigned long bc_partial_convert_to_long(const char *n, size_t len)
 {
-	bc_num temp = (bc_num) emalloc(sizeof(bc_struct));
+	unsigned long num = 0;
+	unsigned long base = 1;
 
-	temp->n_sign = PLUS;
-	temp->n_len = length;
-	temp->n_scale = scale;
-	temp->n_refs = 1;
-	temp->n_value = value;
-	return temp;
+	for (size_t i = 0; i < len; i++) {
+		num += *n * base;
+		base *= BASE;
+		n--;
+	}
+
+	return num;
 }
 
-static void _bc_simp_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_num *prod)
+/*
+ * If the n_values ​​of n1 and n2 are both 4 (32-bit) or 8 (64-bit) digits or less,
+ * the calculation will be performed at high speed without using an array.
+ */
+static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_num *prod)
 {
-	char *n1ptr, *n2ptr, *pvptr;
-	char *n1end, *n2end;        /* To the end of n1 and n2. */
-	int sum = 0;
+	char *n1end = n1->n_value + n1len - 1;
+	char *n2end = n2->n_value + n2len - 1;
 
-	int prodlen = n1len + n2len + 1;
+	unsigned long n1_l = bc_partial_convert_to_long(n1end, n1len);
+	unsigned long n2_l = bc_partial_convert_to_long(n2end, n2len);
+	unsigned long prod_l = n1_l * n2_l;
 
+	size_t prodlen = n1len + n2len;
 	*prod = bc_new_num_nonzeroed(prodlen, 0);
+	char *pptr = (*prod)->n_value;
+	char *pend = pptr + prodlen - 1;
 
-	n1end = (char *) (n1->n_value + n1len - 1);
-	n2end = (char *) (n2->n_value + n2len - 1);
-	pvptr = (char *) ((*prod)->n_value + prodlen - 1);
-
-	/* Here is the loop... */
-	for (int index = 0; index < prodlen - 1; index++) {
-		n1ptr = (char *) (n1end - MAX(0, index - n2len + 1));
-		n2ptr = (char *) (n2end - MIN(index, n2len - 1));
-		while ((n1ptr >= n1->n_value) && (n2ptr <= n2end)) {
-			sum += *n1ptr * *n2ptr;
-			n1ptr--;
-			n2ptr++;
-		}
-		*pvptr-- = sum % BASE;
-		sum = sum / BASE;
+	while (pend >= pptr) {
+		*pend-- = prod_l % BASE;
+		prod_l /= BASE;
 	}
-	*pvptr = sum;
 }
 
-
-/* A special adder/subtractor for the recursive divide and conquer
-   multiply algorithm.  Note: if sub is called, accum must
-   be larger that what is being subtracted.  Also, accum and val
-   must have n_scale = 0.  (e.g. they must look like integers. *) */
-static void _bc_shift_addsub(bc_num accum, bc_num val, int shift, bool sub)
+/*
+ * Converts the BCD of bc_num by 4 (32 bits) or 8 (64 bits) digits to an array of unsigned longs.
+ * The array is generated starting with the smaller digits.
+ * e.g. 12345678901234567890 => {34567890, 56789012, 1234}
+ *
+ * Multiply and add these groups of numbers to perform multiplication fast.
+ * How much to shift the digits when adding values ​​can be calculated from the index of the array.
+ */
+static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_num *prod)
 {
-	signed char *accp, *valp;
-	unsigned int carry = 0;
-	size_t count = val->n_len;
-
-	if (val->n_value[0] == 0) {
-		count--;
+	size_t i;
+	char *n1end = n1->n_value + n1len - 1;
+	char *n2end = n2->n_value + n2len - 1;
+	size_t prodlen = n1len + n2len;
+
+	size_t n1_arr_size = n1len / BC_LONGABLE_DIGITS + (n1len % BC_LONGABLE_DIGITS ? 1 : 0);
+	size_t n2_arr_size = n2len / BC_LONGABLE_DIGITS + (n2len % BC_LONGABLE_DIGITS ? 1 : 0);
+	size_t prod_arr_size = n1_arr_size + n2_arr_size - 1;
+
+	unsigned long n1_l[n1_arr_size];
+	unsigned long n2_l[n2_arr_size];
+	unsigned long prod_l[prod_arr_size];
+	for (i = 0; i < prod_arr_size; i++) {
+		prod_l[i] = 0;
 	}
-	assert(accum->n_len + accum->n_scale >= shift + count);
-
-	/* Set up pointers and others */
-	accp = (signed char *) (accum->n_value + accum->n_len + accum->n_scale - shift - 1);
-	valp = (signed char *) (val->n_value + val->n_len - 1);
-
-	if (sub) {
-		/* Subtraction, carry is really borrow. */
-		while (count--) {
-			*accp -= *valp-- + carry;
-			if (*accp < 0) {
-				carry = 1;
-				*accp-- += BASE;
-			} else {
-				carry = 0;
-				accp--;
-			}
-		}
-		while (carry) {
-			*accp -= carry;
-			if (*accp < 0) {
-				*accp-- += BASE;
-			} else {
-				carry = 0;
-			}
-		}
-	} else {
-		/* Addition */
-		while (count--) {
-			*accp += *valp-- + carry;
-			if (*accp > (BASE - 1)) {
-				carry = 1;
-				*accp-- -= BASE;
-			} else {
-				carry = 0;
-				accp--;
-			}
-		}
-		while (carry) {
-			*accp += carry;
-			if (*accp > (BASE - 1)) {
-				*accp-- -= BASE;
-			} else {
-				carry = 0;
-			}
-		}
-	}
-}
-
-/* Recursive divide and conquer multiply algorithm.
-   Based on
-   Let u = u0 + u1*(b^n)
-   Let v = v0 + v1*(b^n)
-   Then uv = (B^2n+B^n)*u1*v1 + B^n*(u1-u0)*(v0-v1) + (B^n+1)*u0*v0
 
-   B is the base of storage, number of digits in u1,u0 close to equal.
-*/
-static void _bc_rec_mul(bc_num u, size_t ulen, bc_num v, size_t vlen, bc_num *prod)
-{
-	bc_num u0, u1, v0, v1;
-	bc_num m1, m2, m3;
-	size_t n;
-	bool m1zero;
-
-	/* Base case? */
-	if ((ulen + vlen) < mul_base_digits
-		|| ulen < MUL_SMALL_DIGITS
-		|| vlen < MUL_SMALL_DIGITS
-	) {
-		_bc_simp_mul(u, ulen, v, vlen, prod);
-		return;
+	/* Convert n1 to long[] */
+	i = 0;
+	while (n1len > 0) {
+		size_t len = MIN(BC_LONGABLE_DIGITS, n1len);
+		n1_l[i] = bc_partial_convert_to_long(n1end, len);
+		n1end -= len;
+		n1len -= len;
+		i++;
 	}
 
-	/* Calculate n -- the u and v split point in digits. */
-	n = (MAX(ulen, vlen) + 1) / 2;
-
-	/* Split u and v. */
-	if (ulen < n) {
-		u1 = bc_copy_num(BCG(_zero_));
-		u0 = new_sub_num(ulen, 0, u->n_value);
-	} else {
-		u1 = new_sub_num(ulen - n, 0, u->n_value);
-		u0 = new_sub_num(n, 0, u->n_value + ulen - n);
-	}
-	if (vlen < n) {
-		v1 = bc_copy_num(BCG(_zero_));
-		v0 = new_sub_num(vlen, 0, v->n_value);
-	} else {
-		v1 = new_sub_num(vlen - n, 0, v->n_value);
-		v0 = new_sub_num(n, 0, v->n_value + vlen - n);
+	/* Convert n2 to long[] */
+	i = 0;
+	while (n2len > 0) {
+		size_t len = MIN(BC_LONGABLE_DIGITS, n2len);
+		n2_l[i] = bc_partial_convert_to_long(n2end, len);
+		n2end -= len;
+		n2len -= len;
+		i++;
 	}
-	_bc_rm_leading_zeros(u1);
-	_bc_rm_leading_zeros(u0);
-	_bc_rm_leading_zeros(v1);
-	_bc_rm_leading_zeros(v0);
-
-	m1zero = bc_is_zero(u1) || bc_is_zero(v1);
-
-	/* Calculate sub results ... */
-
-	bc_num d1 = bc_sub(u1, u0, 0);
-	bc_num d2 = bc_sub(v0, v1, 0);
-
 
-	/* Do recursive multiplies and shifted adds. */
-	if (m1zero) {
-		m1 = bc_copy_num(BCG(_zero_));
-	} else {
-		_bc_rec_mul(u1, u1->n_len, v1, v1->n_len, &m1);
+	/* Multiplication and addition */
+	for (i = 0; i < n1_arr_size; i++) {
+		for (size_t j = 0; j < n2_arr_size; j++) {
+			prod_l[i + j] += n1_l[i] * n2_l[j];
+		}
 	}
 
-	if (bc_is_zero(d1) || bc_is_zero(d2)) {
-		m2 = bc_copy_num(BCG(_zero_));
-	} else {
-		_bc_rec_mul(d1, d1->n_len, d2, d2->n_len, &m2);
+	/*
+	 * Move a value exceeding 8 digits by carrying to the next digit.
+	 * However, the last digit does nothing.
+	 */
+	for (i = 0; i < prod_arr_size - 1; i++) {
+		prod_l[i + 1] += prod_l[i] / BC_LONGABLE_OVERFLOW;
+		prod_l[i] %= BC_LONGABLE_OVERFLOW;
 	}
 
-	if (bc_is_zero(u0) || bc_is_zero(v0)) {
-		m3 = bc_copy_num(BCG(_zero_));
-	} else {
-		_bc_rec_mul(u0, u0->n_len, v0, v0->n_len, &m3);
+	/* Convert to bc_num */
+	*prod = bc_new_num_nonzeroed(prodlen, 0);
+	char *pptr = (*prod)->n_value;
+	char *pend = pptr + prodlen - 1;
+	i = 0;
+	while (i < prod_arr_size - 1) {
+		for (size_t j = 0; j < BC_LONGABLE_DIGITS; j++) {
+			*pend-- = prod_l[i] % BASE;
+			prod_l[i] /= BASE;
+		}
+		i++;
 	}
 
-	/* Initialize product */
-	*prod = bc_new_num(ulen + vlen + 1, 0);
-
-	if (!m1zero) {
-		_bc_shift_addsub(*prod, m1, 2 * n, false);
-		_bc_shift_addsub(*prod, m1, n, false);
+	/*
+	 * The last digit may carry over.
+	 * Also need to fill it to the end with zeros, so loop until the end of the string.
+	 */
+	while (pend >= pptr) {
+		*pend-- = prod_l[i] % BASE;
+		prod_l[i] /= BASE;
 	}
-	_bc_shift_addsub(*prod, m3, n, false);
-	_bc_shift_addsub(*prod, m3, 0, false);
-	_bc_shift_addsub(*prod, m2, n, d1->n_sign != d2->n_sign);
-
-	/* Now clean up! */
-	bc_free_num (&u1);
-	bc_free_num (&u0);
-	bc_free_num (&v1);
-	bc_free_num (&m1);
-	bc_free_num (&v0);
-	bc_free_num (&m2);
-	bc_free_num (&m3);
-	bc_free_num (&d1);
-	bc_free_num (&d2);
 }
 
 /* The multiply routine.  N2 times N1 is put int PROD with the scale of
@@ -255,26 +172,28 @@ static void _bc_rec_mul(bc_num u, size_t ulen, bc_num v, size_t vlen, bc_num *pr
 
 bc_num bc_multiply(bc_num n1, bc_num n2, size_t scale)
 {
-	bc_num pval;
-	size_t len1, len2;
-	size_t full_scale, prod_scale;
+	bc_num prod;
 
 	/* Initialize things. */
-	len1 = n1->n_len + n1->n_scale;
-	len2 = n2->n_len + n2->n_scale;
-	full_scale = n1->n_scale + n2->n_scale;
-	prod_scale = MIN(full_scale, MAX(scale, MAX(n1->n_scale, n2->n_scale)));
+	size_t len1 = n1->n_len + n1->n_scale;
+	size_t len2 = n2->n_len + n2->n_scale;
+	size_t full_scale = n1->n_scale + n2->n_scale;
+	size_t prod_scale = MIN(full_scale, MAX(scale, MAX(n1->n_scale, n2->n_scale)));
 
 	/* Do the multiply */
-	_bc_rec_mul(n1, len1, n2, len2, &pval);
+	if (len1 <= BC_LONGABLE_DIGITS && len2 <= BC_LONGABLE_DIGITS) {
+		bc_fast_mul(n1, len1, n2, len2, &prod);
+	} else {
+		bc_standard_mul(n1, len1, n2, len2, &prod);
+	}
 
 	/* Assign to prod and clean up the number. */
-	pval->n_sign = (n1->n_sign == n2->n_sign ? PLUS : MINUS);
-	pval->n_len = len2 + len1 + 1 - full_scale;
-	pval->n_scale = prod_scale;
-	_bc_rm_leading_zeros(pval);
-	if (bc_is_zero(pval)) {
-		pval->n_sign = PLUS;
+	prod->n_sign = (n1->n_sign == n2->n_sign ? PLUS : MINUS);
+	prod->n_len -= full_scale;
+	prod->n_scale = prod_scale;
+	_bc_rm_leading_zeros(prod);
+	if (bc_is_zero(prod)) {
+		prod->n_sign = PLUS;
 	}
-	return pval;
+	return prod;
 }

From ec9f8dfe5a9be43fee345ff00355e54e73f81d49 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 13 May 2024 18:44:16 +0900
Subject: [PATCH 2/9] Allocate the multiplication work arrays on the heap instead of as VLAs

---
 ext/bcmath/libbcmath/src/recmul.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c
index 75ace8a88355a..f676ab4f672f6 100644
--- a/ext/bcmath/libbcmath/src/recmul.c
+++ b/ext/bcmath/libbcmath/src/recmul.c
@@ -100,12 +100,9 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 	size_t n2_arr_size = n2len / BC_LONGABLE_DIGITS + (n2len % BC_LONGABLE_DIGITS ? 1 : 0);
 	size_t prod_arr_size = n1_arr_size + n2_arr_size - 1;
 
-	unsigned long n1_l[n1_arr_size];
-	unsigned long n2_l[n2_arr_size];
-	unsigned long prod_l[prod_arr_size];
-	for (i = 0; i < prod_arr_size; i++) {
-		prod_l[i] = 0;
-	}
+	unsigned long *n1_l = emalloc(n1_arr_size * sizeof(unsigned long));
+	unsigned long *n2_l = emalloc(n2_arr_size * sizeof(unsigned long));
+	unsigned long *prod_l = ecalloc(prod_arr_size, sizeof(unsigned long));
 
 	/* Convert n1 to long[] */
 	i = 0;
@@ -164,6 +161,10 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 		*pend-- = prod_l[i] % BASE;
 		prod_l[i] /= BASE;
 	}
+
+	efree(n1_l);
+	efree(n2_l);
+	efree(prod_l);
 }
 
 /* The multiply routine.  N2 times N1 is put int PROD with the scale of

From ac9342df8f5d2cd16d8b214fa8861059b4bdb955 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 13 May 2024 19:28:57 +0900
Subject: [PATCH 3/9] Use a single allocation for the multiplication work arrays

---
 ext/bcmath/libbcmath/src/recmul.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c
index f676ab4f672f6..6edf856b5a2df 100644
--- a/ext/bcmath/libbcmath/src/recmul.c
+++ b/ext/bcmath/libbcmath/src/recmul.c
@@ -100,9 +100,15 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 	size_t n2_arr_size = n2len / BC_LONGABLE_DIGITS + (n2len % BC_LONGABLE_DIGITS ? 1 : 0);
 	size_t prod_arr_size = n1_arr_size + n2_arr_size - 1;
 
-	unsigned long *n1_l = emalloc(n1_arr_size * sizeof(unsigned long));
-	unsigned long *n2_l = emalloc(n2_arr_size * sizeof(unsigned long));
-	unsigned long *prod_l = ecalloc(prod_arr_size, sizeof(unsigned long));
+	unsigned long *buf = emalloc((n1_arr_size + n2_arr_size + prod_arr_size) * sizeof(unsigned long));
+
+	unsigned long *n1_l = buf;
+	unsigned long *n2_l = buf + n1_arr_size;
+	unsigned long *prod_l = n2_l + n2_arr_size;
+
+	for (i = 0; i < prod_arr_size; i++) {
+		prod_l[i] = 0;
+	}
 
 	/* Convert n1 to long[] */
 	i = 0;
@@ -162,9 +168,7 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 		prod_l[i] /= BASE;
 	}
 
-	efree(n1_l);
-	efree(n2_l);
-	efree(prod_l);
+	efree(buf);
 }
 
 /* The multiply routine.  N2 times N1 is put int PROD with the scale of

From 899bb39ee2f209d348fe47cf40dea10d4261e6ba Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Mon, 13 May 2024 19:32:10 +0900
Subject: [PATCH 4/9] Changed unsigned long to BC_UINT_T

---
 ext/bcmath/libbcmath/src/recmul.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c
index 6edf856b5a2df..e457a7333e132 100644
--- a/ext/bcmath/libbcmath/src/recmul.c
+++ b/ext/bcmath/libbcmath/src/recmul.c
@@ -43,10 +43,10 @@
  * Converts BCD to long, going backwards from pointer n by the number of
  * characters specified by len.
  */
-static inline unsigned long bc_partial_convert_to_long(const char *n, size_t len)
+static inline BC_UINT_T bc_partial_convert_to_long(const char *n, size_t len)
 {
-	unsigned long num = 0;
-	unsigned long base = 1;
+	BC_UINT_T num = 0;
+	BC_UINT_T base = 1;
 
 	for (size_t i = 0; i < len; i++) {
 		num += *n * base;
@@ -66,9 +66,9 @@ static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc
 	char *n1end = n1->n_value + n1len - 1;
 	char *n2end = n2->n_value + n2len - 1;
 
-	unsigned long n1_l = bc_partial_convert_to_long(n1end, n1len);
-	unsigned long n2_l = bc_partial_convert_to_long(n2end, n2len);
-	unsigned long prod_l = n1_l * n2_l;
+	BC_UINT_T n1_l = bc_partial_convert_to_long(n1end, n1len);
+	BC_UINT_T n2_l = bc_partial_convert_to_long(n2end, n2len);
+	BC_UINT_T prod_l = n1_l * n2_l;
 
 	size_t prodlen = n1len + n2len;
 	*prod = bc_new_num_nonzeroed(prodlen, 0);
@@ -82,7 +82,7 @@ static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc
 }
 
 /*
- * Converts the BCD of bc_num by 4 (32 bits) or 8 (64 bits) digits to an array of unsigned longs.
+ * Converts the BCD of bc_num by 4 (32 bits) or 8 (64 bits) digits to an array of BC_UINT_Ts.
  * The array is generated starting with the smaller digits.
  * e.g. 12345678901234567890 => {34567890, 56789012, 1234}
  *
@@ -100,11 +100,11 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 	size_t n2_arr_size = n2len / BC_LONGABLE_DIGITS + (n2len % BC_LONGABLE_DIGITS ? 1 : 0);
 	size_t prod_arr_size = n1_arr_size + n2_arr_size - 1;
 
-	unsigned long *buf = emalloc((n1_arr_size + n2_arr_size + prod_arr_size) * sizeof(unsigned long));
+	BC_UINT_T *buf = emalloc((n1_arr_size + n2_arr_size + prod_arr_size) * sizeof(BC_UINT_T));
 
-	unsigned long *n1_l = buf;
-	unsigned long *n2_l = buf + n1_arr_size;
-	unsigned long *prod_l = n2_l + n2_arr_size;
+	BC_UINT_T *n1_l = buf;
+	BC_UINT_T *n2_l = buf + n1_arr_size;
+	BC_UINT_T *prod_l = n2_l + n2_arr_size;
 
 	for (i = 0; i < prod_arr_size; i++) {
 		prod_l[i] = 0;

From ea57a9f3c8a94372065aa3d82cb06cdd727c415f Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Tue, 14 May 2024 08:21:32 +0900
Subject: [PATCH 5/9] Fixed comments and constant names

---
 ext/bcmath/libbcmath/src/private.h |  8 ++++----
 ext/bcmath/libbcmath/src/recmul.c  | 32 +++++++++++++++---------------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/private.h b/ext/bcmath/libbcmath/src/private.h
index fa538150ef73a..7e367441292ee 100644
--- a/ext/bcmath/libbcmath/src/private.h
+++ b/ext/bcmath/libbcmath/src/private.h
@@ -84,13 +84,13 @@ static inline uint64_t BC_BSWAP64(uint64_t u)
 #if SIZEOF_SIZE_T >= 8
 #  define BC_BSWAP(u) BC_BSWAP64(u)
 #  define BC_UINT_T uint64_t
-#  define BC_LONGABLE_DIGITS 8
-#  define BC_LONGABLE_OVERFLOW 100000000
+#  define BC_MUL_UINT_DIGITS 8
+#  define BC_MUL_UINT_OVERFLOW 100000000
 #else
 #  define BC_BSWAP(u) BC_BSWAP32(u)
 #  define BC_UINT_T uint32_t
-#  define BC_LONGABLE_DIGITS 4
-#  define BC_LONGABLE_OVERFLOW 10000
+#  define BC_MUL_UINT_DIGITS 4
+#  define BC_MUL_UINT_OVERFLOW 10000
 #endif
 
 #ifdef WORDS_BIGENDIAN
diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c
index e457a7333e132..0d916747e0e37 100644
--- a/ext/bcmath/libbcmath/src/recmul.c
+++ b/ext/bcmath/libbcmath/src/recmul.c
@@ -40,10 +40,10 @@
 /* Multiply utility routines */
 
 /*
- * Converts BCD to long, going backwards from pointer n by the number of
+ * Converts BCD to uint, going backwards from pointer n by the number of
  * characters specified by len.
  */
-static inline BC_UINT_T bc_partial_convert_to_long(const char *n, size_t len)
+static inline BC_UINT_T bc_partial_convert_to_uint(const char *n, size_t len)
 {
 	BC_UINT_T num = 0;
 	BC_UINT_T base = 1;
@@ -66,8 +66,8 @@ static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc
 	char *n1end = n1->n_value + n1len - 1;
 	char *n2end = n2->n_value + n2len - 1;
 
-	BC_UINT_T n1_l = bc_partial_convert_to_long(n1end, n1len);
-	BC_UINT_T n2_l = bc_partial_convert_to_long(n2end, n2len);
+	BC_UINT_T n1_l = bc_partial_convert_to_uint(n1end, n1len);
+	BC_UINT_T n2_l = bc_partial_convert_to_uint(n2end, n2len);
 	BC_UINT_T prod_l = n1_l * n2_l;
 
 	size_t prodlen = n1len + n2len;
@@ -96,8 +96,8 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 	char *n2end = n2->n_value + n2len - 1;
 	size_t prodlen = n1len + n2len;
 
-	size_t n1_arr_size = n1len / BC_LONGABLE_DIGITS + (n1len % BC_LONGABLE_DIGITS ? 1 : 0);
-	size_t n2_arr_size = n2len / BC_LONGABLE_DIGITS + (n2len % BC_LONGABLE_DIGITS ? 1 : 0);
+	size_t n1_arr_size = n1len / BC_MUL_UINT_DIGITS + (n1len % BC_MUL_UINT_DIGITS ? 1 : 0);
+	size_t n2_arr_size = n2len / BC_MUL_UINT_DIGITS + (n2len % BC_MUL_UINT_DIGITS ? 1 : 0);
 	size_t prod_arr_size = n1_arr_size + n2_arr_size - 1;
 
 	BC_UINT_T *buf = emalloc((n1_arr_size + n2_arr_size + prod_arr_size) * sizeof(BC_UINT_T));
@@ -110,21 +110,21 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 		prod_l[i] = 0;
 	}
 
-	/* Convert n1 to long[] */
+	/* Convert n1 to uint[] */
 	i = 0;
 	while (n1len > 0) {
-		size_t len = MIN(BC_LONGABLE_DIGITS, n1len);
-		n1_l[i] = bc_partial_convert_to_long(n1end, len);
+		size_t len = MIN(BC_MUL_UINT_DIGITS, n1len);
+		n1_l[i] = bc_partial_convert_to_uint(n1end, len);
 		n1end -= len;
 		n1len -= len;
 		i++;
 	}
 
-	/* Convert n2 to long[] */
+	/* Convert n2 to uint[] */
 	i = 0;
 	while (n2len > 0) {
-		size_t len = MIN(BC_LONGABLE_DIGITS, n2len);
-		n2_l[i] = bc_partial_convert_to_long(n2end, len);
+		size_t len = MIN(BC_MUL_UINT_DIGITS, n2len);
+		n2_l[i] = bc_partial_convert_to_uint(n2end, len);
 		n2end -= len;
 		n2len -= len;
 		i++;
@@ -142,8 +142,8 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 	 * However, the last digit does nothing.
 	 */
 	for (i = 0; i < prod_arr_size - 1; i++) {
-		prod_l[i + 1] += prod_l[i] / BC_LONGABLE_OVERFLOW;
-		prod_l[i] %= BC_LONGABLE_OVERFLOW;
+		prod_l[i + 1] += prod_l[i] / BC_MUL_UINT_OVERFLOW;
+		prod_l[i] %= BC_MUL_UINT_OVERFLOW;
 	}
 
 	/* Convert to bc_num */
@@ -152,7 +152,7 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 	char *pend = pptr + prodlen - 1;
 	i = 0;
 	while (i < prod_arr_size - 1) {
-		for (size_t j = 0; j < BC_LONGABLE_DIGITS; j++) {
+		for (size_t j = 0; j < BC_MUL_UINT_DIGITS; j++) {
 			*pend-- = prod_l[i] % BASE;
 			prod_l[i] /= BASE;
 		}
@@ -186,7 +186,7 @@ bc_num bc_multiply(bc_num n1, bc_num n2, size_t scale)
 	size_t prod_scale = MIN(full_scale, MAX(scale, MAX(n1->n_scale, n2->n_scale)));
 
 	/* Do the multiply */
-	if (len1 <= BC_LONGABLE_DIGITS && len2 <= BC_LONGABLE_DIGITS) {
+	if (len1 <= BC_MUL_UINT_DIGITS && len2 <= BC_MUL_UINT_DIGITS) {
 		bc_fast_mul(n1, len1, n2, len2, &prod);
 	} else {
 		bc_standard_mul(n1, len1, n2, len2, &prod);

From b23e0a392978fd37658a06cbc37feededc1e19fb Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Tue, 14 May 2024 08:22:55 +0900
Subject: [PATCH 6/9] Moved the BC_MUL_UINT_* constants from private.h to recmul.c

---
 ext/bcmath/libbcmath/src/private.h | 4 ----
 ext/bcmath/libbcmath/src/recmul.c  | 9 +++++++++
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/private.h b/ext/bcmath/libbcmath/src/private.h
index 7e367441292ee..1403baad0c257 100644
--- a/ext/bcmath/libbcmath/src/private.h
+++ b/ext/bcmath/libbcmath/src/private.h
@@ -84,13 +84,9 @@ static inline uint64_t BC_BSWAP64(uint64_t u)
 #if SIZEOF_SIZE_T >= 8
 #  define BC_BSWAP(u) BC_BSWAP64(u)
 #  define BC_UINT_T uint64_t
-#  define BC_MUL_UINT_DIGITS 8
-#  define BC_MUL_UINT_OVERFLOW 100000000
 #else
 #  define BC_BSWAP(u) BC_BSWAP32(u)
 #  define BC_UINT_T uint32_t
-#  define BC_MUL_UINT_DIGITS 4
-#  define BC_MUL_UINT_OVERFLOW 10000
 #endif
 
 #ifdef WORDS_BIGENDIAN
diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c
index 0d916747e0e37..247e2a238cc53 100644
--- a/ext/bcmath/libbcmath/src/recmul.c
+++ b/ext/bcmath/libbcmath/src/recmul.c
@@ -37,6 +37,15 @@
 #include "zend_alloc.h"
 
 
+#if SIZEOF_SIZE_T >= 8
+#  define BC_MUL_UINT_DIGITS 8
+#  define BC_MUL_UINT_OVERFLOW 100000000
+#else
+#  define BC_MUL_UINT_DIGITS 4
+#  define BC_MUL_UINT_OVERFLOW 10000
+#endif
+
+
 /* Multiply utility routines */
 
 /*

From ec51d7664e8516828851b8a312436f87245b24d9 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Tue, 14 May 2024 09:12:55 +0900
Subject: [PATCH 7/9] Fixed variable names and comments

---
 ext/bcmath/libbcmath/src/recmul.c | 38 +++++++++++++++----------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c
index 247e2a238cc53..9a832d754c459 100644
--- a/ext/bcmath/libbcmath/src/recmul.c
+++ b/ext/bcmath/libbcmath/src/recmul.c
@@ -75,9 +75,9 @@ static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc
 	char *n1end = n1->n_value + n1len - 1;
 	char *n2end = n2->n_value + n2len - 1;
 
-	BC_UINT_T n1_l = bc_partial_convert_to_uint(n1end, n1len);
-	BC_UINT_T n2_l = bc_partial_convert_to_uint(n2end, n2len);
-	BC_UINT_T prod_l = n1_l * n2_l;
+	BC_UINT_T n1_uint = bc_partial_convert_to_uint(n1end, n1len);
+	BC_UINT_T n2_uint = bc_partial_convert_to_uint(n2end, n2len);
+	BC_UINT_T prod_uint = n1_uint * n2_uint;
 
 	size_t prodlen = n1len + n2len;
 	*prod = bc_new_num_nonzeroed(prodlen, 0);
@@ -85,8 +85,8 @@ static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc
 	char *pend = pptr + prodlen - 1;
 
 	while (pend >= pptr) {
-		*pend-- = prod_l % BASE;
-		prod_l /= BASE;
+		*pend-- = prod_uint % BASE;
+		prod_uint /= BASE;
 	}
 }
 
@@ -111,19 +111,19 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 
 	BC_UINT_T *buf = emalloc((n1_arr_size + n2_arr_size + prod_arr_size) * sizeof(BC_UINT_T));
 
-	BC_UINT_T *n1_l = buf;
-	BC_UINT_T *n2_l = buf + n1_arr_size;
-	BC_UINT_T *prod_l = n2_l + n2_arr_size;
+	BC_UINT_T *n1_uint = buf;
+	BC_UINT_T *n2_uint = buf + n1_arr_size;
+	BC_UINT_T *prod_uint = n2_uint + n2_arr_size;
 
 	for (i = 0; i < prod_arr_size; i++) {
-		prod_l[i] = 0;
+		prod_uint[i] = 0;
 	}
 
 	/* Convert n1 to uint[] */
 	i = 0;
 	while (n1len > 0) {
 		size_t len = MIN(BC_MUL_UINT_DIGITS, n1len);
-		n1_l[i] = bc_partial_convert_to_uint(n1end, len);
+		n1_uint[i] = bc_partial_convert_to_uint(n1end, len);
 		n1end -= len;
 		n1len -= len;
 		i++;
@@ -133,7 +133,7 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 	i = 0;
 	while (n2len > 0) {
 		size_t len = MIN(BC_MUL_UINT_DIGITS, n2len);
-		n2_l[i] = bc_partial_convert_to_uint(n2end, len);
+		n2_uint[i] = bc_partial_convert_to_uint(n2end, len);
 		n2end -= len;
 		n2len -= len;
 		i++;
@@ -142,17 +142,17 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 	/* Multiplication and addition */
 	for (i = 0; i < n1_arr_size; i++) {
 		for (size_t j = 0; j < n2_arr_size; j++) {
-			prod_l[i + j] += n1_l[i] * n2_l[j];
+			prod_uint[i + j] += n1_uint[i] * n2_uint[j];
 		}
 	}
 
 	/*
-	 * Move a value exceeding 8 digits by carrying to the next digit.
+	 * Move a value exceeding 4/8 digits by carrying to the next digit.
 	 * However, the last digit does nothing.
 	 */
 	for (i = 0; i < prod_arr_size - 1; i++) {
-		prod_l[i + 1] += prod_l[i] / BC_MUL_UINT_OVERFLOW;
-		prod_l[i] %= BC_MUL_UINT_OVERFLOW;
+		prod_uint[i + 1] += prod_uint[i] / BC_MUL_UINT_OVERFLOW;
+		prod_uint[i] %= BC_MUL_UINT_OVERFLOW;
 	}
 
 	/* Convert to bc_num */
@@ -162,8 +162,8 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 	i = 0;
 	while (i < prod_arr_size - 1) {
 		for (size_t j = 0; j < BC_MUL_UINT_DIGITS; j++) {
-			*pend-- = prod_l[i] % BASE;
-			prod_l[i] /= BASE;
+			*pend-- = prod_uint[i] % BASE;
+			prod_uint[i] /= BASE;
 		}
 		i++;
 	}
@@ -173,8 +173,8 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 	 * Also need to fill it to the end with zeros, so loop until the end of the string.
 	 */
 	while (pend >= pptr) {
-		*pend-- = prod_l[i] % BASE;
-		prod_l[i] /= BASE;
+		*pend-- = prod_uint[i] % BASE;
+		prod_uint[i] /= BASE;
 	}
 
 	efree(buf);

From 079f31c2772c8449d7d4168ba3b550f530e1f785 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Tue, 14 May 2024 21:56:20 +0900
Subject: [PATCH 8/9] Unroll the digit conversion loops to save loop cost

---
 ext/bcmath/libbcmath/src/recmul.c | 333 ++++++++++++++++++++++++++----
 1 file changed, 298 insertions(+), 35 deletions(-)

diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c
index 9a832d754c459..b635a20fe7a81 100644
--- a/ext/bcmath/libbcmath/src/recmul.c
+++ b/ext/bcmath/libbcmath/src/recmul.c
@@ -33,6 +33,7 @@
 #include <stddef.h>
 #include <assert.h>
 #include <stdbool.h>
+#include <stdlib.h>
 #include "private.h" /* For _bc_rm_leading_zeros() */
 #include "zend_alloc.h"
 
@@ -45,23 +46,292 @@
 #  define BC_MUL_UINT_OVERFLOW 10000
 #endif
 
+#define BC_UINT_TO_BCD_ONE_DIGIT(num, bcd) do { \
+	bcd = num % 10; \
+	num /= 10; \
+} while (0)
+
 
 /* Multiply utility routines */
 
 /*
  * Converts BCD to uint, going backwards from pointer n by the number of
  * characters specified by len.
+ *
+ * Since the upper limit of len is known, open the loop to save loop cost.
  */
 static inline BC_UINT_T bc_partial_convert_to_uint(const char *n, size_t len)
 {
-	BC_UINT_T num = 0;
-	BC_UINT_T base = 1;
+	BC_UINT_T num = n[0];
+
+	switch (len) {
+		case 1:
+			return num;
+		case 2:
+			num += n[-1] * 10;
+			return num;
+		case 3:
+			num += n[-1] * 10;
+			num += n[-2] * 100;
+			return num;
+		case 4:
+			num += n[-1] * 10;
+			num += n[-2] * 100;
+			num += n[-3] * 1000;
+			return num;
+#if BC_MUL_UINT_DIGITS == 8
+		case 5:
+			num += n[-1] * 10;
+			num += n[-2] * 100;
+			num += n[-3] * 1000;
+			num += n[-4] * 10000;
+			return num;
+		case 6:
+			num += n[-1] * 10;
+			num += n[-2] * 100;
+			num += n[-3] * 1000;
+			num += n[-4] * 10000;
+			num += n[-5] * 100000;
+			return num;
+		case 7:
+			num += n[-1] * 10;
+			num += n[-2] * 100;
+			num += n[-3] * 1000;
+			num += n[-4] * 10000;
+			num += n[-5] * 100000;
+			num += n[-6] * 1000000;
+			return num;
+		case 8:
+			num += n[-1] * 10;
+			num += n[-2] * 100;
+			num += n[-3] * 1000;
+			num += n[-4] * 10000;
+			num += n[-5] * 100000;
+			num += n[-6] * 1000000;
+			num += n[-7] * 10000000;
+			return num;
+#endif
+	}
 
-	for (size_t i = 0; i < len; i++) {
-		num += *n * base;
-		base *= BASE;
-		n--;
+	return num;
+}
+
+/*
+ * Since the number of digits is fixed, open the loop and saves cost.
+ */
+static inline BC_UINT_T bc_partial_convert_to_uint_fixed_digits(const char *n)
+{
+	BC_UINT_T num = n[0];
+	num += n[-1] * 10;
+	num += n[-2] * 100;
+	num += n[-3] * 1000;
+#if BC_MUL_UINT_DIGITS == 8
+	num += n[-4] * 10000;
+	num += n[-5] * 100000;
+	num += n[-6] * 1000000;
+	num += n[-7] * 10000000;
+#endif
+
+	return num;
+}
+
+/*
+ * Convert an unsigned integer to BCD, writing exactly len bytes at bcd[0], bcd[-1], ..., bcd[-(len - 1)]. Up to 16 decimal digits are supported when BC_MUL_UINT_DIGITS == 8 (64-bit), otherwise up to 8.
+ */
+static inline void bc_uint_to_bcd(BC_UINT_T num, char *bcd, size_t len)
+{
+	switch (len) { /* one unrolled case per supported length; there is no default, so an unsupported len writes nothing */
+		case 0: /* nothing to write */
+			return;
+		case 1:
+			*bcd = num % 10; /* single digit: the lowest decimal digit of num */
+			return;
+		case 2:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]); /* macro is defined outside this view; presumably stores num % 10 and divides num by 10 - TODO confirm */
+			bcd[-1] = num % 10; /* in every case the last byte is stored directly, without a further macro step */
+			return;
+		case 3:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			bcd[-2] = num % 10;
+			return;
+		case 4:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			bcd[-3] = num % 10;
+			return;
+		case 5:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			bcd[-4] = num % 10;
+			return;
+		case 6:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			bcd[-5] = num % 10;
+			return;
+		case 7:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+			bcd[-6] = num % 10;
+			return;
+		case 8:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-6]);
+			bcd[-7] = num % 10;
+			return;
+#if BC_MUL_UINT_DIGITS == 8 /* lengths 9-16 are only compiled in the 8-digit configuration */
+		case 9:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-6]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-7]);
+			bcd[-8] = num % 10;
+			return;
+		case 10:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-6]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-7]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-8]);
+			bcd[-9] = num % 10;
+			return;
+		case 11:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-6]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-7]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-8]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-9]);
+			bcd[-10] = num % 10;
+			return;
+		case 12:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-6]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-7]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-8]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-9]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-10]);
+			bcd[-11] = num % 10;
+			return;
+		case 13:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-6]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-7]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-8]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-9]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-10]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-11]);
+			bcd[-12] = num % 10;
+			return;
+		case 14:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-6]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-7]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-8]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-9]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-10]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-11]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-12]);
+			bcd[-13] = num % 10;
+			return;
+		case 15:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-6]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-7]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-8]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-9]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-10]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-11]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-12]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-13]);
+			bcd[-14] = num % 10;
+			return;
+		case 16:
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-6]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-7]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-8]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-9]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-10]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-11]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-12]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-13]);
+			BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-14]);
+			bcd[-15] = num % 10;
+			return;
+#endif /* BC_MUL_UINT_DIGITS == 8 */
 	}
+}
+
+/*
+ * Converts a fixed number of decimal digits of an unsigned integer to BCD: 8 digits when BC_MUL_UINT_DIGITS == 8 (64-bit), otherwise 4.
+ * Since the number of digits is fixed, the loop is unrolled to save cost. Returns num as left after the digit steps.
+ */
+static inline BC_UINT_T bc_uint_to_bcd_fixed_digits(BC_UINT_T num, char *bcd)
+{
+	BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[0]); /* macro is defined outside this view; presumably stores num % 10 and divides num by 10 - TODO confirm */
+	BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-1]); /* output proceeds toward lower addresses, one byte per step */
+	BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-2]);
+	BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-3]);
+#if BC_MUL_UINT_DIGITS == 8 /* 8-digit configuration: four more steps for bcd[-4] .. bcd[-7] */
+	BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-4]);
+	BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-5]);
+	BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-6]);
+	BC_UINT_TO_BCD_ONE_DIGIT(num, bcd[-7]);
+#endif /* BC_MUL_UINT_DIGITS == 8 */
 
 	return num;
 }
@@ -81,13 +351,9 @@ static inline void bc_fast_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc
 
 	size_t prodlen = n1len + n2len;
 	*prod = bc_new_num_nonzeroed(prodlen, 0);
-	char *pptr = (*prod)->n_value;
-	char *pend = pptr + prodlen - 1;
+	char *pend = (*prod)->n_value + prodlen - 1;
 
-	while (pend >= pptr) {
-		*pend-- = prod_uint % BASE;
-		prod_uint /= BASE;
-	}
+	bc_uint_to_bcd(prod_uint, pend, prodlen);
 }
 
 /*
@@ -121,23 +387,27 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 
 	/* Convert n1 to uint[] */
 	i = 0;
-	while (n1len > 0) {
-		size_t len = MIN(BC_MUL_UINT_DIGITS, n1len);
-		n1_uint[i] = bc_partial_convert_to_uint(n1end, len);
-		n1end -= len;
-		n1len -= len;
+	while (n1len >= BC_MUL_UINT_DIGITS) {
+		n1_uint[i] = bc_partial_convert_to_uint_fixed_digits(n1end);
+		n1end -= BC_MUL_UINT_DIGITS;
+		n1len -= BC_MUL_UINT_DIGITS;
 		i++;
 	}
+	if (n1len > 0) {
+		n1_uint[i] = bc_partial_convert_to_uint(n1end, n1len);
+	}
 
 	/* Convert n2 to uint[] */
 	i = 0;
-	while (n2len > 0) {
-		size_t len = MIN(BC_MUL_UINT_DIGITS, n2len);
-		n2_uint[i] = bc_partial_convert_to_uint(n2end, len);
-		n2end -= len;
-		n2len -= len;
+	while (n2len >= BC_MUL_UINT_DIGITS) {
+		n2_uint[i] = bc_partial_convert_to_uint_fixed_digits(n2end);
+		n2end -= BC_MUL_UINT_DIGITS;
+		n2len -= BC_MUL_UINT_DIGITS;
 		i++;
 	}
+	if (n2len > 0) {
+		n2_uint[i] = bc_partial_convert_to_uint(n2end, n2len);
+	}
 
 	/* Multiplication and addition */
 	for (i = 0; i < n1_arr_size; i++) {
@@ -157,25 +427,18 @@ static void bc_standard_mul(bc_num n1, size_t n1len, bc_num n2, int n2len, bc_nu
 
 	/* Convert to bc_num */
 	*prod = bc_new_num_nonzeroed(prodlen, 0);
-	char *pptr = (*prod)->n_value;
-	char *pend = pptr + prodlen - 1;
-	i = 0;
-	while (i < prod_arr_size - 1) {
-		for (size_t j = 0; j < BC_MUL_UINT_DIGITS; j++) {
-			*pend-- = prod_uint[i] % BASE;
-			prod_uint[i] /= BASE;
-		}
-		i++;
+	char *pend = (*prod)->n_value + prodlen - 1;
+	for (i = 0; i < prod_arr_size - 1; i++) {
+		prod_uint[i] = bc_uint_to_bcd_fixed_digits(prod_uint[i], pend);
+		pend -= BC_MUL_UINT_DIGITS;
+		prodlen -= BC_MUL_UINT_DIGITS;
 	}
 
 	/*
 	 * The last digit may carry over.
 	 * Also need to fill it to the end with zeros, so loop until the end of the string.
 	 */
-	while (pend >= pptr) {
-		*pend-- = prod_uint[i] % BASE;
-		prod_uint[i] /= BASE;
-	}
+	bc_uint_to_bcd(prod_uint[i], pend, prodlen);
 
 	efree(buf);
 }

From c2ffd4aab7316d8156f3302cce490ceda128ab04 Mon Sep 17 00:00:00 2001
From: Saki Takamachi <saki@sakiot.com>
Date: Tue, 14 May 2024 21:56:30 +0900
Subject: [PATCH 9/9] use EXPECTED

---
 ext/bcmath/libbcmath/src/recmul.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ext/bcmath/libbcmath/src/recmul.c b/ext/bcmath/libbcmath/src/recmul.c
index b635a20fe7a81..a30325503262c 100644
--- a/ext/bcmath/libbcmath/src/recmul.c
+++ b/ext/bcmath/libbcmath/src/recmul.c
@@ -458,7 +458,7 @@ bc_num bc_multiply(bc_num n1, bc_num n2, size_t scale)
 	size_t prod_scale = MIN(full_scale, MAX(scale, MAX(n1->n_scale, n2->n_scale)));
 
 	/* Do the multiply */
-	if (len1 <= BC_MUL_UINT_DIGITS && len2 <= BC_MUL_UINT_DIGITS) {
+	if (EXPECTED(len1 <= BC_MUL_UINT_DIGITS && len2 <= BC_MUL_UINT_DIGITS)) {
 		bc_fast_mul(n1, len1, n2, len2, &prod);
 	} else {
 		bc_standard_mul(n1, len1, n2, len2, &prod);