libtom
diff --git a/‎bn_mp_balance_mul.c‎
Lines changed: 79 additions & 0 deletions b/‎bn_mp_balance_mul.c‎
Lines changed: 79 additions & 0 deletions
diff --git a/‎bn_mp_mul.c‎
Lines changed: 36 additions & 0 deletions b/‎bn_mp_mul.c‎
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,79 @@
+#include "tommath_private.h"
+#ifdef BN_MP_BALANCE_MUL_C
+/* LibTomMath, multiple-precision integer library -- Tom St Denis */
+/* SPDX-License-Identifier: Unlicense */
+
+/*
+ * Multiply two numbers of very different length ("unbalanced" operands).
+ *
+ * The longer operand is cut into slices as long as the shorter one; each
+ * slice is multiplied with the shorter operand by mp_mul, shifted to its
+ * digit position and summed up. The shorter operand acts like the single
+ * digit in a schoolbook single-digit multiplication.
+ *
+ * a, b  the factors; c may alias either of them
+ * c     receives a * b, it is left untouched on failure
+ *
+ * Returns MP_OKAY or the error code of the first failing primitive.
+ */
+int mp_balance_mul(const mp_int *a, const mp_int *b, mp_int *c)
+{
+   int e, sign, count, len, i, j, bsize;
+   const mp_int *A, *B;
+   mp_int a0, tmp, r;
+
+   /* Make sure that A is the larger one */
+   if (a->used < b->used) {
+      A = b;
+      B = a;
+   } else {
+      A = a;
+      B = b;
+   }
+   bsize = B->used;
+   /* Take the sign now, c is allowed to be the same object as a or b */
+   sign = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+
+   if ((e = mp_init_size(&a0, bsize + 2)) != MP_OKAY) {
+      return e;
+   }
+   if ((e = mp_init_multi(&tmp, &r, NULL)) != MP_OKAY) {
+      mp_clear(&a0);
+      return e;
+   }
+
+   /*
+    * Walk over A in steps of bsize digits; the last slice is shorter if
+    * A->used is not a multiple of bsize. A zero B (bsize == 0) needs no
+    * work at all (r is already zero) and would never advance j.
+    */
+   for (i = 0, j = 0; (bsize > 0) && (j < A->used); i++) {
+      /* Cut a slice off of A */
+      len = MIN(bsize, A->used - j);
+      for (count = 0; count < len; count++) {
+         a0.dp[count] = A->dp[j++];
+      }
+      a0.used = len;
+      /* The top digits of a slice can be zero, a0 must be normalized */
+      mp_clamp(&a0);
+      /* Multiply with B */
+      if ((e = mp_mul(&a0, B, &tmp)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+      /* Shift tmp to the correct position */
+      if ((e = mp_lshd(&tmp, bsize * i)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+      /* Add to output */
+      if ((e = mp_add(&r, &tmp, &r)) != MP_OKAY) {
+         goto LBL_ERR;
+      }
+   }
+
+   mp_exch(&r, c);
+   c->sign = (c->used > 0) ? sign : MP_ZPOS;
+LBL_ERR:
+   mp_clear_multi(&a0, &tmp, &r, NULL);
+   return e;
+}
+#endif
@@ -7,7 +7,41 @@
 int mp_mul(const mp_int *a, const mp_int *b, mp_int *c)
 {
    int     res, neg;
+#ifdef BN_MP_BALANCE_MUL_C
+   int len_b, len_a;
+#endif
    neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+#ifdef BN_MP_BALANCE_MUL_C
+   len_a = a->used;
+   len_b = b->used;
+
+   if (len_a == len_b) {
+      goto GO_ON;
+   }
+   /*
+    * Check sizes. The smaller one needs to be larger than the Karatsuba cut-off.
+    * The bigger one needs to be at least about one KARATSUBA_MUL_CUTOFF bigger
+    * to make some sense, but it depends on architecture, OS, position of the
+    * stars... so YMMV.
+    * Using it to cut the input into slices small enough for fast_s_mp_mul_digs
+    * was actually slower on the author's machine, but YMMV.
+    */
+   if ((MIN(len_a, len_b) < KARATSUBA_MUL_CUTOFF)
+       || ((MAX(len_a, len_b)) / 2 < KARATSUBA_MUL_CUTOFF)) {
+      goto GO_ON;
+   }
+   /*
+    * Not much effect was observed below a ratio of 1:2, but again: YMMV.
+    */
+   if ((MAX(len_a, len_b) /  MIN(len_a, len_b)) < 2) {
+      goto GO_ON;
+   }
+
+   res = mp_balance_mul(a,b,c);
+   goto END;
+
+GO_ON:
+#endif
 
    /* use Toom-Cook? */
 #ifdef BN_MP_TOOM_MUL_C
@@ -45,7 +79,9 @@ int mp_mul(const mp_int *a, const mp_int *b, mp_int *c)
 #endif
          }
       }
+END:
    c->sign = (c->used > 0) ? neg : MP_ZPOS;
    return res;
 }
 #endif
+