--- /dev/null
+/* mpz_powm(res,base,exp,mod) -- Set RES to (base**exp) mod MOD.
+
+ Contributed by Paul Zimmermann.
+
+Copyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2005, 2009
+Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+#ifdef BERKELEY_MP
+#include "mp.h"
+#endif
+
+/* Compute t = a mod m, a is defined by (ap,an), m is defined by (mp,mn), and
+ t is defined by (tp,mn). */
+static void
+reduce (mp_ptr tp, mp_srcptr ap, mp_size_t an, mp_srcptr mp, mp_size_t mn)
+{
+ mp_ptr qp;
+ TMP_DECL;
+
+ TMP_MARK;
+ qp = TMP_ALLOC_LIMBS (an - mn + 1);
+
+ mpn_tdiv_qr (qp, tp, 0L, ap, an, mp, mn);
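+  /* Only the remainder, written to tp, is wanted; qp is scratch.  */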
+
+ TMP_FREE;
+}
+
+#if REDUCE_EXPONENT
+/* Return phi(t), the Euler totient, i.e. the order of the
+   multiplicative group mod t; e.g. phi(12) = 4.  */
+static mp_limb_t
+phi (mp_limb_t t)
+{
+ mp_limb_t d, m, go;
+
+ go = 1;
+
+ if (t % 2 == 0)
+ {
+ t = t / 2;
+ while (t % 2 == 0)
+ {
+ go *= 2;
+ t = t / 2;
+ }
+ }
+ for (d = 3;; d += 2)
+ {
+ m = d - 1;
+ for (;;)
+ {
+ unsigned long int q = t / d;
+ if (q < d)
+ {
+ if (t <= 1)
+ return go;
+ if (t == d)
+ return go * m;
+ return go * (t - 1);
+ }
+ if (t != q * d)
+ break;
+ go *= m;
+ m = d;
+ t = q;
+ }
+ }
+}
+#endif
+
+/* Average number of calls to redc for an exponent of n bits
+   with the sliding window algorithm of base 2^k: the optimum is
+   obtained for the value of k which minimizes 2^(k-1)+n/(k+1):
+
+   n\k    4     5     6     7     8
+   128    156*  159   171   200   261
+   256    309   307*  316   343   403
+   512    617   607*  610   632   688
+   1024   1231  1204  1195* 1207  1256
+   2048   2461  2399  2366  2360* 2396
+   4096   4918  4787  4707  4665* 4670
+*/
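+/* For example, with n = 1024 and k = 6 the total is roughly
+   n + 2^(k-1) + n/(k+1) = 1024 + 32 + 146 ~ 1202 modular products
+   (squarings plus multiplications), in line with the 1195 tabulated
+   above.  */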
+
+/* Use REDC instead of usual reduction for sizes < POWM_THRESHOLD.  With
+   REDC each modular multiplication costs about 2*n^2 limb operations,
+   whereas usual reduction costs 3*K(n), where K(n) is the cost of a
+   multiplication using Karatsuba, and a division is assumed to cost
+   2*K(n), for example using the Burnikel-Ziegler algorithm.  This gives
+   a theoretical threshold of a*SQR_KARATSUBA_THRESHOLD, with
+   a = (3/2)^(1/(2-ln(3)/ln(2))) ~ 2.66.  */
+/* For now, also disable REDC when MOD is even, as the inverse can't handle
+ that. At some point, we might want to make the code faster for that case,
+ perhaps using CRR. */
+
+#ifndef POWM_THRESHOLD
+#define POWM_THRESHOLD ((8 * SQR_KARATSUBA_THRESHOLD) / 3)
+#endif
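+/* 8/3 ~ 2.67 is the nearest simple rational to the theoretical
+   constant a ~ 2.66 derived above.  */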
+
+#define HANDLE_NEGATIVE_EXPONENT 1
+#undef REDUCE_EXPONENT
+
+void
+#ifndef BERKELEY_MP
+mpz_powm (mpz_ptr r, mpz_srcptr b, mpz_srcptr e, mpz_srcptr m)
+#else /* BERKELEY_MP */
+pow (mpz_srcptr b, mpz_srcptr e, mpz_srcptr m, mpz_ptr r)
+#endif /* BERKELEY_MP */
+{
+ mp_ptr xp, tp, qp, gp, this_gp;
+ mp_srcptr bp, ep, mp;
+ mp_size_t bn, es, en, mn, xn;
+ mp_limb_t invm, c;
+ unsigned long int enb;
+ mp_size_t i, K, j, l, k;
+ int m_zero_cnt, e_zero_cnt;
+ int sh;
+ int use_redc;
+#if HANDLE_NEGATIVE_EXPONENT
+ mpz_t new_b;
+#endif
+#if REDUCE_EXPONENT
+ mpz_t new_e;
+#endif
+ TMP_DECL;
+
+ mp = PTR(m);
+ mn = ABSIZ (m);
+ if (mn == 0)
+ DIVIDE_BY_ZERO;
+
+ TMP_MARK;
+
+ es = SIZ (e);
+ if (es <= 0)
+ {
+ if (es == 0)
+ {
+	  /* Exponent is zero, result is 1 mod m, i.e., 1 or 0 depending
+	     on whether m equals 1.  */
+ SIZ(r) = (mn == 1 && mp[0] == 1) ? 0 : 1;
+ PTR(r)[0] = 1;
+ TMP_FREE; /* we haven't really allocated anything here */
+ return;
+ }
+#if HANDLE_NEGATIVE_EXPONENT
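+      /* Negative exponent: b^(-e) = (b^(-1))^e mod m, so replace b by
+	 its inverse mod m and continue with |e|.  */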
+ MPZ_TMP_INIT (new_b, mn + 1);
+
+ if (! mpz_invert (new_b, b, m))
+ DIVIDE_BY_ZERO;
+ b = new_b;
+ es = -es;
+#else
+ DIVIDE_BY_ZERO;
+#endif
+ }
+ en = es;
+
+#if REDUCE_EXPONENT
+ /* Reduce exponent by dividing it by phi(m) when m small. */
+ if (mn == 1 && mp[0] < 0x7fffffffL && en * GMP_NUMB_BITS > 150)
+ {
+ MPZ_TMP_INIT (new_e, 2);
+ mpz_mod_ui (new_e, e, phi (mp[0]));
+ e = new_e;
+ }
+#endif
+
+ use_redc = mn < POWM_THRESHOLD && mp[0] % 2 != 0;
+ if (use_redc)
+ {
+ /* invm = -1/m mod 2^BITS_PER_MP_LIMB, must have m odd */
+ modlimb_invert (invm, mp[0]);
+ invm = -invm;
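+      /* Illustration with 8-bit limbs and mp[0] = 3: modlimb_invert
+	 gives 171 (3*171 = 513 = 2*256 + 1), so invm = 256 - 171 = 85,
+	 and indeed 3*85 = 255 = -1 mod 256.  */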
+ }
+ else
+ {
+ /* Normalize m (i.e. make its most significant bit set) as required by
+ division functions below. */
+ count_leading_zeros (m_zero_cnt, mp[mn - 1]);
+ m_zero_cnt -= GMP_NAIL_BITS;
+ if (m_zero_cnt != 0)
+ {
+ mp_ptr new_mp;
+ new_mp = TMP_ALLOC_LIMBS (mn);
+ mpn_lshift (new_mp, mp, mn, m_zero_cnt);
+ mp = new_mp;
+ }
+ }
+
+ /* Determine optimal value of k, the number of exponent bits we look at
+ at a time. */
+ count_leading_zeros (e_zero_cnt, PTR(e)[en - 1]);
+ e_zero_cnt -= GMP_NAIL_BITS;
+ enb = en * GMP_NUMB_BITS - e_zero_cnt; /* number of bits of exponent */
+ k = 1;
+ K = 2;
+ while (2 * enb > K * (2 + k * (3 + k)))
+ {
+ k++;
+ K *= 2;
+ if (k == 10) /* cap allocation */
+ break;
+ }
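+  /* E.g. enb = 512 stops at k = 5 (K = 32) and enb = 1024 at k = 6,
+     matching the optima marked in the table above.  */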
+
+ tp = TMP_ALLOC_LIMBS (2 * mn);
+ qp = TMP_ALLOC_LIMBS (mn + 1);
+
+ gp = __GMP_ALLOCATE_FUNC_LIMBS (K / 2 * mn);
+
+ /* Compute x*R^n where R=2^BITS_PER_MP_LIMB. */
+ bn = ABSIZ (b);
+ bp = PTR(b);
+ /* Handle |b| >= m by computing b mod m. FIXME: It is not strictly necessary
+ for speed or correctness to do this when b and m have the same number of
+ limbs, perhaps remove mpn_cmp call. */
+ if (bn > mn || (bn == mn && mpn_cmp (bp, mp, mn) >= 0))
+ {
+ /* Reduce possibly huge base while moving it to gp[0]. Use a function
+ call to reduce, since we don't want the quotient allocation to
+ live until function return. */
+ if (use_redc)
+ {
+ reduce (tp + mn, bp, bn, mp, mn); /* b mod m */
+ MPN_ZERO (tp, mn);
+	  mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn); /* unnormalized! */
+ }
+ else
+ {
+ reduce (gp, bp, bn, mp, mn);
+ }
+ }
+ else
+ {
+ /* |b| < m. We pad out operands to become mn limbs, which simplifies
+ the rest of the function, but slows things down when |b| << m. */
+ if (use_redc)
+ {
+ MPN_ZERO (tp, mn);
+ MPN_COPY (tp + mn, bp, bn);
+ MPN_ZERO (tp + mn + bn, mn - bn);
+ mpn_tdiv_qr (qp, gp, 0L, tp, 2 * mn, mp, mn);
+ }
+ else
+ {
+ MPN_COPY (gp, bp, bn);
+ MPN_ZERO (gp + bn, mn - bn);
+ }
+ }
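+  /* gp[0..mn-1] now holds the reduced base; with use_redc it is in
+     Montgomery form x*R^n mod m (n = mn limbs): the mn zero low limbs
+     written into tp scale the value by R^n before the division.  */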
+
+  /* Precompute xx = x^2 and the odd powers g[i] = x^(2*i+1),
+     0 <= i < K/2 (Montgomery form when use_redc).  */
+
+ xp = TMP_ALLOC_LIMBS (mn);
+ mpn_sqr_n (tp, gp, mn);
+ if (use_redc)
+ mpn_redc_1 (xp, tp, mp, mn, invm); /* xx = x^2*R^n */
+ else
+ mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
+ this_gp = gp;
+ for (i = 1; i < K / 2; i++)
+ {
+ mpn_mul_n (tp, this_gp, xp, mn);
+ this_gp += mn;
+ if (use_redc)
+ mpn_redc_1 (this_gp, tp, mp, mn, invm); /* g[i] = x^(2i+1)*R^n */
+ else
+ mpn_tdiv_qr (qp, this_gp, 0L, tp, 2 * mn, mp, mn);
+ }
+
+ /* Start the real stuff. */
+ ep = PTR (e);
+ i = en - 1; /* current index */
+ c = ep[i]; /* current limb */
+ sh = GMP_NUMB_BITS - e_zero_cnt; /* significant bits in ep[i] */
+ sh -= k; /* index of lower bit of ep[i] to take into account */
+ if (sh < 0)
+ { /* k-sh extra bits are needed */
+ if (i > 0)
+ {
+ i--;
+ c <<= (-sh);
+ sh += GMP_NUMB_BITS;
+ c |= ep[i] >> sh;
+ }
+ }
+ else
+ c >>= sh;
+
+ for (j = 0; c % 2 == 0; j++)
+ c >>= 1;
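+  /* c is now odd, so gp + mn * (c >> 1) addresses x^c in the odd-power
+     table; the j bits stripped above are restored by the j squarings
+     below.  */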
+
+ MPN_COPY (xp, gp + mn * (c >> 1), mn);
+ while (--j >= 0)
+ {
+ mpn_sqr_n (tp, xp, mn);
+ if (use_redc)
+ mpn_redc_1 (xp, tp, mp, mn, invm);
+ else
+ mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
+ }
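+  /* xp now holds x^c0 (in Montgomery form when use_redc), where c0 is
+     the leading window of at most k exponent bits.  */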
+
+ while (i > 0 || sh > 0)
+ {
+ c = ep[i];
+ l = k; /* number of bits treated */
+ sh -= l;
+ if (sh < 0)
+ {
+ if (i > 0)
+ {
+ i--;
+ c <<= (-sh);
+ sh += GMP_NUMB_BITS;
+ c |= ep[i] >> sh;
+ }
+ else
+ {
+ l += sh; /* last chunk of bits from e; l < k */
+ }
+ }
+ else
+ c >>= sh;
+ c &= ((mp_limb_t) 1 << l) - 1;
+
+ /* This while loop implements the sliding window improvement--loop while
+ the most significant bit of c is zero, squaring xx as we go. */
+ while ((c >> (l - 1)) == 0 && (i > 0 || sh > 0))
+ {
+ mpn_sqr_n (tp, xp, mn);
+ if (use_redc)
+ mpn_redc_1 (xp, tp, mp, mn, invm);
+ else
+ mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
+ if (sh != 0)
+ {
+ sh--;
+ c = (c << 1) + ((ep[i] >> sh) & 1);
+ }
+ else
+ {
+ i--;
+ sh = GMP_NUMB_BITS - 1;
+ c = (c << 1) + (ep[i] >> sh);
+ }
+ }
+
+ /* Replace xx by xx^(2^l)*x^c. */
+ if (c != 0)
+ {
+ for (j = 0; c % 2 == 0; j++)
+ c >>= 1;
+
+	  /* c0 = c * 2^j, i.e. xx^(2^l)*x^c0 = (xx^(2^(l-j))*x^c)^(2^j) */
+ l -= j;
+ while (--l >= 0)
+ {
+ mpn_sqr_n (tp, xp, mn);
+ if (use_redc)
+ mpn_redc_1 (xp, tp, mp, mn, invm);
+ else
+ mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
+ }
+ mpn_mul_n (tp, xp, gp + mn * (c >> 1), mn);
+ if (use_redc)
+ mpn_redc_1 (xp, tp, mp, mn, invm);
+ else
+ mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
+ }
+ else
+ j = l; /* case c=0 */
+ while (--j >= 0)
+ {
+ mpn_sqr_n (tp, xp, mn);
+ if (use_redc)
+ mpn_redc_1 (xp, tp, mp, mn, invm);
+ else
+ mpn_tdiv_qr (qp, xp, 0L, tp, 2 * mn, mp, mn);
+ }
+ }
+
+ if (use_redc)
+ {
+ /* Convert back xx to xx/R^n. */
+ MPN_COPY (tp, xp, mn);
+ MPN_ZERO (tp + mn, mn);
+ mpn_redc_1 (xp, tp, mp, mn, invm);
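+      /* The redc result may still be >= m (but is below 2*m), hence a
+	 single conditional subtraction completes the reduction.  */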
+ if (mpn_cmp (xp, mp, mn) >= 0)
+ mpn_sub_n (xp, xp, mp, mn);
+ }
+ else
+ {
+ if (m_zero_cnt != 0)
+ {
+ mp_limb_t cy;
+ cy = mpn_lshift (tp, xp, mn, m_zero_cnt);
+ tp[mn] = cy;
+ mpn_tdiv_qr (qp, xp, 0L, tp, mn + (cy != 0), mp, mn);
+ mpn_rshift (xp, xp, mn, m_zero_cnt);
+ }
+ }
+ xn = mn;
+ MPN_NORMALIZE (xp, xn);
+
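+  /* A negative base with an odd exponent makes the mathematical result
+     negative; fold it into [0, m) as m - x (x is nonzero here).  */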
+ if ((ep[0] & 1) && SIZ(b) < 0 && xn != 0)
+ {
+ mp = PTR(m); /* want original, unnormalized m */
+ mpn_sub (xp, mp, mn, xp, xn);
+ xn = mn;
+ MPN_NORMALIZE (xp, xn);
+ }
+ MPZ_REALLOC (r, xn);
+ SIZ (r) = xn;
+ MPN_COPY (PTR(r), xp, xn);
+
+ __GMP_FREE_FUNC_LIMBS (gp, K / 2 * mn);
+ TMP_FREE;
+}