Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpz / n_pow_ui.c
diff --git a/gmp/mpz/n_pow_ui.c b/gmp/mpz/n_pow_ui.c

new file mode 100644 (file)

index 0000000..4f3f497
--- /dev/null
+++ b/gmp/mpz/n_pow_ui.c
@@ -0,0 +1,523 @@
+/* mpz_n_pow_ui -- mpn raised to ulong.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2001, 2002, 2005 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+
+/* Change this to "#define TRACE(x) x" for some traces. */
+#define TRACE(x)
+
+
+/* Use this to test the mul_2 code on a CPU without a native version of that
+   routine.  */
+#if 0
+#define mpn_mul_2  refmpn_mul_2
+#define HAVE_NATIVE_mpn_mul_2  1
+#endif
+
+
+/* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
+   ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
+   bsize==2 or >2, but separating that isn't easy because there's shared
+   code both before and after (the size calculations and the powers of 2
+   handling).
+
+   Alternatives:
+
+   It would work to just use the mpn_mul powering loop for 1 and 2 limb
+   bases, but the current separate loop allows mul_1 and mul_2 to be done
+   in-place, which might help cache locality a bit.  If mpn_mul was relaxed
+   to allow source==dest when vn==1 or 2 then some pointer twiddling might
+   let us get the same effect in one loop.
+
+   The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
+   form the biggest possible power of b that fits, only the biggest power of
+   2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
+   using __mp_bases[b].big_base for small b, and thereby get better value
+   from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
+   so would be more complicated than it's worth, and could well end up being
+   a slowdown for small e.  For big e on the other hand the algorithm is
+   dominated by mpn_sqr_n so there wouldn't much of a saving.  The current
+   code can be viewed as simply doing the first few steps of the powering in
+   a single or double limb where possible.
+
+   If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
+   copy made of b is unnecessary.  We could just use the old alloc'ed block
+   and free it at the end.  But arranging this seems like a lot more trouble
+   than it's worth.  */
+
+
+/* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
+   a limb without overflowing.
+   FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */
+
+#define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
+
+
+/* The following are for convenience, they update the size and check the
+   alloc.  */
+
+#define MPN_SQR_N(dst, alloc, src, size)        \
+  do {                                          \
+    ASSERT (2*(size) <= (alloc));               \
+    mpn_sqr_n (dst, src, size);                 \
+    (size) *= 2;                                \
+    (size) -= ((dst)[(size)-1] == 0);           \
+  } while (0)
+
+#define MPN_MUL(dst, alloc, src, size, src2, size2)     \
+  do {                                                  \
+    mp_limb_t  cy;                                      \
+    ASSERT ((size) + (size2) <= (alloc));               \
+    cy = mpn_mul (dst, src, size, src2, size2);         \
+    (size) += (size2) - (cy == 0);                      \
+  } while (0)
+
+#define MPN_MUL_2(ptr, size, alloc, mult)       \
+  do {                                          \
+    mp_limb_t  cy;                              \
+    ASSERT ((size)+2 <= (alloc));               \
+    cy = mpn_mul_2 (ptr, ptr, size, mult);      \
+    (size)++;                                   \
+    (ptr)[(size)] = cy;                         \
+    (size) += (cy != 0);                        \
+  } while (0)
+
+#define MPN_MUL_1(ptr, size, alloc, limb)       \
+  do {                                          \
+    mp_limb_t  cy;                              \
+    ASSERT ((size)+1 <= (alloc));               \
+    cy = mpn_mul_1 (ptr, ptr, size, limb);      \
+    (ptr)[size] = cy;                           \
+    (size) += (cy != 0);                        \
+  } while (0)
+
+#define MPN_LSHIFT(ptr, size, alloc, shift)     \
+  do {                                          \
+    mp_limb_t  cy;                              \
+    ASSERT ((size)+1 <= (alloc));               \
+    cy = mpn_lshift (ptr, ptr, size, shift);    \
+    (ptr)[size] = cy;                           \
+    (size) += (cy != 0);                        \
+  } while (0)
+
+#define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
+  do {                                                  \
+    if ((shift) == 0)                                   \
+      MPN_COPY (dst, src, size);                        \
+    else                                                \
+      {                                                 \
+        mpn_rshift (dst, src, size, shift);             \
+        (size) -= ((dst)[(size)-1] == 0);               \
+      }                                                 \
+  } while (0)
+
+
+/* ralloc and talloc are only wanted for ASSERTs, after the initial space
+   allocations.  Avoid writing values to them in a normal build, to ensure
+   the compiler lets them go dead.  gcc already figures this out itself
+   actually.  */
+
+#define SWAP_RP_TP                                      \
+  do {                                                  \
+    MP_PTR_SWAP (rp, tp);                               \
+    ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
+  } while (0)
+
+
+void
+mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
+{
+  mp_ptr         rp;
+  mp_size_t      rtwos_limbs, ralloc, rsize;
+  int            rneg, i, cnt, btwos, r_bp_overlap;
+  mp_limb_t      blimb, rl;
+  unsigned long  rtwos_bits;
+#if HAVE_NATIVE_mpn_mul_2
+  mp_limb_t      blimb_low, rl_high;
+#else
+  mp_limb_t      b_twolimbs[2];
+#endif
+  TMP_DECL;
+
+  TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
+                 PTR(r), bp, bsize, e, e);
+         mpn_trace ("b", bp, bsize));
+
+  ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
+  ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ABSIZ(r), bp, bsize));
+
+  /* b^0 == 1, including 0^0 == 1 */
+  if (e == 0)
+    {
+      PTR(r)[0] = 1;
+      SIZ(r) = 1;
+      return;
+    }
+
+  /* 0^e == 0 apart from 0^0 above */
+  if (bsize == 0)
+    {
+      SIZ(r) = 0;
+      return;
+    }
+
+  /* Sign of the final result. */
+  rneg = (bsize < 0 && (e & 1) != 0);
+  bsize = ABS (bsize);
+  TRACE (printf ("rneg %d\n", rneg));
+
+  r_bp_overlap = (PTR(r) == bp);
+
+  /* Strip low zero limbs from b. */
+  rtwos_limbs = 0;
+  for (blimb = *bp; blimb == 0; blimb = *++bp)
+    {
+      rtwos_limbs += e;
+      bsize--; ASSERT (bsize >= 1);
+    }
+  TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
+
+  /* Strip low zero bits from b. */
+  count_trailing_zeros (btwos, blimb);
+  blimb >>= btwos;
+  rtwos_bits = e * btwos;
+  rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
+  rtwos_bits %= GMP_NUMB_BITS;
+  TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
+                 btwos, rtwos_limbs, rtwos_bits));
+
+  TMP_MARK;
+
+  rl = 1;
+#if HAVE_NATIVE_mpn_mul_2
+  rl_high = 0;
+#endif
+
+  if (bsize == 1)
+    {
+    bsize_1:
+      /* Power up as far as possible within blimb.  We start here with e!=0,
+         but if e is small then we might reach e==0 and the whole b^e in rl.
+         Notice this code works when blimb==1 too, reaching e==0.  */
+
+      while (blimb <= GMP_NUMB_HALFMAX)
+        {
+          TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
+                         e, blimb, rl));
+          ASSERT (e != 0);
+          if ((e & 1) != 0)
+            rl *= blimb;
+          e >>= 1;
+          if (e == 0)
+            goto got_rl;
+          blimb *= blimb;
+        }
+
+#if HAVE_NATIVE_mpn_mul_2
+      TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
+                     e, blimb, rl));
+
+      /* Can power b once more into blimb:blimb_low */
+      bsize = 2;
+      ASSERT (e != 0);
+      if ((e & 1) != 0)
+       {
+         umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
+         rl >>= GMP_NAIL_BITS;
+       }
+      e >>= 1;
+      umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
+      blimb_low >>= GMP_NAIL_BITS;
+
+    got_rl:
+      TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
+                     e, blimb, blimb_low, rl_high, rl));
+
+      /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
+         final mul_1 or mul_2 rather than a separate lshift.
+         - rl_high:rl mustn't be 1 (since then there's no final mul)
+         - rl_high mustn't overflow
+         - rl_high mustn't change to non-zero, since mul_1+lshift is
+         probably faster than mul_2 (FIXME: is this true?)  */
+
+      if (rtwos_bits != 0
+          && ! (rl_high == 0 && rl == 1)
+          && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
+        {
+          mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
+            | (rl >> (GMP_NUMB_BITS-rtwos_bits));
+          if (! (rl_high == 0 && new_rl_high != 0))
+            {
+              rl_high = new_rl_high;
+              rl <<= rtwos_bits;
+              rtwos_bits = 0;
+              TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
+                             rl_high, rl));
+            }
+        }
+#else
+    got_rl:
+      TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
+                     e, blimb, rl));
+
+      /* Combine left-over rtwos_bits into rl to be handled by the final
+         mul_1 rather than a separate lshift.
+         - rl mustn't be 1 (since then there's no final mul)
+         - rl mustn't overflow  */
+
+      if (rtwos_bits != 0
+          && rl != 1
+          && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
+        {
+          rl <<= rtwos_bits;
+          rtwos_bits = 0;
+          TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
+        }
+#endif
+    }
+  else if (bsize == 2)
+    {
+      mp_limb_t  bsecond = bp[1];
+      if (btwos != 0)
+        blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
+      bsecond >>= btwos;
+      if (bsecond == 0)
+        {
+          /* Two limbs became one after rshift. */
+          bsize = 1;
+          goto bsize_1;
+        }
+
+      TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
+#if HAVE_NATIVE_mpn_mul_2
+      blimb_low = blimb;
+#else
+      bp = b_twolimbs;
+      b_twolimbs[0] = blimb;
+      b_twolimbs[1] = bsecond;
+#endif
+      blimb = bsecond;
+    }
+  else
+    {
+      if (r_bp_overlap || btwos != 0)
+        {
+          mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
+          MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
+          bp = tp;
+          TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
+        }
+#if HAVE_NATIVE_mpn_mul_2
+      /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
+      blimb_low = bp[0];
+#endif
+      blimb = bp[bsize-1];
+
+      TRACE (printf ("big bsize=%ld  ", bsize);
+             mpn_trace ("b", bp, bsize));
+    }
+
+  /* At this point blimb is the most significant limb of the base to use.
+
+     Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
+     limb to round up the division; +1 for multiplies all using an extra
+     limb over the true size; +2 for rl at the end; +1 for lshift at the
+     end.
+
+     The size calculation here is reasonably accurate.  The base is at least
+     half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
+     when it will power up as just over 16, an overestimate of 17/16 =
+     6.25%.  For a 64-bit limb it's half that.
+
+     If e==0 then blimb won't be anything useful (though it will be
+     non-zero), but that doesn't matter since we just end up with ralloc==5,
+     and that's fine for 2 limbs of rl and 1 of lshift.  */
+
+  ASSERT (blimb != 0);
+  count_leading_zeros (cnt, blimb);
+  ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
+  TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
+                 ralloc, bsize, blimb, cnt));
+  MPZ_REALLOC (r, ralloc + rtwos_limbs);
+  rp = PTR(r);
+
+  /* Low zero limbs resulting from powers of 2. */
+  MPN_ZERO (rp, rtwos_limbs);
+  rp += rtwos_limbs;
+
+  if (e == 0)
+    {
+      /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
+         start. */
+      rp[0] = rl;
+      rsize = 1;
+#if HAVE_NATIVE_mpn_mul_2
+      rp[1] = rl_high;
+      rsize += (rl_high != 0);
+#endif
+      ASSERT (rp[rsize-1] != 0);
+    }
+  else
+    {
+      mp_ptr     tp;
+      mp_size_t  talloc;
+
+      /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
+         low bit of e is zero, tp only has to hold the second last power
+         step, which is half the size of the final result.  There's no need
+         to round up the divide by 2, since ralloc includes a +2 for rl
+         which not needed by tp.  In the mpn_mul loop when the low bit of e
+         is 1, tp must hold nearly the full result, so just size it the same
+         as rp.  */
+
+      talloc = ralloc;
+#if HAVE_NATIVE_mpn_mul_2
+      if (bsize <= 2 || (e & 1) == 0)
+        talloc /= 2;
+#else
+      if (bsize <= 1 || (e & 1) == 0)
+        talloc /= 2;
+#endif
+      TRACE (printf ("talloc %ld\n", talloc));
+      tp = TMP_ALLOC_LIMBS (talloc);
+
+      /* Go from high to low over the bits of e, starting with i pointing at
+         the bit below the highest 1 (which will mean i==-1 if e==1).  */
+      count_leading_zeros (cnt, e);
+      i = GMP_LIMB_BITS - cnt - 2;
+
+#if HAVE_NATIVE_mpn_mul_2
+      if (bsize <= 2)
+        {
+          mp_limb_t  mult[2];
+
+          /* Any bsize==1 will have been powered above to be two limbs. */
+          ASSERT (bsize == 2);
+          ASSERT (blimb != 0);
+
+          /* Arrange the final result ends up in r, not in the temp space */
+          if ((i & 1) == 0)
+            SWAP_RP_TP;
+
+          rp[0] = blimb_low;
+          rp[1] = blimb;
+          rsize = 2;
+
+          mult[0] = blimb_low;
+          mult[1] = blimb;
+
+          for ( ; i >= 0; i--)
+            {
+              TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
+                             i, e, rsize, ralloc, talloc);
+                     mpn_trace ("r", rp, rsize));
+
+              MPN_SQR_N (tp, talloc, rp, rsize);
+              SWAP_RP_TP;
+              if ((e & (1L << i)) != 0)
+                MPN_MUL_2 (rp, rsize, ralloc, mult);
+            }
+
+          TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
+          if (rl_high != 0)
+            {
+              mult[0] = rl;
+              mult[1] = rl_high;
+              MPN_MUL_2 (rp, rsize, ralloc, mult);
+            }
+          else if (rl != 1)
+            MPN_MUL_1 (rp, rsize, ralloc, rl);
+        }
+#else
+      if (bsize == 1)
+        {
+          /* Arrange the final result ends up in r, not in the temp space */
+          if ((i & 1) == 0)
+            SWAP_RP_TP;
+
+          rp[0] = blimb;
+          rsize = 1;
+
+          for ( ; i >= 0; i--)
+            {
+              TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
+                             i, e, rsize, ralloc, talloc);
+                     mpn_trace ("r", rp, rsize));
+
+              MPN_SQR_N (tp, talloc, rp, rsize);
+              SWAP_RP_TP;
+              if ((e & (1L << i)) != 0)
+                MPN_MUL_1 (rp, rsize, ralloc, blimb);
+            }
+
+          TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
+          if (rl != 1)
+            MPN_MUL_1 (rp, rsize, ralloc, rl);
+        }
+#endif
+      else
+        {
+          int  parity;
+
+          /* Arrange the final result ends up in r, not in the temp space */
+          ULONG_PARITY (parity, e);
+          if (((parity ^ i) & 1) != 0)
+            SWAP_RP_TP;
+
+          MPN_COPY (rp, bp, bsize);
+          rsize = bsize;
+
+          for ( ; i >= 0; i--)
+            {
+              TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
+                             i, e, rsize, ralloc, talloc);
+                     mpn_trace ("r", rp, rsize));
+
+              MPN_SQR_N (tp, talloc, rp, rsize);
+              SWAP_RP_TP;
+              if ((e & (1L << i)) != 0)
+                {
+                  MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
+                  SWAP_RP_TP;
+                }
+            }
+        }
+    }
+
+  ASSERT (rp == PTR(r) + rtwos_limbs);
+  TRACE (mpn_trace ("end loop r", rp, rsize));
+  TMP_FREE;
+
+  /* Apply any partial limb factors of 2. */
+  if (rtwos_bits != 0)
+    {
+      MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
+      TRACE (mpn_trace ("lshift r", rp, rsize));
+    }
+
+  rsize += rtwos_limbs;
+  SIZ(r) = (rneg ? -rsize : rsize);
+}