--- /dev/null
+/* mpn_sqrtrem -- square root and remainder
+
+ Contributed to the GNU project by Paul Zimmermann (most code) and
+ Torbjorn Granlund (mpn_sqrtrem1).
+
+ THE FUNCTIONS IN THIS FILE EXCEPT mpn_sqrtrem ARE INTERNAL WITH A
+ MUTABLE INTERFACE. IT IS ONLY SAFE TO REACH THEM THROUGH DOCUMENTED
+ INTERFACES. IN FACT, IT IS ALMOST GUARANTEED THAT THEY WILL CHANGE OR
+ DISAPPEAR IN A FUTURE GMP RELEASE.
+
+Copyright 1999, 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation,
+Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
+
+
+/* See "Karatsuba Square Root", reference in gmp.texi. */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
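+/* Table note (our annotation, not the original authors' wording): entry i,
+   for the top 9 bits abits = i + 0x80 of a normalized a0, is approximately
+   round(2^8 / sqrt(abits/2^9)), i.e. a 9-bit fixed-point value of
+   1/sqrt(a0) with a0 scaled into [1/4, 1).  For instance abits = 0x100
+   gives round(256*sqrt(2)) = 362 = 0x16a, the sqrt(1/100) entry below.  */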
+static const unsigned short invsqrttab[384] =
+{
+ 0x1ff,0x1fd,0x1fb,0x1f9,0x1f7,0x1f5,0x1f3,0x1f2, /* sqrt(1/80)..sqrt(1/87) */
+ 0x1f0,0x1ee,0x1ec,0x1ea,0x1e9,0x1e7,0x1e5,0x1e4, /* sqrt(1/88)..sqrt(1/8f) */
+ 0x1e2,0x1e0,0x1df,0x1dd,0x1db,0x1da,0x1d8,0x1d7, /* sqrt(1/90)..sqrt(1/97) */
+ 0x1d5,0x1d4,0x1d2,0x1d1,0x1cf,0x1ce,0x1cc,0x1cb, /* sqrt(1/98)..sqrt(1/9f) */
+ 0x1c9,0x1c8,0x1c6,0x1c5,0x1c4,0x1c2,0x1c1,0x1c0, /* sqrt(1/a0)..sqrt(1/a7) */
+ 0x1be,0x1bd,0x1bc,0x1ba,0x1b9,0x1b8,0x1b7,0x1b5, /* sqrt(1/a8)..sqrt(1/af) */
+ 0x1b4,0x1b3,0x1b2,0x1b0,0x1af,0x1ae,0x1ad,0x1ac, /* sqrt(1/b0)..sqrt(1/b7) */
+ 0x1aa,0x1a9,0x1a8,0x1a7,0x1a6,0x1a5,0x1a4,0x1a3, /* sqrt(1/b8)..sqrt(1/bf) */
+ 0x1a2,0x1a0,0x19f,0x19e,0x19d,0x19c,0x19b,0x19a, /* sqrt(1/c0)..sqrt(1/c7) */
+ 0x199,0x198,0x197,0x196,0x195,0x194,0x193,0x192, /* sqrt(1/c8)..sqrt(1/cf) */
+ 0x191,0x190,0x18f,0x18e,0x18d,0x18c,0x18c,0x18b, /* sqrt(1/d0)..sqrt(1/d7) */
+ 0x18a,0x189,0x188,0x187,0x186,0x185,0x184,0x183, /* sqrt(1/d8)..sqrt(1/df) */
+ 0x183,0x182,0x181,0x180,0x17f,0x17e,0x17e,0x17d, /* sqrt(1/e0)..sqrt(1/e7) */
+ 0x17c,0x17b,0x17a,0x179,0x179,0x178,0x177,0x176, /* sqrt(1/e8)..sqrt(1/ef) */
+ 0x176,0x175,0x174,0x173,0x172,0x172,0x171,0x170, /* sqrt(1/f0)..sqrt(1/f7) */
+ 0x16f,0x16f,0x16e,0x16d,0x16d,0x16c,0x16b,0x16a, /* sqrt(1/f8)..sqrt(1/ff) */
+ 0x16a,0x169,0x168,0x168,0x167,0x166,0x166,0x165, /* sqrt(1/100)..sqrt(1/107) */
+ 0x164,0x164,0x163,0x162,0x162,0x161,0x160,0x160, /* sqrt(1/108)..sqrt(1/10f) */
+ 0x15f,0x15e,0x15e,0x15d,0x15c,0x15c,0x15b,0x15a, /* sqrt(1/110)..sqrt(1/117) */
+ 0x15a,0x159,0x159,0x158,0x157,0x157,0x156,0x156, /* sqrt(1/118)..sqrt(1/11f) */
+ 0x155,0x154,0x154,0x153,0x153,0x152,0x152,0x151, /* sqrt(1/120)..sqrt(1/127) */
+ 0x150,0x150,0x14f,0x14f,0x14e,0x14e,0x14d,0x14d, /* sqrt(1/128)..sqrt(1/12f) */
+ 0x14c,0x14b,0x14b,0x14a,0x14a,0x149,0x149,0x148, /* sqrt(1/130)..sqrt(1/137) */
+ 0x148,0x147,0x147,0x146,0x146,0x145,0x145,0x144, /* sqrt(1/138)..sqrt(1/13f) */
+ 0x144,0x143,0x143,0x142,0x142,0x141,0x141,0x140, /* sqrt(1/140)..sqrt(1/147) */
+ 0x140,0x13f,0x13f,0x13e,0x13e,0x13d,0x13d,0x13c, /* sqrt(1/148)..sqrt(1/14f) */
+ 0x13c,0x13b,0x13b,0x13a,0x13a,0x139,0x139,0x139, /* sqrt(1/150)..sqrt(1/157) */
+ 0x138,0x138,0x137,0x137,0x136,0x136,0x135,0x135, /* sqrt(1/158)..sqrt(1/15f) */
+ 0x135,0x134,0x134,0x133,0x133,0x132,0x132,0x132, /* sqrt(1/160)..sqrt(1/167) */
+ 0x131,0x131,0x130,0x130,0x12f,0x12f,0x12f,0x12e, /* sqrt(1/168)..sqrt(1/16f) */
+ 0x12e,0x12d,0x12d,0x12d,0x12c,0x12c,0x12b,0x12b, /* sqrt(1/170)..sqrt(1/177) */
+ 0x12b,0x12a,0x12a,0x129,0x129,0x129,0x128,0x128, /* sqrt(1/178)..sqrt(1/17f) */
+ 0x127,0x127,0x127,0x126,0x126,0x126,0x125,0x125, /* sqrt(1/180)..sqrt(1/187) */
+ 0x124,0x124,0x124,0x123,0x123,0x123,0x122,0x122, /* sqrt(1/188)..sqrt(1/18f) */
+ 0x121,0x121,0x121,0x120,0x120,0x120,0x11f,0x11f, /* sqrt(1/190)..sqrt(1/197) */
+ 0x11f,0x11e,0x11e,0x11e,0x11d,0x11d,0x11d,0x11c, /* sqrt(1/198)..sqrt(1/19f) */
+ 0x11c,0x11b,0x11b,0x11b,0x11a,0x11a,0x11a,0x119, /* sqrt(1/1a0)..sqrt(1/1a7) */
+ 0x119,0x119,0x118,0x118,0x118,0x118,0x117,0x117, /* sqrt(1/1a8)..sqrt(1/1af) */
+ 0x117,0x116,0x116,0x116,0x115,0x115,0x115,0x114, /* sqrt(1/1b0)..sqrt(1/1b7) */
+ 0x114,0x114,0x113,0x113,0x113,0x112,0x112,0x112, /* sqrt(1/1b8)..sqrt(1/1bf) */
+ 0x112,0x111,0x111,0x111,0x110,0x110,0x110,0x10f, /* sqrt(1/1c0)..sqrt(1/1c7) */
+ 0x10f,0x10f,0x10f,0x10e,0x10e,0x10e,0x10d,0x10d, /* sqrt(1/1c8)..sqrt(1/1cf) */
+ 0x10d,0x10c,0x10c,0x10c,0x10c,0x10b,0x10b,0x10b, /* sqrt(1/1d0)..sqrt(1/1d7) */
+ 0x10a,0x10a,0x10a,0x10a,0x109,0x109,0x109,0x109, /* sqrt(1/1d8)..sqrt(1/1df) */
+ 0x108,0x108,0x108,0x107,0x107,0x107,0x107,0x106, /* sqrt(1/1e0)..sqrt(1/1e7) */
+ 0x106,0x106,0x106,0x105,0x105,0x105,0x104,0x104, /* sqrt(1/1e8)..sqrt(1/1ef) */
+ 0x104,0x104,0x103,0x103,0x103,0x103,0x102,0x102, /* sqrt(1/1f0)..sqrt(1/1f7) */
+ 0x102,0x102,0x101,0x101,0x101,0x101,0x100,0x100 /* sqrt(1/1f8)..sqrt(1/1ff) */
+};
+
+/* Compute s = floor(sqrt(a0)), and *rp = a0 - s^2. */
+
+#if GMP_NUMB_BITS > 32
+#define MAGIC CNST_LIMB(0x10000000000) /* 0xffe7debbfc < MAGIC < 0x232b1850f410 */
+#else
+#define MAGIC CNST_LIMB(0x100000) /* 0xfee6f < MAGIC < 0x29cbc8 */
+#endif
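+
+/* (Annotation, not from the original authors:) MAGIC acts as a fudge term
+   subtracted from the residual a0 - t^2 in the last Newton step, biasing
+   x0 to err on the low side by at most one; the single conditional
+   increment at the end of mpn_sqrtrem1 then restores exactness.  The
+   bounds quoted above appear to delimit the admissible bias.  */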
+
+static mp_limb_t
+mpn_sqrtrem1 (mp_ptr rp, mp_limb_t a0)
+{
+#if GMP_NUMB_BITS > 32
+ mp_limb_t a1;
+#endif
+ mp_limb_t x0, t2, t, x2;
+ unsigned abits;
+
+ ASSERT_ALWAYS (GMP_NAIL_BITS == 0);
+ ASSERT_ALWAYS (GMP_LIMB_BITS == 32 || GMP_LIMB_BITS == 64);
+ ASSERT (a0 >= GMP_NUMB_HIGHBIT / 2);
+
+ /* Use Newton iterations for approximating 1/sqrt(a) instead of sqrt(a),
+ since we can do the former without division. As part of the last
+ iteration convert from 1/sqrt(a) to sqrt(a). */
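+
+ /* For reference, the classical Newton step for f(x) = 1/x^2 - a is
+    x' = x*(3 - a*x^2)/2 = x + x*(1 - a*x^2)/2,
+    roughly doubling the number of correct bits per step.  The fixed-point
+    code below follows that shape, with shifts standing in for the
+    scalings and for the division by 2.  */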
+
+ abits = a0 >> (GMP_LIMB_BITS - 1 - 8); /* extract bits for table lookup */
+ x0 = invsqrttab[abits - 0x80]; /* initial 1/sqrt(a) */
+
+ /* x0 is now an 8-bit approximation of 1/sqrt(a0) */
+
+#if GMP_NUMB_BITS > 32
+ a1 = a0 >> (GMP_LIMB_BITS - 1 - 32);
+ t = (mp_limb_signed_t) (CNST_LIMB(0x2000000000000) - 0x30000 - a1 * x0 * x0) >> 16;
+ x0 = (x0 << 16) + ((mp_limb_signed_t) (x0 * t) >> (16+2));
+
+ /* x0 is now a 16-bit approximation of 1/sqrt(a0) */
+
+ t2 = x0 * (a0 >> (32-8));
+ t = t2 >> 25;
+ t = ((mp_limb_signed_t) ((a0 << 14) - t * t - MAGIC) >> (32-8));
+ x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 15);
+ x0 >>= 32;
+#else
+ t2 = x0 * (a0 >> (16-8));
+ t = t2 >> 13;
+ t = ((mp_limb_signed_t) ((a0 << 6) - t * t - MAGIC) >> (16-8));
+ x0 = t2 + ((mp_limb_signed_t) (x0 * t) >> 7);
+ x0 >>= 16;
+#endif
+
+ /* x0 is now a full-limb approximation of sqrt(a0) */
+
+ x2 = x0 * x0;
+ if (x2 + 2*x0 <= a0 - 1)
+ {
+ x2 += 2*x0 + 1;
+ x0++;
+ }
+
+ *rp = a0 - x2;
+ return x0;
+}
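+
+/* Worked example (64-bit limbs, our illustration): for a0 = 2^63,
+   mpn_sqrtrem1 returns s = 3037000499 = floor(sqrt(2^63)) and sets
+   *rp = 2^63 - s^2 = 9223372036854775808 - 9223372030926249001
+       = 5928526807.  */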
+
+
+#define Prec (GMP_NUMB_BITS >> 1)
+
+/* Same as mpn_sqrtrem, but for nn = 2 and {np, 2} normalized;
+ returns cc such that {np, 2} = sp[0]^2 + cc*2^GMP_NUMB_BITS + rp[0] */
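+/* Roughly (our gloss of the code below): mpn_sqrtrem1 gives the Prec-bit
+   root sp0 of the high limb; the low Prec bits of the full root are then
+   q = floor((r*2^Prec + (np0 >> Prec)) / (2*sp0)), the schoolbook
+   square-root step in base 2^Prec, followed by a correction when the
+   remainder would go negative.  */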
+static mp_limb_t
+mpn_sqrtrem2 (mp_ptr sp, mp_ptr rp, mp_srcptr np)
+{
+ mp_limb_t qhl, q, u, np0, sp0, rp0, q2;
+ int cc;
+
+ ASSERT (np[1] >= GMP_NUMB_HIGHBIT / 2);
+
+ np0 = np[0];
+ sp0 = mpn_sqrtrem1 (rp, np[1]);
+ qhl = 0;
+ rp0 = rp[0];
+ while (rp0 >= sp0)
+ {
+ qhl++;
+ rp0 -= sp0;
+ }
+ /* now rp0 < sp0 < 2^Prec */
+ rp0 = (rp0 << Prec) + (np0 >> Prec);
+ u = 2 * sp0;
+ q = rp0 / u;
+ u = rp0 - q * u;
+ q += (qhl & 1) << (Prec - 1);
+ qhl >>= 1; /* if qhl = 1 then necessarily q = 0, since qhl*2^Prec + q <= 2^Prec */
+ /* now we have (initial rp0)<<Prec + (np0>>Prec) = (qhl<<Prec + q) * (2*sp0) + u */
+ sp0 = ((sp0 + qhl) << Prec) + q;
+ cc = u >> Prec;
+ rp0 = ((u << Prec) & GMP_NUMB_MASK) + (np0 & (((mp_limb_t) 1 << Prec) - 1));
+ /* subtract q * q or qhl*2^(2*Prec) from rp */
+ q2 = q * q;
+ cc -= (rp0 < q2) + qhl;
+ rp0 -= q2;
+ /* now subtract 2*q*2^Prec + 2^(2*Prec) if qhl is set */
+ if (cc < 0)
+ {
+ if (sp0 != 0)
+ {
+ rp0 += sp0;
+ cc += rp0 < sp0;
+ }
+ else
+ cc++;
+ --sp0;
+ rp0 += sp0;
+ cc += rp0 < sp0;
+ }
+
+ rp[0] = rp0;
+ sp[0] = sp0;
+ return cc;
+}
+
+/* Writes in {sp, n} the square root (rounded towards zero) of {np, 2n},
+ and in {np, n} the low n limbs of the remainder; returns the high
+ limb of the remainder (which is 0 or 1).
+ Assumes {np, 2n} is normalized, i.e. np[2n-1] >= B/4,
+ where B = 2^GMP_NUMB_BITS. */
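+/* Our gloss of the recursion, following the cited paper: write the input
+   as a3*b^3 + a2*b^2 + a1*b + a0 with b = B^l (l = n/2, h = n - l, the
+   a_i taken with the obvious limb counts); then
+       (s1, r1) = sqrtrem (a3*b + a2)       -- the recursive call
+       (q, u)   = divrem (r1*b + a1, 2*s1)  -- the mpn_divrem plus the
+                                               right shift of sp below
+       s        = s1*b + q
+       r        = u*b + a0 - q^2
+   and if r < 0, fix up with s -= 1 and r += 2*s + 1 (the c < 0 branch).  */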
+static mp_limb_t
+mpn_dc_sqrtrem (mp_ptr sp, mp_ptr np, mp_size_t n)
+{
+ mp_limb_t q; /* carry out of {sp, n} */
+ int c, b; /* carry out of remainder */
+ mp_size_t l, h;
+
+ ASSERT (np[2 * n - 1] >= GMP_NUMB_HIGHBIT / 2);
+
+ if (n == 1)
+ c = mpn_sqrtrem2 (sp, np, np);
+ else
+ {
+ l = n / 2;
+ h = n - l;
+ q = mpn_dc_sqrtrem (sp + l, np + 2 * l, h);
+ if (q != 0)
+ mpn_sub_n (np + 2 * l, np + 2 * l, sp + l, h);
+ q += mpn_divrem (sp, 0, np + l, n, sp + l, h);
+ c = sp[0] & 1;
+ mpn_rshift (sp, sp, l, 1);
+ sp[l - 1] |= (q << (GMP_NUMB_BITS - 1)) & GMP_NUMB_MASK;
+ q >>= 1;
+ if (c != 0)
+ c = mpn_add_n (np + l, np + l, sp + l, h);
+ mpn_sqr_n (np + n, sp, l);
+ b = q + mpn_sub_n (np, np, np + n, 2 * l);
+ c -= (l == h) ? b : mpn_sub_1 (np + 2 * l, np + 2 * l, 1, (mp_limb_t) b);
+ q = mpn_add_1 (sp + l, sp + l, h, q);
+
+ if (c < 0)
+ {
+ c += mpn_addmul_1 (np, sp, n, CNST_LIMB(2)) + 2 * q;
+ c -= mpn_sub_1 (np, np, n, CNST_LIMB(1));
+ q -= mpn_sub_1 (sp, sp, n, CNST_LIMB(1));
+ }
+ }
+
+ return c;
+}
+
+
+mp_size_t
+mpn_sqrtrem (mp_ptr sp, mp_ptr rp, mp_srcptr np, mp_size_t nn)
+{
+ mp_limb_t *tp, s0[1], cc, high, rl;
+ int c;
+ mp_size_t rn, tn;
+ TMP_DECL;
+
+ ASSERT (nn >= 0);
+ ASSERT_MPN (np, nn);
+
+ /* If OP is zero, both results are zero. */
+ if (nn == 0)
+ return 0;
+
+ ASSERT (np[nn - 1] != 0);
+ ASSERT (rp == NULL || MPN_SAME_OR_SEPARATE_P (np, rp, nn));
+ ASSERT (rp == NULL || ! MPN_OVERLAP_P (sp, (nn + 1) / 2, rp, nn));
+ ASSERT (! MPN_OVERLAP_P (sp, (nn + 1) / 2, np, nn));
+
+ high = np[nn - 1];
+ if (nn == 1 && (high & GMP_NUMB_HIGHBIT))
+ {
+ mp_limb_t r;
+ sp[0] = mpn_sqrtrem1 (&r, high);
+ if (rp != NULL)
+ rp[0] = r;
+ return r != 0;
+ }
+ count_leading_zeros (c, high);
+ c -= GMP_NAIL_BITS;
+
+ c = c / 2; /* we have to shift left by 2c bits to normalize {np, nn} */
+ tn = (nn + 1) / 2; /* 2*tn is the smallest even integer >= nn */
+
+ TMP_MARK;
+ if (nn % 2 != 0 || c > 0)
+ {
+ tp = TMP_ALLOC_LIMBS (2 * tn);
+ tp[0] = 0; /* needed only when 2*tn > nn, but saves a test */
+ if (c != 0)
+ mpn_lshift (tp + 2 * tn - nn, np, nn, 2 * c);
+ else
+ MPN_COPY (tp + 2 * tn - nn, np, nn);
+ rl = mpn_dc_sqrtrem (sp, tp, tn);
+ /* We have 2^(2k)*N = S^2 + R where k = c + (2tn-nn)*GMP_NUMB_BITS/2,
+ thus 2^(2k)*N = (S-s0)^2 + 2*S*s0 - s0^2 + R where s0=S mod 2^k */
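+ /* Hence floor(sqrt(N)) = (S - s0)/2^k = S >> k, and the remainder of N
+ is (R + 2*S*s0 - s0^2) / 2^(2k); the lines below accumulate
+ R + 2*s0*S - s0^2 in {tp, tn+1}, and the final shift by 2k bits
+ (c = c << 1 below) extracts the remainder.  */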
+ c += (nn % 2) * GMP_NUMB_BITS / 2; /* c now represents k */
+ s0[0] = sp[0] & (((mp_limb_t) 1 << c) - 1); /* S mod 2^k */
+ rl += mpn_addmul_1 (tp, sp, tn, 2 * s0[0]); /* R = R + 2*s0*S */
+ cc = mpn_submul_1 (tp, s0, 1, s0[0]);
+ rl -= (tn > 1) ? mpn_sub_1 (tp + 1, tp + 1, tn - 1, cc) : cc;
+ mpn_rshift (sp, sp, tn, c);
+ tp[tn] = rl;
+ if (rp == NULL)
+ rp = tp;
+ c = c << 1;
+ if (c < GMP_NUMB_BITS)
+ tn++;
+ else
+ {
+ tp++;
+ c -= GMP_NUMB_BITS;
+ }
+ if (c != 0)
+ mpn_rshift (rp, tp, tn, c);
+ else
+ MPN_COPY_INCR (rp, tp, tn);
+ rn = tn;
+ }
+ else
+ {
+ if (rp == NULL)
+ rp = TMP_ALLOC_LIMBS (nn);
+ if (rp != np)
+ MPN_COPY (rp, np, nn);
+ rn = tn + (rp[tn] = mpn_dc_sqrtrem (sp, rp, tn));
+ }
+
+ MPN_NORMALIZE (rp, rn);
+
+ TMP_FREE;
+ return rn;
+}
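+
+/* Usage sketch (illustrative only, not part of the library; assumes
+   64-bit nail-free limbs).  For N = 2^64 + 1:
+
+     mp_limb_t np[2] = { 1, 1 };            -- N = 1*2^64 + 1
+     mp_limb_t sp[1], rp[2];
+     mp_size_t rn = mpn_sqrtrem (sp, rp, np, 2);
+
+   yields sp[0] = 2^32, rn = 1 and rp[0] = 1, since
+   floor(sqrt(2^64 + 1)) = 2^32 with remainder 1; a return value of 0
+   would mean the operand is a perfect square.  */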