Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / generic / mu_div_qr.c
diff --git a/gmp/mpn/generic/mu_div_qr.c b/gmp/mpn/generic/mu_div_qr.c

new file mode 100644 (file)

index 0000000..9049e59
--- /dev/null
+++ b/gmp/mpn/generic/mu_div_qr.c
@@ -0,0 +1,463 @@
+/* mpn_mu_div_qr, mpn_preinv_mu_div_qr.
+
+   Compute Q = floor(N / D) and R = N-QD.  N is nn limbs and D is dn limbs and
+   must be normalized, and Q must be nn-dn limbs.  The requirement that Q is
+   nn-dn limbs (and not nn-dn+1 limbs) was put in place in order to allow us to
+   let N be unmodified during the operation.
+
+   Contributed to the GNU project by Torbjorn Granlund.
+
+   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE.  IT IS
+   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
+   ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
+   RELEASE.
+
+Copyright 2005, 2006, 2007 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+
+/* We use the "misunderstanding algorithm" (MUA), discovered by Paul Zimmermann
+   and Torbjorn Granlund when Torbjorn misunderstood Paul's explanation of
+   Jebelean's bidirectional exact division algorithm.
+
+   The idea of this algorithm is to compute a smaller inverted value than used
+   in the standard Barrett algorithm, and thus save time in the Newton
+   iterations, and pay just a small price when using the inverted value for
+   developing quotient bits.
+
+   Written by Torbjorn Granlund.  Paul Zimmermann suggested the use of the
+   "wrap around" trick.  Based on the GMP divexact code and inspired by code
+   contributed to GMP by Karl Hasselstroem.
+*/
+
+
+/* CAUTION: This code and the code in mu_divappr_q.c should be edited in lockstep.
+
+ Things to work on:
+
+  * Passing k isn't a great interface.  Either 'in' should be passed, or
+    determined by the code.
+
+  * The current mpn_mu_div_qr_itch isn't exactly scientifically written.
+    Scratch space buffer overruns are not unlikely before some analysis is
+    applied.  Since scratch requirements are expected to change, such an
+    analysis will have to wait til things settle.
+
+  * This isn't optimal when the remainder isn't needed, since the final
+    multiplication could be made special and take O(1) time on average, in that
+    case.  This is particularly bad when qn << dn.  At some level, code as in
+    GMP 4 mpn_tdiv_qr should be used, effectively dividing the leading 2qn
+    dividend limbs by the qn divisor limbs.
+
+  * This isn't optimal when the quotient isn't needed, as it might take a lot
+    of space.  The computation is always needed, though, so there is not time
+    to save with special code.
+
+  * The itch/scratch scheme isn't perhaps such a good idea as it once seemed,
+    demonstrated by the fact that the mpn_inv function's scratch needs means
+    that we need to keep a large allocation long after it is needed.  Things
+    are worse as mpn_mul_fft does not accept any scratch parameter, which means
+    we'll have a large memory hole while in mpn_mul_fft.  In general, a peak
+    scratch need in the beginning of a function isn't well-handled by the
+    itch/scratch scheme.
+
+  * Some ideas from comments in divexact.c apply to this code too.
+*/
+
+/* the NOSTAT stuff handles properly the case where files are concatenated */
+#ifdef NOSTAT
+#undef STAT
+#endif
+
+#ifdef STAT
+#undef STAT
+#define STAT(x) x
+#else
+#define NOSTAT
+#define STAT(x)
+#endif
+
+#include <stdlib.h>            /* for NULL */
+#include "gmp.h"
+#include "gmp-impl.h"
+
+
+/* In case k=0 (automatic choice), we distinguish 3 cases:
+   (a) dn < qn:         in = ceil(qn / ceil(qn/dn))
+   (b) dn/3 < qn <= dn: in = ceil(qn / 2)
+   (c) qn < dn/3:       in = qn
+   In all cases we have in <= dn.
+ */
+mp_size_t
+mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
+{
+  mp_size_t in;
+
+  if (k == 0)
+    {
+      mp_size_t b;
+      if (qn > dn)
+       {
+         /* Compute an inverse size that is a nice partition of the quotient.  */
+         b = (qn - 1) / dn + 1;        /* ceil(qn/dn), number of blocks */
+         in = (qn - 1) / b + 1;        /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
+       }
+      else if (3 * qn > dn)
+       {
+         in = (qn - 1) / 2 + 1;        /* b = 2 */
+       }
+      else
+       {
+         in = (qn - 1) / 1 + 1;        /* b = 1 */
+       }
+    }
+  else
+    {
+      mp_size_t xn;
+      xn = MIN (dn, qn);
+      in = (xn - 1) / k + 1;
+    }
+
+  return in;
+}
+
+static mp_limb_t
+mpn_mu_div_qr2 (mp_ptr qp,
+               mp_ptr rp,
+               mp_ptr np,
+               mp_size_t nn,
+               mp_srcptr dp,
+               mp_size_t dn,
+               mp_ptr scratch)
+{
+  mp_size_t qn, in;
+  mp_limb_t cy;
+  mp_ptr ip, tp;
+
+  /* FIXME: We should probably not handle tiny operands, but do it for now.  */
+  if (dn == 1)
+    {
+      rp[0] = mpn_divrem_1 (scratch, 0L, np, nn, dp[0]);
+      MPN_COPY (qp, scratch, nn - 1);
+      return scratch[nn - 1];
+    }
+
+  qn = nn - dn;
+
+  /* Compute the inverse size.  */
+  in = mpn_mu_div_qr_choose_in (qn, dn, 0);
+  ASSERT (in <= dn);
+
+#if 1
+  /* This alternative inverse computation method gets slightly more accurate
+     results.  FIXMEs: (1) Temp allocation needs not analysed (2) itch function
+     not adapted (3) mpn_invert scratch needs not met.  */
+  ip = scratch;
+  tp = scratch + in + 1;
+
+  /* compute an approximate inverse on (in+1) limbs */
+  if (dn == in)
+    {
+      MPN_COPY (tp + 1, dp, in);
+      tp[0] = 1;
+      mpn_invert (ip, tp, in + 1, NULL);
+      MPN_COPY_INCR (ip, ip + 1, in);
+    }
+  else
+    {
+      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
+      if (UNLIKELY (cy != 0))
+       MPN_ZERO (ip, in);
+      else
+       {
+         mpn_invert (ip, tp, in + 1, NULL);
+         MPN_COPY_INCR (ip, ip + 1, in);
+       }
+    }
+#else
+  /* This older inverse computation method gets slightly worse results than the
+     one above.  */
+  ip = scratch;
+  tp = scratch + in;
+
+  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally the
+     inversion function should do this automatically.  */
+  if (dn == in)
+    {
+      tp[in + 1] = 0;
+      MPN_COPY (tp + in + 2, dp, in);
+      mpn_invert (tp, tp + in + 1, in + 1, NULL);
+    }
+  else
+    {
+      mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL);
+    }
+  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
+  if (UNLIKELY (cy != 0))
+    MPN_ZERO (tp + 1, in);
+  MPN_COPY (ip, tp + 1, in);
+#endif
+
+/* We can't really handle qh = 1 like this since we'd here clobber N, which is
+   not allowed in the way we've defined this function's API.  */
+#if 0
+  qh = mpn_cmp (np + qn, dp, dn) >= 0;
+  if (qh != 0)
+    mpn_sub_n (np + qn, np + qn, dp, dn);
+#endif
+
+  mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);
+
+/*  return qh; */
+  return 0;
+}
+
+void
+mpn_preinv_mu_div_qr (mp_ptr qp,
+                     mp_ptr rp,
+                     mp_ptr np,
+                     mp_size_t nn,
+                     mp_srcptr dp,
+                     mp_size_t dn,
+                     mp_srcptr ip,
+                     mp_size_t in,
+                     mp_ptr scratch)
+{
+  mp_size_t qn;
+  mp_limb_t cy;
+  mp_ptr tp;
+  mp_limb_t r;
+
+  qn = nn - dn;
+
+  if (qn == 0)
+    {
+      MPN_COPY (rp, np, dn);
+      return;
+    }
+
+  tp = scratch;
+
+  np += qn;
+  qp += qn;
+
+  MPN_COPY (rp, np, dn);
+
+  while (qn > 0)
+    {
+      if (qn < in)
+       {
+         ip += in - qn;
+         in = qn;
+       }
+      np -= in;
+      qp -= in;
+
+      /* Compute the next block of quotient limbs by multiplying the inverse I
+        by the upper part of the partial remainder R.  */
+      mpn_mul_n (tp, rp + dn - in, ip, in);            /* mulhi  */
+      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);  /* I's msb implicit */
+      ASSERT_ALWAYS (cy == 0);                 /* FIXME */
+
+      /* Compute the product of the quotient block and the divisor D, to be
+        subtracted from the partial remainder combined with new limbs from the
+        dividend N.  We only really need the low dn limbs.  */
+#if WANT_FFT
+      if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
+       {
+         /* Use the wrap-around trick.  */
+         mp_size_t m, wn;
+         int k;
+
+         k = mpn_fft_best_k (dn + 1, 0);
+         m = mpn_fft_next_size (dn + 1, k);
+         wn = dn + in - m;                     /* number of wrapped limbs */
+
+         mpn_mul_fft (tp, m, dp, dn, qp, in, k);
+
+         if (wn > 0)
+           {
+             cy = mpn_add_n (tp, tp, rp + dn - wn, wn);
+             mpn_incr_u (tp + wn, cy);
+
+             cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0;
+             mpn_decr_u (tp, cy);
+           }
+       }
+      else
+#endif
+       mpn_mul (tp, dp, dn, qp, in);           /* dn+in limbs, high 'in' cancels */
+
+      r = rp[dn - in] - tp[dn];
+
+      /* Subtract the product from the partial remainder combined with new
+        limbs from the dividend N, generating a new partial remainder R.  */
+      if (dn != in)
+       {
+         cy = mpn_sub_n (tp, np, tp, in);      /* get next 'in' limbs from N */
+         cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
+         MPN_COPY (rp, tp, dn);                /* FIXME: try to avoid this */
+       }
+      else
+       {
+         cy = mpn_sub_n (rp, np, tp, in);      /* get next 'in' limbs from N */
+       }
+
+      STAT (int i; int err = 0;
+           static int errarr[5]; static int err_rec; static int tot);
+
+      /* Check the remainder R and adjust the quotient as needed.  */
+      r -= cy;
+      while (r != 0)
+       {
+         /* We loop 0 times with about 69% probability, 1 time with about 31%
+            probability, 2 times with about 0.6% probability, if inverse is
+            computed as recommended.  */
+         mpn_incr_u (qp, 1);
+         cy = mpn_sub_n (rp, rp, dp, dn);
+         r -= cy;
+         STAT (err++);
+       }
+      if (mpn_cmp (rp, dp, dn) >= 0)
+       {
+         /* This is executed with about 76% probability.  */
+         mpn_incr_u (qp, 1);
+         cy = mpn_sub_n (rp, rp, dp, dn);
+         STAT (err++);
+       }
+
+      STAT (
+           tot++;
+           errarr[err]++;
+           if (err > err_rec)
+             err_rec = err;
+           if (tot % 0x10000 == 0)
+             {
+               for (i = 0; i <= err_rec; i++)
+                 printf ("  %d(%.1f%%)", errarr[i], 100.0*errarr[i]/tot);
+               printf ("\n");
+             }
+           );
+
+      qn -= in;
+    }
+}
+
+#define THRES 100              /* FIXME: somewhat arbitrary */
+
+#ifdef CHECK
+#undef THRES
+#define THRES 1
+#endif
+
+mp_limb_t
+mpn_mu_div_qr (mp_ptr qp,
+              mp_ptr rp,
+              mp_ptr np,
+              mp_size_t nn,
+              mp_srcptr dp,
+              mp_size_t dn,
+              mp_ptr scratch)
+{
+  mp_size_t qn;
+
+  qn = nn - dn;
+  if (qn + THRES < dn)
+    {
+      /* |______________|________|   dividend                            nn
+               |_______|________|   divisor                              dn
+
+               |______|             quotient (prel)                      qn
+
+                |_______________|   quotient * ignored-part-of(divisor)  dn-1
+      */
+
+      mp_limb_t cy, x;
+
+      if (mpn_cmp (np + nn - (qn + 1), dp + dn - (qn + 1), qn + 1) >= 0)
+       {
+         /* Quotient is 111...111, could optimize this rare case at some point.  */
+         mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
+         return 0;
+       }
+
+      /* Compute a preliminary quotient and a partial remainder by dividing the
+        most significant limbs of each operand.  */
+      mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
+                     np + nn - (2 * qn + 1), 2 * qn + 1,
+                     dp + dn - (qn + 1), qn + 1,
+                     scratch);
+
+      /* Multiply the quotient by the divisor limbs ignored above.  */
+      if (dn - (qn + 1) > qn)
+       mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
+      else
+       mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */
+
+      cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
+      cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
+                      rp + nn - (2 * qn + 1),
+                      scratch + nn - (2 * qn + 1),
+                      qn, cy);
+      x = rp[dn - 1];
+      rp[dn - 1] = x - cy;
+      if (cy > x)
+       {
+         mpn_decr_u (qp, 1);
+         mpn_add_n (rp, rp, dp, dn);
+       }
+    }
+  else
+    {
+      return mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
+    }
+
+  return 0;                    /* FIXME */
+}
+
+mp_size_t
+mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
+{
+  mp_size_t qn, m;
+  int k;
+
+  /* FIXME: This isn't very carefully written, and might grossly overestimate
+     the amount of scratch needed, and might perhaps also underestimate it,
+     leading to potential buffer overruns.  In particular k=0 might lead to
+     gross overestimates.  */
+
+  if (dn == 1)
+    return nn;
+
+  qn = nn - dn;
+  if (qn >= dn)
+    {
+      k = mpn_fft_best_k (dn + 1, 0);
+      m = mpn_fft_next_size (dn + 1, k);
+      return (mua_k <= 1
+             ? 6 * dn
+             : m + 2 * dn);
+    }
+  else
+    {
+      k = mpn_fft_best_k (dn + 1, 0);
+      m = mpn_fft_next_size (dn + 1, k);
+      return (mua_k <= 1
+             ? m + 4 * qn
+             : m + 2 * qn);
+    }
+}