X-Git-Url: https://oss.titaniummirror.com/gitweb/?a=blobdiff_plain;f=gmp%2Fmpn%2Fgeneric%2Fmu_div_qr.c;fp=gmp%2Fmpn%2Fgeneric%2Fmu_div_qr.c;h=9049e5907af24c9a92eff7ff44b4fad433ad347c;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/generic/mu_div_qr.c b/gmp/mpn/generic/mu_div_qr.c
new file mode 100644
index 00000000..9049e590
--- /dev/null
+++ b/gmp/mpn/generic/mu_div_qr.c
@@ -0,0 +1,463 @@
/* mpn_mu_div_qr, mpn_preinv_mu_div_qr.

   Compute Q = floor(N / D) and R = N-QD.  N is nn limbs, D is dn limbs and
   must be normalized, and Q must be nn-dn limbs.  The requirement that Q is
   nn-dn limbs (and not nn-dn+1 limbs) was put in place to allow us to leave
   N unmodified during the operation.

   Contributed to the GNU project by Torbjorn Granlund.

   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH A MUTABLE INTERFACE.  IT IS
   ONLY SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS
   ALMOST GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP
   RELEASE.

Copyright 2005, 2006, 2007 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/* We use the "misunderstanding algorithm" (MUA), discovered by Paul
   Zimmermann and Torbjorn Granlund when Torbjorn misunderstood Paul's
   explanation of Jebelean's bidirectional exact division algorithm.

   The idea of this algorithm is to compute a smaller inverted value than
   used in the standard Barrett algorithm, and thus save time in the Newton
   iterations, and pay just a small price when using the inverted value for
   developing quotient bits.

   Written by Torbjorn Granlund.  Paul Zimmermann suggested the use of the
   "wrap around" trick.  Based on the GMP divexact code and inspired by code
   contributed to GMP by Karl Hasselstroem.  */


/* CAUTION: This code and the code in mu_divappr_q.c should be edited in
   lockstep.

   Things to work on:

   * Passing k isn't a great interface.  Either 'in' should be passed, or it
     should be determined by the code.

   * The current mpn_mu_div_qr_itch isn't exactly scientifically written.
     Scratch space buffer overruns are not unlikely before some analysis is
     applied.  Since scratch requirements are expected to change, such an
     analysis will have to wait until things settle.

   * This isn't optimal when the remainder isn't needed, since the final
     multiplication could be made special and take O(1) time on average, in
     that case.  This is particularly bad when qn << dn.  At some level, code
     as in GMP 4 mpn_tdiv_qr should be used, effectively dividing the leading
     2qn dividend limbs by the qn divisor limbs.

   * This isn't optimal when the quotient isn't needed, as it might take a
     lot of space.  The computation is always needed, though, so there is no
     time to save with special code.

   * The itch/scratch scheme isn't perhaps such a good idea as it once
     seemed, demonstrated by the fact that the mpn_invert function's scratch
     needs mean that we need to keep a large allocation long after it is
     needed.  Things are worse as mpn_mul_fft does not accept any scratch
     parameter, which means we'll have a large memory hole while in
     mpn_mul_fft.  In general, a peak scratch need at the beginning of a
     function isn't well-handled by the itch/scratch scheme.

   * Some ideas from the comments in divexact.c apply to this code too.  */

/* The NOSTAT stuff properly handles the case where files are concatenated.  */
#ifdef NOSTAT
#undef STAT
#endif

#ifdef STAT
#undef STAT
#define STAT(x) x
#else
#define NOSTAT
#define STAT(x)
#endif

#include <stdlib.h>		/* for NULL */
#include "gmp.h"
#include "gmp-impl.h"


/* In case k=0 (automatic choice), we distinguish 3 cases:
   (a) dn < qn:          in = ceil(qn / ceil(qn/dn))
   (b) dn/3 < qn <= dn:  in = ceil(qn / 2)
   (c) qn < dn/3:        in = qn
   In all cases we have in <= dn.  */
mp_size_t
mpn_mu_div_qr_choose_in (mp_size_t qn, mp_size_t dn, int k)
{
  mp_size_t in;

  if (k == 0)
    {
      mp_size_t b;
      if (qn > dn)
	{
	  /* Compute an inverse size that is a nice partition of the
	     quotient.  */
	  b = (qn - 1) / dn + 1;	/* ceil(qn/dn), number of blocks */
	  in = (qn - 1) / b + 1;	/* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
	}
      else if (3 * qn > dn)
	{
	  in = (qn - 1) / 2 + 1;	/* b = 2 */
	}
      else
	{
	  in = (qn - 1) / 1 + 1;	/* b = 1 */
	}
    }
  else
    {
      mp_size_t xn;
      xn = MIN (dn, qn);
      in = (xn - 1) / k + 1;
    }

  return in;
}
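/* Illustrative sketch (not part of the original file): a few concrete
   values for the k=0 cases above.  The function name example_choose_in and
   the operand sizes are ours, chosen only to illustrate; the expected
   results follow directly from the code above.  */
#if 0
static void
example_choose_in (void)
{
  /* Case (a), dn < qn: qn = 10, dn = 4 gives b = ceil(10/4) = 3 blocks and
     in = ceil(10/3) = 4, so the quotient is developed in blocks of 4+4+2
     limbs and in <= dn holds.  */
  ASSERT (mpn_mu_div_qr_choose_in (10, 4, 0) == 4);

  /* Case (b), dn/3 < qn <= dn: qn = 3, dn = 4 gives in = ceil(3/2) = 2.  */
  ASSERT (mpn_mu_div_qr_choose_in (3, 4, 0) == 2);

  /* Case (c), qn < dn/3: qn = 1, dn = 4 gives in = qn = 1.  */
  ASSERT (mpn_mu_div_qr_choose_in (1, 4, 0) == 1);
}
#endif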
static mp_limb_t
mpn_mu_div_qr2 (mp_ptr qp,
		mp_ptr rp,
		mp_ptr np,
		mp_size_t nn,
		mp_srcptr dp,
		mp_size_t dn,
		mp_ptr scratch)
{
  mp_size_t qn, in;
  mp_limb_t cy;
  mp_ptr ip, tp;

  /* FIXME: We should probably not handle tiny operands, but do it for now.  */
  if (dn == 1)
    {
      rp[0] = mpn_divrem_1 (scratch, 0L, np, nn, dp[0]);
      MPN_COPY (qp, scratch, nn - 1);
      return scratch[nn - 1];
    }

  qn = nn - dn;

  /* Compute the inverse size.  */
  in = mpn_mu_div_qr_choose_in (qn, dn, 0);
  ASSERT (in <= dn);

#if 1
  /* This alternative inverse computation method gets slightly more accurate
     results.  FIXMEs: (1) temp allocation needs not analysed, (2) itch
     function not adapted, (3) mpn_invert scratch needs not met.  */
  ip = scratch;
  tp = scratch + in + 1;

  /* Compute an approximate inverse on in+1 limbs.  */
  if (dn == in)
    {
      MPN_COPY (tp + 1, dp, in);
      tp[0] = 1;
      mpn_invert (ip, tp, in + 1, NULL);
      MPN_COPY_INCR (ip, ip + 1, in);
    }
  else
    {
      cy = mpn_add_1 (tp, dp + dn - (in + 1), in + 1, 1);
      if (UNLIKELY (cy != 0))
	MPN_ZERO (ip, in);
      else
	{
	  mpn_invert (ip, tp, in + 1, NULL);
	  MPN_COPY_INCR (ip, ip + 1, in);
	}
    }
#else
  /* This older inverse computation method gets slightly worse results than
     the one above.  */
  ip = scratch;
  tp = scratch + in;

  /* Compute inverse of D to in+1 limbs, then round to 'in' limbs.  Ideally
     the inversion function should do this automatically.  */
  if (dn == in)
    {
      tp[in + 1] = 0;
      MPN_COPY (tp + in + 2, dp, in);
      mpn_invert (tp, tp + in + 1, in + 1, NULL);
    }
  else
    {
      mpn_invert (tp, dp + dn - (in + 1), in + 1, NULL);
    }
  cy = mpn_sub_1 (tp, tp, in + 1, GMP_NUMB_HIGHBIT);
  if (UNLIKELY (cy != 0))
    MPN_ZERO (tp + 1, in);
  MPN_COPY (ip, tp + 1, in);
#endif

  /* We can't really handle qh = 1 like this, since we'd here clobber N,
     which is not allowed in the way we've defined this function's API.  */
#if 0
  qh = mpn_cmp (np + qn, dp, dn) >= 0;
  if (qh != 0)
    mpn_sub_n (np + qn, np + qn, dp, dn);
#endif

  mpn_preinv_mu_div_qr (qp, rp, np, nn, dp, dn, ip, in, scratch + in);

/* return qh; */
  return 0;
}
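/* Illustrative sketch (not part of the original file): a single-limb toy
   model of the msb-implicit approximate inverse computed above, using a
   16-bit base so the arithmetic fits plain C types.  All names and values
   here are ours; GMP's mpn_invert works on limb vectors, not on single
   words like this.  */
#if 0
#include <stdint.h>
#include <assert.h>

static void
toy_inverse_demo (void)
{
  const uint64_t B = 1 << 16;		/* toy limb base */
  uint32_t d = 0x8765;			/* normalized divisor: high bit set */
  /* Approximate inverse with implicit msb: B + i ~= B^2/d.  */
  uint32_t i = (uint32_t) ((B * B - 1) / d - B);
  uint32_t n1 = 0x1234, n0 = 0x5678;	/* two-limb dividend, n1 < d */
  uint64_t n = ((uint64_t) n1 << 16) | n0;
  /* Quotient guess: mulhi(n1, i) plus n1, the latter being the contribution
     of the inverse's implicit msb -- the scalar analogue of the mpn_mul_n +
     mpn_add_n step in mpn_preinv_mu_div_qr below.  */
  uint32_t q = (uint32_t) (((uint64_t) n1 * i) >> 16) + n1;
  uint32_t r = (uint32_t) (n - (uint64_t) q * d);
  /* The guess is never too large and at most a few units too small.  */
  while (r >= d)
    {
      q++;
      r -= d;
    }
  assert ((uint64_t) q * d + r == n);
}
#endif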
void
mpn_preinv_mu_div_qr (mp_ptr qp,
		      mp_ptr rp,
		      mp_ptr np,
		      mp_size_t nn,
		      mp_srcptr dp,
		      mp_size_t dn,
		      mp_srcptr ip,
		      mp_size_t in,
		      mp_ptr scratch)
{
  mp_size_t qn;
  mp_limb_t cy;
  mp_ptr tp;
  mp_limb_t r;

  qn = nn - dn;

  if (qn == 0)
    {
      MPN_COPY (rp, np, dn);
      return;
    }

  tp = scratch;

  np += qn;
  qp += qn;

  MPN_COPY (rp, np, dn);

  while (qn > 0)
    {
      if (qn < in)
	{
	  ip += in - qn;
	  in = qn;
	}
      np -= in;
      qp -= in;

      /* Compute the next block of quotient limbs by multiplying the inverse
	 I by the upper part of the partial remainder R.  */
      mpn_mul_n (tp, rp + dn - in, ip, in);		/* mulhi */
      cy = mpn_add_n (qp, tp + in, rp + dn - in, in);	/* I's msb implicit */
      ASSERT_ALWAYS (cy == 0);				/* FIXME */

      /* Compute the product of the quotient block and the divisor D, to be
	 subtracted from the partial remainder combined with new limbs from
	 the dividend N.  We only really need the low dn limbs.  */
#if WANT_FFT
      if (ABOVE_THRESHOLD (dn, MUL_FFT_MODF_THRESHOLD))
	{
	  /* Use the wrap-around trick.  */
	  mp_size_t m, wn;
	  int k;

	  k = mpn_fft_best_k (dn + 1, 0);
	  m = mpn_fft_next_size (dn + 1, k);
	  wn = dn + in - m;			/* number of wrapped limbs */

	  mpn_mul_fft (tp, m, dp, dn, qp, in, k);

	  if (wn > 0)
	    {
	      cy = mpn_add_n (tp, tp, rp + dn - wn, wn);
	      mpn_incr_u (tp + wn, cy);

	      cy = mpn_cmp (rp + dn - in, tp + dn, m - dn) < 0;
	      mpn_decr_u (tp, cy);
	    }
	}
      else
#endif
	mpn_mul (tp, dp, dn, qp, in);	/* dn+in limbs, high 'in' cancels */

      r = rp[dn - in] - tp[dn];

      /* Subtract the product from the partial remainder combined with new
	 limbs from the dividend N, generating a new partial remainder R.  */
      if (dn != in)
	{
	  cy = mpn_sub_n (tp, np, tp, in);	/* get next 'in' limbs from N */
	  cy = mpn_sub_nc (tp + in, rp, tp + in, dn - in, cy);
	  MPN_COPY (rp, tp, dn);		/* FIXME: try to avoid this */
	}
      else
	{
	  cy = mpn_sub_n (rp, np, tp, in);	/* get next 'in' limbs from N */
	}

      STAT (int i; int err = 0;
	    static int errarr[5]; static int err_rec; static int tot);

      /* Check the remainder R and adjust the quotient as needed.  */
      r -= cy;
      while (r != 0)
	{
	  /* We loop 0 times with about 69% probability, 1 time with about
	     31% probability, and 2 times with about 0.6% probability, if the
	     inverse is computed as recommended.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  r -= cy;
	  STAT (err++);
	}
      if (mpn_cmp (rp, dp, dn) >= 0)
	{
	  /* This is executed with about 76% probability.  */
	  mpn_incr_u (qp, 1);
	  cy = mpn_sub_n (rp, rp, dp, dn);
	  STAT (err++);
	}

      STAT (
	    tot++;
	    errarr[err]++;
	    if (err > err_rec)
	      err_rec = err;
	    if (tot % 0x10000 == 0)
	      {
		for (i = 0; i <= err_rec; i++)
		  printf (" %d(%.1f%%)", errarr[i], 100.0 * errarr[i] / tot);
		printf ("\n");
	      }
	    );

      qn -= in;
    }
}
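/* Illustrative sketch (not part of the original file): the wrap-around
   trick in miniature, with B = 10 and m = 3.  A multiply mod B^m + 1
   returns p mod (B^m + 1); since B^m == -1 (mod B^m + 1), we have
   p == p_low - p_high (mod B^m + 1), so the low part of the product can be
   recovered by adding back its high part -- which the WANT_FFT branch above
   knows approximately from the partial remainder (the high 'in' limbs
   cancel), up to the small adjustment made by the mpn_cmp/mpn_decr_u step.
   All names here are ours.  */
#if 0
#include <assert.h>

static void
toy_wraparound_demo (void)
{
  unsigned p = 123456;			/* full product, 6 digits */
  unsigned mmod = 1001;			/* B^m + 1 with B = 10, m = 3 */
  unsigned wrapped = p % mmod;		/* what a mod B^m+1 multiply returns */
  unsigned p_high = p / 1000;		/* the wrapped high part, known */
  unsigned p_low = (wrapped + p_high) % mmod;
  assert (p_low == p % 1000);		/* low part recovered */
}
#endif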
#define THRES 100		/* FIXME: somewhat arbitrary */

#ifdef CHECK
#undef THRES
#define THRES 1
#endif

mp_limb_t
mpn_mu_div_qr (mp_ptr qp,
	       mp_ptr rp,
	       mp_ptr np,
	       mp_size_t nn,
	       mp_srcptr dp,
	       mp_size_t dn,
	       mp_ptr scratch)
{
  mp_size_t qn;

  qn = nn - dn;
  if (qn + THRES < dn)
    {
      /* |______________|________|   dividend                            nn
		|_______|________|   divisor                             dn

		|______|             quotient (prel)                     qn

		 |_______________|   quotient * ignored-part-of(divisor) dn-1
      */

      mp_limb_t cy, x;

      if (mpn_cmp (np + nn - (qn + 1), dp + dn - (qn + 1), qn + 1) >= 0)
	{
	  /* Quotient is 111...111, could optimize this rare case at some
	     point.  */
	  mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
	  return 0;
	}

      /* Compute a preliminary quotient and a partial remainder by dividing
	 the most significant limbs of each operand.  */
      mpn_mu_div_qr2 (qp, rp + nn - (2 * qn + 1),
		      np + nn - (2 * qn + 1), 2 * qn + 1,
		      dp + dn - (qn + 1), qn + 1,
		      scratch);

      /* Multiply the quotient by the divisor limbs ignored above.  */
      if (dn - (qn + 1) > qn)
	mpn_mul (scratch, dp, dn - (qn + 1), qp, qn);  /* prod is dn-1 limbs */
      else
	mpn_mul (scratch, qp, qn, dp, dn - (qn + 1));  /* prod is dn-1 limbs */

      cy = mpn_sub_n (rp, np, scratch, nn - (2 * qn + 1));
      cy = mpn_sub_nc (rp + nn - (2 * qn + 1),
		       rp + nn - (2 * qn + 1),
		       scratch + nn - (2 * qn + 1),
		       qn, cy);
      x = rp[dn - 1];
      rp[dn - 1] = x - cy;
      if (cy > x)
	{
	  mpn_decr_u (qp, 1);
	  mpn_add_n (rp, rp, dp, dn);
	}
    }
  else
    {
      return mpn_mu_div_qr2 (qp, rp, np, nn, dp, dn, scratch);
    }

  return 0;			/* FIXME */
}

mp_size_t
mpn_mu_div_qr_itch (mp_size_t nn, mp_size_t dn, int mua_k)
{
  mp_size_t qn, m;
  int k;

  /* FIXME: This isn't very carefully written, and might grossly overestimate
     the amount of scratch needed, and might perhaps also underestimate it,
     leading to potential buffer overruns.  In particular k=0 might lead to
     gross overestimates.  */

  if (dn == 1)
    return nn;

  qn = nn - dn;
  if (qn >= dn)
    {
      k = mpn_fft_best_k (dn + 1, 0);
      m = mpn_fft_next_size (dn + 1, k);
      return (mua_k <= 1
	      ? 6 * dn
	      : m + 2 * dn);
    }
  else
    {
      k = mpn_fft_best_k (dn + 1, 0);
      m = mpn_fft_next_size (dn + 1, k);
      return (mua_k <= 1
	      ? m + 4 * qn
	      : m + 2 * qn);
    }
}
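/* Illustrative sketch (not part of the original file): how an internal
   caller might drive mpn_mu_div_qr.  The function name example_mu_div_qr is
   ours, and the scratch sizing simply trusts mpn_mu_div_qr_itch above,
   which by its own FIXME is not a carefully analysed bound.  */
#if 0
static void
example_mu_div_qr (mp_ptr qp,		/* quotient, nn-dn limbs */
		   mp_ptr rp,		/* remainder, dn limbs */
		   mp_ptr np, mp_size_t nn,
		   mp_srcptr dp, mp_size_t dn)
{
  mp_ptr scratch;
  TMP_DECL;

  ASSERT (dn >= 2 && nn >= dn);
  ASSERT (dp[dn - 1] & GMP_NUMB_HIGHBIT);	/* D must be normalized */

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (mpn_mu_div_qr_itch (nn, dn, 0));
  /* N is left unmodified, as promised by the comment at the top of this
     file, so no defensive copy of np is made here.  */
  mpn_mu_div_qr (qp, rp, np, nn, dp, dn, scratch);
  TMP_FREE;
}
#endif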