Imported gcc-4.4.3

[msp430-gcc.git] / gmp / gen-psqr.c
diff --git a/gmp/gen-psqr.c b/gmp/gen-psqr.c

new file mode 100644 (file)

index 0000000..9c33d7a
--- /dev/null
+++ b/gmp/gen-psqr.c
@@ -0,0 +1,576 @@
+/* Generate perfect square testing data.
+
+Copyright 2002, 2003, 2004 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "dumbmp.c"
+
+
+/* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1
+   (plus a PERFSQR_PP modulus), and generate tables indicating quadratic
+   residues and non-residues modulo small factors of that modulus.
+
+   For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used.  That
+   function exists specifically because 2^24-1 and 2^48-1 have nice sets of
+   prime factors.  For other limb sizes it's considered, but if it doesn't
+   have good factors then mpn_mod_1 will be used instead.
+
+   When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a
+   selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb,
+   with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <=
+   GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its
+   calculation within a single limb.
+
+   In either case primes can be combined to make divisors.  The table data
+   then effectively indicates remainders which are quadratic residues mod
+   all the primes.  This sort of combining reduces the number of steps
+   needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time.
+   Nothing is gained or lost in terms of detections, the same total fraction
+   of non-residues will be identified.
+
+   Nothing particularly sophisticated is attempted for combining factors to
+   make divisors.  This is probably a kind of knapsack problem so it'd be
+   too hard to attempt anything completely general.  For the usual 32 and 64
+   bit limbs we get a good enough result just pairing the biggest and
+   smallest which fit together, repeatedly.
+
+   Another aim is to get powerful combinations, ie. divisors which identify
+   biggest fraction of non-residues, and have those run first.  Again for
+   the usual 32 and 64 bits it seems good enough just to pair for big
+   divisors then sort according to the resulting fraction of non-residues
+   identified.
+
+   Also in this program, a table sq_res_0x100 of residues modulo 256 is
+   generated.  This simply fills bits into limbs of the appropriate
+   build-time GMP_LIMB_BITS each.
+
+*/
+
+
+/* Normally we aren't using const in gen*.c programs, so as not to have to
+   bother figuring out if it works, but using it with f_cmp_divisor and
+   f_cmp_fraction avoids warnings from the qsort calls. */
+
+/* Same tests as gmp.h. */
+#if  defined (__STDC__)                                 \
+  || defined (__cplusplus)                              \
+  || defined (_AIX)                                     \
+  || defined (__DECC)                                   \
+  || (defined (__mips) && defined (_SYSTYPE_SVR4))      \
+  || defined (_MSC_VER)                                 \
+  || defined (_WIN32)
+#define HAVE_CONST        1
+#endif
+
+#if ! HAVE_CONST
+#define const
+#endif
+
+
+mpz_t  *sq_res_0x100;          /* table of limbs */
+int    nsq_res_0x100;          /* elements in sq_res_0x100 array */
+int    sq_res_0x100_num;       /* squares in sq_res_0x100 */
+double sq_res_0x100_fraction;  /* sq_res_0x100_num / 256 */
+
+int     mod34_bits;        /* 3*GMP_NUMB_BITS/4 */
+int     mod_bits;          /* bits from PERFSQR_MOD_34 or MOD_PP */
+int     max_divisor;       /* all divisors <= max_divisor */
+int     max_divisor_bits;  /* ceil(log2(max_divisor)) */
+double  total_fraction;    /* of squares */
+mpz_t   pp;                /* product of primes, or 0 if mod_34lsub1 used */
+mpz_t   pp_norm;           /* pp shifted so NUMB high bit set */
+mpz_t   pp_inverted;       /* invert_limb style inverse */
+mpz_t   mod_mask;          /* 2^mod_bits-1 */
+char    mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */
+
+/* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */
+struct rawfactor_t {
+  int     divisor;
+  int     multiplicity;
+};
+struct rawfactor_t  *rawfactor;
+int                 nrawfactor;
+
+/* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */
+struct factor_t {
+  int     divisor;
+  mpz_t   inverse;   /* 1/divisor mod 2^mod_bits */
+  mpz_t   mask;      /* indicating squares mod divisor */
+  double  fraction;  /* squares/total */
+};
+struct factor_t  *factor;
+int              nfactor;       /* entries in use in factor array */
+int              factor_alloc;  /* entries allocated to factor array */
+
+
+int
+f_cmp_divisor (const void *parg, const void *qarg)
+{
+  const struct factor_t *p, *q;
+  p = parg;
+  q = qarg;
+  if (p->divisor > q->divisor)
+    return 1;
+  else if (p->divisor < q->divisor)
+    return -1;
+  else
+    return 0;
+}
+
+int
+f_cmp_fraction (const void *parg, const void *qarg)
+{
+  const struct factor_t *p, *q;
+  p = parg;
+  q = qarg;
+  if (p->fraction > q->fraction)
+    return 1;
+  else if (p->fraction < q->fraction)
+    return -1;
+  else
+    return 0;
+}
+
+/* Remove array[idx] by copying the remainder down, and adjust narray
+   accordingly.  */
+#define COLLAPSE_ELEMENT(array, idx, narray)                    \
+  do {                                                          \
+    mem_copyi ((char *) &(array)[idx],                          \
+               (char *) &(array)[idx+1],                        \
+               ((narray)-((idx)+1)) * sizeof (array[0]));       \
+    (narray)--;                                                 \
+  } while (0)
+
+
+/* return n*2^p mod m */
+int
+mul_2exp_mod (int n, int p, int m)
+{
+  int  i;
+  for (i = 0; i < p; i++)
+    n = (2 * n) % m;
+  return n;
+}
+
+/* return -n mod m */
+int
+neg_mod (int n, int m)
+{
+  ASSERT (n >= 0 && n < m);
+  return (n == 0 ? 0 : m-n);
+}
+
+/* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if
+   "-(idx<<mod_bits)" can be a square modulo m.  */
+void
+square_mask (mpz_t mask, int m)
+{
+  int    p, i, r, idx;
+
+  p = mul_2exp_mod (1, mod_bits, m);
+  p = neg_mod (p, m);
+
+  mpz_set_ui (mask, 0L);
+  for (i = 0; i < m; i++)
+    {
+      r = (i * i) % m;
+      idx = (r * p) % m;
+      mpz_setbit (mask, (unsigned long) idx);
+    }
+}
+
+void
+generate_sq_res_0x100 (int limb_bits)
+{
+  int  i, res;
+
+  nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits;
+  sq_res_0x100 = (mpz_t *) xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100));
+
+  for (i = 0; i < nsq_res_0x100; i++)
+    mpz_init_set_ui (sq_res_0x100[i], 0L);
+
+  for (i = 0; i < 0x100; i++)
+    {
+      res = (i * i) % 0x100;
+      mpz_setbit (sq_res_0x100[res / limb_bits],
+                  (unsigned long) (res % limb_bits));
+    }
+
+  sq_res_0x100_num = 0;
+  for (i = 0; i < nsq_res_0x100; i++)
+    sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]);
+  sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0;
+}
+
+void
+generate_mod (int limb_bits, int nail_bits)
+{
+  int    numb_bits = limb_bits - nail_bits;
+  int    i, divisor;
+
+  mpz_init_set_ui (pp, 0L);
+  mpz_init_set_ui (pp_norm, 0L);
+  mpz_init_set_ui (pp_inverted, 0L);
+
+  /* no more than limb_bits many factors in a one limb modulus (and of
+     course in reality nothing like that many) */
+  factor_alloc = limb_bits;
+  factor = (struct factor_t *) xmalloc (factor_alloc * sizeof (*factor));
+  rawfactor = (struct rawfactor_t *)
+    xmalloc (factor_alloc * sizeof (*rawfactor));
+
+  if (numb_bits % 4 != 0)
+    {
+      strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0");
+      goto use_pp;
+    }
+
+  max_divisor = 2*limb_bits;
+  max_divisor_bits = log2_ceil (max_divisor);
+
+  if (numb_bits / 4 < max_divisor_bits)
+    {
+      /* Wind back to one limb worth of max_divisor, if that will let us use
+         mpn_mod_34lsub1.  */
+      max_divisor = limb_bits;
+      max_divisor_bits = log2_ceil (max_divisor);
+
+      if (numb_bits / 4 < max_divisor_bits)
+        {
+          strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small");
+          goto use_pp;
+        }
+    }
+
+  {
+    /* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */
+    mpz_t  m, q, r;
+    int    multiplicity;
+
+    mod34_bits = (numb_bits / 4) * 3;
+
+    /* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at
+       the mod34_bits mark, adding the two halves for a remainder of at most
+       mod34_bits+1 many bits */
+    mod_bits = mod34_bits + 1;
+
+    mpz_init_set_ui (m, 1L);
+    mpz_mul_2exp (m, m, mod34_bits);
+    mpz_sub_ui (m, m, 1L);
+
+    mpz_init (q);
+    mpz_init (r);
+
+    for (i = 3; i <= max_divisor; i++)
+      {
+        if (! isprime (i))
+          continue;
+
+        mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
+        if (mpz_sgn (r) != 0)
+          continue;
+
+        /* if a repeated prime is found it's used as an i^n in one factor */
+        divisor = 1;
+        multiplicity = 0;
+        do
+          {
+            if (divisor > max_divisor / i)
+              break;
+            multiplicity++;
+            mpz_set (m, q);
+            mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
+          }
+        while (mpz_sgn (r) == 0);
+
+        ASSERT (nrawfactor < factor_alloc);
+        rawfactor[nrawfactor].divisor = i;
+        rawfactor[nrawfactor].multiplicity = multiplicity;
+        nrawfactor++;
+      }
+
+    mpz_clear (m);
+    mpz_clear (q);
+    mpz_clear (r);
+  }
+
+  if (nrawfactor <= 2)
+    {
+      mpz_t  new_pp;
+
+      sprintf (mod34_excuse, "only %d small factor%s",
+               nrawfactor, nrawfactor == 1 ? "" : "s");
+
+    use_pp:
+      /* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code
+         tried with just one */
+      max_divisor = 2*limb_bits;
+      max_divisor_bits = log2_ceil (max_divisor);
+
+      mpz_init (new_pp);
+      nrawfactor = 0;
+      mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits);
+
+      /* one copy of each small prime */
+      mpz_set_ui (pp, 1L);
+      for (i = 3; i <= max_divisor; i++)
+        {
+          if (! isprime (i))
+            continue;
+
+          mpz_mul_ui (new_pp, pp, (unsigned long) i);
+          if (mpz_sizeinbase (new_pp, 2) > mod_bits)
+            break;
+          mpz_set (pp, new_pp);
+
+          ASSERT (nrawfactor < factor_alloc);
+          rawfactor[nrawfactor].divisor = i;
+          rawfactor[nrawfactor].multiplicity = 1;
+          nrawfactor++;
+        }
+
+      /* Plus an extra copy of one or more of the primes selected, if that
+         still fits in max_divisor and the total in mod_bits.  Usually only
+         3 or 5 will be candidates */
+      for (i = nrawfactor-1; i >= 0; i--)
+        {
+          if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor)
+            continue;
+          mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor);
+          if (mpz_sizeinbase (new_pp, 2) > mod_bits)
+            continue;
+          mpz_set (pp, new_pp);
+
+          rawfactor[i].multiplicity++;
+        }
+
+      mod_bits = mpz_sizeinbase (pp, 2);
+
+      mpz_set (pp_norm, pp);
+      while (mpz_sizeinbase (pp_norm, 2) < numb_bits)
+        mpz_add (pp_norm, pp_norm, pp_norm);
+
+      mpz_preinv_invert (pp_inverted, pp_norm, numb_bits);
+
+      mpz_clear (new_pp);
+    }
+
+  /* start the factor array */
+  for (i = 0; i < nrawfactor; i++)
+    {
+      int  j;
+      ASSERT (nfactor < factor_alloc);
+      factor[nfactor].divisor = 1;
+      for (j = 0; j < rawfactor[i].multiplicity; j++)
+        factor[nfactor].divisor *= rawfactor[i].divisor;
+      nfactor++;
+    }
+
+ combine:
+  /* Combine entries in the factor array.  Combine the smallest entry with
+     the biggest one that will fit with it (ie. under max_divisor), then
+     repeat that with the new smallest entry. */
+  qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor);
+  for (i = nfactor-1; i >= 1; i--)
+    {
+      if (factor[i].divisor <= max_divisor / factor[0].divisor)
+        {
+          factor[0].divisor *= factor[i].divisor;
+          COLLAPSE_ELEMENT (factor, i, nfactor);
+          goto combine;
+        }
+    }
+
+  total_fraction = 1.0;
+  for (i = 0; i < nfactor; i++)
+    {
+      mpz_init (factor[i].inverse);
+      mpz_invert_ui_2exp (factor[i].inverse,
+                          (unsigned long) factor[i].divisor,
+                          (unsigned long) mod_bits);
+
+      mpz_init (factor[i].mask);
+      square_mask (factor[i].mask, factor[i].divisor);
+
+      /* fraction of possible squares */
+      factor[i].fraction = (double) mpz_popcount (factor[i].mask)
+        / factor[i].divisor;
+
+      /* total fraction of possible squares */
+      total_fraction *= factor[i].fraction;
+    }
+
+  /* best tests first (ie. smallest fraction) */
+  qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction);
+}
+
+void
+print (int limb_bits, int nail_bits)
+{
+  int    i;
+  mpz_t  mhi, mlo;
+
+  printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n");
+  printf ("\n");
+
+  printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n",
+          limb_bits, nail_bits);
+  printf ("Error, error, this data is for %d bit limb and %d bit nail\n",
+          limb_bits, nail_bits);
+  printf ("#endif\n");
+  printf ("\n");
+
+  printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n");
+  printf ("   This test identifies %.2f%% as non-squares (%d/256). */\n",
+          (1.0 - sq_res_0x100_fraction) * 100.0,
+          0x100 - sq_res_0x100_num);
+  printf ("static const mp_limb_t\n");
+  printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100);
+  for (i = 0; i < nsq_res_0x100; i++)
+    {
+      printf ("  CNST_LIMB(0x");
+      mpz_out_str (stdout, 16, sq_res_0x100[i]);
+      printf ("),\n");
+    }
+  printf ("};\n");
+  printf ("\n");
+
+  if (mpz_sgn (pp) != 0)
+    {
+      printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse);
+      printf ("/* PERFSQR_PP = ");
+    }
+  else
+    printf ("/* 2^%d-1 = ", mod34_bits);
+  for (i = 0; i < nrawfactor; i++)
+    {
+      if (i != 0)
+        printf (" * ");
+      printf ("%d", rawfactor[i].divisor);
+      if (rawfactor[i].multiplicity != 1)
+        printf ("^%d", rawfactor[i].multiplicity);
+    }
+  printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : "");
+
+  printf ("#define PERFSQR_MOD_BITS  %d\n", mod_bits);
+  if (mpz_sgn (pp) != 0)
+    {
+      printf ("#define PERFSQR_PP            CNST_LIMB(0x");
+      mpz_out_str (stdout, 16, pp);
+      printf (")\n");
+      printf ("#define PERFSQR_PP_NORM       CNST_LIMB(0x");
+      mpz_out_str (stdout, 16, pp_norm);
+      printf (")\n");
+      printf ("#define PERFSQR_PP_INVERTED   CNST_LIMB(0x");
+      mpz_out_str (stdout, 16, pp_inverted);
+      printf (")\n");
+    }
+  printf ("\n");
+
+  mpz_init (mhi);
+  mpz_init (mlo);
+
+  printf ("/* This test identifies %.2f%% as non-squares. */\n",
+          (1.0 - total_fraction) * 100.0);
+  printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n");
+  printf ("  do {                              \\\n");
+  printf ("    mp_limb_t  r;                   \\\n");
+  if (mpz_sgn (pp) != 0)
+    printf ("    PERFSQR_MOD_PP (r, up, usize);  \\\n");
+  else
+    printf ("    PERFSQR_MOD_34 (r, up, usize);  \\\n");
+
+  for (i = 0; i < nfactor; i++)
+    {
+      printf ("                                    \\\n");
+      printf ("    /* %5.2f%% */                    \\\n",
+              (1.0 - factor[i].fraction) * 100.0);
+
+      printf ("    PERFSQR_MOD_%d (r, CNST_LIMB(%2d), CNST_LIMB(0x",
+              factor[i].divisor <= limb_bits ? 1 : 2,
+              factor[i].divisor);
+      mpz_out_str (stdout, 16, factor[i].inverse);
+      printf ("), \\\n");
+      printf ("                   CNST_LIMB(0x");
+
+      if ( factor[i].divisor <= limb_bits)
+        {
+          mpz_out_str (stdout, 16, factor[i].mask);
+        }
+      else
+        {
+          mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits);
+          mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits);
+          mpz_out_str (stdout, 16, mhi);
+          printf ("), CNST_LIMB(0x");
+          mpz_out_str (stdout, 16, mlo);
+        }
+      printf (")); \\\n");
+    }
+
+  printf ("  } while (0)\n");
+  printf ("\n");
+
+  printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n",
+          (1.0 - (total_fraction * 44.0/256.0)) * 100.0);
+  printf ("\n");
+
+  printf ("/* helper for tests/mpz/t-perfsqr.c */\n");
+  printf ("#define PERFSQR_DIVISORS  { 256,");
+  for (i = 0; i < nfactor; i++)
+      printf (" %d,", factor[i].divisor);
+  printf (" }\n");
+
+
+  mpz_clear (mhi);
+  mpz_clear (mlo);
+}
+
+int
+main (int argc, char *argv[])
+{
+  int  limb_bits, nail_bits;
+
+  if (argc != 3)
+    {
+      fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n");
+      exit (1);
+    }
+
+  limb_bits = atoi (argv[1]);
+  nail_bits = atoi (argv[2]);
+
+  if (limb_bits <= 0
+      || nail_bits < 0
+      || nail_bits >= limb_bits)
+    {
+      fprintf (stderr, "Invalid limb/nail bits: %d %d\n",
+               limb_bits, nail_bits);
+      exit (1);
+    }
+
+  generate_sq_res_0x100 (limb_bits);
+  generate_mod (limb_bits, nail_bits);
+
+  print (limb_bits, nail_bits);
+
+  return 0;
+}