Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / sparc64 / mode1o.c
diff --git a/gmp/mpn/sparc64/mode1o.c b/gmp/mpn/sparc64/mode1o.c

new file mode 100644 (file)

index 0000000..5ec97c5
--- /dev/null
+++ b/gmp/mpn/sparc64/mode1o.c
@@ -0,0 +1,186 @@
+/* UltraSPARC 64 mpn_modexact_1c_odd -- mpn by limb exact style remainder.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "mpn/sparc64/sparc64.h"
+
+
+/*                 64-bit divisor   32-bit divisor
+                    cycles/limb      cycles/limb
+                     (approx)         (approx)
+   Ultrasparc 2i:       ?                ?
+*/
+
+
+/* This implementation reduces the number of multiplies done, knowing that
+   on ultrasparc 1 and 2 the mulx instruction stalls the whole chip.
+
+   The key idea is to use the fact that the low limb of q*d equals l, this
+   being the whole purpose of the q calculated.  It means there's no need to
+   calculate the lowest 32x32->64 part of the q*d, instead it can be
+   inferred from l and the other three 32x32->64 parts.  See sparc64.h for
+   details.
+
+   When d is 32-bits, the same applies, but in this case there's only one
+   other 32x32->64 part (ie. HIGH(q)*d).
+
+   The net effect is that for 64-bit divisor each limb is 4 mulx, or for
+   32-bit divisor each is 2 mulx.
+
+   Enhancements:
+
+   No doubt this could be done in assembler, if that helped the scheduling,
+   or perhaps guaranteed good code irrespective of the compiler.
+
+   Alternatives:
+
+   It might be possibly to use floating point.  The loop is dominated by
+   multiply latency, so not sure if floats would improve that.  One
+   possibility would be to take two limbs at a time, with a 128 bit inverse,
+   if there's enough registers, which could effectively use float throughput
+   to reduce total latency across two limbs.  */
+
+#define ASSERT_RETVAL(r)                \
+  ASSERT (orig_c < d ? r < d : r <= d)
+
+mp_limb_t
+mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t orig_c)
+{
+  mp_limb_t  c = orig_c;
+  mp_limb_t  s, l, q, h, inverse;
+
+  ASSERT (size >= 1);
+  ASSERT (d & 1);
+  ASSERT_MPN (src, size);
+  ASSERT_LIMB (d);
+  ASSERT_LIMB (c);
+
+  /* udivx is faster than 10 or 12 mulx's for one limb via an inverse */
+  if (size == 1)
+    {
+      s = src[0];
+      if (s > c)
+       {
+         l = s-c;
+         h = l % d;
+         if (h != 0)
+           h = d - h;
+       }
+      else
+       {
+         l = c-s;
+         h = l % d;
+       }
+      return h;
+    }
+
+  binvert_limb (inverse, d);
+
+  if (d <= 0xFFFFFFFF)
+    {
+      s = *src++;
+      size--;
+      do
+        {
+          SUBC_LIMB (c, l, s, c);
+          s = *src++;
+          q = l * inverse;
+          umul_ppmm_half_lowequal (h, q, d, l);
+          c += h;
+          size--;
+        }
+      while (size != 0);
+
+      if (s <= d)
+        {
+          /* With high s <= d the final step can be a subtract and addback.
+             If c==0 then the addback will restore to l>=0.  If c==d then
+             will get l==d if s==0, but that's ok per the function
+             definition.  */
+
+          l = c - s;
+          l += (l > c ? d : 0);
+
+          ASSERT_RETVAL (l);
+          return l;
+        }
+      else
+        {
+          /* Can't skip a divide, just do the loop code once more. */
+          SUBC_LIMB (c, l, s, c);
+          q = l * inverse;
+          umul_ppmm_half_lowequal (h, q, d, l);
+          c += h;
+
+          ASSERT_RETVAL (c);
+          return c;
+        }
+    }
+  else
+    {
+      mp_limb_t  dl = LOW32 (d);
+      mp_limb_t  dh = HIGH32 (d);
+      long i;
+
+      s = *src++;
+      size--;
+      do
+        {
+          SUBC_LIMB (c, l, s, c);
+          s = *src++;
+          q = l * inverse;
+          umul_ppmm_lowequal (h, q, d, dh, dl, l);
+          c += h;
+          size--;
+        }
+      while (size != 0);
+
+      if (s <= d)
+        {
+          /* With high s <= d the final step can be a subtract and addback.
+             If c==0 then the addback will restore to l>=0.  If c==d then
+             will get l==d if s==0, but that's ok per the function
+             definition.  */
+
+          l = c - s;
+          l += (l > c ? d : 0);
+
+          ASSERT_RETVAL (l);
+          return l;
+        }
+      else
+        {
+          /* Can't skip a divide, just do the loop code once more. */
+          SUBC_LIMB (c, l, s, c);
+          q = l * inverse;
+          umul_ppmm_lowequal (h, q, d, dh, dl, l);
+          c += h;
+
+          ASSERT_RETVAL (c);
+          return c;
+        }
+    }
+}