Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / sparc64 / dive_1.c
diff --git a/gmp/mpn/sparc64/dive_1.c b/gmp/mpn/sparc64/dive_1.c

new file mode 100644 (file)

index 0000000..6f3d7c4
--- /dev/null
+++ b/gmp/mpn/sparc64/dive_1.c
@@ -0,0 +1,147 @@
+/* UltraSPARC 64 mpn_divexact_1 -- mpn by limb exact division.
+
+   THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
+   CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
+   FUTURE GNU MP RELEASES.
+
+Copyright 2000, 2001, 2003 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 3 of the License, or (at your
+option) any later version.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
+
+#include "gmp.h"
+#include "gmp-impl.h"
+#include "longlong.h"
+
+#include "mpn/sparc64/sparc64.h"
+
+
+/*                 64-bit divisor   32-bit divisor
+                    cycles/limb      cycles/limb
+                     (approx)         (approx)
+   Ultrasparc 2i:      110               70
+*/
+
+
+/* There are two key ideas here to reduce mulx's.  Firstly when the divisor
+   is 32-bits the high of q*d can be calculated without the two 32x32->64
+   cross-products involving the high 32-bits of the divisor, that being zero
+   of course.  Secondly umul_ppmm_lowequal and umul_ppmm_half_lowequal save
+   one mulx (each) knowing the low of q*d is equal to the input limb l.
+
+   For size==1, a simple udivx is used.  This is faster than calculating an
+   inverse.
+
+   For a 32-bit divisor and small sizes, an attempt was made at a simple
+   udivx loop (two per 64-bit limb), but it turned out to be slower than
+   mul-by-inverse.  At size==2 the inverse is about 260 cycles total
+   compared to a udivx at 291.  Perhaps the latter would suit when size==2
+   but the high 32-bits of the second limb is zero (saving one udivx), but
+   it doesn't seem worth a special case just for that.  */
+
+void
+mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor)
+{
+  mp_limb_t  inverse, s, s_next, c, l, ls, q;
+  unsigned   rshift, lshift;
+  mp_limb_t  lshift_mask;
+  mp_limb_t  divisor_h;
+
+  ASSERT (size >= 1);
+  ASSERT (divisor != 0);
+  ASSERT (MPN_SAME_OR_SEPARATE_P (dst, src, size));
+  ASSERT_MPN (src, size);
+  ASSERT_LIMB (divisor);
+
+  s = *src++;                 /* src low limb */
+  size--;
+  if (size == 0)
+    {
+      *dst = s / divisor;
+      return;
+    }
+
+  if ((divisor & 1) == 0)
+    {
+      count_trailing_zeros (rshift, divisor);
+      divisor >>= rshift;
+    }
+  else
+    rshift = 0;
+
+  binvert_limb (inverse, divisor);
+
+  lshift = 64 - rshift;
+
+  /* lshift==64 means no shift, so must mask out other part in this case */
+  lshift_mask = (rshift == 0 ? 0 : MP_LIMB_T_MAX);
+
+  c = 0;
+  divisor_h = HIGH32 (divisor);
+
+  if (divisor_h == 0)
+    {
+      /* 32-bit divisor */
+      do
+        {
+          s_next = *src++;
+          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
+          s = s_next;
+
+          SUBC_LIMB (c, l, ls, c);
+
+          q = l * inverse;
+          *dst++ = q;
+
+          umul_ppmm_half_lowequal (l, q, divisor, l);
+          c += l;
+
+          size--;
+        }
+      while (size != 0);
+
+      ls = s >> rshift;
+      l = ls - c;
+      q = l * inverse;
+      *dst = q;
+    }
+  else
+    {
+      /* 64-bit divisor */
+      mp_limb_t  divisor_l = LOW32 (divisor);
+      do
+        {
+          s_next = *src++;
+          ls = (s >> rshift) | ((s_next << lshift) & lshift_mask);
+          s = s_next;
+
+          SUBC_LIMB (c, l, ls, c);
+
+          q = l * inverse;
+          *dst++ = q;
+
+          umul_ppmm_lowequal (l, q, divisor, divisor_h, divisor_l, l);
+          c += l;
+
+          size--;
+        }
+      while (size != 0);
+
+      ls = s >> rshift;
+      l = ls - c;
+      q = l * inverse;
+      *dst = q;
+    }
+}