Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / x86 / pentium4 / sse2 / divrem_1.asm
diff --git a/gmp/mpn/x86/pentium4/sse2/divrem_1.asm b/gmp/mpn/x86/pentium4/sse2/divrem_1.asm

new file mode 100644 (file)

index 0000000..7f973db
--- /dev/null
+++ b/gmp/mpn/x86/pentium4/sse2/divrem_1.asm
@@ -0,0 +1,635 @@
+dnl  Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
+
+dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
+dnl  Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
+
+
+C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                         mp_srcptr src, mp_size_t size,
+C                         mp_limb_t divisor);
+C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
+C                          mp_srcptr src, mp_size_t size,
+C                          mp_limb_t divisor, mp_limb_t carry);
+C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
+C                                mp_srcptr src, mp_size_t size,
+C                                mp_limb_t divisor, mp_limb_t inverse,
+C                                unsigned shift);
+C
+C Algorithm:
+C
+C The method and nomenclature follow part 8 of "Division by Invariant
+C Integers using Multiplication" by Granlund and Montgomery, reference in
+C gmp.texi.
+C
+C "m" is written for what is m' in the paper, and "d" for d_norm, which
+C won't cause any confusion since it's only the normalized divisor that's of
+C any use in the code.  "b" is written for 2^N, the size of a limb, N being
+C 32 here.
+C
+C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
+C "n-d - q1*d".  This rearrangement gives the same two-limb answer but lets
+C us have just a psubq on the dependent chain.
+C
+C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
+C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
+C
+C Notes:
+C
+C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
+C limb is less than the divisor.  mpn_divrem_1c doesn't check for a zero
+C carry, since in normal circumstances that will be a very rare event.
+C
+C The test for skipping a division is branch free (once size>=1 is tested).
+C The store to the destination high limb is 0 when a divide is skipped, or
+C if it's not skipped then a copy of the src high limb is stored.  The
+C latter is in case src==dst.
+C
+C There's a small bias towards expecting xsize==0, by having code for
+C xsize==0 in a straight line and xsize!=0 under forward jumps.
+C
+C Enhancements:
+C
+C The loop measures 32 cycles, but the dependent chain would suggest it
+C could be done with 30.  Not sure where to start looking for the extras.
+C
+C Alternatives:
+C
+C If the divisor is normalized (high bit set) then a division step can
+C always be skipped, since the high destination limb is always 0 or 1 in
+C that case.  It doesn't seem worth checking for this though, since it
+C probably occurs infrequently.
+
+
+dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
+dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
+dnl
+dnl  The inverse takes about 80-90 cycles to calculate, but after that the
+dnl  multiply is 32 c/l versus division at about 58 c/l.
+dnl
+dnl  At 4 limbs the div is a touch faster than the mul (and of course
+dnl  simpler), so start the mul from 5 limbs.
+
+deflit(MUL_THRESHOLD, 5)
+
+
+defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
+defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
+defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
+defframe(PARAM_DIVISOR,20)
+defframe(PARAM_SIZE,   16)
+defframe(PARAM_SRC,    12)
+defframe(PARAM_XSIZE,  8)
+defframe(PARAM_DST,    4)
+
+dnl  re-use parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_SRC')
+define(SAVE_EDI,`PARAM_DIVISOR')
+define(SAVE_EBX,`PARAM_DST')
+
+       TEXT
+
+       ALIGN(16)
+PROLOGUE(mpn_preinv_divrem_1)
+deflit(`FRAME',0)
+
+       movl    PARAM_SIZE, %ecx
+       xorl    %edx, %edx              C carry if can't skip a div
+
+       movl    %esi, SAVE_ESI
+       movl    PARAM_SRC, %esi
+
+       movl    %ebp, SAVE_EBP
+       movl    PARAM_DIVISOR, %ebp
+
+       movl    %edi, SAVE_EDI
+       movl    PARAM_DST, %edi
+
+       movl    -4(%esi,%ecx,4), %eax   C src high limb
+
+       movl    %ebx, SAVE_EBX
+       movl    PARAM_XSIZE, %ebx
+
+       movd    PARAM_PREINV_INVERSE, %mm4
+
+       movd    PARAM_PREINV_SHIFT, %mm7  C l
+       cmpl    %ebp, %eax              C high cmp divisor
+
+       cmovc(  %eax, %edx)             C high is carry if high<divisor
+       movd    %edx, %mm0              C carry
+
+       movd    %edx, %mm1              C carry
+       movl    $0, %edx
+
+       movd    %ebp, %mm5              C d
+       cmovnc( %eax, %edx)             C 0 if skip div, src high if not
+                                       C (the latter in case src==dst)
+       leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
+
+       movl    %edx, (%edi,%ecx,4)     C dst high limb
+       sbbl    $0, %ecx                C skip one division if high<divisor
+       movl    $32, %eax
+
+       subl    PARAM_PREINV_SHIFT, %eax
+       psllq   %mm7, %mm5              C d normalized
+       leal    (%edi,%ecx,4), %edi     C &dst[xsize+size-1]
+       leal    -4(%esi,%ecx,4), %esi   C &src[size-1]
+
+       movd    %eax, %mm6              C 32-l
+       jmp     L(start_preinv)
+
+EPILOGUE()
+
+
+       ALIGN(16)
+PROLOGUE(mpn_divrem_1c)
+deflit(`FRAME',0)
+
+       movl    PARAM_CARRY, %edx
+
+       movl    PARAM_SIZE, %ecx
+
+       movl    %esi, SAVE_ESI
+       movl    PARAM_SRC, %esi
+
+       movl    %ebp, SAVE_EBP
+       movl    PARAM_DIVISOR, %ebp
+
+       movl    %edi, SAVE_EDI
+       movl    PARAM_DST, %edi
+
+       movl    %ebx, SAVE_EBX
+       movl    PARAM_XSIZE, %ebx
+
+       leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
+       jmp     L(start_1c)
+
+EPILOGUE()
+
+
+       ALIGN(16)
+PROLOGUE(mpn_divrem_1)
+deflit(`FRAME',0)
+
+       movl    PARAM_SIZE, %ecx
+       xorl    %edx, %edx              C initial carry (if can't skip a div)
+
+       movl    %esi, SAVE_ESI
+       movl    PARAM_SRC, %esi
+
+       movl    %ebp, SAVE_EBP
+       movl    PARAM_DIVISOR, %ebp
+
+       movl    %edi, SAVE_EDI
+       movl    PARAM_DST, %edi
+
+       movl    %ebx, SAVE_EBX
+       movl    PARAM_XSIZE, %ebx
+       leal    -4(%edi,%ebx,4), %edi   C &dst[xsize-1]
+
+       orl     %ecx, %ecx              C size
+       jz      L(no_skip_div)          C if size==0
+       movl    -4(%esi,%ecx,4), %eax   C src high limb
+
+       cmpl    %ebp, %eax              C high cmp divisor
+
+       cmovnc( %eax, %edx)             C 0 if skip div, src high if not
+       movl    %edx, (%edi,%ecx,4)     C dst high limb
+
+       movl    $0, %edx
+       cmovc(  %eax, %edx)             C high is carry if high<divisor
+
+       sbbl    $0, %ecx                C size-1 if high<divisor
+L(no_skip_div):
+
+
+L(start_1c):
+       C eax
+       C ebx   xsize
+       C ecx   size
+       C edx   carry
+       C esi   src
+       C edi   &dst[xsize-1]
+       C ebp   divisor
+
+       leal    (%ebx,%ecx), %eax       C size+xsize
+       leal    -4(%esi,%ecx,4), %esi   C &src[size-1]
+       leal    (%edi,%ecx,4), %edi     C &dst[size+xsize-1]
+
+       cmpl    $MUL_THRESHOLD, %eax
+       jae     L(mul_by_inverse)
+
+
+       orl     %ecx, %ecx
+       jz      L(divide_no_integer)    C if size==0
+
+L(divide_integer):
+       C eax   scratch (quotient)
+       C ebx   xsize
+       C ecx   counter
+       C edx   carry
+       C esi   src, decrementing
+       C edi   dst, decrementing
+       C ebp   divisor
+
+       movl    (%esi), %eax
+       subl    $4, %esi
+
+       divl    %ebp
+
+       movl    %eax, (%edi)
+       subl    $4, %edi
+
+       subl    $1, %ecx
+       jnz     L(divide_integer)
+
+
+L(divide_no_integer):
+       orl     %ebx, %ebx
+       jnz     L(divide_fraction)      C if xsize!=0
+
+L(divide_done):
+       movl    SAVE_ESI, %esi
+       movl    SAVE_EDI, %edi
+       movl    SAVE_EBX, %ebx
+       movl    SAVE_EBP, %ebp
+       movl    %edx, %eax
+       ret
+
+
+L(divide_fraction):
+       C eax   scratch (quotient)
+       C ebx   counter
+       C ecx
+       C edx   carry
+       C esi
+       C edi   dst, decrementing
+       C ebp   divisor
+
+       movl    $0, %eax
+
+       divl    %ebp
+
+       movl    %eax, (%edi)
+       subl    $4, %edi
+
+       subl    $1, %ebx
+       jnz     L(divide_fraction)
+
+       jmp     L(divide_done)
+
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+       C eax
+       C ebx   xsize
+       C ecx   size
+       C edx   carry
+       C esi   &src[size-1]
+       C edi   &dst[size+xsize-1]
+       C ebp   divisor
+
+       bsrl    %ebp, %eax              C 31-l
+       movd    %edx, %mm0              C carry
+       movd    %edx, %mm1              C carry
+       movl    %ecx, %edx              C size
+       movl    $31, %ecx
+
+       C
+
+       xorl    %eax, %ecx              C l = leading zeros on d
+       addl    $1, %eax
+
+       shll    %cl, %ebp               C d normalized
+       movd    %ecx, %mm7              C l
+       movl    %edx, %ecx              C size
+
+       movd    %eax, %mm6              C 32-l
+       movl    $-1, %edx
+       movl    $-1, %eax
+
+       C
+
+       subl    %ebp, %edx              C (b-d)-1 so  edx:eax = b*(b-d)-1
+
+       divl    %ebp                    C floor (b*(b-d)-1 / d)
+       movd    %ebp, %mm5              C d
+
+       C
+
+       movd    %eax, %mm4              C m
+
+
+L(start_preinv):
+       C eax   inverse
+       C ebx   xsize
+       C ecx   size
+       C edx
+       C esi   &src[size-1]
+       C edi   &dst[size+xsize-1]
+       C ebp
+       C
+       C mm0   carry
+       C mm1   carry
+       C mm2
+       C mm4   m
+       C mm5   d
+       C mm6   31-l
+       C mm7   l
+
+       psllq   %mm7, %mm0              C n2 = carry << l, for size==0
+
+       subl    $1, %ecx
+       jb      L(integer_none)
+
+       movd    (%esi), %mm0            C src high limb
+       punpckldq %mm1, %mm0
+       psrlq   %mm6, %mm0              C n2 = high (carry:srchigh << l)
+       jz      L(integer_last)
+
+
+C The dependent chain here consists of
+C
+C      2   paddd    n1+n2
+C      8   pmuludq  m*(n1+n2)
+C      2   paddq    n2:nadj + m*(n1+n2)
+C      2   psrlq    q1
+C      8   pmuludq  d*q1
+C      2   psubq    (n-d)-q1*d
+C      2   psrlq    high n-(q1+1)*d mask
+C      2   pand     d masked
+C      2   paddd    n2+d addback
+C      --
+C      30
+C
+C But it seems to run at 32 cycles, so presumably there's something else
+C going on.
+
+       ALIGN(16)
+L(integer_top):
+       C eax
+       C ebx
+       C ecx   counter, size-1 to 0
+       C edx
+       C esi   src, decrementing
+       C edi   dst, decrementing
+       C
+       C mm0   n2
+       C mm4   m
+       C mm5   d
+       C mm6   32-l
+       C mm7   l
+
+       ASSERT(b,`C n2<d
+        movd   %mm0, %eax
+        movd   %mm5, %edx
+        cmpl   %edx, %eax')
+
+       movd    -4(%esi), %mm1          C next src limbs
+       movd    (%esi), %mm2
+       leal    -4(%esi), %esi
+
+       punpckldq %mm2, %mm1
+       psrlq   %mm6, %mm1              C n10
+
+       movq    %mm1, %mm2              C n10
+       movq    %mm1, %mm3              C n10
+       psrad   $31, %mm1               C -n1
+       pand    %mm5, %mm1              C -n1 & d
+       paddd   %mm2, %mm1              C nadj = n10+(-n1&d), ignore overflow
+
+       psrld   $31, %mm2               C n1
+       paddd   %mm0, %mm2              C n2+n1
+       punpckldq %mm0, %mm1            C n2:nadj
+
+       pmuludq %mm4, %mm2              C m*(n2+n1)
+
+       C
+
+       paddq   %mm2, %mm1              C n2:nadj + m*(n2+n1)
+       pxor    %mm2, %mm2              C break dependency, saves 4 cycles
+       pcmpeqd %mm2, %mm2              C FF...FF
+       psrlq   $63, %mm2               C 1
+
+       psrlq   $32, %mm1               C q1 = high(n2:nadj + m*(n2+n1))
+
+       paddd   %mm1, %mm2              C q1+1
+       pmuludq %mm5, %mm1              C q1*d
+
+       punpckldq %mm0, %mm3            C n = n2:n10
+       pxor    %mm0, %mm0
+
+       psubq   %mm5, %mm3              C n - d
+
+       C
+
+       psubq   %mm1, %mm3              C n - (q1+1)*d
+
+       por     %mm3, %mm0              C copy remainder -> new n2
+       psrlq   $32, %mm3               C high n - (q1+1)*d, 0 or -1
+
+       ASSERT(be,`C 0 or -1
+        movd   %mm3, %eax
+        addl   $1, %eax
+        cmpl   $1, %eax')
+
+       paddd   %mm3, %mm2              C q
+       pand    %mm5, %mm3              C mask & d
+
+       paddd   %mm3, %mm0              C addback if necessary
+       movd    %mm2, (%edi)
+       leal    -4(%edi), %edi
+
+       subl    $1, %ecx
+       ja      L(integer_top)
+
+
+L(integer_last):
+       C eax
+       C ebx   xsize
+       C ecx
+       C edx
+       C esi   &src[0]
+       C edi   &dst[xsize]
+       C
+       C mm0   n2
+       C mm4   m
+       C mm5   d
+       C mm6
+       C mm7   l
+
+       ASSERT(b,`C n2<d
+        movd   %mm0, %eax
+        movd   %mm5, %edx
+        cmpl   %edx, %eax')
+
+       movd    (%esi), %mm1            C src[0]
+       psllq   %mm7, %mm1              C n10
+
+       movq    %mm1, %mm2              C n10
+       movq    %mm1, %mm3              C n10
+       psrad   $31, %mm1               C -n1
+       pand    %mm5, %mm1              C -n1 & d
+       paddd   %mm2, %mm1              C nadj = n10+(-n1&d), ignore overflow
+
+       psrld   $31, %mm2               C n1
+       paddd   %mm0, %mm2              C n2+n1
+       punpckldq %mm0, %mm1            C n2:nadj
+
+       pmuludq %mm4, %mm2              C m*(n2+n1)
+
+       C
+
+       paddq   %mm2, %mm1              C n2:nadj + m*(n2+n1)
+       pcmpeqd %mm2, %mm2              C FF...FF
+       psrlq   $63, %mm2               C 1
+
+       psrlq   $32, %mm1               C q1 = high(n2:nadj + m*(n2+n1))
+       paddd   %mm1, %mm2              C q1
+
+       pmuludq %mm5, %mm1              C q1*d
+       punpckldq %mm0, %mm3            C n
+       psubq   %mm5, %mm3              C n - d
+       pxor    %mm0, %mm0
+
+       C
+
+       psubq   %mm1, %mm3              C n - (q1+1)*d
+
+       por     %mm3, %mm0              C remainder -> n2
+       psrlq   $32, %mm3               C high n - (q1+1)*d, 0 or -1
+
+       ASSERT(be,`C 0 or -1
+        movd   %mm3, %eax
+        addl   $1, %eax
+        cmpl   $1, %eax')
+
+       paddd   %mm3, %mm2              C q
+       pand    %mm5, %mm3              C mask & d
+
+       paddd   %mm3, %mm0              C addback if necessary
+       movd    %mm2, (%edi)
+       leal    -4(%edi), %edi
+
+
+L(integer_none):
+       C eax
+       C ebx   xsize
+
+       orl     %ebx, %ebx
+       jnz     L(fraction_some)        C if xsize!=0
+
+
+L(fraction_done):
+       movl    SAVE_EBP, %ebp
+       psrld   %mm7, %mm0              C remainder
+
+       movl    SAVE_EDI, %edi
+       movd    %mm0, %eax
+
+       movl    SAVE_ESI, %esi
+       movl    SAVE_EBX, %ebx
+       emms
+       ret
+
+
+
+C -----------------------------------------------------------------------------
+C
+
+L(fraction_some):
+       C eax
+       C ebx   xsize
+       C ecx
+       C edx
+       C esi
+       C edi   &dst[xsize-1]
+       C ebp
+
+
+L(fraction_top):
+       C eax
+       C ebx   counter, xsize iterations
+       C ecx
+       C edx
+       C esi   src, decrementing
+       C edi   dst, decrementing
+       C
+       C mm0   n2
+       C mm4   m
+       C mm5   d
+       C mm6   32-l
+       C mm7   l
+
+       ASSERT(b,`C n2<d
+        movd   %mm0, %eax
+        movd   %mm5, %edx
+        cmpl   %edx, %eax')
+
+       movq    %mm0, %mm1              C n2
+       pmuludq %mm4, %mm0              C m*n2
+
+       pcmpeqd %mm2, %mm2
+       psrlq   $63, %mm2
+
+       C
+
+       psrlq   $32, %mm0               C high(m*n2)
+
+       paddd   %mm1, %mm0              C q1 = high(n2:0 + m*n2)
+
+       paddd   %mm0, %mm2              C q1+1
+       pmuludq %mm5, %mm0              C q1*d
+
+       psllq   $32, %mm1               C n = n2:0
+       psubq   %mm5, %mm1              C n - d
+
+       C
+
+       psubq   %mm0, %mm1              C r = n - (q1+1)*d
+       pxor    %mm0, %mm0
+
+       por     %mm1, %mm0              C r -> n2
+       psrlq   $32, %mm1               C high n - (q1+1)*d, 0 or -1
+
+       ASSERT(be,`C 0 or -1
+        movd   %mm1, %eax
+        addl   $1, %eax
+        cmpl   $1, %eax')
+
+       paddd   %mm1, %mm2              C q
+       pand    %mm5, %mm1              C mask & d
+
+       paddd   %mm1, %mm0              C addback if necessary
+       movd    %mm2, (%edi)
+       leal    -4(%edi), %edi
+
+       subl    $1, %ebx
+       jne     L(fraction_top)
+
+
+       jmp     L(fraction_done)
+
+EPILOGUE()