--- /dev/null
+dnl Intel Pentium-4 mpn_mod_1 -- mpn by limb remainder.
+
+dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+dnl P4: 31 cycles/limb.
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                       mp_limb_t carry);
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                             mp_limb_t inverse);
+C
+C An idea was tried in the mul-by-inverse to process the last limb by a jump
+C back to the top of the loop, skipping the -4(%esi) fetch.  But that seemed
+C to produce slightly strange timings, like 9 and 10 limb operations running
+C at about the same speed.  The jump would be successively taken and
+C not-taken, which in theory should predict ok, but perhaps isn't enjoyed by
+C the chip.  Duplicating the loop for the last limb instead seems to be a
+C couple of cycles quicker too.
+C
+C Enhancements:
+C
+C The loop measures 31 cycles, but the dependent chain would suggest it
+C could be done with 30. Not sure where to start looking for the extra
+C cycle.
+
+
+dnl MUL_THRESHOLD is the size at which the multiply by inverse method is
+dnl used, rather than plain "divl"s. Minimum value 2.
+dnl
+dnl The inverse takes about 80-90 cycles to calculate, but after that the
+dnl multiply is 31 c/l versus division at about 58 c/l.
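+dnl
+dnl As a rough check on that threshold: the ~85 cycle inverse setup is
+dnl recouped at 58-31 = 27 cycles per limb, ie. after a bit over 3 limbs,
+dnl which together with the extra setup code on the inverse path is
+dnl consistent with a threshold of 5.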
+
+deflit(MUL_THRESHOLD, 5)
+
+
+defframe(PARAM_INVERSE,16) dnl mpn_preinv_mod_1
+defframe(PARAM_CARRY, 16) dnl mpn_mod_1c
+defframe(PARAM_DIVISOR,12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+dnl re-use parameter space
+define(SAVE_ESI,`PARAM_SIZE')
+define(SAVE_EBP,`PARAM_SRC')
+
+ TEXT
+
+ ALIGN(16)
+PROLOGUE(mpn_preinv_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, SAVE_ESI
+ movl $32, %eax
+
+ movd %eax, %mm6 C l = 0, so 32-l = 32
+ movl PARAM_SRC, %esi
+ movl %ebp, SAVE_EBP
+
+ movd PARAM_DIVISOR, %mm5
+ pxor %mm7, %mm7 C l = 0
+
+ movd -4(%esi,%ecx,4), %mm0 C src high limb
+ leal -8(%esi,%ecx,4), %esi C &src[size-2]
+
+ movd PARAM_INVERSE, %mm4
+ subl $2, %ecx C size-2
+
+ psubq %mm5, %mm0 C high-divisor
+ movq %mm0, %mm2
+
+ psrlq $32, %mm0 C -1 if underflow
+
+ pand %mm5, %mm0 C divisor if underflow
+
+ paddq %mm2, %mm0 C addback if underflow
+ jz L(inverse_last) C if size==2
+ ja L(inverse_top) C if size>2
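+
+C The above computes the initial n2 as high-divisor, with the divisor added
+C back if that underflowed, ie. n2 = (high >= d ? high - d : high).  A
+C single conditional subtract is enough because l=0 here: the divisor is
+C assumed to already have its high bit set.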
+
+
+ C if size==1
+ movl SAVE_ESI, %esi
+ movd %mm0, %eax
+ emms
+ ret
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1c)
+deflit(`FRAME',0)
+ movl PARAM_SIZE, %ecx
+ movl %esi, SAVE_ESI
+
+ movl PARAM_SRC, %esi
+ movl %ebp, SAVE_EBP
+
+ movl PARAM_CARRY, %edx
+ orl %ecx, %ecx
+ jz L(divide_done) C result==carry if size==0
+
+ movl PARAM_DIVISOR, %ebp
+ jmp L(start_1c)
+
+EPILOGUE()
+
+
+ ALIGN(16)
+PROLOGUE(mpn_mod_1)
+deflit(`FRAME',0)
+
+ movl PARAM_SIZE, %ecx
+ movl %esi, SAVE_ESI
+
+ movl PARAM_SRC, %esi
+ movl %ebp, SAVE_EBP
+
+ movl PARAM_DIVISOR, %ebp
+ xorl %edx, %edx C result 0 if size==0
+
+ orl %ecx, %ecx
+ jz L(divide_done)
+ movl -4(%esi,%ecx,4), %eax C src high limb
+
+	leal	-1(%ecx), %edx		C size-1
+ cmpl %ebp, %eax C c if high<divisor
+
+ cmovc( %edx, %ecx) C size-1 if high<divisor
+
+ movl $0, %edx C initial carry
+ cmovc( %eax, %edx) C src high limb if high<divisor
+
+ orl %ecx, %ecx
+	jz	L(divide_done)		C if size==1 and high<divisor
+
+
+L(start_1c):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ leal -4(%esi,%ecx,4), %esi C &src[size-1]
+ cmpl $MUL_THRESHOLD, %ecx
+ jae L(mul_by_inverse)
+
+
+L(divide_top):
+ C eax
+ C ebx
+ C ecx counter, limbs, decrementing
+ C edx remainder
+ C esi src, decrementing
+ C edi
+ C ebp divisor
+
+ movl (%esi), %eax
+ subl $4, %esi
+
+	divl	%ebp			C edx:eax / divisor, remainder -> edx
+
+ subl $1, %ecx
+ jnz L(divide_top)
+
+
+L(divide_done):
+ movl SAVE_ESI, %esi
+ movl SAVE_EBP, %ebp
+ movl %edx, %eax
+ ret
+
+
+C -----------------------------------------------------------------------------
+
+L(mul_by_inverse):
+ C eax
+ C ebx
+ C ecx size
+ C edx carry
+ C esi src
+ C edi
+ C ebp divisor
+
+ bsrl %ebp, %eax C 31-l
+
+ movd %edx, %mm1 C carry
+ movl %ecx, %edx C size
+ movl $31, %ecx
+
+ C
+
+ xorl %eax, %ecx C l = leading zeros on d
+ addl $1, %eax C 32-l
+
+ shll %cl, %ebp C normalize d
+ movd %ecx, %mm7 C l
+ leal -1(%edx), %ecx C size-1
+
+ movd %eax, %mm6 C 32-l
+ movl $-1, %edx
+ movl $-1, %eax
+
+ C
+
+ subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
+
+ divl %ebp C floor (b*(b-d)-1 / d)
+
+ movd %ebp, %mm5 C d
+ movd (%esi), %mm0 C src high limb
+ punpckldq %mm1, %mm0
+ psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
+
+ C
+
+ movd %eax, %mm4 C m
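+
+C In C, the inverse just computed is, roughly (a sketch; "ull" stands for a
+C 64-bit unsigned type, and d has been normalized so d >= 2^31):
+C
+C	m = (mp_limb_t) ((((ull) -d << 32) - 1) / d);
+C
+C ie. m = floor((b*(b-d)-1)/d) with b=2^32, as per the divl comment above.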
+
+
+C The dependent chain here consists of
+C
+C 2 paddd n1+n2
+C 8 pmuludq m*(n1+n2)
+C 2 paddq n2:nadj + m*(n1+n2)
+C 2 psrlq q1
+C 8 pmuludq d*q1
+C 2 psubq (n-d)-q1*d
+C 2 psrlq high mask
+C 2 pand d masked
+C 2 paddd n2+d addback
+C --
+C 30
+C
+C But it seems to run at 31 cycles, so presumably there's something else
+C going on.
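+C
+C In C, one pass of the loop below is roughly as follows (a sketch; n10 is
+C the next source limb shifted up by l with the top bits of the limb below
+C filled in, n2 < d is the running remainder, n1/nadj/q1/d/m are 32-bit
+C limbs, r is a 64-bit temporary, "ull" a 64-bit unsigned type):
+C
+C	n1 = n10 >> 31;                          /* high bit of n10 */
+C	nadj = n10 + (n1 ? d : 0);               /* low 32 bits */
+C	q1 = (mp_limb_t) ((((ull) n2 << 32) + nadj
+C	                   + (ull) m * (n2 + n1)) >> 32);
+C	r = (((ull) n2 << 32) + n10) - ((ull) q1 + 1) * d;
+C	if ((long long) r < 0)                   /* high word -1 */
+C	  r += d;                                /* addback */
+C	n2 = (mp_limb_t) r;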
+
+
+ ALIGN(16)
+L(inverse_top):
+ C eax
+ C ebx
+ C ecx counter, size-1 to 1
+ C edx
+ C esi src, decrementing
+ C edi
+ C ebp
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+
+ ASSERT(b,`C n2<d
+ movd %mm0, %eax
+ movd %mm5, %edx
+ cmpl %edx, %eax')
+
+ movd -4(%esi), %mm1 C next src limbs
+ movd (%esi), %mm2
+ leal -4(%esi), %esi
+
+ punpckldq %mm2, %mm1
+ psrlq %mm6, %mm1 C n10
+
+ movq %mm1, %mm2 C n10
+ movq %mm1, %mm3 C n10
+ psrad $31, %mm1 C -n1
+ pand %mm5, %mm1 C -n1 & d
+ paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
+
+ psrld $31, %mm2 C n1
+ paddd %mm0, %mm2 C n2+n1
+ punpckldq %mm0, %mm1 C n2:nadj
+
+ pmuludq %mm4, %mm2 C m*(n2+n1)
+
+ paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
+
+ psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
+
+ pmuludq %mm5, %mm1 C q1*d
+ punpckldq %mm0, %mm3 C n
+ psubq %mm5, %mm3 C n - d
+ pxor %mm0, %mm0
+
+ psubq %mm1, %mm3 C n - (q1+1)*d
+
+ por %mm3, %mm0 C remainder -> n2
+ psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm3, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ pand %mm5, %mm3 C mask & d
+
+ paddd %mm3, %mm0 C addback if necessary
+
+ subl $1, %ecx
+ jnz L(inverse_top)
+
+
+ C Least significant limb.
+ C Same code as the loop, but there's no -4(%esi) limb to fetch.
+
+L(inverse_last):
+ C eax
+ C ebx
+ C ecx
+ C edx
+ C esi &src[0]
+ C
+ C mm0 n2
+ C mm4 m
+ C mm5 d
+ C mm6 32-l
+ C mm7 l
+
+ movd (%esi), %mm1 C src[0]
+ psllq %mm7, %mm1 C n10
+
+ movq %mm1, %mm2 C n10
+ movq %mm1, %mm3 C n10
+ psrad $31, %mm1 C -n1
+ pand %mm5, %mm1 C -n1 & d
+ paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
+
+ psrld $31, %mm2 C n1
+ paddd %mm0, %mm2 C n2+n1
+ punpckldq %mm0, %mm1 C n2:nadj
+
+ pmuludq %mm4, %mm2 C m*(n2+n1)
+
+ paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
+
+ psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
+
+ pmuludq %mm5, %mm1 C q1*d
+ punpckldq %mm0, %mm3 C n
+ psubq %mm5, %mm3 C n - d
+ pxor %mm0, %mm0
+
+ psubq %mm1, %mm3 C n - (q1+1)*d
+
+ por %mm3, %mm0 C remainder -> n2
+ psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
+
+ ASSERT(be,`C 0 or -1
+ movd %mm3, %eax
+ addl $1, %eax
+ cmpl $1, %eax')
+
+ movl SAVE_EBP, %ebp
+ pand %mm5, %mm3 C mask & d
+
+ movl SAVE_ESI, %esi
+ paddd %mm3, %mm0 C addback if necessary
+
+	psrld	%mm7, %mm0		C remainder >> l, undo normalization
+
+ movd %mm0, %eax
+
+ emms
+ ret
+
+EPILOGUE()