Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / x86 / k6 / gcd_1.asm
diff --git a/gmp/mpn/x86/k6/gcd_1.asm b/gmp/mpn/x86/k6/gcd_1.asm

new file mode 100644 (file)

index 0000000..58aff08
--- /dev/null
+++ b/gmp/mpn/x86/k6/gcd_1.asm
@@ -0,0 +1,351 @@
+dnl  AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
+
+dnl  Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6: 9.5 cycles/bit (approx)   1x1 gcd
+C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
+
+
+C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
+C
+C This code is nothing very special, but offers a speedup over what gcc 2.95
+C can do with mpn/generic/gcd_1.c.
+C
+C Future:
+C
+C Using a lookup table to count trailing zeros seems a touch quicker, but
+C after a slightly longer startup.  Might be worthwhile if an mpn_gcd_2 used
+C it too.
+
+
+dnl  If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
+dnl  bigger than y, then a division x%y is done to reduce it.
+dnl
+dnl  A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
+dnl  there should be an advantage in the divl at about 4 or 5 bits, which is
+dnl  what's found.
+
+deflit(DIV_THRESHOLD, 5)
+
+
+defframe(PARAM_LIMB, 12)
+defframe(PARAM_SIZE,  8)
+defframe(PARAM_SRC,   4)
+
+       TEXT
+       ALIGN(16)
+
+PROLOGUE(mpn_gcd_1)
+deflit(`FRAME',0)
+
+       ASSERT(ne, `cmpl $0, PARAM_LIMB')
+       ASSERT(ae, `cmpl $1, PARAM_SIZE')
+
+
+       movl    PARAM_SRC, %eax
+       pushl   %ebx                    FRAME_pushl()
+
+       movl    PARAM_LIMB, %edx
+       movl    $-1, %ecx
+
+       movl    (%eax), %ebx            C src low limb
+
+       movl    %ebx, %eax              C src low limb
+       orl     %edx, %ebx
+
+L(common_twos):
+       shrl    %ebx
+       incl    %ecx
+
+       jnc     L(common_twos)          C 1/4 chance on random data
+       shrl    %cl, %edx               C y
+
+       cmpl    $1, PARAM_SIZE
+       ja      L(size_two_or_more)
+
+
+       ASSERT(nz, `orl %eax, %eax')    C should have src limb != 0
+
+       shrl    %cl, %eax               C x
+
+
+       C Swap if necessary to make x>=y.  Measures a touch quicker as a
+       C jump than a branch free calculation.
+       C
+       C eax   x
+       C ebx
+       C ecx   common twos
+       C edx   y
+
+       movl    %eax, %ebx
+       cmpl    %eax, %edx
+
+       jb      L(noswap)
+       movl    %edx, %eax
+
+       movl    %ebx, %edx
+       movl    %eax, %ebx
+L(noswap):
+
+
+       C See if it's worth reducing x with a divl.
+       C
+       C eax   x
+       C ebx   x
+       C ecx   common twos
+       C edx   y
+
+       shrl    $DIV_THRESHOLD, %ebx
+
+       cmpl    %ebx, %edx
+       ja      L(nodiv)
+
+
+       C Reduce x to x%y.
+       C
+       C eax   x
+       C ebx
+       C ecx   common twos
+       C edx   y
+
+       movl    %edx, %ebx
+       xorl    %edx, %edx
+
+       divl    %ebx
+
+       orl     %edx, %edx      C y
+       nop     C code alignment
+
+       movl    %ebx, %eax      C x
+       jz      L(done_shll)
+L(nodiv):
+
+
+       C eax   x
+       C ebx
+       C ecx   common twos
+       C edx   y
+       C esi
+       C edi
+       C ebp
+
+L(strip_y):
+       shrl    %edx
+       jnc     L(strip_y)
+
+       leal    1(%edx,%edx), %edx
+       movl    %ecx, %ebx      C common twos
+
+       leal    1(%eax), %ecx
+       jmp     L(strip_x_and)
+
+
+C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
+C on a 50/50 chance of 0 or 1.  The chance of the next bit also being 0 is
+C only 1/4.
+C
+C A second computed %cl shift was tried, but that measured a touch slower
+C than branching back.
+C
+C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
+C measured about 1 cycle/bit slower.
+
+       C eax   x
+       C ebx   common twos
+       C ecx   scratch
+       C edx   y
+
+       ALIGN(4)
+L(swap):
+       addl    %eax, %edx      C x-y+y = x
+       negl    %eax            C -(x-y) = y-x
+
+L(strip_x):
+       shrl    %eax            C odd-odd = even, so always one to strip
+       ASSERT(nz)
+
+L(strip_x_leal):
+       leal    1(%eax), %ecx
+
+L(strip_x_and):
+       andl    $1, %ecx        C (x^1)&1
+
+       shrl    %cl, %eax       C shift if x even
+
+       testb   $1, %al
+       jz      L(strip_x)
+
+       ASSERT(nz,`testl $1, %eax')     C x, y odd
+       ASSERT(nz,`testl $1, %edx')
+
+       subl    %edx, %eax
+       jb      L(swap)
+       ja      L(strip_x)
+
+
+       movl    %edx, %eax
+       movl    %ebx, %ecx
+
+L(done_shll):
+       shll    %cl, %eax
+       popl    %ebx
+
+       ret
+
+
+C -----------------------------------------------------------------------------
+C Two or more limbs.
+C
+C x={src,size} is reduced modulo y using either a plain mod_1 style
+C remainder, or a modexact_1 style exact division.
+
+deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
+
+       ALIGN(8)
+L(size_two_or_more):
+       C eax
+       C ebx
+       C ecx   common twos
+       C edx   y, without common twos
+       C esi
+       C edi
+       C ebp
+
+deflit(FRAME_TWO_OR_MORE, FRAME)
+
+       pushl   %edi            defframe_pushl(SAVE_EDI)
+       movl    PARAM_SRC, %ebx
+
+L(y_twos):
+       shrl    %edx
+       jnc     L(y_twos)
+
+       movl    %ecx, %edi              C common twos
+       movl    PARAM_SIZE, %ecx
+
+       pushl   %esi            defframe_pushl(SAVE_ESI)
+       leal    1(%edx,%edx), %esi      C y (odd)
+
+       movl    -4(%ebx,%ecx,4), %eax   C src high limb
+
+       cmpl    %edx, %eax              C carry if high<divisor
+
+       sbbl    %edx, %edx              C -1 if high<divisor
+
+       addl    %edx, %ecx              C skip one limb if high<divisor
+       andl    %eax, %edx
+
+       cmpl    $MODEXACT_THRESHOLD, %ecx
+       jae     L(modexact)
+
+
+L(divide_top):
+       C eax   scratch (quotient)
+       C ebx   src
+       C ecx   counter, size-1 to 1
+       C edx   carry (remainder)
+       C esi   divisor (odd)
+       C edi
+       C ebp
+
+       movl    -4(%ebx,%ecx,4), %eax
+       divl    %esi
+       loop    L(divide_top)
+
+
+       movl    %edx, %eax      C x
+       movl    %esi, %edx      C y (odd)
+
+       movl    %edi, %ebx      C common twos
+       popl    %esi
+
+       popl    %edi
+       leal    1(%eax), %ecx
+
+       orl     %eax, %eax
+       jnz     L(strip_x_and)
+
+
+       movl    %ebx, %ecx
+       movl    %edx, %eax
+
+       shll    %cl, %eax
+       popl    %ebx
+
+       ret
+
+
+       ALIGN(8)
+L(modexact):
+       C eax
+       C ebx   src ptr
+       C ecx   size or size-1
+       C edx
+       C esi   y odd
+       C edi   common twos
+       C ebp
+
+       movl    PARAM_SIZE, %eax
+       pushl   %esi            FRAME_pushl()
+
+       pushl   %eax            FRAME_pushl()
+
+       pushl   %ebx            FRAME_pushl()
+
+ifdef(`PIC',`
+       nop     C code alignment
+       call    L(movl_eip_ebx)
+L(here):
+       addl    $_GLOBAL_OFFSET_TABLE_, %ebx
+       call    GSYM_PREFIX`'mpn_modexact_1_odd@PLT
+',`
+       call    GSYM_PREFIX`'mpn_modexact_1_odd
+')
+
+       movl    %esi, %edx              C y odd
+       movl    SAVE_ESI, %esi
+
+       movl    %edi, %ebx              C common twos
+       movl    SAVE_EDI, %edi
+
+       addl    $eval(FRAME - FRAME_TWO_OR_MORE), %esp
+       orl     %eax, %eax
+
+       leal    1(%eax), %ecx
+       jnz     L(strip_x_and)
+
+
+       movl    %ebx, %ecx
+       movl    %edx, %eax
+
+       shll    %cl, %eax
+       popl    %ebx
+
+       ret
+
+
+ifdef(`PIC',`
+L(movl_eip_ebx):
+       movl    (%esp), %ebx
+       ret_internal
+')
+
+EPILOGUE()