Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / x86 / k7 / gcd_1.asm
diff --git a/gmp/mpn/x86/k7/gcd_1.asm b/gmp/mpn/x86/k7/gcd_1.asm

new file mode 100644 (file)

index 0000000..f912f43
--- /dev/null
+++ b/gmp/mpn/x86/k7/gcd_1.asm
@@ -0,0 +1,369 @@
+dnl  AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
+
+dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 6.75 cycles/bit (approx)  1x1 gcd
+C     11.0 cycles/limb          Nx1 reduction (modexact_1_odd)
+
+
+dnl  Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
+dnl  where x is the larger of the two.  See tune/README for more.
+dnl
+dnl  divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
+dnl  suggests 40/7*2=11.4 but 7 seems to be about right.
+
+deflit(DIV_THRESHOLD, 7)
+
+
+C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+C
+C This is mixed in with the code, but as per the k7 optimization manual it's
+C a full cache line and suitably aligned so it won't get swapped between
+C code and data.  Having it in TEXT rather than RODATA saves needing a GOT
+C entry when PIC.
+C
+C Actually, there doesn't seem to be a measurable difference between this in
+C it's own cache line or plonked in the middle of the code.  Presumably
+C since TEXT is read-only there's no worries about coherency.
+
+deflit(MASK, 63)
+deflit(MAXSHIFT, 6)
+
+       TEXT
+       ALIGN(64)
+L(table):
+       .byte   MAXSHIFT
+forloop(i,1,MASK,
+`      .byte   m4_count_trailing_zeros(i)
+')
+
+
+C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
+C
+
+defframe(PARAM_LIMB,   12)
+defframe(PARAM_SIZE,    8)
+defframe(PARAM_SRC,     4)
+
+defframe(SAVE_EBX,     -4)
+defframe(SAVE_ESI,     -8)
+defframe(SAVE_EDI,    -12)
+defframe(SAVE_EBP,    -16)
+defframe(CALL_DIVISOR,-20)
+defframe(CALL_SIZE,   -24)
+defframe(CALL_SRC,    -28)
+
+deflit(STACK_SPACE, 28)
+
+       TEXT
+       ALIGN(16)
+
+PROLOGUE(mpn_gcd_1)
+deflit(`FRAME',0)
+
+       ASSERT(ne, `cmpl $0, PARAM_LIMB')       C y!=0
+       ASSERT(ae, `cmpl $1, PARAM_SIZE')       C size>=1
+
+       movl    PARAM_SRC, %eax
+       movl    PARAM_LIMB, %edx
+       subl    $STACK_SPACE, %esp      deflit(`FRAME',STACK_SPACE)
+
+       movl    %esi, SAVE_ESI
+       movl    %ebx, SAVE_EBX
+
+       movl    (%eax), %esi            C src low limb
+
+ifdef(`PIC',`
+       movl    %edi, SAVE_EDI
+       call    L(movl_eip_to_edi)
+L(here):
+       addl    $L(table)-L(here), %edi
+')
+
+       movl    %esi, %ebx
+       orl     %edx, %esi      C x|y
+       movl    $-1, %ecx
+
+L(twos):
+       incl    %ecx
+       shrl    %esi
+       jnc     L(twos)         C 3/4 chance of x or y odd already
+
+       shrl    %cl, %ebx
+       shrl    %cl, %edx
+       movl    %ecx, %esi      C common twos
+
+       movl    PARAM_SIZE, %ecx
+       cmpl    $1, %ecx
+       ja      L(divide)
+
+
+       C eax
+       C ebx   x
+       C ecx
+       C edx   y
+       C esi   common twos
+       C edi   [PIC] L(table)
+       C ebp
+
+       movl    %edx, %eax
+       cmpl    %ebx, %edx
+
+       cmovb(  %ebx, %eax)     C swap to make x bigger than y
+       cmovb(  %edx, %ebx)
+
+
+L(strip_y):
+       C eax   x
+       C ebx   y
+       C ecx
+       C edx
+       C esi   common twos
+       C edi   [PIC] L(table)
+       C ebp
+
+       ASSERT(nz,`orl %ebx,%ebx')
+       shrl    %ebx
+       jnc     L(strip_y)
+       rcll    %ebx
+
+
+       C eax   x
+       C ebx   y (odd)
+       C ecx
+       C edx
+       C esi   common twos
+       C edi   [PIC] L(table)
+       C ebp
+
+       movl    %eax, %ecx
+       movl    %ebx, %edx
+       shrl    $DIV_THRESHOLD, %eax
+
+       cmpl    %eax, %ebx
+       movl    %ecx, %eax
+       ja      L(strip_x_entry)        C do x%y if x much bigger than y
+
+
+       xorl    %edx, %edx
+
+       divl    %ebx
+
+       orl     %edx, %edx
+       movl    %edx, %eax              C remainder -> x
+       movl    %ebx, %edx              C y
+
+       jz      L(done_ebx)
+       jmp     L(strip_x)
+
+
+       C Offset 0x9D here for non-PIC.  About 0.4 cycles/bit is saved by
+       C ensuring the end of the jnz at the end of this loop doesn't cross
+       C into the next cache line at 0xC0.
+       C
+       C PIC on the other hand is offset 0xAC here and extends to 0xC9, so
+       C it crosses but doesn't suffer any measurable slowdown.
+
+L(top):
+       C eax   x
+       C ebx   y-x
+       C ecx   x-y
+       C edx   y
+       C esi   twos, for use at end
+       C edi   [PIC] L(table)
+
+       cmovc(  %ebx, %ecx)             C if x-y gave carry, use x and y-x
+       cmovc(  %eax, %edx)
+
+L(strip_x):
+       movl    %ecx, %eax
+L(strip_x_entry):
+       andl    $MASK, %ecx
+
+       ASSERT(nz, `orl %eax, %eax')
+
+ifdef(`PIC',`
+       movb    (%ecx,%edi), %cl
+',`
+       movb    L(table) (%ecx), %cl
+')
+
+       shrl    %cl, %eax
+       cmpb    $MAXSHIFT, %cl
+
+       movl    %eax, %ecx
+       movl    %edx, %ebx
+       je      L(strip_x)
+
+       ASSERT(nz, `testl $1, %eax')    C both odd
+       ASSERT(nz, `testl $1, %edx')
+
+       subl    %eax, %ebx
+       subl    %edx, %ecx
+       jnz     L(top)
+
+
+L(done):
+       movl    %esi, %ecx
+       movl    SAVE_ESI, %esi
+ifdef(`PIC',`
+       movl    SAVE_EDI, %edi
+')
+
+       shll    %cl, %eax
+       movl    SAVE_EBX, %ebx
+       addl    $FRAME, %esp
+
+       ret
+
+
+
+C -----------------------------------------------------------------------------
+C two or more limbs
+
+dnl  MODEXACT_THRESHOLD is the size at which it's better to call
+dnl  mpn_modexact_1_odd than do an inline loop.
+
+deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))
+
+L(divide):
+       C eax   src
+       C ebx
+       C ecx   size
+       C edx   y
+       C esi   common twos
+       C edi   [PIC] L(table)
+       C ebp
+
+L(divide_strip_y):
+       ASSERT(nz,`orl %edx,%edx')
+       shrl    %edx
+       jnc     L(divide_strip_y)
+       leal    1(%edx,%edx), %ebx              C y now odd
+
+       movl    %ebp, SAVE_EBP
+       movl    %eax, %ebp
+       movl    -4(%eax,%ecx,4), %eax           C src high limb
+
+       cmp     $MODEXACT_THRESHOLD, %ecx
+       jae     L(modexact)
+
+       cmpl    %ebx, %eax                      C high cmp divisor
+       movl    $0, %edx
+
+       cmovc(  %eax, %edx)                     C skip a div if high<divisor
+       sbbl    $0, %ecx
+
+
+L(divide_top):
+       C eax   scratch (quotient)
+       C ebx   y
+       C ecx   counter (size to 1, inclusive)
+       C edx   carry (remainder)
+       C esi   common twos
+       C edi   [PIC] L(table)
+       C ebp   src
+
+       movl    -4(%ebp,%ecx,4), %eax
+
+       divl    %ebx
+
+       decl    %ecx
+       jnz     L(divide_top)
+
+
+       C eax
+       C ebx   y (odd)
+       C ecx
+       C edx   x
+       C esi   common twos
+       C edi   [PIC] L(table)
+       C ebp
+
+       orl     %edx, %edx
+       movl    SAVE_EBP, %ebp
+       movl    %edx, %eax
+
+       movl    %edx, %ecx
+       movl    %ebx, %edx
+       jnz     L(strip_x_entry)
+
+
+L(done_ebx):
+       movl    %ebx, %eax
+       jmp     L(done)
+
+
+
+L(modexact):
+       C eax
+       C ebx   y
+       C ecx   size
+       C edx
+       C esi   common twos
+       C edi   [PIC] L(table)
+       C ebp   src
+
+ifdef(`PIC',`
+       movl    %ebp, CALL_SRC
+       movl    %ebx, %ebp              C y
+       movl    %edi, %ebx              C L(table)
+
+       addl    $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx
+       movl    %ebp, CALL_DIVISOR
+       movl    %ecx, CALL_SIZE
+
+       call    GSYM_PREFIX`'mpn_modexact_1_odd@PLT
+',`
+dnl non-PIC
+       movl    %ebx, CALL_DIVISOR
+       movl    %ebp, CALL_SRC
+       movl    %ecx, CALL_SIZE
+
+       call    GSYM_PREFIX`'mpn_modexact_1_odd
+')
+
+       C eax   x
+       C ebx   [non-PIC] y
+       C ecx
+       C edx
+       C esi   common twos
+       C edi   [PIC] L(table)
+       C ebp   [PIC] y
+
+       orl     %eax, %eax
+       movl    ifdef(`PIC',`%ebp',`%ebx'), %edx
+       movl    SAVE_EBP, %ebp
+
+       movl    %eax, %ecx
+       jnz     L(strip_x_entry)
+
+       movl    %edx, %eax
+       jmp     L(done)
+
+
+ifdef(`PIC', `
+L(movl_eip_to_edi):
+       movl    (%esp), %edi
+       ret_internal
+')
+
+EPILOGUE()