--- /dev/null
+dnl AMD K7 mpn_gcd_1 -- mpn by 1 gcd.
+
+dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: 6.75 cycles/bit (approx) 1x1 gcd
+C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
+
+
+dnl Reduce using x%y if x is more than DIV_THRESHOLD bits bigger than y,
+dnl where x is the larger of the two. See tune/README for more.
+dnl
+dnl divl at 40 cycles compared to the gcd at about 7 cycles/bitpair
+dnl suggests 40/7*2=11.4 but 7 seems to be about right.
+
+deflit(DIV_THRESHOLD, 7)
+
+
+C table[n] is the number of trailing zeros on n, or MAXSHIFT if n==0.
+C
+C This is mixed in with the code, but as per the k7 optimization manual it's
+C a full cache line and suitably aligned so it won't get swapped between
+C code and data. Having it in TEXT rather than RODATA saves needing a GOT
+C entry when PIC.
+C
+C Actually, there doesn't seem to be a measurable difference between this in
+C it's own cache line or plonked in the middle of the code. Presumably
+C since TEXT is read-only there's no worries about coherency.
+
+deflit(MASK, 63)
+deflit(MAXSHIFT, 6)
+
+ TEXT
+ ALIGN(64)
+L(table):
+ .byte MAXSHIFT
+forloop(i,1,MASK,
+` .byte m4_count_trailing_zeros(i)
+')
+
+
+C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t limb);
+C
+
+defframe(PARAM_LIMB, 12)
+defframe(PARAM_SIZE, 8)
+defframe(PARAM_SRC, 4)
+
+defframe(SAVE_EBX, -4)
+defframe(SAVE_ESI, -8)
+defframe(SAVE_EDI, -12)
+defframe(SAVE_EBP, -16)
+defframe(CALL_DIVISOR,-20)
+defframe(CALL_SIZE, -24)
+defframe(CALL_SRC, -28)
+
+deflit(STACK_SPACE, 28)
+
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_gcd_1)
+deflit(`FRAME',0)
+
+ ASSERT(ne, `cmpl $0, PARAM_LIMB') C y!=0
+ ASSERT(ae, `cmpl $1, PARAM_SIZE') C size>=1
+
+ movl PARAM_SRC, %eax
+ movl PARAM_LIMB, %edx
+ subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
+
+ movl %esi, SAVE_ESI
+ movl %ebx, SAVE_EBX
+
+ movl (%eax), %esi C src low limb
+
+ifdef(`PIC',`
+ movl %edi, SAVE_EDI
+ call L(movl_eip_to_edi)
+L(here):
+ addl $L(table)-L(here), %edi
+')
+
+ movl %esi, %ebx
+ orl %edx, %esi C x|y
+ movl $-1, %ecx
+
+L(twos):
+ incl %ecx
+ shrl %esi
+ jnc L(twos) C 3/4 chance of x or y odd already
+
+ shrl %cl, %ebx
+ shrl %cl, %edx
+ movl %ecx, %esi C common twos
+
+ movl PARAM_SIZE, %ecx
+ cmpl $1, %ecx
+ ja L(divide)
+
+
+ C eax
+ C ebx x
+ C ecx
+ C edx y
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ movl %edx, %eax
+ cmpl %ebx, %edx
+
+ cmovb( %ebx, %eax) C swap to make x bigger than y
+ cmovb( %edx, %ebx)
+
+
+L(strip_y):
+ C eax x
+ C ebx y
+ C ecx
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ ASSERT(nz,`orl %ebx,%ebx')
+ shrl %ebx
+ jnc L(strip_y)
+ rcll %ebx
+
+
+ C eax x
+ C ebx y (odd)
+ C ecx
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ movl %eax, %ecx
+ movl %ebx, %edx
+ shrl $DIV_THRESHOLD, %eax
+
+ cmpl %eax, %ebx
+ movl %ecx, %eax
+ ja L(strip_x_entry) C do x%y if x much bigger than y
+
+
+ xorl %edx, %edx
+
+ divl %ebx
+
+ orl %edx, %edx
+ movl %edx, %eax C remainder -> x
+ movl %ebx, %edx C y
+
+ jz L(done_ebx)
+ jmp L(strip_x)
+
+
+ C Offset 0x9D here for non-PIC. About 0.4 cycles/bit is saved by
+ C ensuring the end of the jnz at the end of this loop doesn't cross
+ C into the next cache line at 0xC0.
+ C
+ C PIC on the other hand is offset 0xAC here and extends to 0xC9, so
+ C it crosses but doesn't suffer any measurable slowdown.
+
+L(top):
+ C eax x
+ C ebx y-x
+ C ecx x-y
+ C edx y
+ C esi twos, for use at end
+ C edi [PIC] L(table)
+
+ cmovc( %ebx, %ecx) C if x-y gave carry, use x and y-x
+ cmovc( %eax, %edx)
+
+L(strip_x):
+ movl %ecx, %eax
+L(strip_x_entry):
+ andl $MASK, %ecx
+
+ ASSERT(nz, `orl %eax, %eax')
+
+ifdef(`PIC',`
+ movb (%ecx,%edi), %cl
+',`
+ movb L(table) (%ecx), %cl
+')
+
+ shrl %cl, %eax
+ cmpb $MAXSHIFT, %cl
+
+ movl %eax, %ecx
+ movl %edx, %ebx
+ je L(strip_x)
+
+ ASSERT(nz, `testl $1, %eax') C both odd
+ ASSERT(nz, `testl $1, %edx')
+
+ subl %eax, %ebx
+ subl %edx, %ecx
+ jnz L(top)
+
+
+L(done):
+ movl %esi, %ecx
+ movl SAVE_ESI, %esi
+ifdef(`PIC',`
+ movl SAVE_EDI, %edi
+')
+
+ shll %cl, %eax
+ movl SAVE_EBX, %ebx
+ addl $FRAME, %esp
+
+ ret
+
+
+
+C -----------------------------------------------------------------------------
+C two or more limbs
+
+dnl MODEXACT_THRESHOLD is the size at which it's better to call
+dnl mpn_modexact_1_odd than do an inline loop.
+
+deflit(MODEXACT_THRESHOLD, ifdef(`PIC',6,5))
+
+L(divide):
+ C eax src
+ C ebx
+ C ecx size
+ C edx y
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+L(divide_strip_y):
+ ASSERT(nz,`orl %edx,%edx')
+ shrl %edx
+ jnc L(divide_strip_y)
+ leal 1(%edx,%edx), %ebx C y now odd
+
+ movl %ebp, SAVE_EBP
+ movl %eax, %ebp
+ movl -4(%eax,%ecx,4), %eax C src high limb
+
+ cmp $MODEXACT_THRESHOLD, %ecx
+ jae L(modexact)
+
+ cmpl %ebx, %eax C high cmp divisor
+ movl $0, %edx
+
+ cmovc( %eax, %edx) C skip a div if high<divisor
+ sbbl $0, %ecx
+
+
+L(divide_top):
+ C eax scratch (quotient)
+ C ebx y
+ C ecx counter (size to 1, inclusive)
+ C edx carry (remainder)
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp src
+
+ movl -4(%ebp,%ecx,4), %eax
+
+ divl %ebx
+
+ decl %ecx
+ jnz L(divide_top)
+
+
+ C eax
+ C ebx y (odd)
+ C ecx
+ C edx x
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp
+
+ orl %edx, %edx
+ movl SAVE_EBP, %ebp
+ movl %edx, %eax
+
+ movl %edx, %ecx
+ movl %ebx, %edx
+ jnz L(strip_x_entry)
+
+
+L(done_ebx):
+ movl %ebx, %eax
+ jmp L(done)
+
+
+
+L(modexact):
+ C eax
+ C ebx y
+ C ecx size
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp src
+
+ifdef(`PIC',`
+ movl %ebp, CALL_SRC
+ movl %ebx, %ebp C y
+ movl %edi, %ebx C L(table)
+
+ addl $_GLOBAL_OFFSET_TABLE_+[.-L(table)], %ebx
+ movl %ebp, CALL_DIVISOR
+ movl %ecx, CALL_SIZE
+
+ call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
+',`
+dnl non-PIC
+ movl %ebx, CALL_DIVISOR
+ movl %ebp, CALL_SRC
+ movl %ecx, CALL_SIZE
+
+ call GSYM_PREFIX`'mpn_modexact_1_odd
+')
+
+ C eax x
+ C ebx [non-PIC] y
+ C ecx
+ C edx
+ C esi common twos
+ C edi [PIC] L(table)
+ C ebp [PIC] y
+
+ orl %eax, %eax
+ movl ifdef(`PIC',`%ebp',`%ebx'), %edx
+ movl SAVE_EBP, %ebp
+
+ movl %eax, %ecx
+ jnz L(strip_x_entry)
+
+ movl %edx, %eax
+ jmp L(done)
+
+
+ifdef(`PIC', `
+L(movl_eip_to_edi):
+ movl (%esp), %edi
+ ret_internal
+')
+
+EPILOGUE()