diff --git a/gmp/mpn/x86/p6/mod_1.asm b/gmp/mpn/x86/p6/mod_1.asm
new file mode 100644
index 00000000..b6eacf7e
--- /dev/null
+++ b/gmp/mpn/x86/p6/mod_1.asm
@@ -0,0 +1,472 @@
+dnl  Intel P6 mpn_mod_1 -- mpn by limb remainder.
+
+dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P6: 21.5 cycles/limb
+
+
+C mp_limb_t mpn_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor);
+C mp_limb_t mpn_mod_1c (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                       mp_limb_t carry);
+C mp_limb_t mpn_preinv_mod_1 (mp_srcptr src, mp_size_t size, mp_limb_t divisor,
+C                             mp_limb_t inverse);
+C
+C The code here is in two parts, a simple divl loop and a mul-by-inverse.
+C The divl is used by mod_1 and mod_1c for small sizes, until the savings
+C from the mul-by-inverse overcome the time needed to calculate an inverse.
+C preinv_mod_1 goes straight to the mul-by-inverse.
+C
+C The mul-by-inverse normalizes the divisor (or for preinv_mod_1 it's
+C already normalized).  The calculation done is r=a%(d*2^n) followed by a
+C final (r*2^n)%(d*2^n), where a is the dividend, d the divisor, and n is
+C the number of leading zero bits on d.  This means there are no bit shifts
+C in the main loop, at the cost of an extra divide step at the end.
+C
+C The simple divl for mod_1 is able to skip one divide step if high<divisor.

[...]

+	subl	%ebp, %eax
+
+	cmovnc(	%eax, %edi)		C n2-divisor if no underflow
+
+	movl	$-1, %eax
+	movl	$-1, %edx
+
+	subl	%ebp, %edx		C (b-d)-1 so edx:eax = b*(b-d)-1
+	leal	-8(%esi,%ebx,4), %ecx	C &src[size-2]
+
+	divl	%ebp			C floor (b*(b-d)-1) / d
+
+L(preinv_entry):
+	movl	%eax, VAR_INVERSE
+
+
+
+C No special scheduling of loads is necessary in this loop, out of order
+C execution hides the latencies already.
+C
+C The way q1+1 is generated in %ebx and d is moved to %eax for the multiply
+C seems fastest.  The obvious change to generate q1+1 in %eax and then just
+C multiply by %ebp (as per mpn/x86/pentium/mod_1.asm in fact) runs 1 cycle
+C slower, for no obvious reason.
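+
+C The loop that follows is one divide step per limb of the mul-by-inverse
+C method (cf. Granlund and Montgomery, "Division by Invariant Integers
+C using Multiplication"), with b=2^32 and m = VAR_INVERSE =
+C floor((b*(b-d)-1)/d).  Each pass folds the next source limb n10 under
+C the running remainder n2 (always <d): n1 is the high bit of n10,
+C nadj = n10 + (-n1 & d) mod b, q1 = n2 + high(m*(n2+n1) + nadj), and the
+C next n2 is n - (q1+1)*d, with d added back if that subtraction underflows.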
+
+
+	ALIGN(16)
+L(inverse_top):
+	C eax	n10 (then scratch)
+	C ebx	scratch (nadj, q1)
+	C ecx	src pointer, decrementing
+	C edx	scratch
+	C esi	n10
+	C edi	n2
+	C ebp	divisor
+
+	movl	(%ecx), %eax		C next src limb
+	movl	%eax, %esi
+
+	sarl	$31, %eax		C -n1
+	movl	%ebp, %ebx
+
+	andl	%eax, %ebx		C -n1 & d
+	negl	%eax			C n1
+
+	addl	%edi, %eax		C n2+n1
+
+	mull	VAR_INVERSE		C m*(n2+n1)
+
+	addl	%esi, %ebx		C nadj = n10 + (-n1 & d), ignoring overflow
+	subl	$4, %ecx
+
+	C
+
+	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
+	leal	1(%edi), %ebx		C n2+1
+	movl	%ebp, %eax		C d
+
+	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
+	jz	L(q1_ff)
+
+	mull	%ebx			C (q1+1)*d
+
+	C
+
+	subl	%eax, %esi		C low n - (q1+1)*d
+
+	sbbl	%edx, %edi		C high n - (q1+1)*d, 0 or -1
+
+	andl	%ebp, %edi		C d if underflow
+
+	addl	%esi, %edi		C remainder with addback if necessary
+
+	cmpl	PARAM_SRC, %ecx
+	jae	L(inverse_top)
+
+
+C -----------------------------------------------------------------------------
+L(inverse_loop_done):
+
+	C %edi is the remainder modulo d*2^n and now must be reduced to
+	C 0<=r<d by calculating (r*2^n) % (d*2^n) and right shifting by n.

[...]

+C -----------------------------------------------------------------------------
+C
+C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF, meaning the low dword
+C of q*d is simply -d and the remainder n-q*d = n10+d.
+
+L(q1_ff):
+	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
+
+	cmpl	PARAM_SRC, %ecx
+	jae	L(inverse_top)
+
+	jmp	L(inverse_loop_done)
+
+EPILOGUE()
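
For reference, here is a rough C sketch of the mul-by-inverse remainder
calculation the header comments describe: r = a % (d*2^n) taken limb by
limb, then a final (r*2^n) % (d*2^n) and right shift by n.  This is an
illustration, not GMP code: the names mod_1_sketch, div_step, invert_limb
and clz32 are invented here, limbs are assumed to be 32 bits, and the
L(q1_ff) special case disappears because C can carry the (q1+1)*d product
in 64-bit arithmetic.

#include <stdint.h>
#include <stddef.h>

/* Leading zeros of a nonzero 32-bit value. */
static int clz32(uint32_t x)
{
    int n = 0;
    while (!(x & 0x80000000u)) { x <<= 1; n++; }
    return n;
}

/* m = floor((b*(b-d)-1)/d), b=2^32, for normalized d (high bit set).
   b*(b-d)-1 is the ~d:0xFFFFFFFF pair set up before the divl above. */
static uint32_t invert_limb(uint32_t d)
{
    return (uint32_t)((((uint64_t)~d << 32) | 0xFFFFFFFFu) / d);
}

/* One L(inverse_top) iteration: reduce n2:n10 mod d, given n2 < d. */
static uint32_t div_step(uint32_t n2, uint32_t n10, uint32_t d, uint32_t m)
{
    uint32_t n1 = n10 >> 31;
    uint32_t nadj = n10 + (n1 ? d : 0);       /* mod 2^32, as in the asm */
    uint64_t t = (uint64_t)m * (n2 + n1) + nadj;
    uint32_t q1 = n2 + (uint32_t)(t >> 32);
    uint64_t n = ((uint64_t)n2 << 32) | n10;
    uint64_t r = n - ((uint64_t)q1 * d + d);  /* n - (q1+1)*d, may wrap */
    if (r >> 63)
        r += d;                               /* addback on underflow */
    return (uint32_t)r;
}

/* Remainder of the size-limb number {src,size} mod d; size>=1, d!=0. */
static uint32_t mod_1_sketch(const uint32_t *src, size_t size, uint32_t d)
{
    int n = clz32(d);
    uint32_t dn = d << n;                 /* normalized divisor d*2^n */
    uint32_t m = invert_limb(dn);
    size_t i = size - 1;
    uint32_t r = src[i];                  /* high limb -> first n2 */
    if (r >= dn)
        r -= dn;                          /* cf. the subl/cmovnc above */
    while (i-- > 0)
        r = div_step(r, src[i], dn, m);   /* r = a % (d*2^n), limb by limb */
    if (n != 0)                           /* final (r*2^n) % (d*2^n) step */
        r = div_step(r >> (32 - n), r << n, dn, m) >> n;
    return r;
}

With the inverse in hand, the per-limb work is two 64-bit products (the two
mull instructions in the loop) plus adds and a conditional addback; the only
divl left on this path is the one that computes VAR_INVERSE.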