dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division. dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation, dnl Inc. dnl dnl This file is part of the GNU MP Library. dnl dnl The GNU MP Library is free software; you can redistribute it and/or dnl modify it under the terms of the GNU Lesser General Public License as dnl published by the Free Software Foundation; either version 3 of the dnl License, or (at your option) any later version. dnl dnl The GNU MP Library is distributed in the hope that it will be useful, dnl but WITHOUT ANY WARRANTY; without even the implied warranty of dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU dnl Lesser General Public License for more details. dnl dnl You should have received a copy of the GNU Lesser General Public License dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part. C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize, C mp_srcptr src, mp_size_t size, C mp_limb_t divisor); C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize, C mp_srcptr src, mp_size_t size, C mp_limb_t divisor, mp_limb_t carry); C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize, C mp_srcptr src, mp_size_t size, C mp_limb_t divisor, mp_limb_t inverse, C unsigned shift); C C Algorithm: C C The method and nomenclature follow part 8 of "Division by Invariant C Integers using Multiplication" by Granlund and Montgomery, reference in C gmp.texi. C C "m" is written for what is m' in the paper, and "d" for d_norm, which C won't cause any confusion since it's only the normalized divisor that's of C any use in the code. "b" is written for 2^N, the size of a limb, N being C 32 here. C C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets C us have just a psubq on the dependent chain. C C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here, C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much. C C Notes: C C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero C carry, since in normal circumstances that will be a very rare event. C C The test for skipping a division is branch free (once size>=1 is tested). C The store to the destination high limb is 0 when a divide is skipped, or C if it's not skipped then a copy of the src high limb is stored. The C latter is in case src==dst. C C There's a small bias towards expecting xsize==0, by having code for C xsize==0 in a straight line and xsize!=0 under forward jumps. C C Enhancements: C C The loop measures 32 cycles, but the dependent chain would suggest it C could be done with 30. Not sure where to start looking for the extras. C C Alternatives: C C If the divisor is normalized (high bit set) then a division step can C always be skipped, since the high destination limb is always 0 or 1 in C that case. It doesn't seem worth checking for this though, since it C probably occurs infrequently. dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by dnl inverse method is used, rather than plain "divl"s. Minimum value 1. dnl dnl The inverse takes about 80-90 cycles to calculate, but after that the dnl multiply is 32 c/l versus division at about 58 c/l. dnl dnl At 4 limbs the div is a touch faster than the mul (and of course dnl simpler), so start the mul from 5 limbs. deflit(MUL_THRESHOLD, 5) defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1 defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1 defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c defframe(PARAM_DIVISOR,20) defframe(PARAM_SIZE, 16) defframe(PARAM_SRC, 12) defframe(PARAM_XSIZE, 8) defframe(PARAM_DST, 4) dnl re-use parameter space define(SAVE_ESI,`PARAM_SIZE') define(SAVE_EBP,`PARAM_SRC') define(SAVE_EDI,`PARAM_DIVISOR') define(SAVE_EBX,`PARAM_DST') TEXT ALIGN(16) PROLOGUE(mpn_preinv_divrem_1) deflit(`FRAME',0) movl PARAM_SIZE, %ecx xorl %edx, %edx C carry if can't skip a div movl %esi, SAVE_ESI movl PARAM_SRC, %esi movl %ebp, SAVE_EBP movl PARAM_DIVISOR, %ebp movl %edi, SAVE_EDI movl PARAM_DST, %edi movl -4(%esi,%ecx,4), %eax C src high limb movl %ebx, SAVE_EBX movl PARAM_XSIZE, %ebx movd PARAM_PREINV_INVERSE, %mm4 movd PARAM_PREINV_SHIFT, %mm7 C l cmpl %ebp, %eax C high cmp divisor cmovc( %eax, %edx) C high is carry if high new n2 psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 ASSERT(be,`C 0 or -1 movd %mm3, %eax addl $1, %eax cmpl $1, %eax') paddd %mm3, %mm2 C q pand %mm5, %mm3 C mask & d paddd %mm3, %mm0 C addback if necessary movd %mm2, (%edi) leal -4(%edi), %edi subl $1, %ecx ja L(integer_top) L(integer_last): C eax C ebx xsize C ecx C edx C esi &src[0] C edi &dst[xsize] C C mm0 n2 C mm4 m C mm5 d C mm6 C mm7 l ASSERT(b,`C n2 n2 psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1 ASSERT(be,`C 0 or -1 movd %mm3, %eax addl $1, %eax cmpl $1, %eax') paddd %mm3, %mm2 C q pand %mm5, %mm3 C mask & d paddd %mm3, %mm0 C addback if necessary movd %mm2, (%edi) leal -4(%edi), %edi L(integer_none): C eax C ebx xsize orl %ebx, %ebx jnz L(fraction_some) C if xsize!=0 L(fraction_done): movl SAVE_EBP, %ebp psrld %mm7, %mm0 C remainder movl SAVE_EDI, %edi movd %mm0, %eax movl SAVE_ESI, %esi movl SAVE_EBX, %ebx emms ret C ----------------------------------------------------------------------------- C L(fraction_some): C eax C ebx xsize C ecx C edx C esi C edi &dst[xsize-1] C ebp L(fraction_top): C eax C ebx counter, xsize iterations C ecx C edx C esi src, decrementing C edi dst, decrementing C C mm0 n2 C mm4 m C mm5 d C mm6 32-l C mm7 l ASSERT(b,`C n2 n2 psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1 ASSERT(be,`C 0 or -1 movd %mm1, %eax addl $1, %eax cmpl $1, %eax') paddd %mm1, %mm2 C q pand %mm5, %mm1 C mask & d paddd %mm1, %mm0 C addback if necessary movd %mm2, (%edi) leal -4(%edi), %edi subl $1, %ebx jne L(fraction_top) jmp L(fraction_done) EPILOGUE()