dnl  Intel P6 mpn_lshsub_n -- mpn papillion support.

dnl  Copyright 2006 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C P6/13: 3.35 cycles/limb	(separate mpn_sub_n + mpn_lshift needs 4.12)

C (1) The loop is not scheduled in any way, and scheduling attempts have not
C     improved speed on P6/13.  Presumably, the K7 will want scheduling, if it
C     at all wants to use MMX.
C (2) We could save a register by not alternatingly using eax and edx in the
C     loop.
define(`rp',	`%edi')
define(`up',	`%esi')
define(`vp',	`%ebx')
define(`n',	`%ecx')
define(`cnt',	`%mm7')

ASM_START()

C mpn_lshsub_n -- subtract two limb vectors and shift the difference left.
C
C Syntax: AT&T, 32-bit x86 with MMX, all arguments on the stack.
C
C Stack arguments in order (read below at 16..32(%esp), after three pushes):
C	rp	destination limb pointer
C	up	minuend limb pointer
C	vp	subtrahend limb pointer
C	n	limb count
C	cnt	left shift count in bits
C
C For each limb, low to high, the loop forms d = up[i] - vp[i] - borrow with
C sbb and stores at rp[i] the low 32 bits of (d << cnt) with the top cnt bits
C of the previous difference shifted in at the bottom (zeros for the first
C limb).
C
C Return value (eax): (final borrow << cnt) | (last difference >> (32-cnt)).
C
C Requirements:
C   n >= 1.  With n = 0 the computed entry point is the start of the unrolled
C   loop, which would then process eight limbs.
C   cnt is presumably in the range 1..31 -- TODO confirm with callers.
C
C Clobbers: eax, ecx, edx, mm0, mm1, mm7, flags.  edi, esi and ebx are saved
C and restored.  emms is executed before returning.
C
C Register use inside the function:
C	rp, up, vp	point 4 bytes before the end of each operand
C	n		negative limb index, a multiple of 8, counting up to 0
C	mm7		32 - cnt, the right shift applied to each limb pair
C	mm0, mm1	alternately hold the current and the previous difference
C
C Code size matters here: the entry point into the 8-way unrolled loop is
C computed as L(ent) + 22*x, so every per-limb block must assemble to exactly
C 22 bytes.  A zero displacement would be dropped by gas, making that block
C three bytes shorter and every entry with x > 0 land inside an instruction.
C The pointers are therefore biased by -4 and the blocks use displacements
C 4..32, all of which need a one-byte displacement.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshsub_n)
	push	%edi
	push	%esi
	push	%ebx

	mov	16(%esp), rp
	mov	20(%esp), up
	mov	24(%esp), vp
	mov	28(%esp), n
	mov	$32, %eax
	sub	32(%esp), %eax
	movd	%eax, cnt			C mm7 = 32 - shift count

	C Point 4 bytes before the end of each operand.  The loop indexes with
	C a negative n and displacements 4..32, never a zero displacement.
	lea	-4(up,n,4), up
	lea	-4(vp,n,4), vp
	lea	-4(rp,n,4), rp

	neg	n
	mov	n, %eax
	and	$-8, n				C n = -(limb count rounded up to 8)
	and	$7, %eax			C x = blocks to skip in first pass
	shl	%eax				C eax = 2x, also leaves CF = 0
	lea	(%eax,%eax,4), %edx		C edx = 10x
ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	lea	L(ent)(%eax,%edx,2), %eax	C eax = L(ent) + 22x
')
	C Clear both MMX accumulators so that the first limb processed gets
	C zeros shifted in, whichever block is entered.  pxor leaves the
	C integer flags alone, so the borrow into the first sbb is unchanged.
	pxor	%mm1, %mm1
	pxor	%mm0, %mm0

	jmp	*%eax

ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	C The return address on the stack is L(here).  The final add also
	C sets CF, presumably to 0 since the address sum does not wrap.
	lea	(%eax,%edx,2), %eax		C eax = 22x
	add	$L(ent)-L(here), %eax
	add	(%esp), %eax
	ret_internal
')

L(end):	C compute (cy<<cnt) | (edx>>(32-cnt))
	C CF still holds the borrow from the last sbb, and edx the last
	C difference, since the final block of the loop always uses edx.
	sbb	%eax, %eax
	neg	%eax				C eax = borrow, 0 or 1
	mov	32(%esp), %ecx			C reload the shift count argument
	shld	%cl, %edx, %eax

	emms

	pop	%ebx
	pop	%esi
	pop	%edi
	ret

	C Unrolled loop, eight limbs per pass, 22 bytes per limb.  Nothing in
	C the loop other than sbb writes the carry flag (lea, jecxz, jmp and
	C the MMX instructions leave it alone), so the borrow chain runs
	C unbroken from the first limb to L(end).
	ALIGN(16)
L(top):	jecxz	L(end)
L(ent):	mov	    4(up,n,4), %eax
	sbb	    4(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   %mm7, %mm1
	movd	   %mm1, 4(rp,n,4)

	mov	    8(up,n,4), %edx
	sbb	    8(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   %mm7, %mm0
	movd	   %mm0, 8(rp,n,4)

	mov	   12(up,n,4), %eax
	sbb	   12(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   %mm7, %mm1
	movd	   %mm1, 12(rp,n,4)

	mov	   16(up,n,4), %edx
	sbb	   16(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   %mm7, %mm0
	movd	   %mm0, 16(rp,n,4)

	mov	   20(up,n,4), %eax
	sbb	   20(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   %mm7, %mm1
	movd	   %mm1, 20(rp,n,4)

	mov	   24(up,n,4), %edx
	sbb	   24(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   %mm7, %mm0
	movd	   %mm0, 24(rp,n,4)

	mov	   28(up,n,4), %eax
	sbb	   28(vp,n,4), %eax
	movd	   %eax, %mm0
	punpckldq  %mm0, %mm1
	psrlq	   %mm7, %mm1
	movd	   %mm1, 28(rp,n,4)

	mov	   32(up,n,4), %edx
	sbb	   32(vp,n,4), %edx
	movd	   %edx, %mm1
	punpckldq  %mm1, %mm0
	psrlq	   %mm7, %mm0
	movd	   %mm0, 32(rp,n,4)

	lea	8(n), n				C advance 8 limbs, flags untouched
	jmp	L(top)
EPILOGUE()