X-Git-Url: https://oss.titaniummirror.com/gitweb/?a=blobdiff_plain;f=gmp%2Fmpn%2Fx86%2Fk7%2Fmul_basecase.asm;fp=gmp%2Fmpn%2Fx86%2Fk7%2Fmul_basecase.asm;h=7f4c0002f7f43608ec38fd4aaa0134742ef11d62;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/x86/k7/mul_basecase.asm b/gmp/mpn/x86/k7/mul_basecase.asm
new file mode 100644
index 00000000..7f4c0002
--- /dev/null
+++ b/gmp/mpn/x86/k7/mul_basecase.asm
@@ -0,0 +1,591 @@
+dnl  AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
+
+dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K7: approx 4.42 cycles per cross product at around 20x20 limbs (32
+C     limbs/loop unrolling).
+
+
+
+dnl  K7 UNROLL_COUNT    cycles/product (at around 20x20)
+dnl           8              4.67
+dnl          16              4.59
+dnl          32              4.42
+dnl  Maximum possible with the current code is 32.
+dnl
+dnl  At 32 the typical 13-26 limb sizes from the Karatsuba code will get
+dnl  done with a straight run through a block of code, no inner loop.  Using
+dnl  32 gives 1k of code, but the k7 has a 64k L1 code cache.
+
+deflit(UNROLL_COUNT, 32)
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C Calculate xp,xsize multiplied by yp,ysize, storing the result in
+C wp,xsize+ysize.
+C
+C This routine is essentially the same as mpn/generic/mul_basecase.c, but
+C it's faster because it does most of the mpn_addmul_1() startup
+C calculations only once.  The saving is 15-25% on typical sizes coming
+C from the Karatsuba multiply code.
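+
+C In outline this is the same row-by-row method as the generic C code:
+C one mul_1 pass for yp[0], then an addmul_1 pass per remaining yp limb.
+C A rough C equivalent (ref_mul_basecase is an illustrative name, not a
+C GMP entrypoint; mpn_mul_1 and mpn_addmul_1 return the carry limb):
+C
+C	void
+C	ref_mul_basecase (mp_ptr wp, mp_srcptr xp, mp_size_t xsize,
+C	                  mp_srcptr yp, mp_size_t ysize)
+C	{
+C	  mp_size_t  j;
+C	  wp[xsize] = mpn_mul_1 (wp, xp, xsize, yp[0]);
+C	  for (j = 1; j < ysize; j++)
+C	    wp[xsize+j] = mpn_addmul_1 (wp+j, xp, xsize, yp[j]);
+C	}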
+
+ifdef(`PIC',`
+deflit(UNROLL_THRESHOLD, 5)
+',`
+deflit(UNROLL_THRESHOLD, 5)
+')
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+	movl	PARAM_XSIZE, %ecx
+	movl	PARAM_YP, %eax
+
+	movl	PARAM_XP, %edx
+	movl	(%eax), %eax	C yp low limb
+
+	cmpl	$2, %ecx
+	ja	L(xsize_more_than_two)
+	je	L(two_by_something)
+
+
+	C one limb by one limb
+
+	mull	(%edx)
+
+	movl	PARAM_WP, %ecx
+	movl	%eax, (%ecx)
+	movl	%edx, 4(%ecx)
+	ret
+
+
+C -----------------------------------------------------------------------------
+L(two_by_something):
+deflit(`FRAME',0)
+	decl	PARAM_YSIZE
+	pushl	%ebx	defframe_pushl(`SAVE_EBX')
+	movl	%eax, %ecx	C yp low limb
+
+	movl	PARAM_WP, %ebx
+	pushl	%esi	defframe_pushl(`SAVE_ESI')
+	movl	%edx, %esi	C xp
+
+	movl	(%edx), %eax	C xp low limb
+	jnz	L(two_by_two)
+
+
+	C two limbs by one limb
+
+	mull	%ecx
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+	movl	%edx, %esi	C carry
+
+	mull	%ecx
+
+	addl	%eax, %esi
+
+	movl	%esi, 4(%ebx)
+	movl	SAVE_ESI, %esi
+
+	adcl	$0, %edx
+
+	movl	%edx, 8(%ebx)
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+
+
+C -----------------------------------------------------------------------------
+C Could load yp earlier into another register.
+
+	ALIGN(16)
+L(two_by_two):
+	C eax	xp low limb
+	C ebx	wp
+	C ecx	yp low limb
+	C edx
+	C esi	xp
+	C edi
+	C ebp
+
+dnl  FRAME carries on from previous
+
+	mull	%ecx		C xp[0] * yp[0]
+
+	pushl	%edi	defframe_pushl(`SAVE_EDI')
+	movl	%edx, %edi	C carry, for wp[1]
+
+	movl	%eax, (%ebx)
+	movl	4(%esi), %eax
+
+	mull	%ecx		C xp[1] * yp[0]
+
+	addl	%eax, %edi
+	movl	PARAM_YP, %ecx
+
+	adcl	$0, %edx
+	movl	4(%ecx), %ecx	C yp[1]
+	movl	%edi, 4(%ebx)
+
+	movl	4(%esi), %eax	C xp[1]
+	movl	%edx, %edi	C carry, for wp[2]
+
+	mull	%ecx		C xp[1] * yp[1]
+
+	addl	%eax, %edi
+
+	adcl	$0, %edx
+	movl	(%esi), %eax	C xp[0]
+
+	movl	%edx, %esi	C carry, for wp[3]
+
+	mull	%ecx		C xp[0] * yp[1]
+
+	addl	%eax, 4(%ebx)
+	adcl	%edx, %edi
+	movl	%edi, 8(%ebx)
+
+	adcl	$0, %esi
+	movl	SAVE_EDI, %edi
+	movl	%esi, 12(%ebx)
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(16)
+L(xsize_more_than_two):
+
+C The first limb of yp is processed with a simple mpn_mul_1 style loop
+C inline.  Unrolling this doesn't seem worthwhile since it's only run once
+C (whereas the addmul below is run ysize-1 times).  A call to the actual
+C mpn_mul_1 would be slowed down by the call and parameter pushing and
+C popping, and doesn't seem likely to be worthwhile on the typical 13-26
+C limb operations the Karatsuba code calls here with.
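+C
+C Roughly, the mul1 loop below computes the following C, with umul_ppmm
+C being the double-limb multiply from longlong.h (cy, hi, lo and i are
+C illustrative names):
+C
+C	cy = 0;
+C	for (i = 0; i < xsize; i++)
+C	  {
+C	    umul_ppmm (hi, lo, xp[i], yp[0]);
+C	    lo += cy;
+C	    cy = hi + (lo < cy);	/* carry out of the low add */
+C	    wp[i] = lo;
+C	  }
+C	wp[xsize] = cy;			/* final carry limb */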
+
+	C eax	yp[0]
+	C ebx
+	C ecx	xsize
+	C edx	xp
+	C esi
+	C edi
+	C ebp
+
+dnl  FRAME doesn't carry on from previous, no pushes yet here
+defframe(`SAVE_EBX',-4)
+defframe(`SAVE_ESI',-8)
+defframe(`SAVE_EDI',-12)
+defframe(`SAVE_EBP',-16)
+deflit(`FRAME',0)
+
+	subl	$16, %esp
+deflit(`FRAME',16)
+
+	movl	%edi, SAVE_EDI
+	movl	PARAM_WP, %edi
+
+	movl	%ebx, SAVE_EBX
+	movl	%ebp, SAVE_EBP
+	movl	%eax, %ebp
+
+	movl	%esi, SAVE_ESI
+	xorl	%ebx, %ebx
+	leal	(%edx,%ecx,4), %esi	C xp end
+
+	leal	(%edi,%ecx,4), %edi	C wp end of mul1
+	negl	%ecx
+
+
+L(mul1):
+	C eax	scratch
+	C ebx	carry
+	C ecx	counter, negative
+	C edx	scratch
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp	multiplier
+
+	movl	(%esi,%ecx,4), %eax
+
+	mull	%ebp
+
+	addl	%ebx, %eax
+	movl	%eax, (%edi,%ecx,4)
+	movl	$0, %ebx
+
+	adcl	%edx, %ebx
+	incl	%ecx
+	jnz	L(mul1)
+
+
+	movl	PARAM_YSIZE, %edx
+	movl	PARAM_XSIZE, %ecx
+
+	movl	%ebx, (%edi)		C final carry
+	decl	%edx
+
+	jnz	L(ysize_more_than_one)
+
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+
+	movl	SAVE_EBP, %ebp
+	movl	SAVE_ESI, %esi
+	addl	$FRAME, %esp
+
+	ret
+
+
+L(ysize_more_than_one):
+	cmpl	$UNROLL_THRESHOLD, %ecx
+	movl	PARAM_YP, %eax
+
+	jae	L(unroll)
+
+
+C -----------------------------------------------------------------------------
+	C simple addmul looping
+	C
+	C eax	yp
+	C ebx
+	C ecx	xsize
+	C edx	ysize-1
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp
+
+	leal	4(%eax,%edx,4), %ebp	C yp end
+	negl	%ecx
+	negl	%edx
+
+	movl	(%esi,%ecx,4), %eax	C xp low limb
+	movl	%edx, PARAM_YSIZE	C -(ysize-1)
+	incl	%ecx
+
+	xorl	%ebx, %ebx		C initial carry
+	movl	%ecx, PARAM_XSIZE	C -(xsize-1)
+	movl	%ebp, PARAM_YP
+
+	movl	(%ebp,%edx,4), %ebp	C yp second lowest limb - multiplier
+	jmp	L(simple_outer_entry)
+
+
+	C this is offset 0x121 so close enough to aligned
+L(simple_outer_top):
+	C ebp	ysize counter, negative
+
+	movl	PARAM_YP, %edx
+	movl	PARAM_XSIZE, %ecx	C -(xsize-1)
+	xorl	%ebx, %ebx		C carry
+
+	movl	%ebp, PARAM_YSIZE
+	addl	$4, %edi		C next position in wp
+
+	movl	(%edx,%ebp,4), %ebp	C yp limb - multiplier
+	movl	-4(%esi,%ecx,4), %eax	C xp low limb
+
+
+L(simple_outer_entry):
+
+L(simple_inner):
+	C eax	xp limb
+	C ebx	carry limb
+	C ecx	loop counter (negative)
+	C edx	scratch
+	C esi	xp end
+	C edi	wp end
+	C ebp	multiplier
+
+	mull	%ebp
+
+	addl	%eax, %ebx
+	adcl	$0, %edx
+
+	addl	%ebx, (%edi,%ecx,4)
+	movl	(%esi,%ecx,4), %eax
+	adcl	$0, %edx
+
+	incl	%ecx
+	movl	%edx, %ebx
+	jnz	L(simple_inner)
+
+
+	mull	%ebp
+
+	movl	PARAM_YSIZE, %ebp
+	addl	%eax, %ebx
+
+	adcl	$0, %edx
+	addl	%ebx, (%edi)
+
+	adcl	$0, %edx
+	incl	%ebp
+
+	movl	%edx, 4(%edi)
+	jnz	L(simple_outer_top)
+
+
+	movl	SAVE_EBX, %ebx
+	movl	SAVE_ESI, %esi
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBP, %ebp
+	addl	$FRAME, %esp
+
+	ret
+
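+
+C Each pass of the simple loop above accumulates one yp limb into wp,
+C roughly this C per pass (umul_ppmm from longlong.h again; y, cy, hi,
+C lo and i are illustrative names):
+C
+C	cy = 0;
+C	for (i = 0; i < xsize; i++)
+C	  {
+C	    umul_ppmm (hi, lo, xp[i], y);
+C	    lo += cy;
+C	    cy = hi + (lo < cy);
+C	    wp[i] += lo;
+C	    cy += (wp[i] < lo);		/* carry from the accumulate */
+C	  }
+C	wp[xsize] = cy;			/* stored at the row's top limb */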
+
+
+C -----------------------------------------------------------------------------
+C
+C The unrolled loop is the same as in mpn_addmul_1(), see that code for some
+C comments.
+C
+C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
+C increment xp and wp.  This is used to adjust back xp and wp, and rshifted
+C to give an initial VAR_COUNTER at the top of the outer loop.
+C
+C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
+C up to -1, inclusive.
+C
+C VAR_JMP is the computed jump into the unrolled loop.
+C
+C VAR_XP_LOW is the least significant limb of xp, which is needed at the
+C start of the unrolled loop.
+C
+C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
+C inclusive.
+C
+C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
+C added to give the location of the next limb of yp, which is the multiplier
+C in the unrolled loop.
+C
+C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
+C outer loop to take care of xp, wp and the inner loop counter.
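+C
+C The computed jump amounts to a Duff's device: enter the unrolled block
+C partway through, so a partial first pass leaves only whole
+C UNROLL_COUNT-limb passes.  A rough C picture, assuming an unroll of 4
+C (STEP, skip, passes and i are illustrative names; the asm instead adds
+C 17 code bytes per skipped limb to the L(unroll_entry) address):
+C
+C	skip   = (-(xsize-1)) & 3;	/* limbs to skip in first pass */
+C	passes = (xsize-1 + 3) >> 2;	/* pass count, rounded up */
+C	i      = -skip;			/* cf. the VAR_ADJUST fixup */
+C	switch (skip) {
+C	case 0:	do {	STEP (i+0);
+C	case 1:		STEP (i+1);
+C	case 2:		STEP (i+2);
+C	case 3:		STEP (i+3);
+C			i += 4;
+C		} while (--passes);
+C	}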
+
+defframe(VAR_COUNTER, -20)
+defframe(VAR_ADJUST,  -24)
+defframe(VAR_JMP,     -28)
+defframe(VAR_XP_LOW,  -32)
+deflit(VAR_EXTRA_SPACE, 16)
+
+
+L(unroll):
+	C eax	yp
+	C ebx
+	C ecx	xsize
+	C edx	ysize-1
+	C esi	xp end
+	C edi	wp end of mul1
+	C ebp
+
+	movl	PARAM_XP, %esi
+	movl	4(%eax), %ebp		C multiplier (yp second limb)
+	leal	4(%eax,%edx,4), %eax	C yp adjust for ysize indexing
+
+	movl	PARAM_WP, %edi
+	movl	%eax, PARAM_YP
+	negl	%edx
+
+	movl	%edx, PARAM_YSIZE
+	leal	UNROLL_COUNT-2(%ecx), %ebx	C (xsize-1)+UNROLL_COUNT-1
+	decl	%ecx				C xsize-1
+
+	movl	(%esi), %eax		C xp low limb
+	andl	$-UNROLL_MASK-1, %ebx
+	negl	%ecx
+
+	subl	$VAR_EXTRA_SPACE, %esp
+deflit(`FRAME',16+VAR_EXTRA_SPACE)
+	negl	%ebx
+	andl	$UNROLL_MASK, %ecx
+
+	movl	%ebx, VAR_ADJUST
+	movl	%ecx, %edx
+	shll	$4, %ecx
+
+	sarl	$UNROLL_LOG2, %ebx
+
+	C 17 code bytes per limb
+ifdef(`PIC',`
+	call	L(pic_calc)
+L(unroll_here):
+',`
+	leal	L(unroll_entry) (%ecx,%edx,1), %ecx
+')
+	negl	%edx
+
+	movl	%eax, VAR_XP_LOW
+	movl	%ecx, VAR_JMP
+	leal	4(%edi,%edx,4), %edi	C wp and xp, adjust for unrolling,
+	leal	4(%esi,%edx,4), %esi	C and start at second limb
+	jmp	L(unroll_outer_entry)
+
+
+ifdef(`PIC',`
+L(pic_calc):
+	C See mpn/x86/README about old gas bugs
+	leal	(%ecx,%edx,1), %ecx
+	addl	$L(unroll_entry)-L(unroll_here), %ecx
+	addl	(%esp), %ecx
+	ret_internal
+')
+
+
+C --------------------------------------------------------------------------
+	ALIGN(32)
+L(unroll_outer_top):
+	C ebp	ysize counter, negative
+
+	movl	VAR_ADJUST, %ebx
+	movl	PARAM_YP, %edx
+
+	movl	VAR_XP_LOW, %eax
+	movl	%ebp, PARAM_YSIZE	C store incremented ysize counter
+
+	leal	4(%edi,%ebx,4), %edi
+	leal	(%esi,%ebx,4), %esi
+	sarl	$UNROLL_LOG2, %ebx
+
+	movl	(%edx,%ebp,4), %ebp	C yp next multiplier
+	movl	VAR_JMP, %ecx
+
+L(unroll_outer_entry):
+	mull	%ebp
+
+	testb	$1, %cl		C and clear carry bit
+	movl	%ebx, VAR_COUNTER
+	movl	$0, %ebx
+
+	movl	$0, %ecx
+	cmovz(	%eax, %ecx)	C eax into low carry, zero into high carry limb
+	cmovnz(	%eax, %ebx)
+
+	C Extra fetch of VAR_JMP is bad, but registers are tight
+	jmp	*VAR_JMP
+
+
+C -----------------------------------------------------------------------------
+	ALIGN(32)
+L(unroll_top):
+	C eax	xp limb
+	C ebx	carry high
+	C ecx	carry low
+	C edx	scratch
+	C esi	xp+8
+	C edi	wp
+	C ebp	yp multiplier limb
+	C
+	C VAR_COUNTER  loop counter, negative
+	C
+	C 17 bytes each limb
+
+L(unroll_entry):
+
+deflit(CHUNK_COUNT,2)
+forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
+	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
+	deflit(`disp1', eval(disp0 + 4))
+
+Zdisp(	movl,	disp0,(%esi), %eax)
+	adcl	%edx, %ebx
+
+	mull	%ebp
+
+Zdisp(	addl,	%ecx, disp0,(%edi))
+	movl	$0, %ecx
+
+	adcl	%eax, %ebx
+
+
+	movl	disp1(%esi), %eax
+	adcl	%edx, %ecx
+
+	mull	%ebp
+
+	addl	%ebx, disp1(%edi)
+	movl	$0, %ebx
+
+	adcl	%eax, %ecx
+')
+
+
+	incl	VAR_COUNTER
+	leal	UNROLL_BYTES(%esi), %esi
+	leal	UNROLL_BYTES(%edi), %edi
+
+	jnz	L(unroll_top)
+
+
+	C eax
+	C ebx	zero
+	C ecx	low
+	C edx	high
+	C esi
+	C edi	wp, pointing at second last limb
+	C ebp
+	C
+	C carry flag to be added to high
+
+deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
+deflit(`disp1', eval(disp0-0 + 4))
+
+	movl	PARAM_YSIZE, %ebp
+	adcl	$0, %edx
+	addl	%ecx, disp0(%edi)
+
+	adcl	$0, %edx
+	incl	%ebp
+
+	movl	%edx, disp1(%edi)
+	jnz	L(unroll_outer_top)
+
+
+	movl	SAVE_ESI, %esi
+	movl	SAVE_EBP, %ebp
+
+	movl	SAVE_EDI, %edi
+	movl	SAVE_EBX, %ebx
+	addl	$FRAME, %esp
+
+	ret
+
+EPILOGUE()