Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / x86 / pentium / mmx / mul_1.asm
diff --git a/gmp/mpn/x86/pentium/mmx/mul_1.asm b/gmp/mpn/x86/pentium/mmx/mul_1.asm

new file mode 100644 (file)

index 0000000..b9fe77e
--- /dev/null
+++ b/gmp/mpn/x86/pentium/mmx/mul_1.asm
@@ -0,0 +1,360 @@
+dnl  Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
+
+dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C    cycles/limb
+C P5:   12.0   for 32-bit multiplier
+C        7.0   for 16-bit multiplier
+
+
+C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                      mp_limb_t multiplier);
+C
+C When the multiplier is 16 bits some special case MMX code is used.  Small
+C multipliers might arise reasonably often from mpz_mul_ui etc.  If the size
+C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
+C size==8 end up being quite close.  If src isn't aligned to an 8 byte
+C boundary then one limb is processed separately with roughly a 5 cycle
+C penalty, so in that case it's say size==8 and size==9 which are close.
+C
+C Alternatives:
+C
+C MMX is not believed to be of any use for 32-bit multipliers, since for
+C instance the current method would just have to be more or less duplicated
+C for the high and low halves of the multiplier, and would probably
+C therefore run at about 14 cycles, which is slower than the plain integer
+C at 12.
+C
+C Adding the high and low MMX products using integer code seems best.  An
+C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
+C any joy.  Perhaps something could be done keeping the values signed and
+C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
+C perhaps not.
+C
+C Future:
+C
+C An mpn_mul_1c entrypoint would need a double carry out of the low result
+C limb in the 16-bit code, unless it could be assumed the carry fits in 16
+C bits, possibly as carry<multiplier, this being true of a big calculation
+C done piece by piece.  But let's worry about that if/when mul_1c is
+C actually used.
+
+defframe(PARAM_MULTIPLIER,16)  C mp_limb_t multiplier
+defframe(PARAM_SIZE,      12)  C mp_size_t size
+defframe(PARAM_SRC,       8)   C mp_srcptr src
+defframe(PARAM_DST,       4)   C mp_ptr dst
+
+       TEXT
+
+       ALIGN(8)
+PROLOGUE(mpn_mul_1)
+deflit(`FRAME',0)
+
+       movl    PARAM_SIZE, %ecx
+       movl    PARAM_SRC, %edx
+
+       cmpl    $1, %ecx
+       jne     L(two_or_more)
+
+       C one limb only, a single mull produces the whole result
+
+       movl    PARAM_MULTIPLIER, %eax
+       movl    PARAM_DST, %ecx
+
+       mull    (%edx)                  C edx:eax = m * src[0]
+
+       movl    %eax, (%ecx)            C dst[0] = low limb
+       movl    %edx, %eax              C return the high limb
+
+       ret
+
+
+L(two_or_more):
+       C eax   (multiplier, loaded below)
+       C ebx
+       C ecx   size
+       C edx   src
+       C esi   (becomes src end)
+       C edi   (becomes dst end)
+       C ebp
+
+       pushl   %esi            FRAME_pushl()
+       pushl   %edi            FRAME_pushl()
+
+       movl    %edx, %esi              C src
+       movl    PARAM_DST, %edi
+
+       movl    PARAM_MULTIPLIER, %eax
+       pushl   %ebx            FRAME_pushl()
+
+       leal    (%esi,%ecx,4), %esi     C src end
+       leal    (%edi,%ecx,4), %edi     C dst end
+
+       negl    %ecx                    C -size
+
+       pushl   %ebp            FRAME_pushl()
+       cmpl    $65536, %eax            C is m below 2^16?
+
+       jb      L(small)                C yes, try the 16-bit MMX code
+
+
+L(big):
+       xorl    %ebx, %ebx              C carry limb
+       sarl    %ecx                    C -size/2
+
+       jnc     L(top)                  C with carry flag clear
+
+
+       C size was odd, process one limb separately
+
+       mull    4(%esi,%ecx,8)          C m * src[0]
+
+       movl    %eax, 4(%edi,%ecx,8)    C dst[0]
+       incl    %ecx                    C counter now -(size-1)/2
+
+       orl     %edx, %ebx              C carry limb, and clear carry flag
+
+
+L(top):
+       C eax
+       C ebx   carry
+       C ecx   counter, negative
+       C edx
+       C esi   src end
+       C edi   dst end
+       C ebp   (scratch carry)
+
+       adcl    $0, %ebx                C fold in pending carry flag
+       movl    (%esi,%ecx,8), %eax     C first limb of pair
+
+       mull    PARAM_MULTIPLIER        C edx:eax = limb * m
+
+       movl    %edx, %ebp              C high limb of first product
+       addl    %eax, %ebx              C low limb plus incoming carry limb
+
+       adcl    $0, %ebp                C carry into the high limb
+       movl    4(%esi,%ecx,8), %eax    C second limb of pair
+
+       mull    PARAM_MULTIPLIER        C edx:eax = limb * m
+
+       movl    %ebx, (%edi,%ecx,8)     C store first result limb
+       addl    %ebp, %eax              C carry flag is left for next adcl
+
+       movl    %eax, 4(%edi,%ecx,8)    C store second result limb
+       incl    %ecx                    C incl leaves the carry flag alone
+
+       movl    %edx, %ebx              C new carry limb
+       jnz     L(top)
+
+
+       adcl    $0, %ebx                C final pending carry flag
+       popl    %ebp
+
+       movl    %ebx, %eax              C return the carry limb
+       popl    %ebx
+
+       popl    %edi
+       popl    %esi
+
+       ret
+
+
+L(small):
+       C Special case for 16-bit multiplier.
+       C
+       C eax   multiplier
+       C ebx
+       C ecx   -size
+       C edx   src
+       C esi   src end
+       C edi   dst end
+       C ebp   (saved, not otherwise used on this path)
+
+       C size<3 not supported here.  At size==3 we're already a couple of
+       C cycles faster, so there's no threshold as such, just use the MMX
+       C as soon as possible.
+
+       cmpl    $-3, %ecx               C unsigned compare of -size
+       ja      L(big)                  C size<3, use the integer loop
+
+       movd    %eax, %mm7              C m
+       pxor    %mm6, %mm6              C initial carry word
+
+       punpcklwd %mm7, %mm7            C m replicated 2 times
+       addl    $2, %ecx                C -size+2
+
+       punpckldq %mm7, %mm7            C m replicated 4 times
+       andl    $4, %edx                C test alignment, clear carry flag
+
+       movq    %mm7, %mm0              C m
+       jz      L(small_entry)
+
+
+       C Source is unaligned, process one limb separately.
+       C
+       C Plain integer code is used here, since it's smaller and is about
+       C the same 13 cycles as an mmx block would be.
+       C
+       C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
+       C the use of separate incl and orl.
+
+       mull    -8(%esi,%ecx,4)         C m * src[0]
+
+       movl    %eax, -8(%edi,%ecx,4)   C dst[0]
+       incl    %ecx                    C one limb processed
+
+       movd    %edx, %mm6              C initial carry
+
+       orl     %eax, %eax              C clear carry flag
+       jmp     L(small_entry)
+
+
+C The scheduling here is quite tricky, since so many instructions have
+C pairing restrictions.  In particular the js won't pair with a movd, and
+C can't be paired with an adc since it wants flags from the inc, so
+C instructions are rotated to the top of the loop to find somewhere useful
+C for it.
+C
+C Trouble has been taken to avoid overlapping successive loop iterations,
+C since that would greatly increase the size of the startup and finishup
+C code.  Actually there's probably not much advantage to be had from
+C overlapping anyway, since the difficulties are mostly with pairing, not
+C with latencies as such.
+C
+C In the comments x represents the src data and m the multiplier (16
+C bits, but replicated 4 times).
+C
+C The m signs calculated in %mm3 are a loop invariant and could be held in
+C say %mm5, but that would save only one instruction and hence be no faster.
+
+L(small_top):
+       C eax   l.low, then l.high
+       C ebx   (h.low)
+       C ecx   counter, -size+2 to 0 or 1
+       C edx   (h.high)
+       C esi   &src[size]
+       C edi   &dst[size]
+       C ebp
+       C
+       C %mm0  (high products)
+       C %mm1  (low products)
+       C %mm2  (adjust for m using x signs)
+       C %mm3  (adjust for x using m signs)
+       C %mm4
+       C %mm5
+       C %mm6  h.low, then carry
+       C %mm7  m replicated 4 times
+
+       movd    %mm6, %ebx              C h.low
+       psrlq   $32, %mm1               C l.high
+
+       movd    %mm0, %edx              C h.high
+       movq    %mm0, %mm6              C new c
+
+       adcl    %eax, %ebx              C first result limb, carry flag in
+       incl    %ecx
+
+       movd    %mm1, %eax              C l.high
+       movq    %mm7, %mm0
+
+       adcl    %eax, %edx              C second result limb
+       movl    %ebx, -16(%edi,%ecx,4)
+
+       movl    %edx, -12(%edi,%ecx,4)
+       psrlq   $32, %mm6               C c
+
+L(small_entry):
+       pmulhw  -8(%esi,%ecx,4), %mm0   C h = (x*m).high
+       movq    %mm7, %mm1
+
+       pmullw  -8(%esi,%ecx,4), %mm1   C l = (x*m).low
+       movq    %mm7, %mm3
+
+       movq    -8(%esi,%ecx,4), %mm2   C x
+       psraw   $15, %mm3               C m signs
+
+       pand    -8(%esi,%ecx,4), %mm3   C x selected by m signs
+       psraw   $15, %mm2               C x signs
+
+       paddw   %mm3, %mm0              C add x to h if m neg
+       pand    %mm7, %mm2              C m selected by x signs
+
+       paddw   %mm2, %mm0              C add m to h if x neg
+       incl    %ecx                    C flags for the js, carry untouched
+
+       movd    %mm1, %eax              C l.low
+       punpcklwd %mm0, %mm6            C c + h.low << 16
+
+       psrlq   $16, %mm0               C h.high
+       js      L(small_top)
+
+
+       C Loop done: finish the last pair of limbs, leaving the carry limb
+       C in eax.
+
+       movd    %mm6, %ebx              C h.low
+       psrlq   $32, %mm1               C l.high
+
+       adcl    %eax, %ebx              C first result limb, carry flag in
+       popl    %ebp            FRAME_popl()
+
+       movd    %mm0, %edx              C h.high
+       psrlq   $32, %mm0               C c, the top word of h
+
+       movd    %mm1, %eax              C l.high
+
+       adcl    %eax, %edx              C second result limb
+       movl    %ebx, -12(%edi,%ecx,4)
+
+       movd    %mm0, %eax              C c
+
+       adcl    $0, %eax                C carry limb out of the MMX part
+       movl    %edx, -8(%edi,%ecx,4)
+
+       orl     %ecx, %ecx              C test the final counter value
+       jnz     L(small_done)           C final %ecx==1 means even, ==0 odd
+
+
+       C Size odd, one extra limb to process.
+       C Plain integer code is used here, since it's smaller and is about
+       C the same speed as another mmx block would be.
+
+       movl    %eax, %ecx              C carry limb so far
+       movl    PARAM_MULTIPLIER, %eax
+
+       mull    -4(%esi)                C m * src[size-1]
+
+       addl    %ecx, %eax              C add the carry limb
+
+       adcl    $0, %edx
+       movl    %eax, -4(%edi)          C dst[size-1]
+
+       movl    %edx, %eax              C return the carry limb
+L(small_done):
+       popl    %ebx
+
+       popl    %edi
+       popl    %esi
+
+       emms                            C done with the MMX registers
+
+       ret
+
+EPILOGUE()