Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / x86 / mul_basecase.asm
diff --git a/gmp/mpn/x86/mul_basecase.asm b/gmp/mpn/x86/mul_basecase.asm

new file mode 100644 (file)

index 0000000..7918ea0
--- /dev/null
+++ b/gmp/mpn/x86/mul_basecase.asm
@@ -0,0 +1,213 @@
+dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
+dnl  in a third limb vector.
+
+dnl  Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free Software
+dnl  Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C     cycles/crossproduct
+C P5:     15
+C P6:      7.5
+C K6:     12.5
+C K7:      5.5
+C P4:     24
+
+
+C void mpn_mul_basecase (mp_ptr wp,
+C                        mp_srcptr xp, mp_size_t xsize,
+C                        mp_srcptr yp, mp_size_t ysize);
+C
+C This was written in a haste since the Pentium optimized code that was used
+C for all x86 machines was slow for the Pentium II.  This code would benefit
+C from some cleanup.
+C
+C To shave off some percentage of the run-time, one should make 4 variants
+C of the Louter loop, for the four different outcomes of un mod 4.  That
+C would avoid Loop0 altogether.  Code expansion would be > 4-fold for that
+C part of the function, but since it is not very large, that would be
+C acceptable.
+C
+C The mul loop (at L(oopM)) might need some tweaking.  It's current speed is
+C unknown.
+
+defframe(PARAM_YSIZE,20)
+defframe(PARAM_YP,   16)
+defframe(PARAM_XSIZE,12)
+defframe(PARAM_XP,   8)
+defframe(PARAM_WP,   4)
+
+defframe(VAR_MULTIPLIER, -4)
+defframe(VAR_COUNTER,    -8)
+deflit(VAR_STACK_SPACE,  8)
+
+       TEXT
+       ALIGN(8)
+
+PROLOGUE(mpn_mul_basecase)
+deflit(`FRAME',0)
+
+       subl    $VAR_STACK_SPACE,%esp
+       pushl   %esi
+       pushl   %ebp
+       pushl   %edi
+deflit(`FRAME',eval(VAR_STACK_SPACE+12))
+
+       movl    PARAM_XP,%esi
+       movl    PARAM_WP,%edi
+       movl    PARAM_YP,%ebp
+
+       movl    (%esi),%eax             C load xp[0]
+       mull    (%ebp)                  C multiply by yp[0]
+       movl    %eax,(%edi)             C store to wp[0]
+       movl    PARAM_XSIZE,%ecx        C xsize
+       decl    %ecx                    C If xsize = 1, ysize = 1 too
+       jz      L(done)
+
+       pushl   %ebx
+FRAME_pushl()
+       movl    %edx,%ebx
+
+       leal    4(%esi),%esi
+       leal    4(%edi),%edi
+
+L(oopM):
+       movl    (%esi),%eax             C load next limb at xp[j]
+       leal    4(%esi),%esi
+       mull    (%ebp)
+       addl    %ebx,%eax
+       movl    %edx,%ebx
+       adcl    $0,%ebx
+       movl    %eax,(%edi)
+       leal    4(%edi),%edi
+       decl    %ecx
+       jnz     L(oopM)
+
+       movl    %ebx,(%edi)             C most significant limb of product
+       addl    $4,%edi                 C increment wp
+       movl    PARAM_XSIZE,%eax
+       shll    $2,%eax
+       subl    %eax,%edi
+       subl    %eax,%esi
+
+       movl    PARAM_YSIZE,%eax        C ysize
+       decl    %eax
+       jz      L(skip)
+       movl    %eax,VAR_COUNTER        C set index i to ysize
+
+L(outer):
+       movl    PARAM_YP,%ebp           C yp
+       addl    $4,%ebp                 C make ebp point to next v limb
+       movl    %ebp,PARAM_YP
+       movl    (%ebp),%eax             C copy y limb ...
+       movl    %eax,VAR_MULTIPLIER     C ... to stack slot
+       movl    PARAM_XSIZE,%ecx
+
+       xorl    %ebx,%ebx
+       andl    $3,%ecx
+       jz      L(end0)
+
+L(oop0):
+       movl    (%esi),%eax
+       mull    VAR_MULTIPLIER
+       leal    4(%esi),%esi
+       addl    %ebx,%eax
+       movl    $0,%ebx
+       adcl    %ebx,%edx
+       addl    %eax,(%edi)
+       adcl    %edx,%ebx               C propagate carry into cylimb
+
+       leal    4(%edi),%edi
+       decl    %ecx
+       jnz     L(oop0)
+
+L(end0):
+       movl    PARAM_XSIZE,%ecx
+       shrl    $2,%ecx
+       jz      L(endX)
+
+       ALIGN(8)
+L(oopX):
+       movl    (%esi),%eax
+       mull    VAR_MULTIPLIER
+       addl    %eax,%ebx
+       movl    $0,%ebp
+       adcl    %edx,%ebp
+
+       movl    4(%esi),%eax
+       mull    VAR_MULTIPLIER
+       addl    %ebx,(%edi)
+       adcl    %eax,%ebp       C new lo + cylimb
+       movl    $0,%ebx
+       adcl    %edx,%ebx
+
+       movl    8(%esi),%eax
+       mull    VAR_MULTIPLIER
+       addl    %ebp,4(%edi)
+       adcl    %eax,%ebx       C new lo + cylimb
+       movl    $0,%ebp
+       adcl    %edx,%ebp
+
+       movl    12(%esi),%eax
+       mull    VAR_MULTIPLIER
+       addl    %ebx,8(%edi)
+       adcl    %eax,%ebp       C new lo + cylimb
+       movl    $0,%ebx
+       adcl    %edx,%ebx
+
+       addl    %ebp,12(%edi)
+       adcl    $0,%ebx         C propagate carry into cylimb
+
+       leal    16(%esi),%esi
+       leal    16(%edi),%edi
+       decl    %ecx
+       jnz     L(oopX)
+
+L(endX):
+       movl    %ebx,(%edi)
+       addl    $4,%edi
+
+       C we incremented wp and xp in the loop above; compensate
+       movl    PARAM_XSIZE,%eax
+       shll    $2,%eax
+       subl    %eax,%edi
+       subl    %eax,%esi
+
+       movl    VAR_COUNTER,%eax
+       decl    %eax
+       movl    %eax,VAR_COUNTER
+       jnz     L(outer)
+
+L(skip):
+       popl    %ebx
+       popl    %edi
+       popl    %ebp
+       popl    %esi
+       addl    $8,%esp
+       ret
+
+L(done):
+       movl    %edx,4(%edi)       C store to wp[1]
+       popl    %edi
+       popl    %ebp
+       popl    %esi
+       addl    $8,%esp
+       ret
+
+EPILOGUE()