Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / x86 / pentium / copyi.asm
diff --git a/gmp/mpn/x86/pentium/copyi.asm b/gmp/mpn/x86/pentium/copyi.asm

new file mode 100644 (file)

index 0000000..9da08e2
--- /dev/null
+++ b/gmp/mpn/x86/pentium/copyi.asm
@@ -0,0 +1,153 @@
+dnl  Intel Pentium mpn_copyi -- copy limb vector, incrementing.
+
+dnl  Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.25 cycles/limb
+
+
+C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C Destination prefetching is done to avoid repeated write-throughs on lines
+C not already in L1.
+C
+C At least one of the src or dst pointer needs to be incremented rather than
+C using indexing, so that there's somewhere to put the loop control without
+C an AGI.  Incrementing one and not two lets us keep loop overhead to 2
+C cycles.  Making it the src pointer incremented avoids an AGI on the %ecx
+C subtracts in the finishup code.
+C
+C The block of finishup code is almost as big as the main loop itself, which
+C is unfortunate, but it's faster that way than with say rep movsl, by about
+C 10 cycles for instance on P55.
+C
+C There's nothing to be gained from MMX on P55, since it can do only one
+C movq load (or store) per cycle, so the throughput would be the same as the
+C code here (and even then only if src and dst have the same alignment mod
+C 8).
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+       TEXT
+       ALIGN(8)
+PROLOGUE(mpn_copyi)
+deflit(`FRAME',0)
+
+       movl    PARAM_SIZE, %ecx
+       movl    PARAM_DST, %edx
+
+       pushl   %ebx    FRAME_pushl()
+       pushl   %esi    FRAME_pushl()
+
+       leal    (%edx,%ecx,4), %edx     C &dst[size-1]
+       xorl    $-1, %ecx               C -size-1
+
+       movl    PARAM_SRC, %esi
+       addl    $8, %ecx                C -size+7
+
+       jns     L(end)
+
+       movl    -28(%edx,%ecx,4), %eax  C fetch destination cache line, dst[0]
+       nop
+
+L(top):
+       C eax   scratch
+       C ebx   scratch
+       C ecx   counter, limbs, negative
+       C edx   &dst[size-1]
+       C esi   src, incrementing
+       C edi
+       C ebp
+
+       movl    (%edx,%ecx,4), %eax     C fetch destination cache line
+       addl    $8, %ecx
+
+       movl    (%esi), %eax            C read words pairwise
+       movl    4(%esi), %ebx
+       movl    %eax, -60(%edx,%ecx,4)  C store words pairwise
+       movl    %ebx, -56(%edx,%ecx,4)
+
+       movl    8(%esi), %eax
+       movl    12(%esi), %ebx
+       movl    %eax, -52(%edx,%ecx,4)
+       movl    %ebx, -48(%edx,%ecx,4)
+
+       movl    16(%esi), %eax
+       movl    20(%esi), %ebx
+       movl    %eax, -44(%edx,%ecx,4)
+       movl    %ebx, -40(%edx,%ecx,4)
+
+       movl    24(%esi), %eax
+       movl    28(%esi), %ebx
+       movl    %eax, -36(%edx,%ecx,4)
+       movl    %ebx, -32(%edx,%ecx,4)
+
+       leal    32(%esi), %esi
+       js      L(top)
+
+
+L(end):
+       C ecx   0 to 7, representing respectively 7 to 0 limbs remaining
+       C esi   src end
+       C edx   dst, next location to store
+
+       subl    $4, %ecx
+       jns     L(no4)
+
+       movl    (%esi), %eax
+       movl    4(%esi), %ebx
+       movl    %eax, -12(%edx,%ecx,4)
+       movl    %ebx, -8(%edx,%ecx,4)
+
+       movl    8(%esi), %eax
+       movl    12(%esi), %ebx
+       movl    %eax, -4(%edx,%ecx,4)
+       movl    %ebx, (%edx,%ecx,4)
+
+       addl    $16, %esi
+       addl    $4, %ecx
+L(no4):
+
+       subl    $2, %ecx
+       jns     L(no2)
+
+       movl    (%esi), %eax
+       movl    4(%esi), %ebx
+       movl    %eax, -4(%edx,%ecx,4)
+       movl    %ebx, (%edx,%ecx,4)
+
+       addl    $8, %esi
+       addl    $2, %ecx
+L(no2):
+
+       jnz     L(done)
+
+       movl    (%esi), %eax
+       movl    %eax, -4(%edx,%ecx,4)   C risk of cache bank clash here
+
+L(done):
+       popl    %esi
+       popl    %ebx
+
+       ret
+
+EPILOGUE()