Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / x86 / k6 / k62mmx / copyd.asm
diff --git a/gmp/mpn/x86/k6/k62mmx/copyd.asm b/gmp/mpn/x86/k6/k62mmx/copyd.asm

new file mode 100644 (file)

index 0000000..227ed78
--- /dev/null
+++ b/gmp/mpn/x86/k6/k62mmx/copyd.asm
@@ -0,0 +1,107 @@
+dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.
+
+dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C K6-2: 1.0 cycles/limb
+
+
+C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
+C cycle startup time, which amounts for instance to a 2x speedup at 15
+C limbs.
+C
+C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
+C processing one limb separately to make it aligned.  This and a final odd
+C limb are handled in a branch-free fashion, ending up re-copying if the
+C special case isn't needed.
+C
+C Alternatives:
+C
+C There used to be a big unrolled version of this, running at 0.56 c/l if
+C the destination was aligned, but that seemed rather excessive for the
+C relative importance of copyd.
+C
+C If the destination alignment is ignored and just left to run at 1.17 c/l
+C some code size and a fixed few cycles can be saved.  Considering how few
+C uses copyd finds perhaps that should be favoured.  The current code has
+C the attraction of being no slower than a basic rep movsl though.
+
+defframe(PARAM_SIZE,12)
+defframe(PARAM_SRC, 8)
+defframe(PARAM_DST, 4)
+
+dnl  re-using parameter space
+define(SAVE_EBX,`PARAM_SIZE')
+
+       TEXT
+       ALIGN(16)
+
+PROLOGUE(mpn_copyd)
+deflit(`FRAME',0)
+
+       movl    PARAM_SIZE, %ecx
+       movl    %ebx, SAVE_EBX
+
+       movl    PARAM_SRC, %eax
+       movl    PARAM_DST, %edx
+
+       subl    $1, %ecx                C better code alignment than decl
+       jb      L(zero)
+
+       jz      L(one_more)
+       leal    4(%edx,%ecx,4), %ebx
+
+Zdisp( movd,   0,(%eax,%ecx,4), %mm0)  C high limb
+Zdisp( movd,   %mm0, 0,(%edx,%ecx,4))  C Zdisp for good code alignment
+
+       cmpl    $1, %ecx
+       je      L(one_more)
+
+       shrl    $2, %ebx
+       andl    $1, %ebx                C 1 if dst[size-2] unaligned
+
+       subl    %ebx, %ecx
+       nop                             C code alignment
+
+L(top):
+       C eax   src
+       C ebx
+       C ecx   counter
+       C edx   dst
+
+       movq    -4(%eax,%ecx,4), %mm0
+       subl    $2, %ecx
+
+       movq    %mm0, 4(%edx,%ecx,4)
+       ja      L(top)
+
+
+L(one_more):
+       movd    (%eax), %mm0
+       movd    %mm0, (%edx)
+
+       movl    SAVE_EBX, %ebx
+       emms_or_femms
+L(zero):
+       ret
+
+EPILOGUE()