--- /dev/null
+dnl AMD64 mpn_addlsh_n and mpn_rsblsh_n. R = V2^k +- U.
+dnl ("rsb" means reversed subtract, name mandated by mpn_sublsh1_n which
+dnl subtracts the shifted operand from the unshifted operand.)
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C K8,K9: 3.25 (mpn_lshift + mpn_add_n costs about 4.1 c/l)
+C K10: 3.25 (mpn_lshift + mpn_add_n costs about 4.1 c/l)
+C P4: 14
+C P6-15: 4
+
+C This was written quickly and not optimized at all. Surely one could get
+C closer to 3 c/l or perhaps even under 3 c/l. Ideas:
+C 1) Use indexing to save the 3 LEA
+C 2) Write reasonable feed-in code
+C 3) Be more clever about register usage
+C 4) Unroll more, handling CL negation, carry save/restore cost much now
+C 5) Reschedule
+
+C INPUT PARAMETERS
+C Standard SysV AMD64 argument registers for
+C mp_limb_t func (mp_ptr rp, mp_srcptr up, mp_srcptr vp, mp_size_t n, unsigned cnt)
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`vp', `%rdx')
+define(`n', `%rcx')
+C Fixed: the original had define(`cnt' `%r8') -- the missing comma made m4
+C see a single argument, so the cnt macro was never actually defined.
+define(`cnt', `%r8')
+
+C Select the carry-propagating instruction and the exported symbol name from
+C the OPERATION_* symbol this file is assembled under (via MULFUNC_PROLOGUE):
+C   addlsh_n:  rp = (vp << cnt) + up   -> ADDSUBC is adc
+C   rsblsh_n:  rp = (vp << cnt) - up   -> ADDSUBC is sbb
+ifdef(`OPERATION_addlsh_n',`
+	define(ADDSUBC, `adc')
+	define(func, mpn_addlsh_n)
+')
+ifdef(`OPERATION_rsblsh_n',`
+	define(ADDSUBC, `sbb')
+	define(func, mpn_rsblsh_n)
+')
+
+MULFUNC_PROLOGUE(mpn_addlsh_n mpn_rsblsh_n)
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(func)
+C In:  rp, up, vp, n as defined above; cnt in r8 (1 <= cnt <= 63 assumed --
+C      TODO confirm cnt = 0/64 is excluded by callers, shl/shr masks cl).
+C Out: rax = the high bits shifted out of the last vp limb, with the final
+C      carry (addlsh) or borrow (rsblsh) folded in.
+C Register roles: rax = remaining limb count, cl = shift count,
+C      r15 = bits carried between limbs, ebx = saved carry flag (0 or -1).
+
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	push	%rbx
+
+	mov	n, %rax
+	xor	%ebx, %ebx		C clear carry save register
+	mov	%r8d, %ecx		C shift count to cl (clobbers n reg; count is in rax)
+	xor	%r15d, %r15d		C limb carry
+
+	mov	%eax, %r11d
+	and	$3, %r11d		C r11d = n mod 4
+	je	L(4)
+	sub	$1, %r11d		C loop below runs n mod 4 times (exit on borrow)
+
+C Feed-in loop: one limb per iteration until the count is 0 mod 4.
+L(oopette):
+	mov	0(vp), %r8
+	mov	%r8, %r12		C keep unshifted copy for the shift-out bits
+	shl	%cl, %r8
+	or	%r15, %r8		C merge bits shifted out of the previous limb
+	neg	%cl			C cl = -cnt; shr masks to 64-cnt
+	mov	%r12, %r15
+	shr	%cl, %r15		C r15 = carry bits for the next limb
+	neg	%cl			C restore cl = cnt
+	add	%ebx, %ebx		C restore carry flag from save register
+	ADDSUBC	0(up), %r8
+	mov	%r8, 0(rp)
+	sbb	%ebx, %ebx		C save carry flag (ebx = -CF)
+	lea	8(up), up
+	lea	8(vp), vp
+	lea	8(rp), rp
+	sub	$1, %r11d
+	jnc	L(oopette)
+
+L(4):
+	sub	$4, %rax
+	jc	L(end)
+
+C Main loop: four limbs per iteration.
+L(oop):
+	mov	0(vp), %r8
+	mov	%r8, %r12		C r12-r14 keep unshifted copies
+	mov	8(vp), %r9
+	mov	%r9, %r13
+	mov	16(vp), %r10
+	mov	%r10, %r14
+	mov	24(vp), %r11
+
+	shl	%cl, %r8
+	shl	%cl, %r9
+	shl	%cl, %r10
+	or	%r15, %r8		C merge carry bits from previous iteration
+	mov	%r11, %r15
+	shl	%cl, %r11
+
+	neg	%cl			C cl = 64 - cnt for the right shifts
+
+	shr	%cl, %r12
+	shr	%cl, %r13
+	shr	%cl, %r14
+	shr	%cl, %r15		C used next loop
+
+	or	%r12, %r9
+	or	%r13, %r10
+	or	%r14, %r11
+
+	neg	%cl			C restore cl = cnt
+
+	add	%ebx, %ebx		C restore carry flag
+
+	ADDSUBC	0(up), %r8
+	ADDSUBC	8(up), %r9
+	ADDSUBC	16(up), %r10
+	ADDSUBC	24(up), %r11
+
+	mov	%r8, 0(rp)
+	mov	%r9, 8(rp)
+	mov	%r10, 16(rp)
+	mov	%r11, 24(rp)
+
+	sbb	%ebx, %ebx		C save carry flag
+
+	lea	32(up), up
+	lea	32(vp), vp
+	lea	32(rp), rp
+
+	sub	$4, %rax
+	jnc	L(oop)
+L(end):
+	add	%ebx, %ebx		C restore carry flag
+C Fixed: was an unconditional adc.  For rsblsh_n the saved flag is a borrow
+C and must be SUBTRACTED from the shift-out limb, so use the operation-
+C dependent ADDSUBC (adc for addlsh_n, sbb for rsblsh_n) like every other
+C carry-propagation site in this file.
+	ADDSUBC	$0, %r15
+	mov	%r15, %rax		C return shift-out limb +- final carry
+	pop	%rbx
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+
+	ret
+EPILOGUE()