Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / alpha / ev5 / com_n.asm
diff --git a/gmp/mpn/alpha/ev5/com_n.asm b/gmp/mpn/alpha/ev5/com_n.asm

new file mode 100644 (file)

index 0000000..979e711
--- /dev/null
+++ b/gmp/mpn/alpha/ev5/com_n.asm
@@ -0,0 +1,165 @@
+dnl  Alpha EV5 mpn_com_n -- mpn one's complement.
+
+dnl  Copyright 2003 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C      cycles/limb
+C EV4:    4.75
+C EV5:    2.0
+C EV6:    1.5
+
+
+C mp_limb_t mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
+C 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
+C will be 1.5+2/N c/l.
+C
+C 2 cycles of loop control are unavoidable, for pointer updates and the
+C taken branch bubble, but also since ldq cannot issue two cycles after stq
+C (and with a run of stqs that means neither of two cycles at the end of the
+C loop.
+C
+C The fbeq is forced into the second cycle of the loop using unops, since
+C the first time through it must wait for the cvtqt result.  Once that
+C result is ready (a 1 cycle stall) then both the branch and following loads
+C can issue together.
+C
+C The main loop handles an odd count of limbs, being two limbs loaded before
+C each size test, plus one pipelined around from the previous iteration (or
+C setup in the entry sequence).
+C
+C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
+C entry sequence, and an increment of the pointers.  For an odd size there's
+C no increment and the first store in the loop (r24) is a repeat of dst[0].
+C
+C Note that the load for r24 after the possible pointer increment is done
+C before the explicit store to dst[0], in case src==dst.
+
+
+ASM_START()
+
+FLOAT64(L(dat), 2.0)
+
+       ALIGN(16)
+
+PROLOGUE(mpn_com_n,gp)
+
+       C r16   dst
+       C r17   src
+       C r18   size
+
+       lda     r30, -16(r30)           C temporary stack space
+       lda     r7, -3(r18)             C size - 3
+
+       ldq     r20, 0(r17)             C src[0]
+       srl     r7, 1, r6               C (size-3)/2
+
+       stq     r6, 8(r30)              C (size-3)/2
+       and     r7, 1, r5               C 1 if size even
+
+       LEA(    r8, L(dat))
+       s8addq  r5, r17, r17            C skip src[0] if even
+
+       ornot   r31, r20, r20           C ~src[0]
+       unop
+
+       ldt     f0, 8(r30)              C (size-3)/2
+       ldq     r24, 0(r17)             C src[0 or 1]
+
+       stq     r20, 0(r16)             C dst[0]
+       s8addq  r5, r16, r19            C skip dst[0] if even
+
+       ldt     f1, 0(r8)               C data 2.0
+       lda     r30, 16(r30)            C restore stack
+       unop
+       cvtqt   f0, f0                  C (size-3)/2 as float
+
+       ornot   r31, r24, r24
+       blt     r7, L(done_1)           C if size<=2
+       unop
+       unop
+
+
+       C 16-byte alignment here
+L(top):
+       C r17   src, incrementing
+       C r19   dst, incrementing
+       C r24   dst[i] result, ready to store
+       C f0    (size-3)/2, decrementing
+       C f1    2.0
+
+       ldq     r20, 8(r17)             C src[i+1]
+       ldq     r21, 16(r17)            C src[i+2]
+       unop
+       unop
+
+       fbeq    f0, L(done_2)
+       unop
+       ldq     r22, 24(r17)            C src[i+3]
+       ldq     r23, 32(r17)            C src[i+4]
+
+       stq     r24, 0(r19)             C dst[i]
+       ornot   r31, r20, r20
+       subt    f0, f1, f0              C count -= 2
+       unop
+
+       stq     r20, 8(r19)             C dst[i+1]
+       ornot   r31, r21, r21
+       unop
+       unop
+
+       stq     r21, 16(r19)            C dst[i+2]
+       ornot   r31, r22, r22
+
+       stq     r22, 24(r19)            C dst[i+3]
+       ornot   r31, r23, r24
+
+       lda     r17, 32(r17)            C src += 4
+       lda     r19, 32(r19)            C dst += 4
+       unop
+       fbge    f0, L(top)
+
+
+L(done_1):
+       C r19   &dst[size-1]
+       C r24   result for dst[size-1]
+
+       stq     r24, 0(r19)             C dst[size-1]
+       ret     r31, (r26), 1
+
+
+L(done_2):
+       C r19   &dst[size-3]
+       C r20   src[size-2]
+       C r21   src[size-1]
+       C r24   result for dst[size-3]
+
+       stq     r24, 0(r19)             C dst[size-3]
+       ornot   r31, r20, r20
+
+       stq     r20, 8(r19)             C dst[size-2]
+       ornot   r31, r21, r21
+
+       stq     r21, 16(r19)            C dst[size-1]
+       ret     r31, (r26), 1
+
+EPILOGUE()
+ASM_END()