X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Falpha%2Fev5%2Fcom_n.asm;fp=gmp%2Fmpn%2Falpha%2Fev5%2Fcom_n.asm;h=979e711eb8d5e2cc1b91a93ec400a07708295fa5;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/alpha/ev5/com_n.asm b/gmp/mpn/alpha/ev5/com_n.asm
new file mode 100644
index 00000000..979e711e
--- /dev/null
+++ b/gmp/mpn/alpha/ev5/com_n.asm
@@ -0,0 +1,165 @@
+dnl Alpha EV5 mpn_com_n -- mpn one's complement.
+
+dnl Copyright 2003 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C      cycles/limb
+C EV4:     4.75
+C EV5:     2.0
+C EV6:     1.5
+
+
+C mp_limb_t mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
+C of 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
+C will be 1.5+2/N c/l.
+C
+C 2 cycles of loop control are unavoidable, for pointer updates and the
+C taken branch bubble, but also since ldq cannot issue two cycles after stq
+C (and with a run of stqs, that means no ldq can issue in either of the two
+C cycles at the end of the loop).
+C
+C The fbeq is forced into the second cycle of the loop using unops, since
+C the first time through it must wait for the cvtqt result.  Once that
+C result is ready (a 1 cycle stall) then both the branch and following loads
+C can issue together.
+C
+C The main loop handles an odd count of limbs: two limbs are loaded before
+C each size test, plus one pipelined around from the previous iteration (or
+C set up in the entry sequence).
+C
+C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
+C entry sequence, and an increment of the pointers.  For an odd size there's
+C no increment and the first store in the loop (r24) is a repeat of dst[0].
+C
+C Note that the load for r24 after the possible pointer increment is done
+C before the explicit store to dst[0], in case src==dst.  (A C sketch of this scheme follows the listing.)
+
+
+ASM_START()
+
+FLOAT64(L(dat), 2.0)
+
+	ALIGN(16)
+
+PROLOGUE(mpn_com_n,gp)
+
+	C r16	dst
+	C r17	src
+	C r18	size
+
+	lda	r30, -16(r30)		C temporary stack space
+	lda	r7, -3(r18)		C size - 3
+
+	ldq	r20, 0(r17)		C src[0]
+	srl	r7, 1, r6		C (size-3)/2
+
+	stq	r6, 8(r30)		C (size-3)/2
+	and	r7, 1, r5		C 1 if size even
+
+	LEA(	r8, L(dat))
+	s8addq	r5, r17, r17		C skip src[0] if even
+
+	ornot	r31, r20, r20		C ~src[0]
+	unop
+
+	ldt	f0, 8(r30)		C (size-3)/2
+	ldq	r24, 0(r17)		C src[0 or 1]
+
+	stq	r20, 0(r16)		C dst[0]
+	s8addq	r5, r16, r19		C skip dst[0] if even
+
+	ldt	f1, 0(r8)		C data 2.0
+	lda	r30, 16(r30)		C restore stack
+	unop
+	cvtqt	f0, f0			C (size-3)/2 as float
+
+	ornot	r31, r24, r24
+	blt	r7, L(done_1)		C if size<=2
+	unop
+	unop
+
+
+	C 16-byte alignment here
+L(top):
+	C r17	src, incrementing
+	C r19	dst, incrementing
+	C r24	dst[i] result, ready to store
+	C f0	(size-3)/2, decrementing
+	C f1	2.0
+
+	ldq	r20, 8(r17)		C src[i+1]
+	ldq	r21, 16(r17)		C src[i+2]
+	unop
+	unop
+
+	fbeq	f0, L(done_2)
+	unop
+	ldq	r22, 24(r17)		C src[i+3]
+	ldq	r23, 32(r17)		C src[i+4]
+
+	stq	r24, 0(r19)		C dst[i]
+	ornot	r31, r20, r20
+	subt	f0, f1, f0		C count -= 2
+	unop
+
+	stq	r20, 8(r19)		C dst[i+1]
+	ornot	r31, r21, r21
+	unop
+	unop
+
+	stq	r21, 16(r19)		C dst[i+2]
+	ornot	r31, r22, r22
+
+	stq	r22, 24(r19)		C dst[i+3]
+	ornot	r31, r23, r24
+
+	lda	r17, 32(r17)		C src += 4
+	lda	r19, 32(r19)		C dst += 4
+	unop
+	fbge	f0, L(top)
+
+
+L(done_1):
+	C r19	&dst[size-1]
+	C r24	result for dst[size-1]
+
+	stq	r24, 0(r19)		C dst[size-1]
+	ret	r31, (r26), 1
+
+
+L(done_2):
+	C r19	&dst[size-3]
+	C r20	src[size-2]
+	C r21	src[size-1]
+	C r24	result for dst[size-3]
+
+	stq	r24, 0(r19)		C dst[size-3]
+	ornot	r31, r20, r20
+
+	stq	r20, 8(r19)		C dst[size-2]
+	ornot	r31, r21, r21
+
+	stq	r21, 16(r19)		C dst[size-1]
+	ret	r31, (r26), 1
+
+EPILOGUE()
+ASM_END()
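
The header comments above describe the scheme in prose; what follows is a minimal C sketch of the same idea, for illustration only.  The typedefs stand in for GMP's own (a 64-bit limb is assumed, as on Alpha), the helper names ref_com_n and unrolled_com_n are invented for this example, and the sketch uses a void return even though the prototype above declares mp_limb_t.  It mirrors the even/odd entry handling, the four stores per loop pass with one complemented limb carried ("pipelined") into the next pass, and the load-before-store ordering that makes src==dst safe; it is not the code GMP builds.

/* Illustrative stand-ins for GMP's types (64-bit limbs assumed). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t mp_limb_t;
typedef mp_limb_t *mp_ptr;
typedef const mp_limb_t *mp_srcptr;
typedef long mp_size_t;

/* Reference behaviour: dst[i] = ~src[i] for 0 <= i < size (size >= 1). */
static void
ref_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  for (mp_size_t i = 0; i < size; i++)
    dst[i] = ~src[i];
}

/* The same operation shaped like the assembly: dst[0] is always written in
   the entry code, the pointers step past limb 0 only for an even size (so
   the loop is left with an odd count), and each loop pass stores four limbs
   while carrying one complemented limb into the next pass.  src[skip] is
   read before dst[0] is written, so dst == src also works.  */
static void
unrolled_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  mp_size_t skip = (size & 1) ^ 1;      /* 1 if size is even, like r5 */
  mp_limb_t carried = ~src[skip];       /* like r24 in the entry code */

  dst[0] = ~src[0];
  if (size <= 2)                        /* the blt to L(done_1) */
    {
      dst[skip] = carried;
      return;
    }

  src += skip;
  dst += skip;
  mp_size_t count = (size - 3) / 2;     /* the float counter f0 */

  for (;;)
    {
      mp_limb_t a = src[1], b = src[2];
      if (count == 0)                   /* L(done_2): three limbs left */
        {
          dst[0] = carried;
          dst[1] = ~a;
          dst[2] = ~b;
          return;
        }
      mp_limb_t c = src[3], d = src[4];
      dst[0] = carried;                 /* four stores per pass */
      dst[1] = ~a;
      dst[2] = ~b;
      dst[3] = ~c;
      carried = ~d;                     /* pipelined into the next pass */
      count -= 2;
      src += 4;
      dst += 4;
      if (count < 0)                    /* L(done_1): one limb left */
        {
          dst[0] = carried;
          return;
        }
    }
}

int
main (void)
{
  mp_limb_t src[9], a[9], b[9];
  for (mp_size_t size = 1; size <= 9; size++)
    {
      for (mp_size_t i = 0; i < size; i++)
        src[i] = 0x0123456789abcdefULL * (i + 1);
      ref_com_n (a, src, size);
      unrolled_com_n (b, src, size);
      for (mp_size_t i = 0; i < size; i++)
        assert (a[i] == b[i] && a[i] == ~src[i]);
    }
  puts ("ok");
  return 0;
}

Built with an ordinary C compiler (for example cc -O2 sketch.c, where sketch.c is just a placeholder name), the checker runs sizes 1 through 9, which exercises the size<=2 exit as well as both loop exits: the three-limb L(done_2) tail and the one-limb L(done_1) tail.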