X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Falpha%2Fev6%2Fnails%2Fsubmul_1.asm;fp=gmp%2Fmpn%2Falpha%2Fev6%2Fnails%2Fsubmul_1.asm;h=4242517a4a52671512850e4a37fe23a0b9b56ff9;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/alpha/ev6/nails/submul_1.asm b/gmp/mpn/alpha/ev6/nails/submul_1.asm
new file mode 100644
index 00000000..4242517a
--- /dev/null
+++ b/gmp/mpn/alpha/ev6/nails/submul_1.asm
@@ -0,0 +1,385 @@
+dnl  Alpha ev6 nails mpn_submul_1.
+
+dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     4
+
+C TODO
+C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
+C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C    umulh.
+C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
+C    and would work since the loop structure is really regular.
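+
+C The operation performed is {rp,n} <- {rp,n} - {up,n} * vl0, returning the
+C borrow out of the top limb, under GMP's nail representation: each 64-bit
+C limb keeps its NUMB_BITS significant bits low and its top NAIL_BITS zero.
+C vl0 is pre-shifted left by NAIL_BITS, so mulq followed by a srl of
+C NAIL_BITS recovers a product's low numb while umulh yields its high numb
+C directly; sra of each accumulator propagates the borrow between limbs.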
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl  This declaration is munged by configure
+NAILS_SUPPORT(2-63)
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+        sll     vl0, NAIL_BITS, vl0
+        lda     numb_mask, -1(r31)
+        srl     numb_mask, NAIL_BITS, numb_mask
+
+        and     n, 3, r25
+        cmpeq   r25, 1, r21
+        bne     r21, L(1m4)
+        cmpeq   r25, 2, r21
+        bne     r21, L(2m4)
+        beq     r25, L(0m4)
+
+L(3m4): ldq     ul3, 0(up)
+        lda     n, -4(n)
+        ldq     ul0, 8(up)
+        mulq    vl0, ul3, m3a
+        umulh   vl0, ul3, m3b
+        ldq     ul1, 16(up)
+        lda     up, 24(up)
+        lda     rp, -8(rp)
+        mulq    vl0, ul0, m0a
+        umulh   vl0, ul0, m0b
+        bge     n, L(ge3)
+
+        mulq    vl0, ul1, m1a
+        umulh   vl0, ul1, m1b
+        ldq     rl3, 8(rp)
+        srl     m3a,NAIL_BITS, t0
+        addq    t0, r31, acc1
+        subq    rl3, acc1, acc1
+        ldq     rl0, 16(rp)
+        srl     m0a,NAIL_BITS, t0
+        addq    t0, m3b, acc0
+        sra     acc1,NUMB_BITS, t1
+        br      r31, L(ta3)
+
+L(ge3): ldq     ul2, 0(up)
+        mulq    vl0, ul1, m1a
+        umulh   vl0, ul1, m1b
+        ldq     rl3, 8(rp)
+        srl     m3a,NAIL_BITS, t0
+        ldq     ul3, 8(up)
+        lda     n, -4(n)
+        mulq    vl0, ul2, m2a
+        addq    t0, r31, acc1
+        umulh   vl0, ul2, m2b
+        subq    rl3, acc1, acc1
+        ldq     rl0, 16(rp)
+        srl     m0a,NAIL_BITS, t0
+        ldq     ul0, 16(up)
+        mulq    vl0, ul3, m3a
+        addq    t0, m3b, acc0
+        sra     acc1,NUMB_BITS, t1
+        br      r31, L(el3)
+
+L(0m4): lda     n, -8(n)
+        ldq     ul2, 0(up)
+        ldq     ul3, 8(up)
+        mulq    vl0, ul2, m2a
+        umulh   vl0, ul2, m2b
+        ldq     ul0, 16(up)
+        mulq    vl0, ul3, m3a
+        umulh   vl0, ul3, m3b
+        ldq     ul1, 24(up)
+        lda     up, 32(up)
+        mulq    vl0, ul0, m0a
+        umulh   vl0, ul0, m0b
+        bge     n, L(ge4)
+
+        ldq     rl2, 0(rp)
+        srl     m2a,NAIL_BITS, t0
+        mulq    vl0, ul1, m1a
+        addq    t0, r31, acc0
+        umulh   vl0, ul1, m1b
+        subq    rl2, acc0, acc0
+        ldq     rl3, 8(rp)
+        srl     m3a,NAIL_BITS, t0
+        addq    t0, m2b, acc1
+        sra     acc0,NUMB_BITS, t1
+        br      r31, L(ta4)
+
+L(ge4): ldq     rl2, 0(rp)
+        srl     m2a,NAIL_BITS, t0
+        ldq     ul2, 0(up)
+        mulq    vl0, ul1, m1a
+        addq    t0, r31, acc0
+        umulh   vl0, ul1, m1b
+        subq    rl2, acc0, acc0
+        ldq     rl3, 8(rp)
+        srl     m3a,NAIL_BITS, t0
+        ldq     ul3, 8(up)
+        lda     n, -4(n)
+        mulq    vl0, ul2, m2a
+        addq    t0, m2b, acc1
+        sra     acc0,NUMB_BITS, t1
+        br      r31, L(el0)
+
+L(2m4): lda     n, -4(n)
+        ldq     ul0, 0(up)
+        ldq     ul1, 8(up)
+        lda     up, 16(up)
+        lda     rp, -16(rp)
+        mulq    vl0, ul0, m0a
+        umulh   vl0, ul0, m0b
+        bge     n, L(ge2)
+
+        mulq    vl0, ul1, m1a
+        umulh   vl0, ul1, m1b
+        ldq     rl0, 16(rp)
+        srl     m0a,NAIL_BITS, t0
+        addq    t0, r31, acc0
+        subq    rl0, acc0, acc0
+        ldq     rl1, 24(rp)
+        srl     m1a,NAIL_BITS, t0
+        addq    t0, m0b, acc1
+        sra     acc0,NUMB_BITS, t1
+        br      r31, L(ta2)
+
+L(ge2): ldq     ul2, 0(up)
+        mulq    vl0, ul1, m1a
+        umulh   vl0, ul1, m1b
+        ldq     ul3, 8(up)
+        lda     n, -4(n)
+        mulq    vl0, ul2, m2a
+        umulh   vl0, ul2, m2b
+        ldq     rl0, 16(rp)
+        srl     m0a,NAIL_BITS, t0
+        ldq     ul0, 16(up)
+        mulq    vl0, ul3, m3a
+        addq    t0, r31, acc0
+        umulh   vl0, ul3, m3b
+        subq    rl0, acc0, acc0
+        ldq     rl1, 24(rp)
+        srl     m1a,NAIL_BITS, t0
+        ldq     ul1, 24(up)
+        lda     up, 32(up)
+        lda     rp, 32(rp)
+        mulq    vl0, ul0, m0a
+        addq    t0, m0b, acc1
+        sra     acc0,NUMB_BITS, t1
+        bge     n, L(el2)
+
+        br      r31, L(ta6)
+
+L(1m4): lda     n, -4(n)
+        ldq     ul1, 0(up)
+        lda     up, 8(up)
+        lda     rp, -24(rp)
+        bge     n, L(ge1)
+
+        mulq    vl0, ul1, m1a
+        umulh   vl0, ul1, m1b
+        ldq     rl1, 24(rp)
+        srl     m1a,NAIL_BITS, t0
+        subq    rl1, t0, acc1
+        and     acc1,numb_mask, r28
+        sra     acc1,NUMB_BITS, t1
+        stq     r28, 24(rp)
+        subq    m1b, t1, r0
+        ret     r31, (r26), 1
+
+L(ge1): ldq     ul2, 0(up)
+        mulq    vl0, ul1, m1a
+        umulh   vl0, ul1, m1b
+        ldq     ul3, 8(up)
+        lda     n, -4(n)
+        mulq    vl0, ul2, m2a
+        umulh   vl0, ul2, m2b
+        ldq     ul0, 16(up)
+        mulq    vl0, ul3, m3a
+        umulh   vl0, ul3, m3b
+        ldq     rl1, 24(rp)
+        srl     m1a,NAIL_BITS, t0
+        ldq     ul1, 24(up)
+        lda     up, 32(up)
+        lda     rp, 32(rp)
+        mulq    vl0, ul0, m0a
+        addq    t0, r31, acc1
+        umulh   vl0, ul0, m0b
+        subq    rl1, acc1, acc1
+        ldq     rl2, 0(rp)
+        srl     m2a,NAIL_BITS, t0
+        mulq    vl0, ul1, m1a
+        addq    t0, m1b, acc0
+        sra     acc1,NUMB_BITS, t1
+        blt     n, L(ta5)
+
+L(ge5): ldq     ul2, 0(up)
+        br      r31, L(el1)
+
+        ALIGN(16)
+L(top): mulq    vl0, ul0, m0a           C U1
+        addq    t0, m0b, acc1           C L0
+        sra     acc0,NUMB_BITS, t1      C U0
+        stq     r28, -24(rp)            C L1
+C
+L(el2): umulh   vl0, ul0, m0b           C U1
+        and     acc0,numb_mask, r28     C L0
+        subq    rl1, acc1, acc1         C U0
+        ldq     rl2, 0(rp)              C L1
+C
+        unop                            C U1
+        addq    t1, acc1, acc1          C L0
+        srl     m2a,NAIL_BITS, t0       C U0
+        ldq     ul2, 0(up)              C L1
+C
+        mulq    vl0, ul1, m1a           C U1
+        addq    t0, m1b, acc0           C L0
+        sra     acc1,NUMB_BITS, t1      C U0
+        stq     r28, -16(rp)            C L1
+C
+L(el1): umulh   vl0, ul1, m1b           C U1
+        and     acc1,numb_mask, r28     C L0
+        subq    rl2, acc0, acc0         C U0
+        ldq     rl3, 8(rp)              C L1
+C
+        lda     n, -4(n)                C L1
+        addq    t1, acc0, acc0          C L0
+        srl     m3a,NAIL_BITS, t0       C U0
+        ldq     ul3, 8(up)              C L1
+C
+        mulq    vl0, ul2, m2a           C U1
+        addq    t0, m2b, acc1           C L0
+        sra     acc0,NUMB_BITS, t1      C U0
+        stq     r28, -8(rp)             C L1
+C
+L(el0): umulh   vl0, ul2, m2b           C U1
+        and     acc0,numb_mask, r28     C L0
+        subq    rl3, acc1, acc1         C U0
+        ldq     rl0, 16(rp)             C L1
+C
+        unop                            C U1
+        addq    t1, acc1, acc1          C L0
+        srl     m0a,NAIL_BITS, t0       C U0
+        ldq     ul0, 16(up)             C L1
+C
+        mulq    vl0, ul3, m3a           C U1
+        addq    t0, m3b, acc0           C L0
+        sra     acc1,NUMB_BITS, t1      C U0
+        stq     r28, 0(rp)              C L1
+C
+L(el3): umulh   vl0, ul3, m3b           C U1
+        and     acc1,numb_mask, r28     C L0
+        subq    rl0, acc0, acc0         C U0
+        ldq     rl1, 24(rp)             C L1
+C
+        unop                            C U1
+        addq    t1, acc0, acc0          C L0
+        srl     m1a,NAIL_BITS, t0       C U0
+        ldq     ul1, 24(up)             C L1
+C
+        lda     up, 32(up)              C L0
+        unop                            C U1
+        lda     rp, 32(rp)              C L1
+        bge     n, L(top)               C U0
+
+L(end): mulq    vl0, ul0, m0a
+        addq    t0, m0b, acc1
+        sra     acc0,NUMB_BITS, t1
+        stq     r28, -24(rp)
+L(ta6): umulh   vl0, ul0, m0b
+        and     acc0,numb_mask, r28
+        subq    rl1, acc1, acc1
+        ldq     rl2, 0(rp)
+        addq    t1, acc1, acc1
+        srl     m2a,NAIL_BITS, t0
+        mulq    vl0, ul1, m1a
+        addq    t0, m1b, acc0
+        sra     acc1,NUMB_BITS, t1
+        stq     r28, -16(rp)
+L(ta5): umulh   vl0, ul1, m1b
+        and     acc1,numb_mask, r28
+        subq    rl2, acc0, acc0
+        ldq     rl3, 8(rp)
+        addq    t1, acc0, acc0
+        srl     m3a,NAIL_BITS, t0
+        addq    t0, m2b, acc1
+        sra     acc0,NUMB_BITS, t1
+        stq     r28, -8(rp)
+        unop
+        ALIGN(16)
+L(ta4): and     acc0,numb_mask, r28
+        subq    rl3, acc1, acc1
+        ldq     rl0, 16(rp)
+        addq    t1, acc1, acc1
+        srl     m0a,NAIL_BITS, t0
+        addq    t0, m3b, acc0
+        sra     acc1,NUMB_BITS, t1
+        stq     r28, 0(rp)
+        unop
+        ALIGN(16)
+L(ta3): and     acc1,numb_mask, r28
+        subq    rl0, acc0, acc0
+        ldq     rl1, 24(rp)
+        addq    t1, acc0, acc0
+        srl     m1a,NAIL_BITS, t0
+        addq    t0, m0b, acc1
+        sra     acc0,NUMB_BITS, t1
+        stq     r28, 8(rp)
+        unop
+        ALIGN(16)
+L(ta2): and     acc0,numb_mask, r28
+        subq    rl1, acc1, acc1
+        addq    t1, acc1, acc1
+        sra     acc1,NUMB_BITS, t1
+        stq     r28, 16(rp)
+        and     acc1,numb_mask, r28
+        subq    m1b, t1, r0
+        stq     r28, 24(rp)
+        ret     r31, (r26), 1
+EPILOGUE()
+ASM_END()
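
For reference, the C model below sketches the arithmetic the routine above implements: {rp,n} <- {rp,n} - {up,n} * vl0 with each result limb masked to NUMB_BITS and the final borrow returned. It is not part of the patch. It is a minimal sketch that assumes 64-bit limbs, a nail count of 2 (any value in the NAILS_SUPPORT(2-63) range works the same way), GCC/Clang's unsigned __int128, and arithmetic right shift of negative signed values (mirroring the sra instruction); the names NAIL_BITS_C, NUMB_BITS_C, NUMB_MASK_C, and ref_submul_1 are illustrative stand-ins, not GMP's configure-generated constants or its API.

#include <stdint.h>

/* Illustrative stand-ins for GMP's configure-time nail parameters. */
#define NAIL_BITS_C 2
#define NUMB_BITS_C (64 - NAIL_BITS_C)
#define NUMB_MASK_C ((~(uint64_t) 0) >> NAIL_BITS_C)

/* {rp,n} -= {up,n} * v, every limb keeping its top NAIL_BITS_C bits zero;
   returns the borrow out of the top limb.  Each step mirrors the assembly:
   the double-width product is split at the numb boundary (mulq+srl and
   umulh there), the difference is masked to numb size (and/stq), and an
   arithmetic right shift propagates the borrow (sra). */
static uint64_t
ref_submul_1 (uint64_t *rp, const uint64_t *up, long n, uint64_t v)
{
  uint64_t carry = 0;                     /* amount owed to the next limb */
  for (long i = 0; i < n; i++)
    {
      unsigned __int128 p = (unsigned __int128) up[i] * v;
      uint64_t plo = (uint64_t) p & NUMB_MASK_C;     /* low numb of product */
      uint64_t phi = (uint64_t) (p >> NUMB_BITS_C);  /* high numb of product */
      int64_t d = (int64_t) rp[i] - (int64_t) (plo + carry);
      rp[i] = (uint64_t) d & NUMB_MASK_C;
      /* d >> NUMB_BITS_C is 0, -1, or -2 (assumes arithmetic shift);
         subtracting it folds the borrow into the next carry. */
      carry = phi - (uint64_t) (d >> NUMB_BITS_C);
    }
  return carry;
}

With NAIL_BITS_C = 2 this should reproduce, limb for limb, the masked stores and the final subq m1b, t1, r0 return value of the assembly; the nails headroom is what lets the code add a product's low numb to the previous product's high numb without overflow before the subtraction.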