X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Fsparc32%2Fv9%2Fsqr_diagonal.asm;fp=gmp%2Fmpn%2Fsparc32%2Fv9%2Fsqr_diagonal.asm;h=e4a78c5de7a2be5de1fa32b71748b27dba613833;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git
diff --git a/gmp/mpn/sparc32/v9/sqr_diagonal.asm b/gmp/mpn/sparc32/v9/sqr_diagonal.asm
new file mode 100644
index 00000000..e4a78c5d
--- /dev/null
+++ b/gmp/mpn/sparc32/v9/sqr_diagonal.asm
@@ -0,0 +1,451 @@
+dnl SPARC v9 32-bit mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+
+C This code uses a very deep software pipeline, due to the need for moving data
+C forth and back between the integer registers and floating-point registers.
+C
+C A VIS variant of this code would make the pipeline less deep, since the
+C masking now done in the integer unit could take place in the floating-point
+C unit using the FAND instruction.  It would be possible to save several cycles
+C too.
+C
+C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
+C not much slower from the Ecache.  It would perhaps be possible to shave off
+C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
+C used instructions, since we have 10 memory operations per limb.  But a VIS
+C variant could run three cycles faster than the corresponding non-VIS code.
+
+C This is non-pipelined code showing the algorithm:
+C
+C .Loop:
+C	lduw	[up+0],%g4	C 00000000hhhhllll
+C	sllx	%g4,16,%g3	C 0000hhhhllll0000
+C	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+C	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+C	stx	%g2,[%fp+80]
+C	ldd	[%fp+80],%f0
+C	fitod	%f0,%f4		C hi16
+C	fitod	%f1,%f6		C lo16
+C	ld	[up+0],%f9
+C	fxtod	%f8,%f2
+C	fmuld	%f2,%f4,%f4
+C	fmuld	%f2,%f6,%f6
+C	fdtox	%f4,%f4
+C	fdtox	%f6,%f6
+C	std	%f4,[%fp-24]
+C	std	%f6,[%fp-16]
+C	ldx	[%fp-24],%g2
+C	ldx	[%fp-16],%g1
+C	sllx	%g2,16,%g2
+C	add	%g2,%g1,%g1
+C	stw	%g1,[rp+0]
+C	srlx	%g1,32,%l0
+C	stw	%l0,[rp+4]
+C	add	up,4,up
+C	subcc	n,1,n
+C	bne,pt	%icc,.Loop
+C	add	rp,8,rp
+
+define(`fanop',`fitod %f12,%f10') dnl A quasi nop running in the FA pipe
+
+ASM_START()
+
+	TEXT
+	ALIGN(4)
+.Lnoll:
+	.word	0		C zero; loaded into %f8 so the pair %f8:%f9 reads as the limb zero-extended to 64 bits (consumed by fxtod)
+
+C Reviewer notes (documentation only; code unchanged):
+C
+C   void mpn_sqr_diagonal (mp_ptr rp, mp_srcptr up, mp_size_t n)
+C
+C For each 32-bit limb up[i], the 64-bit square up[i]*up[i] is stored as
+C two 32-bit words at rp[2i] (low) and rp[2i+1] (high); rp advances 8
+C bytes and up advances 4 bytes per limb, exactly as in the reference
+C loop above.  The limb is split as hh*2^16 + ll; the FPU forms the two
+C exact double products hh*limb (called p16 in the comments) and ll*limb
+C (p0), and the integer unit recombines them as (p16 << 16) + p0.
+C
+C Scratch slots in the stack frame (set up by save %sp,-256,%sp):
+C   [%fp+72], [%fp+80]           8-byte slots where the masked limb pair is
+C                                stx'd so it can be ldd'd into %f0/%f1
+C   [%fp-24]/[%fp-16], [%fp-40]/[%fp-32]   fdtox result slots (p16/p0)
+C %l3..%l6 hold pointers to these slots; the software pipeline alternates
+C between the two slot sets on successive limbs, and the .L_grt_* entry
+C code below replicates the pipeline fill for each small n so the common
+C tail (.Ltail/.L4/.L3/.L2/.L1) can drain it.
+C %g5 holds the constant 0xffff0000 used by andn to clear the X bits.
+
+PROLOGUE(mpn_sqr_diagonal)
+	save	%sp,-256,%sp
+
+ifdef(`PIC',
+`.Lpc:	rd	%pc,%o7
+	ld	[%o7+.Lnoll-.Lpc],%f8',
+`	sethi	%hi(.Lnoll),%g1
+	ld	[%g1+%lo(.Lnoll)],%f8')
+
+	sethi	%hi(0xffff0000),%g5
+	add	%i1,-8,%i1
+
+	lduw	[%i1+8],%g4
+	add	%i1,4,%i1	C s1_ptr++
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	bne,pt	%icc,.L_grt_1
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+
+	add	%i1,4,%i1	C s1_ptr++
+	stx	%g2,[%fp+80]
+	ld	[%i1],%f9
+	ldd	[%fp+80],%f0
+	fxtod	%f8,%f2
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	fmuld	%f2,%f6,%f6
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	std	%f6,[%fp-16]
+
+	add	%fp, 80, %l3
+	add	%fp, -24, %l4
+	add	%fp, 72, %l5
+	b	.L1
+	add	%fp, -40, %l6
+
+.L_grt_1:			C here n >= 2
+	stx	%g2,[%fp+80]
+	lduw	[%i1+8],%g4
+	add	%i1,4,%i1	C s1_ptr++
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	bne,pt	%icc,.L_grt_2
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+
+	stx	%g2,[%fp+72]
+	ld	[%i1],%f9
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+80],%f0
+	fxtod	%f8,%f2
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	fmuld	%f2,%f6,%f6
+	fdtox	%f4,%f4
+
+	add	%fp, 72, %l3
+	add	%fp, -40, %l4
+	add	%fp, 80, %l5
+	b	.L2
+	add	%fp, -24, %l6
+
+.L_grt_2:			C here n >= 3
+	stx	%g2,[%fp+72]
+	lduw	[%i1+8],%g4
+	ld	[%i1],%f9
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+80],%f0
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	fxtod	%f8,%f2
+	bne,pt	%icc,.L_grt_3
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	add	%fp, 80, %l3
+	fmuld	%f2,%f6,%f6
+	add	%fp, -24, %l4
+	ldd	[%fp+80],%f0
+	add	%fp, 72, %l5
+	fdtox	%f4,%f4
+	b	.L3
+	add	%fp, -40, %l6
+
+.L_grt_3:			C here n >= 4
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	fdtox	%f6,%f6
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	bne,pt	%icc,.L_grt_4
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	add	%fp, 72, %l3
+	fmuld	%f2,%f4,%f4
+	add	%fp, -40, %l4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+80],%f0
+	add	%fp, 80, %l5
+	fdtox	%f4,%f4
+	b	.L4
+	add	%fp, -24, %l6
+
+.L_grt_4:			C here n >= 5: fall into the pipelined loop
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+80],%f0
+	fdtox	%f4,%f4
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	fdtox	%f6,%f6
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-40]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-32]
+	be,pn	%icc,.L5
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+
+	b,a	.Loop
+
+	.align	16
+C --- LOOP BEGIN
+.Loop:	nop
+	nop
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+C ---
+	nop
+	nop
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+C ---
+	nop
+	nop
+	ldx	[%fp-24],%g2	C p16
+	fanop
+C ---
+	nop
+	nop
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f4,%f4
+C ---
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+C ---
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+72],%f0
+	fanop
+C ---
+	srlx	%g1,32,%l0
+	nop
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+C ---
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	nop
+	stw	%l0,[%i0-4]
+	fdtox	%f6,%f6
+C ---
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+C ---
+	std	%f6,[%fp-16]
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+	be,pn	%icc,.Lend
+	fanop
+C --- LOOP MIDDLE
+	nop
+	nop
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+C ---
+	nop
+	nop
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+C ---
+	nop
+	nop
+	ldx	[%fp-40],%g2	C p16
+	fanop
+C ---
+	nop
+	nop
+	ldx	[%fp-32],%g1	C p0
+	fmuld	%f2,%f4,%f4
+C ---
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+C ---
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+80],%f0
+	fanop
+C ---
+	srlx	%g1,32,%l0
+	nop
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+C ---
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	nop
+	stw	%l0,[%i0-4]
+	fdtox	%f6,%f6
+C ---
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-40]
+	fxtod	%f8,%f2
+C ---
+	std	%f6,[%fp-32]
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+	bne,pt	%icc,.Loop
+	fanop
+C --- LOOP END
+
+C Tail: the two entries below load %l3..%l6 with the scratch-slot pointers
+C matching the pipeline phase at loop exit, then fall into the shared drain.
+.L5:	add	%fp, 80, %l3
+	add	%fp, -24, %l4
+	add	%fp, 72, %l5
+	b	.Ltail
+	add	%fp, -40, %l6
+
+.Lend:	add	%fp, 72, %l3
+	add	%fp, -40, %l4
+	add	%fp, 80, %l5
+	add	%fp, -24, %l6
+.Ltail:	stx	%g2,[%l3]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l4],%g2	C p16
+	ldx	[%l4+8],%g1	C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%l5],%f0
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L4:	fdtox	%f6,%f6	C drain stage; entered directly for n==4
+	std	%f4,[%l4]
+	fxtod	%f8,%f2
+	std	%f6,[%l4+8]
+
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l6],%g2	C p16
+	ldx	[%l6+8],%g1	C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	ldd	[%l3],%f0
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L3:	fdtox	%f6,%f6	C drain stage; entered directly for n==3
+	std	%f4,[%l6]
+	fxtod	%f8,%f2
+	std	%f6,[%l6+8]
+
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l4],%g2	C p16
+	ldx	[%l4+8],%g1	C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L2:	fdtox	%f6,%f6	C drain stage; entered directly for n==2
+	std	%f4,[%l4]
+	std	%f6,[%l4+8]
+
+	ldx	[%l6],%g2	C p16
+	ldx	[%l6+8],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	stw	%l0,[%i0-4]
+
+.L1:	ldx	[%l4],%g2	C p16; entered directly for n==1
+	ldx	[%l4+8],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	stw	%l0,[%i0-4]
+
+	ret
+	restore	%g0,%g0,%o0
+
+EPILOGUE(mpn_sqr_diagonal)