dnl  SPARC v9 32-bit mpn_sqr_diagonal.

dnl  Copyright 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2

C This code uses a very deep software pipeline, due to the need for moving data
C back and forth between the integer and floating-point registers.
C
C A VIS variant of this code would make the pipeline less deep, since the
C masking now done in the integer unit could take place in the floating-point
C unit using the FAND instruction.  It would also be possible to save several
C cycles.
C
C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
C not much slower from the Ecache.  It would perhaps be possible to shave off
C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
C instructions used, since we have 10 memory operations per limb.  But a VIS
C variant could run three cycles faster than the corresponding non-VIS code.

C This is non-pipelined code showing the algorithm:
C
C .Loop:
C	lduw	[up+0],%g4		C 00000000hhhhllll
C	sllx	%g4,16,%g3		C 0000hhhhllll0000
C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
C	stx	%g2,[%fp+80]
C	ldd	[%fp+80],%f0
C	fitod	%f0,%f4			C hi16
C	fitod	%f1,%f6			C lo16
C	ld	[up+0],%f9
C	fxtod	%f8,%f2
C	fmuld	%f2,%f4,%f4
C	fmuld	%f2,%f6,%f6
C	fdtox	%f4,%f4
C	fdtox	%f6,%f6
C	std	%f4,[%fp-24]
C	std	%f6,[%fp-16]
C	ldx	[%fp-24],%g2
C	ldx	[%fp-16],%g1
C	sllx	%g2,16,%g2
C	add	%g2,%g1,%g1
C	stw	%g1,[rp+0]
C	srlx	%g1,32,%l0
C	stw	%l0,[rp+4]
C	add	up,4,up
C	subcc	n,1,n
C	bne,pt	%icc,.Loop
C	add	rp,8,rp
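
C In C terms, the routine stores the double-limb square of each limb:
C rp[2i] and rp[2i+1] receive the low and high 32 bits of up[i]^2.  The
C limb is split at bit 16 so that each partial product is at most 48 bits
C and hence exact in a double's 53-bit mantissa, which is what allows the
C multiplies to run in the floating-point unit.  A minimal C sketch of one
C iteration (illustrative only; assumes 32-bit limbs and <stdint.h> types,
C and the variable names are local to this comment):
C
C	uint32_t u   = up[i];
C	uint32_t hi  = u >> 16;			/* hi16 (fitod %f0) */
C	uint32_t lo  = u & 0xffff;		/* lo16 (fitod %f1) */
C	uint64_t p16 = (uint64_t) u * hi;	/* <= 48 bits, exact in a double */
C	uint64_t p0  = (uint64_t) u * lo;	/* <= 48 bits, exact in a double */
C	uint64_t sq  = (p16 << 16) + p0;	/* == (uint64_t) u * u */
C	rp[2*i]   = (uint32_t) sq;		/* low result limb */
C	rp[2*i+1] = (uint32_t) (sq >> 32);	/* high result limb */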

define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe

ASM_START()

	TEXT
	ALIGN(4)
.Lnoll:
	.word	0

PROLOGUE(mpn_sqr_diagonal)
	save	%sp,-256,%sp

ifdef(`PIC',
`.Lpc:	rd	%pc,%o7
	ld	[%o7+.Lnoll-.Lpc],%f8',
`	sethi	%hi(.Lnoll),%g1
	ld	[%g1+%lo(.Lnoll)],%f8')

	sethi	%hi(0xffff0000),%g5
	add	%i1,-8,%i1

	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_1
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	add	%i1,4,%i1		C s1_ptr++
	stx	%g2,[%fp+80]
	ld	[%i1],%f9
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	std	%f6,[%fp-16]
	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.L1
	add	%fp, -40, %l6

.L_grt_1:
	stx	%g2,[%fp+80]
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_2
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+72]
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	b	.L2
	add	%fp, -24, %l6

.L_grt_2:
	stx	%g2,[%fp+72]
	lduw	[%i1+8],%g4
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	fxtod	%f8,%f2
	bne,pt	%icc,.L_grt_3
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	add	%fp, 80, %l3
	fmuld	%f2,%f6,%f6
	add	%fp, -24, %l4
	ldd	[%fp+80],%f0
	add	%fp, 72, %l5
	fdtox	%f4,%f4
	b	.L3
	add	%fp, -40, %l6

.L_grt_3:
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	bne,pt	%icc,.L_grt_4
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	fitod	%f1,%f6
	add	%fp, 72, %l3
	fmuld	%f2,%f4,%f4
	add	%fp, -40, %l4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	add	%fp, 80, %l5
	fdtox	%f4,%f4
	b	.L4
	add	%fp, -24, %l6

.L_grt_4:
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
	std	%f6,[%fp-32]
	be,pn	%icc,.L5
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	b,a	.Loop

	.align	16
C ---  LOOP BEGIN
.Loop:	nop
	nop
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-24],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-16],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-16]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	be,pn	%icc,.Lend
	fanop
C ---  LOOP MIDDLE
	nop
	nop
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-40],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-32],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-32]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	bne,pt	%icc,.Loop
	fanop
C ---  LOOP END

.L5:	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.Ltail
	add	%fp, -40, %l6

.Lend:	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	add	%fp, -24, %l6

.Ltail:	stx	%g2,[%l3]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%l5],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]

.L4:	fdtox	%f6,%f6
	std	%f4,[%l4]
	fxtod	%f8,%f2
	std	%f6,[%l4+8]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	ldd	[%l3],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]

.L3:	fdtox	%f6,%f6
	std	%f4,[%l6]
	fxtod	%f8,%f2
	std	%f6,[%l6+8]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]

.L2:	fdtox	%f6,%f6
	std	%f4,[%l4]
	std	%f6,[%l4+8]
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

.L1:	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

	ret
	restore	%g0,%g0,%o0
EPILOGUE(mpn_sqr_diagonal)
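
C For reference, a hypothetical C caller of this internal routine
C (illustrative values, assuming 32-bit limbs):
C
C	mp_limb_t up[2] = { 3, 0x10000 };
C	mp_limb_t rp[4];
C	mpn_sqr_diagonal (rp, up, 2);
C	/* now rp[] = { 9, 0, 0, 1 }, i.e. 3^2 = 9 and (2^16)^2 = 2^32 */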