dnl  PowerPC-64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C POWER3/PPC630:       13-19
C POWER4/PPC970:       16
C POWER5:              16

C TODO
C  * Check if the n=1 code is really an improvement.  It probably isn't.
C  * Perhaps remove the L(norm) code; it is currently unreachable.
C  * Make more similar to mode1o.asm.

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')
define(`d',  `r6')

ASM_START()

EXTERN(binvert_limb_table)

PROLOGUE(mpn_divexact_1)
	addic.	n, n, -1
	ld	r12, 0(up)
	bne	cr0, L(2)
	divdu	r0, r12, d		C n = 1: a plain division suffices
	std	r0, 0(rp)
	blr

C If d is even, strip its trailing zero bits, keeping the shift count in r10.
L(2):	rldicl.	r0, d, 0, 63
	li	r10, 0
	bne	cr0, L(7)
	neg	r0, d
	and	r0, d, r0		C isolate lowest set bit of d
	cntlzd	r0, r0
	subfic	r0, r0, 63		C number of trailing zero bits
	rldicl	r10, r0, 0, 32
	srd	d, d, r0		C make d odd

C Fetch an 8-bit inverse of d from binvert_limb_table, then apply three
C Newton steps, inv = inv*(2 - d*inv), doubling the correct bits each time.
L(7):	mtctr	n
	LEA(	r5, binvert_limb_table)
	rldicl	r11, d, 63, 57		C table index = (d >> 1) & 0x7f
C	cmpdi	cr7, r0, 0
	lbzx	r0, r5, r11		C inverse, 8 bits
	mulld	r9, r0, r0
	sldi	r0, r0, 1
	mulld	r9, d, r9
	subf	r0, r9, r0		C inverse, 16 bits
	mulld	r5, r0, r0
	sldi	r0, r0, 1
	mulld	r5, d, r5
	subf	r0, r5, r0		C inverse, 32 bits
	mulld	r9, r0, r0
	sldi	r0, r0, 1
	mulld	r9, d, r9
	subf	r7, r9, r0		C r7 = 1/d mod 2^64
C	beq	cr7, L(norm)
	subfic	r8, r10, 64		C set carry as side effect
	li	r5, 0

C Main loop: combine adjacent limbs across the shift, subtract the previous
C high product with borrow, and multiply by the inverse for a quotient limb.
	ALIGN(16)
L(loop0):
	srd	r11, r12, r10
	ld	r12, 8(up)
	addi	up, up, 8
	sld	r0, r12, r8
	or	r11, r11, r0
	subfe	r9, r5, r11
	mulld	r0, r7, r9		C quotient limb
	std	r0, 0(rp)
	addi	rp, rp, 8
	mulhdu	r5, r0, d		C carry into next limb
	bdnz	L(loop0)

	srd	r0, r12, r10
	subfe	r0, r5, r0
	mulld	r0, r7, r0		C most significant quotient limb
	std	r0, 0(rp)
	blr

C Variant for odd d without shifting; currently unreachable (see TODO).
	ALIGN(16)
L(norm):
	mulld	r11, r12, r7
	std	r11, 0(rp)
	ALIGN(16)
L(loop1):
	mulhdu	r5, r11, d
	ld	r9, 8(up)
	addi	up, up, 8
	subfe	r5, r5, r9
	mulld	r11, r7, r5
	std	r11, 8(rp)
	addi	rp, rp, 8
	bdnz	L(loop1)
	blr
EPILOGUE()
ASM_END()
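
dnl  Reference note (stripped by m4, not part of the assembled output): the
dnl  routine above divides by multiplying with the inverse of d modulo 2^64,
dnl  the usual exact-division technique.  The commented-out C sketch below
dnl  models the odd-d case so the limb recurrence is easier to follow.  It is
dnl  only an illustration: the function and helper names are made up, the
dnl  inverse is seeded arithmetically instead of from binvert_limb_table, and
dnl  it assumes a compiler providing unsigned __int128.
dnl
dnl  #include <stdint.h>
dnl  #include <stddef.h>
dnl
dnl  /* High 64 bits of a 64x64-bit product (what mulhdu computes). */
dnl  static uint64_t mul_hi (uint64_t a, uint64_t b)
dnl  { return (uint64_t) (((unsigned __int128) a * b) >> 64); }
dnl
dnl  /* 1/d mod 2^64 for odd d; each Newton step inv *= 2 - d*inv doubles
dnl     the number of correct low bits. */
dnl  static uint64_t binvert (uint64_t d)
dnl  {
dnl    uint64_t inv = (3 * d) ^ 2;        /* at least 4 correct bits */
dnl    inv *= 2 - d * inv;                /*  8 bits */
dnl    inv *= 2 - d * inv;                /* 16 bits */
dnl    inv *= 2 - d * inv;                /* 32 bits */
dnl    inv *= 2 - d * inv;                /* 64 bits */
dnl    return inv;
dnl  }
dnl
dnl  /* qp[] = up[]/d for odd d, assuming the division is exact. */
dnl  static void divexact_ref (uint64_t *qp, const uint64_t *up,
dnl                            size_t n, uint64_t d)
dnl  {
dnl    uint64_t dinv = binvert (d), c = 0;
dnl    for (size_t i = 0; i < n; i++)
dnl      {
dnl        uint64_t borrow = up[i] < c;
dnl        uint64_t q = (up[i] - c) * dinv;  /* next quotient limb */
dnl        qp[i] = q;
dnl        c = mul_hi (q, d) + borrow;       /* propagate into next limb */
dnl      }
dnl  }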