dnl  AMD64 mpn_divexact_1 -- mpn by limb exact division.

dnl  Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		    cycles/limb
C K8,K9:	10
C K10:		10
C P4:		33
C P6-15 (Core2):	13.25
C P6-28 (Atom):	42

C A quick adoption of the 32-bit K7 code.
C INPUT PARAMETERS
C rp		rdi	destination (n limbs written)
C up		rsi	source (n limbs read)
C n		rdx	limb count (assumed >= 1)
C divisor	rcx	nonzero; caller guarantees it divides {up,n} exactly

C Method: strip trailing zero bits from the divisor (the source is shifted
C down by the same count with shrd), then multiply by the 64-bit modular
C inverse of the resulting odd divisor.  The inverse seed comes from the
C 8-bit binvert_limb_table and is lifted to 16, 32 and 64 bits with one
C Newton step each (inv = 2*inv - inv*inv*d).  Since the division is exact,
C q[i] = (u[i] - borrow) * inverse mod 2^64 at every step.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divexact_1)
	pushq	%rbx			C callee-saved, used for d and carry bit

	movq	%rcx, %rax		C rax = divisor
	movl	$0, %ecx		C shift count (0 for odd divisor)
	movq	%rdx, %r8		C r8 = n

	btl	$0, %eax
	jnc	L(evn)			C skip bsfq unless divisor is even

L(odd):	movq	%rax, %rbx		C rbx = odd divisor d
	shrl	%eax			C d/2
	andl	$127, %eax		C d/2, 7 bits: index into 128-entry table
ifdef(`PIC',`
	movq	binvert_limb_table@GOTPCREL(%rip), %rdx
',`
	movabsq	$binvert_limb_table, %rdx
')
	movzbl	(%rax,%rdx), %eax	C inv 8 bits
	movq	%rbx, %r11		C d without twos
	leal	(%rax,%rax), %edx	C 2*inv
	imull	%eax, %eax		C inv*inv
	imull	%ebx, %eax		C inv*inv*d
	subl	%eax, %edx		C inv = 2*inv - inv*inv*d, 16 bits
	leal	(%rdx,%rdx), %eax	C 2*inv
	imull	%edx, %edx		C inv*inv
	imull	%ebx, %edx		C inv*inv*d
	subl	%edx, %eax		C inv = 2*inv - inv*inv*d, 32 bits
	leaq	(%rax,%rax), %rdx	C 2*inv
	imulq	%rax, %rax		C inv*inv
	imulq	%rbx, %rax		C inv*inv*d
	subq	%rax, %rdx		C inv = 2*inv - inv*inv*d, 64 bits

	leaq	(%rsi,%r8,8), %rsi	C up end
	leaq	-8(%rdi,%r8,8), %rdi	C rp end
	negq	%r8			C -n: index counts up toward zero

	movq	%rdx, %r10		C final inverse
	movq	(%rsi,%r8,8), %rax	C up[0]
	incq	%r8
	jz	L(one)			C n = 1: single low limb, no loop

	movq	(%rsi,%r8,8), %rdx	C up[1]
	shrdq	%cl, %rdx, %rax		C low limb of {up,n} >> cnt
	xorl	%ebx, %ebx		C clear carry bit
	jmp	L(entry)

L(evn):	bsfq	%rax, %rcx		C cl = count of trailing zeros in divisor
	shrq	%cl, %rax		C make divisor odd
	jmp	L(odd)

	ALIGN(8)
L(top):
	C rax	q
	C rbx	carry bit, 0 or 1
	C rcx	shift
	C rdx
	C rsi	up end
	C rdi	rp end
	C r8	counter, limbs, negative
	mulq	%r11			C carry limb in rdx
	movq	-8(%rsi,%r8,8), %rax
	movq	(%rsi,%r8,8), %r9
	shrdq	%cl, %r9, %rax		C next shifted source limb
	nop				C for scheduling/alignment; keep as is
	subq	%rbx, %rax		C apply carry bit
	setc	%bl			C remember borrow from carry bit
	subq	%rdx, %rax		C apply carry limb
	adcq	$0, %rbx		C accumulate borrow from carry limb
L(entry):
	imulq	%r10, %rax		C q = limb * inverse mod 2^64
	movq	%rax, (%rdi,%r8,8)
	incq	%r8
	jnz	L(top)

	C Final limb: plain shrq, no higher limb to shift in.
	mulq	%r11			C carry limb in rdx
	movq	-8(%rsi), %rax		C up high limb
	shrq	%cl, %rax
	subq	%rbx, %rax		C apply carry bit
	subq	%rdx, %rax		C apply carry limb
	imulq	%r10, %rax
	movq	%rax, (%rdi)
	popq	%rbx
	ret

L(one):	shrq	%cl, %rax		C n = 1: shift, multiply by inverse, done
	imulq	%r10, %rax
	movq	%rax, (%rdi)
	popq	%rbx
	ret

EPILOGUE()