--- /dev/null
+dnl AMD64 mpn_divexact_1 -- mpn by limb exact division.
+
+dnl Copyright 2001, 2002, 2004, 2005, 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C cycles/limb
+C K8,K9: 10
+C K10: 10
+C P4: 33
+C P6-15 (Core2): 13.25
+C P6-28 (Atom): 42
+
+C A quick adaptation of the 32-bit K7 code.
+
+
+C INPUT PARAMETERS
+C rp rdi
+C up rsi
+C n rdx
+C divisor rcx
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+C mpn_divexact_1 (rp, up, n, divisor)
+C
+C Writes n quotient limbs to rp, computed from the n limbs at up and the
+C single-limb divisor. No divide instruction is used: each quotient limb is
+C obtained by multiplying by the inverse of the odd part of the divisor
+C modulo 2^64. Low zero bits of the divisor are stripped first and the
+C source limbs are shifted right by the same count as they are read.
+C
+C up[0] is loaded before the limb count is tested, so n >= 1 is presumably
+C required -- TODO confirm against the callers.
+C
+C Only rbx is saved and restored. rax, rcx, rdx, rsi, rdi and r8-r11 are
+C overwritten.
+PROLOGUE(mpn_divexact_1)
+ pushq %rbx C save rbx, restored by popq before each ret
+
+ movq %rcx, %rax C rax = divisor
+ movl $0, %ecx C shift count
+ movq %rdx, %r8 C r8 = n
+
+ btl $0, %eax C CF = bit 0 of divisor
+ jnc L(evn) C skip bsfq unless divisor is even
+
+ C Reached with rax odd and rcx = number of low zero bits removed.
+L(odd): movq %rax, %rbx C rbx = d, the odd part of the divisor
+ shrl %eax
+ andl $127, %eax C d/2, 7 bits
+
+ C Load the address of binvert_limb_table into rdx, through the GOT when
+ C PIC is defined and as an absolute 64-bit immediate otherwise.
+ifdef(`PIC',`
+ movq binvert_limb_table@GOTPCREL(%rip), %rdx
+',`
+ movabsq $binvert_limb_table, %rdx
+')
+
+ movzbl (%rax,%rdx), %eax C inv 8 bits
+
+ movq %rbx, %r11 C d without twos
+
+ C Three rounds of inv = 2*inv - inv*inv*d follow. Each round doubles the
+ C number of valid low bits of the inverse: 8 to 16, 16 to 32, 32 to 64.
+ C The first two rounds use 32-bit arithmetic, the last one 64-bit.
+ leal (%rax,%rax), %edx C 2*inv
+ imull %eax, %eax C inv*inv
+ imull %ebx, %eax C inv*inv*d
+ subl %eax, %edx C inv = 2*inv - inv*inv*d, 16 bits
+
+ leal (%rdx,%rdx), %eax C 2*inv
+ imull %edx, %edx C inv*inv
+ imull %ebx, %edx C inv*inv*d
+ subl %edx, %eax C inv = 2*inv - inv*inv*d, 32 bits
+
+ leaq (%rax,%rax), %rdx C 2*inv
+ imulq %rax, %rax C inv*inv
+ imulq %rbx, %rax C inv*inv*d
+ subq %rax, %rdx C inv = 2*inv - inv*inv*d, 64 bits
+
+ C Point rsi one limb past the end of the source and rdi at the last
+ C quotient limb, then index both with a negative counter that runs up
+ C towards zero.
+ leaq (%rsi,%r8,8), %rsi C up end
+ leaq -8(%rdi,%r8,8), %rdi C rp end
+ negq %r8 C -n
+
+ movq %rdx, %r10 C final inverse
+ movq (%rsi,%r8,8), %rax C up[0]
+
+ incq %r8 C r8 = 1-n
+ jz L(one) C n == 1, single limb case
+
+ movq (%rsi,%r8,8), %rdx C up[1]
+
+ shrdq %cl, %rdx, %rax C rax = up[0] >> shift, top bits from up[1]
+
+ xorl %ebx, %ebx C rbx = 0, no carry into the first limb
+ jmp L(entry)
+
+ C Even divisor: count and remove its low zero bits, then rejoin.
+L(evn): bsfq %rax, %rcx C rcx = index of lowest set bit
+ shrq %cl, %rax C rax = divisor with low zero bits removed
+ jmp L(odd)
+
+ ALIGN(8)
+L(top):
+ C rax q
+ C rbx carry bit, 0 or 1
+ C rcx shift
+ C rdx
+ C rsi up end
+ C rdi rp end
+ C r8 counter, limbs, negative
+ C r9 scratch, next source limb
+ C r10 inverse of d
+ C r11 d
+
+ mulq %r11 C carry limb in rdx
+
+ movq -8(%rsi,%r8,8), %rax C current source limb
+ movq (%rsi,%r8,8), %r9 C next source limb
+
+ shrdq %cl, %r9, %rax C shifted source limb, top bits from r9
+ nop C NOTE(review): presumably alignment or scheduling padding -- confirm
+
+ subq %rbx, %rax C apply carry bit
+ setc %bl C bl = borrow out of the carry bit subtraction
+
+ subq %rdx, %rax C apply carry limb
+ adcq $0, %rbx C add the borrow out of the carry limb subtraction
+
+L(entry):
+ imulq %r10, %rax C q = limb * inverse, low 64 bits
+
+ movq %rax, (%rdi,%r8,8) C store quotient limb
+ incq %r8
+ jnz L(top) C loop until only the high limb remains
+
+
+ C Last limb: there is no following source limb, so a plain shrq is used
+ C instead of shrdq, and the outgoing carry is not computed.
+ mulq %r11 C carry limb in rdx
+
+ movq -8(%rsi), %rax C up high limb
+ shrq %cl, %rax
+
+ subq %rbx, %rax C apply carry bit
+
+ subq %rdx, %rax C apply carry limb
+
+ imulq %r10, %rax
+
+ movq %rax, (%rdi) C store rp[n-1]
+
+ popq %rbx
+ ret
+
+
+ C n == 1: rax holds up[0] and rdi already points at rp[0].
+L(one):
+ shrq %cl, %rax
+
+ imulq %r10, %rax
+
+ movq %rax, (%rdi)
+
+ popq %rbx
+ ret
+
+EPILOGUE()