--- /dev/null
+dnl PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.
+
+dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C norm unorm frac
+C POWER3/PPC630 16-34 16-34 ~11
+C POWER4/PPC970 29 19
+C POWER5 29 29 ~20
+
+C INPUT PARAMETERS
+C qp = r3
+C fn = r4
+C up = r5
+C un = r6
+C d = r7
+
+C We use a not very predictable branch in the frac code, therefore the cycle
+C count wobbles somewhat. With the alternative branch-free code, things run
+C considerably slower on POWER4/PPC970 and POWER5.
+
+C Add preinv entry point.
+
+
+ASM_START()
+
+EXTERN_FUNC(mpn_invert_limb)
+
+PROLOGUE(mpn_divrem_1)
+
+ mfcr r12
+ add. r10, r6, r4
+ std r25, -56(r1)
+ mr r25, r4
+ mflr r0
+ std r26, -48(r1)
+ mr r26, r5
+ std r28, -32(r1)
+ mr r28, r6
+ std r29, -24(r1)
+ mr r29, r3
+ li r3, 0
+ std r30, -16(r1)
+ mr r30, r7
+ std r31, -8(r1)
+ li r31, 0
+ std r27, -40(r1)
+ std r0, 16(r1)
+ stw r12, 8(r1)
+ stdu r1, -176(r1)
+ beq- cr0, L(1)
+ cmpdi cr7, r7, 0
+ sldi r0, r10, 3
+ add r11, r0, r29
+ addi r29, r11, -8
+ blt- cr7, L(162)
+ cmpdi cr4, r6, 0
+ beq+ cr4, L(71)
+L(163):
+ sldi r9, r6, 3
+ add r9, r9, r5
+ ld r7, -8(r9)
+ cmpld cr7, r7, r30
+ bge- cr7, L(71)
+ cmpdi cr7, r10, 1
+ li r0, 0
+ mr r31, r7
+ std r0, -8(r11)
+ addi r29, r29, -8
+ mr r3, r7
+ beq- cr7, L(1)
+ addi r28, r6, -1
+ cmpdi cr4, r28, 0
+L(71):
+ cntlzd r27, r30
+ sld r30, r30, r27
+ sld r31, r31, r27
+ mr r3, r30
+ CALL( mpn_invert_limb)
+ nop
+ beq- cr4, L(110)
+ sldi r9, r28, 3
+ addic. r6, r28, -2
+ add r9, r9, r26
+ subfic r5, r27, 64
+ ld r8, -8(r9)
+ srd r0, r8, r5
+ or r31, r31, r0
+ sld r7, r8, r27
+ blt- cr0, L(154)
+ addi r28, r28, -1
+ mtctr r28
+ sldi r6, r6, 3
+ ALIGN(16)
+L(uloop):
+ addi r11, r31, 1
+ ldx r8, r26, r6
+ mulld r0, r31, r3
+ mulhdu r10, r31, r3
+ addi r6, r6, -8
+ srd r9, r8, r5
+ or r9, r7, r9
+ addc r0, r0, r9
+ adde r10, r10, r11
+ mulld r31, r10, r30
+ subf r31, r31, r9
+ subfc r0, r0, r31 C r >= ql
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r10, r7, r10 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
+ cmpld cr7, r31, r30
+ bge- cr7, L(164)
+L(123):
+ std r10, 0(r29)
+ addi r29, r29, -8
+ sld r7, r8, r27
+ bdnz L(uloop)
+L(154):
+ addi r11, r31, 1
+ nop
+ mulld r0, r31, r3
+ mulhdu r8, r31, r3
+ addc r0, r0, r7
+ adde r8, r8, r11
+ mulld r31, r8, r30
+ subf r31, r31, r7
+ subfc r0, r0, r31 C r >= ql
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r8, r7, r8 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
+ cmpld cr7, r31, r30
+ bge- cr7, L(165)
+L(134):
+ std r8, 0(r29)
+ addi r29, r29, -8
+L(110):
+ addic. r0, r25, -1
+ blt- cr0, L(156)
+ mtctr r25
+ neg r9, r30
+ ALIGN(16)
+L(ufloop):
+ addi r11, r31, 1
+ nop
+ mulld r7, r3, r31
+ mulhdu r10, r3, r31
+ add r10, r10, r11
+ mulld r31, r9, r10
+ifelse(0,1,`
+ subfc r0, r7, r31
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r10, r7, r10 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
+',`
+ cmpld cr7, r31, r7
+ blt cr7, L(29)
+ add r31, r30, r31
+ addi r10, r10, -1
+L(29):
+')
+ std r10, 0(r29)
+ addi r29, r29, -8
+ bdnz L(ufloop)
+L(156):
+ srd r3, r31, r27
+L(1):
+ addi r1, r1, 176
+ ld r0, 16(r1)
+ lwz r12, 8(r1)
+ mtlr r0
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ mtcrf 8, r12
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+L(162):
+ cmpdi cr7, r6, 0
+ beq- cr7, L(8)
+ sldi r9, r6, 3
+ addi r29, r29, -8
+ add r9, r9, r5
+ addi r28, r6, -1
+ ld r31, -8(r9)
+ subfc r9, r7, r31
+ li r9, 0
+ adde r9, r9, r9
+ neg r0, r9
+ std r9, -8(r11)
+ and r0, r0, r7
+ subf r31, r0, r31
+L(8):
+L(10):
+ mr r3, r30
+ CALL( mpn_invert_limb)
+ nop
+ addic. r6, r28, -1
+ blt- cr0, L(150)
+ mtctr r28
+ sldi r6, r6, 3
+ ALIGN(16)
+L(nloop):
+ addi r11, r31, 1
+ ldx r8, r26, r6
+ mulld r0, r31, r3
+ addi r6, r6, -8
+ mulhdu r10, r31, r3
+ addc r7, r0, r8
+ adde r10, r10, r11
+ mulld r31, r10, r30
+ subf r31, r31, r8 C r = nl - qh * d
+ subfc r0, r7, r31 C r >= ql
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r10, r7, r10 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
+ cmpld cr7, r31, r30
+ bge- cr7, L(167)
+L(51):
+ std r10, 0(r29)
+ addi r29, r29, -8
+ bdnz L(nloop)
+
+L(150):
+ addic. r9, r25, -1
+ blt- cr0, L(152)
+ mtctr r25
+ neg r9, r30
+ ALIGN(16)
+L(nfloop):
+ addi r11, r31, 1
+ nop
+ mulld r7, r3, r31
+ mulhdu r10, r3, r31
+ add r10, r10, r11
+ mulld r31, r9, r10
+ifelse(0,1,`
+ subfc r0, r7, r31
+ subfe r0, r0, r0 C r0 = -(r >= ql)
+ not r7, r0
+ add r10, r7, r10 C qh -= (r >= ql)
+ andc r0, r30, r0
+ add r31, r31, r0
+',`
+ cmpld cr7, r31, r7
+ blt cr7, L(28)
+ add r31, r30, r31
+ addi r10, r10, -1
+L(28):
+')
+ std r10, 0(r29)
+ addi r29, r29, -8
+ bdnz L(nfloop)
+L(152):
+ addi r1, r1, 176
+ mr r3, r31
+ ld r0, 16(r1)
+ lwz r12, 8(r1)
+ mtlr r0
+ ld r25, -56(r1)
+ ld r26, -48(r1)
+ mtcrf 8, r12
+ ld r27, -40(r1)
+ ld r28, -32(r1)
+ ld r29, -24(r1)
+ ld r30, -16(r1)
+ ld r31, -8(r1)
+ blr
+L(164):
+ subf r31, r30, r31
+ addi r10, r10, 1
+ b L(123)
+L(167):
+ subf r31, r30, r31
+ addi r10, r10, 1
+ b L(51)
+L(165):
+ subf r31, r30, r31
+ addi r8, r8, 1
+ b L(134)
+EPILOGUE()