Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / alpha / divrem_2.asm
diff --git a/gmp/mpn/alpha/divrem_2.asm b/gmp/mpn/alpha/divrem_2.asm

new file mode 100644 (file)

index 0000000..b68468b
--- /dev/null
+++ b/gmp/mpn/alpha/divrem_2.asm
@@ -0,0 +1,167 @@
+dnl  Alpha mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
+
+dnl  Copyright 2007, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C              norm    frac
+C ev4
+C ev5          70      70
+C ev6          29      29
+
+C TODO
+C  * Perhaps inline mpn_invert_limb, that would allow us to not save/restore
+C    any registers (thus save ~10 cycles per call).
+C  * Use negated d1 and/or d0 to speed carry propagation.  Might save a cycle
+C    or two.
+C  * Check cluster delays (for ev6).  We very likely could save some cycles.
+C  * Use branch-free code for computing di.
+C  * CAVEAT: We rely on r19 not being clobbered by mpn_invert_limb call.
+
+C INPUT PARAMETERS
+define(`qp',           `r16')
+define(`fn',           `r17')
+define(`up_param',     `r18')
+define(`un_param',     `r19')
+define(`dp',           `r20')
+
+ASM_START()
+PROLOGUE(mpn_divrem_2)
+       ldgp    r29, 0(r27)
+       lda     r30, -80(r30)
+       stq     r26, 0(r30)
+       stq     r9, 8(r30)
+       stq     r10, 16(r30)
+       stq     r11, 24(r30)
+       stq     r12, 32(r30)
+       stq     r13, 40(r30)
+C      stq     r14, 48(r30)
+       stq     r15, 56(r30)
+       .prologue       1
+       stq     r16, 64(r30)
+       bis     r31, r17, r15
+       s8addq  r19, r18, r13
+       lda     r13, -24(r13)
+       ldq     r12, 8(r20)
+       ldq     r10, 0(r20)
+       ldq     r11, 16(r13)
+       ldq     r9, 8(r13)
+
+       bis     r31, r31, r3            C most_significant_q_limb = 0
+       cmpult  r11, r12, r1
+       bne     r1, L(L8)
+       cmpule  r11, r12, r1
+       cmpult  r9, r10, r2
+       and     r1, r2, r1
+       bne     r1, L(L8)
+       subq    r11, r12, r11
+       subq    r11, r2, r11
+       subq    r9, r10, r9
+       lda     r3, 1(r31)              C most_significant_q_limb = 1
+L(L8): stq     r3, 72(r30)
+
+       addq    r15, r19, r19
+       lda     r19, -3(r19)
+       blt     r19, L(L10)
+       bis     r31, r12, r16
+       jsr     r26, mpn_invert_limb
+       ldgp    r29, 0(r26)
+       mulq    r0, r12, r4             C t0 = LO(di * d1)
+       umulh   r0, r10, r2             C s1 = HI(di * d0)
+       addq    r4, r10, r4             C t0 += d0
+       cmpule  r10, r4, r7             C (t0 < d0)
+       addq    r4, r2, r4              C t0 += s1
+       cmpult  r4, r2, r1
+       subq    r1, r7, r7              C t1 (-1, 0, or 1)
+       blt     r7, L(L42)
+L(L22):
+       lda     r0, -1(r0)              C di--
+       cmpult  r4, r12, r1             C cy for: t0 -= d1 (below)
+       subq    r7, r1, r7              C t1 -= cy
+       subq    r4, r12, r4             C t0 -= d1
+       bge     r7, L(L22)
+L(L42):
+       ldq     r16, 64(r30)
+       s8addq  r19, r16, r16
+       ALIGN(16)
+L(loop):
+       mulq    r11, r0, r5             C q0 (early)
+       umulh   r11, r0, r6             C q  (early)
+       addq    r5, r9, r8              C q0 += n1
+       addq    r6, r11, r6             C q  += n2
+       cmpult  r8, r5, r1              C cy for: q0 += n1
+       addq    r6, r1, r6              C q  += cy
+       unop
+       mulq    r12, r6, r1             C LO(d1 * q)
+       umulh   r10, r6, r7             C t1 = HI(d0 * q)
+       subq    r9, r1, r9              C n1 -= LO(d1 * q)
+       mulq    r10, r6, r4             C t0 = LO(d0 * q)
+       unop
+       cmple   r15, r19, r5            C condition and n0...
+       beq     r5, L(L31)
+       ldq     r5, 0(r13)
+       lda     r13, -8(r13)
+L(L31):        subq    r9, r12, r9             C n1 -= d1
+       cmpult  r5, r10, r1             C
+       subq    r9, r1, r9              C
+       subq    r5, r10, r5             C n0 -= d0
+       subq    r9, r7, r9              C n1 -= t0
+       cmpult  r5, r4, r1              C
+       subq    r9, r1, r2              C
+       subq    r5, r4, r5              C n0 -= t1
+       cmpult  r2, r8, r1              C (n1 < q0)
+       addq    r6, r1, r6              C q += cond
+       lda     r1, -1(r1)              C -(n1 >= q0)
+       and     r1, r10, r4             C
+       addq    r5, r4, r9              C n0 += mask & d0
+       and     r1, r12, r1             C
+       cmpult  r9, r5, r11             C cy for: n0 += mask & d0
+       addq    r2, r1, r1              C n1 += mask & d1
+       addq    r1, r11, r11            C n1 += cy
+       cmpult  r11, r12, r1            C
+       beq     r1, L(fix)              C
+L(bck):        stq     r6, 0(r16)
+       lda     r16, -8(r16)
+       lda     r19, -1(r19)
+       bge     r19, L(loop)
+
+L(L10):        stq     r9, 8(r13)
+       stq     r11, 16(r13)
+       ldq     r0, 72(r30)
+       ldq     r26, 0(r30)
+       ldq     r9, 8(r30)
+       ldq     r10, 16(r30)
+       ldq     r11, 24(r30)
+       ldq     r12, 32(r30)
+       ldq     r13, 40(r30)
+C      ldq     r14, 48(r30)
+       ldq     r15, 56(r30)
+       lda     r30, 80(r30)
+       ret     r31, (r26), 1
+
+L(fix):        cmpule  r11, r12, r1
+       cmpult  r9, r10, r2
+       and     r1, r2, r1
+       bne     r1, L(bck)
+       subq    r11, r12, r11
+       subq    r11, r2, r11
+       subq    r9, r10, r9
+       lda     r6, 1(r6)
+       br      L(bck)
+EPILOGUE()
+ASM_END()