]> oss.titaniummirror.com Git - msp430-gcc.git/blobdiff - gmp/mpn/alpha/bdiv_dbm1c.asm
Imported gcc-4.4.3
[msp430-gcc.git] / gmp / mpn / alpha / bdiv_dbm1c.asm
diff --git a/gmp/mpn/alpha/bdiv_dbm1c.asm b/gmp/mpn/alpha/bdiv_dbm1c.asm
new file mode 100644 (file)
index 0000000..e5f11db
--- /dev/null
@@ -0,0 +1,271 @@
+dnl  Alpha mpn_bdiv_dbm1c.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:     42
+C EV5:     18
+C EV6:      3
+
+C TODO
+C  * Try less unrolling, 2-way should give the same performance.
+C  * Optimize feed-in and wind-down code, for speed, and perhaps further for
+C    code size.
+C  * This runs optimally given the algorithm; r8 is on a 3-operation
+C    recurrence path.  We have not tried very hard to find a better
+C    algorithm.  Perhaps it would be a good task for the GNU superoptimizer.
+
+C INPUT PARAMETERS
+C  These names record which register carries each incoming argument.  The
+C  routine body refers to the registers directly rather than through these
+C  names.  The incoming carry is r20: it is the register copied into the
+C  accumulator r8 on entry, while r19 is used only as the multiplier.
+define(`rp', `r16')
+define(`up', `r17')
+define(`n',  `r18')
+define(`bd', `r19')
+define(`cy', `r20')
+
+
+ASM_START()
+C mpn_bdiv_dbm1c
+C
+C  Register use, as read from the code below:
+C    r16      destination pointer; one limb is stored per source limb
+C    r17      source pointer
+C    r18      limb count on entry, then a countdown stepped by 4
+C    r19      multiplier applied to every source limb
+C    r20      initial accumulator value on entry; reused afterwards as a
+C             high-product register
+C    r8       running accumulator
+C    r28      count mod 4 during dispatch; afterwards the pending amount
+C             (high product + borrow) still to be subtracted from r8
+C    r24, r0-r3   source limbs
+C    r4-r7    low product halves (mulq)
+C    r20-r23  high product halves (umulh)
+C    r0       return value, the final accumulator
+C
+C  For each source limb u the code performs, in this order:
+C    lo = low half of u*r19, hi = high half of u*r19
+C    borrow = (r8 < lo);  r8 = r8 - lo;  store r8;  r8 = r8 - (hi + borrow)
+C
+C  The loop is 4-way unrolled and the multiplies are issued ahead of the
+C  subtracts that use them, so each value of the count mod 4 has its own
+C  feed-in code which enters the loop, or the wind-down code, at the
+C  matching stage.
+C
+C  NOTE(review): the count is presumably always >= 1.  A count of 0 would
+C  take the L(b0) path, which loads and stores four limbs.
+PROLOGUE(mpn_bdiv_dbm1c)
+C r8 = initial accumulator
+       mov     r20, r8
+
+C Load limb 0, then dispatch on the count mod 4 (r28).  r18 becomes count-4.
+       ldq     r24, 0(r17)
+       and     r18, 3, r28
+       lda     r18, -4(r18)
+       beq     r28, L(b0)
+       cmpeq   r28, 1, r21
+       bne     r21, L(b1)
+       cmpeq   r28, 2, r21
+       bne     r21, L(b2)
+
+
+C Count mod 4 == 3.
+L(b3): ldq     r2, 8(r17)
+       ldq     r3, 16(r17)
+       bgt     r18, L(gt3)
+
+C Exactly 3 limbs: form all three products and finish in the wind-down code.
+C r16 is biased by -32 so the 32/40/48 offsets used from L(cj3) onwards
+C address destination limbs 0..2.
+       mulq    r24, r19, r5    C U1
+       umulh   r24, r19, r21   C U1
+       mulq    r2, r19, r6     C U1
+       umulh   r2, r19, r22    C U1
+       mulq    r3, r19, r7     C U1
+       umulh   r3, r19, r23    C U1
+       lda     r16, -32(r16)
+       br      L(cj3)
+
+C 7 or more limbs: products of limbs 0..2 go to r5/r21, r6/r22, r7/r23 while
+C limbs 3..6 are loaded into r0..r3.  r17 advances past the 7 loaded limbs.
+C The first store of this path is at 0(r16), so r16 needs no bias.
+L(gt3):        ldq     r0, 24(r17)
+       mulq    r24, r19, r5    C U1
+       umulh   r24, r19, r21   C U1
+       ldq     r1, 32(r17)
+       mulq    r2, r19, r6     C U1
+       umulh   r2, r19, r22    C U1
+       ldq     r2, 40(r17)
+       mulq    r3, r19, r7     C U1
+       umulh   r3, r19, r23    C U1
+       ldq     r3, 48(r17)
+       lda     r18, -4(r18)
+       lda     r17, 56(r17)
+       mulq    r0, r19, r4     C U1
+C Enter the loop if r18 is still positive, otherwise wind down from L(cj7).
+       bgt     r18, L(L3)
+
+       br      L(cj7)
+
+
+C Count mod 4 == 2.
+L(b2): ldq     r3, 8(r17)
+       bgt     r18, L(gt2)
+
+C Exactly 2 limbs: r16 is biased by -40 so the 40/48 offsets used from
+C L(cj2) onwards address destination limbs 0..1.
+       mulq    r24, r19, r6    C U1
+       umulh   r24, r19, r22   C U1
+       mulq    r3, r19, r7     C U1
+       umulh   r3, r19, r23    C U1
+       lda     r16, -40(r16)
+       br      L(cj2)
+
+C 6 or more limbs: products of limbs 0..1 go to r6/r22 and r7/r23, the
+C product of limb 2 to r4/r20, and limbs 3..5 are left in r1..r3.  r16 is
+C biased by -8 because the first store of this path is at 8(r16).
+L(gt2):        ldq     r0, 16(r17)
+       ldq     r1, 24(r17)
+       mulq    r24, r19, r6    C U1
+       umulh   r24, r19, r22   C U1
+       ldq     r2, 32(r17)
+       mulq    r3, r19, r7     C U1
+       umulh   r3, r19, r23    C U1
+       ldq     r3, 40(r17)
+       lda     r18, -4(r18)
+       lda     r17, 48(r17)
+       mulq    r0, r19, r4     C U1
+       umulh   r0, r19, r20    C U1
+       lda     r16, -8(r16)
+       bgt     r18, L(gt6)
+
+C Exactly 6 limbs: wind down from L(cj6).
+       mulq    r1, r19, r5     C U1
+       br      L(cj6)
+
+C More than 6 limbs: load the next limb and enter the loop at L(L2).
+L(gt6):        ldq     r0, 0(r17)
+       mulq    r1, r19, r5     C U1
+       br      L(L2)
+
+
+C Count mod 4 == 1.
+L(b1): bgt     r18, L(gt1)
+
+C Exactly 1 limb: r16 is biased by -48 so the 48 offset used at L(cj1)
+C addresses destination limb 0.
+       mulq    r24, r19, r7    C U1
+       umulh   r24, r19, r23   C U1
+       lda     r16, -48(r16)
+       br      L(cj1)
+
+C 5 or more limbs: the product of limb 0 goes to r7/r23, the product of
+C limb 1 to r4/r20, and limbs 2..4 are left in r1..r3.  r16 is biased by -16
+C because the first store of this path is at 16(r16).
+L(gt1):        ldq     r0, 8(r17)
+       ldq     r1, 16(r17)
+       ldq     r2, 24(r17)
+       mulq    r24, r19, r7    C U1
+       umulh   r24, r19, r23   C U1
+       ldq     r3, 32(r17)
+       lda     r18, -4(r18)
+       lda     r17, 40(r17)
+       mulq    r0, r19, r4     C U1
+       umulh   r0, r19, r20    C U1
+       lda     r16, -16(r16)
+       bgt     r18, L(gt5)
+
+C Exactly 5 limbs: wind down from L(cj5).
+       mulq    r1, r19, r5     C U1
+       umulh   r1, r19, r21    C U1
+       mulq    r2, r19, r6     C U1
+       br      L(cj5)
+
+C More than 5 limbs: load the next two limbs and enter the loop at L(L1).
+L(gt5):        ldq     r0, 0(r17)
+       mulq    r1, r19, r5     C U1
+       umulh   r1, r19, r21    C U1
+       ldq     r1, 8(r17)
+       mulq    r2, r19, r6     C U1
+       br      L(L1)
+
+
+C Count mod 4 == 0.  Limbs 1..3 are loaded, r17 advances past the first
+C four limbs, and the product of limb 0 goes to r4/r20.  r16 is biased by
+C -24 because the first store of this path is at 24(r16).
+L(b0): ldq     r1, 8(r17)
+       ldq     r2, 16(r17)
+       ldq     r3, 24(r17)
+       lda     r17, 32(r17)
+       lda     r16, -24(r16)
+       mulq    r24, r19, r4    C U1
+       umulh   r24, r19, r20   C U1
+       bgt     r18, L(gt4)
+
+C Exactly 4 limbs: wind down from L(cj4).
+       mulq    r1, r19, r5     C U1
+       umulh   r1, r19, r21    C U1
+       mulq    r2, r19, r6     C U1
+       umulh   r2, r19, r22    C U1
+       mulq    r3, r19, r7     C U1
+       br      L(cj4)
+
+C More than 4 limbs: load the next three limbs and enter the loop at L(L0).
+L(gt4):        ldq     r0, 0(r17)
+       mulq    r1, r19, r5     C U1
+       umulh   r1, r19, r21    C U1
+       ldq     r1, 8(r17)
+       mulq    r2, r19, r6     C U1
+       umulh   r2, r19, r22    C U1
+       ldq     r2, 16(r17)
+       mulq    r3, r19, r7     C U1
+       br      L(L0)
+
+C *** MAIN LOOP START ***
+C Four limbs per pass, one stage per limb.  Each stage:
+C   - starts the low product of a limb loaded earlier,
+C   - completes the previous stage by subtracting the pending r28 from r8,
+C   - starts the matching high product,
+C   - subtracts an already computed low product from r8, recording the
+C     borrow, and stores the result,
+C   - adds the borrow to the matching high product to form the next r28,
+C   - reloads the source register it has just finished multiplying.
+C L(L3), L(L2), L(L1) and L(L0) are the feed-in entry points.  They sit after
+C the stage's first two instructions because the feed-in code has already
+C issued that mulq and no r28 is pending for the first limb.
+       ALIGN(16)
+L(top):        mulq    r0, r19, r4     C U1
+       subq    r8, r28, r8
+L(L3): umulh   r0, r19, r20    C U1
+       cmpult  r8, r5, r28
+       ldq     r0, 0(r17)
+       subq    r8, r5, r8
+       addq    r21, r28, r28
+       stq     r8, 0(r16)
+
+       mulq    r1, r19, r5     C U1
+       subq    r8, r28, r8
+L(L2): umulh   r1, r19, r21    C U1
+       cmpult  r8, r6, r28
+       ldq     r1, 8(r17)
+       subq    r8, r6, r8
+       addq    r22, r28, r28
+       stq     r8, 8(r16)
+
+       mulq    r2, r19, r6     C U1
+       subq    r8, r28, r8
+L(L1): umulh   r2, r19, r22    C U1
+       cmpult  r8, r7, r28
+       ldq     r2, 16(r17)
+       subq    r8, r7, r8
+       addq    r23, r28, r28
+       stq     r8, 16(r16)
+
+       mulq    r3, r19, r7     C U1
+       subq    r8, r28, r8
+L(L0): umulh   r3, r19, r23    C U1
+       cmpult  r8, r4, r28
+       ldq     r3, 24(r17)
+       subq    r8, r4, r8
+       addq    r20, r28, r28
+       stq     r8, 24(r16)
+
+C Step the countdown and both pointers by four limbs.
+       lda     r18, -4(r18)
+       lda     r17, 32(r17)
+       lda     r16, 32(r16)
+       bgt     r18, L(top)
+C *** MAIN LOOP END ***
+
+C Wind-down: the same stages without the loads, draining the limbs already
+C held in r0..r3 and the products already in flight.  L(cjN) is the entry
+C point with N destination limbs still to be stored; the short-operand
+C feed-in paths jump straight to the matching one.
+       mulq    r0, r19, r4     C U1
+       subq    r8, r28, r8
+L(cj7):        umulh   r0, r19, r20    C U1
+       cmpult  r8, r5, r28
+       subq    r8, r5, r8
+       addq    r21, r28, r28
+       stq     r8, 0(r16)
+       mulq    r1, r19, r5     C U1
+       subq    r8, r28, r8
+L(cj6):        umulh   r1, r19, r21    C U1
+       cmpult  r8, r6, r28
+       subq    r8, r6, r8
+       addq    r22, r28, r28
+       stq     r8, 8(r16)
+       mulq    r2, r19, r6     C U1
+       subq    r8, r28, r8
+L(cj5):        umulh   r2, r19, r22    C U1
+       cmpult  r8, r7, r28
+       subq    r8, r7, r8
+       addq    r23, r28, r28
+       stq     r8, 16(r16)
+       mulq    r3, r19, r7     C U1
+       subq    r8, r28, r8
+L(cj4):        umulh   r3, r19, r23    C U1
+       cmpult  r8, r4, r28
+       subq    r8, r4, r8
+       addq    r20, r28, r28
+       stq     r8, 24(r16)
+       subq    r8, r28, r8
+C From here on every product is already available; only subtracts and
+C stores remain.
+L(cj3):        cmpult  r8, r5, r28
+       subq    r8, r5, r8
+       addq    r21, r28, r28
+       stq     r8, 32(r16)
+       subq    r8, r28, r8
+L(cj2):        cmpult  r8, r6, r28
+       subq    r8, r6, r8
+       addq    r22, r28, r28
+       stq     r8, 40(r16)
+       subq    r8, r28, r8
+L(cj1):        cmpult  r8, r7, r28
+       subq    r8, r7, r8
+       addq    r23, r28, r28
+       stq     r8, 48(r16)
+C Apply the last pending r28 and leave the final accumulator in r0 as the
+C return value.
+       subq    r8, r28, r0
+       ret     r31, (r26), 1
+
+EPILOGUE()
+ASM_END()