]> oss.titaniummirror.com Git - msp430-gcc.git/blobdiff - gmp/mpn/alpha/ev6/aorsmul_1.asm
Imported gcc-4.4.3
[msp430-gcc.git] / gmp / mpn / alpha / ev6 / aorsmul_1.asm
diff --git a/gmp/mpn/alpha/ev6/aorsmul_1.asm b/gmp/mpn/alpha/ev6/aorsmul_1.asm
new file mode 100644 (file)
index 0000000..eda092b
--- /dev/null
@@ -0,0 +1,387 @@
+dnl  Alpha ev6 mpn_addmul_1 and mpn_submul_1.
+
+dnl  Copyright 2000, 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C      cycles/limb
+C EV4:    42
+C EV5:    18
+C EV6:     3.5
+
+C  INPUT PARAMETERS
+define(`rp',   `r16')
+define(`up',   `r17')
+define(`n',    `r18')
+define(`v0',   `r19')
+
+dnl  This code was written in cooperation with ev6 pipeline expert Steve Root.
+
+dnl  The stores can issue a cycle late so we have paired no-op's to 'catch'
+dnl  them, so that further disturbance to the schedule is damped.
+
+dnl  We couldn't pair the loads, because the entangled schedule of the carry's
+dnl  has to happen on one side {0} of the machine.
+
+dnl  This is a great schedule for the d_cache, a poor schedule for the b_cache.
+dnl  The lockup on U0 means that any stall can't be recovered from.  Consider a
+dnl  ldq in L1, say that load gets stalled because it collides with a fill from
+dnl  the b_cache.  On the next cycle, this load gets priority.  If first looks
+dnl  at L0, and goes there.  The instruction we intended for L0 gets to look at
+dnl  L1, which is NOT where we want it.  It either stalls 1, because it can't
+dnl  go in L0, or goes there, and causes a further instruction to stall.
+
+dnl  So for b_cache, we're likely going to want to put one or more cycles back
+dnl  into the code! And, of course, put in lds prefetch for the rp[] operand.
+dnl  At a place where we have an mt followed by a bookkeeping, put the
+dnl  bookkeeping in upper, and the prefetch into lower.
+
+dnl  Note, the ldq's and stq's are at the end of the quadpacks.  Note, we'd
+dnl  like not to have an ldq or an stq to preceded a conditional branch in a
+dnl  quadpack.  The conditional branch moves the retire pointer one cycle
+dnl  later.
+
+ifdef(`OPERATION_addmul_1',`
+    define(`ADDSUB',   `addq')
+    define(`CMPCY',    `cmpult $2,$1')
+    define(`func',     `mpn_addmul_1')
+')
+ifdef(`OPERATION_submul_1',`
+    define(`ADDSUB',   `subq')
+    define(`CMPCY',    `cmpult $1,$2')
+    define(`func',     `mpn_submul_1')
+')
+
+MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
+
+ASM_START()
+PROLOGUE(func)
+       ldq     r3,     0(up)           C
+       and     r18,    7,      r20     C
+       lda     r18,    -9(r18)         C
+       cmpeq   r20,    1,      r21     C
+       beq     r21,    $L1             C
+
+$1mod8:        ldq     r5,     0(rp)           C
+       mulq    v0,     r3,     r7      C
+       umulh   v0,     r3,     r8      C
+       ADDSUB  r5,     r7,     r23     C
+       CMPCY(  r5,     r23),   r20     C
+       addq    r8,     r20,    r0      C
+       stq     r23,    0(rp)           C
+       bge     r18,    $ent1           C
+       ret     r31,    (r26),  1       C
+
+$L1:   lda     r8,     0(r31)          C zero carry reg
+       lda     r24,    0(r31)          C zero carry reg
+       cmpeq   r20,    2,      r21     C
+       bne     r21,    $2mod8          C
+       cmpeq   r20,    3,      r21     C
+       bne     r21,    $3mod8          C
+       cmpeq   r20,    4,      r21     C
+       bne     r21,    $4mod8          C
+       cmpeq   r20,    5,      r21     C
+       bne     r21,    $5mod8          C
+       cmpeq   r20,    6,      r21     C
+       bne     r21,    $6mod8          C
+       cmpeq   r20,    7,      r21     C
+       beq     r21,    $0mod8          C
+
+$7mod8:        ldq     r5,     0(rp)           C
+       lda     up,     8(up)           C
+       mulq    v0,     r3,     r7      C
+       umulh   v0,     r3,     r24     C
+       ADDSUB  r5,     r7,     r23     C
+       CMPCY(  r5,     r23),   r20     C
+       addq    r24,    r20,    r24     C
+       stq     r23,    0(rp)           C
+       lda     rp,     8(rp)           C
+       ldq     r3,     0(up)           C
+$6mod8:        ldq     r1,     8(up)           C
+       mulq    v0,     r3,     r25     C
+       umulh   v0,     r3,     r3      C
+       mulq    v0,     r1,     r28     C
+       ldq     r0,     16(up)          C
+       ldq     r4,     0(rp)           C
+       umulh   v0,     r1,     r8      C
+       ldq     r1,     24(up)          C
+       lda     up,     48(up)          C L1 bookkeeping
+       mulq    v0,     r0,     r2      C
+       ldq     r5,     8(rp)           C
+       lda     rp,     -32(rp)         C L1 bookkeeping
+       umulh   v0,     r0,     r6      C
+       ADDSUB  r4,     r25,    r25     C lo + acc
+       mulq    v0,     r1,     r7      C
+       br      r31,    $ent6           C
+
+$ent1: lda     up,     8(up)           C
+       lda     rp,     8(rp)           C
+       lda     r8,     0(r0)           C
+       ldq     r3,     0(up)           C
+$0mod8:        ldq     r1,     8(up)           C
+       mulq    v0,     r3,     r2      C
+       umulh   v0,     r3,     r6      C
+       mulq    v0,     r1,     r7      C
+       ldq     r0,     16(up)          C
+       ldq     r4,     0(rp)           C
+       umulh   v0,     r1,     r24     C
+       ldq     r1,     24(up)          C
+       mulq    v0,     r0,     r25     C
+       ldq     r5,     8(rp)           C
+       umulh   v0,     r0,     r3      C
+       ADDSUB  r4,     r2,     r2      C lo + acc
+       mulq    v0,     r1,     r28     C
+       lda     rp,     -16(rp)         C
+       br      r31,    $ent0           C
+
+$3mod8:        ldq     r5,     0(rp)           C
+       lda     up,     8(up)           C
+       mulq    v0,     r3,     r7      C
+       umulh   v0,     r3,     r8      C
+       ADDSUB  r5,     r7,     r23     C
+       CMPCY(  r5,     r23),   r20     C
+       addq    r8,     r20,    r24     C
+       stq     r23,    0(rp)           C
+       lda     rp,     8(rp)           C
+       ldq     r3,     0(up)           C
+$2mod8:        ldq     r1,     8(up)           C
+       mulq    v0,     r3,     r25     C
+       umulh   v0,     r3,     r3      C
+       mulq    v0,     r1,     r28     C
+       ble     r18,    $n23            C
+       ldq     r0,     16(up)          C
+       ldq     r4,     0(rp)           C
+       umulh   v0,     r1,     r8      C
+       ldq     r1,     24(up)          C
+       lda     up,     16(up)          C L1 bookkeeping
+       mulq    v0,     r0,     r2      C
+       ldq     r5,     8(rp)           C
+       lda     rp,     0(rp)           C L1 bookkeeping
+       umulh   v0,     r0,     r6      C
+       ADDSUB  r4,     r25,    r25     C lo + acc
+       mulq    v0,     r1,     r7      C
+       br      r31,    $ent2           C
+
+$5mod8:        ldq     r5,     0(rp)           C
+       lda     up,     8(up)           C
+       mulq    v0,     r3,     r7      C
+       umulh   v0,     r3,     r24     C
+       ADDSUB  r5,     r7,     r23     C
+       CMPCY(  r5,     r23),   r20     C
+       addq    r24,    r20,    r8      C
+       stq     r23,    0(rp)           C
+       lda     rp,     8(rp)           C
+       ldq     r3,     0(up)           C
+$4mod8:        ldq     r1,     8(up)           C
+       mulq    v0,     r3,     r2      C
+       umulh   v0,     r3,     r6      C
+       mulq    v0,     r1,     r7      C
+       ldq     r0,     16(up)          C
+       ldq     r4,     0(rp)           C
+       umulh   v0,     r1,     r24     C
+       ldq     r1,     24(up)          C
+       lda     up,     32(up)          C L1 bookkeeping
+       mulq    v0,     r0,     r25     C
+       ldq     r5,     8(rp)           C
+       lda     rp,     16(rp)          C L1 bookkeeping
+       umulh   v0,     r0,     r3      C
+       ADDSUB  r4,     r2,     r2      C lo + acc
+       mulq    v0,     r1,     r28     C
+       CMPCY(  r4,     r2),    r20     C L0 lo add => carry
+       ADDSUB  r2,     r8,     r22     C U0 hi add => answer
+       ble     r18,    $Lend           C
+       ALIGN(16)
+$Loop:
+       bis     r31,    r31,    r31     C U1 mt
+       CMPCY(  r2,     r22),   r21     C L0 hi add => carry
+       addq    r6,     r20,    r6      C U0 hi mul + carry
+       ldq     r0,     0(up)           C
+
+       bis     r31,    r31,    r31     C U1 mt
+       ADDSUB  r5,     r7,     r7      C L0 lo + acc
+       addq    r6,     r21,    r6      C U0 hi mul + carry
+       ldq     r4,     0(rp)           C L1
+
+       umulh   v0,     r1,     r8      C U1
+       CMPCY(  r5,     r7),    r20     C L0 lo add => carry
+       ADDSUB  r7,     r6,     r23     C U0 hi add => answer
+       ldq     r1,     8(up)           C L1
+
+       mulq    v0,     r0,     r2      C U1
+       CMPCY(  r7,     r23),   r21     C L0 hi add => carry
+       addq    r24,    r20,    r24     C U0 hi mul + carry
+       ldq     r5,     8(rp)           C L1
+
+       umulh   v0,     r0,     r6      C U1
+       ADDSUB  r4,     r25,    r25     C U0 lo + acc
+       stq     r22,    -16(rp)         C L0
+       stq     r23,    -8(rp)          C L1
+
+       bis     r31,    r31,    r31     C L0 st slosh
+       mulq    v0,     r1,     r7      C U1
+       bis     r31,    r31,    r31     C L1 st slosh
+       addq    r24,    r21,    r24     C U0 hi mul + carry
+$ent2:
+       CMPCY(  r4,     r25),   r20     C L0 lo add => carry
+       bis     r31,    r31,    r31     C U1 mt
+       lda     r18,    -8(r18)         C L1 bookkeeping
+       ADDSUB  r25,    r24,    r22     C U0 hi add => answer
+
+       bis     r31,    r31,    r31     C U1 mt
+       CMPCY(  r25,    r22),   r21     C L0 hi add => carry
+       addq    r3,     r20,    r3      C U0 hi mul + carry
+       ldq     r0,     16(up)          C L1
+
+       bis     r31,    r31,    r31     C U1 mt
+       ADDSUB  r5,     r28,    r28     C L0 lo + acc
+       addq    r3,     r21,    r3      C U0 hi mul + carry
+       ldq     r4,     16(rp)          C L1
+
+       umulh   v0,     r1,     r24     C U1
+       CMPCY(  r5,     r28),   r20     C L0 lo add => carry
+       ADDSUB  r28,    r3,     r23     C U0 hi add => answer
+       ldq     r1,     24(up)          C L1
+
+       mulq    v0,     r0,     r25     C U1
+       CMPCY(  r28,    r23),   r21     C L0 hi add => carry
+       addq    r8,     r20,    r8      C U0 hi mul + carry
+       ldq     r5,     24(rp)          C L1
+
+       umulh   v0,     r0,     r3      C U1
+       ADDSUB  r4,     r2,     r2      C U0 lo + acc
+       stq     r22,    0(rp)           C L0
+       stq     r23,    8(rp)           C L1
+
+       bis     r31,    r31,    r31     C L0 st slosh
+       mulq    v0,     r1,     r28     C U1
+       bis     r31,    r31,    r31     C L1 st slosh
+       addq    r8,     r21,    r8      C U0 hi mul + carry
+$ent0:
+       CMPCY(  r4,     r2),    r20     C L0 lo add => carry
+       bis     r31,    r31,    r31     C U1 mt
+       lda     up,     64(up)          C L1 bookkeeping
+       ADDSUB  r2,     r8,     r22     C U0 hi add => answer
+
+       bis     r31,    r31,    r31     C U1 mt
+       CMPCY(  r2,     r22),   r21     C L0 hi add => carry
+       addq    r6,     r20,    r6      C U0 hi mul + carry
+       ldq     r0,     -32(up)         C L1
+
+       bis     r31,    r31,    r31     C U1 mt
+       ADDSUB  r5,     r7,     r7      C L0 lo + acc
+       addq    r6,     r21,    r6      C U0 hi mul + carry
+       ldq     r4,     32(rp)          C L1
+
+       umulh   v0,     r1,     r8      C U1
+       CMPCY(  r5,     r7),    r20     C L0 lo add => carry
+       ADDSUB  r7,     r6,     r23     C U0 hi add => answer
+       ldq     r1,     -24(up)         C L1
+
+       mulq    v0,     r0,     r2      C U1
+       CMPCY(  r7,     r23),   r21     C L0 hi add => carry
+       addq    r24,    r20,    r24     C U0 hi mul + carry
+       ldq     r5,     40(rp)          C L1
+
+       umulh   v0,     r0,     r6      C U1
+       ADDSUB  r4,     r25,    r25     C U0 lo + acc
+       stq     r22,    16(rp)          C L0
+       stq     r23,    24(rp)          C L1
+
+       bis     r31,    r31,    r31     C L0 st slosh
+       mulq    v0,     r1,     r7      C U1
+       bis     r31,    r31,    r31     C L1 st slosh
+       addq    r24,    r21,    r24     C U0 hi mul + carry
+$ent6:
+       CMPCY(  r4,     r25),   r20     C L0 lo add => carry
+       bis     r31,    r31,    r31     C U1 mt
+       lda     rp,     64(rp)          C L1 bookkeeping
+       ADDSUB  r25,    r24,    r22     C U0 hi add => answer
+
+       bis     r31,    r31,    r31     C U1 mt
+       CMPCY(  r25,    r22),   r21     C L0 hi add => carry
+       addq    r3,     r20,    r3      C U0 hi mul + carry
+       ldq     r0,     -16(up)         C L1
+
+       bis     r31,    r31,    r31     C U1 mt
+       ADDSUB  r5,     r28,    r28     C L0 lo + acc
+       addq    r3,     r21,    r3      C U0 hi mul + carry
+       ldq     r4,     -16(rp)         C L1
+
+       umulh   v0,     r1,     r24     C U1
+       CMPCY(  r5,     r28),   r20     C L0 lo add => carry
+       ADDSUB  r28,    r3,     r23     C U0 hi add => answer
+       ldq     r1,     -8(up)          C L1
+
+       mulq    v0,     r0,     r25     C U1
+       CMPCY(  r28,    r23),   r21     C L0 hi add => carry
+       addq    r8,     r20,    r8      C U0 hi mul + carry
+       ldq     r5,     -8(rp)          C L1
+
+       umulh   v0,     r0,     r3      C U1
+       ADDSUB  r4,     r2,     r2      C U0 lo + acc
+       stq     r22,    -32(rp)         C L0
+       stq     r23,    -24(rp)         C L1
+
+       bis     r31,    r31,    r31     C L0 st slosh
+       mulq    v0,     r1,     r28     C U1
+       bis     r31,    r31,    r31     C L1 st slosh
+       addq    r8,     r21,    r8      C U0 hi mul + carry
+
+       CMPCY(  r4,     r2),    r20     C L0 lo add => carry
+       ADDSUB  r2,     r8,     r22     C U0 hi add => answer
+       ldl     r31,    256(up)         C prefetch up[]
+       bgt     r18,    $Loop           C U1 bookkeeping
+
+$Lend: CMPCY(  r2,     r22),   r21     C
+       addq    r6,     r20,    r6      C
+       ADDSUB  r5,     r7,     r7      C
+       addq    r6,     r21,    r6      C
+       ldq     r4,     0(rp)           C
+       umulh   v0,     r1,     r8      C
+       CMPCY(  r5,     r7),    r20     C
+       ADDSUB  r7,     r6,     r23     C
+       CMPCY(r7,       r23),   r21     C
+       addq    r24,    r20,    r24     C
+       ldq     r5,     8(rp)           C
+       ADDSUB  r4,     r25,    r25     C
+       stq     r22,    -16(rp)         C
+       stq     r23,    -8(rp)          C
+       addq    r24,    r21,    r24     C
+       br      L(x)
+
+       ALIGN(16)
+$n23:  ldq     r4,     0(rp)           C
+       ldq     r5,     8(rp)           C
+       umulh   v0,     r1,     r8      C
+       ADDSUB  r4,     r25,    r25     C
+L(x):  CMPCY(  r4,     r25),   r20     C
+       ADDSUB  r25,    r24,    r22     C
+       CMPCY(  r25,    r22),   r21     C
+       addq    r3,     r20,    r3      C
+       ADDSUB  r5,     r28,    r28     C
+       addq    r3,     r21,    r3      C
+       CMPCY(  r5,     r28),   r20     C
+       ADDSUB  r28,    r3,     r23     C
+       CMPCY(  r28,    r23),   r21     C
+       addq    r8,     r20,    r8      C
+       stq     r22,    0(rp)           C
+       stq     r23,    8(rp)           C
+       addq    r8,     r21,    r0      C
+       ret     r31,    (r26),  1       C
+EPILOGUE()
+ASM_END()