--- /dev/null
+dnl Alpha ev6 nails mpn_mul_1.
+
+dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C EV4: 42
+C EV5: 18
+C EV6: 3.25
+
+C TODO
+C * Reroll loop for 3.0 c/l with current 4-way unrulling.
+C * The loop is overscheduled wrt loads and wrt multiplies, in particular
+C umulh.
+C * Use FP loop count and multiple exit points, that would simpily feed-in lp0
+C and would work since the loop structure is really regular.
+
+C INPUT PARAMETERS
+define(`rp',`r16')
+define(`up',`r17')
+define(`n', `r18')
+define(`vl0',`r19')
+
+define(`numb_mask',`r6')
+
+define(`m0a',`r0')
+define(`m0b',`r1')
+define(`m1a',`r2')
+define(`m1b',`r3')
+define(`m2a',`r20')
+define(`m2b',`r21')
+define(`m3a',`r22')
+define(`m3b',`r23')
+
+define(`acc0',`r25')
+define(`acc1',`r27')
+
+define(`ul0',`r4')
+define(`ul1',`r5')
+define(`ul2',`r4')
+define(`ul3',`r5')
+
+define(`rl0',`r24')
+define(`rl1',`r24')
+define(`rl2',`r24')
+define(`rl3',`r24')
+
+define(`t0',`r7')
+define(`t1',`r8')
+
+define(`NAIL_BITS',`GMP_NAIL_BITS')
+define(`NUMB_BITS',`GMP_NUMB_BITS')
+
+dnl This declaration is munged by configure
+NAILS_SUPPORT(1-63)
+
+ASM_START()
+PROLOGUE(mpn_mul_1)
+ sll vl0, NAIL_BITS, vl0
+ lda numb_mask, -1(r31)
+ srl numb_mask, NAIL_BITS, numb_mask
+
+ and n, 3, r25
+ cmpeq r25, 1, r21
+ bne r21, L(1m4)
+ cmpeq r25, 2, r21
+ bne r21, L(2m4)
+ beq r25, L(0m4)
+
+L(3m4): ldq ul3, 0(up)
+ lda n, -4(n)
+ ldq ul0, 8(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 16(up)
+ lda up, 24(up)
+ lda rp, -8(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge3)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(ta3)
+
+L(ge3): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, r31, acc1
+ umulh vl0, ul2, m2b
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ br r31, L(el3)
+
+L(0m4): lda n, -8(n)
+ ldq ul2, 0(up)
+ ldq ul3, 8(up)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge4)
+
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta4)
+
+L(ge4): srl m2a,NAIL_BITS, t0
+ ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ addq t0, r31, acc0
+ umulh vl0, ul1, m1b
+ srl m3a,NAIL_BITS, t0
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(el0)
+
+L(2m4): lda n, -4(n)
+ ldq ul0, 0(up)
+ ldq ul1, 8(up)
+ lda up, 16(up)
+ lda rp, -16(rp)
+ mulq vl0, ul0, m0a
+ umulh vl0, ul0, m0b
+ bge n, L(ge2)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m0a,NAIL_BITS, t0
+ addq t0, r31, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ br r31, L(ta2)
+
+L(ge2): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ srl m0a,NAIL_BITS, t0
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ addq t0, r31, acc0
+ umulh vl0, ul3, m3b
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ bge n, L(el2)
+
+ br r31, L(ta6)
+
+L(1m4): lda n, -4(n)
+ ldq ul1, 0(up)
+ lda up, 8(up)
+ lda rp, -24(rp)
+ bge n, L(ge1)
+
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ srl m1a,NAIL_BITS, t0
+ addq t0, r31, acc1
+ and acc1,numb_mask, r28
+ srl acc1,NUMB_BITS, t1
+ stq r28, 24(rp)
+ addq t1, m1b, r0
+ ret r31, (r26), 1
+
+L(ge1): ldq ul2, 0(up)
+ mulq vl0, ul1, m1a
+ umulh vl0, ul1, m1b
+ ldq ul3, 8(up)
+ lda n, -4(n)
+ mulq vl0, ul2, m2a
+ umulh vl0, ul2, m2b
+ ldq ul0, 16(up)
+ mulq vl0, ul3, m3a
+ umulh vl0, ul3, m3b
+ srl m1a,NAIL_BITS, t0
+ ldq ul1, 24(up)
+ lda up, 32(up)
+ lda rp, 32(rp)
+ mulq vl0, ul0, m0a
+ addq t0, r31, acc1
+ umulh vl0, ul0, m0b
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ blt n, L(ta5)
+
+L(ge5): ldq ul2, 0(up)
+ br r31, L(el1)
+
+ ALIGN(16)
+L(top): mulq vl0, ul0, m0a C U1
+ addq t0, m0b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -24(rp) C L1
+C
+L(el2): umulh vl0, ul0, m0b C U1
+ and acc0,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m2a,NAIL_BITS, t0 C U0
+ ldq ul2, 0(up) C L1
+C
+ mulq vl0, ul1, m1a C U1
+ addq t0, m1b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, -16(rp) C L1
+C
+L(el1): umulh vl0, ul1, m1b C U1
+ and acc1,numb_mask, r28 C L0
+ unop C U0
+ lda n, -4(n) C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m3a,NAIL_BITS, t0 C U0
+ ldq ul3, 8(up) C L1
+C
+ mulq vl0, ul2, m2a C U1
+ addq t0, m2b, acc1 C L0
+ srl acc0,NUMB_BITS, t1 C U0
+ stq r28, -8(rp) C L1
+C
+L(el0): umulh vl0, ul2, m2b C U1
+ and acc0,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc1, acc1 C L0
+ srl m0a,NAIL_BITS, t0 C U0
+ ldq ul0, 16(up) C L1
+C
+ mulq vl0, ul3, m3a C U1
+ addq t0, m3b, acc0 C L0
+ srl acc1,NUMB_BITS, t1 C U0
+ stq r28, 0(rp) C L1
+C
+L(el3): umulh vl0, ul3, m3b C U1
+ and acc1,numb_mask, r28 C L0
+ unop C U0
+ unop C L1
+C
+ unop C U1
+ addq t1, acc0, acc0 C L0
+ srl m1a,NAIL_BITS, t0 C U0
+ ldq ul1, 24(up) C L1
+C
+ lda up, 32(up) C L0
+ unop C U1
+ lda rp, 32(rp) C L1
+ bge n, L(top) C U0
+
+L(end): mulq vl0, ul0, m0a
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -24(rp)
+L(ta6): umulh vl0, ul0, m0b
+ and acc0,numb_mask, r28
+ addq t1, acc1, acc1
+ srl m2a,NAIL_BITS, t0
+ mulq vl0, ul1, m1a
+ addq t0, m1b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, -16(rp)
+L(ta5): umulh vl0, ul1, m1b
+ and acc1,numb_mask, r28
+ addq t1, acc0, acc0
+ srl m3a,NAIL_BITS, t0
+ addq t0, m2b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, -8(rp)
+ ALIGN(16)
+L(ta4): and acc0,numb_mask, r28
+ addq t1, acc1, acc1
+ srl m0a,NAIL_BITS, t0
+ addq t0, m3b, acc0
+ srl acc1,NUMB_BITS, t1
+ stq r28, 0(rp)
+ unop
+ ALIGN(16)
+L(ta3): and acc1,numb_mask, r28
+ addq t1, acc0, acc0
+ srl m1a,NAIL_BITS, t0
+ addq t0, m0b, acc1
+ srl acc0,NUMB_BITS, t1
+ stq r28, 8(rp)
+ unop
+ ALIGN(16)
+L(ta2): and acc0,numb_mask, r28
+ addq t1, acc1, acc1
+ srl acc1,NUMB_BITS, t1
+ stq r28, 16(rp)
+ and acc1,numb_mask, r28
+ addq t1, m1b, r0
+ stq r28, 24(rp)
+ ret r31, (r26), 1
+EPILOGUE()
+ASM_END()