X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Falpha%2Fev6%2Fmul_1.asm;fp=gmp%2Fmpn%2Falpha%2Fev6%2Fmul_1.asm;h=841f5083cbeb38f0690744f0f0c7bc0f3998c762;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git diff --git a/gmp/mpn/alpha/ev6/mul_1.asm b/gmp/mpn/alpha/ev6/mul_1.asm new file mode 100644 index 00000000..841f5083 --- /dev/null +++ b/gmp/mpn/alpha/ev6/mul_1.asm @@ -0,0 +1,485 @@ +dnl Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the +dnl result in a second limb vector. + +dnl Copyright 2000, 2001, 2005 Free Software Foundation, Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C INPUT PARAMETERS +C res_ptr r16 +C s1_ptr r17 +C size r18 +C s2_limb r19 + +C This code runs at 2.25 cycles/limb on EV6. + +C This code was written in close cooperation with ev6 pipeline expert +C Steve Root. Any errors are tege's fault, though. + +C Code structure: + +C code for n < 8 +C code for n > 8 code for (n mod 8) +C code for (n div 8) feed-in code +C 8-way unrolled loop +C wind-down code + +C Some notes about unrolled loop: +C +C r1-r8 multiplies and workup +C r21-r28 multiplies and workup +C r9-r12 loads +C r0 -1 +C r20,r29,r13-r15 scramble +C +C We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a +C put-the-carry-into-hi. The idea is that these branches are very rarely +C taken, and since a non-taken branch consumes no resurces, that is better +C than an addq. +C +C Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an +C add NEXT cycle #09 which feeds a store in NEXT cycle #02 + +C The code could use some further work: +C 1. Speed up really small multiplies. The default alpha/mul_1.asm code is +C faster than this for size < 3. +C 2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless +C that is too costly. +C 3. Consider using 4-way unrolling, even if that runs slower. +C 4. Reduce register usage. In particular, try to avoid using r29. + +ASM_START() +PROLOGUE(mpn_mul_1) + cmpult r18, 8, r1 + beq r1, $Large +$Lsmall: + ldq r2,0(r17) C r2 = s1_limb + lda r18,-1(r18) C size-- + mulq r2,r19,r3 C r3 = prod_low + bic r31,r31,r4 C clear cy_limb + umulh r2,r19,r0 C r0 = prod_high + beq r18,$Le1a C jump if size was == 1 + ldq r2,8(r17) C r2 = s1_limb + lda r18,-1(r18) C size-- + stq r3,0(r16) + beq r18,$Le2a C jump if size was == 2 + ALIGN(8) +$Lopa: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + lda r18,-1(r18) C size-- + umulh r2,r19,r4 C r4 = cy_limb + ldq r2,16(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,8(r16) + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + lda r16,8(r16) C res_ptr++ + bne r18,$Lopa + +$Le2a: mulq r2,r19,r3 C r3 = prod_low + addq r4,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r4 C r4 = cy_limb + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,8(r16) + addq r4,r0,r0 C cy_limb = prod_high + cy + ret r31,(r26),1 +$Le1a: stq r3,0(r16) + ret r31,(r26),1 + +$Large: + lda r30, -224(r30) + stq r26, 0(r30) + stq r9, 8(r30) + stq r10, 16(r30) + stq r11, 24(r30) + stq r12, 32(r30) + stq r13, 40(r30) + stq r14, 48(r30) + stq r15, 56(r30) + stq r29, 64(r30) + + and r18, 7, r20 C count for the first loop, 0-7 + srl r18, 3, r18 C count for unrolled loop + bis r31, r31, r21 + beq r20, $L_8_or_more C skip first loop + +$L_9_or_more: + ldq r2,0(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + lda r20,-1(r20) C size-- + mulq r2,r19,r3 C r3 = prod_low + umulh r2,r19,r21 C r21 = prod_high + beq r20,$Le1b C jump if size was == 1 + bis r31, r31, r0 C FIXME: shouldtn't need this + ldq r2,0(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + lda r20,-1(r20) C size-- + stq r3,0(r16) + lda r16,8(r16) C res_ptr++ + beq r20,$Le2b C jump if size was == 2 + ALIGN(8) +$Lopb: mulq r2,r19,r3 C r3 = prod_low + addq r21,r0,r0 C cy_limb = cy_limb + 'cy' + lda r20,-1(r20) C size-- + umulh r2,r19,r21 C r21 = prod_high + ldq r2,0(r17) C r2 = s1_limb + lda r17,8(r17) C s1_ptr++ + addq r3,r0,r3 C r3 = cy_limb + prod_low + stq r3,0(r16) + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + lda r16,8(r16) C res_ptr++ + bne r20,$Lopb + +$Le2b: mulq r2,r19,r3 C r3 = prod_low + addq r21,r0,r0 C cy_limb = cy_limb + 'cy' + umulh r2,r19,r21 C r21 = prod_high + addq r3,r0,r3 C r3 = cy_limb + prod_low + cmpult r3,r0,r0 C r0 = carry from (cy_limb + prod_low) + stq r3,0(r16) + lda r16,8(r16) C res_ptr++ + addq r21,r0,r21 C cy_limb = prod_high + cy + br r31, $L_8_or_more +$Le1b: stq r3,0(r16) + lda r16,8(r16) C res_ptr++ + +$L_8_or_more: + lda r0, -1(r31) C put -1 in r0, for tricky loop control + lda r17, -32(r17) C L1 bookkeeping + lda r18, -1(r18) C decrement count + + ldq r9, 32(r17) C L1 + ldq r10, 40(r17) C L1 + mulq r9, r19, r22 C U1 #07 + ldq r11, 48(r17) C L1 + umulh r9, r19, r23 C U1 #08 + ldq r12, 56(r17) C L1 + mulq r10, r19, r24 C U1 #09 + ldq r9, 64(r17) C L1 + + lda r17, 64(r17) C L1 bookkeeping + + umulh r10, r19, r25 C U1 #11 + mulq r11, r19, r26 C U1 #12 + umulh r11, r19, r27 C U1 #13 + mulq r12, r19, r28 C U1 #14 + ldq r10, 8(r17) C L1 + umulh r12, r19, r1 C U1 #15 + ldq r11, 16(r17) C L1 + mulq r9, r19, r2 C U1 #16 + ldq r12, 24(r17) C L1 + umulh r9, r19, r3 C U1 #17 + addq r21, r22, r13 C L1 mov + mulq r10, r19, r4 C U1 #18 + addq r23, r24, r22 C L0 sum 2 mul's + cmpult r13, r21, r14 C L1 carry from sum + bgt r18, $L_16_or_more + + cmpult r22, r24, r24 C U0 carry from sum + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + umulh r11, r19, r7 C U1 #04 + addq r27, r28, r28 C U0 sum 2 mul's + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 32(r16) C L1 bookkeeping + addq r13, r31, r13 C U0 start carry cascade + umulh r12, r19, r21 C U1 #06 + br r31, $ret0c + +$L_16_or_more: +C --------------------------------------------------------------- + subq r18,1,r18 + cmpult r22, r24, r24 C U0 carry from sum + ldq r9, 32(r17) C L1 + + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + umulh r11, r19, r7 C U1 #04 + addq r27, r28, r28 C U0 sum 2 mul's + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 32(r16) C L1 bookkeeping + addq r13, r31, r13 C U0 start carry cascade + + umulh r12, r19, r21 C U1 #06 +C beq r13, $fix0w C U0 +$ret0w: addq r22, r14, r26 C L0 + ldq r10, 40(r17) C L1 + + mulq r9, r19, r22 C U1 #07 + beq r26, $fix1w C U0 +$ret1w: addq r23, r24, r27 C L0 + ldq r11, 48(r17) C L1 + + umulh r9, r19, r23 C U1 #08 + beq r27, $fix2w C U0 +$ret2w: addq r28, r25, r28 C L0 + ldq r12, 56(r17) C L1 + + mulq r10, r19, r24 C U1 #09 + beq r28, $fix3w C U0 +$ret3w: addq r1, r2, r20 C L0 sum 2 mul's + ldq r9, 64(r17) C L1 + + addq r3, r4, r2 C L0 #10 2 mul's + lda r17, 64(r17) C L1 bookkeeping + cmpult r20, r1, r29 C U0 carry from sum + + umulh r10, r19, r25 C U1 #11 + cmpult r2, r4, r4 C U0 carry from sum + stq r13, -32(r16) C L0 + stq r26, -24(r16) C L1 + + mulq r11, r19, r26 C U1 #12 + addq r5, r6, r14 C U0 sum 2 mul's + stq r27, -16(r16) C L0 + stq r28, -8(r16) C L1 + + umulh r11, r19, r27 C U1 #13 + cmpult r14, r6, r3 C U0 carry from sum +C could do cross-jumping here: +C bra $L_middle_of_unrolled_loop + mulq r12, r19, r28 C U1 #14 + addq r7, r3, r5 C L0 eat carry + addq r20, r15, r20 C U0 carry cascade + ldq r10, 8(r17) C L1 + + umulh r12, r19, r1 C U1 #15 + beq r20, $fix4 C U0 +$ret4w: addq r2, r29, r6 C L0 + ldq r11, 16(r17) C L1 + + mulq r9, r19, r2 C U1 #16 + beq r6, $fix5 C U0 +$ret5w: addq r14, r4, r7 C L0 + ldq r12, 24(r17) C L1 + + umulh r9, r19, r3 C U1 #17 + beq r7, $fix6 C U0 +$ret6w: addq r5, r8, r8 C L0 sum 2 + addq r21, r22, r13 C L1 sum 2 mul's + + mulq r10, r19, r4 C U1 #18 + addq r23, r24, r22 C L0 sum 2 mul's + cmpult r13, r21, r14 C L1 carry from sum + ble r18, $Lend C U0 +C --------------------------------------------------------------- + ALIGN(16) +$Loop: + umulh r0, r18, r18 C U1 #01 decrement r18! + cmpult r8, r5, r29 C L0 carry from last bunch + cmpult r22, r24, r24 C U0 carry from sum + ldq r9, 32(r17) C L1 + + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + stq r20, 0(r16) C L0 + stq r6, 8(r16) C L1 + + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + stq r7, 16(r16) C L0 + stq r8, 24(r16) C L1 + + umulh r11, r19, r7 C U1 #04 + bis r31, r31, r31 C L0 st slosh + bis r31, r31, r31 C L1 st slosh + addq r27, r28, r28 C U0 sum 2 mul's + + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 64(r16) C L1 bookkeeping + addq r13, r29, r13 C U0 start carry cascade + + umulh r12, r19, r21 C U1 #06 + beq r13, $fix0 C U0 +$ret0: addq r22, r14, r26 C L0 + ldq r10, 40(r17) C L1 + + mulq r9, r19, r22 C U1 #07 + beq r26, $fix1 C U0 +$ret1: addq r23, r24, r27 C L0 + ldq r11, 48(r17) C L1 + + umulh r9, r19, r23 C U1 #08 + beq r27, $fix2 C U0 +$ret2: addq r28, r25, r28 C L0 + ldq r12, 56(r17) C L1 + + mulq r10, r19, r24 C U1 #09 + beq r28, $fix3 C U0 +$ret3: addq r1, r2, r20 C L0 sum 2 mul's + ldq r9, 64(r17) C L1 + + addq r3, r4, r2 C L0 #10 2 mul's + bis r31, r31, r31 C U1 mul hole + lda r17, 64(r17) C L1 bookkeeping + cmpult r20, r1, r29 C U0 carry from sum + + umulh r10, r19, r25 C U1 #11 + cmpult r2, r4, r4 C U0 carry from sum + stq r13, -32(r16) C L0 + stq r26, -24(r16) C L1 + + mulq r11, r19, r26 C U1 #12 + addq r5, r6, r14 C U0 sum 2 mul's + stq r27, -16(r16) C L0 + stq r28, -8(r16) C L1 + + umulh r11, r19, r27 C U1 #13 + bis r31, r31, r31 C L0 st slosh + bis r31, r31, r31 C L1 st slosh + cmpult r14, r6, r3 C U0 carry from sum +$L_middle_of_unrolled_loop: + mulq r12, r19, r28 C U1 #14 + addq r7, r3, r5 C L0 eat carry + addq r20, r15, r20 C U0 carry cascade + ldq r10, 8(r17) C L1 + + umulh r12, r19, r1 C U1 #15 + beq r20, $fix4 C U0 +$ret4: addq r2, r29, r6 C L0 + ldq r11, 16(r17) C L1 + + mulq r9, r19, r2 C U1 #16 + beq r6, $fix5 C U0 +$ret5: addq r14, r4, r7 C L0 + ldq r12, 24(r17) C L1 + + umulh r9, r19, r3 C U1 #17 + beq r7, $fix6 C U0 +$ret6: addq r5, r8, r8 C L0 sum 2 + addq r21, r22, r13 C L1 sum 2 mul's + + mulq r10, r19, r4 C U1 #18 + addq r23, r24, r22 C L0 sum 2 mul's + cmpult r13, r21, r14 C L1 carry from sum + bgt r18, $Loop C U0 +C --------------------------------------------------------------- +$Lend: + cmpult r8, r5, r29 C L0 carry from last bunch + cmpult r22, r24, r24 C U0 carry from sum + + umulh r10, r19, r5 C U1 #02 + addq r25, r26, r23 C U0 sum 2 mul's + stq r20, 0(r16) C L0 + stq r6, 8(r16) C L1 + + mulq r11, r19, r6 C U1 #03 + cmpult r23, r26, r25 C U0 carry from sum + stq r7, 16(r16) C L0 + stq r8, 24(r16) C L1 + + umulh r11, r19, r7 C U1 #04 + addq r27, r28, r28 C U0 sum 2 mul's + + mulq r12, r19, r8 C U1 #05 + cmpult r28, r27, r15 C L0 carry from sum + lda r16, 64(r16) C L1 bookkeeping + addq r13, r29, r13 C U0 start carry cascade + + umulh r12, r19, r21 C U1 #06 + beq r13, $fix0c C U0 +$ret0c: addq r22, r14, r26 C L0 + beq r26, $fix1c C U0 +$ret1c: addq r23, r24, r27 C L0 + beq r27, $fix2c C U0 +$ret2c: addq r28, r25, r28 C L0 + beq r28, $fix3c C U0 +$ret3c: addq r1, r2, r20 C L0 sum 2 mul's + addq r3, r4, r2 C L0 #10 2 mul's + lda r17, 64(r17) C L1 bookkeeping + cmpult r20, r1, r29 C U0 carry from sum + cmpult r2, r4, r4 C U0 carry from sum + stq r13, -32(r16) C L0 + stq r26, -24(r16) C L1 + addq r5, r6, r14 C U0 sum 2 mul's + stq r27, -16(r16) C L0 + stq r28, -8(r16) C L1 + cmpult r14, r6, r3 C U0 carry from sum + addq r7, r3, r5 C L0 eat carry + addq r20, r15, r20 C U0 carry cascade + beq r20, $fix4c C U0 +$ret4c: addq r2, r29, r6 C L0 + beq r6, $fix5c C U0 +$ret5c: addq r14, r4, r7 C L0 + beq r7, $fix6c C U0 +$ret6c: addq r5, r8, r8 C L0 sum 2 + cmpult r8, r5, r29 C L0 carry from last bunch + stq r20, 0(r16) C L0 + stq r6, 8(r16) C L1 + stq r7, 16(r16) C L0 + stq r8, 24(r16) C L1 + addq r29, r21, r0 + + ldq r26, 0(r30) + ldq r9, 8(r30) + ldq r10, 16(r30) + ldq r11, 24(r30) + ldq r12, 32(r30) + ldq r13, 40(r30) + ldq r14, 48(r30) + ldq r15, 56(r30) + ldq r29, 64(r30) + lda r30, 224(r30) + ret r31, (r26), 1 + +C $fix0w: bis r14, r29, r14 C join carries +C br r31, $ret0w +$fix1w: bis r24, r14, r24 C join carries + br r31, $ret1w +$fix2w: bis r25, r24, r25 C join carries + br r31, $ret2w +$fix3w: bis r15, r25, r15 C join carries + br r31, $ret3w +$fix0: bis r14, r29, r14 C join carries + br r31, $ret0 +$fix1: bis r24, r14, r24 C join carries + br r31, $ret1 +$fix2: bis r25, r24, r25 C join carries + br r31, $ret2 +$fix3: bis r15, r25, r15 C join carries + br r31, $ret3 +$fix4: bis r29, r15, r29 C join carries + br r31, $ret4 +$fix5: bis r4, r29, r4 C join carries + br r31, $ret5 +$fix6: addq r5, r4, r5 C can't carry twice! + br r31, $ret6 +$fix0c: bis r14, r29, r14 C join carries + br r31, $ret0c +$fix1c: bis r24, r14, r24 C join carries + br r31, $ret1c +$fix2c: bis r25, r24, r25 C join carries + br r31, $ret2c +$fix3c: bis r15, r25, r15 C join carries + br r31, $ret3c +$fix4c: bis r29, r15, r29 C join carries + br r31, $ret4c +$fix5c: bis r4, r29, r4 C join carries + br r31, $ret5c +$fix6c: addq r5, r4, r5 C can't carry twice! + br r31, $ret6c + +EPILOGUE(mpn_mul_1) +ASM_END()