X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Falpha%2Fev5%2Fcom_n.asm;fp=gmp%2Fmpn%2Falpha%2Fev5%2Fcom_n.asm;h=979e711eb8d5e2cc1b91a93ec400a07708295fa5;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/alpha/ev5/com_n.asm b/gmp/mpn/alpha/ev5/com_n.asm
new file mode 100644
index 00000000..979e711e
--- /dev/null
+++ b/gmp/mpn/alpha/ev5/com_n.asm
@@ -0,0 +1,165 @@
+dnl Alpha EV5 mpn_com_n -- mpn one's complement.
+
+dnl Copyright 2003 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C      cycles/limb
+C EV4:     4.75
+C EV5:     2.0
+C EV6:     1.5
+
+
+C mp_limb_t mpn_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size);
+C
+C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
+C of 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
+C will be 1.5+2/N c/l.
+C
+C 2 cycles of loop control are unavoidable, for pointer updates and the
+C taken branch bubble, but also since ldq cannot issue two cycles after stq
+C (and with a run of stqs, that means no ldq can issue in either of the two
+C cycles at the end of the loop).
+C
+C The fbeq is forced into the second cycle of the loop using unops, since
+C the first time through it must wait for the cvtqt result.  Once that
+C result is ready (a 1 cycle stall) then both the branch and following loads
+C can issue together.
+C
+C The main loop handles an odd count of limbs: two limbs are loaded before
+C each size test, plus one pipelined around from the previous iteration (or
+C set up in the entry sequence).
+C
+C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
+C entry sequence, and an increment of the pointers.  For an odd size there's
+C no increment and the first store in the loop (r24) is a repeat of dst[0].
+C
+C Note that the load for r24 after the possible pointer increment is done
+C before the explicit store to dst[0], in case src==dst.  (A C sketch of this scheme follows the listing.)
+
+
+ASM_START()
+
+FLOAT64(L(dat), 2.0)
+
+	ALIGN(16)
+
+PROLOGUE(mpn_com_n,gp)
+
+	C r16	dst
+	C r17	src
+	C r18	size
+
+	lda	r30, -16(r30)		C temporary stack space
+	lda	r7, -3(r18)		C size - 3
+
+	ldq	r20, 0(r17)		C src[0]
+	srl	r7, 1, r6		C (size-3)/2
+
+	stq	r6, 8(r30)		C (size-3)/2
+	and	r7, 1, r5		C 1 if size even
+
+	LEA(	r8, L(dat))
+	s8addq	r5, r17, r17		C skip src[0] if even
+
+	ornot	r31, r20, r20		C ~src[0]
+	unop
+
+	ldt	f0, 8(r30)		C (size-3)/2
+	ldq	r24, 0(r17)		C src[0 or 1]
+
+	stq	r20, 0(r16)		C dst[0]
+	s8addq	r5, r16, r19		C skip dst[0] if even
+
+	ldt	f1, 0(r8)		C data 2.0
+	lda	r30, 16(r30)		C restore stack
+	unop
+	cvtqt	f0, f0			C (size-3)/2 as float
+
+	ornot	r31, r24, r24
+	blt	r7, L(done_1)		C if size<=2
+	unop
+	unop
+
+
+	C 16-byte alignment here
+L(top):
+	C r17	src, incrementing
+	C r19	dst, incrementing
+	C r24	dst[i] result, ready to store
+	C f0	(size-3)/2, decrementing
+	C f1	2.0
+
+	ldq	r20, 8(r17)		C src[i+1]
+	ldq	r21, 16(r17)		C src[i+2]
+	unop
+	unop
+
+	fbeq	f0, L(done_2)
+	unop
+	ldq	r22, 24(r17)		C src[i+3]
+	ldq	r23, 32(r17)		C src[i+4]
+
+	stq	r24, 0(r19)		C dst[i]
+	ornot	r31, r20, r20
+	subt	f0, f1, f0		C count -= 2
+	unop
+
+	stq	r20, 8(r19)		C dst[i+1]
+	ornot	r31, r21, r21
+	unop
+	unop
+
+	stq	r21, 16(r19)		C dst[i+2]
+	ornot	r31, r22, r22
+
+	stq	r22, 24(r19)		C dst[i+3]
+	ornot	r31, r23, r24
+
+	lda	r17, 32(r17)		C src += 4
+	lda	r19, 32(r19)		C dst += 4
+	unop
+	fbge	f0, L(top)
+
+
+L(done_1):
+	C r19	&dst[size-1]
+	C r24	result for dst[size-1]
+
+	stq	r24, 0(r19)		C dst[size-1]
+	ret	r31, (r26), 1
+
+
+L(done_2):
+	C r19	&dst[size-3]
+	C r20	src[size-2]
+	C r21	src[size-1]
+	C r24	result for dst[size-3]
+
+	stq	r24, 0(r19)		C dst[size-3]
+	ornot	r31, r20, r20
+
+	stq	r20, 8(r19)		C dst[size-2]
+	ornot	r31, r21, r21
+
+	stq	r21, 16(r19)		C dst[size-1]
+	ret	r31, (r26), 1
+
+EPILOGUE()
+ASM_END()
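
The header comments above describe the scheme in prose; what follows is a minimal C sketch of the same idea, for illustration only.  The typedefs stand in for GMP's own (a 64-bit limb is assumed, as on Alpha), the helper names ref_com_n and unrolled_com_n are invented for this example, and the sketch uses a void return even though the prototype above declares mp_limb_t.  It mirrors the even/odd entry handling, the four stores per loop pass with one complemented limb carried ("pipelined") into the next pass, and the load-before-store ordering that makes src==dst safe; it is not the code GMP builds.

/* Illustrative stand-ins for GMP's types (64-bit limbs assumed). */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t mp_limb_t;
typedef mp_limb_t *mp_ptr;
typedef const mp_limb_t *mp_srcptr;
typedef long mp_size_t;

/* Reference behaviour: dst[i] = ~src[i] for 0 <= i < size (size >= 1). */
static void
ref_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  for (mp_size_t i = 0; i < size; i++)
    dst[i] = ~src[i];
}

/* The same operation shaped like the assembly: dst[0] is always written in
   the entry code, the pointers step past limb 0 only for an even size (so
   the loop is left with an odd count), and each loop pass stores four limbs
   while carrying one complemented limb into the next pass.  src[skip] is
   read before dst[0] is written, so dst == src also works.  */
static void
unrolled_com_n (mp_ptr dst, mp_srcptr src, mp_size_t size)
{
  mp_size_t skip = (size & 1) ^ 1;      /* 1 if size is even, like r5 */
  mp_limb_t carried = ~src[skip];       /* like r24 in the entry code */

  dst[0] = ~src[0];
  if (size <= 2)                        /* the blt to L(done_1) */
    {
      dst[skip] = carried;
      return;
    }

  src += skip;
  dst += skip;
  mp_size_t count = (size - 3) / 2;     /* the float counter f0 */

  for (;;)
    {
      mp_limb_t a = src[1], b = src[2];
      if (count == 0)                   /* L(done_2): three limbs left */
        {
          dst[0] = carried;
          dst[1] = ~a;
          dst[2] = ~b;
          return;
        }
      mp_limb_t c = src[3], d = src[4];
      dst[0] = carried;                 /* four stores per pass */
      dst[1] = ~a;
      dst[2] = ~b;
      dst[3] = ~c;
      carried = ~d;                     /* pipelined into the next pass */
      count -= 2;
      src += 4;
      dst += 4;
      if (count < 0)                    /* L(done_1): one limb left */
        {
          dst[0] = carried;
          return;
        }
    }
}

int
main (void)
{
  mp_limb_t src[9], a[9], b[9];
  for (mp_size_t size = 1; size <= 9; size++)
    {
      for (mp_size_t i = 0; i < size; i++)
        src[i] = 0x0123456789abcdefULL * (i + 1);
      ref_com_n (a, src, size);
      unrolled_com_n (b, src, size);
      for (mp_size_t i = 0; i < size; i++)
        assert (a[i] == b[i] && a[i] == ~src[i]);
    }
  puts ("ok");
  return 0;
}

Built with an ordinary C compiler (for example cc -O2 sketch.c, where sketch.c is just a placeholder name), the checker runs sizes 1 through 9, which exercises the size<=2 exit as well as both loop exits: the three-limb L(done_2) tail and the one-limb L(done_1) tail.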