X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Fia64%2Florrshift.asm;fp=gmp%2Fmpn%2Fia64%2Florrshift.asm;h=59badebc6af96750ecf817191f3ab0ff80ce468d;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git diff --git a/gmp/mpn/ia64/lorrshift.asm b/gmp/mpn/ia64/lorrshift.asm new file mode 100644 index 00000000..59badebc --- /dev/null +++ b/gmp/mpn/ia64/lorrshift.asm @@ -0,0 +1,344 @@ +dnl IA-64 mpn_lshift/mpn_rshift. + +dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C Itanium: 2.0 +C Itanium 2: 1.0 + +C This code is scheduled deeply since the plain shift instructions shr and shl +C have a latency of 4 (on Itanium) or 3 (on Itanium 2). Poor scheduling of +C these instructions causes a 10 cycle replay trap on Itanium. + +C TODO +C * Optimize function entry and feed-in code. 
+
+C INPUT PARAMETERS
+define(`rp',`r32')              C address the result limbs are stored to
+define(`up',`r33')              C address the source limbs are loaded from
+define(`n',`r34')               C limb count on entry; later overwritten with the loop count
+define(`cnt',`r35')             C shift count, applied by every FSH
+C tnc is set to 64 - cnt at function entry and is the count applied by every BSH.
+define(`tnc',`r9')
+C FSH/BSH are the two shift opcodes, UPD the pointer step for loads and stores, POFF/PUPD the prefetch offset and step.
+ifdef(`OPERATION_lshift',`
+        define(`FSH',`shl')
+        define(`BSH',`shr.u')
+        define(`UPD',`-8')
+        define(`POFF',`-512')
+        define(`PUPD',`-32')
+        define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+        define(`FSH',`shr.u')
+        define(`BSH',`shl')
+        define(`UPD',`8')
+        define(`POFF',`512')
+        define(`PUPD',`32')
+        define(`func',`mpn_rshift')
+')
+C func is mpn_lshift or mpn_rshift depending on which OPERATION_ symbol is defined; r8 returns the first limb loaded, BSH-shifted by tnc.
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+C For lshift, up and rp are first advanced by 8*(n-1) so that UPD = -8 walks from the last limb downwards.
+ASM_START()
+PROLOGUE(func)
+        .prologue
+        .save           ar.lc, r2
+        .body
+ifdef(`HAVE_ABI_32',
+`       addp4           rp = 0, rp              C M I
+        addp4           up = 0, up              C M I
+        sxt4            n = n                   C M I
+        zxt4            cnt = cnt               C I
+        ;;
+')
+C Entry: classify n, save ar.lc, compute tnc and the loop count, load the first limb, then dispatch on n AND 3.
+ {.mmi; cmp.lt          p14, p15 = 4, n         C M I   p14: n > 4, p15: n <= 4
+        and             r14 = 3, n              C M I   r14 = n AND 3, the dispatch key
+        mov.i           r2 = ar.lc              C I0    ar.lc kept in r2 until .Lr1
+}{.mmi; add             r15 = -1, n             C M I   n - 1, used by the lshift pointer adjustment
+        sub             tnc = 64, cnt           C M I
+        add             r16 = -5, n             C n - 5, turned into the loop count below
+        ;;
+}{.mmi; cmp.eq          p6, p0 = 1, r14         C M I
+        cmp.eq          p7, p0 = 2, r14         C M I
+        shr.u           n = r16, 2              C I0    loop count; the n <= 4 paths never reach a br.cloop
+}{.mmi; cmp.eq          p8, p0 = 3, r14         C M I
+ifdef(`OPERATION_lshift',
+`       shladd          up = r15, 3, up         C M I
+        shladd          rp = r15, 3, rp')       C M I
+        ;;
+}{.mmi; add             r11 = POFF, up          C M I   r11 is the lfetch pointer
+        ld8             r10 = [up], UPD         C M01   first limb
+        mov.i           ar.lc = n               C I0
+}{.bbb;
+  (p6)  br.dptk         .Lb01                   C n AND 3 == 1
+  (p7)  br.dptk         .Lb10                   C n AND 3 == 2
+  (p8)  br.dptk         .Lb11                   C n AND 3 == 3
+        ;;
+}
+C n AND 3 == 0 falls through.  NOTE(review): n == 0 also lands here and takes the same path as n == 4; presumably callers pass n >= 1 - confirm.
+.Lb00:  ld8             r19 = [up], UPD
+        ;;
+        ld8             r16 = [up], UPD
+        ;;
+        ld8             r17 = [up], UPD
+        BSH             r8 = r10, tnc           C function return value
+  (p14) br.cond.dptk    .grt4
+C n <= 4 here: shift the four loaded limbs and finish through .Lr4.
+        FSH             r24 = r10, cnt
+        BSH             r25 = r19, tnc
+        ;;
+        FSH             r26 = r19, cnt
+        BSH             r27 = r16, tnc
+        ;;
+        FSH             r20 = r16, cnt
+        BSH             r21 = r17, tnc
+        ;;
+        or              r14 = r25, r24
+        FSH             r22 = r17, cnt
+        BSH             r23 = r10, tnc          C NOTE(review): r23 is not read on the .Lr4 path; looks redundant - confirm
+        br              .Lr4
+C More than four limbs: keep loading while shifting, then enter the loop at .Ltop or skip it via .Lbot.
+.grt4:  FSH             r24 = r10, cnt
+        BSH             r25 = r19, tnc
+        ;;
+        ld8             r18 = [up], UPD
+        FSH             r26 = r19, cnt
+        BSH             r27 = r16, tnc
+        ;;
+        ld8             r19 = [up], UPD
+        FSH             r20 = r16, cnt
+        BSH             r21 = r17, tnc
+        ;;
+        ld8             r16 = [up], UPD
+        FSH             r22 = r17, cnt
+        BSH             r23 = r18, tnc
+        ;;
+        or              r14 = r25, r24
+        ld8             r17 = [up], UPD
+        br.cloop.dpnt   .Ltop
+        br              .Lbot
+C n AND 3 == 1.  With p15 (n <= 4) only r10 is shifted; .Lr1 stores it and returns.
+.Lb01:
+  (p15) BSH             r8 = r10, tnc           C function return value I
+  (p15) FSH             r22 = r10, cnt          C I
+  (p15) br.cond.dptk    .Lr1                    C return B
+C n AND 3 == 1 and n > 4.
+.grt1:  ld8             r18 = [up], UPD
+        ;;
+        ld8             r19 = [up], UPD
+        BSH             r8 = r10, tnc           C function return value
+        ;;
+        ld8             r16 = [up], UPD
+        FSH             r22 = r10, cnt
+        BSH             r23 = r18, tnc
+        ;;
+        ld8             r17 = [up], UPD
+        br.cloop.dpnt   .grt5
+        ;;
+C br.cloop not taken: finish through the wind-down at .Lr5.
+        FSH             r24 = r18, cnt
+        BSH             r25 = r19, tnc
+        ;;
+        or              r15 = r23, r22
+        FSH             r26 = r19, cnt
+        BSH             r27 = r16, tnc
+        ;;
+        FSH             r20 = r16, cnt
+        BSH             r21 = r17, tnc
+        br              .Lr5
+C br.cloop taken: load further limbs and join the loop at .LL01.
+.grt5:  FSH             r24 = r18, cnt
+        BSH             r25 = r19, tnc
+        ;;
+        ld8             r18 = [up], UPD
+        FSH             r26 = r19, cnt
+        BSH             r27 = r16, tnc
+        ;;
+        ld8             r19 = [up], UPD
+        FSH             r20 = r16, cnt
+        BSH             r21 = r17, tnc
+        ;;
+        or              r15 = r23, r22
+        ld8             r16 = [up], UPD
+        br              .LL01
+
+C n AND 3 == 2.
+.Lb10:  ld8             r17 = [up], UPD
+  (p14) br.cond.dptk    .grt2
+C n <= 4 here: two limbs loaded, finish through .Lr2.
+        BSH             r8 = r10, tnc           C function return value
+        ;;
+        FSH             r20 = r10, cnt
+        BSH             r21 = r17, tnc
+        ;;
+        or              r14 = r21, r20
+        FSH             r22 = r17, cnt
+        br              .Lr2                    C return
+C n AND 3 == 2 and n > 4.
+.grt2:  ld8             r18 = [up], UPD
+        BSH             r8 = r10, tnc           C function return value
+        ;;
+        ld8             r19 = [up], UPD
+        FSH             r20 = r10, cnt
+        BSH             r21 = r17, tnc
+        ;;
+        ld8             r16 = [up], UPD
+        FSH             r22 = r17, cnt
+        BSH             r23 = r18, tnc
+        ;;
+        ld8             r17 = [up], UPD
+        br.cloop.dpnt   .grt6
+        ;;
+C br.cloop not taken: finish through the wind-down at .Lr6.
+        or              r14 = r21, r20
+        FSH             r24 = r18, cnt
+        BSH             r25 = r19, tnc
+        ;;
+        FSH             r26 = r19, cnt
+        BSH             r27 = r16, tnc
+        br              .Lr6
+C br.cloop taken: load further limbs and join the loop at .LL10.
+.grt6:  or              r14 = r21, r20
+        FSH             r24 = r18, cnt
+        BSH             r25 = r19, tnc
+        ;;
+        ld8             r18 = [up], UPD
+        FSH             r26 = r19, cnt
+        BSH             r27 = r16, tnc
+        ;;
+        ld8             r19 = [up], UPD
+        br              .LL10
+
+C n AND 3 == 3.
+.Lb11:  ld8             r16 = [up], UPD
+        ;;
+        ld8             r17 = [up], UPD
+        BSH             r8 = r10, tnc           C function return value
+  (p14) br.cond.dptk    .grt3
+        ;;
+C n <= 4 here: three limbs loaded, finish through .Lr3.
+        FSH             r26 = r10, cnt
+        BSH             r27 = r16, tnc
+        ;;
+        FSH             r20 = r16, cnt
+        BSH             r21 = r17, tnc
+        ;;
+        or              r15 = r27, r26
+        FSH             r22 = r17, cnt
+        br              .Lr3                    C return
+C n AND 3 == 3 and n > 4.
+.grt3:  ld8             r18 = [up], UPD
+        FSH             r26 = r10, cnt
+        BSH             r27 = r16, tnc
+        ;;
+        ld8             r19 = [up], UPD
+        FSH             r20 = r16, cnt
+        BSH             r21 = r17, tnc
+        ;;
+        ld8             r16 = [up], UPD
+        FSH             r22 = r17, cnt
+        BSH             r23 = r18, tnc
+        ;;
+        ld8             r17 = [up], UPD
+        br.cloop.dpnt   .grt7
+C br.cloop not taken: finish through the wind-down at .Lr7.
+        or              r15 = r27, r26
+        FSH             r24 = r18, cnt
+        BSH             r25 = r19, tnc
+        br              .Lr7
+C br.cloop taken: load one further limb and join the loop at .LL11.
+.grt7:  or              r15 = r27, r26
+        FSH             r24 = r18, cnt
+        BSH             r25 = r19, tnc
+        ld8             r18 = [up], UPD
+        br              .LL11
+
+C *** MAIN LOOP START ***
+        ALIGN(32)
+.Ltop:                                          C one pass: four st8 to rp, four ld8 from up, one lfetch
+ {.mmi; st8             [rp] = r14, UPD         C M2
+        or              r15 = r27, r26          C M3
+        FSH             r24 = r18, cnt          C I0
+}{.mmi; ld8             r18 = [up], UPD         C M1
+        lfetch          [r11], PUPD             C prefetch at r11, then advance r11 by PUPD
+        BSH             r25 = r19, tnc          C I1
+        ;; }
+.LL11:                                          C mid-loop entry from .grt7
+ {.mmi; st8             [rp] = r15, UPD
+        or              r14 = r21, r20
+        FSH             r26 = r19, cnt
+}{.mmi; ld8             r19 = [up], UPD
+        nop.m           0
+        BSH             r27 = r16, tnc
+        ;; }
+.LL10:                                          C mid-loop entry from .grt6
+ {.mmi; st8             [rp] = r14, UPD
+        or              r15 = r23, r22
+        FSH             r20 = r16, cnt
+}{.mmi; ld8             r16 = [up], UPD
+        nop.m           0
+        BSH             r21 = r17, tnc
+        ;; }
+.LL01:                                          C mid-loop entry from .grt5
+ {.mmi; st8             [rp] = r15, UPD
+        or              r14 = r25, r24
+        FSH             r22 = r17, cnt
+}{.mib; ld8             r17 = [up], UPD
+        BSH             r23 = r18, tnc
+        br.cloop.dptk   .Ltop
+        ;; }
+
+C *** MAIN LOOP END ***
+C Wind-down: .Lbot and .Lr7 down to .Lr1 store the limbs still in flight; the feed-in paths above jump in part-way.
+.Lbot:  or              r15 = r27, r26
+        FSH             r24 = r18, cnt
+        BSH             r25 = r19, tnc
+        st8             [rp] = r14, UPD
+        ;;
+.Lr7:   or              r14 = r21, r20
+        FSH             r26 = r19, cnt
+        BSH             r27 = r16, tnc
+        st8             [rp] = r15, UPD
+        ;;
+.Lr6:   or              r15 = r23, r22
+        FSH             r20 = r16, cnt
+        BSH             r21 = r17, tnc
+        st8             [rp] = r14, UPD
+        ;;
+.Lr5:   st8             [rp] = r15, UPD
+        or              r14 = r25, r24
+        FSH             r22 = r17, cnt
+        ;;
+.Lr4:   or              r15 = r27, r26
+        st8             [rp] = r14, UPD
+        ;;
+.Lr3:   or              r14 = r21, r20
+        st8             [rp] = r15, UPD
+        ;;
+.Lr2:   st8             [rp] = r14, UPD
+        ;;
+.Lr1:   st8             [rp] = r22, UPD         C M23   last limb: FSH result only, nothing ORed in
+        mov             ar.lc = r2              C I0    restore ar.lc saved at entry
+        br.ret.sptk.many b0                     C B
+EPILOGUE(func)
+ASM_END()