oss.titaniummirror.com Git - msp430-gcc.git/blobdiff - gmp/mpn/ia64/lorrshift.asm
Imported gcc-4.4.3
[msp430-gcc.git] / gmp / mpn / ia64 / lorrshift.asm
diff --git a/gmp/mpn/ia64/lorrshift.asm b/gmp/mpn/ia64/lorrshift.asm
new file mode 100644 (file)
index 0000000..59badeb
--- /dev/null
@@ -0,0 +1,344 @@
+dnl  IA-64 mpn_lshift/mpn_rshift.
+
+dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation,
+dnl  Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C           cycles/limb
+C Itanium:      2.0
+C Itanium 2:    1.0
+
+C This code is scheduled deeply since the plain shift instructions shr and shl
+C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
+C these instructions causes a 10 cycle replay trap on Itanium.
+
+C TODO
+C  * Optimize function entry and feed-in code.
+
+C INPUT PARAMETERS
+C The four arguments arrive in r32-r35:
+C   rp   address written by every st8 (result limbs)
+C   up   address read by every ld8 (source limbs)
+C   n    number of limbs; overwritten with the loop count during entry
+C   cnt  shift count applied by FSH
+define(`rp',`r32')
+define(`up',`r33')
+define(`n',`r34')
+define(`cnt',`r35')
+
+C Scratch register; the entry code sets it to 64 - cnt, the count used by BSH.
+define(`tnc',`r9')
+
+C Per-operation parameters, chosen by which of OPERATION_lshift or
+C OPERATION_rshift is defined:
+C   FSH   shift instruction applied to each limb with count cnt
+C   BSH   the opposite shift, applied with count tnc to the next limb; its
+C         result is ORed with the FSH result to form one output limb
+C   UPD   post-increment in bytes used by every ld8 from up and st8 to rp
+C   POFF  initial offset of the lfetch pointer r11 from up
+C   PUPD  post-increment of the lfetch pointer, applied once per main loop
+C         iteration
+C   func  name of the function being assembled
+ifdef(`OPERATION_lshift',`
+       define(`FSH',`shl')
+       define(`BSH',`shr.u')
+       define(`UPD',`-8')
+       define(`POFF',`-512')
+       define(`PUPD',`-32')
+       define(`func',`mpn_lshift')
+')
+ifdef(`OPERATION_rshift',`
+       define(`FSH',`shr.u')
+       define(`BSH',`shl')
+       define(`UPD',`8')
+       define(`POFF',`512')
+       define(`PUPD',`32')
+       define(`func',`mpn_rshift')
+')
+
+MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
+
+ASM_START()
+C func is mpn_lshift or mpn_rshift, depending on which OPERATION_ symbol
+C is defined.
+C
+C Inputs:  rp (r32) store pointer, up (r33) load pointer, n (r34) limb
+C          count, cnt (r35) shift count.
+C Output:  r8 = BSH of the first loaded limb by 64 - cnt, i.e. the bits that
+C          FSH by cnt drops from that limb.
+C
+C Every stored limb is (FSH of limb i by cnt) OR (BSH of limb i+1 by tnc),
+C where limb i+1 is the limb loaded after limb i.  The final stored limb is
+C the FSH of the last loaded limb alone.
+C
+C ar.lc is copied to r2 on entry and restored from r2 before returning.
+C
+C NOTE(review): n = 0 is not special-cased (it would take the .Lb00 path and
+C store four limbs); presumably callers guarantee n >= 1 -- confirm against
+C the mpn interface documentation.
+C NOTE(review): the trailing M, I, I0, M01, M2, B ... annotations presumably
+C name the execution units an instruction can issue to -- confirm.
+PROLOGUE(func)
+       .prologue
+       .save           ar.lc, r2
+       .body
+ifdef(`HAVE_ABI_32',
+`      addp4           rp = 0, rp              C                       M I
+       addp4           up = 0, up              C                       M I
+       sxt4            n = n                   C                       M I
+       zxt4            cnt = cnt               C                       I
+       ;;
+')
+
+C Entry setup:
+C   p14   = (n > 4), p15 = its complement
+C   r14   = n AND 3, which selects the feed-in path: 1, 2, 3 branch to
+C           .Lb01, .Lb10, .Lb11 and 0 falls through to .Lb00
+C   r2    = saved ar.lc
+C   r15   = n - 1
+C   tnc   = 64 - cnt
+C   ar.lc = (n - 5) >> 2 (unsigned shift); n is reused to hold this value
+C   r11   = up + POFF, the address used by lfetch in the main loop
+C   r10   = first source limb
+C For lshift, up and rp are first advanced by 8*(n-1) bytes, and UPD is -8,
+C so the operand is walked from its last limb down to its first.
+ {.mmi;        cmp.lt          p14, p15 = 4, n         C                       M I
+       and             r14 = 3, n              C                       M I
+       mov.i           r2 = ar.lc              C                       I0
+}{.mmi;        add             r15 = -1, n             C                       M I
+       sub             tnc = 64, cnt           C                       M I
+       add             r16 = -5, n
+       ;;
+}{.mmi;        cmp.eq          p6, p0 = 1, r14         C                       M I
+       cmp.eq          p7, p0 = 2, r14         C                       M I
+       shr.u           n = r16, 2              C                       I0
+}{.mmi;        cmp.eq          p8, p0 = 3, r14         C                       M I
+ifdef(`OPERATION_lshift',
+`      shladd          up = r15, 3, up         C                       M I
+       shladd          rp = r15, 3, rp')       C                       M I
+       ;;
+}{.mmi;        add             r11 = POFF, up          C                       M I
+       ld8             r10 = [up], UPD         C                       M01
+       mov.i           ar.lc = n               C                       I0
+}{.bbb;
+   (p6)        br.dptk         .Lb01
+   (p7)        br.dptk         .Lb10
+   (p8)        br.dptk         .Lb11
+       ;;
+}
+
+C n AND 3 = 0.  Three more limbs are loaded.  With p14 clear the four limbs
+C are shifted here and stored starting at .Lr4; with p14 set continue at
+C .grt4.
+.Lb00: ld8             r19 = [up], UPD
+       ;;
+       ld8             r16 = [up], UPD
+       ;;
+       ld8             r17 = [up], UPD
+       BSH             r8 = r10, tnc           C function return value
+  (p14)        br.cond.dptk    .grt4
+
+       FSH             r24 = r10, cnt
+       BSH             r25 = r19, tnc
+       ;;
+       FSH             r26 = r19, cnt
+       BSH             r27 = r16, tnc
+       ;;
+       FSH             r20 = r16, cnt
+       BSH             r21 = r17, tnc
+       ;;
+       or              r14 = r25, r24
+       FSH             r22 = r17, cnt
+       BSH             r23 = r10, tnc
+       br              .Lr4
+
+C n AND 3 = 0 and n > 4.  Preload for the main loop, then either enter it
+C at .Ltop through br.cloop or, when that branch is not taken, finish at
+C .Lbot.
+.grt4: FSH             r24 = r10, cnt
+       BSH             r25 = r19, tnc
+       ;;
+       ld8             r18 = [up], UPD
+       FSH             r26 = r19, cnt
+       BSH             r27 = r16, tnc
+       ;;
+       ld8             r19 = [up], UPD
+       FSH             r20 = r16, cnt
+       BSH             r21 = r17, tnc
+       ;;
+       ld8             r16 = [up], UPD
+       FSH             r22 = r17, cnt
+       BSH             r23 = r18, tnc
+       ;;
+       or              r14 = r25, r24
+       ld8             r17 = [up], UPD
+       br.cloop.dpnt   .Ltop
+       br              .Lbot
+
+C n AND 3 = 1.  With p15 set (n <= 4) only the limb already in r10 is
+C processed: compute the return value and the single result limb, then
+C store it at .Lr1.
+.Lb01:
+  (p15)        BSH             r8 = r10, tnc           C function return value I
+  (p15)        FSH             r22 = r10, cnt          C                       I
+  (p15)        br.cond.dptk    .Lr1                    C return                B
+
+C n AND 3 = 1 and n > 4.  Four more limbs are loaded.  When br.cloop is not
+C taken the remaining shifts are done here and the stores start at .Lr5;
+C otherwise .grt5 preloads further limbs and joins the main loop at .LL01.
+.grt1: ld8             r18 = [up], UPD
+       ;;
+       ld8             r19 = [up], UPD
+       BSH             r8 = r10, tnc           C function return value
+       ;;
+       ld8             r16 = [up], UPD
+       FSH             r22 = r10, cnt
+       BSH             r23 = r18, tnc
+       ;;
+       ld8             r17 = [up], UPD
+       br.cloop.dpnt   .grt5
+       ;;
+
+       FSH             r24 = r18, cnt
+       BSH             r25 = r19, tnc
+       ;;
+       or              r15 = r23, r22
+       FSH             r26 = r19, cnt
+       BSH             r27 = r16, tnc
+       ;;
+       FSH             r20 = r16, cnt
+       BSH             r21 = r17, tnc
+       br              .Lr5
+
+.grt5: FSH             r24 = r18, cnt
+       BSH             r25 = r19, tnc
+       ;;
+       ld8             r18 = [up], UPD
+       FSH             r26 = r19, cnt
+       BSH             r27 = r16, tnc
+       ;;
+       ld8             r19 = [up], UPD
+       FSH             r20 = r16, cnt
+       BSH             r21 = r17, tnc
+       ;;
+       or              r15 = r23, r22
+       ld8             r16 = [up], UPD
+       br              .LL01
+
+
+C n AND 3 = 2.  One more limb is loaded.  With p14 clear the two limbs are
+C shifted here and stored starting at .Lr2; with p14 set continue at .grt2.
+.Lb10: ld8             r17 = [up], UPD
+  (p14)        br.cond.dptk    .grt2
+
+       BSH             r8 = r10, tnc           C function return value
+       ;;
+       FSH             r20 = r10, cnt
+       BSH             r21 = r17, tnc
+       ;;
+       or              r14 = r21, r20
+       FSH             r22 = r17, cnt
+       br              .Lr2                    C return
+
+C n AND 3 = 2 and n > 4.  When br.cloop is not taken the stores start at
+C .Lr6; otherwise .grt6 preloads further limbs and joins the main loop at
+C .LL10.
+.grt2: ld8             r18 = [up], UPD
+       BSH             r8 = r10, tnc           C function return value
+       ;;
+       ld8             r19 = [up], UPD
+       FSH             r20 = r10, cnt
+       BSH             r21 = r17, tnc
+       ;;
+       ld8             r16 = [up], UPD
+       FSH             r22 = r17, cnt
+       BSH             r23 = r18, tnc
+       ;;
+       ld8             r17 = [up], UPD
+       br.cloop.dpnt   .grt6
+       ;;
+
+       or              r14 = r21, r20
+       FSH             r24 = r18, cnt
+       BSH             r25 = r19, tnc
+       ;;
+       FSH             r26 = r19, cnt
+       BSH             r27 = r16, tnc
+       br              .Lr6
+
+.grt6: or              r14 = r21, r20
+       FSH             r24 = r18, cnt
+       BSH             r25 = r19, tnc
+       ;;
+       ld8             r18 = [up], UPD
+       FSH             r26 = r19, cnt
+       BSH             r27 = r16, tnc
+       ;;
+       ld8             r19 = [up], UPD
+       br              .LL10
+
+
+C n AND 3 = 3.  Two more limbs are loaded.  With p14 clear the three limbs
+C are shifted here and stored starting at .Lr3; with p14 set continue at
+C .grt3.
+.Lb11: ld8             r16 = [up], UPD
+       ;;
+       ld8             r17 = [up], UPD
+       BSH             r8 = r10, tnc           C function return value
+  (p14)        br.cond.dptk    .grt3
+       ;;
+
+       FSH             r26 = r10, cnt
+       BSH             r27 = r16, tnc
+       ;;
+       FSH             r20 = r16, cnt
+       BSH             r21 = r17, tnc
+       ;;
+       or              r15 = r27, r26
+       FSH             r22 = r17, cnt
+       br              .Lr3                    C return
+
+C n AND 3 = 3 and n > 4.  When br.cloop is not taken the stores start at
+C .Lr7; otherwise .grt7 preloads one more limb and joins the main loop at
+C .LL11.
+.grt3: ld8             r18 = [up], UPD
+       FSH             r26 = r10, cnt
+       BSH             r27 = r16, tnc
+       ;;
+       ld8             r19 = [up], UPD
+       FSH             r20 = r16, cnt
+       BSH             r21 = r17, tnc
+       ;;
+       ld8             r16 = [up], UPD
+       FSH             r22 = r17, cnt
+       BSH             r23 = r18, tnc
+       ;;
+       ld8             r17 = [up], UPD
+       br.cloop.dpnt   .grt7
+
+       or              r15 = r27, r26
+       FSH             r24 = r18, cnt
+       BSH             r25 = r19, tnc
+       br              .Lr7
+
+.grt7: or              r15 = r27, r26
+       FSH             r24 = r18, cnt
+       BSH             r25 = r19, tnc
+       ld8             r18 = [up], UPD
+       br              .LL11
+
+C *** MAIN LOOP START ***
+C Each iteration performs four stores to rp and four loads from up, and
+C issues one lfetch at [r11] that steps r11 by PUPD.  The stored values
+C alternate between r14 and r15.  .LL11, .LL10 and .LL01 are mid-loop entry
+C points used by the feed-in code; br.cloop at the bottom repeats the loop
+C under control of ar.lc.
+       ALIGN(32)
+.Ltop:
+ {.mmi;        st8             [rp] = r14, UPD         C M2
+       or              r15 = r27, r26          C M3
+       FSH             r24 = r18, cnt          C I0
+}{.mmi;        ld8             r18 = [up], UPD         C M1
+       lfetch          [r11], PUPD
+       BSH             r25 = r19, tnc          C I1
+       ;; }
+.LL11:
+ {.mmi;        st8             [rp] = r15, UPD
+       or              r14 = r21, r20
+       FSH             r26 = r19, cnt
+}{.mmi;        ld8             r19 = [up], UPD
+       nop.m           0
+       BSH             r27 = r16, tnc
+       ;; }
+.LL10:
+ {.mmi;        st8             [rp] = r14, UPD
+       or              r15 = r23, r22
+       FSH             r20 = r16, cnt
+}{.mmi;        ld8             r16 = [up], UPD
+       nop.m           0
+       BSH             r21 = r17, tnc
+       ;; }
+.LL01:
+ {.mmi;        st8             [rp] = r15, UPD
+       or              r14 = r25, r24
+       FSH             r22 = r17, cnt
+}{.mib;        ld8             r17 = [up], UPD
+       BSH             r23 = r18, tnc
+       br.cloop.dptk   .Ltop
+       ;; }
+
+C *** MAIN LOOP END ***
+
+C Wind-down: no further loads are issued.  Entering at .Lbot performs the
+C last 8 stores; entering at .LrK performs the last K stores.  The final
+C store at .Lr1 writes r22, the FSH of the last loaded limb with nothing
+C ORed in.  ar.lc is then restored from r2 and the function returns.
+.Lbot: or              r15 = r27, r26
+       FSH             r24 = r18, cnt
+       BSH             r25 = r19, tnc
+       st8             [rp] = r14, UPD
+       ;;
+.Lr7:  or              r14 = r21, r20
+       FSH             r26 = r19, cnt
+       BSH             r27 = r16, tnc
+       st8             [rp] = r15, UPD
+       ;;
+.Lr6:  or              r15 = r23, r22
+       FSH             r20 = r16, cnt
+       BSH             r21 = r17, tnc
+       st8             [rp] = r14, UPD
+       ;;
+.Lr5:  st8             [rp] = r15, UPD
+       or              r14 = r25, r24
+       FSH             r22 = r17, cnt
+       ;;
+.Lr4:  or              r15 = r27, r26
+       st8             [rp] = r14, UPD
+       ;;
+.Lr3:  or              r14 = r21, r20
+       st8             [rp] = r15, UPD
+       ;;
+.Lr2:  st8             [rp] = r14, UPD
+       ;;
+.Lr1:  st8             [rp] = r22, UPD         C                       M23
+       mov             ar.lc = r2              C                       I0
+       br.ret.sptk.many b0                     C                       B
+EPILOGUE(func)
+ASM_END()