]> oss.titaniummirror.com Git - msp430-gcc.git/blobdiff - gmp/mpn/ia64/submul_1.asm
Imported gcc-4.4.3
[msp430-gcc.git] / gmp / mpn / ia64 / submul_1.asm
diff --git a/gmp/mpn/ia64/submul_1.asm b/gmp/mpn/ia64/submul_1.asm
new file mode 100644 (file)
index 0000000..ae46e55
--- /dev/null
@@ -0,0 +1,634 @@
+dnl  IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
+dnl  result from a second limb vector.
+
+dnl  Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C         cycles/limb
+C Itanium:    4.0
+C Itanium 2:  2.25 (alignment dependent, sometimes it seems to need 3 c/l)
+
+C TODO
+C  * Optimize feed-in and wind-down code, both for speed and code size.
+C  * Handle low limb input and results specially, using a common stf8 in the
+C    epilogue.
+C  * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
+C    2nd bundle.  This will allow the bbb bundle to be one cycle earlier and
+C    save a cycle.
+
+C INPUT PARAMETERS
+define(`rp', `r32')
+define(`up', `r33')
+define(`n',  `r34')
+define(`vl', `r35')
+
+ASM_START()
+PROLOGUE(mpn_submul_1)
+       .prologue
+       .save   ar.lc, r2
+       .body
+
+ifdef(`HAVE_ABI_32',
+`      addp4           rp = 0, rp              C M I
+       addp4           up = 0, up              C M I
+       zxt4            n = n                   C I
+       ;;
+')
+{.mmi
+       mov             r10 = rp                C M I
+       mov             r9 = up                 C M I
+       sub             vl = r0, vl             C M I   negate vl
+}
+{.mmi
+       ldf8            f8 = [rp], 8            C M
+       ldf8            f7 = [up], 8            C M
+       add             r19 = -1, n             C M I   n - 1
+       ;;
+}
+{.mmi
+       cmp.eq          p6, p0 = 0, vl          C M I
+       mov             r8 = 0                  C M I   zero cylimb
+       mov             r2 = ar.lc              C I0
+}
+{.mmi
+       setf.sig        f6 = vl                 C M2 M3
+       and             r14 = 3, n              C M I
+       shr.u           r19 = r19, 2            C I0
+       ;;
+}
+{.mmb
+       nop             0
+       cmp.eq          p10, p0 = 0, r14        C M I
+   (p6)        br.spnt         .Ldone                  C B     vl == 0
+}
+{.mmi
+       cmp.eq          p11, p0 = 2, r14        C M I
+       cmp.eq          p12, p0 = 3, r14        C M I
+       mov             ar.lc = r19             C I0
+}
+{.bbb
+  (p10)        br.dptk         .Lb00                   C B
+  (p11)        br.dptk         .Lb10                   C B
+  (p12)        br.dptk         .Lb11                   C B
+       ;;
+}
+
+.Lb01: br.cloop.dptk   .grt1
+
+       xma.l           f39 = f7, f6, f8
+       xma.hu          f43 = f7, f6, f8
+       ;;
+       getf.sig        r27 = f39                       C lo
+       getf.sig        r31 = f43                       C hi
+       ld8             r20 = [r9], 8
+       br              .Lcj1
+
+.grt1: ldf8            f44 = [rp], 8
+       ldf8            f32 = [up], 8
+       ;;
+       ldf8            f45 = [rp], 8
+       ldf8            f33 = [up], 8
+       ;;
+       ldf8            f46 = [rp], 8
+       xma.l           f39 = f7, f6, f8
+       ldf8            f34 = [up], 8
+       xma.hu          f43 = f7, f6, f8
+       ;;
+       ldf8            f47 = [rp], 8
+       xma.l           f36 = f32, f6, f44
+       ldf8            f35 = [up], 8
+       xma.hu          f40 = f32, f6, f44
+       br.cloop.dptk   .grt5
+       ;;
+
+       getf.sig        r27 = f39                       C lo
+       xma.l           f37 = f33, f6, f45
+       ld8             r20 = [r9], 8
+       xma.hu          f41 = f33, f6, f45
+       ;;
+       getf.sig        r31 = f43                       C hi
+       getf.sig        r24 = f36                       C lo
+       xma.l           f38 = f34, f6, f46
+       ld8             r21 = [r9], 8
+       xma.hu          f42 = f34, f6, f46
+       ;;
+       getf.sig        r28 = f40                       C hi
+       getf.sig        r25 = f37                       C lo
+       xma.l           f39 = f35, f6, f47
+       ld8             r22 = [r9], 8
+       xma.hu          f43 = f35, f6, f47
+       ;;
+       getf.sig        r29 = f41                       C hi
+       getf.sig        r26 = f38                       C lo
+       ld8             r23 = [r9], 8
+       br              .Lcj5
+
+.grt5: ldf8            f44 = [rp], 8
+       ldf8            f32 = [up], 8
+       ;;
+       getf.sig        r27 = f39                       C lo
+       xma.l           f37 = f33, f6, f45
+       ld8             r20 = [r9], 8
+       xma.hu          f41 = f33, f6, f45
+       ;;
+       ldf8            f45 = [rp], 8
+       getf.sig        r31 = f43                       C hi
+       ldf8            f33 = [up], 8
+       ;;
+       getf.sig        r24 = f36                       C lo
+       xma.l           f38 = f34, f6, f46
+       ld8             r21 = [r9], 8
+       xma.hu          f42 = f34, f6, f46
+       ;;
+       ldf8            f46 = [rp], 8
+       getf.sig        r28 = f40                       C hi
+       ldf8            f34 = [up], 8
+       ;;
+       getf.sig        r25 = f37                       C lo
+       xma.l           f39 = f35, f6, f47
+       ld8             r22 = [r9], 8
+       xma.hu          f43 = f35, f6, f47
+       ;;
+       ldf8            f47 = [rp], 8
+       getf.sig        r29 = f41                       C hi
+       ldf8            f35 = [up], 8
+       ;;
+       getf.sig        r26 = f38                       C lo
+       xma.l           f36 = f32, f6, f44
+       ld8             r23 = [r9], 8
+       xma.hu          f40 = f32, f6, f44
+       br.cloop.dptk   .Loop
+       br              .Lend
+
+
+.Lb10: ldf8            f47 = [rp], 8
+       ldf8            f35 = [up], 8
+       br.cloop.dptk   .grt2
+
+       xma.l           f38 = f7, f6, f8
+       xma.hu          f42 = f7, f6, f8
+       ;;
+       xma.l           f39 = f35, f6, f47
+       xma.hu          f43 = f35, f6, f47
+       ;;
+       getf.sig        r26 = f38                       C lo
+       getf.sig        r30 = f42                       C hi
+       ld8             r23 = [r9], 8
+       ;;
+       getf.sig        r27 = f39                       C lo
+       getf.sig        r31 = f43                       C hi
+       ld8             r20 = [r9], 8
+       br              .Lcj2
+
+.grt2: ldf8            f44 = [rp], 8
+       ldf8            f32 = [up], 8
+       ;;
+       ldf8            f45 = [rp], 8
+       ldf8            f33 = [up], 8
+       xma.l           f38 = f7, f6, f8
+       xma.hu          f42 = f7, f6, f8
+       ;;
+       ldf8            f46 = [rp], 8
+       ldf8            f34 = [up], 8
+       xma.l           f39 = f35, f6, f47
+       xma.hu          f43 = f35, f6, f47
+       ;;
+       ldf8            f47 = [rp], 8
+       ldf8            f35 = [up], 8
+       ;;
+       getf.sig        r26 = f38                       C lo
+       xma.l           f36 = f32, f6, f44
+       ld8             r23 = [r9], 8
+       xma.hu          f40 = f32, f6, f44
+       br.cloop.dptk   .grt6
+
+       getf.sig        r30 = f42                       C hi
+       ;;
+       getf.sig        r27 = f39                       C lo
+       xma.l           f37 = f33, f6, f45
+       ld8             r20 = [r9], 8
+       xma.hu          f41 = f33, f6, f45
+       ;;
+       getf.sig        r31 = f43                       C hi
+       getf.sig        r24 = f36                       C lo
+       xma.l           f38 = f34, f6, f46
+       ld8             r21 = [r9], 8
+       xma.hu          f42 = f34, f6, f46
+       ;;
+       getf.sig        r28 = f40                       C hi
+       getf.sig        r25 = f37                       C lo
+       xma.l           f39 = f35, f6, f47
+       ld8             r22 = [r9], 8
+       xma.hu          f43 = f35, f6, f47
+       br              .Lcj6
+
+.grt6: ldf8            f44 = [rp], 8
+       getf.sig        r30 = f42                       C hi
+       ldf8            f32 = [up], 8
+       ;;
+       getf.sig        r27 = f39                       C lo
+       xma.l           f37 = f33, f6, f45
+       ld8             r20 = [r9], 8
+       xma.hu          f41 = f33, f6, f45
+       ;;
+       ldf8            f45 = [rp], 8
+       getf.sig        r31 = f43                       C hi
+       ldf8            f33 = [up], 8
+       ;;
+       getf.sig        r24 = f36                       C lo
+       xma.l           f38 = f34, f6, f46
+       ld8             r21 = [r9], 8
+       xma.hu          f42 = f34, f6, f46
+       ;;
+       ldf8            f46 = [rp], 8
+       getf.sig        r28 = f40                       C hi
+       ldf8            f34 = [up], 8
+       ;;
+       getf.sig        r25 = f37                       C lo
+       xma.l           f39 = f35, f6, f47
+       ld8             r22 = [r9], 8
+       xma.hu          f43 = f35, f6, f47
+       br              .LL10
+
+
+.Lb11: ldf8            f46 = [rp], 8
+       ldf8            f34 = [up], 8
+       ;;
+       ldf8            f47 = [rp], 8
+       ldf8            f35 = [up], 8
+       br.cloop.dptk   .grt3
+
+       xma.l           f37 = f7, f6, f8
+       xma.hu          f41 = f7, f6, f8
+       ;;
+       xma.l           f38 = f34, f6, f46
+       xma.hu          f42 = f34, f6, f46
+       ;;
+       getf.sig        r25 = f37                       C lo
+       xma.l           f39 = f35, f6, f47
+       xma.hu          f43 = f35, f6, f47
+       ;;
+       getf.sig        r29 = f41                       C hi
+       ld8             r22 = [r9], 8
+       ;;
+       getf.sig        r26 = f38                       C lo
+       getf.sig        r30 = f42                       C hi
+       ld8             r23 = [r9], 8
+       ;;
+       getf.sig        r27 = f39                       C lo
+       getf.sig        r31 = f43                       C hi
+       ld8             r20 = [r9], 8
+       br              .Lcj3
+
+.grt3: ldf8            f44 = [rp], 8
+       xma.l           f37 = f7, f6, f8
+       ldf8            f32 = [up], 8
+       xma.hu          f41 = f7, f6, f8
+       ;;
+       ldf8            f45 = [rp], 8
+       xma.l           f38 = f34, f6, f46
+       ldf8            f33 = [up], 8
+       xma.hu          f42 = f34, f6, f46
+       ;;
+       ldf8            f46 = [rp], 8
+       ldf8            f34 = [up], 8
+       ;;
+       getf.sig        r25 = f37                       C lo
+       xma.l           f39 = f35, f6, f47
+       ld8             r22 = [r9], 8
+       xma.hu          f43 = f35, f6, f47
+       ;;
+       ldf8            f47 = [rp], 8
+       getf.sig        r29 = f41                       C hi
+       ldf8            f35 = [up], 8
+       ;;
+       getf.sig        r26 = f38                       C lo
+       xma.l           f36 = f32, f6, f44
+       ld8             r23 = [r9], 8
+       xma.hu          f40 = f32, f6, f44
+       br.cloop.dptk   .grt7
+       ;;
+
+       getf.sig        r30 = f42                       C hi
+       getf.sig        r27 = f39                       C lo
+       xma.l           f37 = f33, f6, f45
+       ld8             r20 = [r9], 8
+       xma.hu          f41 = f33, f6, f45
+       ;;
+       getf.sig        r31 = f43                       C hi
+       getf.sig        r24 = f36                       C lo
+       xma.l           f38 = f34, f6, f46
+       ld8             r21 = [r9], 8
+       xma.hu          f42 = f34, f6, f46
+       br              .Lcj7
+
+.grt7: ldf8            f44 = [rp], 8
+       getf.sig        r30 = f42                       C hi
+       ldf8            f32 = [up], 8
+       ;;
+       getf.sig        r27 = f39                       C lo
+       xma.l           f37 = f33, f6, f45
+       ld8             r20 = [r9], 8
+       xma.hu          f41 = f33, f6, f45
+       ;;
+       ldf8            f45 = [rp], 8
+       getf.sig        r31 = f43                       C hi
+       ldf8            f33 = [up], 8
+       ;;
+       getf.sig        r24 = f36                       C lo
+       xma.l           f38 = f34, f6, f46
+       ld8             r21 = [r9], 8
+       xma.hu          f42 = f34, f6, f46
+       br              .LL11
+
+
+.Lb00: ldf8            f45 = [rp], 8
+       ldf8            f33 = [up], 8
+       ;;
+       ldf8            f46 = [rp], 8
+       ldf8            f34 = [up], 8
+       ;;
+       ldf8            f47 = [rp], 8
+       xma.l           f36 = f7, f6, f8
+       ldf8            f35 = [up], 8
+       xma.hu          f40 = f7, f6, f8
+       br.cloop.dptk   .grt4
+
+       xma.l           f37 = f33, f6, f45
+       xma.hu          f41 = f33, f6, f45
+       ;;
+       getf.sig        r24 = f36                       C lo
+       xma.l           f38 = f34, f6, f46
+       ld8             r21 = [r9], 8
+       xma.hu          f42 = f34, f6, f46
+       ;;
+       getf.sig        r28 = f40                       C hi
+       xma.l           f39 = f35, f6, f47
+       getf.sig        r25 = f37                       C lo
+       ld8             r22 = [r9], 8
+       xma.hu          f43 = f35, f6, f47
+       ;;
+       getf.sig        r29 = f41                       C hi
+       getf.sig        r26 = f38                       C lo
+       ld8             r23 = [r9], 8
+       ;;
+       getf.sig        r30 = f42                       C hi
+       getf.sig        r27 = f39                       C lo
+       ld8             r20 = [r9], 8
+       br              .Lcj4
+
+.grt4: ldf8            f44 = [rp], 8
+       xma.l           f37 = f33, f6, f45
+       ldf8            f32 = [up], 8
+       xma.hu          f41 = f33, f6, f45
+       ;;
+       ldf8            f45 = [rp], 8
+       ldf8            f33 = [up], 8
+       xma.l           f38 = f34, f6, f46
+       getf.sig        r24 = f36                       C lo
+       ld8             r21 = [r9], 8
+       xma.hu          f42 = f34, f6, f46
+       ;;
+       ldf8            f46 = [rp], 8
+       getf.sig        r28 = f40                       C hi
+       ldf8            f34 = [up], 8
+       xma.l           f39 = f35, f6, f47
+       getf.sig        r25 = f37                       C lo
+       ld8             r22 = [r9], 8
+       xma.hu          f43 = f35, f6, f47
+       ;;
+       ldf8            f47 = [rp], 8
+       getf.sig        r29 = f41                       C hi
+       ldf8            f35 = [up], 8
+       ;;
+       getf.sig        r26 = f38                       C lo
+       xma.l           f36 = f32, f6, f44
+       ld8             r23 = [r9], 8
+       xma.hu          f40 = f32, f6, f44
+       br.cloop.dptk   .grt8
+       ;;
+
+       getf.sig        r30 = f42                       C hi
+       getf.sig        r27 = f39                       C lo
+       xma.l           f37 = f33, f6, f45
+       ld8             r20 = [r9], 8
+       xma.hu          f41 = f33, f6, f45
+       br              .Lcj8
+
+.grt8: ldf8            f44 = [rp], 8
+       getf.sig        r30 = f42                       C hi
+       ldf8            f32 = [up], 8
+       ;;
+       getf.sig        r27 = f39                       C lo
+       xma.l           f37 = f33, f6, f45
+       ld8             r20 = [r9], 8
+       xma.hu          f41 = f33, f6, f45
+       br              .LL00
+
+       ALIGN(32)
+.Loop:
+{.mmi
+       ldf8            f44 = [rp], 8
+       cmp.ltu         p6, p0 = r27, r8        C lo cmp
+       sub             r14 = r27, r8           C lo sub
+}
+{.mmi
+       getf.sig        r30 = f42                       C hi
+       ldf8            f32 = [up], 8
+       sub             r8 = r20, r31           C hi sub
+       ;;                              C 01
+}
+{.mmf
+       getf.sig        r27 = f39                       C lo
+       st8             [r10] = r14, 8
+       xma.l           f37 = f33, f6, f45
+}
+{.mfi
+       ld8             r20 = [r9], 8
+       xma.hu          f41 = f33, f6, f45
+   (p6)        add             r8 = 1, r8
+       ;;                              C 02
+}
+{.mmi
+.LL00: ldf8            f45 = [rp], 8
+       cmp.ltu         p6, p0 = r24, r8
+       sub             r14 = r24, r8
+}
+{.mmi
+       getf.sig        r31 = f43                       C hi
+       ldf8            f33 = [up], 8
+       sub             r8 = r21, r28
+       ;;                              C 03
+}
+{.mmf
+       getf.sig        r24 = f36                       C lo
+       st8             [r10] = r14, 8
+       xma.l           f38 = f34, f6, f46
+}
+{.mfi
+       ld8             r21 = [r9], 8
+       xma.hu          f42 = f34, f6, f46
+   (p6)        add             r8 = 1, r8
+       ;;                              C 04
+}
+{.mmi
+.LL11: ldf8            f46 = [rp], 8
+       cmp.ltu         p6, p0 = r25, r8
+       sub             r14 = r25, r8
+}
+{.mmi
+       getf.sig        r28 = f40                       C hi
+       ldf8            f34 = [up], 8
+       sub             r8 = r22, r29
+       ;;                              C 05
+}
+{.mmf
+       getf.sig        r25 = f37                       C lo
+       st8             [r10] = r14, 8
+       xma.l           f39 = f35, f6, f47
+}
+{.mfi
+       ld8             r22 = [r9], 8
+       xma.hu          f43 = f35, f6, f47
+   (p6)        add             r8 = 1, r8
+       ;;                              C 06
+}
+{.mmi
+.LL10: ldf8            f47 = [rp], 8
+       cmp.ltu         p6, p0 = r26, r8
+       sub             r14 = r26, r8
+}
+{.mmi
+       getf.sig        r29 = f41                       C hi
+       ldf8            f35 = [up], 8
+       sub             r8 = r23, r30
+       ;;                              C 07
+}
+{.mmf
+       getf.sig        r26 = f38                       C lo
+       st8             [r10] = r14, 8
+       xma.l           f36 = f32, f6, f44
+}
+{.mfi
+       ld8             r23 = [r9], 8
+       xma.hu          f40 = f32, f6, f44
+   (p6)        add             r8 = 1, r8
+}
+       br.cloop.dptk   .Loop
+       ;;
+
+.Lend:
+       cmp.ltu         p6, p0 = r27, r8
+       sub             r14 = r27, r8
+       getf.sig        r30 = f42
+       sub             r8 = r20, r31
+       ;;
+       getf.sig        r27 = f39
+       st8             [r10] = r14, 8
+       xma.l           f37 = f33, f6, f45
+       ld8             r20 = [r9], 8
+       xma.hu          f41 = f33, f6, f45
+   (p6)        add             r8 = 1, r8
+       ;;
+.Lcj8:
+       cmp.ltu         p6, p0 = r24, r8
+       sub             r14 = r24, r8
+       getf.sig        r31 = f43
+       sub             r8 = r21, r28
+       ;;
+       getf.sig        r24 = f36
+       st8             [r10] = r14, 8
+       xma.l           f38 = f34, f6, f46
+       ld8             r21 = [r9], 8
+       xma.hu          f42 = f34, f6, f46
+   (p6)        add             r8 = 1, r8
+       ;;
+.Lcj7:
+       cmp.ltu         p6, p0 = r25, r8
+       sub             r14 = r25, r8
+       getf.sig        r28 = f40
+       sub             r8 = r22, r29
+       ;;
+       getf.sig        r25 = f37
+       st8             [r10] = r14, 8
+       xma.l           f39 = f35, f6, f47
+       ld8             r22 = [r9], 8
+       xma.hu          f43 = f35, f6, f47
+   (p6)        add             r8 = 1, r8
+       ;;
+.Lcj6:
+       cmp.ltu         p6, p0 = r26, r8
+       sub             r14 = r26, r8
+       getf.sig        r29 = f41
+       sub             r8 = r23, r30
+       ;;
+       getf.sig        r26 = f38
+       st8             [r10] = r14, 8
+       ld8             r23 = [r9], 8
+   (p6)        add             r8 = 1, r8
+       ;;
+.Lcj5:
+       cmp.ltu         p6, p0 = r27, r8
+       sub             r14 = r27, r8
+       getf.sig        r30 = f42
+       sub             r8 = r20, r31
+       ;;
+       getf.sig        r27 = f39
+       st8             [r10] = r14, 8
+       ld8             r20 = [r9], 8
+   (p6)        add             r8 = 1, r8
+       ;;
+.Lcj4:
+       cmp.ltu         p6, p0 = r24, r8
+       sub             r14 = r24, r8
+       getf.sig        r31 = f43
+       sub             r8 = r21, r28
+       ;;
+       st8             [r10] = r14, 8
+   (p6)        add             r8 = 1, r8
+       ;;
+.Lcj3:
+       cmp.ltu         p6, p0 = r25, r8
+       sub             r14 = r25, r8
+       sub             r8 = r22, r29
+       ;;
+       st8             [r10] = r14, 8
+   (p6)        add             r8 = 1, r8
+       ;;
+.Lcj2:
+       cmp.ltu         p6, p0 = r26, r8
+       sub             r14 = r26, r8
+       sub             r8 = r23, r30
+       ;;
+       st8             [r10] = r14, 8
+   (p6)        add             r8 = 1, r8
+       ;;
+.Lcj1:
+       cmp.ltu         p6, p0 = r27, r8
+       sub             r14 = r27, r8
+       sub             r8 = r20, r31
+       ;;
+       st8             [r10] = r14, 8
+       mov             ar.lc = r2
+   (p6)        add             r8 = 1, r8
+       br.ret.sptk.many b0
+.Ldone:        mov             ar.lc = r2
+       br.ret.sptk.many b0
+EPILOGUE()
+ASM_END()