]> oss.titaniummirror.com Git - msp430-gcc.git/blobdiff - gmp/mpn/powerpc64/mode64/aors_n.asm
Imported gcc-4.4.3
[msp430-gcc.git] / gmp / mpn / powerpc64 / mode64 / aors_n.asm
diff --git a/gmp/mpn/powerpc64/mode64/aors_n.asm b/gmp/mpn/powerpc64/mode64/aors_n.asm
new file mode 100644 (file)
index 0000000..42b6d79
--- /dev/null
@@ -0,0 +1,203 @@
+dnl  PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
+
+dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software
+dnl  Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C              cycles/limb
+C POWER3/PPC630:     1.5
+C POWER4/PPC970:     2
+
+C   n     POWER3/PPC630   POWER4/PPC970
+C     1               17.00           19.00
+C     2                9.00           10.49
+C     3                5.33            7.66
+C     4                4.50            5.14
+C     5                4.20            4.80
+C     6                3.83            4.33
+C     7                3.00            3.99
+C     8                2.87            3.55
+C     9                2.89            3.40
+C    10                2.60            3.42
+C    11                2.45            3.15
+C    12                2.41            2.99
+C    13                2.46            3.01
+C    14                2.42            2.97
+C    15                2.20            2.85
+C    50                1.78            2.44
+C   100                1.83            2.20
+C   200                1.55            2.12
+C   400                1.53            2.05
+C  1000                1.98            2.02#
+C  2000                1.50#           2.04
+C  4000                2.55            2.50
+C  8000                2.70            2.45
+C 16000                2.65            5.94
+C 32000                2.62           16.41
+C 64000                2.73           18.94
+
+C This code is a little bit slower for POWER3/PPC630 than the simple code used
+C previously, but it is much faster for POWER4/PPC970.  The reason for the
+C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4
+C registers.
+
+C INPUT PARAMETERS
+C rp   r3
+C up   r4
+C vp   r5
+C n    r6
+
+ifdef(`OPERATION_add_n',`
+  define(ADDSUBC,      adde)
+  define(ADDSUB,       addc)
+  define(func,         mpn_add_n)
+  define(func_nc,      mpn_add_nc)
+  define(GENRVAL,      `addi   r3, r3, 1')
+  define(SETCBR,       `addic  r0, $1, -1')
+  define(CLRCB,                `addic  r0, r0, 0')
+')
+ifdef(`OPERATION_sub_n',`
+  define(ADDSUBC,      subfe)
+  define(ADDSUB,       subfc)
+  define(func,         mpn_sub_n)
+  define(func_nc,      mpn_sub_nc)
+  define(GENRVAL,      `neg    r3, r3')
+  define(SETCBR,       `subfic r0, $1, 0')
+  define(CLRCB,                `addic  r0, r1, -1')
+')
+
+MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
+
+ASM_START()
+PROLOGUE(func_nc)
+       SETCBR(r7)
+       b       L(ent)
+EPILOGUE()
+
+PROLOGUE(func)
+       CLRCB
+L(ent):        std     r31, -8(r1)
+       std     r30, -16(r1)
+       std     r29, -24(r1)
+       std     r28, -32(r1)
+
+       rldicl. r0, r6, 0,62    C r0 = n & 3, set cr0
+       cmpdi   cr6, r0, 2
+       addi    r6, r6, 3       C compute count...
+       srdi    r6, r6, 2       C ...for ctr
+       mtctr   r6              C copy count into ctr
+       beq     cr0, L(b00)
+       blt     cr6, L(b01)
+       beq     cr6, L(b10)
+
+L(b11):        ld      r8, 0(r4)       C load s1 limb
+       ld      r9, 0(r5)       C load s2 limb
+       ld      r10, 8(r4)      C load s1 limb
+       ld      r11, 8(r5)      C load s2 limb
+       ld      r12, 16(r4)     C load s1 limb
+       addi    r4, r4, 24
+       ld      r0, 16(r5)      C load s2 limb
+       addi    r5, r5, 24
+       ADDSUBC r29, r9, r8
+       ADDSUBC r30, r11, r10
+       ADDSUBC r31, r0, r12
+       std     r29, 0(r3)
+       std     r30, 8(r3)
+       std     r31, 16(r3)
+       addi    r3, r3, 24
+       bdnz    L(go)
+       b       L(ret)
+
+L(b01):        ld      r12, 0(r4)      C load s1 limb
+       addi    r4, r4, 8
+       ld      r0, 0(r5)       C load s2 limb
+       addi    r5, r5, 8
+       ADDSUBC r31, r0, r12    C add
+       std     r31, 0(r3)
+       addi    r3, r3, 8
+       bdnz    L(go)
+       b       L(ret)
+
+L(b10):        ld      r10, 0(r4)      C load s1 limb
+       ld      r11, 0(r5)      C load s2 limb
+       ld      r12, 8(r4)      C load s1 limb
+       addi    r4, r4, 16
+       ld      r0, 8(r5)       C load s2 limb
+       addi    r5, r5, 16
+       ADDSUBC r30, r11, r10   C add
+       ADDSUBC r31, r0, r12    C add
+       std     r30, 0(r3)
+       std     r31, 8(r3)
+       addi    r3, r3, 16
+       bdnz    L(go)
+       b       L(ret)
+
+L(b00):        C INITCY                C clear/set cy
+L(go): ld      r6, 0(r4)       C load s1 limb
+       ld      r7, 0(r5)       C load s2 limb
+       ld      r8, 8(r4)       C load s1 limb
+       ld      r9, 8(r5)       C load s2 limb
+       ld      r10, 16(r4)     C load s1 limb
+       ld      r11, 16(r5)     C load s2 limb
+       ld      r12, 24(r4)     C load s1 limb
+       ld      r0, 24(r5)      C load s2 limb
+       bdz     L(end)
+
+       addi    r4, r4, 32
+       addi    r5, r5, 32
+
+L(oop):        ADDSUBC r28, r7, r6
+       ld      r6, 0(r4)       C load s1 limb
+       ld      r7, 0(r5)       C load s2 limb
+       ADDSUBC r29, r9, r8
+       ld      r8, 8(r4)       C load s1 limb
+       ld      r9, 8(r5)       C load s2 limb
+       ADDSUBC r30, r11, r10
+       ld      r10, 16(r4)     C load s1 limb
+       ld      r11, 16(r5)     C load s2 limb
+       ADDSUBC r31, r0, r12
+       ld      r12, 24(r4)     C load s1 limb
+       ld      r0, 24(r5)      C load s2 limb
+       std     r28, 0(r3)
+       addi    r4, r4, 32
+       std     r29, 8(r3)
+       addi    r5, r5, 32
+       std     r30, 16(r3)
+       std     r31, 24(r3)
+       addi    r3, r3, 32
+       bdnz    L(oop)          C decrement ctr and loop back
+
+L(end):        ADDSUBC r28, r7, r6
+       ADDSUBC r29, r9, r8
+       ADDSUBC r30, r11, r10
+       ADDSUBC r31, r0, r12
+       std     r28, 0(r3)
+       std     r29, 8(r3)
+       std     r30, 16(r3)
+       std     r31, 24(r3)
+
+L(ret):        ld      r31, -8(r1)
+       ld      r30, -16(r1)
+       ld      r29, -24(r1)
+       ld      r28, -32(r1)
+
+       subfe   r3, r0, r0      C -cy
+       GENRVAL
+       blr
+EPILOGUE()