X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Fx86_64%2Fmul_basecase.asm;fp=gmp%2Fmpn%2Fx86_64%2Fmul_basecase.asm;h=09ec4d14ae7d1d6db6ca970e8770193fe26a6316;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/x86_64/mul_basecase.asm b/gmp/mpn/x86_64/mul_basecase.asm
new file mode 100644
index 00000000..09ec4d14
--- /dev/null
+++ b/gmp/mpn/x86_64/mul_basecase.asm
@@ -0,0 +1,451 @@
+dnl AMD64 mpn_mul_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey.
+
+dnl Copyright 2008 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C K8,K9: 2.375
+C K10: 2.375
+C P4: ?
+C P6-15: 4.45
+
+C The inner loops of this code are the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C * Use fewer registers. (how??? I can't see it -- david)
+C * Avoid some "mov $0,r" and instead use "xor r,r".
+C * Don't align loops to 32-byte boundaries.
+C * Can the top of each L(addmul_outer_n) prologue be folded into the
+C mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
+C case where vn = 1 or 2; is it worth it?
+ +C INPUT PARAMETERS +define(`rp', `%rdi') +define(`up', `%rsi') +define(`un_param',`%rdx') +define(`vp', `%rcx') +define(`vn', `%r8') + +define(`v0', `%r12') +define(`v1', `%r9') + +define(`w0', `%rbx') +define(`w1', `%r15') +define(`w2', `%rbp') +define(`w3', `%r10') + +define(`n', `%r11') +define(`outer_addr', `%r14') +define(`un', `%r13') + +ASM_START() + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + + xor R32(un), R32(un) + mov (up), %rax + mov (vp), v0 + + sub un_param, un C rdx used by mul + mov un, n + mov R32(un_param), R32(w0) + + lea (rp,un_param,8), rp + lea (up,un_param,8), up + + mul v0 + + test $1, R8(vn) + jz L(mul_2) + +C =========================================================== +C mul_1 for vp[0] if vn is odd + +L(mul_1): + and $3, R32(w0) + jz L(mul_1_prologue_0) + cmp $2, R32(w0) + jc L(mul_1_prologue_1) + jz L(mul_1_prologue_2) + jmp L(mul_1_prologue_3) + +L(mul_1_prologue_0): + mov %rax, w2 + mov %rdx, w3 C note: already w0 == 0 + lea L(addmul_outer_0)(%rip), outer_addr + jmp L(mul_1_entry_0) + +L(mul_1_prologue_1): + cmp $-1, un + jne 2f + mov %rax, -8(rp) + mov %rdx, (rp) + jmp L(ret) +2: add $1, n + lea L(addmul_outer_1)(%rip), outer_addr + mov %rax, w1 + mov %rdx, w2 + xor R32(w3), R32(w3) + mov (up,n,8), %rax + jmp L(mul_1_entry_1) + +L(mul_1_prologue_2): + add $-2, n + lea L(addmul_outer_2)(%rip), outer_addr + mov %rax, w0 + mov %rdx, w1 + mov 24(up,n,8), %rax + xor R32(w2), R32(w2) + xor R32(w3), R32(w3) + jmp L(mul_1_entry_2) + +L(mul_1_prologue_3): + add $-1, n + lea L(addmul_outer_3)(%rip), outer_addr + mov %rax, w3 + mov %rdx, w0 + jmp L(mul_1_entry_3) + + + C this loop is 10 c/loop = 2.5 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(mul_1_top): + mov w0, -16(rp,n,8) + add %rax, w1 + mov (up,n,8), %rax + adc %rdx, w2 +L(mul_1_entry_1): + xor R32(w0), R32(w0) + mul v0 + mov w1, -8(rp,n,8) + add %rax, w2 + adc %rdx, w3 +L(mul_1_entry_0): + mov 8(up,n,8), %rax + mul v0 + mov w2, (rp,n,8) + add %rax, w3 + adc %rdx, w0 +L(mul_1_entry_3): + mov 16(up,n,8), %rax + mul v0 + mov w3, 8(rp,n,8) + xor R32(w2), R32(w2) C zero + mov w2, w3 C zero + add %rax, w0 + mov 24(up,n,8), %rax + mov w2, w1 C zero + adc %rdx, w1 +L(mul_1_entry_2): + mul v0 + add $4, n + js L(mul_1_top) + + mov w0, -16(rp) + add %rax, w1 + mov w1, -8(rp) + adc %rdx, w2 + mov w2, (rp) + + add $-1, vn C vn -= 1 + jz L(ret) + + mov 8(vp), v0 + mov 16(vp), v1 + + lea 8(vp), vp C vp += 1 + lea 8(rp), rp C rp += 1 + + jmp *outer_addr + +C =========================================================== +C mul_2 for vp[0], vp[1] if vn is even + + ALIGN(16) +L(mul_2): + mov 8(vp), v1 + + and $3, R32(w0) + jz L(mul_2_prologue_0) + cmp $2, R32(w0) + jz L(mul_2_prologue_2) + jc L(mul_2_prologue_1) + +L(mul_2_prologue_3): + lea L(addmul_outer_3)(%rip), outer_addr + add $2, n + mov %rax, -16(rp,n,8) + mov %rdx, w2 + xor R32(w3), R32(w3) + xor R32(w0), R32(w0) + mov -16(up,n,8), %rax + jmp L(mul_2_entry_3) + + ALIGN(16) +L(mul_2_prologue_0): + add $3, n + mov %rax, w0 + mov %rdx, w1 + xor R32(w2), R32(w2) + mov -24(up,n,8), %rax + lea L(addmul_outer_0)(%rip), outer_addr + jmp L(mul_2_entry_0) + + ALIGN(16) +L(mul_2_prologue_1): + mov %rax, w3 + mov %rdx, w0 + xor R32(w1), R32(w1) + lea L(addmul_outer_1)(%rip), outer_addr + jmp L(mul_2_entry_1) + + ALIGN(16) +L(mul_2_prologue_2): + add $1, n + lea L(addmul_outer_2)(%rip), outer_addr + mov $0, R32(w0) + mov $0, R32(w1) + mov %rax, w2 + mov -8(up,n,8), %rax + mov %rdx, w3 + jmp 
L(mul_2_entry_2) + + C this loop is 18 c/loop = 2.25 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(mul_2_top): + mov -32(up,n,8), %rax + mul v1 + add %rax, w0 + adc %rdx, w1 + mov -24(up,n,8), %rax + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + adc $0, R32(w2) +L(mul_2_entry_0): + mul v1 + add %rax, w1 + mov w0, -24(rp,n,8) + adc %rdx, w2 + mov -16(up,n,8), %rax + mul v0 + mov $0, R32(w3) + add %rax, w1 + adc %rdx, w2 + mov -16(up,n,8), %rax + adc $0, R32(w3) + mov $0, R32(w0) + mov w1, -16(rp,n,8) +L(mul_2_entry_3): + mul v1 + add %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + mov $0, R32(w1) + mul v0 + add %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + adc R32(w1), R32(w0) C adc $0, w0 +L(mul_2_entry_2): + mul v1 + add %rax, w3 + mov w2, -8(rp,n,8) + adc %rdx, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w3 + adc %rdx, w0 + adc $0, R32(w1) +L(mul_2_entry_1): + add $4, n + mov w3, -32(rp,n,8) + js L(mul_2_top) + + mov -32(up,n,8), %rax + mul v1 + add %rax, w0 + mov w0, (rp) + adc %rdx, w1 + mov w1, 8(rp) + + add $-2, vn C vn -= 2 + jz L(ret) + + mov 16(vp), v0 + mov 24(vp), v1 + + lea 16(vp), vp C vp += 2 + lea 16(rp), rp C rp += 2 + + jmp *outer_addr + + +C =========================================================== +C addmul_2 for remaining vp's + + C in the following prologues, we reuse un to store the + C adjusted value of n that is reloaded on each iteration + +L(addmul_outer_0): + add $3, un + lea 0(%rip), outer_addr + + mov un, n + mov -24(up,un,8), %rax + mul v0 + mov %rax, w0 + mov -24(up,un,8), %rax + mov %rdx, w1 + xor R32(w2), R32(w2) + jmp L(addmul_entry_0) + +L(addmul_outer_1): + mov un, n + mov (up,un,8), %rax + mul v0 + mov %rax, w3 + mov (up,un,8), %rax + mov %rdx, w0 + xor R32(w1), R32(w1) + jmp L(addmul_entry_1) + +L(addmul_outer_2): + add $1, un + lea 0(%rip), outer_addr + + mov un, n + mov -8(up,un,8), %rax + mul v0 + xor R32(w0), R32(w0) + mov %rax, w2 + xor R32(w1), R32(w1) + mov %rdx, w3 + mov -8(up,un,8), %rax + jmp L(addmul_entry_2) + +L(addmul_outer_3): + add $2, un + lea 0(%rip), outer_addr + + mov un, n + mov -16(up,un,8), %rax + xor R32(w3), R32(w3) + mul v0 + mov %rax, w1 + mov -16(up,un,8), %rax + mov %rdx, w2 + jmp L(addmul_entry_3) + + C this loop is 19 c/loop = 2.375 c/l on K8, for all up/rp alignments + + ALIGN(16) +L(addmul_top): + add w3, -32(rp,n,8) + adc %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + xor R32(w2), R32(w2) + mul v0 + add %rax, w0 + mov -24(up,n,8), %rax + adc %rdx, w1 + adc R32(w2), R32(w2) C adc $0, w2 +L(addmul_entry_0): + mul v1 + xor R32(w3), R32(w3) + add w0, -24(rp,n,8) + adc %rax, w1 + mov -16(up,n,8), %rax + adc %rdx, w2 + mul v0 + add %rax, w1 + mov -16(up,n,8), %rax + adc %rdx, w2 + adc $0, R32(w3) +L(addmul_entry_3): + mul v1 + add w1, -16(rp,n,8) + adc %rax, w2 + mov -8(up,n,8), %rax + adc %rdx, w3 + mul v0 + xor R32(w0), R32(w0) + add %rax, w2 + adc %rdx, w3 + mov $0, R32(w1) + mov -8(up,n,8), %rax + adc R32(w1), R32(w0) C adc $0, w0 +L(addmul_entry_2): + mul v1 + add w2, -8(rp,n,8) + adc %rax, w3 + adc %rdx, w0 + mov (up,n,8), %rax + mul v0 + add %rax, w3 + mov (up,n,8), %rax + adc %rdx, w0 + adc $0, R32(w1) +L(addmul_entry_1): + mul v1 + add $4, n + js L(addmul_top) + + add w3, -8(rp) + adc %rax, w0 + mov w0, (rp) + adc %rdx, w1 + mov w1, 8(rp) + + add $-2, vn C vn -= 2 + jz L(ret) + + lea 16(rp), rp C rp += 2 + lea 16(vp), vp C vp += 2 + + mov (vp), v0 + mov 8(vp), v1 + + jmp *outer_addr + + ALIGN(16) +L(ret): pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + 
pop %rbx + ret + +EPILOGUE()
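For readers following the diff, the routine added above is GMP's schoolbook multiplication basecase: it computes the full (un+vn)-limb product {rp, un+vn} = {up, un} * {vp, vn}, handling the first one or two limbs of vp with the mul_1/mul_2 paths and folding each remaining pair of vp limbs in with the addmul_2 loop. The C code below is a minimal portable sketch of the same algorithm, assuming 64-bit limbs and a compiler that provides unsigned __int128; the names limb_t and mul_basecase_ref are illustrative, not GMP's internal API.

#include <stdint.h>

typedef uint64_t limb_t;

/* Sketch of the schoolbook product computed by mpn_mul_basecase:
   {rp, un+vn} = {up, un} * {vp, vn}, with un >= vn >= 1 and rp not
   overlapping the inputs.  Illustrative only; GMP uses the asm above. */
void
mul_basecase_ref (limb_t *rp, const limb_t *up, long un,
                  const limb_t *vp, long vn)
{
  /* First pass writes rp[0 .. un] = up[] * vp[0]
     (the asm's mul_1/mul_2 entry).  */
  limb_t cy = 0;
  for (long i = 0; i < un; i++)
    {
      unsigned __int128 t = (unsigned __int128) up[i] * vp[0] + cy;
      rp[i] = (limb_t) t;
      cy = (limb_t) (t >> 64);
    }
  rp[un] = cy;

  /* Each further pass adds up[] * vp[j] into rp[j .. j+un-1] and writes the
     final carry to rp[un+j] (the asm's addmul loops, run two vp limbs at a
     time as addmul_2).  */
  for (long j = 1; j < vn; j++)
    {
      cy = 0;
      for (long i = 0; i < un; i++)
        {
          unsigned __int128 t = (unsigned __int128) up[i] * vp[j]
                                + rp[i + j] + cy;
          rp[i + j] = (limb_t) t;
          cy = (limb_t) (t >> 64);
        }
      rp[un + j] = cy;
    }
}

The assembly reaches the quoted 2.375 cycles/limb on K8/K10 by unrolling the inner loop four limbs deep and processing two vp limbs per pass (19 cycles for 8 limb products in the addmul_2 loop, per the file's own comments), scheduling the assembly by hand where the C sketch leaves that to the compiler.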