X-Git-Url: https://oss.titaniummirror.com/gitweb/?a=blobdiff_plain;f=gmp%2Fmpn%2Fx86_64%2Fmul_2.asm;fp=gmp%2Fmpn%2Fx86_64%2Fmul_2.asm;h=a8ad00069f011ae8b49173f17f7169e3ce294e1c;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/x86_64/mul_2.asm b/gmp/mpn/x86_64/mul_2.asm
new file mode 100644
index 00000000..a8ad0006
--- /dev/null
+++ b/gmp/mpn/x86_64/mul_2.asm
@@ -0,0 +1,173 @@
+dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
+dnl  store the result in a third limb vector.
+
+dnl  Copyright 2008 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C	    cycles/limb
+C K8,K9:    2.275
+C K10:	    2.275
+C P4:	    ?
+C P6-15:    4.0
+
+C This code is the result of running a code generation and optimization tool
+C suite written by David Harvey and Torbjorn Granlund.
+
+C TODO
+C  * Work on feed-in and wind-down code.
+C  * Convert "mov $0" to "xor".
+C  * Adjust initial lea to save some bytes.
+C  * Perhaps adjust n from n_param&3 value?
+C  * Replace with 2.25 c/l sequence.
+
+C INPUT PARAMETERS
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`n_param', `%rdx')
+define(`vp',      `%rcx')
+
+define(`v0', `%r8')
+define(`v1', `%r9')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+define(`n',  `%r11')
+
+ASM_START()
+	TEXT
+	ALIGN(16)
+PROLOGUE(mpn_mul_2)
+	push	%rbx
+	push	%rbp
+
+	mov	(vp), v0
+	mov	8(vp), v1
+
+	mov	(up), %rax
+
+	mov	n_param, n
+	neg	n
+	lea	-8(up,n_param,8), up
+	lea	-8(rp,n_param,8), rp
+
+	and	$3, R32(n_param)
+	jz	L(m2p0)
+	cmp	$2, R32(n_param)
+	jc	L(m2p1)
+	jz	L(m2p2)
+L(m2p3):
+	mul	v0
+	xor	R32(w3), R32(w3)
+	mov	%rax, w1
+	mov	%rdx, w2
+	mov	8(up,n,8), %rax
+	add	$-1, n
+	mul	v1
+	add	%rax, w2
+	jmp	L(m23)
+L(m2p0):
+	mul	v0
+	xor	R32(w2), R32(w2)
+	mov	%rax, w0
+	mov	%rdx, w1
+	jmp	L(m20)
+L(m2p1):
+	mul	v0
+	xor	R32(w3), R32(w3)
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	add	$1, n
+	jmp	L(m2top)
+L(m2p2):
+	mul	v0
+	xor	R32(w0), R32(w0)
+	xor	R32(w1), R32(w1)
+	mov	%rax, w2
+	mov	%rdx, w3
+	mov	8(up,n,8), %rax
+	add	$-2, n
+	jmp	L(m22)
+
+
+	ALIGN(32)
+L(m2top):
+	add	%rax, w3
+	adc	%rdx, w0
+	mov	0(up,n,8), %rax
+	adc	$0, R32(w1)
+	mov	$0, R32(w2)
+	mul	v1
+	add	%rax, w0
+	mov	w3, 0(rp,n,8)
+	adc	%rdx, w1
+	mov	8(up,n,8), %rax
+	mul	v0
+	add	%rax, w0
+	adc	%rdx, w1
+	adc	$0, R32(w2)
+L(m20):	mov	8(up,n,8), %rax
+	mul	v1
+	add	%rax, w1
+	adc	%rdx, w2
+	mov	16(up,n,8), %rax
+	mov	$0, R32(w3)
+	mul	v0
+	add	%rax, w1
+	mov	16(up,n,8), %rax
+	adc	%rdx, w2
+	adc	$0, R32(w3)
+	mul	v1
+	add	%rax, w2
+	mov	w0, 8(rp,n,8)
+L(m23):	adc	%rdx, w3
+	mov	24(up,n,8), %rax
+	mul	v0
+	mov	$0, R32(w0)
+	add	%rax, w2
+	adc	%rdx, w3
+	mov	w1, 16(rp,n,8)
+	mov	24(up,n,8), %rax
+	mov	$0, R32(w1)
+	adc	$0, R32(w0)
+L(m22):	mul	v1
+	add	%rax, w3
+	mov	w2, 24(rp,n,8)
+	adc	%rdx, w0
+	mov	32(up,n,8), %rax
+	mul	v0
+	add	$4, n
+	js	L(m2top)
+
+
+	add	%rax, w3
+	adc	%rdx, w0
+	adc	$0, R32(w1)
+	mov	(up), %rax
+	mul	v1
+	mov	w3, (rp)
+	add	%rax, w0
+	adc	%rdx, w1
+	mov	w0, 8(rp)
+	mov	w1, %rax
+
+	pop	%rbp
+	pop	%rbx
+	ret
+EPILOGUE()
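
Note on the interface implemented by the patch above: as the file header says, mpn_mul_2 multiplies the n-limb vector {up,n} by the 2-limb vector {vp,2}; the low n+1 limbs of the (n+2)-limb product are stored at rp and the most significant limb is returned (this matches the wind-down code, which stores w3 to (rp) and w0 to 8(rp) and returns w1 in %rax). The following is a minimal C reference sketch of that interface, not GMP's code: 64-bit limbs are assumed, unsigned __int128 stands in for the 64x64->128 MUL, and the local mp_limb_t typedef and ref_* names are illustrative only.

    #include <stdint.h>
    #include <stddef.h>

    typedef uint64_t mp_limb_t;   /* assumption: 64-bit limbs, as on AMD64 */

    /* {rp,n} = {up,n} * v, returning the carry-out limb (mul_1-style helper). */
    static mp_limb_t ref_mul_1(mp_limb_t *rp, const mp_limb_t *up, size_t n, mp_limb_t v)
    {
        mp_limb_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 p = (unsigned __int128) up[i] * v + cy;
            rp[i] = (mp_limb_t) p;        /* low limb of the partial product */
            cy = (mp_limb_t) (p >> 64);   /* high limb carries into the next limb */
        }
        return cy;
    }

    /* {rp,n} += {up,n} * v, returning the carry-out limb (addmul_1-style helper). */
    static mp_limb_t ref_addmul_1(mp_limb_t *rp, const mp_limb_t *up, size_t n, mp_limb_t v)
    {
        mp_limb_t cy = 0;
        for (size_t i = 0; i < n; i++) {
            unsigned __int128 p = (unsigned __int128) up[i] * v + rp[i] + cy;
            rp[i] = (mp_limb_t) p;
            cy = (mp_limb_t) (p >> 64);
        }
        return cy;
    }

    /* Multiply {up,n} by {vp,2}: store the low n+1 limbs at rp and return the
       most significant limb of the product. */
    mp_limb_t ref_mul_2(mp_limb_t *rp, const mp_limb_t *up, size_t n, const mp_limb_t *vp)
    {
        rp[n] = ref_mul_1(rp, up, n, vp[0]);        /* rp[0..n]  = up * v0            */
        return ref_addmul_1(rp + 1, up, n, vp[1]);  /* rp[1..n] += up * v1, one limb up */
    }

The assembly above computes the same product in a single fused pass rather than the two passes of this sketch: the 4-way unrolled L(m2top) loop interleaves the v0 and v1 multiplies and keeps the pending result limbs in the w0..w3 registers, which is where the listed 2.275 c/l figure on K8/K9/K10 comes from.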