dnl  AMD64 mpn_mul_2 -- Multiply an n-limb vector with a 2-limb vector and
dnl  store the result in a third limb vector.

dnl  Copyright 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C K8,K9:	 2.275
C K10:		 2.275
C P4:		 ?
C P6-15:	 4.0

C This code is the result of running a code generation and optimization tool
C suite written by David Harvey and Torbjorn Granlund.

C TODO
C  * Work on feed-in and wind-down code.
C  * Convert "mov $0" to "xor".
C  * Adjust initial lea to save some bytes.
C  * Perhaps adjust n from n_param&3 value?
C  * Replace with 2.25 c/l sequence.

C INPUT PARAMETERS
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`n_param', `%rdx')
define(`vp',      `%rcx')

define(`v0', `%r8')
define(`v1', `%r9')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')
define(`n',  `%r11')

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_2)
	push	%rbx
	push	%rbp

	mov	(vp), v0
	mov	8(vp), v1

	mov	(up), %rax

	mov	n_param, n
	neg	n
	lea	-8(up,n_param,8), up
	lea	-8(rp,n_param,8), rp

	and	$3, R32(n_param)
	jz	L(m2p0)
	cmp	$2, R32(n_param)
	jc	L(m2p1)
	jz	L(m2p2)
L(m2p3):
	mul	v0
	xor	R32(w3), R32(w3)
	mov	%rax, w1
	mov	%rdx, w2
	mov	8(up,n,8), %rax
	add	$-1, n
	mul	v1
	add	%rax, w2
	jmp	L(m23)
L(m2p0):
	mul	v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	%rdx, w1
	jmp	L(m20)
L(m2p1):
	mul	v0
	xor	R32(w3), R32(w3)
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	add	$1, n
	jmp	L(m2top)
L(m2p2):
	mul	v0
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, w2
	mov	%rdx, w3
	mov	8(up,n,8), %rax
	add	$-2, n
	jmp	L(m22)

	ALIGN(32)
L(m2top):
	add	%rax, w3
	adc	%rdx, w0
	mov	0(up,n,8), %rax
	adc	$0, R32(w1)
	mov	$0, R32(w2)
	mul	v1
	add	%rax, w0
	mov	w3, 0(rp,n,8)
	adc	%rdx, w1
	mov	8(up,n,8), %rax
	mul	v0
	add	%rax, w0
	adc	%rdx, w1
	adc	$0, R32(w2)
L(m20):	mov	8(up,n,8), %rax
	mul	v1
	add	%rax, w1
	adc	%rdx, w2
	mov	16(up,n,8), %rax
	mov	$0, R32(w3)
	mul	v0
	add	%rax, w1
	mov	16(up,n,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1
	add	%rax, w2
	mov	w0, 8(rp,n,8)
L(m23):	adc	%rdx, w3
	mov	24(up,n,8), %rax
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	w1, 16(rp,n,8)
	mov	24(up,n,8), %rax
	mov	$0, R32(w1)
	adc	$0, R32(w0)
L(m22):	mul	v1
	add	%rax, w3
	mov	w2, 24(rp,n,8)
	adc	%rdx, w0
	mov	32(up,n,8), %rax
	mul	v0
	add	$4, n
	js	L(m2top)

	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	mov	(up), %rax
	mul	v1
	mov	w3, (rp)
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(rp)
	mov	w1, %rax

	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()
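
C Reference semantics, as a hedged sketch (documentation only, not assembled):
C the routine forms the full (n+2)-limb product {up,n} * {vp,2}, stores the
C low n+1 limbs at rp, and returns the most significant limb.  In plain C,
C using the documented mpn_mul_1 and mpn_addmul_1 primitives (ref_mul_2 is a
C hypothetical name, not part of this file):
C
C	mp_limb_t
C	ref_mul_2 (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_srcptr vp)
C	{
C	  rp[n] = mpn_mul_1 (rp, up, n, vp[0]);	      /* rp[0..n] = u * v0 */
C	  return mpn_addmul_1 (rp + 1, up, n, vp[1]); /* add u * v1, shifted
C							 up by one limb */
C	}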