dnl AMD64 mpn_addmul_1 and mpn_submul_1. dnl Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published dnl by the Free Software Foundation; either version 3 of the License, or (at dnl your option) any later version. dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl You should have received a copy of the GNU Lesser General Public License dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C cycles/limb C K8,K9: 2.5 C K10: 2.5 C P4: 14.9 C P6-15 (Core2): 5.09 C P6-28 (Atom): 21.3 C The inner loop of this code is the result of running a code generation and C optimization tool suite written by David Harvey and Torbjorn Granlund. C TODO: C * The inner loop is great, but the prologue and epilogue code was C quickly written. Tune it! C INPUT PARAMETERS define(`rp', `%rdi') define(`up', `%rsi') define(`n_param',`%rdx') define(`vl', `%rcx') define(`n', `%r11') ifdef(`OPERATION_addmul_1',` define(`ADDSUB', `add') define(`func', `mpn_addmul_1') ') ifdef(`OPERATION_submul_1',` define(`ADDSUB', `sub') define(`func', `mpn_submul_1') ') MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1) ASM_START() TEXT ALIGN(16) PROLOGUE(func) mov (up), %rax C read first u limb early push %rbx mov n_param, %rbx C move away n from rdx, mul uses it mul vl mov %rbx, %r11 and $3, R32(%rbx) jz L(b0) cmp $2, R32(%rbx) jz L(b2) jg L(b3) L(b1): dec n jne L(gt1) ADDSUB %rax, (rp) jmp L(ret) L(gt1): lea 8(up,n,8), up lea -8(rp,n,8), rp neg n xor %r10, %r10 xor R32(%rbx), R32(%rbx) mov %rax, %r9 mov (up,n,8), %rax mov %rdx, %r8 jmp L(L1) L(b0): lea (up,n,8), up lea -16(rp,n,8), rp neg n xor %r10, %r10 mov %rax, %r8 mov %rdx, %rbx jmp L(L0) L(b3): lea -8(up,n,8), up lea -24(rp,n,8), rp neg n mov %rax, %rbx mov %rdx, %r10 jmp L(L3) L(b2): lea -16(up,n,8), up lea -32(rp,n,8), rp neg n xor %r8, %r8 xor R32(%rbx), R32(%rbx) mov %rax, %r10 mov 24(up,n,8), %rax mov %rdx, %r9 jmp L(L2) ALIGN(16) L(top): ADDSUB %r10, (rp,n,8) adc %rax, %r9 mov (up,n,8), %rax adc %rdx, %r8 mov $0, %r10d L(L1): mul vl ADDSUB %r9, 8(rp,n,8) adc %rax, %r8 adc %rdx, %rbx L(L0): mov 8(up,n,8), %rax mul vl ADDSUB %r8, 16(rp,n,8) adc %rax, %rbx adc %rdx, %r10 L(L3): mov 16(up,n,8), %rax mul vl ADDSUB %rbx, 24(rp,n,8) mov $0, %r8d # zero mov %r8, %rbx # zero adc %rax, %r10 mov 24(up,n,8), %rax mov %r8, %r9 # zero adc %rdx, %r9 L(L2): mul vl add $4, n js L(top) ADDSUB %r10, (rp,n,8) adc %rax, %r9 adc %r8, %rdx ADDSUB %r9, 8(rp,n,8) L(ret): adc $0, %rdx mov %rdx, %rax pop %rbx ret EPILOGUE()