dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb
C K8,K9:           1.0
C K10:             1.12
C P4:              3.25
C P6-15 (Core2):   1.5
C P6-28 (Atom):    2.5


C INPUT PARAMETERS
C up    rdi
C n     rsi

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)

C TODO
C  * Apply the movzwl tricks to the x86/k7 code.
C  * Review feed-in and wind-down code.  In particular, try to avoid adcq and
C    sbbq to placate Pentium4.
C  * More unrolling and/or index addressing could bring the time to under
C    1 c/l for Athlon64; approaching 0.67 c/l seems possible.
C  * There are recurrences on the carry registers (r8, r9, r10) that might
C    be the limiting factor for Pentium4 speed.  Splitting these into 6
C    registers would help.
C  * For ultimate Athlon64 performance, a sequence like this might be best.
C    It should reach 0.5 c/l (limited by L1 cache bandwidth).
C
C       addq    (%rdi), %rax
C       adcq    8(%rdi), %rcx
C       adcq    16(%rdi), %rdx
C       adcq    $0, %r8
C       addq    24(%rdi), %rax
C       adcq    32(%rdi), %rcx
C       adcq    40(%rdi), %rdx
C       adcq    $0, %r8
C       ...

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
	mov	$0x0000FFFFFFFFFFFF, %r11
	sub	$2, %rsi
	ja	L(gt2)

	mov	(%rdi), %rax
	nop
	jb	L(1)			C n = 1

	mov	8(%rdi), %rsi		C n = 2
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high 16 bits
	and	%r11, %rdx		C src[0] low 48 bits
	add	%rdx, %rax
	mov	%esi, %edx
	shr	$32, %rsi		C src[1] high 32 bits
	add	%rsi, %rax
	shl	$16, %rdx		C src[1] low 32 bits
	add	%rdx, %rax
L(1):	ret

	ALIGN(16)
L(gt2):	xor	%eax, %eax		C 0mod3 sum
	xor	%ecx, %ecx		C 1mod3 sum
	xor	%edx, %edx		C 2mod3 sum
	xor	%r8, %r8		C 1mod3 carries
	xor	%r9, %r9		C 2mod3 carries
	xor	%r10, %r10		C 0mod3 carries

L(top):	add	(%rdi), %rax
	adc	$0, %r10
	add	8(%rdi), %rcx
	adc	$0, %r8
	add	16(%rdi), %rdx
	adc	$0, %r9
	sub	$3, %rsi
	jng	L(end)
	add	24(%rdi), %rax
	adc	$0, %r10
	add	32(%rdi), %rcx
	adc	$0, %r8
	add	40(%rdi), %rdx
	lea	48(%rdi), %rdi
	adc	$0, %r9
	sub	$3, %rsi
	jg	L(top)

	add	$-24, %rdi

L(end):	add	%r9, %rax		C fold each carry count one accumulator up
	adc	%r10, %rcx
	adc	%r8, %rdx
	inc	%rsi
	mov	$0x1, %r10d		C carry weight, 0 leftover limbs
	js	L(combine)
	mov	$0x10000, %r10d		C carry weight, 1 leftover limb
	adc	24(%rdi), %rax
	dec	%rsi
	js	L(combine)
	adc	32(%rdi), %rcx
	mov	$0x100000000, %r10	C carry weight, 2 leftover limbs

L(combine):
	sbb	%rsi, %rsi		C carry
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high
	and	%r10, %rsi		C carry masked
	and	%r11, %rdi		C 0mod3 low
	mov	%ecx, %r10d		C 1mod3 low 32 bits
	add	%rsi, %rax		C apply carry
	shr	$32, %rcx		C 1mod3 high
	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, %edi		C 2mod3 low 16 bits
	shl	$16, %r10		C 1mod3 low
	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high
	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low
	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low
	ret
EPILOGUE()
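
C The scheme above rests on 2^48 == 1 (mod 2^48-1).  Since 64 == 16 (mod 48),
C limb i carries weight 2^(16*(i mod 3)) modulo 2^48-1, which is why the loop
C keeps three limb accumulators plus three carry counters, and why the
C leftover carry weight cycles through 2^0, 2^16, 2^32.  The C sketch below
C illustrates the same folding; it is a hypothetical reference, not part of
C GMP (the name ref_mod_34lsub1 is invented here).  Like the assembly, it
C returns a value merely congruent to {up,n} mod 2^48-1, and it assumes a
C 64-bit mp_limb_t and an n small enough (anything below about 3*2^32) that
C the folded sum cannot overflow 64 bits.
C
C	#include <gmp.h>
C
C	mp_limb_t
C	ref_mod_34lsub1 (mp_srcptr up, mp_size_t n)
C	{
C	  mp_limb_t a0 = 0, a1 = 0, a2 = 0;  /* limb sums for i%3 = 0, 1, 2 */
C	  mp_limb_t c0 = 0, c1 = 0, c2 = 0;  /* carries out of a0, a1, a2 */
C	  mp_size_t i;
C
C	  for (i = 0; i < n; i++)
C	    switch (i % 3)
C	      {
C	      case 0: a0 += up[i]; c0 += a0 < up[i]; break;
C	      case 1: a1 += up[i]; c1 += a1 < up[i]; break;
C	      case 2: a2 += up[i]; c2 += a2 < up[i]; break;
C	      }
C
C	  /* Each accumulator folds in 16-bit fields at its weight; a carry
C	     out of a0/a1/a2 represents 2^64/2^80/2^96, which reduce to
C	     2^16/2^32/2^0 (mod 2^48-1).  */
C	  return (a0 & 0xFFFFFFFFFFFF) + (a0 >> 48)      /* weight 2^0  */
C	    + ((a1 & 0xFFFFFFFF) << 16) + (a1 >> 32)     /* weight 2^16 */
C	    + ((a2 & 0xFFFF) << 32) + (a2 >> 16)         /* weight 2^32 */
C	    + c2 + (c0 << 16) + (c1 << 32);              /* folded carries */
C	}
C
C A final reduction mod 2^48-1 makes this value agree with a reduced
C mpn_mod_34lsub1 result; GMP's callers perform that last step themselves.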