--- /dev/null
+dnl AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.
+
+dnl Copyright 2000, 2001, 2002, 2004, 2005, 2007 Free Software Foundation,
+dnl Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                  cycles/limb
+C K8,K9:           1.0
+C K10:             1.12
+C P4:              3.25
+C P6-15 (Core2):   1.5
+C P6-28 (Atom):    2.5
+
+
+C INPUT PARAMETERS
+C up rdi
+C n rsi
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)
+
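+C The return value is congruent to {up,n} modulo 2^48-1 but need not be
+C fully reduced.  Since 2^48 == 1 (mod 2^48-1), limb i (weight 2^(64i))
+C contributes at weight 2^(16i mod 48).  For reference, a minimal C sketch
+C of that contract (hypothetical names, assuming 64-bit limbs; this is not
+C the algorithm used below):
+C
+C   #include <stdint.h>
+C   #include <stddef.h>
+C
+C   #define M48 0xffffffffffffULL  /* 2^48 - 1 */
+C
+C   /* fold bits 48..63 down to weight 2^0 (2^48 == 1) */
+C   static uint64_t fold48 (uint64_t x) { return (x & M48) + (x >> 48); }
+C
+C   uint64_t ref_mod_34lsub1 (const uint64_t *up, size_t n)
+C   {
+C     uint64_t acc = 0;
+C     for (size_t i = 0; i < n; i++)
+C       {
+C         unsigned s = 16 * (i % 3);  /* limb i has weight 2^s */
+C         uint64_t y = fold48 (fold48 (up[i]));  /* now y < 2^48 */
+C         /* y*2^s == low part shifted up, plus high part wrapped to 2^0 */
+C         acc = fold48 (acc) + ((y << s) & M48) + (y >> (48 - s));
+C       }
+C     return fold48 (acc);
+C   }
+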
+C TODO
+C * Apply the movzwl tricks to the x86/k7 code
+C * Review feed-in and wind-down code. In particular, try to avoid adcq and
+C sbbq to placate Pentium4.
+C * More unrolling and/or index addressing could bring the time to under
+C 1 c/l for Athlon64; approaching 0.67 c/l seems possible.
+C * There are recurrences on the carry registers (r8, r9, r10) that might
+C be the limiting factor for Pentium4 speed. Splitting these into 6
+C registers would help.
+C * For ultimate Athlon64 performance, a sequence like this might be best.
+C It should reach 0.5 c/l (limited by L1 cache bandwidth).
+C
+C addq (%rdi), %rax
+C adcq 8(%rdi), %rcx
+C adcq 16(%rdi), %rdx
+C adcq $0, %r8
+C addq 24(%rdi), %rax
+C adcq 32(%rdi), %rcx
+C adcq 40(%rdi), %rdx
+C adcq $0, %r8
+C ...
+
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_mod_34lsub1)
+
+ mov $0x0000FFFFFFFFFFFF, %r11 C 2^48 - 1, the modulus mask
+
+ sub $2, %rsi
+ ja L(gt2) C more than 2 limbs
+
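+C For n <= 2 the residue is formed directly.  With two limbs, since
+C 2^64 == 2^16 (mod 2^48-1):
+C
+C   src[0] + src[1]*2^64 == (src[0] mod 2^48) + (src[0] >> 48)
+C                         + (src[1] mod 2^32)*2^16 + (src[1] >> 32)
+C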
+ mov (%rdi), %rax C src[0]
+ nop
+ jb L(1) C n == 1: src[0] itself is a congruent result
+
+ mov 8(%rdi), %rsi
+ mov %rax, %rdx
+ shr $48, %rax C src[0] high
+
+ and %r11, %rdx C src[0] low
+ add %rdx, %rax
+ mov %esi, %edx
+
+ shr $32, %rsi C src[1] high
+ add %rsi, %rax
+
+ shl $16, %rdx C src[1] low
+ add %rdx, %rax
+
+L(1): ret
+
+
+ ALIGN(16)
+L(gt2): xor %eax, %eax C 0mod3 accumulator
+ xor %ecx, %ecx C 1mod3 accumulator
+ xor %edx, %edx C 2mod3 accumulator
+ xor %r8, %r8 C carries out of 1mod3
+ xor %r9, %r9 C carries out of 2mod3
+ xor %r10, %r10 C carries out of 0mod3
+
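+C Main loop: limbs at indices 0,1,2 (mod 3) accumulate into rax, rcx, rdx,
+C conceptually at weights 2^0, 2^64, 2^128.  Carries out of rax, rcx, rdx
+C are counted in r10, r8, r9 and weigh 2^64, 2^128, 2^192 respectively;
+C since 2^192 == (2^48)^4 == 1 (mod 2^48-1), L(end) folds r9 into rax,
+C r10 into rcx, and r8 into rdx.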
+L(top): add (%rdi), %rax
+ adc $0, %r10
+ add 8(%rdi), %rcx
+ adc $0, %r8
+ add 16(%rdi), %rdx
+ adc $0, %r9
+
+ sub $3, %rsi
+ jng L(end)
+
+ add 24(%rdi), %rax
+ adc $0, %r10
+ add 32(%rdi), %rcx
+ adc $0, %r8
+ add 40(%rdi), %rdx
+ lea 48(%rdi), %rdi
+ adc $0, %r9
+
+ sub $3, %rsi
+ jg L(top)
+
+
+ add $-24, %rdi C undo part of last lea; next limb is at 24(%rdi)
+L(end): add %r9, %rax
+ adc %r10, %rcx
+ adc %r8, %rdx
+
+ inc %rsi
+ mov $0x1, %r10d C carry weight 2^192 == 1
+ js L(combine) C 0 limbs left
+
+ mov $0x10000, %r10d C carry weight 2^64 == 2^16
+ adc 24(%rdi), %rax
+ dec %rsi
+ js L(combine) C 1 limb left
+
+ adc 32(%rdi), %rcx
+ mov $0x100000000, %r10 C carry weight 2^128 == 2^32
+
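+C Here the value is congruent to rax + rcx*2^64 + rdx*2^128 plus the final
+C carry at the weight preloaded into r10.  Using 2^64 == 2^16 and
+C 2^128 == 2^32 (mod 2^48-1), each accumulator splits into a low piece
+C shifted to its weight and a high piece that wraps to weight 2^0.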
+L(combine):
+ sbb %rsi, %rsi C -1 if carry, else 0
+ mov %rax, %rdi C 0mod3
+ shr $48, %rax C 0mod3 high
+
+ and %r10, %rsi C carry masked
+ and %r11, %rdi C 0mod3 low
+ mov %ecx, %r10d C 1mod3
+
+ add %rsi, %rax C apply carry
+ shr $32, %rcx C 1mod3 high
+
+ add %rdi, %rax C apply 0mod3 low
+ movzwl %dx, %edi C 2mod3
+ shl $16, %r10 C 1mod3 low
+
+ add %rcx, %rax C apply 1mod3 high
+ shr $16, %rdx C 2mod3 high
+
+ add %r10, %rax C apply 1mod3 low
+ shl $32, %rdi C 2mod3 low
+
+ add %rdx, %rax C apply 2mod3 high
+ add %rdi, %rax C apply 2mod3 low
+
+ ret
+EPILOGUE()