--- /dev/null
+dnl AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse.
+
+dnl Copyright 2004, 2008 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C              cycles/limb
+C K8,K9: 2.5
+C K10: 2.5
+C P4: ?
+C P6-15 (Core2): 5.3
+C P6-28 (Atom): ?
+
+C TODO
+C * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code.
+C The code for 1, 2, 3, 4 should perhaps be completely register based.
+C * Perhaps align outer loops.
+C * The sub_n at the end leaks side-channel data. How do we fix that?
+C * Write mpn_addsub_n computing R = A + B - C. It should run at 2 c/l.
+C * We could software pipeline the IMUL stuff, by putting it before the
+C outer loops and before the end of the outer loops. The last outer
+C loop iteration would then compute an unneeded product, but it is at
+C   least not a stray read from up[], since it is at up[n].
+C * Can we combine both the add_n and sub_n into the loops, somehow?
+
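+C ALGORITHM (sketch)
+C   rp[] gets the n-limb Montgomery reduction of the 2n-limb operand at
+C   up[].  invm is the one-limb inverse that makes the low limb of
+C   up[] + q*mp[] cancel each iteration (i.e., invm * mp[0] == -1 mod 2^64).
+C   Roughly, in C -- a sketch for orientation only, with mpn_addmul_1
+C   standing in for the unrolled inner loops below:
+C
+C     for (j = n; j > 0; j--)
+C       {
+C         q = up[0] * invm;                     /* mod 2^64 */
+C         cy = mpn_addmul_1 (up, mp, n, q);     /* low limb of up[] cancels */
+C         up[0] = cy;                           /* park the carry limb */
+C         up++;
+C       }
+C     cy = mpn_add_n (rp, up, up - n, n);       /* fold the parked carries */
+C     if (cy != 0)
+C       mpn_sub_n (rp, rp, mp, n);
+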
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`param_mp',`%rdx')
+define(`n', `%rcx')
+define(`invm', `%r8')
+
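+C Local register use: mp is copied out of %rdx since mul clobbers rdx:rax;
+C mp and up are biased by n limbs so that the negative counter i, running
+C from -n up toward zero, serves as both array index and loop test; nneg
+C holds -n throughout.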
+define(`mp', `%r13')
+define(`i', `%r11')
+define(`nneg', `%r12')
+
+ASM_START()
+ TEXT
+ ALIGN(32)
+PROLOGUE(mpn_redc_1)
+ push %rbp
+ push %rbx
+ push %r12
+ push %r13
+ push %r14
+ push n C save entry n; reloaded from 8(%rsp) for the final mpn_add_n/mpn_sub_n calls
+ sub $8, %rsp C maintain ABI-required rsp alignment
+
+ lea (param_mp,n,8), mp C mp += n
+ lea (up,n,8), up C up += n
+
+ mov n, nneg
+ neg nneg
+
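+C Dispatch on n mod 4.  The four paths below share the same outer loop
+C structure and differ only in the initial pointer adjustment and in where
+C the 4-way unrolled inner loop is entered.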
+ mov R32(n), R32(%rax)
+ and $3, R32(%rax)
+ jz L(b0)
+ cmp $2, R32(%rax)
+ jz L(b2)
+ jg L(b3)
+
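+C n = 1 (mod 4).  Each pass through L(o1) computes q = up[0] * invm in %rbp,
+C adds q*mp[] into up[], and parks the final carry in the up[0] slot.  For
+C n == 1 the counter reaches zero at once and the code falls through to
+C L(n1), skipping the inner loop.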
+L(b1): C lea (mp), mp
+ lea -16(up), up
+L(o1): mov nneg, i
+ mov 16(up,nneg,8), %rbp C up[0]
+ imul invm, %rbp
+
+ mov (mp,i,8), %rax
+ xor %ebx, %ebx
+ mul %rbp
+ add $1, i
+ jnz 1f
+ add %rax, 8(up,i,8)
+ adc $0, %rdx
+ mov %rdx, %r14
+ jmp L(n1)
+
+1: mov %rax, %r9
+ mov (mp,i,8), %rax
+ mov %rdx, %r14
+ jmp L(mi1)
+
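+C Inner loop: each iteration folds four limbs of q*mp[] into up[], rotating
+C the carry/accumulator roles among %r9, %r14, %rbx and %r10, and stepping i
+C by four until it turns non-negative.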
+ ALIGN(16)
+L(lo1): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+L(mi1): xor %r10d, %r10d
+ mul %rbp
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul %rbp
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul %rbp
+ xor %r9d, %r9d
+ xor %r14d, %r14d
+ add %rbx, 24(up,i,8)
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ adc %rdx, %r9
+ xor %ebx, %ebx
+ mul %rbp
+ add $4, i
+ js L(lo1)
+L(ed1): add %r10, (up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ add %r9, 8(up)
+ adc $0, %r14
+L(n1): mov %r14, 16(up,nneg,8) C up[0]
+ add $8, up
+ dec n
+ jnz L(o1)
+C lea (mp), mp
+ lea 16(up), up
+ jmp L(common)
+
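+C n = 0 (mod 4); same outer loop as above, entered at L(mi0).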
+L(b0): C lea (mp), mp
+ lea -16(up), up
+L(o0): mov nneg, i
+ mov 16(up,nneg,8), %rbp C up[0]
+ imul invm, %rbp
+
+ mov (mp,i,8), %rax
+ xor %r10d, %r10d
+ mul %rbp
+ mov %rax, %r14
+ mov %rdx, %rbx
+ jmp L(mi0)
+
+ ALIGN(16)
+L(lo0): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ mul %rbp
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+L(mi0): mov 8(mp,i,8), %rax
+ mul %rbp
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul %rbp
+ xor %r9d, %r9d
+ xor %r14d, %r14d
+ add %rbx, 24(up,i,8)
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ adc %rdx, %r9
+ xor %ebx, %ebx
+ mul %rbp
+ add $4, i
+ js L(lo0)
+L(ed0): add %r10, (up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ add %r9, 8(up)
+ adc $0, %r14
+ mov %r14, 16(up,nneg,8) C up[0]
+ add $8, up
+ dec n
+ jnz L(o0)
+C lea (mp), mp
+ lea 16(up), up
+ jmp L(common)
+
+
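+C n = 3 (mod 4); same outer loop as above, entered at L(mi3).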
+L(b3): lea -8(mp), mp
+ lea -24(up), up
+L(o3): mov nneg, i
+ mov 24(up,nneg,8), %rbp C up[0]
+ imul invm, %rbp
+
+ mov 8(mp,i,8), %rax
+ mul %rbp
+ mov %rax, %rbx
+ mov %rdx, %r10
+ jmp L(mi3)
+
+ ALIGN(16)
+L(lo3): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ mul %rbp
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul %rbp
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+L(mi3): mov 16(mp,i,8), %rax
+ mul %rbp
+ xor %r9d, %r9d
+ xor %r14d, %r14d
+ add %rbx, 24(up,i,8)
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ adc %rdx, %r9
+ xor %ebx, %ebx
+ mul %rbp
+ add $4, i
+ js L(lo3)
+L(ed3): add %r10, 8(up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ add %r9, 16(up)
+ adc $0, %r14
+ mov %r14, 24(up,nneg,8) C up[0]
+ add $8, up
+ dec n
+ jnz L(o3)
+ lea 8(mp), mp
+ lea 24(up), up
+ jmp L(common)
+
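+C n = 2 (mod 4); same outer loop as above, entered at L(mi2); falls through
+C to L(common).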
+L(b2): lea -16(mp), mp
+ lea -32(up), up
+L(o2): mov nneg, i
+ mov 32(up,nneg,8), %rbp C up[0]
+ imul invm, %rbp
+
+ mov 16(mp,i,8), %rax
+ mul %rbp
+ xor %r14d, %r14d
+ mov %rax, %r10
+ mov 24(mp,i,8), %rax
+ mov %rdx, %r9
+ jmp L(mi2)
+
+ ALIGN(16)
+L(lo2): add %r10, (up,i,8)
+ adc %rax, %r9
+ mov (mp,i,8), %rax
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ mul %rbp
+ add %r9, 8(up,i,8)
+ adc %rax, %r14
+ adc %rdx, %rbx
+ mov 8(mp,i,8), %rax
+ mul %rbp
+ add %r14, 16(up,i,8)
+ adc %rax, %rbx
+ adc %rdx, %r10
+ mov 16(mp,i,8), %rax
+ mul %rbp
+ xor %r9d, %r9d
+ xor %r14d, %r14d
+ add %rbx, 24(up,i,8)
+ adc %rax, %r10
+ mov 24(mp,i,8), %rax
+ adc %rdx, %r9
+L(mi2): xor %ebx, %ebx
+ mul %rbp
+ add $4, i
+ js L(lo2)
+L(ed2): add %r10, 16(up)
+ adc %rax, %r9
+ adc %rdx, %r14
+ xor %r10d, %r10d
+ add %r9, 24(up)
+ adc $0, %r14
+ mov %r14, 32(up,nneg,8) C up[0]
+ add $8, up
+ dec n
+ jnz L(o2)
+ lea 16(mp), mp
+ lea 32(up), up
+
+
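+C Each outer iteration parked its carry limb in the low half of up[], at the
+C position whose limb it cancelled.  Fold those n carries into the high half
+C with mpn_add_n, then subtract mp once if that addition carries out.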
+L(common):
+ lea (mp,nneg,8), mp C restore entry mp
+
+C cy = mpn_add_n (rp, up, up - n, n);
+C rdi rsi rdx rcx
+ lea (up,nneg,8), up C up -= n
+ lea (up,nneg,8), %rdx C rdx = up - n [up entry value]
+ mov rp, nneg C preserve rp over first call
+ mov 8(%rsp), %rcx C pass entry n
+C mov rp, %rdi
+ CALL( mpn_add_n)
+ test R32(%rax), R32(%rax)
+ jz L(ret)
+
+C mpn_sub_n (rp, rp, mp, n);
+C rdi rsi rdx rcx
+ mov nneg, %rdi
+ mov nneg, %rsi
+ mov mp, %rdx
+ mov 8(%rsp), %rcx C pass entry n
+ CALL( mpn_sub_n)
+
+L(ret):
+ add $8, %rsp
+ pop n C just increment rsp
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbx
+ pop %rbp
+ ret
+EPILOGUE()