dnl AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse.

dnl Copyright 2004, 2008 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 3 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	         cycles/limb
C K8,K9:	  2.5
C K10:		  2.5
C P4:		  ?
C P6-15 (Core2):  5.3
C P6-28 (Atom):	  ?

C TODO
C  * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code.
C    The code for 1, 2, 3, 4 should perhaps be completely register based.
C  * Perhaps align outer loops.
C  * The sub_n at the end leaks side-channel data.  How do we fix that?
C  * Write mpn_addsub_n computing R = A + B - C.  It should run at 2 c/l.
C  * We could software pipeline the IMUL stuff, by putting it before the
C    outer loops and before the end of the outer loops.  The last outer
C    loop iteration would then compute an unneeded product, but it is at
C    least not a stray read from up[], since it is at up[n].
C  * Can we combine both the add_n and sub_n into the loops, somehow?

C INPUT PARAMETERS
define(`rp',	   `%rdi')
define(`up',	   `%rsi')
define(`param_mp', `%rdx')
define(`n',	   `%rcx')

define(`invm',	   `%r8')
define(`mp',	   `%r13')
define(`i',	   `%r11')
define(`nneg',	   `%r12')

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_redc_1)
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	n
	sub	$8, %rsp		C maintain ABI required rsp alignment

	lea	(param_mp,n,8), mp	C mp += n
	lea	(up,n,8), up		C up += n

	mov	n, nneg
	neg	nneg

	mov	R32(n), R32(%rax)
	and	$3, R32(%rax)		C dispatch on n mod 4
	jz	L(b0)
	cmp	$2, R32(%rax)
	jz	L(b2)
	jg	L(b3)

C n = 1 (mod 4)
L(b1):	C lea	(mp), mp
	lea	-16(up), up

L(o1):	mov	nneg, i
	mov	16(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp		C q = up[0] * invm mod 2^64

	mov	(mp,i,8), %rax
	xor	%ebx, %ebx
	mul	%rbp
	add	$1, i
	jnz	1f

	add	%rax, 8(up,i,8)		C n = 1, single product
	adc	$0, %rdx
	mov	%rdx, %r14
	jmp	L(n1)

1:	mov	%rax, %r9
	mov	(mp,i,8), %rax
	mov	%rdx, %r14
	jmp	L(mi1)

C 4-way unrolled inner loop: add q * mp[] into up[], carries kept in registers
	ALIGN(16)
L(lo1):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
L(mi1):	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo1)

L(ed1):	add	%r10, (up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 8(up)
	adc	$0, %r14
L(n1):	mov	%r14, 16(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o1)
C	lea	(mp), mp
	lea	16(up), up
	jmp	L(common)

C n = 0 (mod 4)
L(b0):	C lea	(mp), mp
	lea	-16(up), up

L(o0):	mov	nneg, i
	mov	16(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp		C q = up[0] * invm mod 2^64

	mov	(mp,i,8), %rax
	xor	%r10d, %r10d
	mul	%rbp
	mov	%rax, %r14
	mov	%rdx, %rbx
	jmp	L(mi0)

	ALIGN(16)
L(lo0):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
L(mi0):	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo0)

L(ed0):	add	%r10, (up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 8(up)
	adc	$0, %r14
	mov	%r14, 16(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o0)
C	lea	(mp), mp
	lea	16(up), up
	jmp	L(common)

C n = 3 (mod 4)
L(b3):	lea	-8(mp), mp
	lea	-24(up), up

L(o3):	mov	nneg, i
	mov	24(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp		C q = up[0] * invm mod 2^64

	mov	8(mp,i,8), %rax
	mul	%rbp
	mov	%rax, %rbx
	mov	%rdx, %r10
	jmp	L(mi3)

	ALIGN(16)
L(lo3):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
L(mi3):	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo3)

L(ed3):	add	%r10, 8(up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 16(up)
	adc	$0, %r14
	mov	%r14, 24(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o3)
	lea	8(mp), mp
	lea	24(up), up
	jmp	L(common)

C n = 2 (mod 4)
L(b2):	lea	-16(mp), mp
	lea	-32(up), up

L(o2):	mov	nneg, i
	mov	32(up,nneg,8), %rbp	C up[0]
	imul	invm, %rbp		C q = up[0] * invm mod 2^64

	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r14d, %r14d
	mov	%rax, %r10
	mov	24(mp,i,8), %rax
	mov	%rdx, %r9
	jmp	L(mi2)

	ALIGN(16)
L(lo2):	add	%r10, (up,i,8)
	adc	%rax, %r9
	mov	(mp,i,8), %rax
	adc	%rdx, %r14
	xor	%r10d, %r10d
	mul	%rbp
	add	%r9, 8(up,i,8)
	adc	%rax, %r14
	adc	%rdx, %rbx
	mov	8(mp,i,8), %rax
	mul	%rbp
	add	%r14, 16(up,i,8)
	adc	%rax, %rbx
	adc	%rdx, %r10
	mov	16(mp,i,8), %rax
	mul	%rbp
	xor	%r9d, %r9d
	xor	%r14d, %r14d
	add	%rbx, 24(up,i,8)
	adc	%rax, %r10
	mov	24(mp,i,8), %rax
	adc	%rdx, %r9
L(mi2):	xor	%ebx, %ebx
	mul	%rbp
	add	$4, i
	js	L(lo2)

L(ed2):	add	%r10, 16(up)
	adc	%rax, %r9
	adc	%rdx, %r14
	xor	%r10d, %r10d
	add	%r9, 24(up)
	adc	$0, %r14
	mov	%r14, 32(up,nneg,8)	C up[0]
	add	$8, up
	dec	n
	jnz	L(o2)
	lea	16(mp), mp
	lea	32(up), up

L(common):
	lea	(mp,nneg,8), mp		C restore entry mp

C	cy = mpn_add_n (rp, up, up - n, n);
C		        rdi rsi rdx      rcx
	lea	(up,nneg,8), up		C up -= n
	lea	(up,nneg,8), %rdx	C rdx = up - n [up entry value]
	mov	rp, nneg		C preserve rp over first call
	mov	8(%rsp), %rcx		C pass entry n
C	mov	rp, %rdi
	CALL(	mpn_add_n)
	test	R32(%rax), R32(%rax)
	jz	L(ret)

C	mpn_sub_n (rp, rp, mp, n);
C		   rdi rsi rdx rcx
	mov	nneg, %rdi
	mov	nneg, %rsi
	mov	mp, %rdx
	mov	8(%rsp), %rcx		C pass entry n
	CALL(	mpn_sub_n)

L(ret):	add	$8, %rsp
	pop	n			C just increment rsp
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx
	pop	%rbp
	ret
EPILOGUE()
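
C The asm above is an unrolled, register-scheduled version of the following
C rough C-level sketch, written against the public mpn interface.  It is an
C illustrative outline only, not GMP's generic code: the name redc_1_sketch
C is ours, B denotes 2^GMP_LIMB_BITS, nails are assumed absent, up is assumed
C to hold the 2n-limb operand to be reduced, and invm must satisfy
C mp[0] * invm == -1 (mod B) so that each quotient limb cancels the low limb.
C
C	#include <gmp.h>
C
C	static void
C	redc_1_sketch (mp_ptr rp, mp_ptr up, mp_srcptr mp,
C		       mp_size_t n, mp_limb_t invm)
C	{
C	  mp_size_t j;
C	  mp_limb_t cy, q;
C
C	  for (j = n; j > 0; j--)
C	    {
C	      q = up[0] * invm;            /* quotient limb, computed mod B */
C	      cy = mpn_addmul_1 (up, mp, n, q);     /* up[0] becomes 0      */
C	      up[0] = cy;                  /* park carry in the freed limb  */
C	      up++;
C	    }
C	  /* Fold the n parked carries into the high half (the mpn_add_n at
C	     L(common) above), then do the conditional final subtraction.   */
C	  cy = mpn_add_n (rp, up, up - n, n);
C	  if (cy != 0)
C	    mpn_sub_n (rp, rp, mp, n);
C	}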