X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Fx86_64%2Fredc_1.asm;fp=gmp%2Fmpn%2Fx86_64%2Fredc_1.asm;h=23ccceed67121f9b5950759348826856f23a76d2;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/x86_64/redc_1.asm b/gmp/mpn/x86_64/redc_1.asm
new file mode 100644
index 00000000..23ccceed
--- /dev/null
+++ b/gmp/mpn/x86_64/redc_1.asm
@@ -0,0 +1,335 @@
+dnl AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse.
+
+dnl Copyright 2004, 2008 Free Software Foundation, Inc.
+dnl
+dnl This file is part of the GNU MP Library.
+dnl
+dnl The GNU MP Library is free software; you can redistribute it and/or
+dnl modify it under the terms of the GNU Lesser General Public License as
+dnl published by the Free Software Foundation; either version 3 of the
+dnl License, or (at your option) any later version.
+dnl
+dnl The GNU MP Library is distributed in the hope that it will be useful,
+dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl Lesser General Public License for more details.
+dnl
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                cycles/limb
+C K8,K9:         2.5
+C K10:           2.5
+C P4:            ?
+C P6-15 (Core2): 5.3
+C P6-28 (Atom):  ?
+
+C TODO
+C  * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code.
+C    The code for 1, 2, 3, 4 should perhaps be completely register based.
+C  * Perhaps align outer loops.
+C  * The sub_n at the end leaks side-channel data.  How do we fix that?
+C  * Write mpn_addsub_n computing R = A + B - C.  It should run at 2 c/l.
+C  * We could software pipeline the IMUL stuff, by putting it before the
+C    outer loops and before the end of the outer loops.  The last outer
+C    loop iteration would then compute an unneeded product, but it is at
+C    least not a stray read from up[], since it is at up[n].
+C  * Can we combine both the add_n and sub_n into the loops, somehow?
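+
+C Reference sketch (comment only, never assembled): assuming invm is the
+C usual precomputed one-limb Montgomery inverse, i.e. chosen so that
+C q = up[0] * invm makes up[0] + q * mp[0] divisible by 2^GMP_LIMB_BITS,
+C the loops below compute roughly the following C.  Each addmul cancels
+C the low limb of up[], the carry out is stashed in the limb just vacated,
+C and the n stashed carries are folded in afterwards by a single mpn_add_n,
+C followed by the conditional mpn_sub_n mentioned in the TODO list above.
+C The name redc_1_sketch is made up for this comment.
+C
+C	void redc_1_sketch (mp_ptr rp, mp_ptr up, mp_srcptr mp,
+C			    mp_size_t n, mp_limb_t invm)
+C	{
+C	  mp_size_t j;
+C	  mp_limb_t cy, q;
+C
+C	  for (j = n; j > 0; j--)
+C	    {
+C	      q = up[0] * invm;                 /* cancel low limb of up[] */
+C	      cy = mpn_addmul_1 (up, mp, n, q); /* up[0] becomes zero      */
+C	      up[0] = cy;                       /* stash the carry limb    */
+C	      up++;
+C	    }
+C	  cy = mpn_add_n (rp, up, up - n, n);   /* fold in stashed carries */
+C	  if (cy != 0)
+C	    mpn_sub_n (rp, rp, mp, n);          /* conditional final sub   */
+C	}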
+
+C INPUT PARAMETERS
+define(`rp',      `%rdi')
+define(`up',      `%rsi')
+define(`param_mp',`%rdx')
+define(`n',       `%rcx')
+define(`invm',    `%r8')
+
+define(`mp',   `%r13')
+define(`i',    `%r11')
+define(`nneg', `%r12')
+
+ASM_START()
+	TEXT
+	ALIGN(32)
+PROLOGUE(mpn_redc_1)
+	push	%rbp
+	push	%rbx
+	push	%r12
+	push	%r13
+	push	%r14
+	push	n
+	sub	$8, %rsp		C maintain ABI required rsp alignment
+
+	lea	(param_mp,n,8), mp	C mp += n
+	lea	(up,n,8), up		C up += n
+
+	mov	n, nneg
+	neg	nneg
+
+	mov	R32(n), R32(%rax)
+	and	$3, R32(%rax)
+	jz	L(b0)
+	cmp	$2, R32(%rax)
+	jz	L(b2)
+	jg	L(b3)
+
+L(b1):	C lea	(mp), mp
+	lea	-16(up), up
+L(o1):	mov	nneg, i
+	mov	16(up,nneg,8), %rbp	C up[0]
+	imul	invm, %rbp
+
+	mov	(mp,i,8), %rax
+	xor	%ebx, %ebx
+	mul	%rbp
+	add	$1, i
+	jnz	1f
+	add	%rax, 8(up,i,8)
+	adc	$0, %rdx
+	mov	%rdx, %r14
+	jmp	L(n1)
+
+1:	mov	%rax, %r9
+	mov	(mp,i,8), %rax
+	mov	%rdx, %r14
+	jmp	L(mi1)
+
+	ALIGN(16)
+L(lo1):	add	%r10, (up,i,8)
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+L(mi1):	xor	%r10d, %r10d
+	mul	%rbp
+	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	%rbp
+	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	mov	16(mp,i,8), %rax
+	mul	%rbp
+	xor	%r9d, %r9d
+	xor	%r14d, %r14d
+	add	%rbx, 24(up,i,8)
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	adc	%rdx, %r9
+	xor	%ebx, %ebx
+	mul	%rbp
+	add	$4, i
+	js	L(lo1)
+L(ed1):	add	%r10, (up)
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	xor	%r10d, %r10d
+	add	%r9, 8(up)
+	adc	$0, %r14
+L(n1):	mov	%r14, 16(up,nneg,8)	C up[0]
+	add	$8, up
+	dec	n
+	jnz	L(o1)
+C	lea	(mp), mp
+	lea	16(up), up
+	jmp	L(common)
+
+L(b0):	C lea	(mp), mp
+	lea	-16(up), up
+L(o0):	mov	nneg, i
+	mov	16(up,nneg,8), %rbp	C up[0]
+	imul	invm, %rbp
+
+	mov	(mp,i,8), %rax
+	xor	%r10d, %r10d
+	mul	%rbp
+	mov	%rax, %r14
+	mov	%rdx, %rbx
+	jmp	L(mi0)
+
+	ALIGN(16)
+L(lo0):	add	%r10, (up,i,8)
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	%r10d, %r10d
+	mul	%rbp
+	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+L(mi0):	mov	8(mp,i,8), %rax
+	mul	%rbp
+	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	mov	16(mp,i,8), %rax
+	mul	%rbp
+	xor	%r9d, %r9d
+	xor	%r14d, %r14d
+	add	%rbx, 24(up,i,8)
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	adc	%rdx, %r9
+	xor	%ebx, %ebx
+	mul	%rbp
+	add	$4, i
+	js	L(lo0)
+L(ed0):	add	%r10, (up)
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	xor	%r10d, %r10d
+	add	%r9, 8(up)
+	adc	$0, %r14
+	mov	%r14, 16(up,nneg,8)	C up[0]
+	add	$8, up
+	dec	n
+	jnz	L(o0)
+C	lea	(mp), mp
+	lea	16(up), up
+	jmp	L(common)
+
+
+L(b3):	lea	-8(mp), mp
+	lea	-24(up), up
+L(o3):	mov	nneg, i
+	mov	24(up,nneg,8), %rbp	C up[0]
+	imul	invm, %rbp
+
+	mov	8(mp,i,8), %rax
+	mul	%rbp
+	mov	%rax, %rbx
+	mov	%rdx, %r10
+	jmp	L(mi3)
+
+	ALIGN(16)
+L(lo3):	add	%r10, (up,i,8)
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	%r10d, %r10d
+	mul	%rbp
+	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	%rbp
+	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+L(mi3):	mov	16(mp,i,8), %rax
+	mul	%rbp
+	xor	%r9d, %r9d
+	xor	%r14d, %r14d
+	add	%rbx, 24(up,i,8)
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	adc	%rdx, %r9
+	xor	%ebx, %ebx
+	mul	%rbp
+	add	$4, i
+	js	L(lo3)
+L(ed3):	add	%r10, 8(up)
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	xor	%r10d, %r10d
+	add	%r9, 16(up)
+	adc	$0, %r14
+	mov	%r14, 24(up,nneg,8)	C up[0]
+	add	$8, up
+	dec	n
+	jnz	L(o3)
+	lea	8(mp), mp
+	lea	24(up), up
+	jmp	L(common)
+
+L(b2):	lea	-16(mp), mp
+	lea	-32(up), up
+L(o2):	mov	nneg, i
+	mov	32(up,nneg,8), %rbp	C up[0]
+	imul	invm, %rbp
+
+	mov	16(mp,i,8), %rax
+	mul	%rbp
+	xor	%r14d, %r14d
+	mov	%rax, %r10
+	mov	24(mp,i,8), %rax
+	mov	%rdx, %r9
+	jmp	L(mi2)
+
+	ALIGN(16)
+L(lo2):	add	%r10, (up,i,8)
+	adc	%rax, %r9
+	mov	(mp,i,8), %rax
+	adc	%rdx, %r14
+	xor	%r10d, %r10d
+	mul	%rbp
+	add	%r9, 8(up,i,8)
+	adc	%rax, %r14
+	adc	%rdx, %rbx
+	mov	8(mp,i,8), %rax
+	mul	%rbp
+	add	%r14, 16(up,i,8)
+	adc	%rax, %rbx
+	adc	%rdx, %r10
+	mov	16(mp,i,8), %rax
+	mul	%rbp
+	xor	%r9d, %r9d
+	xor	%r14d, %r14d
+	add	%rbx, 24(up,i,8)
+	adc	%rax, %r10
+	mov	24(mp,i,8), %rax
+	adc	%rdx, %r9
+L(mi2):	xor	%ebx, %ebx
+	mul	%rbp
+	add	$4, i
+	js	L(lo2)
+L(ed2):	add	%r10, 16(up)
+	adc	%rax, %r9
+	adc	%rdx, %r14
+	xor	%r10d, %r10d
+	add	%r9, 24(up)
+	adc	$0, %r14
+	mov	%r14, 32(up,nneg,8)	C up[0]
+	add	$8, up
+	dec	n
+	jnz	L(o2)
+	lea	16(mp), mp
+	lea	32(up), up
+
+
+L(common):
+	lea	(mp,nneg,8), mp		C restore entry mp
+
+C cy = mpn_add_n (rp, up, up - n, n);
+C                 rdi rsi rdx     rcx
+	lea	(up,nneg,8), up		C up -= n
+	lea	(up,nneg,8), %rdx	C rdx = up - n [up entry value]
+	mov	rp, nneg		C preserve rp over first call
+	mov	8(%rsp), %rcx		C pass entry n
+C	mov	rp, %rdi
+	CALL(	mpn_add_n)
+	test	R32(%rax), R32(%rax)
+	jz	L(ret)
+
+C mpn_sub_n (rp, rp, mp, n);
+C            rdi rsi rdx rcx
+	mov	nneg, %rdi
+	mov	nneg, %rsi
+	mov	mp, %rdx
+	mov	8(%rsp), %rcx		C pass entry n
+	CALL(	mpn_sub_n)
+
+L(ret):
+	add	$8, %rsp
+	pop	n			C just increment rsp
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbx
+	pop	%rbp
+	ret
+EPILOGUE()