X-Git-Url: https://oss.titaniummirror.com/gitweb/?a=blobdiff_plain;f=gmp%2Fmpn%2Fx86_64%2Fpentium4%2Frshift.asm;fp=gmp%2Fmpn%2Fx86_64%2Fpentium4%2Frshift.asm;h=61899c5ecfefc950ddcb2154d9eb49c81a020cb5;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git diff --git a/gmp/mpn/x86_64/pentium4/rshift.asm b/gmp/mpn/x86_64/pentium4/rshift.asm new file mode 100644 index 00000000..61899c5e --- /dev/null +++ b/gmp/mpn/x86_64/pentium4/rshift.asm @@ -0,0 +1,151 @@ +dnl x86-64 mpn_rshift optimized for Pentium 4. + +dnl Copyright 2003, 2005, 2007, 2008 Free Software Foundation, Inc. +dnl +dnl This file is part of the GNU MP Library. +dnl +dnl The GNU MP Library is free software; you can redistribute it and/or +dnl modify it under the terms of the GNU Lesser General Public License as +dnl published by the Free Software Foundation; either version 3 of the +dnl License, or (at your option) any later version. +dnl +dnl The GNU MP Library is distributed in the hope that it will be useful, +dnl but WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl Lesser General Public License for more details. +dnl +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + + +C cycles/limb +C K8,K9: 2.5 +C K10: ? 
+C P4:            3.29
+C P6-15 (Core2): 2.1 (fluctuates, presumably cache related)
+C P6-28 (Atom):  14.3
+
+C INPUT PARAMETERS
+define(`rp',`%rdi')
+define(`up',`%rsi')
+define(`n',`%rdx')
+define(`cnt',`%cl')
+
+C mpn_rshift
+C
+C Shift the n-limb operand at up right by cnt bits and store the n result
+C limbs at rp.  Limbs are 64 bits wide and are handled from limb 0 upwards.
+C Result limb i is (up[i] >> cnt) | (up[i+1] << lcnt), where
+C lcnt = (-cnt) & 63; the top limb is only shifted right.
+C
+C Return value (%rax): up[0] << lcnt.
+C
+C Registers written: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %mm0-%mm5, flags.
+C The MMX state is released with emms before returning.
+C
+C NOTE(review): the code assumes n >= 1 (limb 0 is read unconditionally)
+C and presumably 1 <= cnt <= 63; with cnt = 0 the left shift count would
+C be 0 rather than 64.  Confirm against the callers.
+C
+C Register roles in the body:
+C   mm4  right shift count (cnt)
+C   mm5  left shift count (lcnt)
+C   up   address of the top source limb
+C   rp   address of the top result limb
+C   n    negated count of the limbs still to do, so 8(up,n,8) is the
+C        lowest source limb not yet handled
+
+ASM_START()
+        TEXT
+        ALIGN(32)
+PROLOGUE(mpn_rshift)
+        mov     (up), %rax              C rax = up[0], the lowest limb
+        movd    %ecx, %mm4              C mm4 = right shift count
+        neg     %ecx                    C put lsh count in cl
+        and     $63, %ecx               C ecx = (-cnt) & 63
+        movd    %ecx, %mm5              C mm5 = left shift count
+
+        lea     -8(up,n,8), up          C up = address of the top source limb
+        lea     -8(rp,n,8), rp          C rp = address of the top result limb
+        lea     1(n), %r8d              C r8d = n+1, selects the entry path below
+        neg     n                       C n = -n, it counts up towards zero
+
+        shl     %cl, %rax               C function return value
+
+C Dispatch on (n+1) mod 4.  The paths below peel off 0, 1, 2 or 3 limbs so
+C that L(rol) is always reached with a limb count that is 3 mod 4.
+        and     $3, %r8d
+        je      L(rol)                  C jump for n = 3, 7, 11, ...
+
+        dec     %r8d
+        jne     L(1)
+C       n = 4, 8, 12, ...
+C Peel one limb.
+        movq    8(up,n,8), %mm2
+        psrlq   %mm4, %mm2
+        movq    16(up,n,8), %mm0
+        psllq   %mm5, %mm0
+        por     %mm0, %mm2
+        movq    %mm2, 8(rp,n,8)
+        inc     n
+        jmp     L(rol)
+
+L(1):   dec     %r8d
+        je      L(1x)                   C jump for n = 1, 5, 9, 13, ...
+C       n = 2, 6, 10, 14, ...
+C Peel one limb, then fall into the n = 1 mod 4 handling.
+        movq    8(up,n,8), %mm2
+        psrlq   %mm4, %mm2
+        movq    16(up,n,8), %mm0
+        psllq   %mm5, %mm0
+        por     %mm0, %mm2
+        movq    %mm2, 8(rp,n,8)
+        inc     n
+L(1x):
+        cmp     $-1, n                  C only the top limb left?
+        je      L(ast)
+C Otherwise peel two limbs.
+        movq    8(up,n,8), %mm2
+        psrlq   %mm4, %mm2
+        movq    16(up,n,8), %mm3
+        psrlq   %mm4, %mm3
+        movq    16(up,n,8), %mm0
+        movq    24(up,n,8), %mm1
+        psllq   %mm5, %mm0
+        por     %mm0, %mm2
+        psllq   %mm5, %mm1
+        por     %mm1, %mm3
+        movq    %mm2, 8(rp,n,8)
+        movq    %mm3, 16(rp,n,8)
+        add     $2, n
+
+C Here -n limbs are left and -n is 3 mod 4.  Start the first pair: mm2 and
+C mm3 receive the right-shifted parts of the next two limbs.
+L(rol): movq    8(up,n,8), %mm2
+        psrlq   %mm4, %mm2
+        movq    16(up,n,8), %mm3
+        psrlq   %mm4, %mm3
+
+C Carry out of the add means n went from -3 to 1: just the final three
+C limbs remain, so skip the loop.
+        add     $4, n                   C                                     4
+        jb      L(end)                  C                                     2
+        ALIGN(32)
+C Main loop, four limbs per iteration.  On entry mm2 and mm3 hold the
+C right-shifted parts of the limbs at -24(up,n,8) and -16(up,n,8).
+L(top):
+        C finish the pair held in mm2 and mm3 by or-ing in the left-shifted
+        C next-higher limbs, then store it
+        movq    -16(up,n,8), %mm0
+        movq    -8(up,n,8), %mm1
+        psllq   %mm5, %mm0
+        por     %mm0, %mm2
+        psllq   %mm5, %mm1
+        movq    (up,n,8), %mm0
+        por     %mm1, %mm3
+        movq    8(up,n,8), %mm1
+        movq    %mm2, -24(rp,n,8)
+        movq    %mm3, -16(rp,n,8)
+        C start the second pair with its left-shifted parts in mm0 and mm1
+        psllq   %mm5, %mm0
+        psllq   %mm5, %mm1
+
+        C finish the second pair by or-ing in the right-shifted parts, then
+        C store it
+        movq    -8(up,n,8), %mm2
+        movq    (up,n,8), %mm3
+        psrlq   %mm4, %mm2
+        por     %mm2, %mm0
+        psrlq   %mm4, %mm3
+        movq    8(up,n,8), %mm2
+        por     %mm3, %mm1
+        movq    16(up,n,8), %mm3
+        movq    %mm0, -8(rp,n,8)
+        movq    %mm1, (rp,n,8)
+        C start the next pair: right-shifted parts into mm2 and mm3.  The
+        C jae below uses the carry from this add; the MMX shifts in between
+        C do not change the flags.
+        add     $4, n
+        psrlq   %mm4, %mm2
+        psrlq   %mm4, %mm3
+
+        jae     L(top)                  C                                     2
+C Wind down: complete and store the pair still pending in mm2 and mm3.
+L(end):
+        movq    -16(up,n,8), %mm0
+        psllq   %mm5, %mm0
+        por     %mm0, %mm2
+        movq    -8(up,n,8), %mm1
+        psllq   %mm5, %mm1
+        por     %mm1, %mm3
+        movq    %mm2, -24(rp,n,8)
+        movq    %mm3, -16(rp,n,8)
+
+C Top limb: there is no higher limb to merge in, so it is only shifted right.
+L(ast): movq    (up), %mm2
+        psrlq   %mm4, %mm2
+        movq    %mm2, (rp)
+        emms                            C leave MMX state before returning
+        ret
+EPILOGUE()