Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / x86 / pentium / mmx / rshift.asm
diff --git a/gmp/mpn/x86/pentium/mmx/rshift.asm b/gmp/mpn/x86/pentium/mmx/rshift.asm

new file mode 100644 (file)

index 0000000..f50b8ab
--- /dev/null
+++ b/gmp/mpn/x86/pentium/mmx/rshift.asm
@@ -0,0 +1,457 @@
+dnl  Intel P5 mpn_rshift -- mpn right shift.
+
+dnl  Copyright 2000, 2002 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C P5: 1.75 cycles/limb.
+
+
+C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
+C                       unsigned shift);
+C
+C Shift src,size right by shift many bits and store the result in dst,size.
+C Zeros are shifted in at the left.  Return the bits shifted out at the
+C right.
+C
+C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
+C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
+C
+C Full speed depends on source and destination being aligned.  Unaligned mmx
+C loads and stores on P5 don't pair and have a 2 cycle penalty.  Some hairy
+C setups and finish-ups are done to ensure alignment for the loop.
+C
+C MMX shifts work out a bit faster even for the simple loop.
+
+defframe(PARAM_SHIFT,16)
+defframe(PARAM_SIZE, 12)
+defframe(PARAM_SRC,  8)
+defframe(PARAM_DST,  4)
+deflit(`FRAME',0)
+
+dnl  Minimum 5, because the unrolled loop can't handle less.
+deflit(UNROLL_THRESHOLD, 5)
+
+       TEXT
+       ALIGN(8)
+
+PROLOGUE(mpn_rshift)
+
+       pushl   %ebx
+       pushl   %edi
+deflit(`FRAME',8)
+
+       movl    PARAM_SIZE, %eax
+       movl    PARAM_DST, %edx
+
+       movl    PARAM_SRC, %ebx
+       movl    PARAM_SHIFT, %ecx
+
+       cmp     $UNROLL_THRESHOLD, %eax
+       jae     L(unroll)
+
+       decl    %eax
+       movl    (%ebx), %edi            C src low limb
+
+       jnz     L(simple)
+
+       shrdl(  %cl, %edi, %eax)        C eax was decremented to zero
+
+       shrl    %cl, %edi
+
+       movl    %edi, (%edx)            C dst low limb
+       popl    %edi                    C risk of data cache bank clash
+
+       popl    %ebx
+
+       ret
+
+
+C -----------------------------------------------------------------------------
+       ALIGN(8)
+L(simple):
+       C eax   size-1
+       C ebx   src
+       C ecx   shift
+       C edx   dst
+       C esi
+       C edi
+       C ebp
+deflit(`FRAME',8)
+
+       movd    (%ebx), %mm5            C src[0]
+       leal    (%ebx,%eax,4), %ebx     C &src[size-1]
+
+       movd    %ecx, %mm6              C rshift
+       leal    -4(%edx,%eax,4), %edx   C &dst[size-2]
+
+       psllq   $32, %mm5
+       negl    %eax
+
+
+C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
+C cycle waiting for the mm0 result to be ready.  For comparison a shrdl is 4
+C cycles and would be 8 in a simple loop.  Using mmx helps the return value
+C and last limb calculations too.
+
+L(simple_top):
+       C eax   counter, limbs, negative
+       C ebx   &src[size-1]
+       C ecx   return value
+       C edx   &dst[size-2]
+       C
+       C mm0   scratch
+       C mm5   return value
+       C mm6   shift
+
+       movq    (%ebx,%eax,4), %mm0
+       incl    %eax
+
+       psrlq   %mm6, %mm0
+
+       movd    %mm0, (%edx,%eax,4)
+       jnz     L(simple_top)
+
+
+       movd    (%ebx), %mm0
+       psrlq   %mm6, %mm5              C return value
+
+       psrlq   %mm6, %mm0
+       popl    %edi
+
+       movd    %mm5, %eax
+       popl    %ebx
+
+       movd    %mm0, 4(%edx)
+
+       emms
+
+       ret
+
+
+C -----------------------------------------------------------------------------
+       ALIGN(8)
+L(unroll):
+       C eax   size
+       C ebx   src
+       C ecx   shift
+       C edx   dst
+       C esi
+       C edi
+       C ebp
+deflit(`FRAME',8)
+
+       movd    (%ebx), %mm5            C src[0]
+       movl    $4, %edi
+
+       movd    %ecx, %mm6              C rshift
+       testl   %edi, %ebx
+
+       psllq   $32, %mm5
+       jz      L(start_src_aligned)
+
+
+       C src isn't aligned, process low limb separately (marked xxx) and
+       C step src and dst by one limb, making src aligned.
+       C
+       C source                  ebx
+       C --+-------+-------+-------+
+       C           |          xxx  |
+       C --+-------+-------+-------+
+       C         4mod8   0mod8   4mod8
+       C
+       C         dest            edx
+       C         --+-------+-------+
+       C           |       |  xxx  |
+       C         --+-------+-------+
+
+       movq    (%ebx), %mm0            C unaligned load
+
+       psrlq   %mm6, %mm0
+       addl    $4, %ebx
+
+       decl    %eax
+
+       movd    %mm0, (%edx)
+       addl    $4, %edx
+L(start_src_aligned):
+
+
+       movq    (%ebx), %mm1
+       testl   %edi, %edx
+
+       psrlq   %mm6, %mm5              C retval
+       jz      L(start_dst_aligned)
+
+       C dst isn't aligned, add 4 to make it so, and pretend the shift is
+       C 32 bits extra.  Low limb of dst (marked xxx) handled here
+       C separately.
+       C
+       C          source          ebx
+       C          --+-------+-------+
+       C            |      mm1      |
+       C          --+-------+-------+
+       C                  4mod8   0mod8
+       C
+       C  dest                    edx
+       C  --+-------+-------+-------+
+       C                    |  xxx  |
+       C  --+-------+-------+-------+
+       C          4mod8   0mod8   4mod8
+
+       movq    %mm1, %mm0
+       addl    $32, %ecx               C new shift
+
+       psrlq   %mm6, %mm0
+
+       movd    %ecx, %mm6
+
+       movd    %mm0, (%edx)
+       addl    $4, %edx
+L(start_dst_aligned):
+
+
+       movq    8(%ebx), %mm3
+       negl    %ecx
+
+       movq    %mm3, %mm2              C mm2 src qword
+       addl    $64, %ecx
+
+       movd    %ecx, %mm7
+       psrlq   %mm6, %mm1
+
+       leal    -12(%ebx,%eax,4), %ebx
+       leal    -20(%edx,%eax,4), %edx
+
+       psllq   %mm7, %mm3
+       subl    $7, %eax                C size-7
+
+       por     %mm1, %mm3              C mm3 ready to store
+       negl    %eax                    C -(size-7)
+
+       jns     L(finish)
+
+
+       C This loop is the important bit, the rest is just support.  Careful
+       C instruction scheduling achieves the claimed 1.75 c/l.  The
+       C relevant parts of the pairing rules are:
+       C
+       C - mmx loads and stores execute only in the U pipe
+       C - only one mmx shift in a pair
+       C - wait one cycle before storing an mmx register result
+       C - the usual address generation interlock
+       C
+       C Two qword calculations are slightly interleaved.  The instructions
+       C marked "C" belong to the second qword, and the "C prev" one is for
+       C the second qword from the previous iteration.
+
+       ALIGN(8)
+L(unroll_loop):
+       C eax   counter, limbs, negative
+       C ebx   &src[size-12]
+       C ecx
+       C edx   &dst[size-12]
+       C esi
+       C edi
+       C
+       C mm0
+       C mm1
+       C mm2   src qword from -8(%ebx,%eax,4)
+       C mm3   dst qword ready to store to -8(%edx,%eax,4)
+       C
+       C mm5   return value
+       C mm6   rshift
+       C mm7   lshift
+
+       movq    (%ebx,%eax,4), %mm0
+       psrlq   %mm6, %mm2
+
+       movq    %mm0, %mm1
+       psllq   %mm7, %mm0
+
+       movq    %mm3, -8(%edx,%eax,4)   C prev
+       por     %mm2, %mm0
+
+       movq    8(%ebx,%eax,4), %mm3    C
+       psrlq   %mm6, %mm1              C
+
+       movq    %mm0, (%edx,%eax,4)
+       movq    %mm3, %mm2              C
+
+       psllq   %mm7, %mm3              C
+       addl    $4, %eax
+
+       por     %mm1, %mm3              C
+       js      L(unroll_loop)
+
+
+L(finish):
+       C eax   0 to 3 representing respectively 3 to 0 limbs remaining
+
+       testb   $2, %al
+
+       jnz     L(finish_no_two)
+
+       movq    (%ebx,%eax,4), %mm0
+       psrlq   %mm6, %mm2
+
+       movq    %mm0, %mm1
+       psllq   %mm7, %mm0
+
+       movq    %mm3, -8(%edx,%eax,4)   C prev
+       por     %mm2, %mm0
+
+       movq    %mm1, %mm2
+       movq    %mm0, %mm3
+
+       addl    $2, %eax
+L(finish_no_two):
+
+
+       C eax   2 or 3 representing respectively 1 or 0 limbs remaining
+       C
+       C mm2   src prev qword, from -8(%ebx,%eax,4)
+       C mm3   dst qword, for -8(%edx,%eax,4)
+
+       testb   $1, %al
+       popl    %edi
+
+       movd    %mm5, %eax      C retval
+       jnz     L(finish_zero)
+
+
+       C One extra limb, destination was aligned.
+       C
+       C source                ebx
+       C +-------+---------------+--
+       C |       |      mm2      |
+       C +-------+---------------+--
+       C
+       C dest                                  edx
+       C +-------+---------------+---------------+--
+       C |       |               |      mm3      |
+       C +-------+---------------+---------------+--
+       C
+       C mm6 = shift
+       C mm7 = ecx = 64-shift
+
+
+       C One extra limb, destination was unaligned.
+       C
+       C source                ebx
+       C +-------+---------------+--
+       C |       |      mm2      |
+       C +-------+---------------+--
+       C
+       C dest                          edx
+       C +---------------+---------------+--
+       C |               |      mm3      |
+       C +---------------+---------------+--
+       C
+       C mm6 = shift+32
+       C mm7 = ecx = 64-(shift+32)
+
+
+       C In both cases there's one extra limb of src to fetch and combine
+       C with mm2 to make a qword at 8(%edx), and in the aligned case
+       C there's a further extra limb of dst to be formed.
+
+
+       movd    8(%ebx), %mm0
+       psrlq   %mm6, %mm2
+
+       movq    %mm0, %mm1
+       psllq   %mm7, %mm0
+
+       movq    %mm3, (%edx)
+       por     %mm2, %mm0
+
+       psrlq   %mm6, %mm1
+       andl    $32, %ecx
+
+       popl    %ebx
+       jz      L(finish_one_unaligned)
+
+       C dst was aligned, must store one extra limb
+       movd    %mm1, 16(%edx)
+L(finish_one_unaligned):
+
+       movq    %mm0, 8(%edx)
+
+       emms
+
+       ret
+
+
+L(finish_zero):
+
+       C No extra limbs, destination was aligned.
+       C
+       C source        ebx
+       C +---------------+--
+       C |      mm2      |
+       C +---------------+--
+       C
+       C dest                        edx+4
+       C +---------------+---------------+--
+       C |               |      mm3      |
+       C +---------------+---------------+--
+       C
+       C mm6 = shift
+       C mm7 = ecx = 64-shift
+
+
+       C No extra limbs, destination was unaligned.
+       C
+       C source        ebx
+       C +---------------+--
+       C |      mm2      |
+       C +---------------+--
+       C
+       C dest                edx+4
+       C +-------+---------------+--
+       C |       |      mm3      |
+       C +-------+---------------+--
+       C
+       C mm6 = shift+32
+       C mm7 = 64-(shift+32)
+
+
+       C The movd for the unaligned case is clearly the same data as the
+       C movq for the aligned case, it's just a choice between whether one
+       C or two limbs should be written.
+
+
+       movq    %mm3, 4(%edx)
+       psrlq   %mm6, %mm2
+
+       movd    %mm2, 12(%edx)
+       andl    $32, %ecx
+
+       popl    %ebx
+       jz      L(finish_zero_unaligned)
+
+       movq    %mm2, 12(%edx)
+L(finish_zero_unaligned):
+
+       emms
+
+       ret
+
+EPILOGUE()