Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / x86 / pentium4 / sse2 / popcount.asm
diff --git a/gmp/mpn/x86/pentium4/sse2/popcount.asm b/gmp/mpn/x86/pentium4/sse2/popcount.asm

new file mode 100644 (file)

index 0000000..cb982ad
--- /dev/null
+++ b/gmp/mpn/x86/pentium4/sse2/popcount.asm
@@ -0,0 +1,266 @@
+dnl  X86-32 and X86-64 mpn_popcount using SSE2.
+
+dnl  Copyright 2006, 2007 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+
+C 32-bit                     popcount        hamdist
+C                           cycles/limb     cycles/limb
+C P5:                           -
+C P6 model 0-8,10-12)           -
+C P6 model 9  (Banias)          ?
+C P6 model 13 (Dothan)          4
+C P4 model 0  (Willamette)      ?
+C P4 model 1  (?)               ?
+C P4 model 2  (Northwood)       3.9
+C P4 model 3  (Prescott)        ?
+C P4 model 4  (Nocona)          ?
+C K6:                           -
+C K7:                           -
+C K8:                           ?
+
+C 64-bit                     popcount        hamdist
+C                           cycles/limb     cycles/limb
+C P4 model 4 (Nocona):          8
+C K8:                           7.5
+C K10:                         3.5
+C P6-15:                       3.68
+
+C TODO
+C  * Make a mpn_hamdist based on this.  Alignment could either be handled by
+C    using movdqu for one operand and movdqa for the other, or by painfully
+C    shifting as we go.  Unfortunately, there seem to be no useable shift
+C    instruction, except for one that takes an immediate count.
+C  * It would probably be possible to cut a few cycles/limb using software
+C    pipelining.
+C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
+C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
+C    integer based popcount.  Such a combined loop would handle 6 limbs in
+C    about 30 cycles on K8.
+C  * We could save a byte or two by using 32-bit operations on areg.
+C  * Check if using movdqa to a temp of and then register-based pand is faster.
+
+ifelse(GMP_LIMB_BITS,`32',
+`      define(`up',  `%edx')
+       define(`n',   `%ecx')
+       define(`areg',`%eax')
+       define(`breg',`%ebx')
+       define(`zero',`%xmm4')
+       define(`LIMB32',`       $1')
+       define(`LIMB64',`dnl')
+',`
+       define(`up',  `%rdi')
+       define(`n',   `%rsi')
+       define(`areg',`%rax')
+       define(`breg',`%rdx')
+       define(`zero',`%xmm8')
+       define(`LIMB32',`dnl')
+       define(`LIMB64',`       $1')
+')
+
+define(`mm01010101',`%xmm6')
+define(`mm00110011',`%xmm7')
+define(`mm00001111',`%xmm2')
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
+
+undefine(`psadbw')                     C override inherited m4 version
+
+ASM_START()
+
+C Make cnsts global to work around Apple relocation bug.
+ifdef(`DARWIN',`
+       define(`cnsts', MPN(popccnsts))
+       GLOBL   cnsts')
+
+       TEXT
+       ALIGN(32)
+PROLOGUE(mpn_popcount)
+
+LIMB32(`mov    4(%esp), up     ')
+LIMB32(`mov    8(%esp), n      ')
+LIMB32(`push   %ebx            ')
+
+       pxor    %xmm3, %xmm3            C zero grand total count
+LIMB64(`pxor   zero, zero      ')
+ifdef(`PIC',`
+       LEA(    cnsts, breg)
+',`
+LIMB32(`mov    $cnsts, breg    ')
+LIMB64(`movabs $cnsts, breg    ')
+')
+
+       movdqa  -48(breg), mm01010101
+       movdqa  -32(breg), mm00110011
+       movdqa  -16(breg), mm00001111
+
+       mov     up, areg
+       and     $-16, up                C round `up' down to 128-bit boundary
+       and     $12, areg               C 32:areg = 0, 4, 8, 12
+                                       C 64:areg = 0, 8
+       movdqa  (up), %xmm0
+       pand    64(breg,areg,4), %xmm0
+       shr     $m4_log2(GMP_LIMB_BYTES), %eax
+       add     areg, n                 C compensate n for rounded down `up'
+
+       pxor    %xmm4, %xmm4
+       sub     $LIMBS_PER_XMM, n
+       jbe     L(sum)
+
+       sub     $LIMBS_PER_XMM, n
+       ja      L(ent)
+       jmp     L(lsum)
+
+       ALIGN(16)
+L(top):        movdqa  (up), %xmm0
+L(ent):        movdqa  16(up), %xmm4
+
+       movdqa  %xmm0, %xmm1
+       movdqa  %xmm4, %xmm5
+       psrld   $1, %xmm0
+       psrld   $1, %xmm4
+       pand    mm01010101, %xmm0
+       pand    mm01010101, %xmm4
+       psubd   %xmm0, %xmm1
+       psubd   %xmm4, %xmm5
+
+       movdqa  %xmm1, %xmm0
+       movdqa  %xmm5, %xmm4
+       psrlq   $2, %xmm1
+       psrlq   $2, %xmm5
+       pand    mm00110011, %xmm0
+       pand    mm00110011, %xmm4
+       pand    mm00110011, %xmm1
+       pand    mm00110011, %xmm5
+       paddq   %xmm0, %xmm1
+       paddq   %xmm4, %xmm5
+
+LIMB32(`pxor   zero, zero      ')
+
+       add     $32, up
+       sub     $LIMBS_PER_2XMM, n
+
+       paddq   %xmm5, %xmm1
+       movdqa  %xmm1, %xmm0
+       psrlq   $4, %xmm1
+       pand    mm00001111, %xmm0
+       pand    mm00001111, %xmm1
+       paddq   %xmm0, %xmm1
+
+       psadbw  zero, %xmm1
+       paddq   %xmm1, %xmm3            C add to grand total
+
+       jnc     L(top)
+L(end):
+       add     $LIMBS_PER_2XMM, n
+       jz      L(rt)
+       movdqa  (up), %xmm0
+       pxor    %xmm4, %xmm4
+       sub     $LIMBS_PER_XMM, n
+       jbe     L(sum)
+L(lsum):
+       movdqa  %xmm0, %xmm4
+       movdqa  16(up), %xmm0
+L(sum):
+       shl     $m4_log2(GMP_LIMB_BYTES), n
+       and     $12, n
+       pand    (breg,n,4), %xmm0
+
+       movdqa  %xmm0, %xmm1
+       movdqa  %xmm4, %xmm5
+       psrld   $1, %xmm0
+       psrld   $1, %xmm4
+       pand    mm01010101, %xmm0
+       pand    mm01010101, %xmm4
+       psubd   %xmm0, %xmm1
+       psubd   %xmm4, %xmm5
+
+       movdqa  %xmm1, %xmm0
+       movdqa  %xmm5, %xmm4
+       psrlq   $2, %xmm1
+       psrlq   $2, %xmm5
+       pand    mm00110011, %xmm0
+       pand    mm00110011, %xmm4
+       pand    mm00110011, %xmm1
+       pand    mm00110011, %xmm5
+       paddq   %xmm0, %xmm1
+       paddq   %xmm4, %xmm5
+
+LIMB32(`pxor   zero, zero      ')
+
+       paddq   %xmm5, %xmm1
+       movdqa  %xmm1, %xmm0
+       psrlq   $4, %xmm1
+       pand    mm00001111, %xmm0
+       pand    mm00001111, %xmm1
+       paddq   %xmm0, %xmm1
+
+       psadbw  zero, %xmm1
+       paddq   %xmm1, %xmm3            C add to grand total
+
+
+C Add the two 64-bit halves of the grand total counter
+L(rt): movdqa  %xmm3, %xmm0
+       psrldq  $8, %xmm3
+       paddq   %xmm3, %xmm0
+       movd    %xmm0, areg             C movq avoided due to gas bug
+
+LIMB32(`pop    %ebx            ')
+       ret
+
+EPILOGUE()
+DEF_OBJECT(dummy,16)
+C Three magic constants used for masking out bits
+       .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+       .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+
+       .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+       .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+
+       .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+       .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
+cnsts:
+C Masks for high end of number
+       .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+       .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+       .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+       .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+       .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+       .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+       .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+       .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+       .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+       .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+       .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+       .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+       .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+       .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+       .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+       .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(dummy)