Imported gcc-4.4.3

[msp430-gcc.git] / gmp / mpn / alpha / ev67 / hamdist.asm
diff --git a/gmp/mpn/alpha/ev67/hamdist.asm b/gmp/mpn/alpha/ev67/hamdist.asm

new file mode 100644 (file)

index 0000000..a72d95e
--- /dev/null
+++ b/gmp/mpn/alpha/ev67/hamdist.asm
@@ -0,0 +1,100 @@
+dnl  Alpha ev67 mpn_hamdist -- mpn hamming distance.
+
+dnl  Copyright 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or
+dnl  modify it under the terms of the GNU Lesser General Public License as
+dnl  published by the Free Software Foundation; either version 3 of the
+dnl  License, or (at your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful,
+dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
+dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+dnl  Lesser General Public License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C ev67: 2.5 cycles/limb
+
+
+C unsigned long mpn_hamdist (mp_srcptr xp, mp_srcptr yp, mp_size_t size);
+C
+C The hope was for 2.0 c/l here, but that isn't achieved.  We're limited by
+C renaming register shortage.  Since we need 5 instructions per limb, further
+C unrolling could approach 1.5 c/l.
+C
+C The main loop processes two limbs from each operand on each iteration.  An
+C odd size is handled by processing xp[0]^yp[0] at the start.  If the size
+C is even that result is discarded, and is repeated by the main loop.
+C
+
+ASM_START()
+PROLOGUE(mpn_hamdist)
+
+       C r16   xp
+       C r17   yp
+       C r18   size
+
+       ldq     r1, 0(r16)              C L0  xp[0]
+       ldq     r2, 0(r17)              C L1  yp[0]
+       and     r18, 1, r8              C U1  1 if size odd
+       srl     r18, 1, r18             C U0  size, limb pairs
+
+       clr     r0                      C L0  initial total
+       s8addq  r8, r17, r17            C U1  yp++ if size odd
+       s8addq  r8, r16, r16            C L1  xp++ if size odd
+       clr     r6                      C U0  dummy initial xor 1
+
+       xor     r1, r2, r5              C L   initial xor 0
+       beq     r18, L(one)             C U   if size==1
+
+       cmoveq  r8, r31, r5             C L   discard first limb if size even
+       unop                            C U
+
+
+       ALIGN(16)
+L(top):
+       C r0    total accumulating
+       C r7    xor 0
+       C r8    xor 1
+       C r16   xp, incrementing
+       C r17   yp, incrementing
+       C r18   size, limb pairs, decrementing
+
+       ldq     r1, 0(r16)              C L
+       ldq     r2, 0(r17)              C L
+       ctpop   r5, r7                  C U0
+       lda     r16, 16(r16)            C U
+
+       ldq     r3, -8(r16)             C L
+       ldq     r4, 8(r17)              C L
+       ctpop   r6, r8                  C U0
+       lda     r17, 16(r17)            C U
+
+       ldl     r31, 256(r16)           C L     prefetch
+       ldl     r31, 256(r17)           C L     prefetch
+       xor     r1, r2, r5              C U
+       lda     r18, -1(r18)            C U
+
+       xor     r3, r4, r6              C U
+       addq    r0, r7, r0              C L
+       addq    r0, r8, r0              C L
+       bne     r18, L(top)             C U
+
+
+       ctpop   r6, r8                  C U0
+       addq    r0, r8, r0              C L
+L(one):
+       ctpop   r5, r7                  C U0
+       addq    r0, r7, r0              C L
+
+       ret     r31, (r26), 1           C L0
+
+EPILOGUE()
+ASM_END()