X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Fpowerpc64%2Fvmx%2Fpopcount.asm;fp=gmp%2Fmpn%2Fpowerpc64%2Fvmx%2Fpopcount.asm;h=b9f5896fb7683965495a1ba7a0e2fc91f0a1200c;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/powerpc64/vmx/popcount.asm b/gmp/mpn/powerpc64/vmx/popcount.asm
new file mode 100644
index 00000000..b9f5896f
--- /dev/null
+++ b/gmp/mpn/powerpc64/vmx/popcount.asm
@@ -0,0 +1,260 @@
+dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                    cycles/limb
+C 7400,7410 (G4):        2.75
+C 744x,745x (G4+):       2.25
+C 970 (G5):              5.3
+
+C STATUS
+C  * Works for all sizes and alignments.
+
+C TODO
+C  * Tune the awkward huge n outer loop code.
+C  * Two lvx, two vperm, and two vxor could make us a similar hamdist.
+C  * For the 970, a combined VMX+intop approach might be best.
+C  * Compress cnsts table in 64-bit mode, only half the values are needed.
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
+
+define(`OPERATION_popcount')
+
+ifdef(`OPERATION_popcount',`
+  define(`func',`mpn_popcount')
+  define(`up',  `r3')
+  define(`n',   `r4')
+  define(`HAM', `dnl')
+')
+ifdef(`OPERATION_hamdist',`
+  define(`func',`mpn_hamdist')
+  define(`up',  `r3')
+  define(`vp',  `r4')
+  define(`n',   `r5')
+  define(`HAM', `$1')
+')
+
+define(`x01010101',`v2')
+define(`x00110011',`v7')
+define(`x00001111',`v10')
+define(`cnt1',`v11')
+define(`cnt2',`v12')
+define(`cnt4',`v13')
+
+ifelse(GMP_LIMB_BITS,32,`
+  define(`LIMB32',`        $1')
+  define(`LIMB64',`')
+',`
+  define(`LIMB32',`')
+  define(`LIMB64',`        $1')
+')
+
+C The inner loop handles up to 2^34 bits, i.e., 2^31 bytes, due to overflow
+C in vsum4ubs.  For large operands, we work in chunks, of size LIMBS_PER_CHUNK.
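+C Each iteration of the inner loop consumes 32 bytes (256 bits) and adds at
+C most 4*16 = 64 to each 32-bit field of the vsum4ubs accumulator, so a field
+C stays below 2^32 for at least 2^32/64 = 2^26 iterations, i.e., 2^26 * 256 =
+C 2^34 bits; LIMBS_PER_CHUNK below is used shifted left 16 bits by lis,
+C giving 0x10000000 limbs per chunk, which matches that bound for 64-bit limbs.
+C The per-vector reduction is the usual bit-slice popcount: subtract the
+C 0x55..55-masked shift-by-1 to get 2-bit sums, combine with 0x33..33 masks
+C into 4-bit sums, then fold nibbles with the 0x0f..0f mask into byte sums.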
+define(`LIMBS_PER_CHUNK', 0x1000)
+define(`LIMBS_CHUNK_THRES', 0x1001)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+        mfspr   r10, 256
+        oris    r0, r10, 0xfffc         C Set VRSAVE bits 0-13
+        mtspr   256, r0
+
+ifdef(`HAVE_ABI_mode32',
+`       rldicl  n, n, 0, 32')           C zero extend n
+
+C Load various constants into vector registers
+        LEAL(   r11, cnsts)
+        li      r12, 16
+        vspltisb cnt1, 1                C 0x0101...01 used as shift count
+        vspltisb cnt2, 2                C 0x0202...02 used as shift count
+        vspltisb cnt4, 4                C 0x0404...04 used as shift count
+        lvx     x01010101, 0, r11       C 0x5555...55
+        lvx     x00110011, r12, r11     C 0x3333...33
+        vspltisb x00001111, 15          C 0x0f0f...0f
+
+LIMB64(`lis     r0, LIMBS_CHUNK_THRES   ')
+LIMB64(`cmpd    cr7, n, r0              ')
+
+        lvx     v0, 0, up
+        addi    r7, r11, 96
+        rlwinm  r6, up, 2,26,29
+        lvx     v8, r7, r6
+        vand    v0, v0, v8
+
+LIMB32(`rlwinm  r8, up, 30,30,31        ')
+LIMB64(`rlwinm  r8, up, 29,31,31        ')
+        add     n, n, r8                C compensate n for rounded down `up'
+
+        vxor    v1, v1, v1
+        li      r8, 0                   C grand total count
+
+        vxor    v3, v3, v3              C zero total count
+
+        addic.  n, n, -LIMBS_PER_VR
+        ble     L(sum)
+
+        addic.  n, n, -LIMBS_PER_VR
+        ble     L(lsum)
+
+C For 64-bit machines, handle huge n that would overflow vsum4ubs
+LIMB64(`ble     cr7, L(small)           ')
+LIMB64(`addis   r9, n, -LIMBS_PER_CHUNK ')      C remaining n
+LIMB64(`lis     n, LIMBS_PER_CHUNK      ')
+L(small):
+
+LIMB32(`srwi    r7, n, 3        ')      C loop count corresponding to n
+LIMB64(`srdi    r7, n, 2        ')      C loop count corresponding to n
+        addi    r7, r7, 1
+        mtctr   r7                      C copy n to count register
+        b       L(ent)
+
+        ALIGN(8)
+L(top): lvx     v0, 0, up
+        li      r7, 128                 C prefetch distance
+L(ent): lvx     v1, r12, up
+        addi    up, up, 32
+        vsr     v4, v0, cnt1
+        vsr     v5, v1, cnt1
+        dcbt    up, r7                  C prefetch
+        vand    v8, v4, x01010101
+        vand    v9, v5, x01010101
+        vsububm v0, v0, v8              C 64 2-bit accumulators (0..2)
+        vsububm v1, v1, v9              C 64 2-bit accumulators (0..2)
+        vsr     v4, v0, cnt2
+        vsr     v5, v1, cnt2
+        vand    v8, v0, x00110011
+        vand    v9, v1, x00110011
+        vand    v4, v4, x00110011
+        vand    v5, v5, x00110011
+        vaddubm v0, v4, v8              C 32 4-bit accumulators (0..4)
+        vaddubm v1, v5, v9              C 32 4-bit accumulators (0..4)
+        vaddubm v8, v0, v1              C 32 4-bit accumulators (0..8)
+        vsr     v9, v8, cnt4
+        vand    v6, v8, x00001111
+        vand    v9, v9, x00001111
+        vaddubm v6, v9, v6              C 16 8-bit accumulators (0..16)
+        vsum4ubs v3, v6, v3             C sum 4 x 4 bytes into 4 32-bit fields
+        bdnz    L(top)
+
+        andi.   n, n, eval(LIMBS_PER_2VR-1)
+        beq     L(rt)
+
+        lvx     v0, 0, up
+        vxor    v1, v1, v1
+        cmpwi   n, LIMBS_PER_VR
+        ble     L(sum)
+L(lsum):
+        vor     v1, v0, v0
+        lvx     v0, r12, up
+L(sum):
+LIMB32(`rlwinm  r6, n, 4,26,27  ')
+LIMB64(`rlwinm  r6, n, 5,26,26  ')
+        addi    r7, r11, 32
+        lvx     v8, r7, r6
+        vand    v0, v0, v8
+
+        vsr     v4, v0, cnt1
+        vsr     v5, v1, cnt1
+        vand    v8, v4, x01010101
+        vand    v9, v5, x01010101
+        vsububm v0, v0, v8              C 64 2-bit accumulators (0..2)
+        vsububm v1, v1, v9              C 64 2-bit accumulators (0..2)
+        vsr     v4, v0, cnt2
+        vsr     v5, v1, cnt2
+        vand    v8, v0, x00110011
+        vand    v9, v1, x00110011
+        vand    v4, v4, x00110011
+        vand    v5, v5, x00110011
+        vaddubm v0, v4, v8              C 32 4-bit accumulators (0..4)
+        vaddubm v1, v5, v9              C 32 4-bit accumulators (0..4)
+        vaddubm v8, v0, v1              C 32 4-bit accumulators (0..8)
+        vsr     v9, v8, cnt4
+        vand    v6, v8, x00001111
+        vand    v9, v9, x00001111
+        vaddubm v6, v9, v6              C 16 8-bit accumulators (0..16)
+        vsum4ubs v3, v6, v3             C sum 4 x 4 bytes into 4 32-bit fields
+
+L(rt):
+        li      r7, -16                 C FIXME: do all ppc32 and ppc64 ABIs
+        stvx    v3, r7, r1              C FIXME: ...support storing below sp?
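+
+C v3 now holds four 32-bit partial counts; they were just stored to the 16
+C bytes below the stack pointer, so add them up with scalar loads.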
+
+        lwz     r7, -16(r1)
+        add     r8, r8, r7
+        lwz     r7, -12(r1)
+        add     r8, r8, r7
+        lwz     r7, -8(r1)
+        add     r8, r8, r7
+        lwz     r7, -4(r1)
+        add     r8, r8, r7
+
+C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
+LIMB64(`ble     cr7, L(ret)
+        vxor    v3, v3, v3              C zero total count
+        mr      n, r9
+        cmpd    cr7, n, r0
+        ble     cr7, L(2)
+        addis   r9, n, -LIMBS_PER_CHUNK C remaining n
+        lis     n, LIMBS_PER_CHUNK
+L(2):   srdi    r7, n, 2                C loop count corresponding to n
+        mtctr   r7                      C copy n to count register
+        b       L(top)
+')
+
+L(ret): mr      r3, r8
+        mtspr   256, r10
+        blr
+EPILOGUE()
+
+DEF_OBJECT(cnsts,16)
+        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+
+        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+C Masks for high end of number
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(cnsts)
+ASM_END()
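For reference, a minimal scalar sketch of the operation this file implements, assuming 64-bit limbs and GCC's __builtin_popcountll; the typedef and the name ref_mpn_popcount are illustrative, not part of GMP. The VMX routine above computes the same sum, 32 bytes per loop iteration:

    /* Illustrative sketch only: a scalar equivalent of mpn_popcount,
       assuming 64-bit limbs and a GCC-style __builtin_popcountll.  */
    typedef unsigned long long mp_limb_t;

    unsigned long
    ref_mpn_popcount (const mp_limb_t *up, unsigned long n)
    {
      unsigned long cnt = 0;
      unsigned long i;
      for (i = 0; i < n; i++)
        cnt += (unsigned long) __builtin_popcountll (up[i]);
      return cnt;
    }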