X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Fpowerpc64%2Fvmx%2Fpopcount.asm;fp=gmp%2Fmpn%2Fpowerpc64%2Fvmx%2Fpopcount.asm;h=b9f5896fb7683965495a1ba7a0e2fc91f0a1200c;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/powerpc64/vmx/popcount.asm b/gmp/mpn/powerpc64/vmx/popcount.asm
new file mode 100644
index 00000000..b9f5896f
--- /dev/null
+++ b/gmp/mpn/powerpc64/vmx/popcount.asm
@@ -0,0 +1,260 @@
+dnl PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.
+
+dnl Copyright 2006 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C                    cycles/limb
+C 7400,7410 (G4):        2.75
+C 744x,745x (G4+):       2.25
+C 970 (G5):              5.3
+
+C STATUS
+C  * Works for all sizes and alignments.
+
+C TODO
+C  * Tune the awkward huge n outer loop code.
+C  * Two lvx, two vperm, and two vxor could make us a similar hamdist.
+C  * For the 970, a combined VMX+intop approach might be best.
+C  * Compress cnsts table in 64-bit mode, only half the values are needed.
+
+define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
+define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
+define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))
+
+define(`OPERATION_popcount')
+
+ifdef(`OPERATION_popcount',`
+  define(`func',`mpn_popcount')
+  define(`up',  `r3')
+  define(`n',   `r4')
+  define(`HAM', `dnl')
+')
+ifdef(`OPERATION_hamdist',`
+  define(`func',`mpn_hamdist')
+  define(`up',  `r3')
+  define(`vp',  `r4')
+  define(`n',   `r5')
+  define(`HAM', `$1')
+')
+
+define(`x01010101',`v2')
+define(`x00110011',`v7')
+define(`x00001111',`v10')
+define(`cnt1',`v11')
+define(`cnt2',`v12')
+define(`cnt4',`v13')
+
+ifelse(GMP_LIMB_BITS,32,`
+  define(`LIMB32',`        $1')
+  define(`LIMB64',`')
+',`
+  define(`LIMB32',`')
+  define(`LIMB64',`        $1')
+')
+
+C The inner loop handles up to 2^34 bits, i.e., 2^31 bytes, due to overflow
+C in vsum4ubs.  For large operands, we work in chunks, of size LIMBS_PER_CHUNK.
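+C Each iteration of the inner loop consumes 32 bytes (256 bits) and adds at
+C most 4*16 = 64 to each 32-bit field of the vsum4ubs accumulator, so a field
+C stays below 2^32 for at least 2^32/64 = 2^26 iterations, i.e., 2^26 * 256 =
+C 2^34 bits; LIMBS_PER_CHUNK below is used shifted left 16 bits by lis,
+C giving 0x10000000 limbs per chunk, which matches that bound for 64-bit limbs.
+C The per-vector reduction is the usual bit-slice popcount: subtract the
+C 0x55..55-masked shift-by-1 to get 2-bit sums, combine with 0x33..33 masks
+C into 4-bit sums, then fold nibbles with the 0x0f..0f mask into byte sums.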
+define(`LIMBS_PER_CHUNK', 0x1000)
+define(`LIMBS_CHUNK_THRES', 0x1001)
+
+ASM_START()
+PROLOGUE(mpn_popcount)
+        mfspr   r10, 256
+        oris    r0, r10, 0xfffc         C Set VRSAVE bits 0-13
+        mtspr   256, r0
+
+ifdef(`HAVE_ABI_mode32',
+`       rldicl  n, n, 0, 32')           C zero extend n
+
+C Load various constants into vector registers
+        LEAL(   r11, cnsts)
+        li      r12, 16
+        vspltisb cnt1, 1                C 0x0101...01 used as shift count
+        vspltisb cnt2, 2                C 0x0202...02 used as shift count
+        vspltisb cnt4, 4                C 0x0404...04 used as shift count
+        lvx     x01010101, 0, r11       C 0x5555...55
+        lvx     x00110011, r12, r11     C 0x3333...33
+        vspltisb x00001111, 15          C 0x0f0f...0f
+
+LIMB64(`lis     r0, LIMBS_CHUNK_THRES   ')
+LIMB64(`cmpd    cr7, n, r0              ')
+
+        lvx     v0, 0, up
+        addi    r7, r11, 96
+        rlwinm  r6, up, 2,26,29
+        lvx     v8, r7, r6
+        vand    v0, v0, v8
+
+LIMB32(`rlwinm  r8, up, 30,30,31        ')
+LIMB64(`rlwinm  r8, up, 29,31,31        ')
+        add     n, n, r8                C compensate n for rounded down `up'
+
+        vxor    v1, v1, v1
+        li      r8, 0                   C grand total count
+
+        vxor    v3, v3, v3              C zero total count
+
+        addic.  n, n, -LIMBS_PER_VR
+        ble     L(sum)
+
+        addic.  n, n, -LIMBS_PER_VR
+        ble     L(lsum)
+
+C For 64-bit machines, handle huge n that would overflow vsum4ubs
+LIMB64(`ble     cr7, L(small)           ')
+LIMB64(`addis   r9, n, -LIMBS_PER_CHUNK ')      C remaining n
+LIMB64(`lis     n, LIMBS_PER_CHUNK      ')
+L(small):
+
+LIMB32(`srwi    r7, n, 3        ')      C loop count corresponding to n
+LIMB64(`srdi    r7, n, 2        ')      C loop count corresponding to n
+        addi    r7, r7, 1
+        mtctr   r7                      C copy n to count register
+        b       L(ent)
+
+        ALIGN(8)
+L(top): lvx     v0, 0, up
+        li      r7, 128                 C prefetch distance
+L(ent): lvx     v1, r12, up
+        addi    up, up, 32
+        vsr     v4, v0, cnt1
+        vsr     v5, v1, cnt1
+        dcbt    up, r7                  C prefetch
+        vand    v8, v4, x01010101
+        vand    v9, v5, x01010101
+        vsububm v0, v0, v8              C 64 2-bit accumulators (0..2)
+        vsububm v1, v1, v9              C 64 2-bit accumulators (0..2)
+        vsr     v4, v0, cnt2
+        vsr     v5, v1, cnt2
+        vand    v8, v0, x00110011
+        vand    v9, v1, x00110011
+        vand    v4, v4, x00110011
+        vand    v5, v5, x00110011
+        vaddubm v0, v4, v8              C 32 4-bit accumulators (0..4)
+        vaddubm v1, v5, v9              C 32 4-bit accumulators (0..4)
+        vaddubm v8, v0, v1              C 32 4-bit accumulators (0..8)
+        vsr     v9, v8, cnt4
+        vand    v6, v8, x00001111
+        vand    v9, v9, x00001111
+        vaddubm v6, v9, v6              C 16 8-bit accumulators (0..16)
+        vsum4ubs v3, v6, v3             C sum 4 x 4 bytes into 4 32-bit fields
+        bdnz    L(top)
+
+        andi.   n, n, eval(LIMBS_PER_2VR-1)
+        beq     L(rt)
+
+        lvx     v0, 0, up
+        vxor    v1, v1, v1
+        cmpwi   n, LIMBS_PER_VR
+        ble     L(sum)
+L(lsum):
+        vor     v1, v0, v0
+        lvx     v0, r12, up
+L(sum):
+LIMB32(`rlwinm  r6, n, 4,26,27  ')
+LIMB64(`rlwinm  r6, n, 5,26,26  ')
+        addi    r7, r11, 32
+        lvx     v8, r7, r6
+        vand    v0, v0, v8
+
+        vsr     v4, v0, cnt1
+        vsr     v5, v1, cnt1
+        vand    v8, v4, x01010101
+        vand    v9, v5, x01010101
+        vsububm v0, v0, v8              C 64 2-bit accumulators (0..2)
+        vsububm v1, v1, v9              C 64 2-bit accumulators (0..2)
+        vsr     v4, v0, cnt2
+        vsr     v5, v1, cnt2
+        vand    v8, v0, x00110011
+        vand    v9, v1, x00110011
+        vand    v4, v4, x00110011
+        vand    v5, v5, x00110011
+        vaddubm v0, v4, v8              C 32 4-bit accumulators (0..4)
+        vaddubm v1, v5, v9              C 32 4-bit accumulators (0..4)
+        vaddubm v8, v0, v1              C 32 4-bit accumulators (0..8)
+        vsr     v9, v8, cnt4
+        vand    v6, v8, x00001111
+        vand    v9, v9, x00001111
+        vaddubm v6, v9, v6              C 16 8-bit accumulators (0..16)
+        vsum4ubs v3, v6, v3             C sum 4 x 4 bytes into 4 32-bit fields
+
+L(rt):
+        li      r7, -16                 C FIXME: do all ppc32 and ppc64 ABIs
+        stvx    v3, r7, r1              C FIXME: ...support storing below sp?
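+
+C v3 now holds four 32-bit partial counts; they were just stored to the 16
+C bytes below the stack pointer, so add them up with scalar loads.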
+
+        lwz     r7, -16(r1)
+        add     r8, r8, r7
+        lwz     r7, -12(r1)
+        add     r8, r8, r7
+        lwz     r7, -8(r1)
+        add     r8, r8, r7
+        lwz     r7, -4(r1)
+        add     r8, r8, r7
+
+C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
+LIMB64(`ble     cr7, L(ret)
+        vxor    v3, v3, v3              C zero total count
+        mr      n, r9
+        cmpd    cr7, n, r0
+        ble     cr7, L(2)
+        addis   r9, n, -LIMBS_PER_CHUNK C remaining n
+        lis     n, LIMBS_PER_CHUNK
+L(2):   srdi    r7, n, 2                C loop count corresponding to n
+        mtctr   r7                      C copy n to count register
+        b       L(top)
+')
+
+L(ret): mr      r3, r8
+        mtspr   256, r10
+        blr
+EPILOGUE()
+
+DEF_OBJECT(cnsts,16)
+        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
+
+        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
+C Masks for high end of number
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
+C Masks for low end of number
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+
+        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
+END_OBJECT(cnsts)
+ASM_END()
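For reference, a minimal scalar sketch of the operation this file implements, assuming 64-bit limbs and GCC's __builtin_popcountll; the typedef and the name ref_mpn_popcount are illustrative, not part of GMP. The VMX routine above computes the same sum, 32 bytes per loop iteration:

    /* Illustrative sketch only: a scalar equivalent of mpn_popcount,
       assuming 64-bit limbs and a GCC-style __builtin_popcountll.  */
    typedef unsigned long long mp_limb_t;

    unsigned long
    ref_mpn_popcount (const mp_limb_t *up, unsigned long n)
    {
      unsigned long cnt = 0;
      unsigned long i;
      for (i = 0; i < n; i++)
        cnt += (unsigned long) __builtin_popcountll (up[i]);
      return cnt;
    }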