X-Git-Url: https://oss.titaniummirror.com/gitweb/?a=blobdiff_plain;f=gmp%2Fmpn%2Fpowerpc32%2Fmod_34lsub1.asm;fp=gmp%2Fmpn%2Fpowerpc32%2Fmod_34lsub1.asm;h=fa0f0139ee5067edeac70abb360cc199b1c82ecd;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git

diff --git a/gmp/mpn/powerpc32/mod_34lsub1.asm b/gmp/mpn/powerpc32/mod_34lsub1.asm
new file mode 100644
index 00000000..fa0f0139
--- /dev/null
+++ b/gmp/mpn/powerpc32/mod_34lsub1.asm
@@ -0,0 +1,134 @@
+dnl  PowerPC-32 mpn_mod_34lsub1 -- mpn remainder mod 2^24-1.
+
+dnl  Copyright 2002, 2003, 2005 Free Software Foundation, Inc.
+
+dnl  This file is part of the GNU MP Library.
+
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+
+C                cycles/limb
+C 603e:            ?
+C 604e:            3
+C 75x (G3):        3
+C 7400,7410 (G4):  3
+C 744x,745x (G4+): 3
+C power4/ppc970:   2.5
+C power5:          2.5
+
+C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
+C
+C There seems no need to schedule the loads back, the code is still 3.0 c/l
+C on 750/7400 no matter where they're placed.
+C
+C Alternatives:
+C
+C Fetching half words would allow add instead for accumulating, instead of
+C adde and its serialization.  An outer loop would be required though, since
+C 2^16 halfwords can overflow.  lhz+add would be 2.0 c/l, but if there's
+C also a bdz or bdnz for each and a pointer update say every three limbs
+C then the total would be 2.67 c/l which isn't much faster than the current
+C simpler code.
+
+ASM_START()
+PROLOGUE(mpn_mod_34lsub1)
+
+	C r3	src
+	C r4	size
+
+	mtctr	r4
+	addic	r6, r3, 8		C &src[2], and clear CA
+
+	lwz	r3, 0(r3)		C acc0 = src[0]
+	bdz	L(done)
+
+	lwz	r4, -4(r6)		C acc1 = src[1]
+	bdz	L(two)
+
+	lwz	r5, 0(r6)		C acc2 = src[2]
+	lis	r7, 0			C no carry if just three limbs
+
+	bdz	L(three)
+	lis	r7, 1			C 0x10000 carry pos
+
+L(top):
+	C r3	acc0
+	C r4	acc1
+	C r5	acc2
+	C r6	src, incrementing
+	C r7	carry pos
+
+	lwz	r0, 4(r6)
+	adde	r3, r3, r0
+	bdz	L(end0)
+
+	lwz	r0, 8(r6)
+	adde	r4, r4, r0
+	bdz	L(end1)
+
+	lwzu	r0, 12(r6)
+	adde	r5, r5, r0
+	bdnz	L(top)
+
+
+	srwi	r7, r7, 8
+L(end0):
+	srwi	r7, r7, 8
+L(end1):
+	subfe	r0, r0, r0		C -1 if not CA
+
+	andc	r7, r7, r0		C final carry, 0x10000, 0x100, 1 or 0
+L(three):
+	rlwinm	r6, r3, 0,8,31		C acc0 low
+
+	add	r7, r7, r6
+	rlwinm	r6, r3, 8,24,31		C acc0 high
+
+	add	r7, r7, r6
+	rlwinm	r6, r4, 8,8,23		C acc1 low
+
+	add	r7, r7, r6
+	rlwinm	r6, r4, 16,16,31	C acc1 high
+
+	add	r7, r7, r6
+	rlwinm	r6, r5, 16,8,15		C acc2 low
+
+	add	r7, r7, r6
+	rlwinm	r6, r5, 24,8,31		C acc2 high
+
+	add	r3, r7, r6
+
+L(done):
+	blr
+
+L(two):
+	C r3	acc0
+	C r4	acc1
+
+	rlwinm	r5, r3, 8,24,31		C acc0 high
+	rlwinm	r3, r3, 0,8,31		C acc0 low
+
+	add	r3, r3, r5		C acc0 high + low
+	rlwinm	r5, r4, 16,16,31	C acc1 high
+
+	add	r3, r3, r5		C add acc1 high
+	rlwinm	r5, r4, 8,8,23		C acc1 low
+
+	add	r3, r3, r5		C add acc1 low
+
+	blr
+
+EPILOGUE()