--- /dev/null
+dnl SPARC v9 mpn_add_n -- Add two limb vectors of the same length > 0 and
+dnl store sum in a third limb vector.
+
+dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C cycles/limb
+C UltraSPARC 1&2: 4
+C UltraSPARC 3: 4.5
+
+C Compute carry-out from the most significant bits of u,v, and r, where
+C r=u+v+carry_in, using logic operations.
+
+C This code runs at 4 cycles/limb on UltraSPARC 1 and 2. It has a 4 insn
+C recurrency, and the UltraSPARC 1 and 2 the IE units are 100% saturated.
+C Therefore, it seems futile to try to optimize this any further...
+
+C INPUT PARAMETERS
+define(`rp',`%i0')
+define(`up',`%i1')
+define(`vp',`%i2')
+define(`n',`%i3')
+
+define(`u0',`%l0')
+define(`u1',`%l2')
+define(`u2',`%l4')
+define(`u3',`%l6')
+define(`v0',`%l1')
+define(`v1',`%l3')
+define(`v2',`%l5')
+define(`v3',`%l7')
+
+define(`cy',`%i4')
+
+define(`fanop',`fitod %f0,%f2') dnl A quasi nop running in the FA pipe
+define(`fmnop',`fmuld %f0,%f0,%f4') dnl A quasi nop running in the FM pipe
+
+ASM_START()
+ REGISTER(%g2,#scratch)
+ REGISTER(%g3,#scratch)
+PROLOGUE(mpn_add_n)
+ save %sp,-160,%sp
+
+ fitod %f0,%f0 C make sure f0 contains small, quiet number
+ subcc n,4,%g0
+ bl,pn %icc,.Loop0
+ mov 0,cy
+
+ ldx [up+0],u0
+ ldx [vp+0],v0
+ add up,32,up
+ ldx [up-24],u1
+ ldx [vp+8],v1
+ add vp,32,vp
+ ldx [up-16],u2
+ ldx [vp-16],v2
+ ldx [up-8],u3
+ ldx [vp-8],v3
+ subcc n,8,n
+ add u0,v0,%g1 C main add
+ add %g1,cy,%g4 C carry add
+ or u0,v0,%g2
+ bl,pn %icc,.Lend4567
+ fanop
+ b,a .Loop
+
+ .align 16
+C START MAIN LOOP
+.Loop: andn %g2,%g4,%g2
+ and u0,v0,%g3
+ ldx [up+0],u0
+ fanop
+C --
+ or %g3,%g2,%g2
+ ldx [vp+0],v0
+ add up,32,up
+ fanop
+C --
+ srlx %g2,63,cy
+ add u1,v1,%g1
+ stx %g4,[rp+0]
+ fanop
+C --
+ add %g1,cy,%g4
+ or u1,v1,%g2
+ fmnop
+ fanop
+C --
+ andn %g2,%g4,%g2
+ and u1,v1,%g3
+ ldx [up-24],u1
+ fanop
+C --
+ or %g3,%g2,%g2
+ ldx [vp+8],v1
+ add vp,32,vp
+ fanop
+C --
+ srlx %g2,63,cy
+ add u2,v2,%g1
+ stx %g4,[rp+8]
+ fanop
+C --
+ add %g1,cy,%g4
+ or u2,v2,%g2
+ fmnop
+ fanop
+C --
+ andn %g2,%g4,%g2
+ and u2,v2,%g3
+ ldx [up-16],u2
+ fanop
+C --
+ or %g3,%g2,%g2
+ ldx [vp-16],v2
+ add rp,32,rp
+ fanop
+C --
+ srlx %g2,63,cy
+ add u3,v3,%g1
+ stx %g4,[rp-16]
+ fanop
+C --
+ add %g1,cy,%g4
+ or u3,v3,%g2
+ fmnop
+ fanop
+C --
+ andn %g2,%g4,%g2
+ and u3,v3,%g3
+ ldx [up-8],u3
+ fanop
+C --
+ or %g3,%g2,%g2
+ subcc n,4,n
+ ldx [vp-8],v3
+ fanop
+C --
+ srlx %g2,63,cy
+ add u0,v0,%g1
+ stx %g4,[rp-8]
+ fanop
+C --
+ add %g1,cy,%g4
+ or u0,v0,%g2
+ bge,pt %icc,.Loop
+ fanop
+C END MAIN LOOP
+.Lend4567:
+ andn %g2,%g4,%g2
+ and u0,v0,%g3
+ or %g3,%g2,%g2
+ srlx %g2,63,cy
+ add u1,v1,%g1
+ stx %g4,[rp+0]
+ add %g1,cy,%g4
+ or u1,v1,%g2
+ andn %g2,%g4,%g2
+ and u1,v1,%g3
+ or %g3,%g2,%g2
+ srlx %g2,63,cy
+ add u2,v2,%g1
+ stx %g4,[rp+8]
+ add %g1,cy,%g4
+ or u2,v2,%g2
+ andn %g2,%g4,%g2
+ and u2,v2,%g3
+ or %g3,%g2,%g2
+ add rp,32,rp
+ srlx %g2,63,cy
+ add u3,v3,%g1
+ stx %g4,[rp-16]
+ add %g1,cy,%g4
+ or u3,v3,%g2
+ andn %g2,%g4,%g2
+ and u3,v3,%g3
+ or %g3,%g2,%g2
+ srlx %g2,63,cy
+ stx %g4,[rp-8]
+
+ addcc n,4,n
+ bz,pn %icc,.Lret
+ fanop
+
+.Loop0: ldx [up],u0
+ add up,8,up
+ ldx [vp],v0
+ add vp,8,vp
+ add rp,8,rp
+ subcc n,1,n
+ add u0,v0,%g1
+ or u0,v0,%g2
+ add %g1,cy,%g4
+ and u0,v0,%g3
+ andn %g2,%g4,%g2
+ stx %g4,[rp-8]
+ or %g3,%g2,%g2
+ bnz,pt %icc,.Loop0
+ srlx %g2,63,cy
+
+.Lret: mov cy,%i0
+ ret
+ restore
+EPILOGUE(mpn_add_n)