dnl  SPARC v9 32-bit mpn_sqr_diagonal.

dnl  Copyright 2001, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2

C This code uses a very deep software pipeline, due to the need for moving data
C back and forth between the integer and floating-point registers.
C
C A VIS variant of this code would make the pipeline less deep, since the
C masking now done in the integer unit could take place in the floating-point
C unit using the FAND instruction.  It would also be possible to save several
C cycles.
C
C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
C not much slower from the Ecache.  It would perhaps be possible to shave off
C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
C instructions used, since we have 10 memory operations per limb.  But a VIS
C variant could run three cycles faster than the corresponding non-VIS code.

C This is non-pipelined code showing the algorithm:
C
C .Loop:
C	lduw	[up+0],%g4		C 00000000hhhhllll
C	sllx	%g4,16,%g3		C 0000hhhhllll0000
C	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
C	andn	%g2,%g5,%g2		C 0000hhhh0000llll
C	stx	%g2,[%fp+80]
C	ldd	[%fp+80],%f0
C	fitod	%f0,%f4			C hi16
C	fitod	%f1,%f6			C lo16
C	ld	[up+0],%f9
C	fxtod	%f8,%f2
C	fmuld	%f2,%f4,%f4
C	fmuld	%f2,%f6,%f6
C	fdtox	%f4,%f4
C	fdtox	%f6,%f6
C	std	%f4,[%fp-24]
C	std	%f6,[%fp-16]
C	ldx	[%fp-24],%g2
C	ldx	[%fp-16],%g1
C	sllx	%g2,16,%g2
C	add	%g2,%g1,%g1
C	stw	%g1,[rp+0]
C	srlx	%g1,32,%l0
C	stw	%l0,[rp+4]
C	add	up,4,up
C	subcc	n,1,n
C	bne,pt	%icc,.Loop
C	add	rp,8,rp
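
C In C terms, the routine stores the double-limb square of each limb:
C rp[2i] and rp[2i+1] receive the low and high 32 bits of up[i]^2.  The
C limb is split at bit 16 so that each partial product is at most 48 bits
C and hence exact in a double's 53-bit mantissa, which is what allows the
C multiplies to run in the floating-point unit.  A minimal C sketch of one
C iteration (illustrative only; assumes 32-bit limbs and <stdint.h> types,
C and the variable names are local to this comment):
C
C	uint32_t u   = up[i];
C	uint32_t hi  = u >> 16;			/* hi16 (fitod %f0) */
C	uint32_t lo  = u & 0xffff;		/* lo16 (fitod %f1) */
C	uint64_t p16 = (uint64_t) u * hi;	/* <= 48 bits, exact in a double */
C	uint64_t p0  = (uint64_t) u * lo;	/* <= 48 bits, exact in a double */
C	uint64_t sq  = (p16 << 16) + p0;	/* == (uint64_t) u * u */
C	rp[2*i]   = (uint32_t) sq;		/* low result limb */
C	rp[2*i+1] = (uint32_t) (sq >> 32);	/* high result limb */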

define(`fanop',`fitod %f12,%f10')	dnl  A quasi nop running in the FA pipe

ASM_START()

	TEXT
	ALIGN(4)
.Lnoll:
	.word	0

PROLOGUE(mpn_sqr_diagonal)
	save	%sp,-256,%sp

ifdef(`PIC',
`.Lpc:	rd	%pc,%o7
	ld	[%o7+.Lnoll-.Lpc],%f8',
`	sethi	%hi(.Lnoll),%g1
	ld	[%g1+%lo(.Lnoll)],%f8')

	sethi	%hi(0xffff0000),%g5
	add	%i1,-8,%i1

	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_1
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	add	%i1,4,%i1		C s1_ptr++
	stx	%g2,[%fp+80]
	ld	[%i1],%f9
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	std	%f6,[%fp-16]
	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.L1
	add	%fp, -40, %l6

.L_grt_1:
	stx	%g2,[%fp+80]
	lduw	[%i1+8],%g4
	add	%i1,4,%i1		C s1_ptr++
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	bne,pt	%icc,.L_grt_2
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+72]
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fxtod	%f8,%f2
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	fmuld	%f2,%f6,%f6
	fdtox	%f4,%f4
	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	b	.L2
	add	%fp, -24, %l6

.L_grt_2:
	stx	%g2,[%fp+72]
	lduw	[%i1+8],%g4
	ld	[%i1],%f9
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	fxtod	%f8,%f2
	bne,pt	%icc,.L_grt_3
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	fdtox	%f6,%f6
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	fitod	%f0,%f4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	add	%fp, 80, %l3
	fmuld	%f2,%f6,%f6
	add	%fp, -24, %l4
	ldd	[%fp+80],%f0
	add	%fp, 72, %l5
	fdtox	%f4,%f4
	b	.L3
	add	%fp, -40, %l6

.L_grt_3:
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
	std	%f6,[%fp-16]
	bne,pt	%icc,.L_grt_4
	andn	%g2,%g5,%g2		C 0000hhhh0000llll

	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	fitod	%f1,%f6
	add	%fp, 72, %l3
	fmuld	%f2,%f4,%f4
	add	%fp, -40, %l4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	add	%fp, 80, %l5
	fdtox	%f4,%f4
	b	.L4
	add	%fp, -24, %l6

.L_grt_4:
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
	fmuld	%f2,%f4,%f4
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fdtox	%f4,%f4
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	fdtox	%f6,%f6
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
	std	%f6,[%fp-32]
	be,pn	%icc,.L5
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	b,a	.Loop

	.align	16
C ---  LOOP BEGIN
.Loop:	nop
	nop
	stx	%g2,[%fp+80]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-24],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-16],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+72],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-24]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-16]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	be,pn	%icc,.Lend
	fanop
C ---  LOOP MIDDLE
	nop
	nop
	stx	%g2,[%fp+72]
	fitod	%f0,%f4
C ---
	nop
	nop
	lduw	[%i1+8],%g4
	fitod	%f1,%f6
C ---
	nop
	nop
	ldx	[%fp-40],%g2		C p16
	fanop
C ---
	nop
	nop
	ldx	[%fp-32],%g1		C p0
	fmuld	%f2,%f4,%f4
C ---
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
C ---
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%fp+80],%f0
	fanop
C ---
	srlx	%g1,32,%l0
	nop
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
C ---
	sllx	%g4,16,%g3		C 0000hhhhllll0000
	nop
	stw	%l0,[%i0-4]
	fdtox	%f6,%f6
C ---
	or	%g3,%g4,%g2		C 0000hhhhXXXXllll
	subcc	%i2,1,%i2
	std	%f4,[%fp-40]
	fxtod	%f8,%f2
C ---
	std	%f6,[%fp-32]
	andn	%g2,%g5,%g2		C 0000hhhh0000llll
	bne,pt	%icc,.Loop
	fanop
C ---  LOOP END

.L5:	add	%fp, 80, %l3
	add	%fp, -24, %l4
	add	%fp, 72, %l5
	b	.Ltail
	add	%fp, -40, %l6

.Lend:	add	%fp, 72, %l3
	add	%fp, -40, %l4
	add	%fp, 80, %l5
	add	%fp, -24, %l6

.Ltail:	stx	%g2,[%l3]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	add	%i1,4,%i1		C s1_ptr++
	ldd	[%l5],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]

.L4:	fdtox	%f6,%f6
	std	%f4,[%l4]
	fxtod	%f8,%f2
	std	%f6,[%l4+8]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	ld	[%i1],%f9
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	ldd	[%l3],%f0
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]

.L3:	fdtox	%f6,%f6
	std	%f4,[%l6]
	fxtod	%f8,%f2
	std	%f6,[%l6+8]
	fitod	%f0,%f4
	fitod	%f1,%f6
	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	fmuld	%f2,%f4,%f4
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	fmuld	%f2,%f6,%f6
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	fdtox	%f4,%f4
	stw	%l0,[%i0-4]

.L2:	fdtox	%f6,%f6
	std	%f4,[%l4]
	std	%f6,[%l4+8]
	ldx	[%l6],%g2		C p16
	ldx	[%l6+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

.L1:	ldx	[%l4],%g2		C p16
	ldx	[%l4+8],%g1		C p0
	sllx	%g2,16,%g2		C align p16
	add	%i0,8,%i0		C res_ptr++
	add	%g2,%g1,%g1		C add p16 to p0 (ADD1)
	srlx	%g1,32,%l0
	stw	%g1,[%i0-8]
	stw	%l0,[%i0-4]

	ret
	restore	%g0,%g0,%o0
EPILOGUE(mpn_sqr_diagonal)
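
C For reference, a hypothetical C caller of this internal routine
C (illustrative values, assuming 32-bit limbs):
C
C	mp_limb_t up[2] = { 3, 0x10000 };
C	mp_limb_t rp[4];
C	mpn_sqr_diagonal (rp, up, 2);
C	/* now rp[] = { 9, 0, 0, 1 }, i.e. 3^2 = 9 and (2^16)^2 = 2^32 */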