X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Fsparc32%2Fv9%2Fsqr_diagonal.asm;fp=gmp%2Fmpn%2Fsparc32%2Fv9%2Fsqr_diagonal.asm;h=e4a78c5de7a2be5de1fa32b71748b27dba613833;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git
diff --git a/gmp/mpn/sparc32/v9/sqr_diagonal.asm b/gmp/mpn/sparc32/v9/sqr_diagonal.asm
new file mode 100644
index 00000000..e4a78c5d
--- /dev/null
+++ b/gmp/mpn/sparc32/v9/sqr_diagonal.asm
@@ -0,0 +1,451 @@
+dnl SPARC v9 32-bit mpn_sqr_diagonal.
+
+dnl Copyright 2001, 2003 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+
+include(`../config.m4')
+
+C INPUT PARAMETERS
+C rp	i0
+C up	i1
+C n	i2
+
+C This code uses a very deep software pipeline, due to the need for moving data
+C forth and back between the integer registers and floating-point registers.
+C
+C A VIS variant of this code would make the pipeline less deep, since the
+C masking now done in the integer unit could take place in the floating-point
+C unit using the FAND instruction.  It would be possible to save several cycles
+C too.
+C
+C On UltraSPARC 1 and 2, this code runs at 11 cycles/limb from the Dcache and
+C not much slower from the Ecache.  It would perhaps be possible to shave off
+C one cycle, but not easily.  We cannot do better than 10 cycles/limb with the
+C used instructions, since we have 10 memory operations per limb.  But a VIS
+C variant could run three cycles faster than the corresponding non-VIS code.
+
+C This is non-pipelined code showing the algorithm:
+C
+C .Loop:
+C	lduw	[up+0],%g4	C 00000000hhhhllll
+C	sllx	%g4,16,%g3	C 0000hhhhllll0000
+C	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+C	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+C	stx	%g2,[%fp+80]
+C	ldd	[%fp+80],%f0
+C	fitod	%f0,%f4		C hi16
+C	fitod	%f1,%f6		C lo16
+C	ld	[up+0],%f9
+C	fxtod	%f8,%f2
+C	fmuld	%f2,%f4,%f4
+C	fmuld	%f2,%f6,%f6
+C	fdtox	%f4,%f4
+C	fdtox	%f6,%f6
+C	std	%f4,[%fp-24]
+C	std	%f6,[%fp-16]
+C	ldx	[%fp-24],%g2
+C	ldx	[%fp-16],%g1
+C	sllx	%g2,16,%g2
+C	add	%g2,%g1,%g1
+C	stw	%g1,[rp+0]
+C	srlx	%g1,32,%l0
+C	stw	%l0,[rp+4]
+C	add	up,4,up
+C	subcc	n,1,n
+C	bne,pt	%icc,.Loop
+C	add	rp,8,rp
+
+define(`fanop',`fitod %f12,%f10') dnl A quasi nop running in the FA pipe
+
+ASM_START()
+
+	TEXT
+	ALIGN(4)
+.Lnoll:
+	.word	0		C zero; loaded into %f8 so the pair %f8:%f9 reads as the limb zero-extended to 64 bits (consumed by fxtod)
+
+C Reviewer notes (documentation only; code unchanged):
+C
+C   void mpn_sqr_diagonal (mp_ptr rp, mp_srcptr up, mp_size_t n)
+C
+C For each 32-bit limb up[i], the 64-bit square up[i]*up[i] is stored as
+C two 32-bit words at rp[2i] (low) and rp[2i+1] (high); rp advances 8
+C bytes and up advances 4 bytes per limb, exactly as in the reference
+C loop above.  The limb is split as hh*2^16 + ll; the FPU forms the two
+C exact double products hh*limb (called p16 in the comments) and ll*limb
+C (p0), and the integer unit recombines them as (p16 << 16) + p0.
+C
+C Scratch slots in the stack frame (set up by save %sp,-256,%sp):
+C   [%fp+72], [%fp+80]           8-byte slots where the masked limb pair is
+C                                stx'd so it can be ldd'd into %f0/%f1
+C   [%fp-24]/[%fp-16], [%fp-40]/[%fp-32]   fdtox result slots (p16/p0)
+C %l3..%l6 hold pointers to these slots; the software pipeline alternates
+C between the two slot sets on successive limbs, and the .L_grt_* entry
+C code below replicates the pipeline fill for each small n so the common
+C tail (.Ltail/.L4/.L3/.L2/.L1) can drain it.
+C %g5 holds the constant 0xffff0000 used by andn to clear the X bits.
+
+PROLOGUE(mpn_sqr_diagonal)
+	save	%sp,-256,%sp
+
+ifdef(`PIC',
+`.Lpc:	rd	%pc,%o7
+	ld	[%o7+.Lnoll-.Lpc],%f8',
+`	sethi	%hi(.Lnoll),%g1
+	ld	[%g1+%lo(.Lnoll)],%f8')
+
+	sethi	%hi(0xffff0000),%g5
+	add	%i1,-8,%i1
+
+	lduw	[%i1+8],%g4
+	add	%i1,4,%i1	C s1_ptr++
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	bne,pt	%icc,.L_grt_1
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+
+	add	%i1,4,%i1	C s1_ptr++
+	stx	%g2,[%fp+80]
+	ld	[%i1],%f9
+	ldd	[%fp+80],%f0
+	fxtod	%f8,%f2
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	fmuld	%f2,%f6,%f6
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	std	%f6,[%fp-16]
+
+	add	%fp, 80, %l3
+	add	%fp, -24, %l4
+	add	%fp, 72, %l5
+	b	.L1
+	add	%fp, -40, %l6
+
+.L_grt_1:			C here n >= 2
+	stx	%g2,[%fp+80]
+	lduw	[%i1+8],%g4
+	add	%i1,4,%i1	C s1_ptr++
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	bne,pt	%icc,.L_grt_2
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+
+	stx	%g2,[%fp+72]
+	ld	[%i1],%f9
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+80],%f0
+	fxtod	%f8,%f2
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	fmuld	%f2,%f6,%f6
+	fdtox	%f4,%f4
+
+	add	%fp, 72, %l3
+	add	%fp, -40, %l4
+	add	%fp, 80, %l5
+	b	.L2
+	add	%fp, -24, %l6
+
+.L_grt_2:			C here n >= 3
+	stx	%g2,[%fp+72]
+	lduw	[%i1+8],%g4
+	ld	[%i1],%f9
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+80],%f0
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	fxtod	%f8,%f2
+	bne,pt	%icc,.L_grt_3
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	fdtox	%f6,%f6
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	add	%fp, 80, %l3
+	fmuld	%f2,%f6,%f6
+	add	%fp, -24, %l4
+	ldd	[%fp+80],%f0
+	add	%fp, 72, %l5
+	fdtox	%f4,%f4
+	b	.L3
+	add	%fp, -40, %l6
+
+.L_grt_3:			C here n >= 4
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+72],%f0
+	fdtox	%f4,%f4
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	fdtox	%f6,%f6
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-16]
+	bne,pt	%icc,.L_grt_4
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	add	%fp, 72, %l3
+	fmuld	%f2,%f4,%f4
+	add	%fp, -40, %l4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+80],%f0
+	add	%fp, 80, %l5
+	fdtox	%f4,%f4
+	b	.L4
+	add	%fp, -24, %l6
+
+.L_grt_4:			C here n >= 5: fall into the pipelined loop
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+	fmuld	%f2,%f4,%f4
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+80],%f0
+	fdtox	%f4,%f4
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	fdtox	%f6,%f6
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-40]
+	fxtod	%f8,%f2
+	std	%f6,[%fp-32]
+	be,pn	%icc,.L5
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+
+	b,a	.Loop
+
+	.align	16
+C --- LOOP BEGIN
+.Loop:	nop
+	nop
+	stx	%g2,[%fp+80]
+	fitod	%f0,%f4
+C ---
+	nop
+	nop
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+C ---
+	nop
+	nop
+	ldx	[%fp-24],%g2	C p16
+	fanop
+C ---
+	nop
+	nop
+	ldx	[%fp-16],%g1	C p0
+	fmuld	%f2,%f4,%f4
+C ---
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+C ---
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+72],%f0
+	fanop
+C ---
+	srlx	%g1,32,%l0
+	nop
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+C ---
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	nop
+	stw	%l0,[%i0-4]
+	fdtox	%f6,%f6
+C ---
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-24]
+	fxtod	%f8,%f2
+C ---
+	std	%f6,[%fp-16]
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+	be,pn	%icc,.Lend
+	fanop
+C --- LOOP MIDDLE
+	nop
+	nop
+	stx	%g2,[%fp+72]
+	fitod	%f0,%f4
+C ---
+	nop
+	nop
+	lduw	[%i1+8],%g4
+	fitod	%f1,%f6
+C ---
+	nop
+	nop
+	ldx	[%fp-40],%g2	C p16
+	fanop
+C ---
+	nop
+	nop
+	ldx	[%fp-32],%g1	C p0
+	fmuld	%f2,%f4,%f4
+C ---
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+C ---
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%fp+80],%f0
+	fanop
+C ---
+	srlx	%g1,32,%l0
+	nop
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+C ---
+	sllx	%g4,16,%g3	C 0000hhhhllll0000
+	nop
+	stw	%l0,[%i0-4]
+	fdtox	%f6,%f6
+C ---
+	or	%g3,%g4,%g2	C 0000hhhhXXXXllll
+	subcc	%i2,1,%i2
+	std	%f4,[%fp-40]
+	fxtod	%f8,%f2
+C ---
+	std	%f6,[%fp-32]
+	andn	%g2,%g5,%g2	C 0000hhhh0000llll
+	bne,pt	%icc,.Loop
+	fanop
+C --- LOOP END
+
+C Tail: the two entries below load %l3..%l6 with the scratch-slot pointers
+C matching the pipeline phase at loop exit, then fall into the shared drain.
+.L5:	add	%fp, 80, %l3
+	add	%fp, -24, %l4
+	add	%fp, 72, %l5
+	b	.Ltail
+	add	%fp, -40, %l6
+
+.Lend:	add	%fp, 72, %l3
+	add	%fp, -40, %l4
+	add	%fp, 80, %l5
+	add	%fp, -24, %l6
+.Ltail:	stx	%g2,[%l3]
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l4],%g2	C p16
+	ldx	[%l4+8],%g1	C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	add	%i1,4,%i1	C s1_ptr++
+	ldd	[%l5],%f0
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L4:	fdtox	%f6,%f6	C drain stage; entered directly for n==4
+	std	%f4,[%l4]
+	fxtod	%f8,%f2
+	std	%f6,[%l4+8]
+
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l6],%g2	C p16
+	ldx	[%l6+8],%g1	C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	ld	[%i1],%f9
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	ldd	[%l3],%f0
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L3:	fdtox	%f6,%f6	C drain stage; entered directly for n==3
+	std	%f4,[%l6]
+	fxtod	%f8,%f2
+	std	%f6,[%l6+8]
+
+	fitod	%f0,%f4
+	fitod	%f1,%f6
+	ldx	[%l4],%g2	C p16
+	ldx	[%l4+8],%g1	C p0
+	fmuld	%f2,%f4,%f4
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	fmuld	%f2,%f6,%f6
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	fdtox	%f4,%f4
+	stw	%l0,[%i0-4]
+.L2:	fdtox	%f6,%f6	C drain stage; entered directly for n==2
+	std	%f4,[%l4]
+	std	%f6,[%l4+8]
+
+	ldx	[%l6],%g2	C p16
+	ldx	[%l6+8],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	stw	%l0,[%i0-4]
+
+.L1:	ldx	[%l4],%g2	C p16; entered directly for n==1
+	ldx	[%l4+8],%g1	C p0
+	sllx	%g2,16,%g2	C align p16
+	add	%i0,8,%i0	C res_ptr++
+	add	%g2,%g1,%g1	C add p16 to p0 (ADD1)
+	srlx	%g1,32,%l0
+	stw	%g1,[%i0-8]
+	stw	%l0,[%i0-4]
+
+	ret
+	restore	%g0,%g0,%o0
+
+EPILOGUE(mpn_sqr_diagonal)