dnl HP-PA 2.0 64-bit mpn_sqr_diagonal. dnl Copyright 2001, 2002, 2003 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published dnl by the Free Software Foundation; either version 3 of the License, or (at dnl your option) any later version. dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl You should have received a copy of the GNU Lesser General Public License dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. dnl This code runs at 7.25 cycles/limb on PA8000 and 7.75 cycles/limb on dnl PA8500. The cache would saturate at 5 cycles/limb, so there is some room dnl for optimization. include(`../config.m4') C INPUT PARAMETERS define(`rp',`%r26') define(`up',`%r25') define(`n',`%r24') define(`p00',`%r28') define(`p32',`%r29') define(`p64',`%r31') define(`t0',`%r19') define(`t1',`%r20') ifdef(`HAVE_ABI_2_0w', ` .level 2.0w ',` .level 2.0 ') PROLOGUE(mpn_sqr_diagonal) ldo 128(%r30),%r30 fldds,ma 8(up),%fr8 addib,= -1,n,L(end1) nop fldds,ma 8(up),%fr4 xmpyu %fr8l,%fr8r,%fr10 fstd %fr10,-120(%r30) xmpyu %fr8r,%fr8r,%fr9 fstd %fr9,0(rp) xmpyu %fr8l,%fr8l,%fr11 fstd %fr11,8(rp) addib,= -1,n,L(end2) ldo 16(rp),rp LDEF(loop) fldds,ma 8(up),%fr8 C load next up limb xmpyu %fr4l,%fr4r,%fr6 fstd %fr6,-128(%r30) xmpyu %fr4r,%fr4r,%fr5 C multiply in fp regs fstd %fr5,0(rp) xmpyu %fr4l,%fr4l,%fr7 fstd %fr7,8(rp) ldd -120(%r30),p32 ldd -16(rp),p00 C accumulate in int regs ldd -8(rp),p64 depd,z p32,30,31,t0 add t0,p00,p00 std p00,-16(rp) extrd,u p32,32,33,t1 add,dc t1,p64,p64 std p64,-8(rp) addib,= -1,n,L(exit) ldo 16(rp),rp fldds,ma 8(up),%fr4 xmpyu %fr8l,%fr8r,%fr10 fstd %fr10,-120(%r30) xmpyu %fr8r,%fr8r,%fr9 fstd %fr9,0(rp) xmpyu %fr8l,%fr8l,%fr11 fstd %fr11,8(rp) ldd -128(%r30),p32 ldd -16(rp),p00 ldd -8(rp),p64 depd,z p32,30,31,t0 add t0,p00,p00 std p00,-16(rp) extrd,u p32,32,33,t1 add,dc t1,p64,p64 std p64,-8(rp) addib,<> -1,n,L(loop) ldo 16(rp),rp LDEF(end2) xmpyu %fr4l,%fr4r,%fr6 fstd %fr6,-128(%r30) xmpyu %fr4r,%fr4r,%fr5 fstd %fr5,0(rp) xmpyu %fr4l,%fr4l,%fr7 fstd %fr7,8(rp) ldd -120(%r30),p32 ldd -16(rp),p00 ldd -8(rp),p64 depd,z p32,30,31,t0 add t0,p00,p00 std p00,-16(rp) extrd,u p32,32,33,t1 add,dc t1,p64,p64 std p64,-8(rp) ldo 16(rp),rp ldd -128(%r30),p32 ldd -16(rp),p00 ldd -8(rp),p64 depd,z p32,30,31,t0 add t0,p00,p00 std p00,-16(rp) extrd,u p32,32,33,t1 add,dc t1,p64,p64 std p64,-8(rp) bve (%r2) ldo -128(%r30),%r30 LDEF(exit) xmpyu %fr8l,%fr8r,%fr10 fstd %fr10,-120(%r30) xmpyu %fr8r,%fr8r,%fr9 fstd %fr9,0(rp) xmpyu %fr8l,%fr8l,%fr11 fstd %fr11,8(rp) ldd -128(%r30),p32 ldd -16(rp),p00 ldd -8(rp),p64 depd,z p32,31,32,t0 add t0,p00,p00 extrd,u p32,31,32,t1 add,dc t1,p64,p64 add t0,p00,p00 add,dc t1,p64,p64 std p00,-16(rp) std p64,-8(rp) ldo 16(rp),rp ldd -120(%r30),p32 ldd -16(rp),p00 ldd -8(rp),p64 depd,z p32,31,32,t0 add t0,p00,p00 extrd,u p32,31,32,t1 add,dc t1,p64,p64 add t0,p00,p00 add,dc t1,p64,p64 std p00,-16(rp) std p64,-8(rp) bve (%r2) ldo -128(%r30),%r30 LDEF(end1) xmpyu %fr8l,%fr8r,%fr10 fstd %fr10,-128(%r30) xmpyu %fr8r,%fr8r,%fr9 fstd %fr9,0(rp) xmpyu %fr8l,%fr8l,%fr11 fstd %fr11,8(rp) ldo 16(rp),rp ldd -128(%r30),p32 ldd -16(rp),p00 ldd -8(rp),p64 depd,z p32,31,32,t0 add t0,p00,p00 extrd,u p32,31,32,t1 add,dc t1,p64,p64 add t0,p00,p00 add,dc t1,p64,p64 std p00,-16(rp) std p64,-8(rp) bve (%r2) ldo -128(%r30),%r30 EPILOGUE(mpn_sqr_diagonal)