dnl  IA-64 mpn_sqr_diagonal.  Helper for sqr_basecase.

dnl  Copyright 2001, 2002, 2004 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C         cycles/limb
C Itanium:    4
C Itanium 2:  2

C TODO
C  * Perhaps avoid ctop loop.  Unfortunately, a cloop loop running at 1 c/l
C    would need prohibitive 8-way unrolling.
C  * Instead of messing too much with this, write a nifty mpn_sqr_basecase.

C INPUT PARAMETERS
C rp = r32
C sp = r33
C n = r34

C OPERATION
C  Each 8-byte limb read from sp is multiplied by itself and the two halves
C  of the product are stored to consecutive limbs of rp, low half first:
C    rp[2*i]   = low  64 bits of sp[i] * sp[i]
C    rp[2*i+1] = high 64 bits of sp[i] * sp[i]   (unsigned)
C  so 2*n limbs are written in total.
C  NOTE(review): n is presumably >= 1; with n = 0 the loop count written to
C  ar.lc would be -1.  Confirm against the callers.

C REGISTER USAGE
C  r2   saved ar.lc (restored before return)
C  r14  saved ar.ec (restored before return)
C  r15  saved pr    (restored before return)
C  r18  rp + 8, store pointer for the high product limbs
C  r19  n - 1, initial value of ar.lc
C  r32  store pointer for the low product limbs, r33 load pointer

C SOFTWARE PIPELINE
C  The loop is modulo scheduled with rotating registers and predicates.
C  Registers are renamed by one on every rotation, which is why a value
C  written to f32 is later read as f35, f36 as f38 and f40 as f42.
C    stage p16:  ldf8 loads the next sp limb into f32
C    stage p19:  xma.l and xma.hu form the low and high product halves from
C                f35 (the limb loaded three rotations earlier) into f36, f40
C    stage p21:  stf8 writes f38 and f42 (the halves from two rotations
C                earlier) through r32 and r18, each advancing by 16 bytes
C  ar.ec = 5 provides the extra rotations needed to drain the pipeline.
C  The trailing letters on instruction lines (M, I, I0, F, B, M2 M3) appear
C  to be execution unit annotations from the original author.

ASM_START()
PROLOGUE(mpn_sqr_diagonal)
	.prologue
	.save	ar.lc, r2
	.save	pr, r15
	.body
C Under the 32-bit ABI, widen both pointers with addp4 and zero extend n
C before they are used.
ifdef(`HAVE_ABI_32',
`	addp4		r32 = 0, r32
	addp4		r33 = 0, r33
	zxt4		r34 = r34
	;;
')
	ldf8		f32 = [r33], 8		C M	load sp[0] early
	mov		r2 = ar.lc		C I0	save ar.lc
	mov		r14 = ar.ec		C I0	save ar.ec
	mov		r15 = pr		C I0	save predicates
	add		r19 = -1, r34		C M I	decr n
	add		r18 = 8, r32		C M I	rp for high limb
	;;
	mov		ar.lc = r19		C I0	loop count = n - 1
	mov		ar.ec = 5		C I0	epilogue count
	mov		pr.rot = 1<<16		C I0	p16 = 1, other rotating preds = 0
	;;
C The limb loaded above already occupies the first pipeline stage, so one
C rotation step is taken here before entering the loop body.
C NOTE(review): with ar.ec = 5 this branch is not expected to be taken
C (matching the spnt hint); confirm against the br.cexit definition.
	br.cexit.spnt	.Ldone			C B
	;;
	ALIGN(32)
.Loop:
  (p16)	ldf8		f32 = [r33], 8		C M	load next sp limb
  (p19)	xma.l		f36 = f35, f35, f0	C F	low half of square
  (p21)	stf8		[r32] = f38, 16		C M2 M3	store low half
  (p19)	xma.hu		f40 = f35, f35, f0	C F	high half of square
  (p21)	stf8		[r18] = f42, 16		C M2 M3	store high half
	br.ctop.dptk	.Loop			C B
	;;
C The loop exits with the last product rotated into f38 and f42 but not yet
C stored, so its store stage is done here without a predicate.
.Ldone:
	stf8		[r32] = f38		C M2 M3	last low half
	stf8		[r18] = f42		C M2 M3	last high half
	mov		ar.ec = r14		C I0	restore ar.ec
	;;
	mov		pr = r15, 0x1ffff	C I0	restore predicates
	mov		ar.lc = r2		C I0	restore ar.lc
	br.ret.sptk.many b0			C B
EPILOGUE(mpn_sqr_diagonal)
ASM_END()