X-Git-Url: https://oss.titaniummirror.com/gitweb?a=blobdiff_plain;f=gmp%2Fmpn%2Fsparc64%2Faddmul_1.asm;fp=gmp%2Fmpn%2Fsparc64%2Faddmul_1.asm;h=bd83c6562c1298a67268b167e1ba470b1346d1d8;hb=6fed43773c9b0ce596dca5686f37ac3fc0fa11c0;hp=0000000000000000000000000000000000000000;hpb=27b11d56b743098deb193d510b337ba22dc52e5c;p=msp430-gcc.git diff --git a/gmp/mpn/sparc64/addmul_1.asm b/gmp/mpn/sparc64/addmul_1.asm new file mode 100644 index 00000000..bd83c656 --- /dev/null +++ b/gmp/mpn/sparc64/addmul_1.asm @@ -0,0 +1,596 @@ +dnl SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add +dnl the result to a second limb vector. + +dnl Copyright 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation, +dnl Inc. + +dnl This file is part of the GNU MP Library. + +dnl The GNU MP Library is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU Lesser General Public License as published +dnl by the Free Software Foundation; either version 3 of the License, or (at +dnl your option) any later version. + +dnl The GNU MP Library is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public +dnl License for more details. + +dnl You should have received a copy of the GNU Lesser General Public License +dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. + +include(`../config.m4') + +C cycles/limb +C UltraSPARC 1&2: 14 +C UltraSPARC 3: 17.5 + +C Algorithm: We use eight floating-point multiplies per limb product, with the +C invariant v operand split into four 16-bit pieces, and the up operand split +C into 32-bit pieces. We sum pairs of 48-bit partial products using +C floating-point add, then convert the four 49-bit product-sums and transfer +C them to the integer unit. + +C Possible optimizations: +C 0. Rewrite to use algorithm of mpn_addmul_2. +C 1. Align the stack area where we transfer the four 49-bit product-sums +C to a 32-byte boundary. That would minimize the cache collision. +C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would +C be to align the area to map to the area immediately before up?) +C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the +C develop mpn_addmul_2. This would save many integer instructions. +C 3. Unrolling. Questionable if it is worth the code expansion, given that +C it could only save 1 cycle/limb. +C 4. Specialize for particular v values. If its upper 32 bits are zero, we +C could save many operations, in the FPU (fmuld), but more so in the IEU +C since we'll be summing 48-bit quantities, which might be simpler. +C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and +C the i00,i16,i32,i48 RAW less apart. The latter apart-scheduling should +C not be greater than needed for L2 cache latency, and also not so great +C that i16 needs to be copied. +C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want +C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU +C ops.) + +C Instruction classification (as per UltraSPARC-1/2 functional units): +C 8 FM +C 10 FA +C 12 MEM +C 10 ISHIFT + 14 IADDLOG +C 1 BRANCH +C 55 insns totally (plus one mov insn that should be optimized out) + +C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e we +C sustain the peak execution rate of 4 instructions/cycle. + +C INPUT PARAMETERS +C rp i0 +C up i1 +C n i2 +C v i3 + +ASM_START() + REGISTER(%g2,#scratch) + REGISTER(%g3,#scratch) + +define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14') +define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22') +define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30') +define(`u00',`%f32') define(`u32', `%f34') +define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42') +define(`cy',`%g1') +define(`rlimb',`%g3') +define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3') +define(`xffffffff',`%l7') +define(`xffff',`%o0') + +PROLOGUE(mpn_addmul_1) + +C Initialization. (1) Split v operand into four 16-bit chunks and store them +C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs +C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'. + + save %sp, -256, %sp + mov -1, %g4 + srlx %g4, 48, xffff C store mask in register `xffff' + and %i3, xffff, %g2 + stx %g2, [%sp+2223+0] + srlx %i3, 16, %g3 + and %g3, xffff, %g3 + stx %g3, [%sp+2223+8] + srlx %i3, 32, %g2 + and %g2, xffff, %g2 + stx %g2, [%sp+2223+16] + srlx %i3, 48, %g3 + stx %g3, [%sp+2223+24] + srlx %g4, 32, xffffffff C store mask in register `xffffffff' + + sllx %i2, 3, %i2 + mov 0, cy C clear cy + add %i0, %i2, %i0 + add %i1, %i2, %i1 + neg %i2 + add %i1, 4, %i5 + add %i0, -32, %i4 + add %i0, -16, %i0 + + ldd [%sp+2223+0], v00 + ldd [%sp+2223+8], v16 + ldd [%sp+2223+16], v32 + ldd [%sp+2223+24], v48 + ld [%sp+2223+0],%f2 C zero f2 + ld [%sp+2223+0],%f4 C zero f4 + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fxtod v00, v00 + fxtod v16, v16 + fxtod v32, v32 + fxtod v48, v48 + +C Start real work. (We sneakingly read f3 and f5 above...) +C The software pipeline is very deep, requiring 4 feed-in stages. + + fxtod %f2, u00 + fxtod %f4, u32 + fmuld u00, v00, a00 + fmuld u00, v16, a16 + fmuld u00, v32, p32 + fmuld u32, v00, r32 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %icc, .L_two_or_more + fmuld u32, v16, r48 + +.L_one: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + ldx [%i0+%i2], rlimb C read rp[i] + fdtox r80, a16 + ldx [%sp+2223+0], i00 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + add %i2, 8, %i2 + + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + add i00, %g5, %g5 C i00+ now in g5 + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_1 + add %i2, 8, %i2 + +.L_two_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + fdtox a32, a32 + fxtod %f2, u00 + fxtod %f4, u32 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %icc, .L_three_or_more + fmuld u32, v16, r48 + +.L_two: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + ldx [%sp+2223+16], i32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + std a16, [%sp+2223+8] + std a32, [%sp+2223+16] + std a48, [%sp+2223+24] + add %i2, 8, %i2 + + fdtox r64, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + add i00, %g5, %g5 C i00+ now in g5 + fdtox r80, a16 + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_2 + add %i2, 8, %i2 + +.L_three_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + faddd p16, r80, a16 + fmuld u00, v48, p48 + addcc %i2, 8, %i2 + bnz,pt %icc, .L_four_or_more + fmuld u32, v16, r48 + +.L_three: + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + fdtox a00, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + add cy, %g5, %o4 C x = prev(i00) + cy + b .L_out_3 + add %i2, 8, %i2 + +.L_four_or_more: + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 + sllx i48, 32, %l6 C (i48 << 32) + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + faddd p16, r80, a16 + fmuld u00, v48, p48 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %icc, .Loop + fmuld u32, v16, r48 + +.L_four: + b,a .L_out_4 + +C BEGIN MAIN LOOP + .align 16 +.Loop: +C 00 + srlx %o4, 16, %o5 C (x >> 16) + ld [%i5+%i2], %f3 C read low 32 bits of up[i] + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 +C 01 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + ld [%i1+%i2], %f5 C read high 32 bits of up[i] + fdtox a00, a00 +C 02 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 +C 03 + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 +C 04 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 +C 05 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + fxtod %f2, u00 +C 06 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + fxtod %f4, u32 +C 07 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 +C 08 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + fmuld u00, v00, p00 +C 09 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + fmuld u00, v16, p16 +C 10 + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + fmuld u00, v32, p32 +C 11 + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + faddd p00, r64, a00 + fmuld u32, v00, r32 +C 12 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + faddd p16, r80, a16 + fmuld u00, v48, p48 +C 13 + add cy, %g5, %o4 C x = prev(i00) + cy + addcc %i2, 8, %i2 + bnz,pt %icc, .Loop + fmuld u32, v16, r48 +C END MAIN LOOP + +.L_out_4: + srlx %o4, 16, %o5 C (x >> 16) + fmuld u32, v32, r64 C FIXME not urgent + faddd p32, r32, a32 + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox a00, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + faddd p48, r48, a48 + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + fmuld u32, v48, r80 C FIXME not urgent + fdtox a16, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + fdtox a32, a32 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + fdtox a48, a48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + std a32, [%sp+2223+16] + add %l6, %o2, %o2 C mi64- in %o2 + std a48, [%sp+2223+24] + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_3: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + fdtox r64, a00 + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + ldx [%i0+%i2], rlimb C read rp[i] + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + fdtox r80, a16 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + ldx [%sp+2223+16], i32 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + ldx [%sp+2223+24], i48 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + std a00, [%sp+2223+0] + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + std a16, [%sp+2223+8] + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_2: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx rlimb, 32, %g4 C HI(rlimb) + and rlimb, xffffffff, %g5 C LO(rlimb) + srlx %o2, 48, %o7 C (mi64 >> 48) + add i00, %g5, %g5 C i00+ now in g5 + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + ldx [%sp+2223+0], i00 + srlx i16, 48, %l4 C (i16 >> 48) + mov i16, %g2 + ldx [%sp+2223+8], i16 + srlx i48, 16, %l5 C (i48 >> 16) + add i32, %g4, %g4 C i32+ now in g4 + sllx i48, 32, %l6 C (i48 << 32) + or %i3, %o5, %o5 + srlx %g4, 32, %o3 C (i32 >> 32) + add %l5, %l4, %o1 C hi64- in %o1 + sllx %g4, 16, %o2 C (i32 << 16) + add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT + sllx %o1, 48, %o3 C (hi64 << 48) + add %g2, %o2, %o2 C mi64- in %o2 + add %l6, %o2, %o2 C mi64- in %o2 + sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT + stx %o5, [%i4+%i2] + add cy, %g5, %o4 C x = prev(i00) + cy + add %i2, 8, %i2 +.L_out_1: + srlx %o4, 16, %o5 C (x >> 16) + add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT + and %o4, xffff, %o5 C (x & 0xffff) + srlx %o2, 48, %o7 C (mi64 >> 48) + sllx %o2, 16, %i3 C (mi64 << 16) + add %o7, %o1, cy C new cy + or %i3, %o5, %o5 + stx %o5, [%i4+%i2] + + sllx i00, 0, %g2 + add %g2, cy, cy + sllx i16, 16, %g3 + add %g3, cy, cy + + return %i7+8 + mov cy, %o0 +EPILOGUE(mpn_addmul_1)