diff --git a/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm b/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
new file mode 100644
index 00000000..2628e5eb
--- /dev/null
+++ b/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
@@ -0,0 +1,651 @@
+dnl  mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
+C    scheduling could improve things by several cycles per outer iteration.
+C  * In code for un <= 3, try keeping accumulation operands in registers,
+C    without storing intermediates to rp.
+C  * We might want to keep 32 in a free mm register, since the register form
+C    is 3 bytes and the immediate form is 4 bytes.  About 70 bytes to save.
+C  * Look into different loop alignment; we now expand the code by about 50
+C    bytes with possibly needless alignment.
+C  * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
+C  * Use OSP; it should solve the feed-in latency problems.
+C  * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
+C  * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
+C    so that they can share feed-in code, and changing the branch targets from
+C    L to Lm.
+
+C                            cycles/limb
+C P6 model 9  (Banias)           ?
+C P6 model 13 (Dothan)           5.24
+C P6 model 14 (Yonah)            ?
+C P4 model 0-1 (Willamette): 5 +C P4 model 2 (Northwood): 4.60 at 32 limbs +C P4 model 3-4 (Prescott): 4.94 at 32 limbs + +C INPUT PARAMETERS +C rp sp + 4 +C up sp + 8 +C un sp + 12 +C vp sp + 16 +C vn sp + 20 + + TEXT + ALIGN(16) +PROLOGUE(mpn_mul_basecase) + push %esi + push %ebx + mov 12(%esp), %edx C rp + mov 16(%esp), %eax C up + mov 20(%esp), %ecx C un + mov 24(%esp), %esi C vp + mov 28(%esp), %ebx C vn + movd (%esi), %mm7 C +L(ent): cmp $3, %ecx + ja L(big) + movd (%eax), %mm6 + pmuludq %mm7, %mm6 + jz L(un3) + cmp $2, %ecx + jz L(un2) + +L(un1): movd %mm6, (%edx) C un=1 + psrlq $32, %mm6 C un=1 + movd %mm6, 4(%edx) C un=1 + jmp L(rtr) C un=1 + +L(un2): movd 4(%eax), %mm1 C un=2 + pmuludq %mm7, %mm1 C un=2 + movd %mm6, (%edx) C un=2 + psrlq $32, %mm6 C un=2 + paddq %mm1, %mm6 C un=2 + movd %mm6, 4(%edx) C un=2 + psrlq $32, %mm6 C un=2 + movd %mm6, 8(%edx) C un=2 + dec %ebx C un=2 + jz L(rtr) C un=2 + movd 4(%esi), %mm7 C un=2 + movd (%eax), %mm6 C un=2 + pmuludq %mm7, %mm6 C un=2 + movd 4(%eax), %mm1 C un=2 + movd 4(%edx), %mm4 C un=2 + pmuludq %mm7, %mm1 C un=2 + movd 8(%edx), %mm5 C un=2 + paddq %mm4, %mm6 C un=2 + paddq %mm1, %mm5 C un=2 + movd %mm6, 4(%edx) C un=2 + psrlq $32, %mm6 C un=2 + paddq %mm5, %mm6 C un=2 + movd %mm6, 8(%edx) C un=2 + psrlq $32, %mm6 C un=2 + movd %mm6, 12(%edx) C un=2 +L(rtr): emms + pop %ebx + pop %esi + ret + +L(un3): movd 4(%eax), %mm1 C un=3 + pmuludq %mm7, %mm1 C un=3 + movd 8(%eax), %mm2 C un=3 + pmuludq %mm7, %mm2 C un=3 + movd %mm6, (%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm1, %mm6 C un=3 + movd %mm6, 4(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm2, %mm6 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 12(%edx) C un=3 + dec %ebx C un=3 + jz L(rtr) C un=3 + movd 4(%esi), %mm7 C un=3 + movd (%eax), %mm6 C un=3 + pmuludq %mm7, %mm6 C un=3 + movd 4(%eax), %mm1 C un=3 + movd 4(%edx), %mm4 C un=3 + pmuludq %mm7, %mm1 C un=3 + movd 8(%eax), %mm2 C un=3 + movd 8(%edx), %mm5 C un=3 + pmuludq %mm7, %mm2 C un=3 + paddq %mm4, %mm6 C un=3 + paddq %mm1, %mm5 C un=3 + movd 12(%edx), %mm4 C un=3 + movd %mm6, 4(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm5, %mm6 C un=3 + paddq %mm2, %mm4 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm4, %mm6 C un=3 + movd %mm6, 12(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 16(%edx) C un=3 + dec %ebx C un=3 + jz L(rtr) C un=3 + movd 8(%esi), %mm7 C un=3 + movd (%eax), %mm6 C un=3 + pmuludq %mm7, %mm6 C un=3 + movd 4(%eax), %mm1 C un=3 + movd 8(%edx), %mm4 C un=3 + pmuludq %mm7, %mm1 C un=3 + movd 8(%eax), %mm2 C un=3 + movd 12(%edx), %mm5 C un=3 + pmuludq %mm7, %mm2 C un=3 + paddq %mm4, %mm6 C un=3 + paddq %mm1, %mm5 C un=3 + movd 16(%edx), %mm4 C un=3 + movd %mm6, 8(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm5, %mm6 C un=3 + paddq %mm2, %mm4 C un=3 + movd %mm6, 12(%edx) C un=3 + psrlq $32, %mm6 C un=3 + paddq %mm4, %mm6 C un=3 + movd %mm6, 16(%edx) C un=3 + psrlq $32, %mm6 C un=3 + movd %mm6, 20(%edx) C un=3 + jmp L(rtr) + + +L(big): push %edi + pxor %mm6, %mm6 + lea 4(%esi), %esi + and $3, %ecx + jz L(0) + cmp $2, %ecx + jc L(1) + jz L(2) + jmp L(3) C FIXME: one case should fall through + + +L(0): movd (%eax), %mm3 C m 0 + sub 24(%esp), %ecx C inner loop count m 0 + mov %ecx, 24(%esp) C update loop count for later m 0 + pmuludq %mm7, %mm3 C m 0 + movd 4(%eax), %mm0 C m 0 + pmuludq %mm7, %mm0 C m 0 + movd 8(%eax), %mm1 C m 0 + jmp L(m00) C m 0 + ALIGN(16) C m 0 +L(lpm0): + pmuludq %mm7, %mm4 C m 0 + paddq %mm0, %mm6 C m 0 + movd (%eax), %mm3 C m 0 
+ movd %mm6, -12(%edx) C m 0 + psrlq $32, %mm6 C m 0 + pmuludq %mm7, %mm3 C m 0 + paddq %mm1, %mm6 C m 0 + movd 4(%eax), %mm0 C m 0 + movd %mm6, -8(%edx) C m 0 + psrlq $32, %mm6 C m 0 + pmuludq %mm7, %mm0 C m 0 + paddq %mm4, %mm6 C m 0 + movd 8(%eax), %mm1 C m 0 + movd %mm6, -4(%edx) C m 0 + psrlq $32, %mm6 C m 0 +L(m00): pmuludq %mm7, %mm1 C m 0 + paddq %mm3, %mm6 C m 0 + movd 12(%eax), %mm4 C m 0 + movd %mm6, (%edx) C m 0 + psrlq $32, %mm6 C m 0 + lea 16(%eax), %eax C m 0 + lea 16(%edx), %edx C m 0 + add $4, %ecx C m 0 + ja L(lpm0) C m 0 + pmuludq %mm7, %mm4 C m 0 + paddq %mm0, %mm6 C m 0 + movd %mm6, -12(%edx) C m 0 + psrlq $32, %mm6 C m 0 + paddq %mm1, %mm6 C m 0 + mov 16(%esp), %edi C rp 0 + jmp L(x0) + +L(olp0): + lea 4(%edi), %edi C am 0 + movd (%esi), %mm7 C am 0 + lea 4(%esi), %esi C am 0 + mov %edi, %edx C rp am 0 + mov 20(%esp), %eax C up am 0 + movd (%eax), %mm3 C am 0 + mov 24(%esp), %ecx C inner loop count am 0 + pxor %mm6, %mm6 C am 0 + pmuludq %mm7, %mm3 C am 0 + movd 4(%eax), %mm0 C am 0 + movd (%edx), %mm5 C am 0 + pmuludq %mm7, %mm0 C am 0 + movd 8(%eax), %mm1 C am 0 + paddq %mm3, %mm5 C am 0 + movd 4(%edx), %mm4 C am 0 + jmp L(am00) C am 0 + ALIGN(16) C mm 0 +L(lam0): + pmuludq %mm7, %mm2 C am 0 + paddq %mm4, %mm6 C am 0 + movd (%eax), %mm3 C am 0 + paddq %mm1, %mm5 C am 0 + movd -4(%edx), %mm4 C am 0 + movd %mm6, -12(%edx) C am 0 + psrlq $32, %mm6 C am 0 + pmuludq %mm7, %mm3 C am 0 + paddq %mm5, %mm6 C am 0 + movd 4(%eax), %mm0 C am 0 + paddq %mm2, %mm4 C am 0 + movd (%edx), %mm5 C am 0 + movd %mm6, -8(%edx) C am 0 + psrlq $32, %mm6 C am 0 + pmuludq %mm7, %mm0 C am 0 + paddq %mm4, %mm6 C am 0 + movd 8(%eax), %mm1 C am 0 + paddq %mm3, %mm5 C am 0 + movd 4(%edx), %mm4 C am 0 + movd %mm6, -4(%edx) C am 0 + psrlq $32, %mm6 C am 0 +L(am00): + pmuludq %mm7, %mm1 C am 0 + paddq %mm5, %mm6 C am 0 + movd 12(%eax), %mm2 C am 0 + paddq %mm0, %mm4 C am 0 + movd 8(%edx), %mm5 C am 0 + movd %mm6, (%edx) C am 0 + psrlq $32, %mm6 C am 0 + lea 16(%eax), %eax C am 0 + lea 16(%edx), %edx C am 0 + add $4, %ecx C am 0 + jnz L(lam0) C am 0 + pmuludq %mm7, %mm2 C am 0 + paddq %mm4, %mm6 C am 0 + paddq %mm1, %mm5 C am 0 + movd -4(%edx), %mm4 C am 0 + movd %mm6, -12(%edx) C am 0 + psrlq $32, %mm6 C am 0 + paddq %mm5, %mm6 C am 0 + paddq %mm2, %mm4 C am 0 +L(x0): movd %mm6, -8(%edx) C am 0 + psrlq $32, %mm6 C am 0 + paddq %mm4, %mm6 C am 0 + movd %mm6, -4(%edx) C am 0 + psrlq $32, %mm6 C am 0 + movd %mm6, (%edx) C am 0 + dec %ebx C am 0 + jnz L(olp0) C am 0 +L(oel0): + emms C 0 + pop %edi C 0 + pop %ebx C 0 + pop %esi C 0 + ret C 0 + + +L(1): movd (%eax), %mm4 C m 1 + sub 24(%esp), %ecx C m 1 + mov %ecx, 24(%esp) C update loop count for later m 1 + pmuludq %mm7, %mm4 C m 1 + movd 4(%eax), %mm3 C m 1 + pmuludq %mm7, %mm3 C m 1 + movd 8(%eax), %mm0 C m 1 + jmp L(m01) C m 1 + ALIGN(16) C m 1 +L(lpm1): + pmuludq %mm7, %mm4 C m 1 + paddq %mm0, %mm6 C m 1 + movd 4(%eax), %mm3 C m 1 + movd %mm6, -8(%edx) C m 1 + psrlq $32, %mm6 C m 1 + pmuludq %mm7, %mm3 C m 1 + paddq %mm1, %mm6 C m 1 + movd 8(%eax), %mm0 C m 1 + movd %mm6, -4(%edx) C m 1 + psrlq $32, %mm6 C m 1 +L(m01): pmuludq %mm7, %mm0 C m 1 + paddq %mm4, %mm6 C m 1 + movd 12(%eax), %mm1 C m 1 + movd %mm6, (%edx) C m 1 + psrlq $32, %mm6 C m 1 + pmuludq %mm7, %mm1 C m 1 + paddq %mm3, %mm6 C m 1 + movd 16(%eax), %mm4 C m 1 + movd %mm6, 4(%edx) C m 1 + psrlq $32, %mm6 C m 1 + lea 16(%eax), %eax C m 1 + lea 16(%edx), %edx C m 1 + add $4, %ecx C m 1 + ja L(lpm1) C m 1 + pmuludq %mm7, %mm4 C m 1 + paddq %mm0, %mm6 C m 1 + movd %mm6, -8(%edx) C m 1 + 
psrlq $32, %mm6 C m 1 + paddq %mm1, %mm6 C m 1 + mov 16(%esp), %edi C rp 1 + jmp L(x1) + +L(olp1): + lea 4(%edi), %edi C am 1 + movd (%esi), %mm7 C am 1 + lea 4(%esi), %esi C am 1 + mov %edi, %edx C rp am 1 + mov 20(%esp), %eax C up am 1 + movd (%eax), %mm2 C am 1 + mov 24(%esp), %ecx C inner loop count am 1 + pxor %mm6, %mm6 C am 1 + pmuludq %mm7, %mm2 C am 1 + movd 4(%eax), %mm3 C am 1 + movd (%edx), %mm4 C am 1 + pmuludq %mm7, %mm3 C am 1 + movd 8(%eax), %mm0 C am 1 + paddq %mm2, %mm4 C am 1 + movd 4(%edx), %mm5 C am 1 + jmp L(am01) C am 1 + ALIGN(16) C am 1 +L(lam1): + pmuludq %mm7, %mm2 C am 1 + paddq %mm4, %mm6 C am 1 + movd 4(%eax), %mm3 C am 1 + paddq %mm1, %mm5 C am 1 + movd (%edx), %mm4 C am 1 + movd %mm6, -8(%edx) C am 1 + psrlq $32, %mm6 C am 1 + pmuludq %mm7, %mm3 C am 1 + paddq %mm5, %mm6 C am 1 + movd 8(%eax), %mm0 C am 1 + paddq %mm2, %mm4 C am 1 + movd 4(%edx), %mm5 C am 1 + movd %mm6, -4(%edx) C am 1 + psrlq $32, %mm6 C am 1 +L(am01): + pmuludq %mm7, %mm0 C am 1 + paddq %mm4, %mm6 C am 1 + movd 12(%eax), %mm1 C am 1 + paddq %mm3, %mm5 C am 1 + movd 8(%edx), %mm4 C am 1 + movd %mm6, (%edx) C am 1 + psrlq $32, %mm6 C am 1 + pmuludq %mm7, %mm1 C am 1 + paddq %mm5, %mm6 C am 1 + movd 16(%eax), %mm2 C am 1 + paddq %mm0, %mm4 C am 1 + movd 12(%edx), %mm5 C am 1 + movd %mm6, 4(%edx) C am 1 + psrlq $32, %mm6 C am 1 + lea 16(%eax), %eax C am 1 + lea 16(%edx), %edx C am 1 + add $4, %ecx C am 1 + jnz L(lam1) C am 1 + pmuludq %mm7, %mm2 C am 1 + paddq %mm4, %mm6 C am 1 + paddq %mm1, %mm5 C am 1 + movd (%edx), %mm4 C am 1 + movd %mm6, -8(%edx) C am 1 + psrlq $32, %mm6 C am 1 + paddq %mm5, %mm6 C am 1 + paddq %mm2, %mm4 C am 1 +L(x1): movd %mm6, -4(%edx) C am 1 + psrlq $32, %mm6 C am 1 + paddq %mm4, %mm6 C am 1 + movd %mm6, (%edx) C am 1 + psrlq $32, %mm6 C am 1 + movd %mm6, 4(%edx) C am 1 + dec %ebx C am 1 + jnz L(olp1) C am 1 +L(oel1): + emms C 1 + pop %edi C 1 + pop %ebx C 1 + pop %esi C 1 + ret C 1 + + +L(2): movd (%eax), %mm1 C m 2 + sub 24(%esp), %ecx C m 2 + mov %ecx, 24(%esp) C update loop count for later m 2 + pmuludq %mm7, %mm1 C m 2 + movd 4(%eax), %mm4 C m 2 + pmuludq %mm7, %mm4 C m 2 + movd 8(%eax), %mm3 C m 2 + jmp L(m10) C m 2 + ALIGN(16) C m 2 +L(lpm2): + pmuludq %mm7, %mm4 C m 2 + paddq %mm0, %mm6 C m 2 + movd 8(%eax), %mm3 C m 2 + movd %mm6, -4(%edx) C m 2 + psrlq $32, %mm6 C m 2 +L(m10): pmuludq %mm7, %mm3 C m 2 + paddq %mm1, %mm6 C m 2 + movd 12(%eax), %mm0 C m 2 + movd %mm6, (%edx) C m 2 + psrlq $32, %mm6 C m 2 + pmuludq %mm7, %mm0 C m 2 + paddq %mm4, %mm6 C m 2 + movd 16(%eax), %mm1 C m 2 + movd %mm6, 4(%edx) C m 2 + psrlq $32, %mm6 C m 2 + pmuludq %mm7, %mm1 C m 2 + paddq %mm3, %mm6 C m 2 + movd 20(%eax), %mm4 C m 2 + movd %mm6, 8(%edx) C m 2 + psrlq $32, %mm6 C m 2 + lea 16(%eax), %eax C m 2 + lea 16(%edx), %edx C m 2 + add $4, %ecx C m 2 + ja L(lpm2) C m 2 + pmuludq %mm7, %mm4 C m 2 + paddq %mm0, %mm6 C m 2 + movd %mm6, -4(%edx) C m 2 + psrlq $32, %mm6 C m 2 + paddq %mm1, %mm6 C m 2 + mov 16(%esp), %edi C rp 2 + jmp L(x2) + +L(olp2): + lea 4(%edi), %edi C am 2 + movd (%esi), %mm7 C am 2 + lea 4(%esi), %esi C am 2 + mov %edi, %edx C rp am 2 + mov 20(%esp), %eax C up am 2 + movd (%eax), %mm1 C am 2 + mov 24(%esp), %ecx C inner loop count am 2 + pxor %mm6, %mm6 C am 2 + pmuludq %mm7, %mm1 C am 2 + movd 4(%eax), %mm2 C am 2 + movd (%edx), %mm5 C am 2 + pmuludq %mm7, %mm2 C am 2 + movd 8(%eax), %mm3 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + jmp L(am10) C am 2 + ALIGN(16) C am 2 +L(lam2): + pmuludq %mm7, %mm2 C am 2 + paddq %mm4, %mm6 C am 2 + 
movd 8(%eax), %mm3 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + movd %mm6, -4(%edx) C am 2 + psrlq $32, %mm6 C am 2 +L(am10): + pmuludq %mm7, %mm3 C am 2 + paddq %mm5, %mm6 C am 2 + movd 12(%eax), %mm0 C am 2 + paddq %mm2, %mm4 C am 2 + movd 8(%edx), %mm5 C am 2 + movd %mm6, (%edx) C am 2 + psrlq $32, %mm6 C am 2 + pmuludq %mm7, %mm0 C am 2 + paddq %mm4, %mm6 C am 2 + movd 16(%eax), %mm1 C am 2 + paddq %mm3, %mm5 C am 2 + movd 12(%edx), %mm4 C am 2 + movd %mm6, 4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + pmuludq %mm7, %mm1 C am 2 + paddq %mm5, %mm6 C am 2 + movd 20(%eax), %mm2 C am 2 + paddq %mm0, %mm4 C am 2 + movd 16(%edx), %mm5 C am 2 + movd %mm6, 8(%edx) C am 2 + psrlq $32, %mm6 C am 2 + lea 16(%eax), %eax C am 2 + lea 16(%edx), %edx C am 2 + add $4, %ecx C am 2 + jnz L(lam2) C am 2 + pmuludq %mm7, %mm2 C am 2 + paddq %mm4, %mm6 C am 2 + paddq %mm1, %mm5 C am 2 + movd 4(%edx), %mm4 C am 2 + movd %mm6, -4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + paddq %mm5, %mm6 C am 2 + paddq %mm2, %mm4 C am 2 +L(x2): movd %mm6, (%edx) C am 2 + psrlq $32, %mm6 C am 2 + paddq %mm4, %mm6 C am 2 + movd %mm6, 4(%edx) C am 2 + psrlq $32, %mm6 C am 2 + movd %mm6, 8(%edx) C am 2 + dec %ebx C am 2 + jnz L(olp2) C am 2 +L(oel2): + emms C 2 + pop %edi C 2 + pop %ebx C 2 + pop %esi C 2 + ret C 2 + + +L(3): movd (%eax), %mm0 C m 3 + sub 24(%esp), %ecx C m 3 + mov %ecx, 24(%esp) C update loop count for later m 3 + pmuludq %mm7, %mm0 C m 3 + movd 4(%eax), %mm1 C m 3 + pmuludq %mm7, %mm1 C m 3 + movd 8(%eax), %mm4 C m 3 + jmp L(lpm3) C m 3 + ALIGN(16) C m 3 +L(lpm3): + pmuludq %mm7, %mm4 C m 3 + paddq %mm0, %mm6 C m 3 + movd 12(%eax), %mm3 C m 3 + movd %mm6, (%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm3 C m 3 + paddq %mm1, %mm6 C m 3 + movd 16(%eax), %mm0 C m 3 + movd %mm6, 4(%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm0 C m 3 + paddq %mm4, %mm6 C m 3 + movd 20(%eax), %mm1 C m 3 + movd %mm6, 8(%edx) C m 3 + psrlq $32, %mm6 C m 3 + pmuludq %mm7, %mm1 C m 3 + paddq %mm3, %mm6 C m 3 + movd 24(%eax), %mm4 C m 3 + movd %mm6, 12(%edx) C m 3 + psrlq $32, %mm6 C m 3 + lea 16(%eax), %eax C m 3 + lea 16(%edx), %edx C m 3 + add $4, %ecx C m 3 + ja L(lpm3) C m 3 + pmuludq %mm7, %mm4 C m 3 + paddq %mm0, %mm6 C m 3 + movd %mm6, (%edx) C m 3 + psrlq $32, %mm6 C m 3 + paddq %mm1, %mm6 C m 3 + mov 16(%esp), %edi C rp 3 + jmp L(x3) + +L(olp3): + lea 4(%edi), %edi C am 3 + movd (%esi), %mm7 C am 3 + lea 4(%esi), %esi C am 3 + mov %edi, %edx C rp am 3 + mov 20(%esp), %eax C up am 3 + movd (%eax), %mm0 C am 3 + mov 24(%esp), %ecx C inner loop count am 3 + pxor %mm6, %mm6 C am 3 + pmuludq %mm7, %mm0 C am 3 + movd 4(%eax), %mm1 C am 3 + movd (%edx), %mm4 C am 3 + pmuludq %mm7, %mm1 C am 3 + movd 8(%eax), %mm2 C am 3 + paddq %mm0, %mm4 C am 3 + movd 4(%edx), %mm5 C am 3 + jmp L(lam3) C am 3 + ALIGN(16) C am 3 +L(lam3): + pmuludq %mm7, %mm2 C am 3 + paddq %mm4, %mm6 C am 3 + movd 12(%eax), %mm3 C am 3 + paddq %mm1, %mm5 C am 3 + movd 8(%edx), %mm4 C am 3 + movd %mm6, (%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm3 C am 3 + paddq %mm5, %mm6 C am 3 + movd 16(%eax), %mm0 C am 3 + paddq %mm2, %mm4 C am 3 + movd 12(%edx), %mm5 C am 3 + movd %mm6, 4(%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm0 C am 3 + paddq %mm4, %mm6 C am 3 + movd 20(%eax), %mm1 C am 3 + paddq %mm3, %mm5 C am 3 + movd 16(%edx), %mm4 C am 3 + movd %mm6, 8(%edx) C am 3 + psrlq $32, %mm6 C am 3 + pmuludq %mm7, %mm1 C am 3 + paddq %mm5, %mm6 C am 3 + movd 24(%eax), %mm2 C am 3 + paddq %mm0, %mm4 C am 3 + movd 20(%edx), 
%mm5 C am 3 + movd %mm6, 12(%edx) C am 3 + psrlq $32, %mm6 C am 3 + lea 16(%eax), %eax C am 3 + lea 16(%edx), %edx C am 3 + add $4, %ecx C am 3 + jnz L(lam3) C am 3 + pmuludq %mm7, %mm2 C am 3 + paddq %mm4, %mm6 C am 3 + paddq %mm1, %mm5 C am 3 + movd 8(%edx), %mm4 C am 3 + movd %mm6, (%edx) C am 3 + psrlq $32, %mm6 C am 3 + paddq %mm5, %mm6 C am 3 + paddq %mm2, %mm4 C am 3 +L(x3): movd %mm6, 4(%edx) C am 3 + psrlq $32, %mm6 C am 3 + paddq %mm4, %mm6 C am 3 + movd %mm6, 8(%edx) C am 3 + psrlq $32, %mm6 C am 3 + movd %mm6, 12(%edx) C am 3 + dec %ebx C am 3 + jnz L(olp3) C am 3 +L(oel3): + emms C 3 + pop %edi C 3 + pop %ebx C 3 + pop %esi C 3 + ret C 3 +EPILOGUE()
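
Note (not part of the patch above): the routine computes rp[] = up[] * vp[],
producing un + vn result limbs by plain schoolbook multiplication -- one "m"
pass that writes the first partial product up[] * vp[0], then vn - 1 "am"
passes that multiply by the next vp limb and accumulate into rp.  The C sketch
below is illustrative only: the name ref_mul_basecase and the bare
uint32_t/uint64_t limb types are assumptions made for this note (GMP's real
entry point uses mp_limb_t operands), but each step maps onto the
pmuludq/paddq/movd/psrlq chains in the loops above.  It assumes un >= 1,
vn >= 1, and that rp has room for un + vn limbs.

/* Illustrative reference, not GMP's implementation. */
#include <stdint.h>

void
ref_mul_basecase (uint32_t *rp, const uint32_t *up, long un,
                  const uint32_t *vp, long vn)
{
  uint64_t acc = 0;

  /* First row, cf. the "m" loops: rp[0..un] = up[] * vp[0].  Each step is a
     32x32->64 multiply (pmuludq), an add of the running carry (paddq), a
     store of the low limb (movd), and a carry shift (psrlq $32).  */
  for (long i = 0; i < un; i++)
    {
      acc += (uint64_t) up[i] * vp[0];
      rp[i] = (uint32_t) acc;
      acc >>= 32;
    }
  rp[un] = (uint32_t) acc;

  /* Remaining rows, cf. the "am" loops: accumulate up[] * vp[j] into rp,
     which is why those loops also load rp limbs via movd (%edx), ...  */
  for (long j = 1; j < vn; j++)
    {
      acc = 0;
      for (long i = 0; i < un; i++)
        {
          acc += (uint64_t) up[i] * vp[j] + rp[i + j];
          rp[i + j] = (uint32_t) acc;
          acc >>= 32;
        }
      rp[un + j] = (uint32_t) acc;
    }
}

The 64-bit accumulator cannot overflow: a 32x32 product is at most
2^64 - 2^33 + 1, which leaves exactly enough headroom to add both the incoming
carry and one rp limb (each at most 2^32 - 1), for a worst-case sum of 2^64 - 1.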