oss.titaniummirror.com Git - msp430-gcc.git/blobdiff - gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
Imported gcc-4.4.3
[msp430-gcc.git] / gmp / mpn / x86 / pentium4 / sse2 / mul_basecase.asm
diff --git a/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm b/gmp/mpn/x86/pentium4/sse2/mul_basecase.asm
new file mode 100644 (file)
index 0000000..2628e5e
--- /dev/null
@@ -0,0 +1,651 @@
+dnl  mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
+
+dnl  Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
+dnl
+dnl  This file is part of the GNU MP Library.
+dnl
+dnl  The GNU MP Library is free software; you can redistribute it and/or modify
+dnl  it under the terms of the GNU Lesser General Public License as published
+dnl  by the Free Software Foundation; either version 3 of the License, or (at
+dnl  your option) any later version.
+dnl
+dnl  The GNU MP Library is distributed in the hope that it will be useful, but
+dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
+dnl  License for more details.
+dnl
+dnl  You should have received a copy of the GNU Lesser General Public License
+dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C TODO:
+C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
+C    scheduling could improve things by several cycles per outer iteration.
+C  * In code for un <= 3, try keeping accumulation operands in registers,
+C    without storing intermediates to rp.
+C  * We might want to keep 32 in a free mm register, since the register form is
+C    3 bytes and the immediate form is 4 bytes.  About 70 bytes to save.
+C  * Look into different loop alignment, we now expand the code about 50 bytes
+C    with possibly needless alignment.
+C  * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
+C  * Use OSP, should solve feed-in latency problems.
+C  * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
+C  * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
+C    so that they can share feed-in code, and changing the branch targets from
+C    L<n> to Lm<nn>.
+
+C                           cycles/limb
+C P6 model 9   (Banias)         ?
+C P6 model 13  (Dothan)         5.24
+C P6 model 14  (Yonah)          ?
+C P4 model 0-1 (Willamette):    5
+C P4 model 2   (Northwood):     4.60 at 32 limbs
+C P4 model 3-4 (Prescott):      4.94 at 32 limbs
+
+C INPUT PARAMETERS
+C rp           sp + 4
+C up           sp + 8
+C un           sp + 12
+C vp           sp + 16
+C vn           sp + 20
+
+       TEXT
+       ALIGN(16)
+C mpn_mul_basecase(rp, up, un, vp, vn): rp[0..un+vn-1] = up[0..un-1] * vp[0..vn-1].
+C Parameters are at sp+4..sp+20 on entry; after the two pushes below they
+C sit at 12..28(%esp).  un <= 3 is handled by the straight-line special
+C cases here; un > 3 goes to the unrolled loops at L(big).
+PROLOGUE(mpn_mul_basecase)
+       push    %esi
+       push    %ebx
+       mov     12(%esp), %edx          C rp
+       mov     16(%esp), %eax          C up
+       mov     20(%esp), %ecx          C un
+       mov     24(%esp), %esi          C vp
+       mov     28(%esp), %ebx          C vn
+       movd    (%esi), %mm7            C mm7 = vp[0]
+L(ent):        cmp     $3, %ecx
+       ja      L(big)
+       movd    (%eax), %mm6
+       pmuludq %mm7, %mm6              C mm6 = up[0] * vp[0], full 64 bits
+       jz      L(un3)
+       cmp     $2, %ecx
+       jz      L(un2)
+
+C un=1: store the 64-bit product as two limbs.  vn is necessarily 1 here,
+C given the mpn_mul_basecase requirement un >= vn >= 1.
+L(un1):        movd    %mm6, (%edx)            C                               un=1
+       psrlq   $32, %mm6               C                               un=1
+       movd    %mm6, 4(%edx)           C                               un=1
+       jmp     L(rtr)                  C                               un=1
+
+C un=2: first pass stores rp[0..2] = up[0..1] * vp[0]; mm6 carries the
+C running accumulator, shifted down 32 bits between limbs.
+L(un2):        movd    4(%eax), %mm1           C                               un=2
+       pmuludq %mm7, %mm1              C                               un=2
+       movd    %mm6, (%edx)            C                               un=2
+       psrlq   $32, %mm6               C                               un=2
+       paddq   %mm1, %mm6              C                               un=2
+       movd    %mm6, 4(%edx)           C                               un=2
+       psrlq   $32, %mm6               C                               un=2
+       movd    %mm6, 8(%edx)           C                               un=2
+      dec      %ebx                    C                               un=2
+      jz       L(rtr)                  C done if vn=1                  un=2
+C Second pass (vn=2): add up[0..1] * vp[1] into rp[1..3].
+       movd    4(%esi), %mm7           C mm7 = vp[1]                   un=2
+       movd    (%eax), %mm6            C                               un=2
+       pmuludq %mm7, %mm6              C                               un=2
+       movd    4(%eax), %mm1           C                               un=2
+       movd    4(%edx), %mm4           C                               un=2
+       pmuludq %mm7, %mm1              C                               un=2
+       movd    8(%edx), %mm5           C                               un=2
+       paddq   %mm4, %mm6              C                               un=2
+       paddq   %mm1, %mm5              C                               un=2
+       movd    %mm6, 4(%edx)           C                               un=2
+       psrlq   $32, %mm6               C                               un=2
+       paddq   %mm5, %mm6              C                               un=2
+       movd    %mm6, 8(%edx)           C                               un=2
+       psrlq   $32, %mm6               C                               un=2
+       movd    %mm6, 12(%edx)          C                               un=2
+C Common return: emms clears the MMX state (shared with x87) before
+C returning to the caller.
+L(rtr):        emms
+       pop     %ebx
+       pop     %esi
+       ret
+
+C un=3: up to three passes, one per vp limb (vn in 1..3).  First pass
+C stores rp[0..3] = up[0..2] * vp[0]; each later pass adds up[0..2]*vp[i]
+C into rp[i..i+3].  mm6 is the running accumulator/carry throughout.
+L(un3):        movd    4(%eax), %mm1           C                               un=3
+       pmuludq %mm7, %mm1              C                               un=3
+       movd    8(%eax), %mm2           C                               un=3
+       pmuludq %mm7, %mm2              C                               un=3
+       movd    %mm6, (%edx)            C                               un=3
+       psrlq   $32, %mm6               C                               un=3
+       paddq   %mm1, %mm6              C                               un=3
+       movd    %mm6, 4(%edx)           C                               un=3
+       psrlq   $32, %mm6               C                               un=3
+       paddq   %mm2, %mm6              C                               un=3
+       movd    %mm6, 8(%edx)           C                               un=3
+       psrlq   $32, %mm6               C                               un=3
+       movd    %mm6, 12(%edx)          C                               un=3
+      dec      %ebx                    C                               un=3
+      jz       L(rtr)                  C done if vn=1                  un=3
+C Second pass (vn >= 2): add up[0..2] * vp[1] into rp[1..4].
+       movd    4(%esi), %mm7           C mm7 = vp[1]                   un=3
+       movd    (%eax), %mm6            C                               un=3
+       pmuludq %mm7, %mm6              C                               un=3
+       movd    4(%eax), %mm1           C                               un=3
+       movd    4(%edx), %mm4           C                               un=3
+       pmuludq %mm7, %mm1              C                               un=3
+       movd    8(%eax), %mm2           C                               un=3
+       movd    8(%edx), %mm5           C                               un=3
+       pmuludq %mm7, %mm2              C                               un=3
+       paddq   %mm4, %mm6              C                               un=3
+       paddq   %mm1, %mm5              C                               un=3
+       movd    12(%edx), %mm4          C                               un=3
+       movd    %mm6, 4(%edx)           C                               un=3
+       psrlq   $32, %mm6               C                               un=3
+       paddq   %mm5, %mm6              C                               un=3
+       paddq   %mm2, %mm4              C                               un=3
+       movd    %mm6, 8(%edx)           C                               un=3
+       psrlq   $32, %mm6               C                               un=3
+       paddq   %mm4, %mm6              C                               un=3
+       movd    %mm6, 12(%edx)          C                               un=3
+       psrlq   $32, %mm6               C                               un=3
+       movd    %mm6, 16(%edx)          C                               un=3
+      dec      %ebx                    C                               un=3
+      jz       L(rtr)                  C done if vn=2                  un=3
+C Third pass (vn=3): add up[0..2] * vp[2] into rp[2..5].
+       movd    8(%esi), %mm7           C mm7 = vp[2]                   un=3
+       movd    (%eax), %mm6            C                               un=3
+       pmuludq %mm7, %mm6              C                               un=3
+       movd    4(%eax), %mm1           C                               un=3
+       movd    8(%edx), %mm4           C                               un=3
+       pmuludq %mm7, %mm1              C                               un=3
+       movd    8(%eax), %mm2           C                               un=3
+       movd    12(%edx), %mm5          C                               un=3
+       pmuludq %mm7, %mm2              C                               un=3
+       paddq   %mm4, %mm6              C                               un=3
+       paddq   %mm1, %mm5              C                               un=3
+       movd    16(%edx), %mm4          C                               un=3
+       movd    %mm6, 8(%edx)           C                               un=3
+       psrlq   $32, %mm6               C                               un=3
+       paddq   %mm5, %mm6              C                               un=3
+       paddq   %mm2, %mm4              C                               un=3
+       movd    %mm6, 12(%edx)          C                               un=3
+       psrlq   $32, %mm6               C                               un=3
+       paddq   %mm4, %mm6              C                               un=3
+       movd    %mm6, 16(%edx)          C                               un=3
+       psrlq   $32, %mm6               C                               un=3
+       movd    %mm6, 20(%edx)          C                               un=3
+       jmp     L(rtr)
+
+
+C un > 3.  A third register is pushed, so the parameters now sit at
+C 16(%esp)=rp, 20(%esp)=up, 24(%esp)=un.  vp[0] is already in mm7, so
+C advance esi past it; then dispatch on un mod 4 to the matching 4-way
+C unrolled feed-in code (L(0)..L(3)).
+L(big):        push    %edi
+       pxor    %mm6, %mm6
+       lea     4(%esi), %esi
+       and     $3, %ecx
+       jz      L(0)
+       cmp     $2, %ecx
+       jc      L(1)
+       jz      L(2)
+       jmp     L(3)                    C FIXME: one case should fall through
+
+
+C un = 0 (mod 4).  First pass L(lpm0) computes rp[] = up[] * vp[0] (a
+C mul_1); the outer loop L(olp0) then performs one addmul_1 pass per
+C remaining vp limb.  Inner loops are 4-way unrolled and software
+C pipelined; mm6 holds the running accumulator, shifted down 32 bits per
+C limb.  The inner-loop counter in ecx is (un mod 4) - un, i.e. negative,
+C stepped up by 4 each iteration until it reaches zero.
+L(0):  movd    (%eax), %mm3            C                               m 0
+       sub     24(%esp), %ecx          C inner loop count              m 0
+       mov     %ecx, 24(%esp)          C update loop count for later   m 0
+       pmuludq %mm7, %mm3              C                               m 0
+       movd    4(%eax), %mm0           C                               m 0
+       pmuludq %mm7, %mm0              C                               m 0
+       movd    8(%eax), %mm1           C                               m 0
+       jmp     L(m00)                  C enter pipelined loop mid-body m 0
+       ALIGN(16)                       C                               m 0
+L(lpm0):
+       pmuludq %mm7, %mm4              C                               m 0
+       paddq   %mm0, %mm6              C                               m 0
+       movd    (%eax), %mm3            C                               m 0
+       movd    %mm6, -12(%edx)         C                               m 0
+       psrlq   $32, %mm6               C                               m 0
+       pmuludq %mm7, %mm3              C                               m 0
+       paddq   %mm1, %mm6              C                               m 0
+       movd    4(%eax), %mm0           C                               m 0
+       movd    %mm6, -8(%edx)          C                               m 0
+       psrlq   $32, %mm6               C                               m 0
+       pmuludq %mm7, %mm0              C                               m 0
+       paddq   %mm4, %mm6              C                               m 0
+       movd    8(%eax), %mm1           C                               m 0
+       movd    %mm6, -4(%edx)          C                               m 0
+       psrlq   $32, %mm6               C                               m 0
+L(m00):        pmuludq %mm7, %mm1              C                               m 0
+       paddq   %mm3, %mm6              C                               m 0
+       movd    12(%eax), %mm4          C                               m 0
+       movd    %mm6, (%edx)            C                               m 0
+       psrlq   $32, %mm6               C                               m 0
+       lea     16(%eax), %eax          C                               m 0
+       lea     16(%edx), %edx          C                               m 0
+       add     $4, %ecx                C                               m 0
+       ja      L(lpm0)                 C loop until count reaches 0    m 0
+C Feed-out: drain the last two pipelined products, then join L(x0).
+       pmuludq %mm7, %mm4              C                               m 0
+       paddq   %mm0, %mm6              C                               m 0
+       movd    %mm6, -12(%edx)         C                               m 0
+       psrlq   $32, %mm6               C                               m 0
+       paddq   %mm1, %mm6              C                               m 0
+       mov     16(%esp), %edi          C rp                              0
+       jmp     L(x0)
+
+C Outer loop: one addmul_1 pass per remaining vp limb.  edi tracks the
+C rp base (advanced one limb per pass), esi the vp pointer, ebx the
+C remaining vp-limb count.
+L(olp0):
+       lea     4(%edi), %edi           C                               am 0
+       movd    (%esi), %mm7            C mm7 = next vp limb            am 0
+       lea     4(%esi), %esi           C                               am 0
+       mov     %edi, %edx              C rp                            am 0
+       mov     20(%esp), %eax          C up                            am 0
+       movd    (%eax), %mm3            C                               am 0
+       mov     24(%esp), %ecx          C inner loop count              am 0
+       pxor    %mm6, %mm6              C                               am 0
+       pmuludq %mm7, %mm3              C                               am 0
+       movd    4(%eax), %mm0           C                               am 0
+       movd    (%edx), %mm5            C                               am 0
+       pmuludq %mm7, %mm0              C                               am 0
+       movd    8(%eax), %mm1           C                               am 0
+       paddq   %mm3, %mm5              C                               am 0
+       movd    4(%edx), %mm4           C                               am 0
+       jmp     L(am00)                 C                               am 0
+       ALIGN(16)                       C                               am 0
+L(lam0):
+       pmuludq %mm7, %mm2              C                               am 0
+       paddq   %mm4, %mm6              C                               am 0
+       movd    (%eax), %mm3            C                               am 0
+       paddq   %mm1, %mm5              C                               am 0
+       movd    -4(%edx), %mm4          C                               am 0
+       movd    %mm6, -12(%edx)         C                               am 0
+       psrlq   $32, %mm6               C                               am 0
+       pmuludq %mm7, %mm3              C                               am 0
+       paddq   %mm5, %mm6              C                               am 0
+       movd    4(%eax), %mm0           C                               am 0
+       paddq   %mm2, %mm4              C                               am 0
+       movd    (%edx), %mm5            C                               am 0
+       movd    %mm6, -8(%edx)          C                               am 0
+       psrlq   $32, %mm6               C                               am 0
+       pmuludq %mm7, %mm0              C                               am 0
+       paddq   %mm4, %mm6              C                               am 0
+       movd    8(%eax), %mm1           C                               am 0
+       paddq   %mm3, %mm5              C                               am 0
+       movd    4(%edx), %mm4           C                               am 0
+       movd    %mm6, -4(%edx)          C                               am 0
+       psrlq   $32, %mm6               C                               am 0
+L(am00):
+       pmuludq %mm7, %mm1              C                               am 0
+       paddq   %mm5, %mm6              C                               am 0
+       movd    12(%eax), %mm2          C                               am 0
+       paddq   %mm0, %mm4              C                               am 0
+       movd    8(%edx), %mm5           C                               am 0
+       movd    %mm6, (%edx)            C                               am 0
+       psrlq   $32, %mm6               C                               am 0
+       lea     16(%eax), %eax          C                               am 0
+       lea     16(%edx), %edx          C                               am 0
+       add     $4, %ecx                C                               am 0
+       jnz     L(lam0)                 C                               am 0
+       pmuludq %mm7, %mm2              C                               am 0
+       paddq   %mm4, %mm6              C                               am 0
+       paddq   %mm1, %mm5              C                               am 0
+       movd    -4(%edx), %mm4          C                               am 0
+       movd    %mm6, -12(%edx)         C                               am 0
+       psrlq   $32, %mm6               C                               am 0
+       paddq   %mm5, %mm6              C                               am 0
+       paddq   %mm2, %mm4              C                               am 0
+C L(x0): shared tail (also entered from the mul_1 pass above): store the
+C final limbs and the top carry, then advance to the next vp limb.
+L(x0): movd    %mm6, -8(%edx)          C                               am 0
+       psrlq   $32, %mm6               C                               am 0
+       paddq   %mm4, %mm6              C                               am 0
+       movd    %mm6, -4(%edx)          C                               am 0
+       psrlq   $32, %mm6               C                               am 0
+       movd    %mm6, (%edx)            C                               am 0
+       dec     %ebx                    C                               am 0
+       jnz     L(olp0)                 C                               am 0
+L(oel0):
+       emms                            C                                  0
+       pop     %edi                    C                                  0
+       pop     %ebx                    C                                  0
+       pop     %esi                    C                                  0
+       ret                             C                                  0
+
+
+C un = 1 (mod 4).  Same structure as the L(0) path: a mul_1 first pass
+C (L(lpm1)), then one addmul_1 pass per remaining vp limb (L(olp1)),
+C with the feed-in/feed-out code and register roles rotated to match the
+C different un residue.  ecx again holds (un mod 4) - un, a negative
+C count stepped by 4.
+L(1):  movd    (%eax), %mm4            C                               m 1
+       sub     24(%esp), %ecx          C inner loop count              m 1
+       mov     %ecx, 24(%esp)          C update loop count for later   m 1
+       pmuludq %mm7, %mm4              C                               m 1
+       movd    4(%eax), %mm3           C                               m 1
+       pmuludq %mm7, %mm3              C                               m 1
+       movd    8(%eax), %mm0           C                               m 1
+       jmp     L(m01)                  C enter pipelined loop mid-body m 1
+       ALIGN(16)                       C                               m 1
+L(lpm1):
+       pmuludq %mm7, %mm4              C                               m 1
+       paddq   %mm0, %mm6              C                               m 1
+       movd    4(%eax), %mm3           C                               m 1
+       movd    %mm6, -8(%edx)          C                               m 1
+       psrlq   $32, %mm6               C                               m 1
+       pmuludq %mm7, %mm3              C                               m 1
+       paddq   %mm1, %mm6              C                               m 1
+       movd    8(%eax), %mm0           C                               m 1
+       movd    %mm6, -4(%edx)          C                               m 1
+       psrlq   $32, %mm6               C                               m 1
+L(m01):        pmuludq %mm7, %mm0              C                               m 1
+       paddq   %mm4, %mm6              C                               m 1
+       movd    12(%eax), %mm1          C                               m 1
+       movd    %mm6, (%edx)            C                               m 1
+       psrlq   $32, %mm6               C                               m 1
+       pmuludq %mm7, %mm1              C                               m 1
+       paddq   %mm3, %mm6              C                               m 1
+       movd    16(%eax), %mm4          C                               m 1
+       movd    %mm6, 4(%edx)           C                               m 1
+       psrlq   $32, %mm6               C                               m 1
+       lea     16(%eax), %eax          C                               m 1
+       lea     16(%edx), %edx          C                               m 1
+       add     $4, %ecx                C                               m 1
+       ja      L(lpm1)                 C loop until count reaches 0    m 1
+C Feed-out: drain the last two pipelined products, then join L(x1).
+       pmuludq %mm7, %mm4              C                               m 1
+       paddq   %mm0, %mm6              C                               m 1
+       movd    %mm6, -8(%edx)          C                               m 1
+       psrlq   $32, %mm6               C                               m 1
+       paddq   %mm1, %mm6              C                               m 1
+       mov     16(%esp), %edi          C rp                              1
+       jmp     L(x1)
+
+C Outer loop: one addmul_1 pass per remaining vp limb; edi = rp base,
+C esi = vp pointer, ebx = remaining vp-limb count.
+L(olp1):
+       lea     4(%edi), %edi           C                               am 1
+       movd    (%esi), %mm7            C mm7 = next vp limb            am 1
+       lea     4(%esi), %esi           C                               am 1
+       mov     %edi, %edx              C rp                            am 1
+       mov     20(%esp), %eax          C up                            am 1
+       movd    (%eax), %mm2            C                               am 1
+       mov     24(%esp), %ecx          C inner loop count              am 1
+       pxor    %mm6, %mm6              C                               am 1
+       pmuludq %mm7, %mm2              C                               am 1
+       movd    4(%eax), %mm3           C                               am 1
+       movd    (%edx), %mm4            C                               am 1
+       pmuludq %mm7, %mm3              C                               am 1
+       movd    8(%eax), %mm0           C                               am 1
+       paddq   %mm2, %mm4              C                               am 1
+       movd    4(%edx), %mm5           C                               am 1
+       jmp     L(am01)                 C                               am 1
+       ALIGN(16)                       C                               am 1
+L(lam1):
+       pmuludq %mm7, %mm2              C                               am 1
+       paddq   %mm4, %mm6              C                               am 1
+       movd    4(%eax), %mm3           C                               am 1
+       paddq   %mm1, %mm5              C                               am 1
+       movd    (%edx), %mm4            C                               am 1
+       movd    %mm6, -8(%edx)          C                               am 1
+       psrlq   $32, %mm6               C                               am 1
+       pmuludq %mm7, %mm3              C                               am 1
+       paddq   %mm5, %mm6              C                               am 1
+       movd    8(%eax), %mm0           C                               am 1
+       paddq   %mm2, %mm4              C                               am 1
+       movd    4(%edx), %mm5           C                               am 1
+       movd    %mm6, -4(%edx)          C                               am 1
+       psrlq   $32, %mm6               C                               am 1
+L(am01):
+       pmuludq %mm7, %mm0              C                               am 1
+       paddq   %mm4, %mm6              C                               am 1
+       movd    12(%eax), %mm1          C                               am 1
+       paddq   %mm3, %mm5              C                               am 1
+       movd    8(%edx), %mm4           C                               am 1
+       movd    %mm6, (%edx)            C                               am 1
+       psrlq   $32, %mm6               C                               am 1
+       pmuludq %mm7, %mm1              C                               am 1
+       paddq   %mm5, %mm6              C                               am 1
+       movd    16(%eax), %mm2          C                               am 1
+       paddq   %mm0, %mm4              C                               am 1
+       movd    12(%edx), %mm5          C                               am 1
+       movd    %mm6, 4(%edx)           C                               am 1
+       psrlq   $32, %mm6               C                               am 1
+       lea     16(%eax), %eax          C                               am 1
+       lea     16(%edx), %edx          C                               am 1
+       add     $4, %ecx                C                               am 1
+       jnz     L(lam1)                 C                               am 1
+       pmuludq %mm7, %mm2              C                               am 1
+       paddq   %mm4, %mm6              C                               am 1
+       paddq   %mm1, %mm5              C                               am 1
+       movd    (%edx), %mm4            C                               am 1
+       movd    %mm6, -8(%edx)          C                               am 1
+       psrlq   $32, %mm6               C                               am 1
+       paddq   %mm5, %mm6              C                               am 1
+       paddq   %mm2, %mm4              C                               am 1
+C L(x1): shared tail (also entered from the mul_1 pass above): store the
+C final limbs and the top carry, then advance to the next vp limb.
+L(x1): movd    %mm6, -4(%edx)          C                               am 1
+       psrlq   $32, %mm6               C                               am 1
+       paddq   %mm4, %mm6              C                               am 1
+       movd    %mm6, (%edx)            C                               am 1
+       psrlq   $32, %mm6               C                               am 1
+       movd    %mm6, 4(%edx)           C                               am 1
+       dec     %ebx                    C                               am 1
+       jnz     L(olp1)                 C                               am 1
+L(oel1):
+       emms                            C                                  1
+       pop     %edi                    C                                  1
+       pop     %ebx                    C                                  1
+       pop     %esi                    C                                  1
+       ret                             C                                  1
+
+
+L(2):  movd    (%eax), %mm1            C                               m 2
+       sub     24(%esp), %ecx          C                               m 2
+       mov     %ecx, 24(%esp)          C update loop count for later   m 2
+       pmuludq %mm7, %mm1              C                               m 2
+       movd    4(%eax), %mm4           C                               m 2
+       pmuludq %mm7, %mm4              C                               m 2
+       movd    8(%eax), %mm3           C                               m 2
+       jmp     L(m10)                  C                               m 2
+       ALIGN(16)                       C                               m 2
+L(lpm2):
+       pmuludq %mm7, %mm4              C                               m 2
+       paddq   %mm0, %mm6              C                               m 2
+       movd    8(%eax), %mm3           C                               m 2
+       movd    %mm6, -4(%edx)          C                               m 2
+       psrlq   $32, %mm6               C                               m 2
+L(m10):        pmuludq %mm7, %mm3              C                               m 2
+       paddq   %mm1, %mm6              C                               m 2
+       movd    12(%eax), %mm0          C                               m 2
+       movd    %mm6, (%edx)            C                               m 2
+       psrlq   $32, %mm6               C                               m 2
+       pmuludq %mm7, %mm0              C                               m 2
+       paddq   %mm4, %mm6              C                               m 2
+       movd    16(%eax), %mm1          C                               m 2
+       movd    %mm6, 4(%edx)           C                               m 2
+       psrlq   $32, %mm6               C                               m 2
+       pmuludq %mm7, %mm1              C                               m 2
+       paddq   %mm3, %mm6              C                               m 2
+       movd    20(%eax), %mm4          C                               m 2
+       movd    %mm6, 8(%edx)           C                               m 2
+       psrlq   $32, %mm6               C                               m 2
+       lea     16(%eax), %eax          C                               m 2
+       lea     16(%edx), %edx          C                               m 2
+       add     $4, %ecx                C                               m 2
+       ja      L(lpm2)                 C                               m 2
+       pmuludq %mm7, %mm4              C                               m 2
+       paddq   %mm0, %mm6              C                               m 2
+       movd    %mm6, -4(%edx)          C                               m 2
+       psrlq   $32, %mm6               C                               m 2
+       paddq   %mm1, %mm6              C                               m 2
+       mov     16(%esp), %edi          C rp                              2
+       jmp     L(x2)
+
+C Outer (addmul) loop for the un == 2 (mod 4) case.  One iteration per
+C remaining multiplier limb: %mm7 is loaded with the next limb via %esi
+C (presumably the v-operand pointer set up in the prologue, which is
+C outside this hunk -- confirm there), rp (%edi) advances by one limb,
+C and up[] * limb is added into rp[].  %ebx counts outer iterations.
+L(olp2):
+       lea     4(%edi), %edi           C                               am 2
+       movd    (%esi), %mm7            C                               am 2
+       lea     4(%esi), %esi           C                               am 2
+       mov     %edi, %edx              C rp                            am 2
+       mov     20(%esp), %eax          C up                            am 2
+       movd    (%eax), %mm1            C                               am 2
+       mov     24(%esp), %ecx          C inner loop count              am 2
+       pxor    %mm6, %mm6              C                               am 2
+       pmuludq %mm7, %mm1              C                               am 2
+       movd    4(%eax), %mm2           C                               am 2
+       movd    (%edx), %mm5            C                               am 2
+       pmuludq %mm7, %mm2              C                               am 2
+       movd    8(%eax), %mm3           C                               am 2
+       paddq   %mm1, %mm5              C                               am 2
+       movd    4(%edx), %mm4           C                               am 2
+       jmp     L(am10)                 C                               am 2
+       ALIGN(16)                       C                               am 2
+C Software-pipelined addmul inner loop: 4 limbs per pass (%eax and %edx
+C advance by 16 bytes; the negated count in %ecx steps by 4 up to zero).
+C %mm6 holds the running 64-bit accumulation: its low 32 bits are stored
+C to rp and the high half is shifted down (psrlq $32) as the carry into
+C the next limb.  Loads, multiplies and stores for adjacent limbs are
+C deliberately interleaved to hide pmuludq latency.
+L(lam2):
+       pmuludq %mm7, %mm2              C                               am 2
+       paddq   %mm4, %mm6              C                               am 2
+       movd    8(%eax), %mm3           C                               am 2
+       paddq   %mm1, %mm5              C                               am 2
+       movd    4(%edx), %mm4           C                               am 2
+       movd    %mm6, -4(%edx)          C                               am 2
+       psrlq   $32, %mm6               C                               am 2
+C Entry point for the first, partially fed-in pass (from L(olp2)).
+L(am10):
+       pmuludq %mm7, %mm3              C                               am 2
+       paddq   %mm5, %mm6              C                               am 2
+       movd    12(%eax), %mm0          C                               am 2
+       paddq   %mm2, %mm4              C                               am 2
+       movd    8(%edx), %mm5           C                               am 2
+       movd    %mm6, (%edx)            C                               am 2
+       psrlq   $32, %mm6               C                               am 2
+       pmuludq %mm7, %mm0              C                               am 2
+       paddq   %mm4, %mm6              C                               am 2
+       movd    16(%eax), %mm1          C                               am 2
+       paddq   %mm3, %mm5              C                               am 2
+       movd    12(%edx), %mm4          C                               am 2
+       movd    %mm6, 4(%edx)           C                               am 2
+       psrlq   $32, %mm6               C                               am 2
+       pmuludq %mm7, %mm1              C                               am 2
+       paddq   %mm5, %mm6              C                               am 2
+       movd    20(%eax), %mm2          C                               am 2
+       paddq   %mm0, %mm4              C                               am 2
+       movd    16(%edx), %mm5          C                               am 2
+       movd    %mm6, 8(%edx)           C                               am 2
+       psrlq   $32, %mm6               C                               am 2
+       lea     16(%eax), %eax          C                               am 2
+       lea     16(%edx), %edx          C                               am 2
+       add     $4, %ecx                C                               am 2
+       jnz     L(lam2)                 C                               am 2
+C Loop tail: drain the products still in flight from the pipeline.
+       pmuludq %mm7, %mm2              C                               am 2
+       paddq   %mm4, %mm6              C                               am 2
+       paddq   %mm1, %mm5              C                               am 2
+       movd    4(%edx), %mm4           C                               am 2
+       movd    %mm6, -4(%edx)          C                               am 2
+       psrlq   $32, %mm6               C                               am 2
+       paddq   %mm5, %mm6              C                               am 2
+       paddq   %mm2, %mm4              C                               am 2
+C Store the last limbs and the final carry of this pass.  The initial
+C mul pass (before the first outer iteration) also joins here via the
+C jmp L(x2) above, sharing this wind-down code.
+L(x2): movd    %mm6, (%edx)            C                               am 2
+       psrlq   $32, %mm6               C                               am 2
+       paddq   %mm4, %mm6              C                               am 2
+       movd    %mm6, 4(%edx)           C                               am 2
+       psrlq   $32, %mm6               C                               am 2
+       movd    %mm6, 8(%edx)           C                               am 2
+       dec     %ebx                    C                               am 2
+       jnz     L(olp2)                 C                               am 2
+C All multiplier limbs consumed: clear MMX state (required before any
+C x87 use by the caller) and restore callee-saved registers.
+L(oel2):
+       emms                            C                                  2
+       pop     %edi                    C                                  2
+       pop     %ebx                    C                                  2
+       pop     %esi                    C                                  2
+       ret                             C                                  2
+
+
+C Case un == 3 (mod 4): initial plain-mul pass, rp[] = up[] * limb in
+C %mm7 ("m 3" annotations).  The recomputed negated count is saved back
+C to 24(%esp) so each later addmul pass can reload it.
+L(3):  movd    (%eax), %mm0            C                               m 3
+       sub     24(%esp), %ecx          C                               m 3
+       mov     %ecx, 24(%esp)          C update loop count for later   m 3
+       pmuludq %mm7, %mm0              C                               m 3
+       movd    4(%eax), %mm1           C                               m 3
+       pmuludq %mm7, %mm1              C                               m 3
+       movd    8(%eax), %mm4           C                               m 3
+       jmp     L(lpm3)                 C                               m 3
+       ALIGN(16)                       C                               m 3
+C Pipelined mul inner loop: 4 limbs per pass (pointers advance by 16
+C bytes, count in %ecx steps by 4).  %mm6 accumulates product + carry;
+C low 32 bits go to rp, high half is shifted down as the next carry.
+L(lpm3):
+       pmuludq %mm7, %mm4              C                               m 3
+       paddq   %mm0, %mm6              C                               m 3
+       movd    12(%eax), %mm3          C                               m 3
+       movd    %mm6, (%edx)            C                               m 3
+       psrlq   $32, %mm6               C                               m 3
+       pmuludq %mm7, %mm3              C                               m 3
+       paddq   %mm1, %mm6              C                               m 3
+       movd    16(%eax), %mm0          C                               m 3
+       movd    %mm6, 4(%edx)           C                               m 3
+       psrlq   $32, %mm6               C                               m 3
+       pmuludq %mm7, %mm0              C                               m 3
+       paddq   %mm4, %mm6              C                               m 3
+       movd    20(%eax), %mm1          C                               m 3
+       movd    %mm6, 8(%edx)           C                               m 3
+       psrlq   $32, %mm6               C                               m 3
+       pmuludq %mm7, %mm1              C                               m 3
+       paddq   %mm3, %mm6              C                               m 3
+       movd    24(%eax), %mm4          C                               m 3
+       movd    %mm6, 12(%edx)          C                               m 3
+       psrlq   $32, %mm6               C                               m 3
+       lea     16(%eax), %eax          C                               m 3
+       lea     16(%edx), %edx          C                               m 3
+       add     $4, %ecx                C                               m 3
+       ja      L(lpm3)                 C                               m 3
+C Loop tail: finish the in-flight products, then join the shared
+C wind-down at L(x3) used by the addmul passes below.
+       pmuludq %mm7, %mm4              C                               m 3
+       paddq   %mm0, %mm6              C                               m 3
+       movd    %mm6, (%edx)            C                               m 3
+       psrlq   $32, %mm6               C                               m 3
+       paddq   %mm1, %mm6              C                               m 3
+       mov     16(%esp), %edi          C rp                              3
+       jmp     L(x3)
+
+C Outer (addmul) loop for the un == 3 (mod 4) case.  One iteration per
+C remaining multiplier limb: %mm7 is loaded with the next limb via %esi
+C (presumably the v-operand pointer from the prologue, outside this
+C hunk -- confirm there), rp (%edi) advances by one limb, and
+C up[] * limb is added into rp[].  %ebx counts outer iterations.
+L(olp3):
+       lea     4(%edi), %edi           C                               am 3
+       movd    (%esi), %mm7            C                               am 3
+       lea     4(%esi), %esi           C                               am 3
+       mov     %edi, %edx              C rp                            am 3
+       mov     20(%esp), %eax          C up                            am 3
+       movd    (%eax), %mm0            C                               am 3
+       mov     24(%esp), %ecx          C inner loop count              am 3
+       pxor    %mm6, %mm6              C                               am 3
+       pmuludq %mm7, %mm0              C                               am 3
+       movd    4(%eax), %mm1           C                               am 3
+       movd    (%edx), %mm4            C                               am 3
+       pmuludq %mm7, %mm1              C                               am 3
+       movd    8(%eax), %mm2           C                               am 3
+       paddq   %mm0, %mm4              C                               am 3
+       movd    4(%edx), %mm5           C                               am 3
+       jmp     L(lam3)                 C                               am 3
+       ALIGN(16)                       C                               am 3
+C Software-pipelined addmul inner loop: 4 limbs per pass (%eax and %edx
+C advance by 16 bytes; the negated count in %ecx steps by 4 up to zero).
+C %mm6 holds the running 64-bit accumulation: low 32 bits are stored to
+C rp, the high half shifted down (psrlq $32) as carry into the next limb.
+L(lam3):
+       pmuludq %mm7, %mm2              C                               am 3
+       paddq   %mm4, %mm6              C                               am 3
+       movd    12(%eax), %mm3          C                               am 3
+       paddq   %mm1, %mm5              C                               am 3
+       movd    8(%edx), %mm4           C                               am 3
+       movd    %mm6, (%edx)            C                               am 3
+       psrlq   $32, %mm6               C                               am 3
+       pmuludq %mm7, %mm3              C                               am 3
+       paddq   %mm5, %mm6              C                               am 3
+       movd    16(%eax), %mm0          C                               am 3
+       paddq   %mm2, %mm4              C                               am 3
+       movd    12(%edx), %mm5          C                               am 3
+       movd    %mm6, 4(%edx)           C                               am 3
+       psrlq   $32, %mm6               C                               am 3
+       pmuludq %mm7, %mm0              C                               am 3
+       paddq   %mm4, %mm6              C                               am 3
+       movd    20(%eax), %mm1          C                               am 3
+       paddq   %mm3, %mm5              C                               am 3
+       movd    16(%edx), %mm4          C                               am 3
+       movd    %mm6, 8(%edx)           C                               am 3
+       psrlq   $32, %mm6               C                               am 3
+       pmuludq %mm7, %mm1              C                               am 3
+       paddq   %mm5, %mm6              C                               am 3
+       movd    24(%eax), %mm2          C                               am 3
+       paddq   %mm0, %mm4              C                               am 3
+       movd    20(%edx), %mm5          C                               am 3
+       movd    %mm6, 12(%edx)          C                               am 3
+       psrlq   $32, %mm6               C                               am 3
+       lea     16(%eax), %eax          C                               am 3
+       lea     16(%edx), %edx          C                               am 3
+       add     $4, %ecx                C                               am 3
+       jnz     L(lam3)                 C                               am 3
+C Loop tail: drain the products still in flight from the pipeline.
+       pmuludq %mm7, %mm2              C                               am 3
+       paddq   %mm4, %mm6              C                               am 3
+       paddq   %mm1, %mm5              C                               am 3
+       movd    8(%edx), %mm4           C                               am 3
+       movd    %mm6, (%edx)            C                               am 3
+       psrlq   $32, %mm6               C                               am 3
+       paddq   %mm5, %mm6              C                               am 3
+       paddq   %mm2, %mm4              C                               am 3
+C Store the last limbs and the final carry of this pass.  The initial
+C mul pass (L(3) above) also joins here via jmp L(x3), sharing this
+C wind-down code.
+L(x3): movd    %mm6, 4(%edx)           C                               am 3
+       psrlq   $32, %mm6               C                               am 3
+       paddq   %mm4, %mm6              C                               am 3
+       movd    %mm6, 8(%edx)           C                               am 3
+       psrlq   $32, %mm6               C                               am 3
+       movd    %mm6, 12(%edx)          C                               am 3
+       dec     %ebx                    C                               am 3
+       jnz     L(olp3)                 C                               am 3
+C All multiplier limbs consumed: clear MMX state (required before any
+C x87 use by the caller) and restore callee-saved registers.
+L(oel3):
+       emms                            C                                  3
+       pop     %edi                    C                                  3
+       pop     %ebx                    C                                  3
+       pop     %esi                    C                                  3
+       ret                             C                                  3
+EPILOGUE()