--- /dev/null
+dnl AMD64 mpn_sqr_basecase.
+
+dnl Contributed to the GNU project by Torbjorn Granlund.
+
+dnl Copyright 2008, 2009 Free Software Foundation, Inc.
+
+dnl This file is part of the GNU MP Library.
+
+dnl The GNU MP Library is free software; you can redistribute it and/or modify
+dnl it under the terms of the GNU Lesser General Public License as published
+dnl by the Free Software Foundation; either version 3 of the License, or (at
+dnl your option) any later version.
+
+dnl The GNU MP Library is distributed in the hope that it will be useful, but
+dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
+dnl License for more details.
+
+dnl You should have received a copy of the GNU Lesser General Public License
+dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
+
+include(`../config.m4')
+
+C The inner loops of this code are the result of running a code generation and
+C optimization tool suite written by David Harvey and Torbjorn Granlund.
+
+C NOTES
+C * This code only handles operands up to SQR_KARATSUBA_THRESHOLD_MAX. That
+C means we can safely use 32-bit operations for all sizes, unlike in, e.g.,
+C mpn_addmul_1.
+C * The jump table could probably be optimized, at least for non-PIC.
+C * The special code for n=1,2,3 was written quickly. It is probably too
+C large and unnecessarily slow.
+C * Consider combining the small-n cases so that the code for n=k-1 jumps
+C into the middle of the code for n=k.
+C * Avoid saving and restoring callee-saved registers in the small-n cases.
+C * Needed variables:
+C n r11 input size
+C i r8 work left, initially n
+C j r9 inner loop count
+C r15 unused
+C v0 r13
+C v1 r14
+C rp rdi
+C up rsi
+C w0 rbx
+C w1 rcx
+C w2 rbp
+C w3 r10
+C tp r12
+C lo rax
+C hi rdx
+C rsp
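+C
+C As a rough orientation, the method is the classic basecase squaring
+C scheme, sketched here in C (illustrative only, assuming n >= 2; mpn_mul_1,
+C mpn_addmul_1, mpn_lshift and mpn_add_n are the public mpn calls, and
+C umul_ppmm is the longlong.h macro):
+C
+C   void sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
+C   {
+C     mp_limb_t tp[2 * SQR_KARATSUBA_THRESHOLD_MAX];
+C     mp_limb_t cy;
+C     mp_size_t i;
+C     /* cross products u[i]*u[j], i < j, into tp (offset one limb) */
+C     tp[n - 1] = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
+C     for (i = 2; i < n; i++)
+C       tp[n + i - 2] = mpn_addmul_1 (tp + 2*i - 2, up + i, n - i, up[i - 1]);
+C     /* diagonal squares u[i]^2 into rp */
+C     for (i = 0; i < n; i++)
+C       umul_ppmm (rp[2 * i + 1], rp[2 * i], up[i], up[i]);
+C     /* rp += 2 * tp */
+C     cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
+C     cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
+C     rp[2 * n - 1] += cy;
+C   }
+C
+C The code below fuses these steps: the cross products are built two
+C multiplier limbs per pass (the mul_2/addmul_2-style loops), and the
+C doubling is fused with the diagonal squares in the final
+C mpn_sqr_diag_addlsh1 pass.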
+
+C INPUT PARAMETERS
+define(`rp', `%rdi')
+define(`up', `%rsi')
+define(`n_param', `%rdx')
+
+C We should really trim this for better spatial locality. Alternatively, we
+C could use the upper part of the allocated stack area, so that any unused
+C limbs sit below the used ones instead of above them.
+define(`SQR_KARATSUBA_THRESHOLD_MAX', 120)
+define(`STACK_ALLOC', eval(8*2*SQR_KARATSUBA_THRESHOLD_MAX))
+
+define(`n', `%r11')
+define(`tp', `%r12')
+define(`i', `%r8')
+define(`j', `%r9')
+define(`v0', `%r13')
+define(`v1', `%r14')
+define(`w0', `%rbx')
+define(`w1', `%rcx')
+define(`w2', `%rbp')
+define(`w3', `%r10')
+
+define(`SPECIAL_CODE_FOR_4',1)
+
+
+ASM_START()
+ TEXT
+ ALIGN(16)
+
+PROLOGUE(mpn_sqr_basecase)
+ add $-48, %rsp
+ mov %rbx, 40(%rsp)
+ mov %rbp, 32(%rsp)
+ mov %r12, 24(%rsp)
+ mov %r13, 16(%rsp)
+ mov %r14, 8(%rsp)
+
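+C Dispatch on n: jump table slots 0-3 serve n <= 4 (index n mod 4, so n = 4
+C lands in slot 0), and slots 4-7 serve n > 4 (index 4 + n mod 4), selecting
+C the entry point matching n's residue class for the 4-way unrolled loops.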
+ mov R32(n_param), R32(n) C free original n register (rdx)
+ mov R32(n_param), R32(%rcx)
+ and $3, R32(%rcx)
+ lea 4(%rcx), %rbx
+ cmp $4, R32(n_param)
+ cmovg %rbx, %rcx
+ lea L(jmptab)(%rip), %rax
+ jmp *(%rax,%rcx,8)
+ JUMPTABSECT
+ ALIGN(8)
+L(jmptab):
+ .quad L(4)
+ .quad L(1)
+ .quad L(2)
+ .quad L(3)
+ .quad L(0m4)
+ .quad L(1m4)
+ .quad L(2m4)
+ .quad L(3m4)
+ TEXT
+
+L(1): mov (up), %rax
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ add $40, %rsp
+ pop %rbx
+ ret
+
+L(2): mov (up), %rax
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, %r9
+ mov 8(up), %rax
+ mul %rax
+ mov %rax, %r10
+ mov %rdx, %r11
+ mov 8(up), %rax
+ mov (up), %rbx
+ mul %rbx
+ add %rax, %r9
+ adc %rdx, %r10
+ adc $0, %r11
+ add %rax, %r9
+ mov %r9, 8(rp)
+ adc %rdx, %r10
+ mov %r10, 16(rp)
+ adc $0, %r11
+ mov %r11, 24(rp)
+ add $40, %rsp
+ pop %rbx
+ ret
+
+L(3): mov (up), %rax
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ mov 8(up), %rax
+ mul %rax
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ mov 16(up), %rax
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+
+ mov (up), %rbx
+ mov 8(up), %rax
+ mul %rbx
+ mov %rax, %r8
+ mov %rdx, %r9
+ mov 16(up), %rax
+ mul %rbx
+ xor R32(%r10), R32(%r10)
+ add %rax, %r9
+ adc %rdx, %r10
+
+ mov 8(up), %rbx
+ mov 16(up), %rax
+ mul %rbx
+ xor R32(%r11), R32(%r11)
+ add %rax, %r10
+ adc %rdx, %r11
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %r11, %r11
+ mov $0, R32(%rbx)
+ adc %rbx, %rbx
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %r11, 32(rp)
+ adc %rbx, 40(rp)
+ add $40, %rsp
+ pop %rbx
+ ret
+
+ifdef(`SPECIAL_CODE_FOR_4',`
+L(4): mov (up), %rax
+ mul %rax
+ mov %rax, (rp)
+ mov %rdx, 8(rp)
+ mov 8(up), %rax
+ mul %rax
+ mov %rax, 16(rp)
+ mov %rdx, 24(rp)
+ mov 16(up), %rax
+ mul %rax
+ mov %rax, 32(rp)
+ mov %rdx, 40(rp)
+ mov 24(up), %rax
+ mul %rax
+ mov %rax, 48(rp)
+ mov %rdx, 56(rp)
+
+ mov (up), %rbx
+ mov 8(up), %rax
+ mul %rbx
+ mov %rax, %r8
+ mov %rdx, %r9
+ mov 16(up), %rax
+ mul %rbx
+ xor R32(%r10), R32(%r10)
+ add %rax, %r9
+ adc %rdx, %r10
+ mov 24(up), %rax
+ mul %rbx
+ xor R32(%r11), R32(%r11)
+ add %rax, %r10
+ adc %rdx, %r11
+ mov 8(up), %rbx
+ mov 16(up), %rax
+ mul %rbx
+ xor R32(%r12), R32(%r12)
+ add %rax, %r10
+ adc %rdx, %r11
+ adc $0, %r12
+ mov 24(up), %rax
+ mul %rbx
+ add %rax, %r11
+ adc %rdx, %r12
+ mov 16(up), %rbx
+ mov 24(up), %rax
+ mul %rbx
+ xor R32(%rbp), R32(%rbp)
+ add %rax, %r12
+ adc %rdx, %rbp
+
+ add %r8, %r8
+ adc %r9, %r9
+ adc %r10, %r10
+ adc %r11, %r11
+ adc %r12, %r12
+ mov $0, R32(%rbx)
+ adc %rbp, %rbp
+ adc %rbx, %rbx
+ add %r8, 8(rp)
+ adc %r9, 16(rp)
+ adc %r10, 24(rp)
+ adc %r11, 32(rp)
+ adc %r12, 40(rp)
+ adc %rbp, 48(rp)
+ adc %rbx, 56(rp)
+ add $24, %rsp
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+')
+
+L(0m4): add $-STACK_ALLOC, %rsp
+ lea (%rsp,n,8), tp C point tp n limbs into the scratch area
+ lea (up,n,8), up C point up at end of input operand
+
+ lea -1(n), i
+C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
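+C (I.e., an i-limb by 1-limb product into tp[0..i], what mpn_mul_1 plus a
+C store of the returned carry would compute; "m3" marks the loop entry point
+C used when the count i = n-1 is 3 mod 4.)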
+ mov $-1, j
+ sub i, j
+
+ lea -24(tp), tp C offset FIXME
+
+ mov (up,j,8), v0
+ mov 8(up,j,8), %rax
+ mul v0
+ xor R32(w2), R32(w2)
+ mov %rax, w0
+ mov 16(up,j,8), %rax
+ mov %rdx, w3
+ jmp L(L3)
+
+ ALIGN(16)
+L(mul_1_m3_top):
+ add %rax, w2
+ mov w3, (tp,j,8)
+ mov (up,j,8), %rax
+ adc %rdx, w1
+ xor R32(w0), R32(w0)
+ mul v0
+ xor R32(w3), R32(w3)
+ mov w2, 8(tp,j,8)
+ add %rax, w1
+ adc %rdx, w0
+ mov 8(up,j,8), %rax
+ mov w1, 16(tp,j,8)
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov 16(up,j,8), %rax
+ adc %rdx, w3
+L(L3): xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w3
+ mov 24(up,j,8), %rax
+ adc %rdx, w2
+ mov w0, 24(tp,j,8)
+ mul v0
+ add $4, j
+ js L(mul_1_m3_top)
+
+ add %rax, w2
+ mov w3, (tp)
+ adc %rdx, w1
+ mov w2, 8(tp)
+ mov w1, 16(tp)
+ lea eval(24+2*8)(tp), tp C tp += 2, undo offset FIXME
+ifdef(`SPECIAL_CODE_FOR_4',`',`
+ cmp $3, R32(i)
+ je L(last)
+')
+ jmp L(dowhile)
+
+L(1m4): add $-STACK_ALLOC, %rsp
+ lea (%rsp,n,8), tp C point tp n limbs into the scratch area
+ lea (up,n,8), up C point up at end of input operand
+
+ lea (n), i
+C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
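+C (A mul_2-style loop: the operand is multiplied by the two limbs v0, v1 in
+C a single pass, halving the number of passes over the data; the "s"
+C variants arrange the first few limbs to fit the squaring layout. These
+C "Function" names are descriptive only, not public mpn entry points.)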
+ mov $3, R32(j)
+ sub i, j
+
+ lea 8(up), up C offset FIXME
+
+ mov -32(up,j,8), v0 C u0
+ mov -24(up,j,8), v1 C u1
+ mov -24(up,j,8), %rax C u1
+ mul v0 C u0 * u1
+ mov %rdx, w1
+ xor R32(w2), R32(w2)
+ mov %rax, -24(tp,j,8)
+ jmp L(m0)
+
+ ALIGN(16)
+L(mul_2_m0_top):
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov -24(up,j,8), %rax
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add %rax, w1
+ mov w0, -24(tp,j,8)
+ adc %rdx, w2
+L(m0): mov -16(up,j,8), %rax C u2, u6 ...
+ mul v0 C u0 * u2
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ adc $0, R32(w3)
+ mov $0, R32(w0)
+ mov w1, -16(tp,j,8)
+ mul v1
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ add %rax, w3
+ mov w2, -8(tp,j,8)
+ adc %rdx, w0
+ mov (up,j,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+ add $4, j
+ mov -32(up,j,8), %rax
+ mov w3, -32(tp,j,8)
+ js L(mul_2_m0_top)
+
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, -8(tp)
+ mov w1, (tp)
+
+ lea -8(up), up C undo offset FIXME
+ lea eval(3*8)(tp), tp C tp += 3
+ add $-2, R32(i) C i -= 2
+ cmp $3, R32(i)
+ je L(last)
+ jmp L(dowhile)
+
+
+
+L(2m4): add $-STACK_ALLOC, %rsp
+ lea (%rsp,n,8), tp C point tp n limbs into the scratch area
+ lea (up,n,8), up C point up at end of input operand
+
+ lea -1(n), i
+C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
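+C (An (i-1)-limb by 1-limb product into tp, analogous to the mul_1-style
+C loop in the 0m4 path but entered at a different residue class.)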
+ mov $1, R32(j)
+ sub i, j
+
+ lea -24(tp), tp C offset FIXME
+
+ mov -16(up,j,8), v0
+ mov -8(up,j,8), %rax
+ mul v0
+ mov %rax, w2
+ mov (up,j,8), %rax
+ mov %rdx, w1
+ jmp L(L1)
+
+ ALIGN(16)
+L(mul_1_m1_top):
+ add %rax, w2
+ mov w3, (tp,j,8)
+ mov (up,j,8), %rax
+ adc %rdx, w1
+L(L1): xor R32(w0), R32(w0)
+ mul v0
+ xor R32(w3), R32(w3)
+ mov w2, 8(tp,j,8)
+ add %rax, w1
+ adc %rdx, w0
+ mov 8(up,j,8), %rax
+ mov w1, 16(tp,j,8)
+ xor R32(w2), R32(w2)
+ mul v0
+ add %rax, w0
+ mov 16(up,j,8), %rax
+ adc %rdx, w3
+ xor R32(w1), R32(w1)
+ mul v0
+ add %rax, w3
+ mov 24(up,j,8), %rax
+ adc %rdx, w2
+ mov w0, 24(tp,j,8)
+ mul v0
+ add $4, j
+ js L(mul_1_m1_top)
+
+ add %rax, w2
+ mov w3, (tp)
+ adc %rdx, w1
+ mov w2, 8(tp)
+ mov w1, 16(tp)
+
+ lea eval(24+2*8)(tp), tp C tp += 2, undo offset FIXME
+ jmp L(dowhile_mid)
+
+
+
+L(3m4): add $-STACK_ALLOC, %rsp
+ lea (%rsp,n,8), tp C point tp n limbs into the scratch area
+ lea (up,n,8), up C point up at end of input operand
+
+ lea (n), i
+C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
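+C (The mul_2-style counterpart for this residue class; cf. the 1m4 path.)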
+ mov $1, R32(j)
+ sub i, j
+
+ lea 8(up), up C offset FIXME
+
+ mov -16(up,j,8), v0
+ mov -8(up,j,8), v1
+ mov -8(up,j,8), %rax
+ mul v0 C v0 * u0
+ mov %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ mov %rax, -8(tp,j,8)
+ jmp L(m2)
+
+ ALIGN(16)
+L(mul_2_m2_top):
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov -24(up,j,8), %rax
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov -24(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add %rax, w1
+ mov w0, -24(tp,j,8)
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ mul v0
+ mov $0, R32(w3)
+ add %rax, w1
+ adc %rdx, w2
+ mov -16(up,j,8), %rax
+ adc $0, R32(w3)
+ mov $0, R32(w0)
+ mov w1, -16(tp,j,8)
+ mul v1
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mul v0
+ add %rax, w2
+ mov -8(up,j,8), %rax
+ adc %rdx, w3
+ adc $0, R32(w0)
+ mul v1
+ add %rax, w3
+ mov w2, -8(tp,j,8)
+ adc %rdx, w0
+L(m2): mov (up,j,8), %rax
+ mul v0
+ add %rax, w3
+ adc %rdx, w0
+ adc $0, R32(w1)
+ add $4, j
+ mov -32(up,j,8), %rax
+ mov w3, -32(tp,j,8)
+ js L(mul_2_m2_top)
+
+ mul v1
+ add %rax, w0
+ adc %rdx, w1
+ mov w0, -8(tp)
+ mov w1, (tp)
+
+ lea -8(up), up C undo offset FIXME
+ lea eval(3*8)(tp), tp C tp += 3
+ add $-2, R32(i) C i -= 2
+ jmp L(dowhile_mid)
+
+L(dowhile):
+C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
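+C (An addmul_2-style loop: accumulates the product of the operand and the
+C two limbs v0, v1 into the limbs already stored at tp, i.e. roughly
+C tp[] += u[] * {v0, v1}, two multiplier limbs per pass.)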
+ mov $-1, j
+ sub i, j
+
+ lea -24(tp), tp C offset FIXME
+ lea -8(up), up C offset FIXME
+
+ mov 16(up,j,8), v0
+ mov 24(up,j,8), v1
+ mov 24(up,j,8), %rax
+ mul v0
+ xor R32(w3), R32(w3)
+ add %rax, 24(tp,j,8)
+ adc %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ jmp L(am2)
+
+ ALIGN(16)
+L(addmul_2_m2_top):
+ add w3, (tp,j,8)
+ adc %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add w0, 8(tp,j,8)
+ adc %rax, w1
+ adc %rdx, w2
+ mov 16(up,j,8), %rax
+ mov $0, R32(w3)
+ mul v0 C v0 * u1
+ add %rax, w1
+ mov 16(up,j,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mul v1 C v1 * u1
+ add w1, 16(tp,j,8)
+ adc %rax, w2
+ mov 24(up,j,8), %rax
+ adc %rdx, w3
+ mul v0
+ mov $0, R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mov 24(up,j,8), %rax
+ adc $0, R32(w0)
+ mul v1
+ add w2, 24(tp,j,8)
+ adc %rax, w3
+ adc %rdx, w0
+L(am2): mov 32(up,j,8), %rax
+ mul v0
+ add %rax, w3
+ mov 32(up,j,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add $4, j
+ js L(addmul_2_m2_top)
+
+ add w3, (tp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(tp)
+ mov w1, 16(tp)
+
+ lea eval(2*8)(tp), tp C tp += 2
+ add $-2, R32(i) C i -= 2
+
+ lea 24(tp), tp C undo offset FIXME
+ lea 8(up), up C undo offset FIXME
+
+L(dowhile_mid):
+C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
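+C (The same addmul_2-style accumulation as above, entered at the other
+C residue class.)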
+ mov $1, R32(j)
+ sub i, j
+
+ lea -24(tp), tp C offset FIXME
+ lea -8(up), up C offset FIXME
+
+ mov (up,j,8), v0
+ mov 8(up,j,8), v1
+ mov 8(up,j,8), %rax
+ mul v0
+ xor R32(w1), R32(w1)
+ add %rax, 8(tp,j,8)
+ adc %rdx, w1
+ xor R32(w2), R32(w2)
+ jmp L(20)
+
+ ALIGN(16)
+L(addmul_2_m0_top):
+ add w3, (tp,j,8)
+ adc %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ mov $0, R32(w2)
+ mul v0
+ add %rax, w0
+ mov 8(up,j,8), %rax
+ adc %rdx, w1
+ adc $0, R32(w2)
+ mul v1 C v1 * u0
+ add w0, 8(tp,j,8)
+ adc %rax, w1
+ adc %rdx, w2
+L(20): mov 16(up,j,8), %rax
+ mov $0, R32(w3)
+ mul v0 C v0 * u1
+ add %rax, w1
+ mov 16(up,j,8), %rax
+ adc %rdx, w2
+ adc $0, R32(w3)
+ mul v1 C v1 * u1
+ add w1, 16(tp,j,8)
+ adc %rax, w2
+ mov 24(up,j,8), %rax
+ adc %rdx, w3
+ mul v0
+ mov $0, R32(w0)
+ add %rax, w2
+ adc %rdx, w3
+ mov $0, R32(w1)
+ mov 24(up,j,8), %rax
+ adc $0, R32(w0)
+ mul v1
+ add w2, 24(tp,j,8)
+ adc %rax, w3
+ adc %rdx, w0
+ mov 32(up,j,8), %rax
+ mul v0
+ add %rax, w3
+ mov 32(up,j,8), %rax
+ adc %rdx, w0
+ adc $0, R32(w1)
+ mul v1
+ add $4, j
+ js L(addmul_2_m0_top)
+
+ add w3, (tp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, 8(tp)
+ mov w1, 16(tp)
+
+ lea 24(tp), tp C undo offset FIXME
+ lea 8(up), up C undo offset FIXME
+
+ lea eval(2*8)(tp), tp C tp += 2
+ add $-2, R32(i) C i -= 2
+
+ cmp $3, R32(i)
+ jne L(dowhile)
+
+L(last):
+
+C Function mpn_addmul_2s_2
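+C (Final fixed-size step: accumulates the remaining cross products among
+C the top three limbs into the top of tp.)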
+ mov -24(up), v0
+ mov -16(up), v1
+ mov -16(up), %rax
+ mul v0
+ xor R32(w3), R32(w3)
+ add %rax, -32(tp)
+ adc %rdx, w3
+ xor R32(w0), R32(w0)
+ xor R32(w1), R32(w1)
+ mov -8(up), %rax
+ mul v0
+ add %rax, w3
+ mov -8(up), %rax
+ adc %rdx, w0
+ mul v1
+ add w3, -24(tp)
+ adc %rax, w0
+ adc %rdx, w1
+ mov w0, -16(tp)
+ mov w1, -8(tp)
+
+C Function mpn_sqr_diag_addlsh1
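+C (One fused pass computing rp[] = 2*tp[] + {up[i]^2}: each tp limb is
+C doubled via adc chains, with the carry preserved across each mul in rbx
+C and rbp alternately, instead of a separate lshift and add.)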
+ mov R32(n), R32(j)
+ shl $3, n
+ sub n, up
+
+ mov (%rsp), %r11
+
+ bt $0, j
+ lea -4(j,j),j
+ jc L(odd)
+
+L(evn): lea (rp,j,8), rp
+ lea (up,j,4), up
+ lea 8(%rsp,j,8), tp
+ neg j
+
+ add %r11, %r11
+ sbb R32(%rbx), R32(%rbx) C save CF
+ mov (up,j,4), %rax
+ mul %rax
+ add %rdx, %r11
+ mov %rax, (rp,j,8)
+ jmp L(d0)
+
+L(odd): lea -16(rp,j,8), rp
+ lea -8(up,j,4), up
+ lea -8(%rsp,j,8), tp
+ neg j
+
+ add %r11, %r11
+ sbb R32(%rbp), R32(%rbp) C save CF
+ mov 8(up,j,4), %rax
+ mul %rax
+ add %rdx, %r11
+ mov %rax, 16(rp,j,8)
+ jmp L(d1)
+
+ ALIGN(16)
+L(top): mov (up,j,4), %rax
+ mul %rax
+ add R32(%rbp), R32(%rbp) C restore carry
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, (rp,j,8)
+L(d0): mov %r11, 8(rp,j,8)
+ mov (tp,j,8), %r10
+ adc %r10, %r10
+ mov 8(tp,j,8), %r11
+ adc %r11, %r11
+ nop
+ sbb R32(%rbp), R32(%rbp) C save CF
+ mov 8(up,j,4), %rax
+ mul %rax
+ add R32(%rbx), R32(%rbx) C restore carry
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, 16(rp,j,8)
+L(d1): mov %r11, 24(rp,j,8)
+ mov 16(tp,j,8), %r10
+ adc %r10, %r10
+ mov 24(tp,j,8), %r11
+ adc %r11, %r11
+ sbb R32(%rbx), R32(%rbx) C save CF
+ add $4, j
+ js L(top)
+
+L(end): mov (up,j,4), %rax
+ mul %rax
+ add R32(%rbp), R32(%rbp) C restore carry
+ adc %rax, %r10
+ adc %rdx, %r11
+ mov %r10, (rp,j,8)
+ mov %r11, 8(rp,j,8)
+ mov (tp,j,8), %r10
+ adc %r10, %r10
+ sbb R32(%rbp), R32(%rbp) C save CF
+ neg R32(%rbp)
+ mov 8(up,j,4), %rax
+ mul %rax
+ add R32(%rbx), R32(%rbx) C restore carry
+ adc %rax, %r10
+ adc %rbp, %rdx
+ mov %r10, 16(rp,j,8)
+ mov %rdx, 24(rp,j,8)
+
+ add $eval(8+STACK_ALLOC), %rsp
+ pop %r14
+ pop %r13
+ pop %r12
+ pop %rbp
+ pop %rbx
+ ret
+EPILOGUE()