dnl  AMD64 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2009 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimization tool suite written by David Harvey and Torbjorn Granlund.

C Contract: mpn_sqr_basecase(rp, up, n) computes the 2n-limb square
C   {rp, 2n} = {up, n}^2, for 1 <= n <= SQR_KARATSUBA_THRESHOLD_MAX.
C Strategy for n > 4 (and n = 4 without SPECIAL_CODE_FOR_4): accumulate the
C off-diagonal products u[i]*u[j] (i < j) into a scratch area on the stack
C using mul_1/mul_2 then addmul_2 passes, and finally combine them, doubled,
C with the diagonal squares u[i]^2 in the sqr_diag_addlsh1 phase at the end.

C NOTES
C   * This code only handles operands up to SQR_KARATSUBA_THRESHOLD_MAX.  That
C     means we can safely use 32-bit operations for all sizes, unlike in e.g.,
C     mpn_addmul_1.
C   * The jump table could probably be optimized, at least for non-pic.
C   * The special code for n=1,2,3 was quickly written.  It is probably too
C     large and unnecessarily slow.
C   * Consider combining small cases code so that the n=k-1 code jumps into
C     the middle of the n=k code.
C   * Avoid saving registers for small cases code.
C   * Needed variables:
C    n   r11  input size
C    i   r8   work left, initially n
C    j   r9   inner loop count
C        r15  unused
C    v0  r13
C    v1  r14
C    rp  rdi
C    up  rsi
C    w0  rbx
C    w1  rcx
C    w2  rbp
C    w3  r10
C    tp  r12
C    lo  rax
C    hi  rdx
C        rsp

C INPUT PARAMETERS
define(`rp',      `%rdi')
define(`up',      `%rsi')
define(`n_param', `%rdx')

C We should really trim this, for better spatial locality.  Alternatively,
C we could grab the upper part of the stack area, leaving the lower part
C instead of the upper part unused.
define(`SQR_KARATSUBA_THRESHOLD_MAX', 120)
define(`STACK_ALLOC', eval(8*2*SQR_KARATSUBA_THRESHOLD_MAX))

define(`n', `%r11')
define(`tp', `%r12')
define(`i', `%r8')
define(`j', `%r9')
define(`v0', `%r13')
define(`v1', `%r14')
define(`w0', `%rbx')
define(`w1', `%rcx')
define(`w2', `%rbp')
define(`w3', `%r10')

define(`SPECIAL_CODE_FOR_4',1)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sqr_basecase)
C Save the five callee-saved registers we use into a 48-byte stack frame.
C (Plain movs rather than pushes; the small cases restore with partial
C add-to-rsp + pop sequences matching exactly the registers they touched.)
	add	$-48, %rsp
	mov	%rbx, 40(%rsp)
	mov	%rbp, 32(%rsp)
	mov	%r12, 24(%rsp)
	mov	%r13, 16(%rsp)
	mov	%r14, 8(%rsp)

C Dispatch: for n <= 4 use table slot n mod 4 (so n=4 -> slot 0 = L(4),
C n=1,2,3 -> slots 1,2,3); for n > 4 use slot 4 + (n mod 4), selecting the
C L(0m4)..L(3m4) entry code matching n's residue class mod 4.
	mov	R32(n_param), R32(n)	C free original n register (rdx)
	mov	R32(n_param), R32(%rcx)
	and	$3, R32(%rcx)
	lea	4(%rcx), %rbx
	cmp	$4, R32(n_param)
	cmovg	%rbx, %rcx
	lea	L(jmptab)(%rip), %rax
	jmp	*(%rax,%rcx,8)
	JUMPTABSECT
	ALIGN(8)
L(jmptab):
	.quad	L(4)
	.quad	L(1)
	.quad	L(2)
	.quad	L(3)
	.quad	L(0m4)
	.quad	L(1m4)
	.quad	L(2m4)
	.quad	L(3m4)
	TEXT

C n = 1: rp[1..0] = up[0]^2.  Only rbx was clobbered-saved; restore just it.
L(1):	mov	(up), %rax
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	add	$40, %rsp
	pop	%rbx
	ret

C n = 2: two diagonal squares, then add the cross product up[0]*up[1] twice.
L(2):	mov	(up), %rax
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, %r9
	mov	8(up), %rax
	mul	%rax
	mov	%rax, %r10
	mov	%rdx, %r11
	mov	8(up), %rax
	mov	(up), %rbx
	mul	%rbx
	add	%rax, %r9		C add cross product once...
	adc	%rdx, %r10
	adc	$0, %r11
	add	%rax, %r9		C ...and once more (doubling)
	mov	%r9, 8(rp)
	adc	%rdx, %r10
	mov	%r10, 16(rp)
	adc	$0, %r11
	mov	%r11, 24(rp)
	add	$40, %rsp
	pop	%rbx
	ret

C n = 3: write the three diagonal squares to rp, accumulate the three cross
C products in r8..r11, double them (shift left 1 via the adc chain), then
C add into rp[1..5].
L(3):	mov	(up), %rax
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	mov	8(up), %rax
	mul	%rax
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	mov	16(up), %rax
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)

	mov	(up), %rbx
	mov	8(up), %rax
	mul	%rbx			C u0 * u1
	mov	%rax, %r8
	mov	%rdx, %r9
	mov	16(up), %rax
	mul	%rbx			C u0 * u2
	xor	R32(%r10), R32(%r10)
	add	%rax, %r9
	adc	%rdx, %r10
	mov	8(up), %rbx
	mov	16(up), %rax
	mul	%rbx			C u1 * u2
	xor	R32(%r11), R32(%r11)
	add	%rax, %r10
	adc	%rdx, %r11
C Double the off-diagonal sum; rbx catches the final shifted-out carry.
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%r11, %r11
	mov	$0, R32(%rbx)
	adc	%rbx, %rbx
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%r11, 32(rp)
	adc	%rbx, 40(rp)
	add	$40, %rsp
	pop	%rbx
	ret

ifdef(`SPECIAL_CODE_FOR_4',`
C n = 4 (optional straight-line code): same scheme as n = 3 with four
C diagonal squares and six cross products held in r8..r12, rbp.
L(4):	mov	(up), %rax
	mul	%rax
	mov	%rax, (rp)
	mov	%rdx, 8(rp)
	mov	8(up), %rax
	mul	%rax
	mov	%rax, 16(rp)
	mov	%rdx, 24(rp)
	mov	16(up), %rax
	mul	%rax
	mov	%rax, 32(rp)
	mov	%rdx, 40(rp)
	mov	24(up), %rax
	mul	%rax
	mov	%rax, 48(rp)
	mov	%rdx, 56(rp)

	mov	(up), %rbx
	mov	8(up), %rax
	mul	%rbx			C u0 * u1
	mov	%rax, %r8
	mov	%rdx, %r9
	mov	16(up), %rax
	mul	%rbx			C u0 * u2
	xor	R32(%r10), R32(%r10)
	add	%rax, %r9
	adc	%rdx, %r10
	mov	24(up), %rax
	mul	%rbx			C u0 * u3
	xor	R32(%r11), R32(%r11)
	add	%rax, %r10
	adc	%rdx, %r11
	mov	8(up), %rbx
	mov	16(up), %rax
	mul	%rbx			C u1 * u2
	xor	R32(%r12), R32(%r12)
	add	%rax, %r10
	adc	%rdx, %r11
	adc	$0, %r12
	mov	24(up), %rax
	mul	%rbx			C u1 * u3
	add	%rax, %r11
	adc	%rdx, %r12
	mov	16(up), %rbx
	mov	24(up), %rax
	mul	%rbx			C u2 * u3
	xor	R32(%rbp), R32(%rbp)
	add	%rax, %r12
	adc	%rdx, %rbp
C Double the off-diagonal sum; rbx catches the final shifted-out carry.
	add	%r8, %r8
	adc	%r9, %r9
	adc	%r10, %r10
	adc	%r11, %r11
	adc	%r12, %r12
	mov	$0, R32(%rbx)
	adc	%rbp, %rbp
	adc	%rbx, %rbx
	add	%r8, 8(rp)
	adc	%r9, 16(rp)
	adc	%r10, 24(rp)
	adc	%r11, 32(rp)
	adc	%r12, 40(rp)
	adc	%rbp, 48(rp)
	adc	%rbx, 56(rp)
	add	$24, %rsp
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
')

C ----- General case, n > 4, n == 0 (mod 4) -----
C First pass: multiply the low limb into the scratch area with a 4-way
C unrolled mul_1 loop entered at the residue-appropriate point L(L3).
L(0m4):
	add	$-STACK_ALLOC, %rsp
	lea	(%rsp,n,8), tp		C point tp in middle of result operand
	lea	(up,n,8), up		C point up at end of input operand
	lea	-1(n), i

C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
	mov	$-1, j
	sub	i, j
	lea	-24(tp), tp		C offset FIXME
	mov	(up,j,8), v0
	mov	8(up,j,8), %rax
	mul	v0
	xor	R32(w2), R32(w2)
	mov	%rax, w0
	mov	16(up,j,8), %rax
	mov	%rdx, w3
	jmp	L(L3)

	ALIGN(16)
L(mul_1_m3_top):
	add	%rax, w2
	mov	w3, (tp,j,8)
	mov	(up,j,8), %rax
	adc	%rdx, w1
	xor	R32(w0), R32(w0)
	mul	v0
	xor	R32(w3), R32(w3)
	mov	w2, 8(tp,j,8)
	add	%rax, w1
	adc	%rdx, w0
	mov	8(up,j,8), %rax
	mov	w1, 16(tp,j,8)
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	16(up,j,8), %rax
	adc	%rdx, w3
L(L3):	xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w3
	mov	24(up,j,8), %rax
	adc	%rdx, w2
	mov	w0, 24(tp,j,8)
	mul	v0
	add	$4, j
	js	L(mul_1_m3_top)

C Loop wind-down: store the three pending window limbs.
	add	%rax, w2
	mov	w3, (tp)
	adc	%rdx, w1
	mov	w2, 8(tp)
	mov	w1, 16(tp)
	lea	eval(24+2*8)(tp), tp	C tp += 2, undo offset FIXME
ifdef(`SPECIAL_CODE_FOR_4',`',`
	cmp	$3, R32(i)
	je	L(last)
')
	jmp	L(dowhile)

C ----- General case, n == 1 (mod 4) -----
C First pass: multiply the two low limbs (v0, v1) into scratch with a
C 4-way unrolled mul_2 loop entered at L(m0).
L(1m4):
	add	$-STACK_ALLOC, %rsp
	lea	(%rsp,n,8), tp		C point tp in middle of result operand
	lea	(up,n,8), up		C point up at end of input operand
	lea	(n), i

C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
	mov	$3, R32(j)
	sub	i, j
	lea	8(up), up		C offset FIXME
	mov	-32(up,j,8), v0		C u0
	mov	-24(up,j,8), v1		C u1
	mov	-24(up,j,8), %rax	C u1
	mul	v0			C u0 * u1
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	mov	%rax, -24(tp,j,8)
	jmp	L(m0)

	ALIGN(16)
L(mul_2_m0_top):
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	%rax, w1
	mov	w0, -24(tp,j,8)
	adc	%rdx, w2
L(m0):	mov	-16(up,j,8), %rax	C u2, u6 ...
	mul	v0			C u0 * u2
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(tp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, -8(tp,j,8)
	adc	%rdx, w0
	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	add	$4, j
	mov	-32(up,j,8), %rax
	mov	w3, -32(tp,j,8)
	js	L(mul_2_m0_top)

C Loop wind-down: final v1 product and the two pending window limbs.
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, -8(tp)
	mov	w1, (tp)
	lea	-8(up), up		C undo offset FIXME
	lea	eval(3*8)(tp), tp	C tp += 3
	add	$-2, R32(i)		C i -= 2
	cmp	$3, R32(i)
	je	L(last)
	jmp	L(dowhile)

C ----- General case, n == 2 (mod 4) -----
C First pass: mul_1 variant entered at L(L1).
L(2m4):
	add	$-STACK_ALLOC, %rsp
	lea	(%rsp,n,8), tp		C point tp in middle of result operand
	lea	(up,n,8), up		C point up at end of input operand
	lea	-1(n), i

C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
	mov	$1, R32(j)
	sub	i, j
	lea	-24(tp), tp		C offset FIXME
	mov	-16(up,j,8), v0
	mov	-8(up,j,8), %rax
	mul	v0
	mov	%rax, w2
	mov	(up,j,8), %rax
	mov	%rdx, w1
	jmp	L(L1)

	ALIGN(16)
L(mul_1_m1_top):
	add	%rax, w2
	mov	w3, (tp,j,8)
	mov	(up,j,8), %rax
	adc	%rdx, w1
L(L1):	xor	R32(w0), R32(w0)
	mul	v0
	xor	R32(w3), R32(w3)
	mov	w2, 8(tp,j,8)
	add	%rax, w1
	adc	%rdx, w0
	mov	8(up,j,8), %rax
	mov	w1, 16(tp,j,8)
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	16(up,j,8), %rax
	adc	%rdx, w3
	xor	R32(w1), R32(w1)
	mul	v0
	add	%rax, w3
	mov	24(up,j,8), %rax
	adc	%rdx, w2
	mov	w0, 24(tp,j,8)
	mul	v0
	add	$4, j
	js	L(mul_1_m1_top)

C Loop wind-down: store the three pending window limbs.
	add	%rax, w2
	mov	w3, (tp)
	adc	%rdx, w1
	mov	w2, 8(tp)
	mov	w1, 16(tp)
	lea	eval(24+2*8)(tp), tp	C tp += 2, undo offset FIXME
	jmp	L(dowhile_mid)

C ----- General case, n == 3 (mod 4) -----
C First pass: mul_2 variant entered at L(m2).
L(3m4):
	add	$-STACK_ALLOC, %rsp
	lea	(%rsp,n,8), tp		C point tp in middle of result operand
	lea	(up,n,8), up		C point up at end of input operand
	lea	(n), i

C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
	mov	$1, R32(j)
	sub	i, j
	lea	8(up), up		C offset FIXME
	mov	-16(up,j,8), v0
	mov	-8(up,j,8), v1
	mov	-8(up,j,8), %rax
	mul	v0			C v0 * u0
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	%rax, -8(tp,j,8)
	jmp	L(m2)

	ALIGN(16)
L(mul_2_m2_top):
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	%rax, w1
	mov	w0, -24(tp,j,8)
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
	mov	$0, R32(w0)
	mov	w1, -16(tp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	$0, R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, -8(tp,j,8)
	adc	%rdx, w0
L(m2):	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
	add	$4, j
	mov	-32(up,j,8), %rax
	mov	w3, -32(tp,j,8)
	js	L(mul_2_m2_top)

C Loop wind-down: final v1 product and the two pending window limbs.
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	w0, -8(tp)
	mov	w1, (tp)
	lea	-8(up), up		C undo offset FIXME
	lea	eval(3*8)(tp), tp	C tp += 3
	add	$-2, R32(i)		C i -= 2
	jmp	L(dowhile_mid)

C ----- Main accumulation: alternate addmul_2 passes until i == 3 -----
C Each pass folds two more source limbs (v0, v1) into the scratch area,
C shrinking the remaining work i by 2 per pass.
L(dowhile):
C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
	mov	$-1, j
	sub	i, j
	lea	-24(tp), tp		C offset FIXME
	lea	-8(up), up		C offset FIXME
	mov	16(up,j,8), v0
	mov	24(up,j,8), v1
	mov	24(up,j,8), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, 24(tp,j,8)
	adc	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	jmp	L(am2)

	ALIGN(16)
L(addmul_2_m2_top):
	add	w3, (tp,j,8)
	adc	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	w0, 8(tp,j,8)
	adc	%rax, w1
	adc	%rdx, w2
	mov	16(up,j,8), %rax
	mov	$0, R32(w3)
	mul	v0			C v0 * u1
	add	%rax, w1
	mov	16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1			C v1 * u1
	add	w1, 16(tp,j,8)
	adc	%rax, w2
	mov	24(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	24(up,j,8), %rax
	adc	$0, R32(w0)
	mul	v1
	add	w2, 24(tp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
L(am2):	mov	32(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	32(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	$4, j
	js	L(addmul_2_m2_top)

C Loop wind-down: fold the last products and pending window limbs.
	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)
	lea	eval(2*8)(tp), tp	C tp += 2
	add	$-2, R32(i)		C i -= 2
	lea	24(tp), tp		C undo offset FIXME
	lea	8(up), up		C undo offset FIXME

L(dowhile_mid):
C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
	mov	$1, R32(j)
	sub	i, j
	lea	-24(tp), tp		C offset FIXME
	lea	-8(up), up		C offset FIXME
	mov	(up,j,8), v0
	mov	8(up,j,8), v1
	mov	8(up,j,8), %rax
	mul	v0
	xor	R32(w1), R32(w1)
	add	%rax, 8(tp,j,8)
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	jmp	L(20)

	ALIGN(16)
L(addmul_2_m0_top):
	add	w3, (tp,j,8)
	adc	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	mov	$0, R32(w2)
	mul	v0
	add	%rax, w0
	mov	8(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1			C v1 * u0
	add	w0, 8(tp,j,8)
	adc	%rax, w1
	adc	%rdx, w2
L(20):	mov	16(up,j,8), %rax
	mov	$0, R32(w3)
	mul	v0			C v0 * u1
	add	%rax, w1
	mov	16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
	mul	v1			C v1 * u1
	add	w1, 16(tp,j,8)
	adc	%rax, w2
	mov	24(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	mov	$0, R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)
	mov	24(up,j,8), %rax
	adc	$0, R32(w0)
	mul	v1
	add	w2, 24(tp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	32(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	32(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
	mul	v1
	add	$4, j
	js	L(addmul_2_m0_top)

C Loop wind-down, then continue the do-while until only 3 limbs remain.
	add	w3, (tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, 8(tp)
	mov	w1, 16(tp)
	lea	24(tp), tp		C undo offset FIXME
	lea	8(up), up		C undo offset FIXME
	lea	eval(2*8)(tp), tp	C tp += 2
	add	$-2, R32(i)		C i -= 2
	cmp	$3, R32(i)
	jne	L(dowhile)

C ----- Final fixed-size addmul for the last 3 limbs -----
L(last):
C Function mpn_addmul_2s_2
	mov	-24(up), v0
	mov	-16(up), v1
	mov	-16(up), %rax
	mul	v0
	xor	R32(w3), R32(w3)
	add	%rax, -32(tp)
	adc	%rdx, w3
	xor	R32(w0), R32(w0)
	xor	R32(w1), R32(w1)
	mov	-8(up), %rax
	mul	v0
	add	%rax, w3
	mov	-8(up), %rax
	adc	%rdx, w0
	mul	v1
	add	w3, -24(tp)
	adc	%rax, w0
	adc	%rdx, w1
	mov	w0, -16(tp)
	mov	w1, -8(tp)

C ----- Combine: rp = diagonal squares + 2 * scratch (off-diagonal sums) -----
C Function mpn_sqr_diag_addlsh1
C Two interleaved carry chains are kept live across iterations: the
C doubling (left-shift-by-1) carry and the addition carry.  Each is
C parked in a register via sbb (reg = -CF) and restored with
C "add r32,r32" (which sets CF back from the sign bit).
	mov	R32(n), R32(j)
	shl	$3, n
	sub	n, up
	mov	(%rsp), %r11		C first scratch limb (NOTE(review): assumes lowest scratch limb sits at (%rsp))
	bt	$0, j			C parity of n selects entry point
	lea	-4(j,j),j		C lea preserves flags; CF from bt survives
	jc	L(odd)

L(evn):	lea	(rp,j,8), rp
	lea	(up,j,4), up
	lea	8(%rsp,j,8), tp
	neg	j
	add	%r11, %r11
	sbb	R32(%rbx), R32(%rbx)	C save CF
	mov	(up,j,4), %rax
	mul	%rax
	add	%rdx, %r11
	mov	%rax, (rp,j,8)
	jmp	L(d0)

L(odd):	lea	-16(rp,j,8), rp
	lea	-8(up,j,4), up
	lea	-8(%rsp,j,8), tp
	neg	j
	add	%r11, %r11
	sbb	R32(%rbp), R32(%rbp)	C save CF
	mov	8(up,j,4), %rax
	mul	%rax
	add	%rdx, %r11
	mov	%rax, 16(rp,j,8)
	jmp	L(d1)

	ALIGN(16)
L(top):	mov	(up,j,4), %rax
	mul	%rax
	add	R32(%rbp), R32(%rbp)	C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, (rp,j,8)
L(d0):	mov	%r11, 8(rp,j,8)
	mov	(tp,j,8), %r10
	adc	%r10, %r10		C double scratch limb, consuming carry
	mov	8(tp,j,8), %r11
	adc	%r11, %r11
	nop				C alignment/scheduling filler
	sbb	R32(%rbp), R32(%rbp)	C save CF
	mov	8(up,j,4), %rax
	mul	%rax
	add	R32(%rbx), R32(%rbx)	C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, 16(rp,j,8)
L(d1):	mov	%r11, 24(rp,j,8)
	mov	16(tp,j,8), %r10
	adc	%r10, %r10
	mov	24(tp,j,8), %r11
	adc	%r11, %r11
	sbb	R32(%rbx), R32(%rbx)	C save CF
	add	$4, j
	js	L(top)

C Final two result limbs; fold the last parked carry into the top limb.
L(end):	mov	(up,j,4), %rax
	mul	%rax
	add	R32(%rbp), R32(%rbp)	C restore carry
	adc	%rax, %r10
	adc	%rdx, %r11
	mov	%r10, (rp,j,8)
	mov	%r11, 8(rp,j,8)
	mov	(tp,j,8), %r10
	adc	%r10, %r10
	sbb	R32(%rbp), R32(%rbp)	C save CF
	neg	R32(%rbp)		C rbp = CF as 0/1
	mov	8(up,j,4), %rax
	mul	%rax
	add	R32(%rbx), R32(%rbx)	C restore carry
	adc	%rax, %r10
	adc	%rbp, %rdx
	mov	%r10, 16(rp,j,8)
	mov	%rdx, 24(rp,j,8)

C Release scratch + the 8 bytes above the r14 save slot, then restore
C callee-saved registers (saved with movs, popped in matching order).
	add	$eval(8+STACK_ALLOC), %rsp
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
EPILOGUE()