dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number. dnl Copyright 2007, 2008 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. dnl The GNU MP Library is free software; you can redistribute it and/or modify dnl it under the terms of the GNU Lesser General Public License as published dnl by the Free Software Foundation; either version 3 of the License, or (at dnl your option) any later version. dnl The GNU MP Library is distributed in the hope that it will be useful, but dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public dnl License for more details. dnl You should have received a copy of the GNU Lesser General Public License dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. include(`../config.m4') C norm frac C K8 20 20 C P4 73 73 C P6-15 37 37 C TODO C * Perhaps compute the inverse without relying on divq? Could either use C Newton's method and mulq, or perhaps the faster fdiv. C * The loop has not been carefully tuned, nor analysed for critical path C length. It seems that 20 c/l is a bit long, compared to the 13 c/l for C mpn_divrem_1. C * Clean up. This code is really crude. 
C mpn_divrem_2 (qp, fn, up_param, un_param, dp)
C
C Divides the un_param limbs at up_param, extended below by fn zero limbs,
C by the two-limb divisor at dp (d0 = dp[0], d1 = dp[1]).
C
C In:    rdi = qp, rsi = fn, rdx = up_param, rcx = un_param, r8 = dp
C        (see the define list below; the code uses the raw register names).
C        The div used to form the reciprocal needs ~d1 < d1, that is, the
C        top bit of d1 set; otherwise the div faults.
C Out:   rax = 0 or 1, the most significant quotient limb ("msl" in the
C        register map further down).
C        un + fn - 2 quotient limbs are stored at qp[0 .. un+fn-3].
C        The two remainder limbs are stored at 8(%r12) and 16(%r12) on exit;
C        with un >= 2 and fn >= 0 this works out to up[0] and up[1].
C Saved: rbx, rbp, r12, r13, r14, r15 are pushed on entry and popped on exit
C        (48 bytes of stack).  No calls are made.
C Clobb: rax, rcx, rdx, rsi, rdi, r8, r9, r10, r11, flags.
C
C Note: dinv is defined as %r9 but no incoming value is read from r9; it is
C loaded from memory before its first use.
C
C The body of the division loop exists in two variants selected with m4
C ifdef on NEW; the second (default) variant is used when NEW is undefined.

C INPUT PARAMETERS
define(`qp',		`%rdi')
define(`fn',		`%rsi')
define(`up_param',	`%rdx')
define(`un_param',	`%rcx')
define(`dp',		`%r8')

define(`dinv',		`%r9')


C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
C         cnt         qp      d   dinv

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_divrem_2)

	C Save callee-saved registers, interleaved with argument set-up.
	push	%r15
	lea	(%rdx,%rcx,8), %rax	C rax = up + 8*un, one past the top limb
	push	%r14
	push	%r13
	mov	%rsi, %r13		C r13 = fn
	push	%r12
	lea	-24(%rax), %r12		C r12 = address of up[un-3], next limb to read
	push	%rbp
	mov	%rdi, %rbp		C rbp = qp
	push	%rbx
	mov	8(%r8), %r11		C r11 = d1 = dp[1]
	mov	-8(%rax), %r9		C r9  = up[un-1], high remainder limb
	mov	(%r8), %r8		C r8  = d0 = dp[0]; dp itself is no longer needed
	mov	-16(%rax), %r10		C r10 = up[un-2], low remainder limb

	C Decide the most significant quotient limb: 1 if the top two limbs
	C r9:r10 are >= d1:d0, else 0.
	xor	R32(%r15), R32(%r15)	C r15 = 0, becomes the return value
	cmp	%r9, %r11
	ja	L(2)			C d1 > r9: no initial subtraction
	setb	%dl			C dl = (d1 < r9)
	cmp	%r10, %r8
	setbe	%al			C al = (d0 <= r10)
	or	%al, %dl
	jne	L(23)			C r9:r10 >= d1:d0: subtract once, r15 = 1
L(2):
	lea	-3(%rcx,%r13), %rbx	C un + fn - 3
	test	%rbx, %rbx
	js	L(6)			C negative: no loop iterations, just store

	C Form a reciprocal estimate from d1 with a single div.
	mov	%r11, %rdx
	mov	$-1, %rax
	not	%rdx			C rdx:rax = (~d1)*2^64 + (2^64 - 1)
	div	%r11			C rax = quotient of that by d1
	mov	%r11, %rdx
	mov	%rax, %rdi		C rdi = estimate
	imul	%rax, %rdx
	mov	%rdx, %r14		C r14 = low limb of d1 * estimate
	mul	%r8			C rdx:rax = d0 * estimate
	mov	%rdx, %rcx		C rcx = high limb of d0 * estimate
	C Accumulate d0 and that high limb into r14, with rdx (starting at -1)
	C collecting the carries.  While rdx is non-negative the estimate is
	C decremented and d1 subtracted again.
	C NOTE(review): this looks like the adjustment of the one-limb
	C reciprocal for the low divisor limb d0 -- confirm against the
	C reference algorithm.
	mov	$-1, %rdx
	add	%r8, %r14
	adc	$0, %rdx
	add	%rcx, %r14
	adc	$0, %rdx
	js	L(8)			C no carry occurred: keep the estimate
L(18):
	dec	%rdi
	sub	%r11, %r14
	sbb	$0, %rdx
	jns	L(18)

L(8):

C Register roles inside the default (non-NEW) loop:
C rax rbx rcx rdx rsi rdi rbp r8  r9  r10 r11 r12 r13 r14 r15
C n2      un      n1  dinv qp d0          d1  up  fn      msl
C
C Register roles inside the NEW loop, as read from the code:
C   rbx = high remainder limb, r14 = low remainder limb, rcx = counter,
C   rsi = -d1, r9 = dinv, rdi = q (rewritten each pass), r10 = q0 (rewritten
C   each pass), rbp = qp, r8 = d0, r11 = d1, r12 = up, r13 = fn, r15 = msl.

ifdef(`NEW',`
	lea	(%rbp,%rbx,8), %rbp	C rbp = address of qp[un+fn-3]
	mov	%rbx, %rcx		C un
	mov	%r9, %rbx		C rbx = high remainder limb
	mov	%rdi, %r9		C di
	mov	%r10, %r14		C r14 = low remainder limb
	mov	%r11, %rsi
	neg	%rsi			C -d1
	ALIGN(16)
L(loop):
	C Quotient estimate: rdx:rax = di * high limb, plus high:low remainder.
	mov	%r9, %rax		C di		ncp
	mul	%rbx			C		0, 18
	add	%r14, %rax		C		4
	mov	%rax, %r10		C q0		5
	adc	%rbx, %rdx		C		5
	mov	%rdx, %rdi		C q		6
	imul	%rsi, %rdx		C		6
	mov	%r8, %rax		C		ncp
	lea	(%rdx, %r14), %rbx	C n1 -= ...	7
	mul	%rdi			C		7
	C Fetch the next dividend limb, or use zero once rcx < fn.
	xor	R32(%r14), R32(%r14)	C r14 = 0
	cmp	%rcx, %r13		C
	jg	L(19)			C fn > rcx: keep the zero limb
	mov	(%r12), %r14		C
	sub	$8, %r12		C
L(19):	sub	%r8, %r14		C		ncp
	sbb	%r11, %rbx		C		9
	sub	%rax, %r14		C		11
	sbb	%rdx, %rbx		C		12
	inc	%rdi			C		7
	C Build a mask in rdx: all ones when rbx >= q0, zero otherwise.  With
	C the mask set, q is decremented and d1:d0 is added back.
	xor	R32(%rdx), R32(%rdx)	C
	cmp	%r10, %rbx		C		13
	mov	%r8, %rax		C d0		ncp
	adc	$-1, %rdx		C mask		14
	add	%rdx, %rdi		C q--		15
	and	%rdx, %rax		C d0 or 0	15
	and	%r11, %rdx		C d1 or 0	15
	add	%rax, %r14		C		16
	adc	%rdx, %rbx		C		16
	cmp	%r11, %rbx		C		17
	jae	L(fix)			C high remainder >= d1: check further
L(bck):	mov	%rdi, (%rbp)		C store quotient limb
	sub	$8, %rbp		C step qp down one limb
	dec	%rcx
	jns	L(loop)

	mov	%r14, %r10		C low remainder limb for the common exit
	mov	%rbx, %r9		C high remainder limb for the common exit
',`
	lea	(%rbp,%rbx,8), %rbp	C rbp = address of qp[un+fn-3]
	mov	%rbx, %rcx		C rcx = counter, un+fn-3 down to 0
	mov	%r9, %rax		C rax = high remainder limb
	mov	%r10, %rsi		C rsi = low remainder limb
	ALIGN(16)
L(loop):
	C Quotient estimate: rdx:rax = dinv * high limb, plus high:low
	C remainder.  rbx keeps the low half (q0), rdx the candidate q.
	mov	%rax, %r14		C		0, 19
	mul	%rdi			C		0
	mov	%r11, %r9		C		1
	add	%rsi, %rax		C		4
	mov	%rax, %rbx		C q0		5
	adc	%r14, %rdx		C q		5
	lea	1(%rdx), %r10		C r10 = q + 1	6
	mov	%rdx, %rax		C		6
	imul	%rdx, %r9		C r9 = low limb of d1 * q	6
	sub	%r9, %rsi		C		10
	C Fetch the next dividend limb, or use zero once rcx < fn.
	xor	R32(%r9), R32(%r9)	C r9 = 0
	mul	%r8			C rdx:rax = q * d0	7
	cmp	%rcx, %r13		C
	jg	L(13)			C fn > rcx: keep the zero limb
	mov	(%r12), %r9		C
	sub	$8, %r12		C
	C Subtract d1:d0 and q * d0 from rsi:r9.
L(13):	sub	%r8, %r9		C		ncp
	sbb	%r11, %rsi		C		11
	sub	%rax, %r9		C		11
	sbb	%rdx, %rsi		C		12
	C Build a mask in rax: all ones when rsi >= q0, zero otherwise.  With
	C the mask set, r10 drops from q + 1 to q and d1:d0 is added back.
	cmp	%rbx, %rsi		C		13
	sbb	%rax, %rax		C		14
	not	%rax			C		15
	add	%rax, %r10		C		16
	mov	%r8, %rbx		C		ncp
	and	%rax, %rbx		C d0 or 0	16
	and	%r11, %rax		C d1 or 0	16
	add	%rbx, %r9		C		17
	adc	%rsi, %rax		C rax = new high remainder limb	18
	cmp	%rax, %r11		C		19
	jbe	L(fix)			C d1 <= high remainder: check further
L(bck):	mov	%r10, (%rbp)		C store quotient limb
	sub	$8, %rbp		C step qp down one limb
	mov	%r9, %rsi		C rsi = new low remainder limb	18
	dec	%rcx
	jns	L(loop)

	mov	%rsi, %r10		C low remainder limb for the common exit
	mov	%rax, %r9		C high remainder limb for the common exit
')

	C Common exit: write the two remainder limbs just above the last limb
	C position r12 stepped past, restore registers, return r15.
L(6):
	mov	%r10, 8(%r12)
	mov	%r9, 16(%r12)
	pop	%rbx
	pop	%rbp
	pop	%r12
	pop	%r13
	pop	%r14
	mov	%r15, %rax
	pop	%r15
	ret

	C Top two dividend limbs were >= the divisor: record a most
	C significant quotient limb of 1 and subtract the divisor once.
L(23):	inc	R32(%r15)
	sub	%r8, %r10
	sbb	%r11, %r9
	jmp	L(2)

	C Out-of-line fix-up, entered with the flags of the compare of the high
	C remainder limb against d1 still live.  If the remainder is still
	C >= d1:d0, bump the quotient limb and subtract the divisor once more.
ifdef(`NEW',`
L(fix):	seta	%dl			C dl = (high remainder > d1)
	cmp	%r8, %r14
	setae	%al			C al = (low remainder >= d0)
	orb	%dl, %al
	je	L(bck)			C neither holds: nothing to fix
	inc	%rdi
	sub	%r8, %r14
	sbb	%r11, %rbx
	jmp	L(bck)
',`
L(fix):	jb	L(88)			C d1 < high remainder: must fix
	cmp	%r8, %r9
	jb	L(bck)			C high limbs equal and low < d0: no fix
L(88):	inc	%r10
	sub	%r8, %r9
	sbb	%r11, %rax
	jmp	L(bck)
')
EPILOGUE()