-/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+ 2004, 2005, 2006, 2009
Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 2, or (at your option) any
+Free Software Foundation; either version 3, or (at your option) any
later version.
-In addition to the permissions in the GNU General Public License, the
-Free Software Foundation gives you unlimited permission to link the
-compiled version of this file into combinations with other programs,
-and to distribute those combinations without any restriction coming
-from the use of this file. (The General Public License restrictions
-do apply in other respects; for example, they cover modification of
-the file, and distribution when not linked into a combine
-executable.)
-
This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
-You should have received a copy of the GNU General Public License
-along with this program; see the file COPYING. If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA. */
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
+
-!! libgcc routines for the Hitachi / SuperH SH CPUs.
+!! libgcc routines for the Renesas / SuperH SH CPUs.
!! Contributed by Steve Chamberlain.
!! sac@cygnus.com
ELF local label prefixes by Jörn Rennecke
amylaar@cygnus.com */
-#ifdef __ELF__
-#define LOCAL(X) .L_##X
-#else
-#define LOCAL(X) L_##X
-#endif
-
-#ifdef __linux__
-#define GLOBAL(X) __##X
-#endif
-
-#ifndef GLOBAL
-#define GLOBAL(X) ___##X
-#endif
+#include "lib1funcs.h"
-#if defined __SH5__ && ! defined __SH4_NOFPU__
-#define FMOVD_WORKS
+/* t-vxworks needs to build both PIC and non-PIC versions of libgcc,
+ so it is more convenient to define NO_FPSCR_VALUES here than to
+ define it on the command line. */
+#if defined __vxworks && defined __PIC__
+#define NO_FPSCR_VALUES
#endif
-
+
#if ! __SH5__
#ifdef L_ashiftrt
.global GLOBAL(ashiftrt_r4_0)
.global GLOBAL(ashiftrt_r4_31)
.global GLOBAL(ashiftrt_r4_32)
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_0))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_1))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_2))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_3))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_4))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_5))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_6))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_7))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_8))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_9))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_10))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_11))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_12))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_13))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_14))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_15))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_16))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_17))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_18))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_19))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_20))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_21))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_22))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_23))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_24))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_25))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_26))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_27))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_28))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_29))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_30))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_31))
+ HIDDEN_FUNC(GLOBAL(ashiftrt_r4_32))
+
.align 1
GLOBAL(ashiftrt_r4_32):
GLOBAL(ashiftrt_r4_31):
GLOBAL(ashiftrt_r4_0):
rts
nop
+
+ ENDFUNC(GLOBAL(ashiftrt_r4_0))
+ ENDFUNC(GLOBAL(ashiftrt_r4_1))
+ ENDFUNC(GLOBAL(ashiftrt_r4_2))
+ ENDFUNC(GLOBAL(ashiftrt_r4_3))
+ ENDFUNC(GLOBAL(ashiftrt_r4_4))
+ ENDFUNC(GLOBAL(ashiftrt_r4_5))
+ ENDFUNC(GLOBAL(ashiftrt_r4_6))
+ ENDFUNC(GLOBAL(ashiftrt_r4_7))
+ ENDFUNC(GLOBAL(ashiftrt_r4_8))
+ ENDFUNC(GLOBAL(ashiftrt_r4_9))
+ ENDFUNC(GLOBAL(ashiftrt_r4_10))
+ ENDFUNC(GLOBAL(ashiftrt_r4_11))
+ ENDFUNC(GLOBAL(ashiftrt_r4_12))
+ ENDFUNC(GLOBAL(ashiftrt_r4_13))
+ ENDFUNC(GLOBAL(ashiftrt_r4_14))
+ ENDFUNC(GLOBAL(ashiftrt_r4_15))
+ ENDFUNC(GLOBAL(ashiftrt_r4_16))
+ ENDFUNC(GLOBAL(ashiftrt_r4_17))
+ ENDFUNC(GLOBAL(ashiftrt_r4_18))
+ ENDFUNC(GLOBAL(ashiftrt_r4_19))
+ ENDFUNC(GLOBAL(ashiftrt_r4_20))
+ ENDFUNC(GLOBAL(ashiftrt_r4_21))
+ ENDFUNC(GLOBAL(ashiftrt_r4_22))
+ ENDFUNC(GLOBAL(ashiftrt_r4_23))
+ ENDFUNC(GLOBAL(ashiftrt_r4_24))
+ ENDFUNC(GLOBAL(ashiftrt_r4_25))
+ ENDFUNC(GLOBAL(ashiftrt_r4_26))
+ ENDFUNC(GLOBAL(ashiftrt_r4_27))
+ ENDFUNC(GLOBAL(ashiftrt_r4_28))
+ ENDFUNC(GLOBAL(ashiftrt_r4_29))
+ ENDFUNC(GLOBAL(ashiftrt_r4_30))
+ ENDFUNC(GLOBAL(ashiftrt_r4_31))
+ ENDFUNC(GLOBAL(ashiftrt_r4_32))
#endif
#ifdef L_ashiftrt_n
!
.global GLOBAL(ashrsi3)
+ HIDDEN_FUNC(GLOBAL(ashrsi3))
.align 2
GLOBAL(ashrsi3):
mov #31,r0
rts
nop
+ ENDFUNC(GLOBAL(ashrsi3))
#endif
#ifdef L_ashiftlt
! (none)
!
.global GLOBAL(ashlsi3)
+ HIDDEN_FUNC(GLOBAL(ashlsi3))
.align 2
GLOBAL(ashlsi3):
mov #31,r0
rts
nop
+ ENDFUNC(GLOBAL(ashlsi3))
#endif
#ifdef L_lshiftrt
! (none)
!
.global GLOBAL(lshrsi3)
+ HIDDEN_FUNC(GLOBAL(lshrsi3))
.align 2
GLOBAL(lshrsi3):
mov #31,r0
rts
nop
+ ENDFUNC(GLOBAL(lshrsi3))
#endif
-#ifdef L_movstr
+#ifdef L_movmem
.text
-! done all the large groups, do the remainder
-
-! jump to movstr+
-done:
- add #64,r5
- mova GLOBAL(movstrSI0),r0
+ .balign 4
+ .global GLOBAL(movmem)
+ HIDDEN_FUNC(GLOBAL(movmem))
+ HIDDEN_ALIAS(movstr,movmem)
+	/* This would be a lot simpler if r6 contained the byte count
+	   minus 64, and if we were not called here for a byte count of 64.  */
+GLOBAL(movmem):
+ sts.l pr,@-r15
shll2 r6
- add r6,r0
- jmp @r0
- add #64,r4
- .align 4
- .global GLOBAL(movstrSI64)
-GLOBAL(movstrSI64):
+ bsr GLOBAL(movmemSI52+2)
+ mov.l @(48,r5),r0
+ .balign 4
+LOCAL(movmem_loop): /* Reached with rts */
mov.l @(60,r5),r0
+ add #-64,r6
mov.l r0,@(60,r4)
- .global GLOBAL(movstrSI60)
-GLOBAL(movstrSI60):
+ tst r6,r6
mov.l @(56,r5),r0
+ bt LOCAL(movmem_done)
mov.l r0,@(56,r4)
- .global GLOBAL(movstrSI56)
-GLOBAL(movstrSI56):
+ cmp/pl r6
mov.l @(52,r5),r0
+ add #64,r5
mov.l r0,@(52,r4)
- .global GLOBAL(movstrSI52)
-GLOBAL(movstrSI52):
- mov.l @(48,r5),r0
- mov.l r0,@(48,r4)
- .global GLOBAL(movstrSI48)
-GLOBAL(movstrSI48):
- mov.l @(44,r5),r0
- mov.l r0,@(44,r4)
- .global GLOBAL(movstrSI44)
-GLOBAL(movstrSI44):
- mov.l @(40,r5),r0
- mov.l r0,@(40,r4)
- .global GLOBAL(movstrSI40)
-GLOBAL(movstrSI40):
- mov.l @(36,r5),r0
- mov.l r0,@(36,r4)
- .global GLOBAL(movstrSI36)
-GLOBAL(movstrSI36):
- mov.l @(32,r5),r0
- mov.l r0,@(32,r4)
- .global GLOBAL(movstrSI32)
-GLOBAL(movstrSI32):
- mov.l @(28,r5),r0
- mov.l r0,@(28,r4)
- .global GLOBAL(movstrSI28)
-GLOBAL(movstrSI28):
- mov.l @(24,r5),r0
- mov.l r0,@(24,r4)
- .global GLOBAL(movstrSI24)
-GLOBAL(movstrSI24):
- mov.l @(20,r5),r0
- mov.l r0,@(20,r4)
- .global GLOBAL(movstrSI20)
-GLOBAL(movstrSI20):
- mov.l @(16,r5),r0
- mov.l r0,@(16,r4)
- .global GLOBAL(movstrSI16)
-GLOBAL(movstrSI16):
- mov.l @(12,r5),r0
- mov.l r0,@(12,r4)
- .global GLOBAL(movstrSI12)
-GLOBAL(movstrSI12):
- mov.l @(8,r5),r0
- mov.l r0,@(8,r4)
- .global GLOBAL(movstrSI8)
-GLOBAL(movstrSI8):
- mov.l @(4,r5),r0
- mov.l r0,@(4,r4)
- .global GLOBAL(movstrSI4)
-GLOBAL(movstrSI4):
- mov.l @(0,r5),r0
- mov.l r0,@(0,r4)
-GLOBAL(movstrSI0):
+ add #64,r4
+ bt GLOBAL(movmemSI52)
+! done with all the large groups, do the remainder
+! by jumping into the movmem chain at the right offset
+ mova GLOBAL(movmemSI4)+4,r0
+ add r6,r0
+ jmp @r0
+LOCAL(movmem_done): ! the lds.l below doubles as the jmp delay slot; alignment works out.
+ lds.l @r15+,pr
+ mov.l r0,@(56,r4)
+ mov.l @(52,r5),r0
rts
- nop
-
- .align 4
-
- .global GLOBAL(movstr)
-GLOBAL(movstr):
+ mov.l r0,@(52,r4)
+ .balign 4
+! ??? We need the movstr* aliases for movmem* for the older libraries. These
+! aliases will be removed at some point in the future.
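+! Editorial sketch: each movmemSIk entry below copies k bytes (k/4
+! longwords) from r5 to r4 by entering this fall-through chain k/4
+! entries before its end; e.g. in C (illustrative only):
+!
+!	void movmemSI8 (int *dst, const int *src)
+!	{
+!	  dst[1] = src[1];	/* GLOBAL(movmemSI8) */
+!	  dst[0] = src[0];	/* falls through into movmemSI4 */
+!	}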
+ .global GLOBAL(movmemSI64)
+ HIDDEN_FUNC(GLOBAL(movmemSI64))
+ HIDDEN_ALIAS(movstrSI64,movmemSI64)
+GLOBAL(movmemSI64):
mov.l @(60,r5),r0
mov.l r0,@(60,r4)
-
+ .global GLOBAL(movmemSI60)
+ HIDDEN_FUNC(GLOBAL(movmemSI60))
+ HIDDEN_ALIAS(movstrSI60,movmemSI60)
+GLOBAL(movmemSI60):
mov.l @(56,r5),r0
mov.l r0,@(56,r4)
-
+ .global GLOBAL(movmemSI56)
+ HIDDEN_FUNC(GLOBAL(movmemSI56))
+ HIDDEN_ALIAS(movstrSI56,movmemSI56)
+GLOBAL(movmemSI56):
mov.l @(52,r5),r0
mov.l r0,@(52,r4)
-
+ .global GLOBAL(movmemSI52)
+ HIDDEN_FUNC(GLOBAL(movmemSI52))
+ HIDDEN_ALIAS(movstrSI52,movmemSI52)
+GLOBAL(movmemSI52):
mov.l @(48,r5),r0
mov.l r0,@(48,r4)
-
+ .global GLOBAL(movmemSI48)
+ HIDDEN_FUNC(GLOBAL(movmemSI48))
+ HIDDEN_ALIAS(movstrSI48,movmemSI48)
+GLOBAL(movmemSI48):
mov.l @(44,r5),r0
mov.l r0,@(44,r4)
-
+ .global GLOBAL(movmemSI44)
+ HIDDEN_FUNC(GLOBAL(movmemSI44))
+ HIDDEN_ALIAS(movstrSI44,movmemSI44)
+GLOBAL(movmemSI44):
mov.l @(40,r5),r0
mov.l r0,@(40,r4)
-
+ .global GLOBAL(movmemSI40)
+ HIDDEN_FUNC(GLOBAL(movmemSI40))
+ HIDDEN_ALIAS(movstrSI40,movmemSI40)
+GLOBAL(movmemSI40):
mov.l @(36,r5),r0
mov.l r0,@(36,r4)
-
+ .global GLOBAL(movmemSI36)
+ HIDDEN_FUNC(GLOBAL(movmemSI36))
+ HIDDEN_ALIAS(movstrSI36,movmemSI36)
+GLOBAL(movmemSI36):
mov.l @(32,r5),r0
mov.l r0,@(32,r4)
-
+ .global GLOBAL(movmemSI32)
+ HIDDEN_FUNC(GLOBAL(movmemSI32))
+ HIDDEN_ALIAS(movstrSI32,movmemSI32)
+GLOBAL(movmemSI32):
mov.l @(28,r5),r0
mov.l r0,@(28,r4)
-
+ .global GLOBAL(movmemSI28)
+ HIDDEN_FUNC(GLOBAL(movmemSI28))
+ HIDDEN_ALIAS(movstrSI28,movmemSI28)
+GLOBAL(movmemSI28):
mov.l @(24,r5),r0
mov.l r0,@(24,r4)
-
+ .global GLOBAL(movmemSI24)
+ HIDDEN_FUNC(GLOBAL(movmemSI24))
+ HIDDEN_ALIAS(movstrSI24,movmemSI24)
+GLOBAL(movmemSI24):
mov.l @(20,r5),r0
mov.l r0,@(20,r4)
-
+ .global GLOBAL(movmemSI20)
+ HIDDEN_FUNC(GLOBAL(movmemSI20))
+ HIDDEN_ALIAS(movstrSI20,movmemSI20)
+GLOBAL(movmemSI20):
mov.l @(16,r5),r0
mov.l r0,@(16,r4)
-
+ .global GLOBAL(movmemSI16)
+ HIDDEN_FUNC(GLOBAL(movmemSI16))
+ HIDDEN_ALIAS(movstrSI16,movmemSI16)
+GLOBAL(movmemSI16):
mov.l @(12,r5),r0
mov.l r0,@(12,r4)
-
+ .global GLOBAL(movmemSI12)
+ HIDDEN_FUNC(GLOBAL(movmemSI12))
+ HIDDEN_ALIAS(movstrSI12,movmemSI12)
+GLOBAL(movmemSI12):
mov.l @(8,r5),r0
mov.l r0,@(8,r4)
-
+ .global GLOBAL(movmemSI8)
+ HIDDEN_FUNC(GLOBAL(movmemSI8))
+ HIDDEN_ALIAS(movstrSI8,movmemSI8)
+GLOBAL(movmemSI8):
mov.l @(4,r5),r0
mov.l r0,@(4,r4)
-
+ .global GLOBAL(movmemSI4)
+ HIDDEN_FUNC(GLOBAL(movmemSI4))
+ HIDDEN_ALIAS(movstrSI4,movmemSI4)
+GLOBAL(movmemSI4):
mov.l @(0,r5),r0
+ rts
mov.l r0,@(0,r4)
- add #-16,r6
- cmp/pl r6
- bf done
-
- add #64,r5
- bra GLOBAL(movstr)
- add #64,r4
+ ENDFUNC(GLOBAL(movmemSI64))
+ ENDFUNC(GLOBAL(movmemSI60))
+ ENDFUNC(GLOBAL(movmemSI56))
+ ENDFUNC(GLOBAL(movmemSI52))
+ ENDFUNC(GLOBAL(movmemSI48))
+ ENDFUNC(GLOBAL(movmemSI44))
+ ENDFUNC(GLOBAL(movmemSI40))
+ ENDFUNC(GLOBAL(movmemSI36))
+ ENDFUNC(GLOBAL(movmemSI32))
+ ENDFUNC(GLOBAL(movmemSI28))
+ ENDFUNC(GLOBAL(movmemSI24))
+ ENDFUNC(GLOBAL(movmemSI20))
+ ENDFUNC(GLOBAL(movmemSI16))
+ ENDFUNC(GLOBAL(movmemSI12))
+ ENDFUNC(GLOBAL(movmemSI8))
+ ENDFUNC(GLOBAL(movmemSI4))
+ ENDFUNC(GLOBAL(movmem))
#endif
-#ifdef L_movstr_i4
+#ifdef L_movmem_i4
.text
- .global GLOBAL(movstr_i4_even)
- .global GLOBAL(movstr_i4_odd)
- .global GLOBAL(movstrSI12_i4)
+ .global GLOBAL(movmem_i4_even)
+ .global GLOBAL(movmem_i4_odd)
+ .global GLOBAL(movmemSI12_i4)
+
+ HIDDEN_FUNC(GLOBAL(movmem_i4_even))
+ HIDDEN_FUNC(GLOBAL(movmem_i4_odd))
+ HIDDEN_FUNC(GLOBAL(movmemSI12_i4))
+
+ HIDDEN_ALIAS(movstr_i4_even,movmem_i4_even)
+ HIDDEN_ALIAS(movstr_i4_odd,movmem_i4_odd)
+ HIDDEN_ALIAS(movstrSI12_i4,movmemSI12_i4)
.p2align 5
-L_movstr_2mod4_end:
+L_movmem_2mod4_end:
mov.l r0,@(16,r4)
rts
mov.l r1,@(20,r4)
.p2align 2
-GLOBAL(movstr_i4_odd):
+GLOBAL(movmem_i4_even):
+ mov.l @r5+,r0
+ bra L_movmem_start_even
+ mov.l @r5+,r1
+
+GLOBAL(movmem_i4_odd):
mov.l @r5+,r1
add #-4,r4
mov.l @r5+,r2
mov.l r1,@(4,r4)
mov.l r2,@(8,r4)
-L_movstr_loop:
+L_movmem_loop:
mov.l r3,@(12,r4)
dt r6
mov.l @r5+,r0
- bt/s L_movstr_2mod4_end
+ bt/s L_movmem_2mod4_end
mov.l @r5+,r1
add #16,r4
-L_movstr_start_even:
+L_movmem_start_even:
mov.l @r5+,r2
mov.l @r5+,r3
mov.l r0,@r4
dt r6
mov.l r1,@(4,r4)
- bf/s L_movstr_loop
+ bf/s L_movmem_loop
mov.l r2,@(8,r4)
rts
mov.l r3,@(12,r4)
-GLOBAL(movstr_i4_even):
- mov.l @r5+,r0
- bra L_movstr_start_even
- mov.l @r5+,r1
+ ENDFUNC(GLOBAL(movmem_i4_even))
+ ENDFUNC(GLOBAL(movmem_i4_odd))
.p2align 4
-GLOBAL(movstrSI12_i4):
+GLOBAL(movmemSI12_i4):
mov.l @r5,r0
mov.l @(4,r5),r1
mov.l @(8,r5),r2
mov.l r1,@(4,r4)
rts
mov.l r2,@(8,r4)
+
+ ENDFUNC(GLOBAL(movmemSI12_i4))
#endif
#ifdef L_mulsi3
.global GLOBAL(mulsi3)
+ HIDDEN_FUNC(GLOBAL(mulsi3))
! r4 = aabb
! r5 = ccdd
rts
add r2,r0
-
+ ENDFUNC(GLOBAL(mulsi3))
#endif
#endif /* ! __SH5__ */
#ifdef L_sdivsi3_i4
.title "SH DIVIDE"
-!! 4 byte integer Divide code for the Hitachi SH
+!! 4 byte integer Divide code for the Renesas SH
#ifdef __SH4__
!! args in r4 and r5, result in fpul, clobber dr0, dr2
.global GLOBAL(sdivsi3_i4)
+ HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
GLOBAL(sdivsi3_i4):
lds r4,fpul
float fpul,dr0
rts
ftrc dr0,fpul
+ ENDFUNC(GLOBAL(sdivsi3_i4))
#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
!! args in r4 and r5, result in fpul, clobber r2, dr0, dr2
.mode SHcompact
#endif
.global GLOBAL(sdivsi3_i4)
+ HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
GLOBAL(sdivsi3_i4):
sts.l fpscr,@-r15
mov #8,r2
rts
lds.l @r15+,fpscr
+ ENDFUNC(GLOBAL(sdivsi3_i4))
#endif /* ! __SH5__ || __SH5__ == 32 */
#endif /* ! __SH4__ */
#endif
#ifdef L_sdivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
- sh3e code. */
+ sh2e/sh3e code. */
#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__)
!!
!! Steve Chamberlain
!!
!!
-!! args in r4 and r5, result in r0 clobber r1,r2,r3
+!! args in r4 and r5, result in r0 clobber r1, r2, r3, and t bit
.global GLOBAL(sdivsi3)
#if __SHMEDIA__
.text
#endif
.align 2
+#if 0
/* The assembly code that follows is a hand-optimized version of the C
code that follows. Note that the registers that are modified are
exactly those listed as clobbered in the patterns divsi3_i1 and
muls.l r0, r2, r0
add.l r0, r63, r0
blink tr0, r63
-#else
+#elif 0 /* ! 0 */
+ // inputs: r4,r5
+ // clobbered: r1,r2,r3,r18,r19,r20,r21,r25,tr0
+ // result in r0
+GLOBAL(sdivsi3):
+ // can create absolute value without extra latency,
+ // but dependent on proper sign extension of inputs:
+ // shari.l r5,31,r2
+ // xor r5,r2,r20
+ // sub r20,r2,r20 // r20 is now absolute value of r5, zero-extended.
+ shari.l r5,31,r2
+ ori r2,1,r2
+ muls.l r5,r2,r20 // r20 is now absolute value of r5, zero-extended.
+	movi	0xffffffffffffbb0c,r19 // shift count equiv 76
+ shari.l r4,31,r3
+ nsb r20,r0
+ shlld r20,r0,r25
+ shlri r25,48,r25
+ sub r19,r25,r1
+ mmulfx.w r1,r1,r2
+ mshflo.w r1,r63,r1
+ // If r4 was to be used in-place instead of r21, could use this sequence
+ // to compute absolute:
+ // sub r63,r4,r19 // compute absolute value of r4
+ // shlri r4,32,r3 // into lower 32 bit of r4, keeping
+ // mcmv r19,r3,r4 // the sign in the upper 32 bits intact.
+ ori r3,1,r3
+ mmulfx.w r25,r2,r2
+ sub r19,r0,r0
+ muls.l r4,r3,r21
+ msub.w r1,r2,r2
+ addi r2,-2,r1
+ mulu.l r21,r1,r19
+ mmulfx.w r2,r2,r2
+ shlli r1,15,r1
+ shlrd r19,r0,r19
+ mulu.l r19,r20,r3
+ mmacnfx.wl r25,r2,r1
+ ptabs r18,tr0
+ sub r21,r3,r25
+
+ mulu.l r25,r1,r2
+ addi r0,14,r0
+ xor r4,r5,r18
+ shlrd r2,r0,r2
+ mulu.l r2,r20,r3
+ add r19,r2,r19
+ shari.l r18,31,r18
+ sub r25,r3,r25
+
+ mulu.l r25,r1,r2
+ sub r25,r20,r25
+ add r19,r18,r19
+ shlrd r2,r0,r2
+ mulu.l r2,r20,r3
+ addi r25,1,r25
+ add r19,r2,r19
+
+ cmpgt r25,r3,r25
+ add.l r19,r25,r0
+ xor r0,r18,r0
+ blink tr0,r63
+#else /* ! 0 && ! 0 */
+
+ // inputs: r4,r5
+ // clobbered: r1,r18,r19,r20,r21,r25,tr0
+ // result in r0
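+	// Editorial note: the "sN.M" annotations below denote signed
+	// fixed-point values with N integer bits and M fraction bits
+	// (N may be negative); e.g. 1.5 in s2.30 is 1.5 * (1 << 30)
+	// == 0x60000000.  "uN.M" is the unsigned analogue.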
+ HIDDEN_FUNC(GLOBAL(sdivsi3_2))
+#ifndef __pic__
+ FUNC(GLOBAL(sdivsi3))
+GLOBAL(sdivsi3): /* this is the shcompact entry point */
+ // The special SHmedia entry point sdivsi3_1 prevents accidental linking
+ // with the SHcompact implementation, which clobbers tr1 / tr2.
+ .global GLOBAL(sdivsi3_1)
+GLOBAL(sdivsi3_1):
+ .global GLOBAL(div_table_internal)
+ movi (GLOBAL(div_table_internal) >> 16) & 65535, r20
+ shori GLOBAL(div_table_internal) & 65535, r20
+#endif
+ .global GLOBAL(sdivsi3_2)
+ // div_table in r20
+ // clobbered: r1,r18,r19,r21,r25,tr0
+GLOBAL(sdivsi3_2):
+ nsb r5, r1
+ shlld r5, r1, r25 // normalize; [-2 ..1, 1..2) in s2.62
+ shari r25, 58, r21 // extract 5(6) bit index (s2.4 with hole -1..1)
+ ldx.ub r20, r21, r19 // u0.8
+ shari r25, 32, r25 // normalize to s2.30
+ shlli r21, 1, r21
+ muls.l r25, r19, r19 // s2.38
+ ldx.w r20, r21, r21 // s2.14
+ ptabs r18, tr0
+ shari r19, 24, r19 // truncate to s2.14
+ sub r21, r19, r19 // some 11 bit inverse in s1.14
+ muls.l r19, r19, r21 // u0.28
+ sub r63, r1, r1
+ addi r1, 92, r1
+ muls.l r25, r21, r18 // s2.58
+ shlli r19, 45, r19 // multiply by two and convert to s2.58
+ /* bubble */
+ sub r19, r18, r18
+ shari r18, 28, r18 // some 22 bit inverse in s1.30
+ muls.l r18, r25, r0 // s2.60
+ muls.l r18, r4, r25 // s32.30
+ /* bubble */
+ shari r0, 16, r19 // s-16.44
+ muls.l r19, r18, r19 // s-16.74
+ shari r25, 63, r0
+ shari r4, 14, r18 // s19.-14
+ shari r19, 30, r19 // s-16.44
+ muls.l r19, r18, r19 // s15.30
+ xor r21, r0, r21 // You could also use the constant 1 << 27.
+ add r21, r25, r21
+ sub r21, r19, r21
+ shard r21, r1, r21
+ sub r21, r0, r0
+ blink tr0, r63
+#ifndef __pic__
+ ENDFUNC(GLOBAL(sdivsi3))
+#endif
+ ENDFUNC(GLOBAL(sdivsi3_2))
+#endif
+#elif defined __SHMEDIA__
+/* m5compact-nofpu */
+ // clobbered: r18,r19,r20,r21,r25,tr0,tr1,tr2
+ .mode SHmedia
+ .section .text..SHmedia32,"ax"
+ .align 2
+ FUNC(GLOBAL(sdivsi3))
+GLOBAL(sdivsi3):
+ pt/l LOCAL(sdivsi3_dontsub), tr0
+ pt/l LOCAL(sdivsi3_loop), tr1
+ ptabs/l r18,tr2
+ shari.l r4,31,r18
+ shari.l r5,31,r19
+ xor r4,r18,r20
+ xor r5,r19,r21
+ sub.l r20,r18,r20
+ sub.l r21,r19,r21
+ xor r18,r19,r19
+ shlli r21,32,r25
+ addi r25,-1,r21
+ addz.l r20,r63,r20
+LOCAL(sdivsi3_loop):
+ shlli r20,1,r20
+ bgeu/u r21,r20,tr0
+ sub r20,r21,r20
+LOCAL(sdivsi3_dontsub):
+ addi.l r25,-1,r25
+ bnei r25,-32,tr1
+ xor r20,r19,r20
+ sub.l r20,r19,r0
+ blink tr2,r63
+ ENDFUNC(GLOBAL(sdivsi3))
+#else /* ! __SHMEDIA__ */
+ FUNC(GLOBAL(sdivsi3))
GLOBAL(sdivsi3):
mov r4,r1
mov r5,r0
div0: rts
mov #0,r0
+ ENDFUNC(GLOBAL(sdivsi3))
#endif /* ! __SHMEDIA__ */
#endif /* ! __SH4__ */
#endif
#ifdef L_udivsi3_i4
.title "SH DIVIDE"
-!! 4 byte integer Divide code for the Hitachi SH
+!! 4 byte integer Divide code for the Renesas SH
#ifdef __SH4__
-!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
+!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4,
+!! and t bit
.global GLOBAL(udivsi3_i4)
+ HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
mov #1,r1
cmp/hi r1,r5
#ifdef FMOVD_WORKS
fmov.d @r0+,dr4
#else
-#ifdef __LITTLE_ENDIAN__
- fmov.s @r0+,fr5
- fmov.s @r0,fr4
-#else
- fmov.s @r0+,fr4
- fmov.s @r0,fr5
-#endif
+ fmov.s @r0+,DR40
+ fmov.s @r0,DR41
#endif
float fpul,dr0
xor r1,r5
L1:
.double 2147483648
-#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
+ ENDFUNC(GLOBAL(udivsi3_i4))
+#elif defined (__SH5__) && ! defined (__SH4_NOFPU__)
+#if ! __SH5__ || __SH5__ == 32
+!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
+ .mode SHmedia
+ .global GLOBAL(udivsi3_i4)
+ HIDDEN_FUNC(GLOBAL(udivsi3_i4))
+GLOBAL(udivsi3_i4):
+ addz.l r4,r63,r20
+ addz.l r5,r63,r21
+ fmov.qd r20,dr0
+ fmov.qd r21,dr32
+ ptabs r18,tr0
+ float.qd dr0,dr0
+ float.qd dr32,dr32
+ fdiv.d dr0,dr32,dr0
+ ftrc.dq dr0,dr32
+ fmov.s fr33,fr32
+ blink tr0,r63
+
+ ENDFUNC(GLOBAL(udivsi3_i4))
+#endif /* ! __SH5__ || __SH5__ == 32 */
+#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
-#if ! __SH5__ || __SH5__ == 32
-#if __SH5__
- .mode SHcompact
-#endif
.global GLOBAL(udivsi3_i4)
+ HIDDEN_FUNC(GLOBAL(udivsi3_i4))
GLOBAL(udivsi3_i4):
mov #1,r1
cmp/hi r1,r5
#ifdef FMOVD_WORKS
fmov.d @r0+,dr4
#else
-#ifdef __LITTLE_ENDIAN__
- fmov.s @r0+,fr5
- fmov.s @r0,fr4
-#else
- fmov.s @r0+,fr4
- fmov.s @r0,fr5
-#endif
+ fmov.s @r0+,DR40
+ fmov.s @r0,DR41
#endif
float fpul,dr0
xor r1,r5
#endif
.double 2147483648
-#endif /* ! __SH5__ || __SH5__ == 32 */
+ ENDFUNC(GLOBAL(udivsi3_i4))
#endif /* ! __SH4__ */
#endif
#ifdef L_udivsi3
/* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
- sh3e code. */
+ sh2e/sh3e code. */
#if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__)
-!!
-!! Steve Chamberlain
-!! sac@cygnus.com
-!!
-!!
!! args in r4 and r5, result in r0, clobbers r4, pr, and t bit
.global GLOBAL(udivsi3)
+ HIDDEN_FUNC(GLOBAL(udivsi3))
#if __SHMEDIA__
#if __SH5__ == 32
.text
#endif
.align 2
+#if 0
/* The assembly code that follows is a hand-optimized version of the C
code that follows. Note that the registers that are modified are
exactly those listed as clobbered in the patterns udivsi3_i1 and
blink tr0, r63
#else
GLOBAL(udivsi3):
-longway:
- mov #0,r0
- div0u
- ! get one bit from the msb of the numerator into the T
- ! bit and divide it by whats in r5. Put the answer bit
- ! into the T bit so it can come out again at the bottom
-
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
-
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
-shortway:
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
-
-vshortway:
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4 ; div1 r5,r0
- rotcl r4
-ret: rts
- mov r4,r0
+ // inputs: r4,r5
+ // clobbered: r18,r19,r20,r21,r22,r25,tr0
+ // result in r0.
+ addz.l r5,r63,r22
+ nsb r22,r0
+ shlld r22,r0,r25
+ shlri r25,48,r25
+	movi	0xffffffffffffbb0c,r20 // shift count equiv 76
+ sub r20,r25,r21
+ mmulfx.w r21,r21,r19
+ mshflo.w r21,r63,r21
+ ptabs r18,tr0
+ mmulfx.w r25,r19,r19
+ sub r20,r0,r0
+ /* bubble */
+ msub.w r21,r19,r19
+ addi r19,-2,r21 /* It would be nice for scheduling to do this add to r21
+ before the msub.w, but we need a different value for
+ r19 to keep errors under control. */
+ mulu.l r4,r21,r18
+ mmulfx.w r19,r19,r19
+ shlli r21,15,r21
+ shlrd r18,r0,r18
+ mulu.l r18,r22,r20
+ mmacnfx.wl r25,r19,r21
+ /* bubble */
+ sub r4,r20,r25
+
+ mulu.l r25,r21,r19
+ addi r0,14,r0
+ /* bubble */
+ shlrd r19,r0,r19
+ mulu.l r19,r22,r20
+ add r18,r19,r18
+ /* bubble */
+ sub.l r25,r20,r25
+
+ mulu.l r25,r21,r19
+ addz.l r25,r63,r25
+ sub r25,r22,r25
+ shlrd r19,r0,r19
+ mulu.l r19,r22,r20
+ addi r25,1,r25
+ add r18,r19,r18
+
+ cmpgt r25,r20,r25
+ add.l r18,r25,r0
+ blink tr0,r63
+#endif
+#elif defined (__SHMEDIA__)
+/* m5compact-nofpu - more emphasis on code size than on speed, but don't
+ ignore speed altogether - div1 needs 9 cycles, subc 7 and rotcl 4.
+ So use a short shmedia loop. */
+ // clobbered: r20,r21,r25,tr0,tr1,tr2
+ .mode SHmedia
+ .section .text..SHmedia32,"ax"
+ .align 2
+GLOBAL(udivsi3):
+ pt/l LOCAL(udivsi3_dontsub), tr0
+ pt/l LOCAL(udivsi3_loop), tr1
+ ptabs/l r18,tr2
+ shlli r5,32,r25
+ addi r25,-1,r21
+ addz.l r4,r63,r20
+LOCAL(udivsi3_loop):
+ shlli r20,1,r20
+ bgeu/u r21,r20,tr0
+ sub r20,r21,r20
+LOCAL(udivsi3_dontsub):
+ addi.l r25,-1,r25
+ bnei r25,-32,tr1
+ add.l r20,r63,r0
+ blink tr2,r63
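+/* A C model of the loop above (editorial sketch, not part of the build):
+   comparing against (d << 32) - 1 instead of d << 32 makes each
+   subtraction also deposit a quotient bit in the low half, so quotient
+   and remainder accumulate in a single register.
+
+	unsigned long long acc = n;	// zero-extended dividend (r20)
+	unsigned long long dm1 = ((unsigned long long) d << 32) - 1; // r21
+	int i;
+	for (i = 0; i < 32; i++)
+	  {
+	    acc <<= 1;
+	    if (acc > dm1)	// i.e. acc >= d << 32
+	      acc -= dm1;	// subtract d << 32 and set the quotient bit
+	  }
+	// now (unsigned) acc == n / d and acc >> 32 == n % d
+ */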
+#else /* ! defined (__SHMEDIA__) */
+LOCAL(div8):
+ div1 r5,r4
+LOCAL(div7):
+ div1 r5,r4; div1 r5,r4; div1 r5,r4
+ div1 r5,r4; div1 r5,r4; div1 r5,r4; rts; div1 r5,r4
+
+LOCAL(divx4):
+ div1 r5,r4; rotcl r0
+ div1 r5,r4; rotcl r0
+ div1 r5,r4; rotcl r0
+ rts; div1 r5,r4
+GLOBAL(udivsi3):
+ sts.l pr,@-r15
+ extu.w r5,r0
+ cmp/eq r5,r0
+#ifdef __sh1__
+ bf LOCAL(large_divisor)
+#else
+ bf/s LOCAL(large_divisor)
+#endif
+ div0u
+ swap.w r4,r0
+ shlr16 r4
+ bsr LOCAL(div8)
+ shll16 r5
+ bsr LOCAL(div7)
+ div1 r5,r4
+ xtrct r4,r0
+ xtrct r0,r4
+ bsr LOCAL(div8)
+ swap.w r4,r4
+ bsr LOCAL(div7)
+ div1 r5,r4
+ lds.l @r15+,pr
+ xtrct r4,r0
+ swap.w r0,r0
+ rotcl r0
+ rts
+ shlr16 r5
+
+LOCAL(large_divisor):
+#ifdef __sh1__
+ div0u
+#endif
+ mov #0,r0
+ xtrct r4,r0
+ xtrct r0,r4
+ bsr LOCAL(divx4)
+ rotcl r0
+ bsr LOCAL(divx4)
+ rotcl r0
+ bsr LOCAL(divx4)
+ rotcl r0
+ bsr LOCAL(divx4)
+ rotcl r0
+ lds.l @r15+,pr
+ rts
+ rotcl r0
+
+ ENDFUNC(GLOBAL(udivsi3))
#endif /* ! __SHMEDIA__ */
#endif /* __SH4__ */
-#endif
+#endif /* L_udivsi3 */
+
+#ifdef L_udivdi3
+#ifdef __SHMEDIA__
+ .mode SHmedia
+ .section .text..SHmedia32,"ax"
+ .align 2
+ .global GLOBAL(udivdi3)
+ FUNC(GLOBAL(udivdi3))
+GLOBAL(udivdi3):
+ HIDDEN_ALIAS(udivdi3_internal,udivdi3)
+ shlri r3,1,r4
+ nsb r4,r22
+ shlld r3,r22,r6
+ shlri r6,49,r5
+ movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */
+ sub r21,r5,r1
+ mmulfx.w r1,r1,r4
+ mshflo.w r1,r63,r1
+ sub r63,r22,r20 // r63 == 64 % 64
+ mmulfx.w r5,r4,r4
+ pta LOCAL(large_divisor),tr0
+ addi r20,32,r9
+ msub.w r1,r4,r1
+ madd.w r1,r1,r1
+ mmulfx.w r1,r1,r4
+ shlri r6,32,r7
+ bgt/u r9,r63,tr0 // large_divisor
+ mmulfx.w r5,r4,r4
+ shlri r2,32+14,r19
+ addi r22,-31,r0
+ msub.w r1,r4,r1
+
+ mulu.l r1,r7,r4
+ addi r1,-3,r5
+ mulu.l r5,r19,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+ shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+ the case may be, %0000000000000000 000.11111111111, still */
+ muls.l r1,r4,r4 /* leaving at least one sign bit. */
+ mulu.l r5,r3,r8
+ mshalds.l r1,r21,r1
+ shari r4,26,r4
+ shlld r8,r0,r8
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ sub r2,r8,r2
+ /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
+
+ shlri r2,22,r21
+ mulu.l r21,r1,r21
+ shlld r5,r0,r8
+ addi r20,30-22,r0
+ shlrd r21,r0,r21
+ mulu.l r21,r3,r5
+ add r8,r21,r8
+ mcmpgt.l r21,r63,r21 // See Note 1
+ addi r20,30,r0
+ mshfhi.l r63,r21,r21
+ sub r2,r5,r2
+ andc r2,r21,r2
+
+ /* small divisor: need a third divide step */
+ mulu.l r2,r1,r7
+ ptabs r18,tr0
+ addi r2,1,r2
+ shlrd r7,r0,r7
+ mulu.l r7,r3,r5
+ add r8,r7,r8
+ sub r2,r3,r2
+ cmpgt r2,r5,r5
+ add r8,r5,r2
+ /* could test r3 here to check for divide by zero. */
+ blink tr0,r63
+
+LOCAL(large_divisor):
+ mmulfx.w r5,r4,r4
+ shlrd r2,r9,r25
+ shlri r25,32,r8
+ msub.w r1,r4,r1
+
+ mulu.l r1,r7,r4
+ addi r1,-3,r5
+ mulu.l r5,r8,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+ shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+ the case may be, %0000000000000000 000.11111111111, still */
+ muls.l r1,r4,r4 /* leaving at least one sign bit. */
+ shlri r5,14-1,r8
+ mulu.l r8,r7,r5
+ mshalds.l r1,r21,r1
+ shari r4,26,r4
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ sub r25,r5,r25
+ /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
+
+ shlri r25,22,r21
+ mulu.l r21,r1,r21
+ pta LOCAL(no_lo_adj),tr0
+ addi r22,32,r0
+ shlri r21,40,r21
+ mulu.l r21,r7,r5
+ add r8,r21,r8
+ shlld r2,r0,r2
+ sub r25,r5,r25
+ bgtu/u r7,r25,tr0 // no_lo_adj
+ addi r8,1,r8
+ sub r25,r7,r25
+LOCAL(no_lo_adj):
+ mextr4 r2,r25,r2
+
+ /* large_divisor: only needs a few adjustments. */
+ mulu.l r8,r6,r5
+ ptabs r18,tr0
+ /* bubble */
+ cmpgtu r5,r2,r5
+ sub r8,r5,r2
+ blink tr0,r63
+ ENDFUNC(GLOBAL(udivdi3))
+/* Note 1: To shift the result of the second divide stage so that the result
+ always fits into 32 bits, yet we still reduce the rest sufficiently
+ would require a lot of instructions to do the shifts just right. Using
+ the full 64 bit shift result to multiply with the divisor would require
+ four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
+ Fortunately, if the upper 32 bits of the shift result are nonzero, we
+ know that the rest after taking this partial result into account will
+ fit into 32 bits. So we just clear the upper 32 bits of the rest if the
+ upper 32 bits of the partial result are nonzero. */
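+/* The guard of Note 1 restated as C (editorial sketch; assumes the upper
+   half of the partial result, when nonzero, is positive):
+
+	if (partial >> 32)	// mcmpgt.l / mshfhi.l build an all-ones mask
+	  rest &= 0xffffffff;	// andc then clears the upper half of the rest
+ */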
+#endif /* __SHMEDIA__ */
+#endif /* L_udivdi3 */
+
+#ifdef L_divdi3
+#ifdef __SHMEDIA__
+ .mode SHmedia
+ .section .text..SHmedia32,"ax"
+ .align 2
+ .global GLOBAL(divdi3)
+ FUNC(GLOBAL(divdi3))
+GLOBAL(divdi3):
+ pta GLOBAL(udivdi3_internal),tr0
+ shari r2,63,r22
+ shari r3,63,r23
+ xor r2,r22,r2
+ xor r3,r23,r3
+ sub r2,r22,r2
+ sub r3,r23,r3
+ beq/u r22,r23,tr0
+ ptabs r18,tr1
+ blink tr0,r18
+ sub r63,r2,r2
+ blink tr1,r63
+ ENDFUNC(GLOBAL(divdi3))
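+/* C model of the wrapper above (editorial sketch; udivdi3_internal is
+   the hidden alias defined in the L_udivdi3 section):
+
+	long long divdi3 (long long n, long long d)
+	{
+	  long long sn = n >> 63, sd = d >> 63;	// shari r2/r3,63
+	  unsigned long long q = udivdi3_internal ((n ^ sn) - sn,  // |n|
+						   (d ^ sd) - sd); // |d|
+	  return sn == sd ? q : -q;	// beq/u r22,r23 tail-calls when equal
+	}
+ */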
+#endif /* __SHMEDIA__ */
+#endif /* L_divdi3 */
+
+#ifdef L_umoddi3
+#ifdef __SHMEDIA__
+ .mode SHmedia
+ .section .text..SHmedia32,"ax"
+ .align 2
+ .global GLOBAL(umoddi3)
+ FUNC(GLOBAL(umoddi3))
+GLOBAL(umoddi3):
+ HIDDEN_ALIAS(umoddi3_internal,umoddi3)
+ shlri r3,1,r4
+ nsb r4,r22
+ shlld r3,r22,r6
+ shlri r6,49,r5
+ movi 0xffffffffffffbaf1,r21 /* .l shift count 17. */
+ sub r21,r5,r1
+ mmulfx.w r1,r1,r4
+ mshflo.w r1,r63,r1
+ sub r63,r22,r20 // r63 == 64 % 64
+ mmulfx.w r5,r4,r4
+ pta LOCAL(large_divisor),tr0
+ addi r20,32,r9
+ msub.w r1,r4,r1
+ madd.w r1,r1,r1
+ mmulfx.w r1,r1,r4
+ shlri r6,32,r7
+ bgt/u r9,r63,tr0 // large_divisor
+ mmulfx.w r5,r4,r4
+ shlri r2,32+14,r19
+ addi r22,-31,r0
+ msub.w r1,r4,r1
+
+ mulu.l r1,r7,r4
+ addi r1,-3,r5
+ mulu.l r5,r19,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+ shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+ the case may be, %0000000000000000 000.11111111111, still */
+ muls.l r1,r4,r4 /* leaving at least one sign bit. */
+ mulu.l r5,r3,r5
+ mshalds.l r1,r21,r1
+ shari r4,26,r4
+ shlld r5,r0,r5
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ sub r2,r5,r2
+ /* Can do second step of 64 : 32 div now, using r1 and the rest in r2. */
+
+ shlri r2,22,r21
+ mulu.l r21,r1,r21
+ addi r20,30-22,r0
+ /* bubble */ /* could test r3 here to check for divide by zero. */
+ shlrd r21,r0,r21
+ mulu.l r21,r3,r5
+ mcmpgt.l r21,r63,r21 // See Note 1
+ addi r20,30,r0
+ mshfhi.l r63,r21,r21
+ sub r2,r5,r2
+ andc r2,r21,r2
+
+ /* small divisor: need a third divide step */
+ mulu.l r2,r1,r7
+ ptabs r18,tr0
+ sub r2,r3,r8 /* re-use r8 here for rest - r3 */
+ shlrd r7,r0,r7
+ mulu.l r7,r3,r5
+ /* bubble */
+ addi r8,1,r7
+ cmpgt r7,r5,r7
+ cmvne r7,r8,r2
+ sub r2,r5,r2
+ blink tr0,r63
+
+LOCAL(large_divisor):
+ mmulfx.w r5,r4,r4
+ shlrd r2,r9,r25
+ shlri r25,32,r8
+ msub.w r1,r4,r1
+
+ mulu.l r1,r7,r4
+ addi r1,-3,r5
+ mulu.l r5,r8,r5
+ sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+ shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+ the case may be, %0000000000000000 000.11111111111, still */
+ muls.l r1,r4,r4 /* leaving at least one sign bit. */
+ shlri r5,14-1,r8
+ mulu.l r8,r7,r5
+ mshalds.l r1,r21,r1
+ shari r4,26,r4
+ add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+ sub r25,r5,r25
+ /* Can do second step of 64 : 32 div now, using r1 and the rest in r25. */
+
+ shlri r25,22,r21
+ mulu.l r21,r1,r21
+ pta LOCAL(no_lo_adj),tr0
+ addi r22,32,r0
+ shlri r21,40,r21
+ mulu.l r21,r7,r5
+ add r8,r21,r8
+ shlld r2,r0,r2
+ sub r25,r5,r25
+ bgtu/u r7,r25,tr0 // no_lo_adj
+ addi r8,1,r8
+ sub r25,r7,r25
+LOCAL(no_lo_adj):
+ mextr4 r2,r25,r2
+
+ /* large_divisor: only needs a few adjustments. */
+ mulu.l r8,r6,r5
+ ptabs r18,tr0
+ add r2,r6,r7
+ cmpgtu r5,r2,r8
+ cmvne r8,r7,r2
+ sub r2,r5,r2
+ shlrd r2,r22,r2
+ blink tr0,r63
+ ENDFUNC(GLOBAL(umoddi3))
+/* Note 1: To shift the result of the second divide stage so that the result
+ always fits into 32 bits, yet we still reduce the rest sufficiently
+ would require a lot of instructions to do the shifts just right. Using
+ the full 64 bit shift result to multiply with the divisor would require
+ four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
+ Fortunately, if the upper 32 bits of the shift result are nonzero, we
+ know that the rest after taking this partial result into account will
+ fit into 32 bits. So we just clear the upper 32 bits of the rest if the
+ upper 32 bits of the partial result are nonzero. */
+#endif /* __SHMEDIA__ */
+#endif /* L_umoddi3 */
+
+#ifdef L_moddi3
+#ifdef __SHMEDIA__
+ .mode SHmedia
+ .section .text..SHmedia32,"ax"
+ .align 2
+ .global GLOBAL(moddi3)
+ FUNC(GLOBAL(moddi3))
+GLOBAL(moddi3):
+ pta GLOBAL(umoddi3_internal),tr0
+ shari r2,63,r22
+ shari r3,63,r23
+ xor r2,r22,r2
+ xor r3,r23,r3
+ sub r2,r22,r2
+ sub r3,r23,r3
+ beq/u r22,r63,tr0
+ ptabs r18,tr1
+ blink tr0,r18
+ sub r63,r2,r2
+ blink tr1,r63
+ ENDFUNC(GLOBAL(moddi3))
+#endif /* __SHMEDIA__ */
+#endif /* L_moddi3 */
+
#ifdef L_set_fpscr
-#if defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32
+#if !defined (__SH2A_NOFPU__)
+#if defined (__SH2E__) || defined (__SH2A__) || defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32
#ifdef __SH5__
.mode SHcompact
#endif
.global GLOBAL(set_fpscr)
+ HIDDEN_FUNC(GLOBAL(set_fpscr))
GLOBAL(set_fpscr):
lds r4,fpscr
+#ifdef __PIC__
+ mov.l r12,@-r15
+#ifdef __vxworks
+ mov.l LOCAL(set_fpscr_L0_base),r12
+ mov.l LOCAL(set_fpscr_L0_index),r0
+ mov.l @r12,r12
+ mov.l @(r0,r12),r12
+#else
+ mova LOCAL(set_fpscr_L0),r0
+ mov.l LOCAL(set_fpscr_L0),r12
+ add r0,r12
+#endif
+ mov.l LOCAL(set_fpscr_L1),r0
+ mov.l @(r0,r12),r1
+ mov.l @r15+,r12
+#else
mov.l LOCAL(set_fpscr_L1),r1
+#endif
swap.w r4,r0
or #24,r0
#ifndef FMOVD_WORKS
xor #16,r0
#endif
-#if defined(__SH4__)
+#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
swap.w r0,r3
mov.l r3,@(4,r1)
-#else /* defined(__SH3E__) || defined(__SH4_SINGLE*__) */
+#else /* defined (__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
swap.w r0,r2
mov.l r2,@r1
#endif
#else
xor #24,r0
#endif
-#if defined(__SH4__)
+#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
swap.w r0,r2
rts
mov.l r2,@r1
-#else /* defined(__SH3E__) || defined(__SH4_SINGLE*__) */
+#else /* defined(__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
swap.w r0,r3
rts
mov.l r3,@(4,r1)
#endif
.align 2
+#ifdef __PIC__
+#ifdef __vxworks
+LOCAL(set_fpscr_L0_base):
+ .long ___GOTT_BASE__
+LOCAL(set_fpscr_L0_index):
+ .long ___GOTT_INDEX__
+#else
+LOCAL(set_fpscr_L0):
+ .long _GLOBAL_OFFSET_TABLE_
+#endif
+LOCAL(set_fpscr_L1):
+ .long GLOBAL(fpscr_values@GOT)
+#else
LOCAL(set_fpscr_L1):
.long GLOBAL(fpscr_values)
+#endif
+
+ ENDFUNC(GLOBAL(set_fpscr))
+#ifndef NO_FPSCR_VALUES
#ifdef __ELF__
.comm GLOBAL(fpscr_values),8,4
#else
.comm GLOBAL(fpscr_values),8
#endif /* ELF */
-#endif /* SH3E / SH4 */
+#endif /* NO_FPSCR_VALUES */
+#endif /* SH2E / SH3E / SH4 */
+#endif /* __SH2A_NOFPU__ */
#endif /* L_set_fpscr */
#ifdef L_ic_invalidate
#if __SH5__ == 32
.mode SHmedia
.section .text..SHmedia32,"ax"
.align 2
+ .global GLOBAL(init_trampoline)
+ HIDDEN_FUNC(GLOBAL(init_trampoline))
+GLOBAL(init_trampoline):
+ st.l r0,8,r2
+#ifdef __LITTLE_ENDIAN__
+ movi 9,r20
+ shori 0x402b,r20
+ shori 0xd101,r20
+ shori 0xd002,r20
+#else
+ movi 0xffffffffffffd002,r20
+ shori 0xd101,r20
+ shori 0x402b,r20
+ shori 9,r20
+#endif
+ st.q r0,0,r20
+ st.l r0,12,r3
+ ENDFUNC(GLOBAL(init_trampoline))
.global GLOBAL(ic_invalidate)
+ HIDDEN_FUNC(GLOBAL(ic_invalidate))
GLOBAL(ic_invalidate):
+ ocbwb r0,0
+ synco
icbi r0, 0
ptabs r18, tr0
synci
blink tr0, r63
-#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__)
+ ENDFUNC(GLOBAL(ic_invalidate))
+#elif defined(__SH4A__)
.global GLOBAL(ic_invalidate)
+ HIDDEN_FUNC(GLOBAL(ic_invalidate))
GLOBAL(ic_invalidate):
ocbwb @r4
- mova 0f,r0
- mov.w 1f,r1
-/* Compute how many cache lines 0f is away from r4. */
- sub r0,r4
- and r1,r4
-/* Prepare to branch to 0f plus the cache-line offset. */
- add # 0f - 1f,r4
- braf r4
- nop
-1:
- .short 0x1fe0
+ synco
+ rts
+ icbi @r4
+ ENDFUNC(GLOBAL(ic_invalidate))
+#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))
+ /* For system code, we use ic_invalidate_line_i, but user code
+ needs a different mechanism. A kernel call is generally not
+ available, and it would also be slow. Different SH4 variants use
+ different sizes and associativities of the Icache. We use a small
+	   bit of dispatch code that can be kept hidden in every shared object,
+ which calls the actual processor-specific invalidation code in a
+ separate module.
+ Or if you have operating system support, the OS could mmap the
+	   processor-specific code from a single page, since it is highly
+ repetitive. */
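+	/* Rough C model of the dispatch below (editorial sketch; the mask
+	   word is data embedded in ic_invalidate_array itself):
+
+		char *base = (char *) ic_invalidate_array;
+		unsigned mask = *(unsigned *) (base + 8);    // WAY_SIZE - 32
+		jump_to (base + (((char *) addr - base) & mask));
+
+	   Executing the array code at the same cache index as ADDR touches
+	   that index in every way, displacing the stale line; jump_to is
+	   notation, not a real function.  */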
+ .global GLOBAL(ic_invalidate)
+ HIDDEN_FUNC(GLOBAL(ic_invalidate))
+GLOBAL(ic_invalidate):
+#ifdef __pic__
+#ifdef __vxworks
+ mov.l 1f,r1
+ mov.l 2f,r0
+ mov.l @r1,r1
+ mov.l 0f,r2
+ mov.l @(r0,r1),r0
+#else
+ mov.l 1f,r1
+ mova 1f,r0
+ mov.l 0f,r2
+ add r1,r0
+#endif
+ mov.l @(r0,r2),r1
+#else
+ mov.l 0f,r1
+#endif
+ ocbwb @r4
+ mov.l @(8,r1),r0
+ sub r1,r4
+ and r4,r0
+ add r1,r0
+ jmp @r0
+ mov.l @(4,r1),r0
+ .align 2
+#ifndef __pic__
+0: .long GLOBAL(ic_invalidate_array)
+#else /* __pic__ */
+ .global GLOBAL(ic_invalidate_array)
+0: .long GLOBAL(ic_invalidate_array)@GOT
+#ifdef __vxworks
+1: .long ___GOTT_BASE__
+2: .long ___GOTT_INDEX__
+#else
+1: .long _GLOBAL_OFFSET_TABLE_
+#endif
+ ENDFUNC(GLOBAL(ic_invalidate))
+#endif /* __pic__ */
+#endif /* SH4 */
+#endif /* L_ic_invalidate */
+
+#ifdef L_ic_invalidate_array
+#if defined(__SH4A__) || (defined (__FORCE_SH4A__) && (defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))))
+ .global GLOBAL(ic_invalidate_array)
+ /* This is needed when an SH4 dso with trampolines is used on SH4A. */
+ FUNC(GLOBAL(ic_invalidate_array))
+GLOBAL(ic_invalidate_array):
+ add r1,r4
+ synco
+ rts
+ icbi @r4
+ .long 0
+ ENDFUNC(GLOBAL(ic_invalidate_array))
+#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))
+ .global GLOBAL(ic_invalidate_array)
.p2align 5
+ FUNC(GLOBAL(ic_invalidate_array))
/* This must be aligned to the beginning of a cache line. */
-0:
- .rept 256 /* There are 256 cache lines of 32 bytes. */
+GLOBAL(ic_invalidate_array):
+#ifndef WAYS
+#define WAYS 4
+#define WAY_SIZE 0x4000
+#endif
+#if WAYS == 1
+ .rept WAY_SIZE * WAYS / 32
+ rts
+ nop
+ .rept 7
+ .long WAY_SIZE - 32
+ .endr
+ .endr
+#elif WAYS <= 6
+ .rept WAY_SIZE * WAYS / 32
+ braf r0
+ add #-8,r0
+ .long WAY_SIZE + 8
+ .long WAY_SIZE - 32
+ .rept WAYS-2
+ braf r0
+ nop
+ .endr
+ .rept 7 - WAYS
+ rts
+ nop
+ .endr
+ .endr
+#else /* WAYS > 6 */
+ /* This variant needs two different pages for mmap-ing. */
+ .rept WAYS-1
+ .rept WAY_SIZE / 32
+ braf r0
+ nop
+ .long WAY_SIZE
+ .rept 6
+ .long WAY_SIZE - 32
+ .endr
+ .endr
+ .endr
+ .rept WAY_SIZE / 32
rts
.rept 15
nop
.endr
.endr
+#endif /* WAYS */
+ ENDFUNC(GLOBAL(ic_invalidate_array))
#endif /* SH4 */
-#endif /* L_ic_invalidate */
+#endif /* L_ic_invalidate_array */
#if defined (__SH5__) && __SH5__ == 32
#ifdef L_shcompact_call_trampoline
will be expanded into r2/r3 upon return. */
.global GLOBAL(GCC_shcompact_call_trampoline)
+ FUNC(GLOBAL(GCC_shcompact_call_trampoline))
GLOBAL(GCC_shcompact_call_trampoline):
ptabs/l r0, tr0 /* Prepare to call the actual function. */
movi ((datalabel LOCAL(ct_main_table) - 31 * 2) >> 16) & 65535, r0
shari r2, 32, r2
#endif
blink tr0, r63
+
+ ENDFUNC(GLOBAL(GCC_shcompact_call_trampoline))
#endif /* L_shcompact_call_trampoline */
#ifdef L_shcompact_return_trampoline
.section .text..SHmedia32, "ax"
.align 2
.global GLOBAL(GCC_shcompact_return_trampoline)
+ HIDDEN_FUNC(GLOBAL(GCC_shcompact_return_trampoline))
GLOBAL(GCC_shcompact_return_trampoline):
ptabs/l r18, tr0
#if __LITTLE_ENDIAN__
#endif
or r3, r2, r2
blink tr0, r63
+
+ ENDFUNC(GLOBAL(GCC_shcompact_return_trampoline))
#endif /* L_shcompact_return_trampoline */
#ifdef L_shcompact_incoming_args
.align 2
/* This function stores 64-bit general-purpose registers back in
- the stack, starting at @(r1), where the cookie is supposed to
- have been stored, and loads the address in which each register
- was stored into itself. Its execution time is linear on the
+ the stack, and loads the address in which each register
+ was stored into itself. The lower 32 bits of r17 hold the address
+ to begin storing, and the upper 32 bits of r17 hold the cookie.
+ Its execution time is linear on the
number of registers that actually have to be copied, and it is
optimized for structures larger than 64 bits, as opposed to
- invidivual `long long' arguments. See sh.h for details on the
+ individual `long long' arguments. See sh.h for details on the
actual bit pattern. */
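+/* The r17 convention restated as C (editorial sketch):
+
+	unsigned cookie = (unsigned) (r17 >> 32);	// shlri r17, 32, r0
+	char *store = (char *) (int) r17;		// add.l r17, r63, r17
+
+   Bit fields in the cookie select the ia_*_ld / ia_*_push handlers
+   below; see sh.h for the encoding.  */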
.global GLOBAL(GCC_shcompact_incoming_args)
+ FUNC(GLOBAL(GCC_shcompact_incoming_args))
GLOBAL(GCC_shcompact_incoming_args):
ptabs/l r18, tr0 /* Prepare to return. */
shlri r17, 32, r0 /* Load the cookie. */
- movi ((datalabel LOCAL(ia_main_table) - 31 * 2) >> 16) & 65535, r35
+ movi ((datalabel LOCAL(ia_main_table) - 31 * 2) >> 16) & 65535, r43
pt/l LOCAL(ia_loop), tr1
add.l r17, r63, r17
- shori ((datalabel LOCAL(ia_main_table) - 31 * 2)) & 65535, r35
+ shori ((datalabel LOCAL(ia_main_table) - 31 * 2)) & 65535, r43
LOCAL(ia_loop):
- nsb r0, r28
- shlli r28, 1, r29
- ldx.w r35, r29, r30
+ nsb r0, r36
+ shlli r36, 1, r37
+ ldx.w r43, r37, r38
LOCAL(ia_main_label):
- ptrel/l r30, tr2
+ ptrel/l r38, tr2
blink tr2, r63
LOCAL(ia_r2_ld): /* Store r2 and load its address. */
- movi 3, r30
- shlli r30, 29, r31
- and r0, r31, r32
- andc r0, r31, r0
+ movi 3, r38
+ shlli r38, 29, r39
+ and r0, r39, r40
+ andc r0, r39, r0
stx.q r17, r63, r2
add.l r17, r63, r2
addi.l r17, 8, r17
- beq/u r31, r32, tr1
+ beq/u r39, r40, tr1
LOCAL(ia_r3_ld): /* Store r3 and load its address. */
- movi 3, r30
- shlli r30, 26, r31
- and r0, r31, r32
- andc r0, r31, r0
+ movi 3, r38
+ shlli r38, 26, r39
+ and r0, r39, r40
+ andc r0, r39, r0
stx.q r17, r63, r3
add.l r17, r63, r3
addi.l r17, 8, r17
- beq/u r31, r32, tr1
+ beq/u r39, r40, tr1
LOCAL(ia_r4_ld): /* Store r4 and load its address. */
- movi 3, r30
- shlli r30, 23, r31
- and r0, r31, r32
- andc r0, r31, r0
+ movi 3, r38
+ shlli r38, 23, r39
+ and r0, r39, r40
+ andc r0, r39, r0
stx.q r17, r63, r4
add.l r17, r63, r4
addi.l r17, 8, r17
- beq/u r31, r32, tr1
+ beq/u r39, r40, tr1
LOCAL(ia_r5_ld): /* Store r5 and load its address. */
- movi 3, r30
- shlli r30, 20, r31
- and r0, r31, r32
- andc r0, r31, r0
+ movi 3, r38
+ shlli r38, 20, r39
+ and r0, r39, r40
+ andc r0, r39, r0
stx.q r17, r63, r5
add.l r17, r63, r5
addi.l r17, 8, r17
- beq/u r31, r32, tr1
+ beq/u r39, r40, tr1
LOCAL(ia_r6_ld): /* Store r6 and load its address. */
- movi 3, r30
- shlli r30, 16, r31
- and r0, r31, r32
- andc r0, r31, r0
+ movi 3, r38
+ shlli r38, 16, r39
+ and r0, r39, r40
+ andc r0, r39, r0
stx.q r17, r63, r6
add.l r17, r63, r6
addi.l r17, 8, r17
- beq/u r31, r32, tr1
+ beq/u r39, r40, tr1
LOCAL(ia_r7_ld): /* Store r7 and load its address. */
- movi 3 << 12, r31
- and r0, r31, r32
- andc r0, r31, r0
+ movi 3 << 12, r39
+ and r0, r39, r40
+ andc r0, r39, r0
stx.q r17, r63, r7
add.l r17, r63, r7
addi.l r17, 8, r17
- beq/u r31, r32, tr1
+ beq/u r39, r40, tr1
LOCAL(ia_r8_ld): /* Store r8 and load its address. */
- movi 3 << 8, r31
- and r0, r31, r32
- andc r0, r31, r0
+ movi 3 << 8, r39
+ and r0, r39, r40
+ andc r0, r39, r0
stx.q r17, r63, r8
add.l r17, r63, r8
addi.l r17, 8, r17
- beq/u r31, r32, tr1
+ beq/u r39, r40, tr1
LOCAL(ia_r9_ld): /* Store r9 and load its address. */
stx.q r17, r63, r9
add.l r17, r63, r9
blink tr0, r63
LOCAL(ia_r2_push): /* Push r2 onto the stack. */
- movi 1, r30
- shlli r30, 29, r31
- andc r0, r31, r0
+ movi 1, r38
+ shlli r38, 29, r39
+ andc r0, r39, r0
stx.q r17, r63, r2
addi.l r17, 8, r17
blink tr1, r63
LOCAL(ia_r3_push): /* Push r3 onto the stack. */
- movi 1, r30
- shlli r30, 26, r31
- andc r0, r31, r0
+ movi 1, r38
+ shlli r38, 26, r39
+ andc r0, r39, r0
stx.q r17, r63, r3
addi.l r17, 8, r17
blink tr1, r63
LOCAL(ia_r4_push): /* Push r4 onto the stack. */
- movi 1, r30
- shlli r30, 23, r31
- andc r0, r31, r0
+ movi 1, r38
+ shlli r38, 23, r39
+ andc r0, r39, r0
stx.q r17, r63, r4
addi.l r17, 8, r17
blink tr1, r63
LOCAL(ia_r5_push): /* Push r5 onto the stack. */
- movi 1, r30
- shlli r30, 20, r31
- andc r0, r31, r0
+ movi 1, r38
+ shlli r38, 20, r39
+ andc r0, r39, r0
stx.q r17, r63, r5
addi.l r17, 8, r17
blink tr1, r63
LOCAL(ia_r6_push): /* Push r6 onto the stack. */
- movi 1, r30
- shlli r30, 16, r31
- andc r0, r31, r0
+ movi 1, r38
+ shlli r38, 16, r39
+ andc r0, r39, r0
stx.q r17, r63, r6
addi.l r17, 8, r17
blink tr1, r63
LOCAL(ia_r7_push): /* Push r7 onto the stack. */
- movi 1 << 12, r31
- andc r0, r31, r0
+ movi 1 << 12, r39
+ andc r0, r39, r0
stx.q r17, r63, r7
addi.l r17, 8, r17
blink tr1, r63
LOCAL(ia_r8_push): /* Push r8 onto the stack. */
- movi 1 << 8, r31
- andc r0, r31, r0
+ movi 1 << 8, r39
+ andc r0, r39, r0
stx.q r17, r63, r8
addi.l r17, 8, r17
blink tr1, r63
LOCAL(ia_push_seq): /* Push a sequence of registers onto the stack. */
- andi r0, 7 << 1, r30
- movi (LOCAL(ia_end_of_push_seq) >> 16) & 65535, r32
- shlli r30, 2, r31
- shori LOCAL(ia_end_of_push_seq) & 65535, r32
- sub.l r32, r31, r33
- ptabs/l r33, tr2
+ andi r0, 7 << 1, r38
+ movi (LOCAL(ia_end_of_push_seq) >> 16) & 65535, r40
+ shlli r38, 2, r39
+ shori LOCAL(ia_end_of_push_seq) & 65535, r40
+ sub.l r40, r39, r41
+ ptabs/l r41, tr2
blink tr2, r63
LOCAL(ia_stack_of_push_seq): /* Beginning of push sequence. */
stx.q r17, r63, r3
LOCAL(ia_return): /* Return. */
blink tr0, r63
LOCAL(ia_end_of_push_seq): /* Label used to compute the first push instruction. */
+ ENDFUNC(GLOBAL(GCC_shcompact_incoming_args))
#endif /* L_shcompact_incoming_args */
#endif
#if __SH5__
#endif
.align 3 /* It is copied in units of 8 bytes in SHmedia mode. */
.global GLOBAL(GCC_nested_trampoline)
+ HIDDEN_FUNC(GLOBAL(GCC_nested_trampoline))
GLOBAL(GCC_nested_trampoline):
.mode SHmedia
ptrel/u r63, tr0
ld.l r0, 28, r1
#endif
blink tr1, r63
+
+ ENDFUNC(GLOBAL(GCC_nested_trampoline))
#endif /* L_nested_trampoline */
#endif /* __SH5__ */
#if __SH5__ == 32
.align 2
#ifndef __SH4_NOFPU__
.global GLOBAL(GCC_push_shmedia_regs)
+ FUNC(GLOBAL(GCC_push_shmedia_regs))
GLOBAL(GCC_push_shmedia_regs):
addi.l r15, -14*8, r15
fst.d r15, 13*8, dr62
fst.d r15, 2*8, dr40
fst.d r15, 1*8, dr38
fst.d r15, 0*8, dr36
-#endif
+#else /* ! __SH4_NOFPU__ */
.global GLOBAL(GCC_push_shmedia_regs_nofpu)
+ FUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
GLOBAL(GCC_push_shmedia_regs_nofpu):
+#endif /* ! __SH4_NOFPU__ */
ptabs/l r18, tr0
addi.l r15, -27*8, r15
gettr tr7, r62
st.q r15, 1*8, r29
st.q r15, 0*8, r28
blink tr0, r63
-
-#ifndef __SH4_NOFPU__
+#ifndef __SH4_NOFPU__
+ ENDFUNC(GLOBAL(GCC_push_shmedia_regs))
+#else
+ ENDFUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
+#endif
+#ifndef __SH4_NOFPU__
.global GLOBAL(GCC_pop_shmedia_regs)
+ FUNC(GLOBAL(GCC_pop_shmedia_regs))
GLOBAL(GCC_pop_shmedia_regs):
pt .L0, tr1
movi 41*8, r0
fld.d r15, 28*8, dr38
fld.d r15, 27*8, dr36
blink tr1, r63
-#endif
+#else /* ! __SH4_NOFPU__ */
.global GLOBAL(GCC_pop_shmedia_regs_nofpu)
+ FUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
GLOBAL(GCC_pop_shmedia_regs_nofpu):
+#endif /* ! __SH4_NOFPU__ */
movi 27*8, r0
.L0:
ptabs r18, tr0
ld.q r15, 0*8, r28
add.l r15, r0, r15
blink tr0, r63
+
+#ifndef __SH4_NOFPU__
+ ENDFUNC(GLOBAL(GCC_pop_shmedia_regs))
+#else
+ ENDFUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
+#endif
#endif /* __SH5__ == 32 */
#endif /* L_push_pop_shmedia_regs */
+
+#ifdef L_div_table
+#if __SH5__
+#if defined(__pic__) && defined(__SHMEDIA__)
+ .global GLOBAL(sdivsi3)
+ FUNC(GLOBAL(sdivsi3))
+#if __SH5__ == 32
+ .section .text..SHmedia32,"ax"
+#else
+ .text
+#endif
+#if 0
+/* ??? FIXME: Presumably due to a linker bug, exporting data symbols
+ in a text section does not work (at least for shared libraries):
+ the linker sets the LSB of the address as if this was SHmedia code. */
+#define TEXT_DATA_BUG
+#endif
+ .align 2
+ // inputs: r4,r5
+ // clobbered: r1,r18,r19,r20,r21,r25,tr0
+ // result in r0
+ .global GLOBAL(sdivsi3)
+GLOBAL(sdivsi3):
+#ifdef TEXT_DATA_BUG
+ ptb datalabel Local_div_table,tr0
+#else
+ ptb GLOBAL(div_table_internal),tr0
+#endif
+ nsb r5, r1
+ shlld r5, r1, r25 // normalize; [-2 ..1, 1..2) in s2.62
+ shari r25, 58, r21 // extract 5(6) bit index (s2.4 with hole -1..1)
+ /* bubble */
+ gettr tr0,r20
+ ldx.ub r20, r21, r19 // u0.8
+ shari r25, 32, r25 // normalize to s2.30
+ shlli r21, 1, r21
+ muls.l r25, r19, r19 // s2.38
+ ldx.w r20, r21, r21 // s2.14
+ ptabs r18, tr0
+ shari r19, 24, r19 // truncate to s2.14
+ sub r21, r19, r19 // some 11 bit inverse in s1.14
+ muls.l r19, r19, r21 // u0.28
+ sub r63, r1, r1
+ addi r1, 92, r1
+ muls.l r25, r21, r18 // s2.58
+ shlli r19, 45, r19 // multiply by two and convert to s2.58
+ /* bubble */
+ sub r19, r18, r18
+ shari r18, 28, r18 // some 22 bit inverse in s1.30
+ muls.l r18, r25, r0 // s2.60
+ muls.l r18, r4, r25 // s32.30
+ /* bubble */
+ shari r0, 16, r19 // s-16.44
+ muls.l r19, r18, r19 // s-16.74
+ shari r25, 63, r0
+ shari r4, 14, r18 // s19.-14
+ shari r19, 30, r19 // s-16.44
+ muls.l r19, r18, r19 // s15.30
+ xor r21, r0, r21 // You could also use the constant 1 << 27.
+ add r21, r25, r21
+ sub r21, r19, r21
+ shard r21, r1, r21
+ sub r21, r0, r0
+ blink tr0, r63
+ ENDFUNC(GLOBAL(sdivsi3))
+/* This table has been generated by divtab.c.
+Defects for bias -330:
+ Max defect: 6.081536e-07 at -1.000000e+00
+ Min defect: 2.849516e-08 at 1.030651e+00
+ Max 2nd step defect: 9.606539e-12 at -1.000000e+00
+ Min 2nd step defect: 0.000000e+00 at 0.000000e+00
+ Defect at 1: 1.238659e-07
+ Defect at -2: 1.061708e-07 */
+#else /* ! __pic__ || ! __SHMEDIA__ */
+ .section .rodata
+#endif /* __pic__ */
+#if defined(TEXT_DATA_BUG) && defined(__pic__) && defined(__SHMEDIA__)
+ .balign 2
+ .type Local_div_table,@object
+ .size Local_div_table,128
+/* negative division constants */
+ .word -16638
+ .word -17135
+ .word -17737
+ .word -18433
+ .word -19103
+ .word -19751
+ .word -20583
+ .word -21383
+ .word -22343
+ .word -23353
+ .word -24407
+ .word -25582
+ .word -26863
+ .word -28382
+ .word -29965
+ .word -31800
+/* negative division factors */
+ .byte 66
+ .byte 70
+ .byte 75
+ .byte 81
+ .byte 87
+ .byte 93
+ .byte 101
+ .byte 109
+ .byte 119
+ .byte 130
+ .byte 142
+ .byte 156
+ .byte 172
+ .byte 192
+ .byte 214
+ .byte 241
+ .skip 16
+Local_div_table:
+ .skip 16
+/* positive division factors */
+ .byte 241
+ .byte 214
+ .byte 192
+ .byte 172
+ .byte 156
+ .byte 142
+ .byte 130
+ .byte 119
+ .byte 109
+ .byte 101
+ .byte 93
+ .byte 87
+ .byte 81
+ .byte 75
+ .byte 70
+ .byte 66
+/* positive division constants */
+ .word 31801
+ .word 29966
+ .word 28383
+ .word 26864
+ .word 25583
+ .word 24408
+ .word 23354
+ .word 22344
+ .word 21384
+ .word 20584
+ .word 19752
+ .word 19104
+ .word 18434
+ .word 17738
+ .word 17136
+ .word 16639
+ .section .rodata
+#endif /* TEXT_DATA_BUG */
+ .balign 2
+ .type GLOBAL(div_table),@object
+ .size GLOBAL(div_table),128
+/* negative division constants */
+ .word -16638
+ .word -17135
+ .word -17737
+ .word -18433
+ .word -19103
+ .word -19751
+ .word -20583
+ .word -21383
+ .word -22343
+ .word -23353
+ .word -24407
+ .word -25582
+ .word -26863
+ .word -28382
+ .word -29965
+ .word -31800
+/* negative division factors */
+ .byte 66
+ .byte 70
+ .byte 75
+ .byte 81
+ .byte 87
+ .byte 93
+ .byte 101
+ .byte 109
+ .byte 119
+ .byte 130
+ .byte 142
+ .byte 156
+ .byte 172
+ .byte 192
+ .byte 214
+ .byte 241
+ .skip 16
+ .global GLOBAL(div_table)
+GLOBAL(div_table):
+ HIDDEN_ALIAS(div_table_internal,div_table)
+ .skip 16
+/* positive division factors */
+ .byte 241
+ .byte 214
+ .byte 192
+ .byte 172
+ .byte 156
+ .byte 142
+ .byte 130
+ .byte 119
+ .byte 109
+ .byte 101
+ .byte 93
+ .byte 87
+ .byte 81
+ .byte 75
+ .byte 70
+ .byte 66
+/* positive division constants */
+ .word 31801
+ .word 29966
+ .word 28383
+ .word 26864
+ .word 25583
+ .word 24408
+ .word 23354
+ .word 22344
+ .word 21384
+ .word 20584
+ .word 19752
+ .word 19104
+ .word 18434
+ .word 17738
+ .word 17136
+ .word 16639
+
+#elif defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__)
+/* This code uses shld, and thus is not suitable for SH1 / SH2.  */
+
+/* Signed / unsigned division without use of FPU, optimized for SH4.
+ Uses a lookup table for divisors in the range -128 .. +128, and
+ div1 with case distinction for larger divisors in three more ranges.
+ The code is lumped together with the table to allow the use of mova. */
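+/* Illustrative C model of the divisor <= 128 fast path (editorial
+   sketch; assumes the three tables are laid out contiguously as below,
+   and d >= 2 -- d < 2 takes the div_by_1 path):
+
+	unsigned udiv_le128 (unsigned n, unsigned d)
+	{
+	  int ix = div_table_ix[d];	// signed byte offset into the inverses
+	  unsigned inv = *(unsigned *) ((char *) div_table_inv + ix);
+	  // inv is the mantissa of 1/d with an implicit leading 1 in bit 32,
+	  // so add n once to account for that bit:
+	  unsigned long long t = ((unsigned long long) inv * n >> 32) + n;
+	  unsigned q = (unsigned) (t >> 1);	// dmulu.l; addc; rotcr
+	  int sh = div_table_clz[d];		// shld: left if >= 0, else right
+	  return sh >= 0 ? q << sh : q >> -sh;
+	}
+ */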
+#ifdef __LITTLE_ENDIAN__
+#define L_LSB 0
+#define L_LSWMSB 1
+#define L_MSWLSB 2
+#else
+#define L_LSB 3
+#define L_LSWMSB 2
+#define L_MSWLSB 1
+#endif
+
+ .balign 4
+ .global GLOBAL(udivsi3_i4i)
+ FUNC(GLOBAL(udivsi3_i4i))
+GLOBAL(udivsi3_i4i):
+ mov.w LOCAL(c128_w), r1
+ div0u
+ mov r4,r0
+ shlr8 r0
+ cmp/hi r1,r5
+ extu.w r5,r1
+ bf LOCAL(udiv_le128)
+ cmp/eq r5,r1
+ bf LOCAL(udiv_ge64k)
+ shlr r0
+ mov r5,r1
+ shll16 r5
+ mov.l r4,@-r15
+ div1 r5,r0
+ mov.l r1,@-r15
+ div1 r5,r0
+ div1 r5,r0
+ bra LOCAL(udiv_25)
+ div1 r5,r0
+
+LOCAL(div_le128):
+ mova LOCAL(div_table_ix),r0
+ bra LOCAL(div_le128_2)
+ mov.b @(r0,r5),r1
+LOCAL(udiv_le128):
+ mov.l r4,@-r15
+ mova LOCAL(div_table_ix),r0
+ mov.b @(r0,r5),r1
+ mov.l r5,@-r15
+LOCAL(div_le128_2):
+ mova LOCAL(div_table_inv),r0
+ mov.l @(r0,r1),r1
+ mov r5,r0
+ tst #0xfe,r0
+ mova LOCAL(div_table_clz),r0
+ dmulu.l r1,r4
+ mov.b @(r0,r5),r1
+ bt/s LOCAL(div_by_1)
+ mov r4,r0
+ mov.l @r15+,r5
+ sts mach,r0
+ /* clrt */
+ addc r4,r0
+ mov.l @r15+,r4
+ rotcr r0
+ rts
+ shld r1,r0
+
+LOCAL(div_by_1_neg):
+ neg r4,r0
+LOCAL(div_by_1):
+ mov.l @r15+,r5
+ rts
+ mov.l @r15+,r4
+
+LOCAL(div_ge64k):
+ bt/s LOCAL(div_r8)
+ div0u
+ shll8 r5
+ bra LOCAL(div_ge64k_2)
+ div1 r5,r0
+LOCAL(udiv_ge64k):
+ cmp/hi r0,r5
+ mov r5,r1
+ bt LOCAL(udiv_r8)
+ shll8 r5
+ mov.l r4,@-r15
+ div1 r5,r0
+ mov.l r1,@-r15
+LOCAL(div_ge64k_2):
+ div1 r5,r0
+ mov.l LOCAL(zero_l),r1
+ .rept 4
+ div1 r5,r0
+ .endr
+ mov.l r1,@-r15
+ div1 r5,r0
+ mov.w LOCAL(m256_w),r1
+ div1 r5,r0
+ mov.b r0,@(L_LSWMSB,r15)
+ xor r4,r0
+ and r1,r0
+ bra LOCAL(div_ge64k_end)
+ xor r4,r0
+
+LOCAL(div_r8):
+ shll16 r4
+ bra LOCAL(div_r8_2)
+ shll8 r4
+LOCAL(udiv_r8):
+ mov.l r4,@-r15
+ shll16 r4
+ clrt
+ shll8 r4
+ mov.l r5,@-r15
+LOCAL(div_r8_2):
+ rotcl r4
+ mov r0,r1
+ div1 r5,r1
+ mov r4,r0
+ rotcl r0
+ mov r5,r4
+ div1 r5,r1
+ .rept 5
+ rotcl r0; div1 r5,r1
+ .endr
+ rotcl r0
+ mov.l @r15+,r5
+ div1 r4,r1
+ mov.l @r15+,r4
+ rts
+ rotcl r0
+
+ ENDFUNC(GLOBAL(udivsi3_i4i))
+
+ .global GLOBAL(sdivsi3_i4i)
+ FUNC(GLOBAL(sdivsi3_i4i))
+ /* This is link-compatible with a GLOBAL(sdivsi3) call,
+ but we effectively clobber only r1. */
+GLOBAL(sdivsi3_i4i):
+ mov.l r4,@-r15
+ cmp/pz r5
+ mov.w LOCAL(c128_w), r1
+ bt/s LOCAL(pos_divisor)
+ cmp/pz r4
+ mov.l r5,@-r15
+ neg r5,r5
+ bt/s LOCAL(neg_result)
+ cmp/hi r1,r5
+ neg r4,r4
+LOCAL(pos_result):
+ extu.w r5,r0
+ bf LOCAL(div_le128)
+ cmp/eq r5,r0
+ mov r4,r0
+ shlr8 r0
+ bf/s LOCAL(div_ge64k)
+ cmp/hi r0,r5
+ div0u
+ shll16 r5
+ div1 r5,r0
+ div1 r5,r0
+ div1 r5,r0
+LOCAL(udiv_25):
+ mov.l LOCAL(zero_l),r1
+ div1 r5,r0
+ div1 r5,r0
+ mov.l r1,@-r15
+ .rept 3
+ div1 r5,r0
+ .endr
+ mov.b r0,@(L_MSWLSB,r15)
+ xtrct r4,r0
+ swap.w r0,r0
+ .rept 8
+ div1 r5,r0
+ .endr
+ mov.b r0,@(L_LSWMSB,r15)
+LOCAL(div_ge64k_end):
+ .rept 8
+ div1 r5,r0
+ .endr
+ mov.l @r15+,r4 ! zero-extension and swap using LS unit.
+ extu.b r0,r0
+ mov.l @r15+,r5
+ or r4,r0
+ mov.l @r15+,r4
+ rts
+ rotcl r0
+
+LOCAL(div_le128_neg):
+ tst #0xfe,r0
+ mova LOCAL(div_table_ix),r0
+ mov.b @(r0,r5),r1
+ mova LOCAL(div_table_inv),r0
+ bt/s LOCAL(div_by_1_neg)
+ mov.l @(r0,r1),r1
+ mova LOCAL(div_table_clz),r0
+ dmulu.l r1,r4
+ mov.b @(r0,r5),r1
+ mov.l @r15+,r5
+ sts mach,r0
+ /* clrt */
+ addc r4,r0
+ mov.l @r15+,r4
+ rotcr r0
+ shld r1,r0
+ rts
+ neg r0,r0
+
+LOCAL(pos_divisor):
+ mov.l r5,@-r15
+ bt/s LOCAL(pos_result)
+ cmp/hi r1,r5
+ neg r4,r4
+LOCAL(neg_result):
+ extu.w r5,r0
+ bf LOCAL(div_le128_neg)
+ cmp/eq r5,r0
+ mov r4,r0
+ shlr8 r0
+ bf/s LOCAL(div_ge64k_neg)
+ cmp/hi r0,r5
+ div0u
+ mov.l LOCAL(zero_l),r1
+ shll16 r5
+ div1 r5,r0
+ mov.l r1,@-r15
+ .rept 7
+ div1 r5,r0
+ .endr
+ mov.b r0,@(L_MSWLSB,r15)
+ xtrct r4,r0
+ swap.w r0,r0
+ .rept 8
+ div1 r5,r0
+ .endr
+ mov.b r0,@(L_LSWMSB,r15)
+LOCAL(div_ge64k_neg_end):
+ .rept 8
+ div1 r5,r0
+ .endr
+ mov.l @r15+,r4 ! zero-extension and swap using LS unit.
+ extu.b r0,r1
+ mov.l @r15+,r5
+ or r4,r1
+LOCAL(div_r8_neg_end):
+ mov.l @r15+,r4
+ rotcl r1
+ rts
+ neg r1,r0
+
+LOCAL(div_ge64k_neg):
+ bt/s LOCAL(div_r8_neg)
+ div0u
+ shll8 r5
+ mov.l LOCAL(zero_l),r1
+ .rept 6
+ div1 r5,r0
+ .endr
+ mov.l r1,@-r15
+ div1 r5,r0
+ mov.w LOCAL(m256_w),r1
+ div1 r5,r0
+ mov.b r0,@(L_LSWMSB,r15)
+ xor r4,r0
+ and r1,r0
+ bra LOCAL(div_ge64k_neg_end)
+ xor r4,r0
+
+LOCAL(c128_w):
+ .word 128
+
+LOCAL(div_r8_neg):
+ clrt
+ shll16 r4
+ mov r4,r1
+ shll8 r1
+ mov r5,r4
+ .rept 7
+ rotcl r1; div1 r5,r0
+ .endr
+ mov.l @r15+,r5
+ rotcl r1
+ bra LOCAL(div_r8_neg_end)
+ div1 r4,r0
+
+LOCAL(m256_w):
+ .word 0xff00
+/* This table has been generated by divtab-sh4.c. */
+ .balign 4
+LOCAL(div_table_clz):
+ .byte 0
+ .byte 1
+ .byte 0
+ .byte -1
+ .byte -1
+ .byte -2
+ .byte -2
+ .byte -2
+ .byte -2
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -3
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -4
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -5
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+ .byte -6
+/* Lookup table translating positive divisor to index into table of
+ normalized inverse. N.B. the '0' entry is also the last entry of the
+ previous table, and causes an unaligned access for division by zero. */
+LOCAL(div_table_ix):
+ .byte -6
+ .byte -128
+ .byte -128
+ .byte 0
+ .byte -128
+ .byte -64
+ .byte 0
+ .byte 64
+ .byte -128
+ .byte -96
+ .byte -64
+ .byte -32
+ .byte 0
+ .byte 32
+ .byte 64
+ .byte 96
+ .byte -128
+ .byte -112
+ .byte -96
+ .byte -80
+ .byte -64
+ .byte -48
+ .byte -32
+ .byte -16
+ .byte 0
+ .byte 16
+ .byte 32
+ .byte 48
+ .byte 64
+ .byte 80
+ .byte 96
+ .byte 112
+ .byte -128
+ .byte -120
+ .byte -112
+ .byte -104
+ .byte -96
+ .byte -88
+ .byte -80
+ .byte -72
+ .byte -64
+ .byte -56
+ .byte -48
+ .byte -40
+ .byte -32
+ .byte -24
+ .byte -16
+ .byte -8
+ .byte 0
+ .byte 8
+ .byte 16
+ .byte 24
+ .byte 32
+ .byte 40
+ .byte 48
+ .byte 56
+ .byte 64
+ .byte 72
+ .byte 80
+ .byte 88
+ .byte 96
+ .byte 104
+ .byte 112
+ .byte 120
+ .byte -128
+ .byte -124
+ .byte -120
+ .byte -116
+ .byte -112
+ .byte -108
+ .byte -104
+ .byte -100
+ .byte -96
+ .byte -92
+ .byte -88
+ .byte -84
+ .byte -80
+ .byte -76
+ .byte -72
+ .byte -68
+ .byte -64
+ .byte -60
+ .byte -56
+ .byte -52
+ .byte -48
+ .byte -44
+ .byte -40
+ .byte -36
+ .byte -32
+ .byte -28
+ .byte -24
+ .byte -20
+ .byte -16
+ .byte -12
+ .byte -8
+ .byte -4
+ .byte 0
+ .byte 4
+ .byte 8
+ .byte 12
+ .byte 16
+ .byte 20
+ .byte 24
+ .byte 28
+ .byte 32
+ .byte 36
+ .byte 40
+ .byte 44
+ .byte 48
+ .byte 52
+ .byte 56
+ .byte 60
+ .byte 64
+ .byte 68
+ .byte 72
+ .byte 76
+ .byte 80
+ .byte 84
+ .byte 88
+ .byte 92
+ .byte 96
+ .byte 100
+ .byte 104
+ .byte 108
+ .byte 112
+ .byte 116
+ .byte 120
+ .byte 124
+ .byte -128
+/* 1/64 .. 1/127, normalized. There is an implicit leading 1 in bit 32. */
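+/* For example, the 1/65 entry appears to be
+   floor ((1ull << 39) / 65) + 1 - (1ull << 32) == 0xF81F81F9,
+   i.e. the rounded-up mantissa; 1/64 is exact and stored as 0
+   (editorial note, rounding inferred from the table).  */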
+ .balign 4
+LOCAL(zero_l):
+ .long 0x0
+ .long 0xF81F81F9
+ .long 0xF07C1F08
+ .long 0xE9131AC0
+ .long 0xE1E1E1E2
+ .long 0xDAE6076C
+ .long 0xD41D41D5
+ .long 0xCD856891
+ .long 0xC71C71C8
+ .long 0xC0E07039
+ .long 0xBACF914D
+ .long 0xB4E81B4F
+ .long 0xAF286BCB
+ .long 0xA98EF607
+ .long 0xA41A41A5
+ .long 0x9EC8E952
+ .long 0x9999999A
+ .long 0x948B0FCE
+ .long 0x8F9C18FA
+ .long 0x8ACB90F7
+ .long 0x86186187
+ .long 0x81818182
+ .long 0x7D05F418
+ .long 0x78A4C818
+ .long 0x745D1746
+ .long 0x702E05C1
+ .long 0x6C16C16D
+ .long 0x68168169
+ .long 0x642C8591
+ .long 0x60581606
+ .long 0x5C9882BA
+ .long 0x58ED2309
+LOCAL(div_table_inv):
+ .long 0x55555556
+ .long 0x51D07EAF
+ .long 0x4E5E0A73
+ .long 0x4AFD6A06
+ .long 0x47AE147B
+ .long 0x446F8657
+ .long 0x41414142
+ .long 0x3E22CBCF
+ .long 0x3B13B13C
+ .long 0x38138139
+ .long 0x3521CFB3
+ .long 0x323E34A3
+ .long 0x2F684BDB
+ .long 0x2C9FB4D9
+ .long 0x29E4129F
+ .long 0x27350B89
+ .long 0x24924925
+ .long 0x21FB7813
+ .long 0x1F7047DD
+ .long 0x1CF06ADB
+ .long 0x1A7B9612
+ .long 0x18118119
+ .long 0x15B1E5F8
+ .long 0x135C8114
+ .long 0x11111112
+ .long 0xECF56BF
+ .long 0xC9714FC
+ .long 0xA6810A7
+ .long 0x8421085
+ .long 0x624DD30
+ .long 0x4104105
+ .long 0x2040811
+	/* maximum error: 0.987342 scaled: 0.921875 */
+
+ ENDFUNC(GLOBAL(sdivsi3_i4i))
+#endif /* SH3 / SH4 */
+
+#endif /* L_div_table */
+
+#ifdef L_udiv_qrnnd_16
+#if !__SHMEDIA__
+ HIDDEN_FUNC(GLOBAL(udiv_qrnnd_16))
+ /* r0: rn r1: qn */ /* r0: n1 r4: n0 r5: d r6: d1 */ /* r2: __m */
+ /* n1 < d, but n1 might be larger than d1. */
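+	/* Editorial note (inferred from the register comments above, not
+	   documented here): this is a gmp-style udiv_qrnnd step -- it
+	   divides the two-word value n1:n0 by d in 16-bit halves, with
+	   the quotient returned in r1 ("qn") and the remainder in r0
+	   ("rn"); n1 < d is a precondition.  */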
+ .global GLOBAL(udiv_qrnnd_16)
+ .balign 8
+GLOBAL(udiv_qrnnd_16):
+ div0u
+ cmp/hi r6,r0
+ bt .Lots
+ .rept 16
+ div1 r6,r0
+ .endr
+ extu.w r0,r1
+ bt 0f
+ add r6,r0
+0: rotcl r1
+ mulu.w r1,r5
+ xtrct r4,r0
+ swap.w r0,r0
+ sts macl,r2
+ cmp/hs r2,r0
+ sub r2,r0
+ bt 0f
+ addc r5,r0
+ add #-1,r1
+ bt 0f
+1: add #-1,r1
+ rts
+ add r5,r0
+ .balign 8
+.Lots:
+ sub r5,r0
+ swap.w r4,r1
+ xtrct r0,r1
+ clrt
+ mov r1,r0
+ addc r5,r0
+ mov #-1,r1
+ SL1(bf, 1b,
+ shlr16 r1)
+0: rts
+ nop
+ ENDFUNC(GLOBAL(udiv_qrnnd_16))
+#endif /* !__SHMEDIA__ */
+#endif /* L_udiv_qrnnd_16 */