Imported gcc-4.4.3

[msp430-gcc.git] / gcc / config / sh / lib1funcs.asm
diff --git a/gcc/config/sh/lib1funcs.asm b/gcc/config/sh/lib1funcs.asm

index 1f61a8dcbc40481f3df31dca25600893bc3192a8..529acb162f87f5e733c5e98659614af64649a10d 100644 (file)
--- a/gcc/config/sh/lib1funcs.asm
+++ b/gcc/config/sh/lib1funcs.asm
@@ -1,31 +1,28 @@
-/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001
+/* Copyright (C) 1994, 1995, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
+   2004, 2005, 2006, 2009
     Free Software Foundation, Inc.
  
  This file is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 2, or (at your option) any
+Free Software Foundation; either version 3, or (at your option) any
  later version.
  
-In addition to the permissions in the GNU General Public License, the
-Free Software Foundation gives you unlimited permission to link the
-compiled version of this file into combinations with other programs,
-and to distribute those combinations without any restriction coming
-from the use of this file.  (The General Public License restrictions
-do apply in other respects; for example, they cover modification of
-the file, and distribution when not linked into a combine
-executable.)
-
  This file is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.
  
-You should have received a copy of the GNU General Public License
-along with this program; see the file COPYING.  If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA.  */
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
  
-!! libgcc routines for the Hitachi / SuperH SH CPUs.
+!! libgcc routines for the Renesas / SuperH SH CPUs.
  !! Contributed by Steve Chamberlain.
  !! sac@cygnus.com
  
@@ -37,24 +34,15 @@ Boston, MA 02111-1307, USA.  */
     ELF local label prefixes by J"orn Rennecke
     amylaar@cygnus.com  */
  
-#ifdef __ELF__
-#define LOCAL(X) .L_##X
-#else
-#define LOCAL(X) L_##X
-#endif
-
-#ifdef __linux__
-#define GLOBAL(X) __##X
-#endif
-
-#ifndef GLOBAL
-#define GLOBAL(X) ___##X
-#endif
+#include "lib1funcs.h"
  
-#if defined __SH5__ && ! defined __SH4_NOFPU__
-#define FMOVD_WORKS
+/* t-vxworks needs to build both PIC and non-PIC versions of libgcc,
+   so it is more convenient to define NO_FPSCR_VALUES here than to
+   define it on the command line.  */
+#if defined __vxworks && defined __PIC__
+#define NO_FPSCR_VALUES
  #endif
-
+       
  #if ! __SH5__
  #ifdef L_ashiftrt
         .global GLOBAL(ashiftrt_r4_0)
@@ -91,6 +79,40 @@ Boston, MA 02111-1307, USA.  */
         .global GLOBAL(ashiftrt_r4_31)
         .global GLOBAL(ashiftrt_r4_32)
  
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_0))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_1))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_2))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_3))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_4))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_5))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_6))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_7))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_8))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_9))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_10))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_11))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_12))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_13))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_14))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_15))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_16))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_17))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_18))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_19))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_20))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_21))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_22))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_23))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_24))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_25))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_26))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_27))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_28))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_29))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_30))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_31))
+       HIDDEN_FUNC(GLOBAL(ashiftrt_r4_32))
+
         .align  1
  GLOBAL(ashiftrt_r4_32):
  GLOBAL(ashiftrt_r4_31):
@@ -170,6 +192,40 @@ GLOBAL(ashiftrt_r4_1):
  GLOBAL(ashiftrt_r4_0):
         rts
         nop
+
+       ENDFUNC(GLOBAL(ashiftrt_r4_0))
+       ENDFUNC(GLOBAL(ashiftrt_r4_1))
+       ENDFUNC(GLOBAL(ashiftrt_r4_2))
+       ENDFUNC(GLOBAL(ashiftrt_r4_3))
+       ENDFUNC(GLOBAL(ashiftrt_r4_4))
+       ENDFUNC(GLOBAL(ashiftrt_r4_5))
+       ENDFUNC(GLOBAL(ashiftrt_r4_6))
+       ENDFUNC(GLOBAL(ashiftrt_r4_7))
+       ENDFUNC(GLOBAL(ashiftrt_r4_8))
+       ENDFUNC(GLOBAL(ashiftrt_r4_9))
+       ENDFUNC(GLOBAL(ashiftrt_r4_10))
+       ENDFUNC(GLOBAL(ashiftrt_r4_11))
+       ENDFUNC(GLOBAL(ashiftrt_r4_12))
+       ENDFUNC(GLOBAL(ashiftrt_r4_13))
+       ENDFUNC(GLOBAL(ashiftrt_r4_14))
+       ENDFUNC(GLOBAL(ashiftrt_r4_15))
+       ENDFUNC(GLOBAL(ashiftrt_r4_16))
+       ENDFUNC(GLOBAL(ashiftrt_r4_17))
+       ENDFUNC(GLOBAL(ashiftrt_r4_18))
+       ENDFUNC(GLOBAL(ashiftrt_r4_19))
+       ENDFUNC(GLOBAL(ashiftrt_r4_20))
+       ENDFUNC(GLOBAL(ashiftrt_r4_21))
+       ENDFUNC(GLOBAL(ashiftrt_r4_22))
+       ENDFUNC(GLOBAL(ashiftrt_r4_23))
+       ENDFUNC(GLOBAL(ashiftrt_r4_24))
+       ENDFUNC(GLOBAL(ashiftrt_r4_25))
+       ENDFUNC(GLOBAL(ashiftrt_r4_26))
+       ENDFUNC(GLOBAL(ashiftrt_r4_27))
+       ENDFUNC(GLOBAL(ashiftrt_r4_28))
+       ENDFUNC(GLOBAL(ashiftrt_r4_29))
+       ENDFUNC(GLOBAL(ashiftrt_r4_30))
+       ENDFUNC(GLOBAL(ashiftrt_r4_31))
+       ENDFUNC(GLOBAL(ashiftrt_r4_32))
  #endif
  
  #ifdef L_ashiftrt_n
@@ -192,6 +248,7 @@ GLOBAL(ashiftrt_r4_0):
  !
  
         .global GLOBAL(ashrsi3)
+       HIDDEN_FUNC(GLOBAL(ashrsi3))
         .align  2
  GLOBAL(ashrsi3):
         mov     #31,r0
@@ -319,6 +376,7 @@ LOCAL(ashrsi3_0):
         rts
         nop
  
+       ENDFUNC(GLOBAL(ashrsi3))
  #endif
  
  #ifdef L_ashiftlt
@@ -340,6 +398,7 @@ LOCAL(ashrsi3_0):
  ! (none)
  !
         .global GLOBAL(ashlsi3)
+       HIDDEN_FUNC(GLOBAL(ashlsi3))
         .align  2
  GLOBAL(ashlsi3):
         mov     #31,r0
@@ -476,6 +535,7 @@ LOCAL(ashlsi3_0):
         rts
         nop
  
+       ENDFUNC(GLOBAL(ashlsi3))
  #endif
  
  #ifdef L_lshiftrt
@@ -497,6 +557,7 @@ LOCAL(ashlsi3_0):
  ! (none)
  !
         .global GLOBAL(lshrsi3)
+       HIDDEN_FUNC(GLOBAL(lshrsi3))
         .align  2
  GLOBAL(lshrsi3):
         mov     #31,r0
@@ -633,165 +694,196 @@ LOCAL(lshrsi3_0):
         rts
         nop
  
+       ENDFUNC(GLOBAL(lshrsi3))
  #endif
  
-#ifdef L_movstr
+#ifdef L_movmem
         .text
-! done all the large groups, do the remainder
-
-! jump to movstr+
-done:
-       add     #64,r5
-       mova    GLOBAL(movstrSI0),r0
+       .balign 4
+       .global GLOBAL(movmem)
+       HIDDEN_FUNC(GLOBAL(movmem))
+       HIDDEN_ALIAS(movstr,movmem)
+       /* This would be a lot simpler if r6 contained the byte count
+          minus 64, and we wouldn't be called here for a byte count of 64.  */
+GLOBAL(movmem):
+       sts.l   pr,@-r15
         shll2   r6
-       add     r6,r0
-       jmp     @r0
-       add     #64,r4
-       .align  4
-       .global GLOBAL(movstrSI64)
-GLOBAL(movstrSI64):
+       bsr     GLOBAL(movmemSI52+2)
+       mov.l   @(48,r5),r0
+       .balign 4
+LOCAL(movmem_loop): /* Reached with rts */
         mov.l   @(60,r5),r0
+       add     #-64,r6
         mov.l   r0,@(60,r4)
-       .global GLOBAL(movstrSI60)
-GLOBAL(movstrSI60):
+       tst     r6,r6
         mov.l   @(56,r5),r0
+       bt      LOCAL(movmem_done)
         mov.l   r0,@(56,r4)
-       .global GLOBAL(movstrSI56)
-GLOBAL(movstrSI56):
+       cmp/pl  r6
         mov.l   @(52,r5),r0
+       add     #64,r5
         mov.l   r0,@(52,r4)
-       .global GLOBAL(movstrSI52)
-GLOBAL(movstrSI52):
-       mov.l   @(48,r5),r0
-       mov.l   r0,@(48,r4)
-       .global GLOBAL(movstrSI48)
-GLOBAL(movstrSI48):
-       mov.l   @(44,r5),r0
-       mov.l   r0,@(44,r4)
-       .global GLOBAL(movstrSI44)
-GLOBAL(movstrSI44):
-       mov.l   @(40,r5),r0
-       mov.l   r0,@(40,r4)
-       .global GLOBAL(movstrSI40)
-GLOBAL(movstrSI40):
-       mov.l   @(36,r5),r0
-       mov.l   r0,@(36,r4)
-       .global GLOBAL(movstrSI36)
-GLOBAL(movstrSI36):
-       mov.l   @(32,r5),r0
-       mov.l   r0,@(32,r4)
-       .global GLOBAL(movstrSI32)
-GLOBAL(movstrSI32):
-       mov.l   @(28,r5),r0
-       mov.l   r0,@(28,r4)
-       .global GLOBAL(movstrSI28)
-GLOBAL(movstrSI28):
-       mov.l   @(24,r5),r0
-       mov.l   r0,@(24,r4)
-       .global GLOBAL(movstrSI24)
-GLOBAL(movstrSI24):
-       mov.l   @(20,r5),r0
-       mov.l   r0,@(20,r4)
-       .global GLOBAL(movstrSI20)
-GLOBAL(movstrSI20):
-       mov.l   @(16,r5),r0
-       mov.l   r0,@(16,r4)
-       .global GLOBAL(movstrSI16)
-GLOBAL(movstrSI16):
-       mov.l   @(12,r5),r0
-       mov.l   r0,@(12,r4)
-       .global GLOBAL(movstrSI12)
-GLOBAL(movstrSI12):
-       mov.l   @(8,r5),r0
-       mov.l   r0,@(8,r4)
-       .global GLOBAL(movstrSI8)
-GLOBAL(movstrSI8):
-       mov.l   @(4,r5),r0
-       mov.l   r0,@(4,r4)
-       .global GLOBAL(movstrSI4)
-GLOBAL(movstrSI4):
-       mov.l   @(0,r5),r0
-       mov.l   r0,@(0,r4)
-GLOBAL(movstrSI0):
+       add     #64,r4
+       bt      GLOBAL(movmemSI52)
+! done all the large groups, do the remainder
+! jump to movmem+
+       mova    GLOBAL(movmemSI4)+4,r0
+       add     r6,r0
+       jmp     @r0
+LOCAL(movmem_done): ! share slot insn, works out aligned.
+       lds.l   @r15+,pr
+       mov.l   r0,@(56,r4)
+       mov.l   @(52,r5),r0
         rts
-       nop
-
-       .align  4
-
-       .global GLOBAL(movstr)
-GLOBAL(movstr):
+       mov.l   r0,@(52,r4)
+       .balign 4
+! ??? We need aliases movstr* for movmem* for the older libraries.  These
+! aliases will be removed at some point in the future.
+       .global GLOBAL(movmemSI64)
+       HIDDEN_FUNC(GLOBAL(movmemSI64))
+       HIDDEN_ALIAS(movstrSI64,movmemSI64)
+GLOBAL(movmemSI64):
         mov.l   @(60,r5),r0
         mov.l   r0,@(60,r4)
-
+       .global GLOBAL(movmemSI60)
+       HIDDEN_FUNC(GLOBAL(movmemSI60))
+       HIDDEN_ALIAS(movstrSI60,movmemSI60)
+GLOBAL(movmemSI60):
         mov.l   @(56,r5),r0
         mov.l   r0,@(56,r4)
-
+       .global GLOBAL(movmemSI56)
+       HIDDEN_FUNC(GLOBAL(movmemSI56))
+       HIDDEN_ALIAS(movstrSI56,movmemSI56)
+GLOBAL(movmemSI56):
         mov.l   @(52,r5),r0
         mov.l   r0,@(52,r4)
-
+       .global GLOBAL(movmemSI52)
+       HIDDEN_FUNC(GLOBAL(movmemSI52))
+       HIDDEN_ALIAS(movstrSI52,movmemSI52)
+GLOBAL(movmemSI52):
         mov.l   @(48,r5),r0
         mov.l   r0,@(48,r4)
-
+       .global GLOBAL(movmemSI48)
+       HIDDEN_FUNC(GLOBAL(movmemSI48))
+       HIDDEN_ALIAS(movstrSI48,movmemSI48)
+GLOBAL(movmemSI48):
         mov.l   @(44,r5),r0
         mov.l   r0,@(44,r4)
-
+       .global GLOBAL(movmemSI44)
+       HIDDEN_FUNC(GLOBAL(movmemSI44))
+       HIDDEN_ALIAS(movstrSI44,movmemSI44)
+GLOBAL(movmemSI44):
         mov.l   @(40,r5),r0
         mov.l   r0,@(40,r4)
-
+       .global GLOBAL(movmemSI40)
+       HIDDEN_FUNC(GLOBAL(movmemSI40))
+       HIDDEN_ALIAS(movstrSI40,movmemSI40)
+GLOBAL(movmemSI40):
         mov.l   @(36,r5),r0
         mov.l   r0,@(36,r4)
-
+       .global GLOBAL(movmemSI36)
+       HIDDEN_FUNC(GLOBAL(movmemSI36))
+       HIDDEN_ALIAS(movstrSI36,movmemSI36)
+GLOBAL(movmemSI36):
         mov.l   @(32,r5),r0
         mov.l   r0,@(32,r4)
-
+       .global GLOBAL(movmemSI32)
+       HIDDEN_FUNC(GLOBAL(movmemSI32))
+       HIDDEN_ALIAS(movstrSI32,movmemSI32)
+GLOBAL(movmemSI32):
         mov.l   @(28,r5),r0
         mov.l   r0,@(28,r4)
-
+       .global GLOBAL(movmemSI28)
+       HIDDEN_FUNC(GLOBAL(movmemSI28))
+       HIDDEN_ALIAS(movstrSI28,movmemSI28)
+GLOBAL(movmemSI28):
         mov.l   @(24,r5),r0
         mov.l   r0,@(24,r4)
-
+       .global GLOBAL(movmemSI24)
+       HIDDEN_FUNC(GLOBAL(movmemSI24))
+       HIDDEN_ALIAS(movstrSI24,movmemSI24)
+GLOBAL(movmemSI24):
         mov.l   @(20,r5),r0
         mov.l   r0,@(20,r4)
-
+       .global GLOBAL(movmemSI20)
+       HIDDEN_FUNC(GLOBAL(movmemSI20))
+       HIDDEN_ALIAS(movstrSI20,movmemSI20)
+GLOBAL(movmemSI20):
         mov.l   @(16,r5),r0
         mov.l   r0,@(16,r4)
-
+       .global GLOBAL(movmemSI16)
+       HIDDEN_FUNC(GLOBAL(movmemSI16))
+       HIDDEN_ALIAS(movstrSI16,movmemSI16)
+GLOBAL(movmemSI16):
         mov.l   @(12,r5),r0
         mov.l   r0,@(12,r4)
-
+       .global GLOBAL(movmemSI12)
+       HIDDEN_FUNC(GLOBAL(movmemSI12))
+       HIDDEN_ALIAS(movstrSI12,movmemSI12)
+GLOBAL(movmemSI12):
         mov.l   @(8,r5),r0
         mov.l   r0,@(8,r4)
-
+       .global GLOBAL(movmemSI8)
+       HIDDEN_FUNC(GLOBAL(movmemSI8))
+       HIDDEN_ALIAS(movstrSI8,movmemSI8)
+GLOBAL(movmemSI8):
         mov.l   @(4,r5),r0
         mov.l   r0,@(4,r4)
-
+       .global GLOBAL(movmemSI4)
+       HIDDEN_FUNC(GLOBAL(movmemSI4))
+       HIDDEN_ALIAS(movstrSI4,movmemSI4)
+GLOBAL(movmemSI4):
         mov.l   @(0,r5),r0
+       rts
         mov.l   r0,@(0,r4)
  
-       add     #-16,r6
-       cmp/pl  r6
-       bf      done
-
-       add     #64,r5
-       bra     GLOBAL(movstr)
-       add     #64,r4
+       ENDFUNC(GLOBAL(movmemSI64))
+       ENDFUNC(GLOBAL(movmemSI60))
+       ENDFUNC(GLOBAL(movmemSI56))
+       ENDFUNC(GLOBAL(movmemSI52))
+       ENDFUNC(GLOBAL(movmemSI48))
+       ENDFUNC(GLOBAL(movmemSI44))
+       ENDFUNC(GLOBAL(movmemSI40))
+       ENDFUNC(GLOBAL(movmemSI36))
+       ENDFUNC(GLOBAL(movmemSI32))
+       ENDFUNC(GLOBAL(movmemSI28))
+       ENDFUNC(GLOBAL(movmemSI24))
+       ENDFUNC(GLOBAL(movmemSI20))
+       ENDFUNC(GLOBAL(movmemSI16))
+       ENDFUNC(GLOBAL(movmemSI12))
+       ENDFUNC(GLOBAL(movmemSI8))
+       ENDFUNC(GLOBAL(movmemSI4))
+       ENDFUNC(GLOBAL(movmem))
  #endif
  
-#ifdef L_movstr_i4
+#ifdef L_movmem_i4
         .text
-       .global GLOBAL(movstr_i4_even)
-       .global GLOBAL(movstr_i4_odd)
-       .global GLOBAL(movstrSI12_i4)
+       .global GLOBAL(movmem_i4_even)
+       .global GLOBAL(movmem_i4_odd)
+       .global GLOBAL(movmemSI12_i4)
+
+       HIDDEN_FUNC(GLOBAL(movmem_i4_even))
+       HIDDEN_FUNC(GLOBAL(movmem_i4_odd))
+       HIDDEN_FUNC(GLOBAL(movmemSI12_i4))
+
+       HIDDEN_ALIAS(movstr_i4_even,movmem_i4_even)
+       HIDDEN_ALIAS(movstr_i4_odd,movmem_i4_odd)
+       HIDDEN_ALIAS(movstrSI12_i4,movmemSI12_i4)
  
         .p2align        5
-L_movstr_2mod4_end:
+L_movmem_2mod4_end:
         mov.l   r0,@(16,r4)
         rts
         mov.l   r1,@(20,r4)
  
         .p2align        2
  
-GLOBAL(movstr_i4_odd):
+GLOBAL(movmem_i4_even):
+       mov.l   @r5+,r0
+       bra     L_movmem_start_even
+       mov.l   @r5+,r1
+
+GLOBAL(movmem_i4_odd):
         mov.l   @r5+,r1
         add     #-4,r4
         mov.l   @r5+,r2
@@ -799,31 +891,29 @@ GLOBAL(movstr_i4_odd):
         mov.l   r1,@(4,r4)
         mov.l   r2,@(8,r4)
  
-L_movstr_loop:
+L_movmem_loop:
         mov.l   r3,@(12,r4)
         dt      r6
         mov.l   @r5+,r0
-       bt/s    L_movstr_2mod4_end
+       bt/s    L_movmem_2mod4_end
         mov.l   @r5+,r1
         add     #16,r4
-L_movstr_start_even:
+L_movmem_start_even:
         mov.l   @r5+,r2
         mov.l   @r5+,r3
         mov.l   r0,@r4
         dt      r6
         mov.l   r1,@(4,r4)
-       bf/s    L_movstr_loop
+       bf/s    L_movmem_loop
         mov.l   r2,@(8,r4)
         rts
         mov.l   r3,@(12,r4)
  
-GLOBAL(movstr_i4_even):
-       mov.l   @r5+,r0
-       bra     L_movstr_start_even
-       mov.l   @r5+,r1
+       ENDFUNC(GLOBAL(movmem_i4_even))
+       ENDFUNC(GLOBAL(movmem_i4_odd))
  
         .p2align        4
-GLOBAL(movstrSI12_i4):
+GLOBAL(movmemSI12_i4):
         mov.l   @r5,r0
         mov.l   @(4,r5),r1
         mov.l   @(8,r5),r2
@@ -831,12 +921,15 @@ GLOBAL(movstrSI12_i4):
         mov.l   r1,@(4,r4)
         rts
         mov.l   r2,@(8,r4)
+
+       ENDFUNC(GLOBAL(movmemSI12_i4))
  #endif
  
  #ifdef L_mulsi3
  
  
         .global GLOBAL(mulsi3)
+       HIDDEN_FUNC(GLOBAL(mulsi3))
  
  ! r4 =       aabb
  ! r5 =       ccdd
@@ -869,16 +962,17 @@ hiset:    sts     macl,r0         ! r0 = bb*dd
         rts
         add     r2,r0
  
-
+       ENDFUNC(GLOBAL(mulsi3))
  #endif
  #endif /* ! __SH5__ */
  #ifdef L_sdivsi3_i4
         .title "SH DIVIDE"
-!! 4 byte integer Divide code for the Hitachi SH
+!! 4 byte integer Divide code for the Renesas SH
  #ifdef __SH4__
  !! args in r4 and r5, result in fpul, clobber dr0, dr2
  
         .global GLOBAL(sdivsi3_i4)
+       HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
  GLOBAL(sdivsi3_i4):
         lds r4,fpul
         float fpul,dr0
@@ -888,6 +982,7 @@ GLOBAL(sdivsi3_i4):
         rts
         ftrc dr0,fpul
  
+       ENDFUNC(GLOBAL(sdivsi3_i4))
  #elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
  !! args in r4 and r5, result in fpul, clobber r2, dr0, dr2
  
@@ -896,6 +991,7 @@ GLOBAL(sdivsi3_i4):
         .mode   SHcompact
  #endif
         .global GLOBAL(sdivsi3_i4)
+       HIDDEN_FUNC(GLOBAL(sdivsi3_i4))
  GLOBAL(sdivsi3_i4):
         sts.l fpscr,@-r15
         mov #8,r2
@@ -910,13 +1006,14 @@ GLOBAL(sdivsi3_i4):
         rts
         lds.l @r15+,fpscr
  
+       ENDFUNC(GLOBAL(sdivsi3_i4))
  #endif /* ! __SH5__ || __SH5__ == 32 */
  #endif /* ! __SH4__ */
  #endif
  
  #ifdef L_sdivsi3
  /* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
-   sh3e code.  */
+   sh2e/sh3e code.  */
  #if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__)
  !!
  !! Steve Chamberlain
@@ -924,7 +1021,7 @@ GLOBAL(sdivsi3_i4):
  !!
  !!
  
-!! args in r4 and r5, result in r0 clobber r1,r2,r3
+!! args in r4 and r5, result in r0, clobber r1, r2, r3, and t bit
  
         .global GLOBAL(sdivsi3)
  #if __SHMEDIA__
@@ -934,6 +1031,7 @@ GLOBAL(sdivsi3_i4):
         .text
  #endif
         .align  2
+#if 0
  /* The assembly code that follows is a hand-optimized version of the C
     code that follows.  Note that the registers that are modified are
     exactly those listed as clobbered in the patterns divsi3_i1 and
@@ -991,7 +1089,162 @@ LOCAL(sdivsi3_dontadd):
         muls.l  r0, r2, r0
         add.l   r0, r63, r0
         blink   tr0, r63
-#else
+#elif 0 /* ! 0 */
+ // inputs: r4,r5
+ // clobbered: r1,r2,r3,r18,r19,r20,r21,r25,tr0
+ // result in r0
+GLOBAL(sdivsi3):
+ // can create absolute value without extra latency,
+ // but dependent on proper sign extension of inputs:
+ // shari.l r5,31,r2
+ // xor r5,r2,r20
+ // sub r20,r2,r20 // r20 is now absolute value of r5, zero-extended.
+ shari.l r5,31,r2
+ ori r2,1,r2
+ muls.l r5,r2,r20 // r20 is now absolute value of r5, zero-extended.
+ movi 0xffffffffffffbb0c,r19 // shift count eqiv 76
+ shari.l r4,31,r3
+ nsb r20,r0
+ shlld r20,r0,r25
+ shlri r25,48,r25
+ sub r19,r25,r1
+ mmulfx.w r1,r1,r2
+ mshflo.w r1,r63,r1
+ // If r4 was to be used in-place instead of r21, could use this sequence
+ // to compute absolute:
+ // sub r63,r4,r19 // compute absolute value of r4
+ // shlri r4,32,r3 // into lower 32 bit of r4, keeping
+ // mcmv r19,r3,r4 // the sign in the upper 32 bits intact.
+ ori r3,1,r3
+ mmulfx.w r25,r2,r2
+ sub r19,r0,r0
+ muls.l r4,r3,r21
+ msub.w r1,r2,r2
+ addi r2,-2,r1
+ mulu.l r21,r1,r19
+ mmulfx.w r2,r2,r2
+ shlli r1,15,r1
+ shlrd r19,r0,r19
+ mulu.l r19,r20,r3
+ mmacnfx.wl r25,r2,r1
+ ptabs r18,tr0
+ sub r21,r3,r25
+
+ mulu.l r25,r1,r2
+ addi r0,14,r0
+ xor r4,r5,r18
+ shlrd r2,r0,r2
+ mulu.l r2,r20,r3
+ add r19,r2,r19
+ shari.l r18,31,r18
+ sub r25,r3,r25
+
+ mulu.l r25,r1,r2
+ sub r25,r20,r25
+ add r19,r18,r19
+ shlrd r2,r0,r2
+ mulu.l r2,r20,r3
+ addi r25,1,r25
+ add r19,r2,r19
+
+ cmpgt r25,r3,r25
+ add.l r19,r25,r0
+ xor r0,r18,r0
+ blink tr0,r63
+#else /* ! 0 && ! 0 */
+
+ // inputs: r4,r5
+ // clobbered: r1,r18,r19,r20,r21,r25,tr0
+ // result in r0
+       HIDDEN_FUNC(GLOBAL(sdivsi3_2))
+#ifndef __pic__
+       FUNC(GLOBAL(sdivsi3))
+GLOBAL(sdivsi3): /* this is the shcompact entry point */
+ // The special SHmedia entry point sdivsi3_1 prevents accidental linking
+ // with the SHcompact implementation, which clobbers tr1 / tr2.
+ .global GLOBAL(sdivsi3_1)
+GLOBAL(sdivsi3_1):
+ .global GLOBAL(div_table_internal)
+ movi (GLOBAL(div_table_internal) >> 16) & 65535, r20
+ shori GLOBAL(div_table_internal) & 65535, r20
+#endif
+ .global GLOBAL(sdivsi3_2)
+ // div_table in r20
+ // clobbered: r1,r18,r19,r21,r25,tr0
+GLOBAL(sdivsi3_2):
+ nsb r5, r1
+ shlld r5, r1, r25    // normalize; [-2 ..1, 1..2) in s2.62
+ shari r25, 58, r21   // extract 5(6) bit index (s2.4 with hole -1..1)
+ ldx.ub r20, r21, r19 // u0.8
+ shari r25, 32, r25   // normalize to s2.30
+ shlli r21, 1, r21
+ muls.l r25, r19, r19 // s2.38
+ ldx.w r20, r21, r21  // s2.14
+  ptabs r18, tr0
+ shari r19, 24, r19   // truncate to s2.14
+ sub r21, r19, r19    // some 11 bit inverse in s1.14
+ muls.l r19, r19, r21 // u0.28
+  sub r63, r1, r1
+  addi r1, 92, r1
+ muls.l r25, r21, r18 // s2.58
+ shlli r19, 45, r19   // multiply by two and convert to s2.58
+  /* bubble */
+ sub r19, r18, r18
+ shari r18, 28, r18   // some 22 bit inverse in s1.30
+ muls.l r18, r25, r0  // s2.60
+  muls.l r18, r4, r25 // s32.30
+  /* bubble */
+ shari r0, 16, r19   // s-16.44
+ muls.l r19, r18, r19 // s-16.74
+  shari r25, 63, r0
+  shari r4, 14, r18   // s19.-14
+ shari r19, 30, r19   // s-16.44
+ muls.l r19, r18, r19 // s15.30
+  xor r21, r0, r21    // You could also use the constant 1 << 27.
+  add r21, r25, r21
+ sub r21, r19, r21
+ shard r21, r1, r21
+ sub r21, r0, r0
+ blink tr0, r63
+#ifndef __pic__
+       ENDFUNC(GLOBAL(sdivsi3))
+#endif
+       ENDFUNC(GLOBAL(sdivsi3_2))
+#endif
+#elif defined __SHMEDIA__
+/* m5compact-nofpu */
+ // clobbered: r18,r19,r20,r21,r25,tr0,tr1,tr2
+       .mode   SHmedia
+       .section        .text..SHmedia32,"ax"
+       .align  2
+       FUNC(GLOBAL(sdivsi3))
+GLOBAL(sdivsi3):
+       pt/l LOCAL(sdivsi3_dontsub), tr0
+       pt/l LOCAL(sdivsi3_loop), tr1
+       ptabs/l r18,tr2
+       shari.l r4,31,r18
+       shari.l r5,31,r19
+       xor r4,r18,r20
+       xor r5,r19,r21
+       sub.l r20,r18,r20
+       sub.l r21,r19,r21
+       xor r18,r19,r19
+       shlli r21,32,r25
+       addi r25,-1,r21
+       addz.l r20,r63,r20
+LOCAL(sdivsi3_loop):
+       shlli r20,1,r20
+       bgeu/u r21,r20,tr0
+       sub r20,r21,r20
+LOCAL(sdivsi3_dontsub):
+       addi.l r25,-1,r25
+       bnei r25,-32,tr1
+       xor r20,r19,r20
+       sub.l r20,r19,r0
+       blink tr2,r63
+       ENDFUNC(GLOBAL(sdivsi3))
+#else /* ! __SHMEDIA__ */
+       FUNC(GLOBAL(sdivsi3))
  GLOBAL(sdivsi3):
         mov     r4,r1
         mov     r5,r0
@@ -1076,17 +1329,20 @@ GLOBAL(sdivsi3):
  div0:  rts
         mov     #0,r0
  
+       ENDFUNC(GLOBAL(sdivsi3))
  #endif /* ! __SHMEDIA__ */
  #endif /* ! __SH4__ */
  #endif
  #ifdef L_udivsi3_i4
  
         .title "SH DIVIDE"
-!! 4 byte integer Divide code for the Hitachi SH
+!! 4 byte integer Divide code for the Renesas SH
  #ifdef __SH4__
-!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
+!! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4,
+!! and t bit
  
         .global GLOBAL(udivsi3_i4)
+       HIDDEN_FUNC(GLOBAL(udivsi3_i4))
  GLOBAL(udivsi3_i4):
         mov #1,r1
         cmp/hi r1,r5
@@ -1098,13 +1354,8 @@ GLOBAL(udivsi3_i4):
  #ifdef FMOVD_WORKS
         fmov.d @r0+,dr4
  #else
-#ifdef __LITTLE_ENDIAN__
-       fmov.s @r0+,fr5
-       fmov.s @r0,fr4
-#else
-       fmov.s @r0+,fr4
-       fmov.s @r0,fr5
-#endif
+       fmov.s @r0+,DR40
+       fmov.s @r0,DR41
  #endif
         float fpul,dr0
         xor r1,r5
@@ -1127,14 +1378,33 @@ trivial:
  L1:
         .double 2147483648
  
-#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__) || (defined (__SH5__) && ! defined __SH4_NOFPU__)
+       ENDFUNC(GLOBAL(udivsi3_i4))
+#elif defined (__SH5__) && ! defined (__SH4_NOFPU__)
+#if ! __SH5__ || __SH5__ == 32
+!! args in r4 and r5, result in fpul, clobber r20, r21, dr0, fr33
+       .mode   SHmedia
+       .global GLOBAL(udivsi3_i4)
+       HIDDEN_FUNC(GLOBAL(udivsi3_i4))
+GLOBAL(udivsi3_i4):
+       addz.l  r4,r63,r20
+       addz.l  r5,r63,r21
+       fmov.qd r20,dr0
+       fmov.qd r21,dr32
+       ptabs   r18,tr0
+       float.qd dr0,dr0
+       float.qd dr32,dr32
+       fdiv.d  dr0,dr32,dr0
+       ftrc.dq dr0,dr32
+       fmov.s fr33,fr32
+       blink tr0,r63
+
+       ENDFUNC(GLOBAL(udivsi3_i4))
+#endif /* ! __SH5__ || __SH5__ == 32 */
+#elif defined(__SH4_SINGLE__) || defined(__SH4_SINGLE_ONLY__)
  !! args in r4 and r5, result in fpul, clobber r0, r1, r4, r5, dr0, dr2, dr4
  
-#if ! __SH5__ || __SH5__ == 32
-#if __SH5__
-       .mode   SHcompact
-#endif
         .global GLOBAL(udivsi3_i4)
+       HIDDEN_FUNC(GLOBAL(udivsi3_i4))
  GLOBAL(udivsi3_i4):
         mov #1,r1
         cmp/hi r1,r5
@@ -1148,13 +1418,8 @@ GLOBAL(udivsi3_i4):
  #ifdef FMOVD_WORKS
         fmov.d @r0+,dr4
  #else
-#ifdef __LITTLE_ENDIAN__
-       fmov.s @r0+,fr5
-       fmov.s @r0,fr4
-#else
-       fmov.s @r0+,fr4
-       fmov.s @r0,fr5
-#endif
+       fmov.s @r0+,DR40
+       fmov.s @r0,DR41
  #endif
         float fpul,dr0
         xor r1,r5
@@ -1183,22 +1448,18 @@ L1:
  #endif
         .double 2147483648
  
-#endif /* ! __SH5__ || __SH5__ == 32 */
+       ENDFUNC(GLOBAL(udivsi3_i4))
  #endif /* ! __SH4__ */
  #endif
  
  #ifdef L_udivsi3
  /* __SH4_SINGLE_ONLY__ keeps this part for link compatibility with
-   sh3e code.  */
+   sh2e/sh3e code.  */
  #if (! defined(__SH4__) && ! defined (__SH4_SINGLE__)) || defined (__linux__)
-!!
-!! Steve Chamberlain
-!! sac@cygnus.com
-!!
-!!
  
  !! args in r4 and r5, result in r0, clobbers r4, pr, and t bit
         .global GLOBAL(udivsi3)
+       HIDDEN_FUNC(GLOBAL(udivsi3))
  
  #if __SHMEDIA__
  #if __SH5__ == 32
@@ -1207,6 +1468,7 @@ L1:
         .text
  #endif
         .align  2
+#if 0
  /* The assembly code that follows is a hand-optimized version of the C
     code that follows.  Note that the registers that are modified are
     exactly those listed as clobbered in the patterns udivsi3_i1 and
@@ -1252,74 +1514,487 @@ LOCAL(udivsi3_dontadd):
         blink   tr0, r63
  #else
  GLOBAL(udivsi3):
-longway:
-       mov     #0,r0
-       div0u
-       ! get one bit from the msb of the numerator into the T
-       ! bit and divide it by whats in r5.  Put the answer bit
-       ! into the T bit so it can come out again at the bottom
-
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-shortway:
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-
-vshortway:
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4 ; div1 r5,r0
-       rotcl   r4
-ret:   rts
-       mov     r4,r0
+ // inputs: r4 (dividend), r5 (divisor)
+ // clobbered: r18,r19,r20,r21,r22,r25,tr0
+ // result in r0.
+ addz.l r5,r63,r22 /* r22 = divisor, zero-extended from 32 bits */
+ nsb r22,r0 /* r0 = shift count that normalizes the divisor */
+ shlld r22,r0,r25 /* r25 = normalized divisor ... */
+ shlri r25,48,r25 /* ... reduced to its top 16 bits */
+ movi 0xffffffffffffbb0c,r20 // shift count equiv 76
+ sub r20,r25,r21 /* NOTE(review): presumably the initial reciprocal guess */
+ mmulfx.w r21,r21,r19
+ mshflo.w r21,r63,r21
+ ptabs r18,tr0 /* capture the return address; r18 is reused as scratch below */
+ mmulfx.w r25,r19,r19
+ sub r20,r0,r0
+ /* bubble */
+ msub.w r21,r19,r19
+ addi r19,-2,r21 /* It would be nice for scheduling to do this add to r21
+                   before the msub.w, but we need a different value for
+                   r19 to keep errors under control.  */
+ mulu.l r4,r21,r18 /* step 1: r18 = dividend * reciprocal estimate ... */
+ mmulfx.w r19,r19,r19
+ shlli r21,15,r21
+ shlrd r18,r0,r18 /* ... scaled down: first partial quotient */
+ mulu.l r18,r22,r20
+ mmacnfx.wl r25,r19,r21
+ /* bubble */
+ sub r4,r20,r25 /* r25 = r4 - partial quotient * divisor (the rest) */
+
+ mulu.l r25,r21,r19 /* step 2: same again on the rest */
+ addi r0,14,r0
+ /* bubble */
+ shlrd r19,r0,r19
+ mulu.l r19,r22,r20
+ add r18,r19,r18 /* accumulate the quotient in r18 */
+ /* bubble */
+ sub.l r25,r20,r25
+
+ mulu.l r25,r21,r19 /* step 3 */
+ addz.l r25,r63,r25
+ sub r25,r22,r25
+ shlrd r19,r0,r19
+ mulu.l r19,r22,r20
+ addi r25,1,r25 /* r25 = rest - divisor + 1 */
+ add r18,r19,r18
+
+ cmpgt r25,r20,r25 /* r25 = 1 if the divisor still fits once more, else 0 */
+ add.l r18,r25,r0 /* r0 = accumulated quotient + final correction */
+ blink tr0,r63 /* return */
+#endif
+#elif defined (__SHMEDIA__)
+/* m5compact-nofpu - more emphasis on code size than on speed, but don't
+   ignore speed altogether - div1 needs 9 cycles, subc 7 and rotcl 4.
+   So use a short shmedia loop.
+
+   This is a shift-and-subtract division producing one quotient bit per
+   iteration.  r20 starts as the zero-extended dividend (r4) and r21 holds
+   (divisor << 32) - 1.  Each pass shifts r20 left by one; if r20 is then
+   above r21, subtracting r21 takes the divisor out of the upper half and
+   adds 1 to the lower half, which records a quotient bit.  After 32 passes
+   the lower 32 bits of r20 are the quotient, returned in r0.  */
+ // clobbered: r20,r21,r25,tr0,tr1,tr2
+       .mode   SHmedia
+       .section        .text..SHmedia32,"ax"
+       .align  2
+GLOBAL(udivsi3):
+ pt/l LOCAL(udivsi3_dontsub), tr0
+ pt/l LOCAL(udivsi3_loop), tr1
+ ptabs/l r18,tr2 /* return target */
+ shlli r5,32,r25 /* r25 = divisor << 32; its (zero) low half doubles as the loop counter */
+ addi r25,-1,r21 /* r21 = (divisor << 32) - 1 */
+ addz.l r4,r63,r20 /* r20 = dividend, zero-extended */
+LOCAL(udivsi3_loop):
+ shlli r20,1,r20
+ bgeu/u r21,r20,tr0 /* skip the subtraction while r21 >= r20 (unsigned) */
+ sub r20,r21,r20 /* subtract the divisor and set the new quotient bit */
+LOCAL(udivsi3_dontsub):
+ addi.l r25,-1,r25 /* 32-bit counter: -1, -2, ... */
+ bnei r25,-32,tr1 /* loop until the counter reaches -32, i.e. 32 passes */
+ add.l r20,r63,r0 /* r0 = low 32 bits of r20 (the quotient) */
+ blink tr2,r63
+#else /* ! defined (__SHMEDIA__) */
+LOCAL(div8): /* Helper for udivsi3: eight div1 steps of r4 by r5 (one here, then div7). */
+ div1 r5,r4
+LOCAL(div7): /* Seven div1 steps; the last one executes in the rts delay slot. */
+ div1 r5,r4; div1 r5,r4; div1 r5,r4
+ div1 r5,r4; div1 r5,r4; div1 r5,r4; rts; div1 r5,r4
+
+LOCAL(divx4): /* Helper for the large-divisor path: four div1 steps with a
+                rotcl r0 between them.  The rotcl belonging to the fourth
+                step is supplied by the caller (in the delay slot of its
+                next bsr / rts).  NOTE(review): presumably each rotcl feeds
+                the next dividend bit into T and collects the quotient bit
+                in r0 - confirm.  */
+ div1 r5,r4; rotcl r0
+ div1 r5,r4; rotcl r0
+ div1 r5,r4; rotcl r0
+ rts; div1 r5,r4
  
+GLOBAL(udivsi3): /* SHcompact unsigned divide: r4 / r5, quotient in r0.
+                   Two paths: a divisor that fits in 16 bits is handled with
+                   two 16-step div1 runs (div8 / div7 helpers); any other
+                   divisor goes through large_divisor (divx4 helper).
+                   pr is saved on the stack because the helpers are reached
+                   with bsr.  */
+ sts.l pr,@-r15 /* save the return address */
+ extu.w r5,r0
+ cmp/eq r5,r0 /* T = 1 iff the divisor equals its own low 16 bits */
+#ifdef __sh1__
+ bf LOCAL(large_divisor)
+#else
+ bf/s LOCAL(large_divisor)
+#endif
+ div0u /* delay slot of bf/s; with the plain bf used for __sh1__ this only
+         runs on fall-through, so large_divisor repeats it for __sh1__ */
+ swap.w r4,r0
+ shlr16 r4 /* r4 = upper 16 bits of the dividend */
+ bsr LOCAL(div8)
+ shll16 r5 /* delay slot: move the divisor into the upper half (undone before returning) */
+ bsr LOCAL(div7)
+ div1 r5,r4 /* delay slot: together with div7 this is another 8 steps */
+ xtrct r4,r0
+ xtrct r0,r4
+ bsr LOCAL(div8) /* second 16-step run (NOTE(review): presumably for the lower half of the dividend) */
+ swap.w r4,r4
+ bsr LOCAL(div7)
+ div1 r5,r4
+ lds.l @r15+,pr /* restore the return address */
+ xtrct r4,r0
+ swap.w r0,r0
+ rotcl r0 /* shift in the last quotient bit from T */
+ rts
+ shlr16 r5 /* delay slot: restore r5 to its original value */
+
+LOCAL(large_divisor):
+#ifdef __sh1__
+ div0u
+#endif
+ mov #0,r0
+ xtrct r4,r0
+ xtrct r0,r4
+ bsr LOCAL(divx4) /* 4 x 4 = 16 div1 steps in total */
+ rotcl r0
+ bsr LOCAL(divx4)
+ rotcl r0
+ bsr LOCAL(divx4)
+ rotcl r0
+ bsr LOCAL(divx4)
+ rotcl r0
+ lds.l @r15+,pr /* restore the return address */
+ rts
+ rotcl r0 /* delay slot: the rotcl for the final div1 step */
+
+       ENDFUNC(GLOBAL(udivsi3))
  #endif /* ! __SHMEDIA__ */
  #endif /* __SH4__ */
-#endif
+#endif /* L_udivsi3 */
+
+#ifdef L_udivdi3
+#ifdef __SHMEDIA__
+       .mode   SHmedia
+       .section        .text..SHmedia32,"ax"
+       .align  2
+       .global GLOBAL(udivdi3)
+       FUNC(GLOBAL(udivdi3))
+GLOBAL(udivdi3):       /* Unsigned 64-bit division for SHmedia.
+                          In:  r2 = dividend, r3 = divisor.
+                          Out: r2 = quotient; returns through r18.
+                          Writes r0, r1, r4-r9, r19-r22, r25 and tr0.
+                          A fixed-point reciprocal of the normalized divisor
+                          is refined with the multimedia multiply
+                          instructions, then the quotient is built in
+                          multiply / subtract steps.  The two code paths
+                          are selected by the normalization shift count.  */
+       HIDDEN_ALIAS(udivdi3_internal,udivdi3)
+       shlri r3,1,r4
+       nsb r4,r22 /* r22 = normalization shift count for the divisor */
+       shlld r3,r22,r6 /* r6 = normalized divisor */
+       shlri r6,49,r5 /* r5 = its top 15 bits */
+       movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
+       sub r21,r5,r1
+       mmulfx.w r1,r1,r4
+       mshflo.w r1,r63,r1
+       sub r63,r22,r20 // r63 == 64 % 64
+       mmulfx.w r5,r4,r4
+       pta LOCAL(large_divisor),tr0
+       addi r20,32,r9 /* r9 = 32 - r22 */
+       msub.w r1,r4,r1
+       madd.w r1,r1,r1
+       mmulfx.w r1,r1,r4
+       shlri r6,32,r7 /* r7 = upper 32 bits of the normalized divisor */
+       bgt/u r9,r63,tr0 // large_divisor: taken when 32 - r22 > 0
+       mmulfx.w r5,r4,r4
+       shlri r2,32+14,r19
+       addi r22,-31,r0
+       msub.w r1,r4,r1
+
+       mulu.l r1,r7,r4
+       addi r1,-3,r5
+       mulu.l r5,r19,r5
+       sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+       shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+                        the case may be, %0000000000000000 000.11111111111, still */
+       muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+       mulu.l r5,r3,r8
+       mshalds.l r1,r21,r1
+       shari r4,26,r4
+       shlld r8,r0,r8
+       add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+       sub r2,r8,r2
+       /* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */
+
+       shlri r2,22,r21
+       mulu.l r21,r1,r21
+       shlld r5,r0,r8
+       addi r20,30-22,r0
+       shlrd r21,r0,r21
+       mulu.l r21,r3,r5
+       add r8,r21,r8 /* r8 accumulates the quotient */
+       mcmpgt.l r21,r63,r21 // See Note 1
+       addi r20,30,r0
+       mshfhi.l r63,r21,r21
+       sub r2,r5,r2
+       andc r2,r21,r2
+
+       /* small divisor: need a third divide step */
+       mulu.l r2,r1,r7
+       ptabs r18,tr0 /* set up the return branch */
+       addi r2,1,r2
+       shlrd r7,r0,r7
+       mulu.l r7,r3,r5
+       add r8,r7,r8
+       sub r2,r3,r2 /* r2 = rest - divisor + 1 */
+       cmpgt r2,r5,r5 /* r5 = 1 if the divisor still fits once more, else 0 */
+       add r8,r5,r2 /* r2 = quotient */
+       /* could test r3 here to check for divide by zero.  */
+       blink tr0,r63
+
+LOCAL(large_divisor):
+       mmulfx.w r5,r4,r4
+       shlrd r2,r9,r25
+       shlri r25,32,r8
+       msub.w r1,r4,r1
+
+       mulu.l r1,r7,r4
+       addi r1,-3,r5
+       mulu.l r5,r8,r5
+       sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+       shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+                        the case may be, %0000000000000000 000.11111111111, still */
+       muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+       shlri r5,14-1,r8
+       mulu.l r8,r7,r5
+       mshalds.l r1,r21,r1
+       shari r4,26,r4
+       add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+       sub r25,r5,r25
+       /* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */
+
+       shlri r25,22,r21
+       mulu.l r21,r1,r21
+       pta LOCAL(no_lo_adj),tr0
+       addi r22,32,r0
+       shlri r21,40,r21
+       mulu.l r21,r7,r5
+       add r8,r21,r8 /* r8 accumulates the quotient */
+       shlld r2,r0,r2
+       sub r25,r5,r25
+       bgtu/u r7,r25,tr0 // no_lo_adj: taken when r7 > r25 (unsigned)
+       addi r8,1,r8
+       sub r25,r7,r25
+LOCAL(no_lo_adj):
+       mextr4 r2,r25,r2
+
+       /* large_divisor: only needs a few adjustments.  */
+       mulu.l r8,r6,r5
+       ptabs r18,tr0 /* set up the return branch */
+       /* bubble */
+       cmpgtu r5,r2,r5 /* r5 = 1 if r5 > r2 (unsigned), else 0 */
+       sub r8,r5,r2 /* r2 = quotient estimate minus that correction */
+       blink tr0,r63
+       ENDFUNC(GLOBAL(udivdi3))
+/* Note 1: To shift the result of the second divide stage so that the result
+   always fits into 32 bits, yet we still reduce the rest sufficiently
+   would require a lot of instructions to do the shifts just right.  Using
+   the full 64 bit shift result to multiply with the divisor would require
+   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
+   Fortunately, if the upper 32 bits of the shift result are nonzero, we
+   know that the rest after taking this partial result into account will
+   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
+   upper 32 bits of the partial result are nonzero.  */
+#endif /* __SHMEDIA__ */
+#endif /* L_udivdi3 */
+
+#ifdef L_divdi3
+#ifdef __SHMEDIA__
+       .mode   SHmedia
+       .section        .text..SHmedia32,"ax"
+       .align  2
+       .global GLOBAL(divdi3)
+       FUNC(GLOBAL(divdi3))
+GLOBAL(divdi3):        /* Signed 64-bit division for SHmedia: r2 / r3, result
+                          in r2.  Takes the absolute value of both operands,
+                          divides them with udivdi3_internal, and negates
+                          the quotient when the operand signs differ.
+                          Writes r22, r23, tr0, tr1 in addition to whatever
+                          udivdi3_internal writes.  */
+       pta GLOBAL(udivdi3_internal),tr0
+       shari r2,63,r22 /* r22 = 0 or -1: sign mask of the dividend */
+       shari r3,63,r23 /* r23 = 0 or -1: sign mask of the divisor */
+       xor r2,r22,r2
+       xor r3,r23,r3
+       sub r2,r22,r2 /* r2 = |dividend| (xor with mask, then subtract mask) */
+       sub r3,r23,r3 /* r3 = |divisor| */
+       beq/u r22,r23,tr0 /* same sign: tail-jump, r18 still holds our caller's address */
+       ptabs r18,tr1 /* signs differ: keep our own return target in tr1 */
+       blink tr0,r18 /* call the unsigned divide, linking through r18 */
+       sub r63,r2,r2 /* negate the quotient */
+       blink tr1,r63 /* return to our caller */
+       ENDFUNC(GLOBAL(divdi3))
+#endif /* __SHMEDIA__ */
+#endif /* L_divdi3 */
+
+#ifdef L_umoddi3
+#ifdef __SHMEDIA__
+       .mode   SHmedia
+       .section        .text..SHmedia32,"ax"
+       .align  2
+       .global GLOBAL(umoddi3)
+       FUNC(GLOBAL(umoddi3))
+GLOBAL(umoddi3):       /* Unsigned 64-bit remainder for SHmedia.
+                          In:  r2 = dividend, r3 = divisor.
+                          Out: r2 = remainder; returns through r18.
+                          Writes r0, r1, r4-r9, r19-r22, r25 and tr0.
+                          Follows the same reciprocal-based scheme as
+                          udivdi3, but keeps track of the rest instead of
+                          accumulating the quotient.  */
+       HIDDEN_ALIAS(umoddi3_internal,umoddi3)
+       shlri r3,1,r4
+       nsb r4,r22 /* r22 = normalization shift count for the divisor */
+       shlld r3,r22,r6 /* r6 = normalized divisor */
+       shlri r6,49,r5 /* r5 = its top 15 bits */
+       movi 0xffffffffffffbaf1,r21 /* .l shift count 17.  */
+       sub r21,r5,r1
+       mmulfx.w r1,r1,r4
+       mshflo.w r1,r63,r1
+       sub r63,r22,r20 // r63 == 64 % 64
+       mmulfx.w r5,r4,r4
+       pta LOCAL(large_divisor),tr0
+       addi r20,32,r9 /* r9 = 32 - r22 */
+       msub.w r1,r4,r1
+       madd.w r1,r1,r1
+       mmulfx.w r1,r1,r4
+       shlri r6,32,r7 /* r7 = upper 32 bits of the normalized divisor */
+       bgt/u r9,r63,tr0 // large_divisor: taken when 32 - r22 > 0
+       mmulfx.w r5,r4,r4
+       shlri r2,32+14,r19
+       addi r22,-31,r0
+       msub.w r1,r4,r1
+
+       mulu.l r1,r7,r4
+       addi r1,-3,r5
+       mulu.l r5,r19,r5
+       sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+       shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+                        the case may be, %0000000000000000 000.11111111111, still */
+       muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+       mulu.l r5,r3,r5
+       mshalds.l r1,r21,r1
+       shari r4,26,r4
+       shlld r5,r0,r5
+       add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+       sub r2,r5,r2
+       /* Can do second step of 64 : 32 div now, using r1 and the rest in r2.  */
+
+       shlri r2,22,r21
+       mulu.l r21,r1,r21
+       addi r20,30-22,r0
+       /* bubble */ /* could test r3 here to check for divide by zero.  */
+       shlrd r21,r0,r21
+       mulu.l r21,r3,r5
+       mcmpgt.l r21,r63,r21 // See Note 1
+       addi r20,30,r0
+       mshfhi.l r63,r21,r21
+       sub r2,r5,r2
+       andc r2,r21,r2
+
+       /* small divisor: need a third divide step */
+       mulu.l r2,r1,r7
+       ptabs r18,tr0 /* set up the return branch */
+       sub r2,r3,r8 /* re-use r8 here for rest - r3 */
+       shlrd r7,r0,r7
+       mulu.l r7,r3,r5
+       /* bubble */
+       addi r8,1,r7
+       cmpgt r7,r5,r7 /* r7 = 1 if rest - r3 + 1 > r5, else 0 */
+       cmvne r7,r8,r2 /* in that case continue with rest - r3 */
+       sub r2,r5,r2 /* r2 = remainder */
+       blink tr0,r63
+
+LOCAL(large_divisor):
+       mmulfx.w r5,r4,r4
+       shlrd r2,r9,r25
+       shlri r25,32,r8
+       msub.w r1,r4,r1
+
+       mulu.l r1,r7,r4
+       addi r1,-3,r5
+       mulu.l r5,r8,r5
+       sub r63,r4,r4 // Negate to make sure r1 ends up <= 1/r2
+       shlri r4,2,r4 /* chop off leading %0000000000000000 001.00000000000 - or, as
+                        the case may be, %0000000000000000 000.11111111111, still */
+       muls.l r1,r4,r4 /* leaving at least one sign bit.  */
+       shlri r5,14-1,r8
+       mulu.l r8,r7,r5
+       mshalds.l r1,r21,r1
+       shari r4,26,r4
+       add r1,r4,r1 // 31 bit unsigned reciprocal now in r1 (msb equiv. 0.5)
+       sub r25,r5,r25
+       /* Can do second step of 64 : 32 div now, using r1 and the rest in r25.  */
+
+       shlri r25,22,r21
+       mulu.l r21,r1,r21
+       pta LOCAL(no_lo_adj),tr0
+       addi r22,32,r0
+       shlri r21,40,r21
+       mulu.l r21,r7,r5
+       add r8,r21,r8
+       shlld r2,r0,r2
+       sub r25,r5,r25
+       bgtu/u r7,r25,tr0 // no_lo_adj: taken when r7 > r25 (unsigned)
+       addi r8,1,r8
+       sub r25,r7,r25
+LOCAL(no_lo_adj):
+       mextr4 r2,r25,r2
+
+       /* large_divisor: only needs a few adjustments.  */
+       mulu.l r8,r6,r5
+       ptabs r18,tr0 /* set up the return branch */
+       add r2,r6,r7 /* r7 = rest + normalized divisor */
+       cmpgtu r5,r2,r8 /* r8 = 1 if r5 > r2 (unsigned), else 0 */
+       cmvne r8,r7,r2 /* in that case continue with rest + divisor */
+       sub r2,r5,r2
+       shlrd r2,r22,r2 /* shift right by r22, the count used to normalize the divisor */
+       blink tr0,r63
+       ENDFUNC(GLOBAL(umoddi3))
+/* Note 1: To shift the result of the second divide stage so that the result
+   always fits into 32 bits, yet we still reduce the rest sufficiently
+   would require a lot of instructions to do the shifts just right.  Using
+   the full 64 bit shift result to multiply with the divisor would require
+   four extra instructions for the upper 32 bits (shift / mulu / shift / sub).
+   Fortunately, if the upper 32 bits of the shift result are nonzero, we
+   know that the rest after taking this partial result into account will
+   fit into 32 bits.  So we just clear the upper 32 bits of the rest if the
+   upper 32 bits of the partial result are nonzero.  */
+#endif /* __SHMEDIA__ */
+#endif /* L_umoddi3 */
+
+#ifdef L_moddi3
+#ifdef __SHMEDIA__
+       .mode   SHmedia
+       .section        .text..SHmedia32,"ax"
+       .align  2
+       .global GLOBAL(moddi3)
+       FUNC(GLOBAL(moddi3))
+GLOBAL(moddi3):        /* Signed 64-bit remainder for SHmedia: r2 % r3, result
+                          in r2.  Takes the absolute value of both operands,
+                          calls umoddi3_internal, and negates the remainder
+                          when the dividend was negative (the divisor's sign
+                          does not affect the result).
+                          Writes r22, r23, tr0, tr1 in addition to whatever
+                          umoddi3_internal writes.  */
+       pta GLOBAL(umoddi3_internal),tr0
+       shari r2,63,r22 /* r22 = 0 or -1: sign mask of the dividend */
+       shari r3,63,r23 /* r23 = 0 or -1: sign mask of the divisor */
+       xor r2,r22,r2
+       xor r3,r23,r3
+       sub r2,r22,r2 /* r2 = |dividend| */
+       sub r3,r23,r3 /* r3 = |divisor| */
+       beq/u r22,r63,tr0 /* dividend non-negative: tail-jump, no fix-up needed */
+       ptabs r18,tr1 /* otherwise keep our own return target in tr1 */
+       blink tr0,r18 /* call the unsigned remainder, linking through r18 */
+       sub r63,r2,r2 /* negate the remainder */
+       blink tr1,r63 /* return to our caller */
+       ENDFUNC(GLOBAL(moddi3))
+#endif /* __SHMEDIA__ */
+#endif /* L_moddi3 */
+
  #ifdef L_set_fpscr
-#if defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32
+#if !defined (__SH2A_NOFPU__)
+#if defined (__SH2E__) || defined (__SH2A__) || defined (__SH3E__) || defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || __SH5__ == 32
  #ifdef __SH5__
         .mode   SHcompact
  #endif
         .global GLOBAL(set_fpscr)
+       HIDDEN_FUNC(GLOBAL(set_fpscr))
  GLOBAL(set_fpscr):
         lds r4,fpscr
+#ifdef __PIC__
+       mov.l   r12,@-r15
+#ifdef __vxworks
+       mov.l   LOCAL(set_fpscr_L0_base),r12
+       mov.l   LOCAL(set_fpscr_L0_index),r0
+       mov.l   @r12,r12
+       mov.l   @(r0,r12),r12
+#else
+       mova    LOCAL(set_fpscr_L0),r0
+       mov.l   LOCAL(set_fpscr_L0),r12
+       add     r0,r12
+#endif
+       mov.l   LOCAL(set_fpscr_L1),r0
+       mov.l   @(r0,r12),r1
+       mov.l   @r15+,r12
+#else
         mov.l LOCAL(set_fpscr_L1),r1
+#endif
         swap.w r4,r0
         or #24,r0
  #ifndef FMOVD_WORKS
         xor #16,r0
  #endif
-#if defined(__SH4__)
+#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
         swap.w r0,r3
         mov.l r3,@(4,r1)
-#else /* defined(__SH3E__) || defined(__SH4_SINGLE*__) */
+#else /* defined (__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
         swap.w r0,r2
         mov.l r2,@r1
  #endif
@@ -1328,62 +2003,209 @@ GLOBAL(set_fpscr):
  #else
         xor #24,r0
  #endif
-#if defined(__SH4__)
+#if defined(__SH4__) || defined (__SH2A_DOUBLE__)
         swap.w r0,r2
         rts
         mov.l r2,@r1
-#else /* defined(__SH3E__) || defined(__SH4_SINGLE*__) */
+#else /* defined(__SH2E__) || defined(__SH3E__) || defined(__SH4_SINGLE*__) */
         swap.w r0,r3
         rts
         mov.l r3,@(4,r1)
  #endif
         .align 2
+#ifdef __PIC__
+#ifdef __vxworks
+LOCAL(set_fpscr_L0_base):
+       .long ___GOTT_BASE__
+LOCAL(set_fpscr_L0_index):
+       .long ___GOTT_INDEX__
+#else
+LOCAL(set_fpscr_L0):
+       .long _GLOBAL_OFFSET_TABLE_
+#endif
+LOCAL(set_fpscr_L1):
+       .long GLOBAL(fpscr_values@GOT)
+#else
  LOCAL(set_fpscr_L1):
         .long GLOBAL(fpscr_values)
+#endif
+
+       ENDFUNC(GLOBAL(set_fpscr))
+#ifndef NO_FPSCR_VALUES
  #ifdef __ELF__
          .comm   GLOBAL(fpscr_values),8,4
  #else
          .comm   GLOBAL(fpscr_values),8
  #endif /* ELF */
-#endif /* SH3E / SH4 */
+#endif /* NO_FPSCR_VALUES */
+#endif /* SH2E / SH3E / SH4 */
+#endif /* __SH2A_NOFPU__ */
  #endif /* L_set_fpscr */
  #ifdef L_ic_invalidate
  #if __SH5__ == 32
         .mode   SHmedia
         .section        .text..SHmedia32,"ax"
         .align  2
+       .global GLOBAL(init_trampoline)
+       HIDDEN_FUNC(GLOBAL(init_trampoline))
+GLOBAL(init_trampoline):       /* Fill in the 16-byte block at r0: a fixed
+                                  64-bit pattern at offset 0, r2 at offset 8
+                                  and r3 at offset 12.  There is no return
+                                  here: execution falls through into
+                                  ic_invalidate below, which flushes the
+                                  line at r0 and returns through r18.
+                                  Writes r20.  */
+       st.l    r0,8,r2
+#ifdef __LITTLE_ENDIAN__
+       movi    9,r20           /* build the same four 16-bit words for either
+                                  endianness; NOTE(review): presumably these
+                                  are SHcompact opcodes of the trampoline
+                                  stub - confirm */
+       shori   0x402b,r20
+       shori   0xd101,r20
+       shori   0xd002,r20
+#else
+       movi    0xffffffffffffd002,r20
+       shori   0xd101,r20
+       shori   0x402b,r20
+       shori   9,r20
+#endif
+       st.q    r0,0,r20
+       st.l    r0,12,r3
+       ENDFUNC(GLOBAL(init_trampoline))
         .global GLOBAL(ic_invalidate)
+       HIDDEN_FUNC(GLOBAL(ic_invalidate))
  GLOBAL(ic_invalidate):
+       ocbwb   r0,0            /* write the data cache line at r0 back to memory */
+       synco
         icbi    r0, 0
         ptabs   r18, tr0
         synci
         blink   tr0, r63
+       ENDFUNC(GLOBAL(ic_invalidate))
+#elif defined(__SH4A__)
         .global GLOBAL(ic_invalidate)
+       HIDDEN_FUNC(GLOBAL(ic_invalidate))
  GLOBAL(ic_invalidate):
         ocbwb   @r4
-       mova    0f,r0
-       mov.w   1f,r1
-/* Compute how many cache lines 0f is away from r4.  */
-       sub     r0,r4
-       and     r1,r4
-/* Prepare to branch to 0f plus the cache-line offset.  */
-       add     # 0f - 1f,r4
-       braf    r4
-       nop
-1:
-       .short  0x1fe0
+       synco                   /* SH4A variant: r4 = address to invalidate; the
+                                  write-back above is followed by synco before
+                                  the instruction cache line is dropped */
+       rts
+       icbi    @r4             /* delay slot of rts */
+       ENDFUNC(GLOBAL(ic_invalidate))
+#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))
+       /* For system code, we use ic_invalidate_line_i, but user code
+          needs a different mechanism.  A kernel call is generally not
+          available, and it would also be slow.  Different SH4 variants use
+          different sizes and associativities of the Icache.  We use a small
+          bit of dispatch code that can be put hidden in every shared object,
+          which calls the actual processor-specific invalidation code in a
+          separate module.
+          Or if you have operating system support, the OS could mmap the
+          processor-specific code from a single page, since it is highly
+          repetitive.  */
+       .global GLOBAL(ic_invalidate)
+       HIDDEN_FUNC(GLOBAL(ic_invalidate))
+GLOBAL(ic_invalidate): /* SH4 dispatcher.  In: r4 = address to invalidate.
+                          Loads the address of ic_invalidate_array into r1
+                          (through the GOT for PIC), writes back the data
+                          cache line at r4, and then jumps into the array at
+                          the offset selected by masking (r4 - array) with
+                          the word stored at array + 8.  The array code is
+                          entered with r0 = word at array + 4, r1 = array
+                          address and r4 = r4 - array, and is responsible
+                          for returning to our caller (pr is untouched).
+                          Writes r0, r1, r4 and, for PIC, r2.  */
+#ifdef __pic__
+#ifdef __vxworks
+       mov.l   1f,r1
+       mov.l   2f,r0
+       mov.l   @r1,r1
+       mov.l   0f,r2
+       mov.l   @(r0,r1),r0     /* r0 = GOT address taken from the GOTT */
+#else
+       mov.l   1f,r1           /* r1 = offset of the GOT relative to label 1 */
+       mova    1f,r0           /* r0 = address of label 1 */
+       mov.l   0f,r2           /* r2 = GOT offset of ic_invalidate_array */
+       add     r1,r0           /* r0 = GOT address */
+#endif
+       mov.l   @(r0,r2),r1     /* r1 = address of ic_invalidate_array */
+#else
+       mov.l   0f,r1           /* r1 = address of ic_invalidate_array */
+#endif
+       ocbwb   @r4             /* write back the data cache line first */
+       mov.l   @(8,r1),r0      /* r0 = offset mask kept in the array itself */
+       sub     r1,r4           /* r4 = distance of the target from the array */
+       and     r4,r0           /* r0 = masked offset into the array */
+       add     r1,r0           /* r0 = entry address inside the array */
+       jmp     @r0
+       mov.l   @(4,r1),r0      /* delay slot: r0 = word at array + 4 for the array code */
+       .align  2
+#ifndef __pic__
+0:     .long   GLOBAL(ic_invalidate_array)
+#else /* __pic__ */
+       .global GLOBAL(ic_invalidate_array)
+0:     .long   GLOBAL(ic_invalidate_array)@GOT
+#ifdef __vxworks
+1:     .long   ___GOTT_BASE__
+2:     .long   ___GOTT_INDEX__
+#else
+1:     .long   _GLOBAL_OFFSET_TABLE_
+#endif
+#endif /* __pic__ */
+       /* Close the function for both the PIC and the non-PIC variant; this
+          used to sit inside the __pic__ branch, so non-PIC builds never
+          emitted the matching ENDFUNC for the HIDDEN_FUNC above.  */
+       ENDFUNC(GLOBAL(ic_invalidate))
+#endif /* SH4 */
+#endif /* L_ic_invalidate */
+
+#ifdef L_ic_invalidate_array
+#if defined(__SH4A__) || (defined (__FORCE_SH4A__) && (defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))))
+       .global GLOBAL(ic_invalidate_array)
+       /* This is needed when an SH4 dso with trampolines is used on SH4A.  */
+       .global GLOBAL(ic_invalidate_array)
+       FUNC(GLOBAL(ic_invalidate_array))
+GLOBAL(ic_invalidate_array):   /* SH4A stand-in for the SH4 jump table; it
+                                  is reached from the SH4 ic_invalidate
+                                  dispatcher, which jumps into this array.  */
+       add     r1,r4           /* NOTE(review): presumably undoes the
+                                  dispatcher's "sub r1,r4", assuming r1 still
+                                  holds this array's address - confirm */
+       synco
+       rts
+       icbi    @r4             /* delay slot of rts */
+       .long   0               /* lies at byte offset 8 (after four 2-byte
+                                  instructions), the slot the dispatcher
+                                  reads as its offset mask; 0 makes every
+                                  masked offset zero */
+       ENDFUNC(GLOBAL(ic_invalidate_array))
+#elif defined(__SH4_SINGLE__) || defined(__SH4__) || defined(__SH4_SINGLE_ONLY__) || (defined(__SH4_NOFPU__) && !defined(__SH5__))
+       .global GLOBAL(ic_invalidate_array)
         .p2align 5
+       FUNC(GLOBAL(ic_invalidate_array))
  /* This must be aligned to the beginning of a cache line.  */
-0:
-       .rept   256 /* There are 256 cache lines of 32 bytes.  */
+GLOBAL(ic_invalidate_array):   /* Table of 32-byte entries that the SH4
+                                  ic_invalidate dispatcher jumps into.
+                                  In every variant the word at byte offset
+                                  8 of an entry is WAY_SIZE - 32, which the
+                                  dispatcher reads from the first entry as
+                                  its offset mask.  In the braf variants the
+                                  word at offset 4 is the value the
+                                  dispatcher loads into r0 before jumping
+                                  here.  WAYS and WAY_SIZE may be predefined
+                                  by the build; the defaults below are 4
+                                  and 0x4000.  */
+#ifndef WAYS
+#define WAYS 4
+#define WAY_SIZE 0x4000
+#endif
+#if WAYS == 1
+       .rept   WAY_SIZE * WAYS / 32    /* entry: rts, nop, 7 longs = 32 bytes */
+       rts
+       nop
+       .rept   7
+       .long   WAY_SIZE - 32
+       .endr
+       .endr
+#elif WAYS <= 6
+       .rept   WAY_SIZE * WAYS / 32    /* entry: 12 + 4*(WAYS-2) + 4*(7-WAYS) = 32 bytes */
+       braf    r0              /* hop on by the distance held in r0 */
+       add     #-8,r0          /* delay slot: adjust r0 for the following braf instructions */
+       .long   WAY_SIZE + 8    /* offset 4: initial r0 supplied by the dispatcher */
+       .long   WAY_SIZE - 32   /* offset 8: offset mask */
+       .rept   WAYS-2
+       braf    r0
+       nop
+       .endr
+       .rept   7 - WAYS
+       rts
+       nop
+       .endr
+       .endr
+#else /* WAYS > 6 */
+       /* This variant needs two different pages for mmap-ing.  */
+       .rept   WAYS-1
+       .rept   WAY_SIZE / 32   /* entry: braf, nop, 7 longs = 32 bytes */
+       braf    r0
+       nop
+       .long   WAY_SIZE        /* offset 4: initial r0 supplied by the dispatcher */
+       .rept 6
+       .long   WAY_SIZE - 32
+       .endr
+       .endr
+       .endr
+       .rept   WAY_SIZE / 32
         rts
         .rept   15
         nop
         .endr
         .endr
+#endif /* WAYS */
+       ENDFUNC(GLOBAL(ic_invalidate_array))
  #endif /* SH4 */
-#endif /* L_ic_invalidate */
+#endif /* L_ic_invalidate_array */
  
  #if defined (__SH5__) && __SH5__ == 32
  #ifdef L_shcompact_call_trampoline
@@ -1440,6 +2262,7 @@ LOCAL(ct_main_table):
         will be expanded into r2/r3 upon return.  */
         
         .global GLOBAL(GCC_shcompact_call_trampoline)
+       FUNC(GLOBAL(GCC_shcompact_call_trampoline))
  GLOBAL(GCC_shcompact_call_trampoline):
         ptabs/l r0, tr0 /* Prepare to call the actual function.  */
         movi    ((datalabel LOCAL(ct_main_table) - 31 * 2) >> 16) & 65535, r0
@@ -1790,6 +2613,8 @@ LOCAL(ct_ret_wide):       /* Call the function, so that we can unpack its
         shari   r2, 32, r2
  #endif
         blink   tr0, r63
+
+       ENDFUNC(GLOBAL(GCC_shcompact_call_trampoline))
  #endif /* L_shcompact_call_trampoline */
  
  #ifdef L_shcompact_return_trampoline
@@ -1802,6 +2627,7 @@ LOCAL(ct_ret_wide):       /* Call the function, so that we can unpack its
         .section        .text..SHmedia32, "ax"
         .align  2
         .global GLOBAL(GCC_shcompact_return_trampoline)
+       HIDDEN_FUNC(GLOBAL(GCC_shcompact_return_trampoline))
  GLOBAL(GCC_shcompact_return_trampoline):
         ptabs/l r18, tr0
  #if __LITTLE_ENDIAN__
@@ -1813,6 +2639,8 @@ GLOBAL(GCC_shcompact_return_trampoline):
  #endif
         or      r3, r2, r2
         blink   tr0, r63
+
+       ENDFUNC(GLOBAL(GCC_shcompact_return_trampoline))
  #endif /* L_shcompact_return_trampoline */
  
  #ifdef L_shcompact_incoming_args
@@ -1857,148 +2685,150 @@ LOCAL(ia_main_table):
         .align  2
         
       /* This function stores 64-bit general-purpose registers back in
-       the stack, starting at @(r1), where the cookie is supposed to
-       have been stored, and loads the address in which each register
-       was stored into itself.  Its execution time is linear on the
+       the stack, and loads the address in which each register
+       was stored into itself.  The lower 32 bits of r17 hold the address
+       to begin storing, and the upper 32 bits of r17 hold the cookie.
+       Its execution time is linear on the
         number of registers that actually have to be copied, and it is
         optimized for structures larger than 64 bits, as opposed to
-       invidivual `long long' arguments.  See sh.h for details on the
+       individual `long long' arguments.  See sh.h for details on the
         actual bit pattern.  */
         
         .global GLOBAL(GCC_shcompact_incoming_args)
+       FUNC(GLOBAL(GCC_shcompact_incoming_args))
  GLOBAL(GCC_shcompact_incoming_args):
         ptabs/l r18, tr0        /* Prepare to return.  */
         shlri   r17, 32, r0     /* Load the cookie.  */
-       movi    ((datalabel LOCAL(ia_main_table) - 31 * 2) >> 16) & 65535, r35
+       movi    ((datalabel LOCAL(ia_main_table) - 31 * 2) >> 16) & 65535, r43
         pt/l    LOCAL(ia_loop), tr1
         add.l   r17, r63, r17
-       shori   ((datalabel LOCAL(ia_main_table) - 31 * 2)) & 65535, r35
+       shori   ((datalabel LOCAL(ia_main_table) - 31 * 2)) & 65535, r43
  LOCAL(ia_loop):
-       nsb     r0, r28
-       shlli   r28, 1, r29
-       ldx.w   r35, r29, r30
+       nsb     r0, r36
+       shlli   r36, 1, r37
+       ldx.w   r43, r37, r38
  LOCAL(ia_main_label):
-       ptrel/l r30, tr2
+       ptrel/l r38, tr2
         blink   tr2, r63
  LOCAL(ia_r2_ld):       /* Store r2 and load its address.  */
-       movi    3, r30
-       shlli   r30, 29, r31
-       and     r0, r31, r32
-       andc    r0, r31, r0
+       movi    3, r38
+       shlli   r38, 29, r39
+       and     r0, r39, r40
+       andc    r0, r39, r0
         stx.q   r17, r63, r2
         add.l   r17, r63, r2
         addi.l  r17, 8, r17
-       beq/u   r31, r32, tr1
+       beq/u   r39, r40, tr1
  LOCAL(ia_r3_ld):       /* Store r3 and load its address.  */
-       movi    3, r30
-       shlli   r30, 26, r31
-       and     r0, r31, r32
-       andc    r0, r31, r0
+       movi    3, r38
+       shlli   r38, 26, r39
+       and     r0, r39, r40
+       andc    r0, r39, r0
         stx.q   r17, r63, r3
         add.l   r17, r63, r3
         addi.l  r17, 8, r17
-       beq/u   r31, r32, tr1
+       beq/u   r39, r40, tr1
  LOCAL(ia_r4_ld):       /* Store r4 and load its address.  */
-       movi    3, r30
-       shlli   r30, 23, r31
-       and     r0, r31, r32
-       andc    r0, r31, r0
+       movi    3, r38
+       shlli   r38, 23, r39
+       and     r0, r39, r40
+       andc    r0, r39, r0
         stx.q   r17, r63, r4
         add.l   r17, r63, r4
         addi.l  r17, 8, r17
-       beq/u   r31, r32, tr1
+       beq/u   r39, r40, tr1
  LOCAL(ia_r5_ld):       /* Store r5 and load its address.  */
-       movi    3, r30
-       shlli   r30, 20, r31
-       and     r0, r31, r32
-       andc    r0, r31, r0
+       movi    3, r38
+       shlli   r38, 20, r39
+       and     r0, r39, r40
+       andc    r0, r39, r0
         stx.q   r17, r63, r5
         add.l   r17, r63, r5
         addi.l  r17, 8, r17
-       beq/u   r31, r32, tr1
+       beq/u   r39, r40, tr1
  LOCAL(ia_r6_ld):       /* Store r6 and load its address.  */
-       movi    3, r30
-       shlli   r30, 16, r31
-       and     r0, r31, r32
-       andc    r0, r31, r0
+       movi    3, r38
+       shlli   r38, 16, r39
+       and     r0, r39, r40
+       andc    r0, r39, r0
         stx.q   r17, r63, r6
         add.l   r17, r63, r6
         addi.l  r17, 8, r17
-       beq/u   r31, r32, tr1
+       beq/u   r39, r40, tr1
  LOCAL(ia_r7_ld):       /* Store r7 and load its address.  */
-       movi    3 << 12, r31
-       and     r0, r31, r32
-       andc    r0, r31, r0
+       movi    3 << 12, r39
+       and     r0, r39, r40
+       andc    r0, r39, r0
         stx.q   r17, r63, r7
         add.l   r17, r63, r7
         addi.l  r17, 8, r17
-       beq/u   r31, r32, tr1
+       beq/u   r39, r40, tr1
  LOCAL(ia_r8_ld):       /* Store r8 and load its address.  */
-       movi    3 << 8, r31
-       and     r0, r31, r32
-       andc    r0, r31, r0
+       movi    3 << 8, r39
+       and     r0, r39, r40
+       andc    r0, r39, r0
         stx.q   r17, r63, r8
         add.l   r17, r63, r8
         addi.l  r17, 8, r17
-       beq/u   r31, r32, tr1
+       beq/u   r39, r40, tr1
  LOCAL(ia_r9_ld):       /* Store r9 and load its address.  */
         stx.q   r17, r63, r9
         add.l   r17, r63, r9
         blink   tr0, r63
  LOCAL(ia_r2_push):     /* Push r2 onto the stack.  */
-       movi    1, r30
-       shlli   r30, 29, r31
-       andc    r0, r31, r0
+       movi    1, r38
+       shlli   r38, 29, r39
+       andc    r0, r39, r0
         stx.q   r17, r63, r2
         addi.l  r17, 8, r17
         blink   tr1, r63
  LOCAL(ia_r3_push):     /* Push r3 onto the stack.  */
-       movi    1, r30
-       shlli   r30, 26, r31
-       andc    r0, r31, r0
+       movi    1, r38
+       shlli   r38, 26, r39
+       andc    r0, r39, r0
         stx.q   r17, r63, r3
         addi.l  r17, 8, r17
         blink   tr1, r63
  LOCAL(ia_r4_push):     /* Push r4 onto the stack.  */
-       movi    1, r30
-       shlli   r30, 23, r31
-       andc    r0, r31, r0
+       movi    1, r38
+       shlli   r38, 23, r39
+       andc    r0, r39, r0
         stx.q   r17, r63, r4
         addi.l  r17, 8, r17
         blink   tr1, r63
  LOCAL(ia_r5_push):     /* Push r5 onto the stack.  */
-       movi    1, r30
-       shlli   r30, 20, r31
-       andc    r0, r31, r0
+       movi    1, r38
+       shlli   r38, 20, r39
+       andc    r0, r39, r0
         stx.q   r17, r63, r5
         addi.l  r17, 8, r17
         blink   tr1, r63
  LOCAL(ia_r6_push):     /* Push r6 onto the stack.  */
-       movi    1, r30
-       shlli   r30, 16, r31
-       andc    r0, r31, r0
+       movi    1, r38
+       shlli   r38, 16, r39
+       andc    r0, r39, r0
         stx.q   r17, r63, r6
         addi.l  r17, 8, r17
         blink   tr1, r63
  LOCAL(ia_r7_push):     /* Push r7 onto the stack.  */
-       movi    1 << 12, r31
-       andc    r0, r31, r0
+       movi    1 << 12, r39
+       andc    r0, r39, r0
         stx.q   r17, r63, r7
         addi.l  r17, 8, r17
         blink   tr1, r63
  LOCAL(ia_r8_push):     /* Push r8 onto the stack.  */
-       movi    1 << 8, r31
-       andc    r0, r31, r0
+       movi    1 << 8, r39
+       andc    r0, r39, r0
         stx.q   r17, r63, r8
         addi.l  r17, 8, r17
         blink   tr1, r63
  LOCAL(ia_push_seq):    /* Push a sequence of registers onto the stack.  */
-       andi    r0, 7 << 1, r30
-       movi    (LOCAL(ia_end_of_push_seq) >> 16) & 65535, r32
-       shlli   r30, 2, r31
-       shori   LOCAL(ia_end_of_push_seq) & 65535, r32
-       sub.l   r32, r31, r33
-       ptabs/l r33, tr2
+       andi    r0, 7 << 1, r38
+       movi    (LOCAL(ia_end_of_push_seq) >> 16) & 65535, r40
+       shlli   r38, 2, r39
+       shori   LOCAL(ia_end_of_push_seq) & 65535, r40
+       sub.l   r40, r39, r41
+       ptabs/l r41, tr2
         blink   tr2, r63
  LOCAL(ia_stack_of_push_seq):    /* Beginning of push sequence.  */
         stx.q   r17, r63, r3
@@ -2018,6 +2848,7 @@ LOCAL(ia_r9_push):        /* Push r9 onto the stack.  */
  LOCAL(ia_return):      /* Return.  */
         blink   tr0, r63
  LOCAL(ia_end_of_push_seq): /* Label used to compute the first push instruction.  */
+       ENDFUNC(GLOBAL(GCC_shcompact_incoming_args))
  #endif /* L_shcompact_incoming_args */
  #endif
  #if __SH5__
@@ -2029,6 +2860,7 @@ LOCAL(ia_end_of_push_seq): /* Label used to compute the first push instruction.
  #endif
         .align  3 /* It is copied in units of 8 bytes in SHmedia mode.  */
         .global GLOBAL(GCC_nested_trampoline)
+       HIDDEN_FUNC(GLOBAL(GCC_nested_trampoline))
  GLOBAL(GCC_nested_trampoline):
         .mode   SHmedia
         ptrel/u r63, tr0
@@ -2045,6 +2877,8 @@ GLOBAL(GCC_nested_trampoline):
         ld.l    r0, 28, r1
  #endif
         blink   tr1, r63
+
+       ENDFUNC(GLOBAL(GCC_nested_trampoline))
  #endif /* L_nested_trampoline */
  #endif /* __SH5__ */
  #if __SH5__ == 32
@@ -2054,6 +2888,7 @@ GLOBAL(GCC_nested_trampoline):
         .align  2
  #ifndef __SH4_NOFPU__  
         .global GLOBAL(GCC_push_shmedia_regs)
+       FUNC(GLOBAL(GCC_push_shmedia_regs))
  GLOBAL(GCC_push_shmedia_regs):
         addi.l  r15, -14*8, r15
         fst.d   r15, 13*8, dr62
@@ -2070,9 +2905,11 @@ GLOBAL(GCC_push_shmedia_regs):
         fst.d   r15,  2*8, dr40
         fst.d   r15,  1*8, dr38
         fst.d   r15,  0*8, dr36
-#endif
+#else /* __SH4_NOFPU__ */
         .global GLOBAL(GCC_push_shmedia_regs_nofpu)
+       FUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
  GLOBAL(GCC_push_shmedia_regs_nofpu):
+#endif /* __SH4_NOFPU__ */
         ptabs/l r18, tr0
         addi.l  r15, -27*8, r15
         gettr   tr7, r62
@@ -2106,9 +2943,14 @@ GLOBAL(GCC_push_shmedia_regs_nofpu):
         st.q    r15,  1*8, r29
         st.q    r15,  0*8, r28
         blink   tr0, r63
-
-#ifndef __SH4_NOFPU__
+#ifndef __SH4_NOFPU__  
+       ENDFUNC(GLOBAL(GCC_push_shmedia_regs))
+#else
+       ENDFUNC(GLOBAL(GCC_push_shmedia_regs_nofpu))
+#endif
+#ifndef __SH4_NOFPU__  
         .global GLOBAL(GCC_pop_shmedia_regs)
+       FUNC(GLOBAL(GCC_pop_shmedia_regs))
  GLOBAL(GCC_pop_shmedia_regs):
         pt      .L0, tr1
         movi    41*8, r0
@@ -2127,9 +2969,11 @@ GLOBAL(GCC_pop_shmedia_regs):
         fld.d   r15, 28*8, dr38
         fld.d   r15, 27*8, dr36
         blink   tr1, r63
-#endif
+#else /* __SH4_NOFPU__ */
         .global GLOBAL(GCC_pop_shmedia_regs_nofpu)
+       FUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
  GLOBAL(GCC_pop_shmedia_regs_nofpu):
+#endif /* __SH4_NOFPU__ */
         movi    27*8, r0
  .L0:
         ptabs   r18, tr0
@@ -2165,5 +3009,917 @@ GLOBAL(GCC_pop_shmedia_regs_nofpu):
         ld.q    r15,  0*8, r28
         add.l   r15, r0, r15
         blink   tr0, r63
+
+#ifndef __SH4_NOFPU__
+       ENDFUNC(GLOBAL(GCC_pop_shmedia_regs))
+#else
+       ENDFUNC(GLOBAL(GCC_pop_shmedia_regs_nofpu))
+#endif
  #endif /* __SH5__ == 32 */
  #endif /* L_push_pop_shmedia_regs */
+
+#ifdef L_div_table
+#if __SH5__
+#if defined(__pic__) && defined(__SHMEDIA__)
+       .global GLOBAL(sdivsi3)
+       FUNC(GLOBAL(sdivsi3))
+#if __SH5__ == 32
+       .section        .text..SHmedia32,"ax"
+#else
+       .text
+#endif
+#if 0
+/* ??? FIXME: Presumably due to a linker bug, exporting data symbols
+   in a text section does not work (at least for shared libraries):
+   the linker sets the LSB of the address as if this was SHmedia code.  */
+#define TEXT_DATA_BUG
+#endif
+       .align  2
+ // inputs: r4,r5
+ // clobbered: r1,r18,r19,r20,r21,r25,tr0
+ // result in r0
+ .global GLOBAL(sdivsi3)
+GLOBAL(sdivsi3):
+#ifdef TEXT_DATA_BUG
+ ptb datalabel Local_div_table,tr0 /* tr0 = address of the private copy of the table */
+#else
+ ptb GLOBAL(div_table_internal),tr0 /* tr0 = address of the table via its hidden alias */
+#endif
+ nsb r5, r1 /* r1 = sign-bit count of the divisor r5, used as the normalization shift */
+ shlld r5, r1, r25    // normalize; [-2 ..1, 1..2) in s2.62
+ shari r25, 58, r21   // extract 5(6) bit index (s2.4 with hole -1..1)
+ /* bubble */
+ gettr tr0,r20 /* r20 = table address that the ptb above placed in tr0 */
+ ldx.ub r20, r21, r19 // u0.8
+ shari r25, 32, r25   // normalize to s2.30
+ shlli r21, 1, r21
+ muls.l r25, r19, r19 // s2.38
+ ldx.w r20, r21, r21  // s2.14
+  ptabs r18, tr0 /* tr0 = return target taken from r18, so r18 can serve as scratch below */
+ shari r19, 24, r19   // truncate to s2.14
+ sub r21, r19, r19    // some 11 bit inverse in s1.14
+ muls.l r19, r19, r21 // u0.28
+  sub r63, r1, r1 /* r1 = -r1 (r63 reads as zero) */
+  addi r1, 92, r1 /* r1 = 92 - normalization shift; consumed by the shard near the end */
+ muls.l r25, r21, r18 // s2.58
+ shlli r19, 45, r19   // multiply by two and convert to s2.58
+  /* bubble */
+ sub r19, r18, r18
+ shari r18, 28, r18   // some 22 bit inverse in s1.30
+ muls.l r18, r25, r0  // s2.60
+  muls.l r18, r4, r25 // s32.30
+  /* bubble */
+ shari r0, 16, r19   // s-16.44
+ muls.l r19, r18, r19 // s-16.74
+  shari r25, 63, r0
+  shari r4, 14, r18   // s19.-14
+ shari r19, 30, r19   // s-16.44
+ muls.l r19, r18, r19 // s15.30
+  xor r21, r0, r21    // You could also use the constant 1 << 27.
+  add r21, r25, r21
+ sub r21, r19, r21
+ shard r21, r1, r21
+ sub r21, r0, r0
+ blink tr0, r63 /* return through tr0; the link value is discarded into r63 */
+       ENDFUNC(GLOBAL(sdivsi3))
+/* This table has been generated by divtab.c .
+Defects for bias -330:
+   Max defect: 6.081536e-07 at -1.000000e+00
+   Min defect: 2.849516e-08 at 1.030651e+00
+   Max 2nd step defect: 9.606539e-12 at -1.000000e+00
+   Min 2nd step defect: 0.000000e+00 at 0.000000e+00
+   Defect at 1: 1.238659e-07
+   Defect at -2: 1.061708e-07 */
+#else /* ! __pic__ || ! __SHMEDIA__ */
+       .section        .rodata
+#endif /* ! __pic__ || ! __SHMEDIA__ */
+#if defined(TEXT_DATA_BUG) && defined(__pic__) && defined(__SHMEDIA__)
+       .balign 2
+       .type   Local_div_table,@object
+       .size   Local_div_table,128
+/* negative division constants */
+       .word   -16638
+       .word   -17135
+       .word   -17737
+       .word   -18433
+       .word   -19103
+       .word   -19751
+       .word   -20583
+       .word   -21383
+       .word   -22343
+       .word   -23353
+       .word   -24407
+       .word   -25582
+       .word   -26863
+       .word   -28382
+       .word   -29965
+       .word   -31800
+/* negative division factors */
+       .byte   66
+       .byte   70
+       .byte   75
+       .byte   81
+       .byte   87
+       .byte   93
+       .byte   101
+       .byte   109
+       .byte   119
+       .byte   130
+       .byte   142
+       .byte   156
+       .byte   172
+       .byte   192
+       .byte   214
+       .byte   241
+       .skip 16
+Local_div_table:
+       .skip 16
+/* positive division factors */
+       .byte   241
+       .byte   214
+       .byte   192
+       .byte   172
+       .byte   156
+       .byte   142
+       .byte   130
+       .byte   119
+       .byte   109
+       .byte   101
+       .byte   93
+       .byte   87
+       .byte   81
+       .byte   75
+       .byte   70
+       .byte   66
+/* positive division constants */
+       .word   31801
+       .word   29966
+       .word   28383
+       .word   26864
+       .word   25583
+       .word   24408
+       .word   23354
+       .word   22344
+       .word   21384
+       .word   20584
+       .word   19752
+       .word   19104
+       .word   18434
+       .word   17738
+       .word   17136
+       .word   16639
+       .section        .rodata
+#endif /* TEXT_DATA_BUG */
+       .balign 2
+       .type   GLOBAL(div_table),@object
+       .size   GLOBAL(div_table),128
+/* negative division constants */
+       .word   -16638
+       .word   -17135
+       .word   -17737
+       .word   -18433
+       .word   -19103
+       .word   -19751
+       .word   -20583
+       .word   -21383
+       .word   -22343
+       .word   -23353
+       .word   -24407
+       .word   -25582
+       .word   -26863
+       .word   -28382
+       .word   -29965
+       .word   -31800
+/* negative division factors */
+       .byte   66
+       .byte   70
+       .byte   75
+       .byte   81
+       .byte   87
+       .byte   93
+       .byte   101
+       .byte   109
+       .byte   119
+       .byte   130
+       .byte   142
+       .byte   156
+       .byte   172
+       .byte   192
+       .byte   214
+       .byte   241
+       .skip 16
+       .global GLOBAL(div_table)
+GLOBAL(div_table):
+       HIDDEN_ALIAS(div_table_internal,div_table)
+       .skip 16
+/* positive division factors */
+       .byte   241
+       .byte   214
+       .byte   192
+       .byte   172
+       .byte   156
+       .byte   142
+       .byte   130
+       .byte   119
+       .byte   109
+       .byte   101
+       .byte   93
+       .byte   87
+       .byte   81
+       .byte   75
+       .byte   70
+       .byte   66
+/* positive division constants */
+       .word   31801
+       .word   29966
+       .word   28383
+       .word   26864
+       .word   25583
+       .word   24408
+       .word   23354
+       .word   22344
+       .word   21384
+       .word   20584
+       .word   19752
+       .word   19104
+       .word   18434
+       .word   17738
+       .word   17136
+       .word   16639
+
+#elif defined (__SH3__) || defined (__SH3E__) || defined (__SH4__) || defined (__SH4_SINGLE__) || defined (__SH4_SINGLE_ONLY__) || defined (__SH4_NOFPU__)
+/* This code uses shld, thus is not suitable for SH1 / SH2.  */
+
+/* Signed / unsigned division without use of FPU, optimized for SH4.
+   Uses a lookup table for divisors in the range -128 .. +128, and
+   div1 with case distinction for larger divisors in three more ranges.
+   The code is lumped together with the table to allow the use of mova.  */
+#ifdef __LITTLE_ENDIAN__
+#define L_LSB 0
+#define L_LSWMSB 1
+#define L_MSWLSB 2
+#else
+#define L_LSB 3
+#define L_LSWMSB 2
+#define L_MSWLSB 1
+#endif
+
+       .balign 4
+       .global GLOBAL(udivsi3_i4i)
+       FUNC(GLOBAL(udivsi3_i4i))
+GLOBAL(udivsi3_i4i):
+       mov.w LOCAL(c128_w), r1 /* r1 = 128 */
+       div0u /* clear the division state bits for unsigned div1 steps */
+       mov r4,r0
+       shlr8 r0 /* r0 = dividend >> 8 */
+       cmp/hi r1,r5 /* T = divisor > 128 (unsigned) */
+       extu.w r5,r1 /* r1 = low 16 bits of the divisor */
+       bf LOCAL(udiv_le128) /* divisor <= 128: use the lookup tables */
+       cmp/eq r5,r1 /* T = divisor fits in 16 bits */
+       bf LOCAL(udiv_ge64k) /* divisor >= 64K */
+       shlr r0
+       mov r5,r1 /* keep the unshifted divisor so it can be saved below */
+       shll16 r5
+       mov.l r4,@-r15 /* save r4 */
+       div1 r5,r0
+       mov.l r1,@-r15 /* save the original r5 */
+       div1 r5,r0
+       div1 r5,r0
+       bra LOCAL(udiv_25) /* continue in the tail shared with sdivsi3_i4i */
+       div1 r5,r0 /* (delay slot) */
+
+LOCAL(div_le128):
+       mova LOCAL(div_table_ix),r0
+       bra LOCAL(div_le128_2)
+       mov.b @(r0,r5),r1 /* (delay slot) r1 = div_table_ix[divisor] */
+LOCAL(udiv_le128):
+       mov.l r4,@-r15 /* save r4 */
+       mova LOCAL(div_table_ix),r0
+       mov.b @(r0,r5),r1 /* r1 = div_table_ix[divisor] */
+       mov.l r5,@-r15 /* save r5 */
+LOCAL(div_le128_2):
+       mova LOCAL(div_table_inv),r0
+       mov.l @(r0,r1),r1 /* r1 = inverse found at byte offset r1 from div_table_inv */
+       mov r5,r0
+       tst #0xfe,r0 /* T = (divisor & 0xfe) == 0 */
+       mova LOCAL(div_table_clz),r0
+       dmulu.l r1,r4 /* mach:macl = inverse * dividend (unsigned, 64 bit) */
+       mov.b @(r0,r5),r1 /* r1 = div_table_clz[divisor], the shift count for shld */
+       bt/s LOCAL(div_by_1)
+       mov r4,r0 /* (delay slot) result = dividend when the branch is taken */
+       mov.l @r15+,r5 /* restore r5 */
+       sts mach,r0 /* r0 = high word of the product */
+       /* clrt -- not needed: T is 0 because the bt/s above was not taken */
+       addc r4,r0 /* add the dividend for the implicit leading 1 of the inverse */
+       mov.l @r15+,r4 /* restore r4 */
+       rotcr r0 /* rotate the carry from addc back in at the top */
+       rts
+       shld r1,r0 /* (delay slot) a negative count in r1 shifts right */
+
+LOCAL(div_by_1_neg):
+       neg r4,r0 /* result = -dividend */
+LOCAL(div_by_1):
+       mov.l @r15+,r5 /* restore r5 */
+       rts
+       mov.l @r15+,r4 /* (delay slot) restore r4 */
+
+LOCAL(div_ge64k):
+       bt/s LOCAL(div_r8) /* T was set in the branch delay slot: divisor > dividend >> 8 */
+       div0u /* (delay slot) */
+       shll8 r5
+       bra LOCAL(div_ge64k_2)
+       div1 r5,r0 /* (delay slot) */
+LOCAL(udiv_ge64k):
+       cmp/hi r0,r5 /* T = divisor > dividend >> 8 (unsigned) */
+       mov r5,r1 /* keep the unshifted divisor so it can be saved below */
+       bt LOCAL(udiv_r8)
+       shll8 r5
+       mov.l r4,@-r15 /* save r4 */
+       div1 r5,r0
+       mov.l r1,@-r15 /* save the original r5 */
+LOCAL(div_ge64k_2):
+       div1 r5,r0
+       mov.l LOCAL(zero_l),r1 /* r1 = 0 */
+       .rept 4
+       div1 r5,r0
+       .endr
+       mov.l r1,@-r15 /* push a zeroed scratch word */
+       div1 r5,r0
+       mov.w LOCAL(m256_w),r1 /* r1 = 0xff00 sign-extended, i.e. a mask of all but the low 8 bits */
+       div1 r5,r0
+       mov.b r0,@(L_LSWMSB,r15) /* low byte of r0 -> bits 8..15 of the scratch word */
+       xor r4,r0
+       and r1,r0
+       bra LOCAL(div_ge64k_end)
+       xor r4,r0 /* (delay slot) xor/and/xor: r0 keeps its upper 24 bits, low 8 bits come from r4 */
+       
+LOCAL(div_r8):
+       shll16 r4
+       bra LOCAL(div_r8_2)
+       shll8 r4 /* (delay slot) r4 = dividend << 24 */
+LOCAL(udiv_r8):
+       mov.l r4,@-r15 /* save r4 */
+       shll16 r4
+       clrt
+       shll8 r4 /* r4 = dividend << 24 */
+       mov.l r5,@-r15 /* save r5 */
+LOCAL(div_r8_2):
+       rotcl r4
+       mov r0,r1
+       div1 r5,r1
+       mov r4,r0
+       rotcl r0
+       mov r5,r4 /* copy the divisor so r5 can be restored before the last div1 */
+       div1 r5,r1
+       .rept 5
+       rotcl r0; div1 r5,r1
+       .endr
+       rotcl r0
+       mov.l @r15+,r5 /* restore r5 */
+       div1 r4,r1 /* last step uses the divisor copy in r4 */
+       mov.l @r15+,r4 /* restore r4 */
+       rts
+       rotcl r0 /* (delay slot) rotate T into the low bit of the result */
+
+       ENDFUNC(GLOBAL(udivsi3_i4i))
+
+       .global GLOBAL(sdivsi3_i4i)
+       FUNC(GLOBAL(sdivsi3_i4i))
+       /* This is link-compatible with a GLOBAL(sdivsi3) call,
+          but we effectively clobber only r1.  */
+GLOBAL(sdivsi3_i4i):
+       mov.l r4,@-r15 /* save r4 */
+       cmp/pz r5 /* T = divisor >= 0 */
+       mov.w LOCAL(c128_w), r1 /* r1 = 128 */
+       bt/s LOCAL(pos_divisor)
+       cmp/pz r4 /* (delay slot) T = dividend >= 0 */
+       mov.l r5,@-r15 /* save r5 */
+       neg r5,r5 /* r5 = -divisor */
+       bt/s LOCAL(neg_result) /* dividend >= 0 and divisor < 0 */
+       cmp/hi r1,r5 /* (delay slot) T = negated divisor > 128 (unsigned) */
+       neg r4,r4 /* both operands were negative: negate the dividend too */
+LOCAL(pos_result):
+       extu.w r5,r0 /* r0 = low 16 bits of the divisor */
+       bf LOCAL(div_le128) /* divisor <= 128: use the lookup tables */
+       cmp/eq r5,r0 /* T = divisor fits in 16 bits */
+       mov r4,r0
+       shlr8 r0 /* r0 = dividend >> 8 */
+       bf/s LOCAL(div_ge64k) /* divisor >= 64K */
+       cmp/hi r0,r5 /* (delay slot) T = divisor > dividend >> 8 (unsigned) */
+       div0u
+       shll16 r5
+       div1 r5,r0
+       div1 r5,r0
+       div1 r5,r0
+LOCAL(udiv_25):
+       mov.l LOCAL(zero_l),r1 /* r1 = 0 */
+       div1 r5,r0
+       div1 r5,r0
+       mov.l r1,@-r15 /* push a zeroed scratch word */
+       .rept 3
+       div1 r5,r0
+       .endr
+       mov.b r0,@(L_MSWLSB,r15) /* low byte of r0 -> bits 16..23 of the scratch word */
+       xtrct r4,r0
+       swap.w r0,r0
+       .rept 8
+       div1 r5,r0
+       .endr
+       mov.b r0,@(L_LSWMSB,r15) /* low byte of r0 -> bits 8..15 of the scratch word */
+LOCAL(div_ge64k_end):
+       .rept 8
+       div1 r5,r0
+       .endr
+       mov.l @r15+,r4 ! zero-extension and swap using LS unit.
+       extu.b r0,r0
+       mov.l @r15+,r5 /* restore r5 */
+       or r4,r0 /* merge the bytes collected in the scratch word */
+       mov.l @r15+,r4 /* restore r4 */
+       rts
+       rotcl r0 /* (delay slot) rotate T into the low bit of the result */
+
+LOCAL(div_le128_neg):
+       tst #0xfe,r0 /* T = (r0 & 0xfe) == 0 */
+       mova LOCAL(div_table_ix),r0
+       mov.b @(r0,r5),r1 /* r1 = div_table_ix[divisor] */
+       mova LOCAL(div_table_inv),r0
+       bt/s LOCAL(div_by_1_neg)
+       mov.l @(r0,r1),r1 /* (delay slot) r1 = inverse found at byte offset r1 from div_table_inv */
+       mova LOCAL(div_table_clz),r0
+       dmulu.l r1,r4 /* mach:macl = inverse * dividend (unsigned, 64 bit) */
+       mov.b @(r0,r5),r1 /* r1 = div_table_clz[divisor], the shift count for shld */
+       mov.l @r15+,r5 /* restore r5 */
+       sts mach,r0 /* r0 = high word of the product */
+       /* clrt -- not needed: T is 0 because the bt/s above was not taken */
+       addc r4,r0 /* add the dividend for the implicit leading 1 of the inverse */
+       mov.l @r15+,r4 /* restore r4 */
+       rotcr r0 /* rotate the carry from addc back in at the top */
+       shld r1,r0 /* a negative count in r1 shifts right */
+       rts
+       neg r0,r0 /* (delay slot) negate the result */
+
+LOCAL(pos_divisor):
+       mov.l r5,@-r15 /* save r5 */
+       bt/s LOCAL(pos_result) /* T from the entry code: dividend >= 0 */
+       cmp/hi r1,r5 /* (delay slot) T = divisor > 128 (unsigned) */
+       neg r4,r4 /* dividend was negative: negate it */
+LOCAL(neg_result):
+       extu.w r5,r0 /* r0 = low 16 bits of the divisor */
+       bf LOCAL(div_le128_neg) /* divisor <= 128: use the lookup tables */
+       cmp/eq r5,r0 /* T = divisor fits in 16 bits */
+       mov r4,r0
+       shlr8 r0 /* r0 = dividend >> 8 */
+       bf/s LOCAL(div_ge64k_neg) /* divisor >= 64K */
+       cmp/hi r0,r5 /* (delay slot) T = divisor > dividend >> 8 (unsigned) */
+       div0u
+       mov.l LOCAL(zero_l),r1 /* r1 = 0 */
+       shll16 r5
+       div1 r5,r0
+       mov.l r1,@-r15 /* push a zeroed scratch word */
+       .rept 7
+       div1 r5,r0
+       .endr
+       mov.b r0,@(L_MSWLSB,r15) /* low byte of r0 -> bits 16..23 of the scratch word */
+       xtrct r4,r0
+       swap.w r0,r0
+       .rept 8
+       div1 r5,r0
+       .endr
+       mov.b r0,@(L_LSWMSB,r15) /* low byte of r0 -> bits 8..15 of the scratch word */
+LOCAL(div_ge64k_neg_end):
+       .rept 8
+       div1 r5,r0
+       .endr
+       mov.l @r15+,r4 ! zero-extension and swap using LS unit.
+       extu.b r0,r1
+       mov.l @r15+,r5 /* restore r5 */
+       or r4,r1 /* merge the bytes collected in the scratch word */
+LOCAL(div_r8_neg_end):
+       mov.l @r15+,r4 /* restore r4 */
+       rotcl r1 /* rotate T into the low bit of r1 */
+       rts
+       neg r1,r0 /* (delay slot) result = -r1 */
+
+LOCAL(div_ge64k_neg):
+       bt/s LOCAL(div_r8_neg) /* T was set in the branch delay slot: divisor > dividend >> 8 */
+       div0u /* (delay slot) */
+       shll8 r5
+       mov.l LOCAL(zero_l),r1 /* r1 = 0 */
+       .rept 6
+       div1 r5,r0
+       .endr
+       mov.l r1,@-r15 /* push a zeroed scratch word */
+       div1 r5,r0
+       mov.w LOCAL(m256_w),r1 /* r1 = 0xff00 sign-extended, i.e. a mask of all but the low 8 bits */
+       div1 r5,r0
+       mov.b r0,@(L_LSWMSB,r15) /* low byte of r0 -> bits 8..15 of the scratch word */
+       xor r4,r0
+       and r1,r0
+       bra LOCAL(div_ge64k_neg_end)
+       xor r4,r0 /* (delay slot) xor/and/xor: r0 keeps its upper 24 bits, low 8 bits come from r4 */
+
+LOCAL(c128_w):
+       .word 128 /* threshold below which the lookup tables are used */
+
+LOCAL(div_r8_neg):
+       clrt
+       shll16 r4
+       mov r4,r1
+       shll8 r1 /* r1 = dividend << 24 */
+       mov r5,r4 /* copy the divisor so r5 can be restored before the last div1 */
+       .rept 7
+       rotcl r1; div1 r5,r0
+       .endr
+       mov.l @r15+,r5 /* restore r5 */
+       rotcl r1
+       bra LOCAL(div_r8_neg_end)
+       div1 r4,r0 /* (delay slot) last step uses the divisor copy in r4 */
+
+LOCAL(m256_w):
+       .word 0xff00 /* loaded with mov.w, which sign-extends it to a 32-bit mask */
+/* This table has been generated by divtab-sh4.c.  */
+       .balign 4
+LOCAL(div_table_clz):
+       .byte   0
+       .byte   1
+       .byte   0
+       .byte   -1
+       .byte   -1
+       .byte   -2
+       .byte   -2
+       .byte   -2
+       .byte   -2
+       .byte   -3
+       .byte   -3
+       .byte   -3
+       .byte   -3
+       .byte   -3
+       .byte   -3
+       .byte   -3
+       .byte   -3
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -4
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -5
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+       .byte   -6
+/* Lookup table translating positive divisor to index into table of
+   normalized inverse.  N.B. the '0' entry is also the last entry of the
+ previous table, and causes an unaligned access for division by zero.  */
+LOCAL(div_table_ix):
+       .byte   -6
+       .byte   -128
+       .byte   -128
+       .byte   0
+       .byte   -128
+       .byte   -64
+       .byte   0
+       .byte   64
+       .byte   -128
+       .byte   -96
+       .byte   -64
+       .byte   -32
+       .byte   0
+       .byte   32
+       .byte   64
+       .byte   96
+       .byte   -128
+       .byte   -112
+       .byte   -96
+       .byte   -80
+       .byte   -64
+       .byte   -48
+       .byte   -32
+       .byte   -16
+       .byte   0
+       .byte   16
+       .byte   32
+       .byte   48
+       .byte   64
+       .byte   80
+       .byte   96
+       .byte   112
+       .byte   -128
+       .byte   -120
+       .byte   -112
+       .byte   -104
+       .byte   -96
+       .byte   -88
+       .byte   -80
+       .byte   -72
+       .byte   -64
+       .byte   -56
+       .byte   -48
+       .byte   -40
+       .byte   -32
+       .byte   -24
+       .byte   -16
+       .byte   -8
+       .byte   0
+       .byte   8
+       .byte   16
+       .byte   24
+       .byte   32
+       .byte   40
+       .byte   48
+       .byte   56
+       .byte   64
+       .byte   72
+       .byte   80
+       .byte   88
+       .byte   96
+       .byte   104
+       .byte   112
+       .byte   120
+       .byte   -128
+       .byte   -124
+       .byte   -120
+       .byte   -116
+       .byte   -112
+       .byte   -108
+       .byte   -104
+       .byte   -100
+       .byte   -96
+       .byte   -92
+       .byte   -88
+       .byte   -84
+       .byte   -80
+       .byte   -76
+       .byte   -72
+       .byte   -68
+       .byte   -64
+       .byte   -60
+       .byte   -56
+       .byte   -52
+       .byte   -48
+       .byte   -44
+       .byte   -40
+       .byte   -36
+       .byte   -32
+       .byte   -28
+       .byte   -24
+       .byte   -20
+       .byte   -16
+       .byte   -12
+       .byte   -8
+       .byte   -4
+       .byte   0
+       .byte   4
+       .byte   8
+       .byte   12
+       .byte   16
+       .byte   20
+       .byte   24
+       .byte   28
+       .byte   32
+       .byte   36
+       .byte   40
+       .byte   44
+       .byte   48
+       .byte   52
+       .byte   56
+       .byte   60
+       .byte   64
+       .byte   68
+       .byte   72
+       .byte   76
+       .byte   80
+       .byte   84
+       .byte   88
+       .byte   92
+       .byte   96
+       .byte   100
+       .byte   104
+       .byte   108
+       .byte   112
+       .byte   116
+       .byte   120
+       .byte   124
+       .byte   -128
+/* 1/64 .. 1/127, normalized.  There is an implicit leading 1 in bit 32.  */
+       .balign 4
+LOCAL(zero_l):
+       .long   0x0
+       .long   0xF81F81F9
+       .long   0xF07C1F08
+       .long   0xE9131AC0
+       .long   0xE1E1E1E2
+       .long   0xDAE6076C
+       .long   0xD41D41D5
+       .long   0xCD856891
+       .long   0xC71C71C8
+       .long   0xC0E07039
+       .long   0xBACF914D
+       .long   0xB4E81B4F
+       .long   0xAF286BCB
+       .long   0xA98EF607
+       .long   0xA41A41A5
+       .long   0x9EC8E952
+       .long   0x9999999A
+       .long   0x948B0FCE
+       .long   0x8F9C18FA
+       .long   0x8ACB90F7
+       .long   0x86186187
+       .long   0x81818182
+       .long   0x7D05F418
+       .long   0x78A4C818
+       .long   0x745D1746
+       .long   0x702E05C1
+       .long   0x6C16C16D
+       .long   0x68168169
+       .long   0x642C8591
+       .long   0x60581606
+       .long   0x5C9882BA
+       .long   0x58ED2309
+LOCAL(div_table_inv):
+       .long   0x55555556
+       .long   0x51D07EAF
+       .long   0x4E5E0A73
+       .long   0x4AFD6A06
+       .long   0x47AE147B
+       .long   0x446F8657
+       .long   0x41414142
+       .long   0x3E22CBCF
+       .long   0x3B13B13C
+       .long   0x38138139
+       .long   0x3521CFB3
+       .long   0x323E34A3
+       .long   0x2F684BDB
+       .long   0x2C9FB4D9
+       .long   0x29E4129F
+       .long   0x27350B89
+       .long   0x24924925
+       .long   0x21FB7813
+       .long   0x1F7047DD
+       .long   0x1CF06ADB
+       .long   0x1A7B9612
+       .long   0x18118119
+       .long   0x15B1E5F8
+       .long   0x135C8114
+       .long   0x11111112
+       .long   0xECF56BF
+       .long   0xC9714FC
+       .long   0xA6810A7
+       .long   0x8421085
+       .long   0x624DD30
+       .long   0x4104105
+       .long   0x2040811
+       /* maximum error: 0.987342 scaled: 0.921875*/
+
+       ENDFUNC(GLOBAL(sdivsi3_i4i))
+#endif /* SH3 / SH4 */
+
+#endif /* L_div_table */
+
+#ifdef L_udiv_qrnnd_16
+#if !__SHMEDIA__
+       HIDDEN_FUNC(GLOBAL(udiv_qrnnd_16))
+       /* Out: r0: rn r1: qn */ /* In: r0: n1 r4: n0 r5: d r6: d1 */ /* Scratch: r2: __m */
+       /* n1 < d, but n1 might be larger than d1.  */
+       .global GLOBAL(udiv_qrnnd_16)
+       .balign 8
+GLOBAL(udiv_qrnnd_16):
+       div0u /* clear the division state bits for unsigned div1 steps */
+       cmp/hi r6,r0 /* T = n1 > d1 (unsigned) */
+       bt .Lots
+       .rept 16
+       div1 r6,r0 
+       .endr
+       extu.w r0,r1 /* r1 = low 16 bits of r0 */
+       bt 0f /* T still comes from the last div1 */
+       add r6,r0
+0:     rotcl r1 /* rotate T into the low bit of r1 */
+       mulu.w r1,r5 /* macl = (r1 & 0xffff) * (r5 & 0xffff) */
+       xtrct r4,r0 /* r0 = (r4 << 16) | (r0 >> 16) */
+       swap.w r0,r0 /* exchange the two 16-bit halves of r0 */
+       sts macl,r2 /* r2 = __m */
+       cmp/hs r2,r0 /* T = r0 >= __m (unsigned) */
+       sub r2,r0
+       bt 0f /* no borrow: return through the rts at the second 0: label */
+       addc r5,r0 /* T is 0 here; r0 += d, T = carry out */
+       add #-1,r1 /* qn -= 1 */
+       bt 0f /* the add carried: done */
+1:     add #-1,r1 /* qn -= 1 */
+       rts
+       add r5,r0 /* (delay slot) r0 += d */
+       .balign 8
+.Lots:
+       sub r5,r0
+       swap.w r4,r1
+       xtrct r0,r1
+       clrt
+       mov r1,r0
+       addc r5,r0 /* r0 += d, T = carry out */
+       mov #-1,r1
+       SL1(bf, 1b,
+       shlr16 r1)
+0:     rts
+       nop /* (delay slot) */
+       ENDFUNC(GLOBAL(udiv_qrnnd_16))
+#endif /* !__SHMEDIA__ */
+#endif /* L_udiv_qrnnd_16 */