Imported gcc-4.4.3

[msp430-gcc.git] / gcc / config / xtensa / lib1funcs.asm
diff --git a/gcc/config/xtensa/lib1funcs.asm b/gcc/config/xtensa/lib1funcs.asm

index a40f11b5ff817392dda153445a7ee321fdb9f469..071b9171177c81e7638acc3eb4d1dee075b2386a 100644 (file)
--- a/gcc/config/xtensa/lib1funcs.asm
+++ b/gcc/config/xtensa/lib1funcs.asm
@@ -1,48 +1,109 @@
  /* Assembly functions for the Xtensa version of libgcc1.
-   Copyright (C) 2001,2002 Free Software Foundation, Inc.
+   Copyright (C) 2001, 2002, 2003, 2005, 2006, 2007, 2009
+   Free Software Foundation, Inc.
     Contributed by Bob Wilson (bwilson@tensilica.com) at Tensilica.
  
  This file is part of GCC.
  
  GCC is free software; you can redistribute it and/or modify it under
  the terms of the GNU General Public License as published by the Free
-Software Foundation; either version 2, or (at your option) any later
+Software Foundation; either version 3, or (at your option) any later
  version.
  
-In addition to the permissions in the GNU General Public License, the
-Free Software Foundation gives you unlimited permission to link the
-compiled version of this file into combinations with other programs,
-and to distribute those combinations without any restriction coming
-from the use of this file.  (The General Public License restrictions
-do apply in other respects; for example, they cover modification of
-the file, and distribution when not linked into a combine
-executable.)
-
  GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  for more details.
  
-You should have received a copy of the GNU General Public License
-along with GCC; see the file COPYING.  If not, write to the Free
-Software Foundation, 59 Temple Place - Suite 330, Boston, MA
-02111-1307, USA.  */
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include "xtensa-config.h"
+
+/* Define macros for the ABS and ADDX* instructions to handle cases
+   where they are not included in the Xtensa processor configuration.  */
+
+       .macro  do_abs dst, src, tmp
+#if XCHAL_HAVE_ABS
+       abs     \dst, \src
+#else
+       neg     \tmp, \src
+       movgez  \tmp, \src, \src
+       mov     \dst, \tmp
+#endif
+       .endm
+
+       .macro  do_addx2 dst, as, at, tmp
+#if XCHAL_HAVE_ADDX
+       addx2   \dst, \as, \at
+#else
+       slli    \tmp, \as, 1
+       add     \dst, \tmp, \at
+#endif
+       .endm
+
+       .macro  do_addx4 dst, as, at, tmp
+#if XCHAL_HAVE_ADDX
+       addx4   \dst, \as, \at
+#else
+       slli    \tmp, \as, 2
+       add     \dst, \tmp, \at
+#endif
+       .endm
+
+       .macro  do_addx8 dst, as, at, tmp
+#if XCHAL_HAVE_ADDX
+       addx8   \dst, \as, \at
+#else
+       slli    \tmp, \as, 3
+       add     \dst, \tmp, \at
+#endif
+       .endm
+
+/* Define macros for leaf function entry and return, supporting either the
+   standard register windowed ABI or the non-windowed call0 ABI.  These
+   macros do not allocate any extra stack space, so they only work for
+   leaf functions that do not need to spill anything to the stack.  */
+
+       .macro leaf_entry reg, size
+#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
+       entry \reg, \size
+#else
+       /* do nothing */
+#endif
+       .endm
+
+       .macro leaf_return
+#if XCHAL_HAVE_WINDOWED && !__XTENSA_CALL0_ABI__
+       retw
+#else
+       ret
+#endif
+       .endm
  
-#include "xtensa/xtensa-config.h"
  
  #ifdef L_mulsi3
         .align  4
         .global __mulsi3
-       .type   __mulsi3,@function
+       .type   __mulsi3, @function
  __mulsi3:
-       entry   sp, 16
+       leaf_entry sp, 16
  
-#if XCHAL_HAVE_MUL16
+#if XCHAL_HAVE_MUL32
+       mull    a2, a2, a3
+
+#elif XCHAL_HAVE_MUL16
         or      a4, a2, a3
         srai    a4, a4, 16
         bnez    a4, .LMUL16
         mul16u  a2, a2, a3
-       retw
+       leaf_return
  .LMUL16:
         srai    a4, a2, 16
         srai    a5, a3, 16
@@ -56,122 +117,323 @@ __mulsi3:
  #elif XCHAL_HAVE_MAC16
         mul.aa.hl a2, a3
         mula.aa.lh a2, a3
-       rsr     a5, 16 # ACCLO
+       rsr     a5, ACCLO
         umul.aa.ll a2, a3
-       rsr     a4, 16 # ACCLO
+       rsr     a4, ACCLO
         slli    a5, a5, 16
         add     a2, a4, a5
  
-#else /* !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MAC16 */
+#else /* !MUL32 && !MUL16 && !MAC16 */
  
-        # Multiply one bit at a time, but unroll the loop 4x to better
-        # exploit the addx instructions.
-        
-        # Peel the first iteration to save a cycle on init
+       /* Multiply one bit at a time, but unroll the loop 4x to better
+          exploit the addx instructions and avoid overhead.
+          Peel the first iteration to save a cycle on init.  */
  
-        # avoid negative numbers 
+       /* Avoid negative numbers.  */
+       xor     a5, a2, a3      /* Top bit is 1 iff exactly one input is negative.  */
+       do_abs  a3, a3, a6
+       do_abs  a2, a2, a6
  
-       xor     a5, a2, a3  # top bit is 1 iff one of the inputs is negative
-       abs     a3, a3
-       abs     a2, a2
+       /* Swap so the second argument is smaller.  */
+       sub     a7, a2, a3
+       mov     a4, a3
+       movgez  a4, a2, a7      /* a4 = max (a2, a3) */
+       movltz  a3, a2, a7      /* a3 = min (a2, a3) */
  
-        # swap so that second argument is smaller
-        sub     a7, a2, a3
-        mov     a4, a3
-        movgez  a4, a2, a7  # a4 = max(a2, a3) 
-        movltz  a3, a2, a7  # a3 = min(a2, a3)
+       movi    a2, 0
+       extui   a6, a3, 0, 1
+       movnez  a2, a4, a6
  
-        movi    a2, 0
-        extui   a6, a3, 0, 1
-        movnez  a2, a4, a6
+       do_addx2 a7, a4, a2, a7
+       extui   a6, a3, 1, 1
+       movnez  a2, a7, a6
  
-        addx2   a7, a4, a2
-        extui   a6, a3, 1, 1
-        movnez  a2, a7, a6
+       do_addx4 a7, a4, a2, a7
+       extui   a6, a3, 2, 1
+       movnez  a2, a7, a6
  
-        addx4   a7, a4, a2
-        extui   a6, a3, 2, 1
-        movnez  a2, a7, a6
+       do_addx8 a7, a4, a2, a7
+       extui   a6, a3, 3, 1
+       movnez  a2, a7, a6
  
-        addx8   a7, a4, a2
-        extui   a6, a3, 3, 1
-        movnez  a2, a7, a6
+       bgeui   a3, 16, .Lmult_main_loop
+       neg     a3, a2
+       movltz  a2, a3, a5
+       leaf_return
  
-        bgeui   a3, 16, .Lmult_main_loop
-        neg     a3, a2
-        movltz  a2, a3, a5
-        retw
+       .align  4
+.Lmult_main_loop:
+       srli    a3, a3, 4
+       slli    a4, a4, 4
  
+       add     a7, a4, a2
+       extui   a6, a3, 0, 1
+       movnez  a2, a7, a6
  
-        .align  4
-.Lmult_main_loop:
-        srli    a3, a3, 4
-        slli    a4, a4, 4
+       do_addx2 a7, a4, a2, a7
+       extui   a6, a3, 1, 1
+       movnez  a2, a7, a6
  
-        add     a7, a4, a2
-        extui   a6, a3, 0, 1
-        movnez  a2, a7, a6
+       do_addx4 a7, a4, a2, a7
+       extui   a6, a3, 2, 1
+       movnez  a2, a7, a6
  
-        addx2   a7, a4, a2
-        extui   a6, a3, 1, 1
-        movnez  a2, a7, a6
+       do_addx8 a7, a4, a2, a7
+       extui   a6, a3, 3, 1
+       movnez  a2, a7, a6
  
-        addx4   a7, a4, a2
-        extui   a6, a3, 2, 1
-        movnez  a2, a7, a6
+       bgeui   a3, 16, .Lmult_main_loop
  
-        addx8   a7, a4, a2
-        extui   a6, a3, 3, 1
-        movnez  a2, a7, a6
+       neg     a3, a2
+       movltz  a2, a3, a5
  
+#endif /* !MUL32 && !MUL16 && !MAC16 */
  
-        bgeui   a3, 16, .Lmult_main_loop
+       leaf_return
+       .size   __mulsi3, . - __mulsi3
  
-        neg     a3, a2
-        movltz  a2, a3, a5
+#endif /* L_mulsi3 */
  
-#endif /* !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MAC16 */
  
-       retw
-.Lfe0:
-       .size   __mulsi3,.Lfe0-__mulsi3
+#ifdef L_umulsidi3
  
-#endif /* L_mulsi3 */
+#if !XCHAL_HAVE_MUL16 && !XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MAC16
+#define XCHAL_NO_MUL 1
+#endif
  
+       .align  4
+       .global __umulsidi3
+       .type   __umulsidi3, @function
+__umulsidi3:
+#if __XTENSA_CALL0_ABI__
+       leaf_entry sp, 32
+       addi    sp, sp, -32
+       s32i    a12, sp, 16
+       s32i    a13, sp, 20
+       s32i    a14, sp, 24
+       s32i    a15, sp, 28
+#elif XCHAL_NO_MUL
+       /* This is not really a leaf function; allocate enough stack space
+          to allow CALL12s to a helper function.  */
+       leaf_entry sp, 48
+#else
+       leaf_entry sp, 16
+#endif
+
+#ifdef __XTENSA_EB__
+#define wh a2
+#define wl a3
+#else
+#define wh a3
+#define wl a2
+#endif /* __XTENSA_EB__ */
+
+       /* This code is taken from the mulsf3 routine in ieee754-sf.S.
+          See more comments there.  */
+
+#if XCHAL_HAVE_MUL32_HIGH
+       mull    a6, a2, a3
+       muluh   wh, a2, a3
+       mov     wl, a6
+
+#else /* ! MUL32_HIGH */
+
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
+       /* a0 and a8 will be clobbered by calling the multiply function
+          but a8 is not used here and need not be saved.  */
+       s32i    a0, sp, 0
+#endif
+
+#if XCHAL_HAVE_MUL16 || XCHAL_HAVE_MUL32
+
+#define a2h a4
+#define a3h a5
+
+       /* Get the high halves of the inputs into registers.  */
+       srli    a2h, a2, 16
+       srli    a3h, a3, 16
+
+#define a2l a2
+#define a3l a3
+
+#if XCHAL_HAVE_MUL32 && !XCHAL_HAVE_MUL16
+       /* Clear the high halves of the inputs.  This does not matter
+          for MUL16 because the high bits are ignored.  */
+       extui   a2, a2, 0, 16
+       extui   a3, a3, 0, 16
+#endif
+#endif /* MUL16 || MUL32 */
  
-       # Some Xtensa configurations include the NSAU (unsigned
-       # normalize shift amount) instruction which computes the number
-       # of leading zero bits.  For other configurations, the "nsau"
-       # operation is implemented as a macro.
-       
-#if !XCHAL_HAVE_NSA
-       .macro  nsau cnt, val, tmp, a
+
+#if XCHAL_HAVE_MUL16
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+       mul16u  dst, xreg ## xhalf, yreg ## yhalf
+
+#elif XCHAL_HAVE_MUL32
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+       mull    dst, xreg ## xhalf, yreg ## yhalf
+
+#elif XCHAL_HAVE_MAC16
+
+/* The preprocessor insists on inserting a space when concatenating after
+   a period in the definition of do_mul below.  These macros are a workaround
+   using underscores instead of periods when doing the concatenation.  */
+#define umul_aa_ll umul.aa.ll
+#define umul_aa_lh umul.aa.lh
+#define umul_aa_hl umul.aa.hl
+#define umul_aa_hh umul.aa.hh
+
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+       umul_aa_ ## xhalf ## yhalf      xreg, yreg; \
+       rsr     dst, ACCLO
+
+#else /* no multiply hardware */
+
+#define set_arg_l(dst, src) \
+       extui   dst, src, 0, 16
+#define set_arg_h(dst, src) \
+       srli    dst, src, 16
+
+#if __XTENSA_CALL0_ABI__
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+       set_arg_ ## xhalf (a13, xreg); \
+       set_arg_ ## yhalf (a14, yreg); \
+       call0   .Lmul_mulsi3; \
+       mov     dst, a12
+#else
+#define do_mul(dst, xreg, xhalf, yreg, yhalf) \
+       set_arg_ ## xhalf (a14, xreg); \
+       set_arg_ ## yhalf (a15, yreg); \
+       call12  .Lmul_mulsi3; \
+       mov     dst, a14
+#endif /* __XTENSA_CALL0_ABI__ */
+
+#endif /* no multiply hardware */
+
+       /* Add pp1 and pp2 into a6 with carry-out in a9.  */
+       do_mul(a6, a2, l, a3, h)        /* pp 1 */
+       do_mul(a11, a2, h, a3, l)       /* pp 2 */
+       movi    a9, 0
+       add     a6, a6, a11
+       bgeu    a6, a11, 1f
+       addi    a9, a9, 1
+1:
+       /* Shift the high half of a9/a6 into position in a9.  Note that
+          this value can be safely incremented without any carry-outs.  */
+       ssai    16
+       src     a9, a9, a6
+
+       /* Compute the low word into a6.  */
+       do_mul(a11, a2, l, a3, l)       /* pp 0 */
+       sll     a6, a6
+       add     a6, a6, a11
+       bgeu    a6, a11, 1f
+       addi    a9, a9, 1
+1:
+       /* Compute the high word into wh.  */
+       do_mul(wh, a2, h, a3, h)        /* pp 3 */
+       add     wh, wh, a9
+       mov     wl, a6
+
+#endif /* !MUL32_HIGH */
+
+#if __XTENSA_CALL0_ABI__ && XCHAL_NO_MUL
+       /* Restore the original return address.  */
+       l32i    a0, sp, 0
+#endif
+#if __XTENSA_CALL0_ABI__
+       l32i    a12, sp, 16
+       l32i    a13, sp, 20
+       l32i    a14, sp, 24
+       l32i    a15, sp, 28
+       addi    sp, sp, 32
+#endif
+       leaf_return
+
+#if XCHAL_NO_MUL
+
+       /* For Xtensa processors with no multiply hardware, this simplified
+          version of __mulsi3 is used for multiplying the 16-bit halves of
+          the 32-bit input operands.  When using CALL0, this function
+          uses a custom ABI: the inputs are passed in a13 and a14, the
+          result is returned in a12, and a8 and a15 are clobbered.  */
+       .align  4
+.Lmul_mulsi3:
+       leaf_entry sp, 16
+       .macro mul_mulsi3_body dst, src1, src2, tmp1, tmp2
+       movi    \dst, 0
+1:     add     \tmp1, \src2, \dst
+       extui   \tmp2, \src1, 0, 1
+       movnez  \dst, \tmp1, \tmp2
+
+       do_addx2 \tmp1, \src2, \dst, \tmp1
+       extui   \tmp2, \src1, 1, 1
+       movnez  \dst, \tmp1, \tmp2
+
+       do_addx4 \tmp1, \src2, \dst, \tmp1
+       extui   \tmp2, \src1, 2, 1
+       movnez  \dst, \tmp1, \tmp2
+
+       do_addx8 \tmp1, \src2, \dst, \tmp1
+       extui   \tmp2, \src1, 3, 1
+       movnez  \dst, \tmp1, \tmp2
+
+       srli    \src1, \src1, 4
+       slli    \src2, \src2, 4
+       bnez    \src1, 1b
+       .endm
+#if __XTENSA_CALL0_ABI__
+       mul_mulsi3_body a12, a13, a14, a15, a8
+#else
+       /* The result will be written into a2, so save that argument in a4.  */
+       mov     a4, a2
+       mul_mulsi3_body a2, a4, a3, a5, a6
+#endif
+       leaf_return
+#endif /* XCHAL_NO_MUL */
+
+       .size   __umulsidi3, . - __umulsidi3
+
+#endif /* L_umulsidi3 */
+
+
+/* Define a macro for the NSAU (unsigned normalize shift amount)
+   instruction, which computes the number of leading zero bits,
+   to handle cases where it is not included in the Xtensa processor
+   configuration.  */
+
+       .macro  do_nsau cnt, val, tmp, a
+#if XCHAL_HAVE_NSA
+       nsau    \cnt, \val
+#else
         mov     \a, \val
         movi    \cnt, 0
         extui   \tmp, \a, 16, 16
         bnez    \tmp, 0f
         movi    \cnt, 16
         slli    \a, \a, 16
-0:     
+0:
         extui   \tmp, \a, 24, 8
         bnez    \tmp, 1f
         addi    \cnt, \cnt, 8
         slli    \a, \a, 8
-1:     
+1:
         movi    \tmp, __nsau_data
         extui   \a, \a, 24, 8
         add     \tmp, \tmp, \a
         l8ui    \tmp, \tmp, 0
         add     \cnt, \cnt, \tmp
-       .endm
  #endif /* !XCHAL_HAVE_NSA */
+       .endm
  
-#ifdef L_nsau
+#ifdef L_clz
         .section .rodata
         .align  4
         .global __nsau_data
-       .type   __nsau_data,@object
-__nsau_data:   
+       .type   __nsau_data, @object
+__nsau_data:
  #if !XCHAL_HAVE_NSA
         .byte   8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4
         .byte   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
@@ -190,36 +452,80 @@ __nsau_data:
         .byte   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
         .byte   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  #endif /* !XCHAL_HAVE_NSA */
-.Lfe1:
-       .size   __nsau_data,.Lfe1-__nsau_data
+       .size   __nsau_data, . - __nsau_data
         .hidden __nsau_data
-#endif /* L_nsau */
+#endif /* L_clz */
+
+
+#ifdef L_clzsi2
+       .align  4
+       .global __clzsi2
+       .type   __clzsi2, @function
+__clzsi2:
+       leaf_entry sp, 16
+       do_nsau a2, a2, a3, a4
+       leaf_return
+       .size   __clzsi2, . - __clzsi2
+
+#endif /* L_clzsi2 */
+
+
+#ifdef L_ctzsi2
+       .align  4
+       .global __ctzsi2
+       .type   __ctzsi2, @function
+__ctzsi2:
+       leaf_entry sp, 16
+       neg     a3, a2
+       and     a3, a3, a2
+       do_nsau a2, a3, a4, a5
+       neg     a2, a2
+       addi    a2, a2, 31
+       leaf_return
+       .size   __ctzsi2, . - __ctzsi2
+
+#endif /* L_ctzsi2 */
+
+
+#ifdef L_ffssi2
+       .align  4
+       .global __ffssi2
+       .type   __ffssi2, @function
+__ffssi2:
+       leaf_entry sp, 16
+       neg     a3, a2
+       and     a3, a3, a2
+       do_nsau a2, a3, a4, a5
+       neg     a2, a2
+       addi    a2, a2, 32
+       leaf_return
+       .size   __ffssi2, . - __ffssi2
+
+#endif /* L_ffssi2 */
  
  
  #ifdef L_udivsi3
         .align  4
         .global __udivsi3
-       .type   __udivsi3,@function
+       .type   __udivsi3, @function
  __udivsi3:
-       entry   sp, 16
-       bltui   a3, 2, .Lle_one # check if the divisor <= 1
-
-       mov     a6, a2          # keep dividend in a6
-#if XCHAL_HAVE_NSA
-       nsau    a5, a6          # dividend_shift = nsau(dividend)
-       nsau    a4, a3          # divisor_shift = nsau(divisor)
-#else /* !XCHAL_HAVE_NSA */
-       nsau    a5, a6, a2, a7  # dividend_shift = nsau(dividend)
-       nsau    a4, a3, a2, a7  # divisor_shift = nsau(divisor)
-#endif /* !XCHAL_HAVE_NSA */
+       leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+       quou    a2, a2, a3
+#else
+       bltui   a3, 2, .Lle_one /* check if the divisor <= 1 */
+
+       mov     a6, a2          /* keep dividend in a6 */
+       do_nsau a5, a6, a2, a7  /* dividend_shift = nsau (dividend) */
+       do_nsau a4, a3, a2, a7  /* divisor_shift = nsau (divisor) */
         bgeu    a5, a4, .Lspecial
  
-       sub     a4, a4, a5      # count = divisor_shift - dividend_shift
+       sub     a4, a4, a5      /* count = divisor_shift - dividend_shift */
         ssl     a4
-       sll     a3, a3          # divisor <<= count
-       movi    a2, 0           # quotient = 0
+       sll     a3, a3          /* divisor <<= count */
+       movi    a2, 0           /* quotient = 0 */
  
-       # test-subtract-and-shift loop; one quotient bit on each iteration
+       /* test-subtract-and-shift loop; one quotient bit on each iteration */
  #if XCHAL_HAVE_LOOPS
         loopnez a4, .Lloopend
  #endif /* XCHAL_HAVE_LOOPS */
@@ -237,26 +543,32 @@ __udivsi3:
  .Lloopend:
  
         bltu    a6, a3, .Lreturn
-       addi    a2, a2, 1       # increment quotient if dividend >= divisor
+       addi    a2, a2, 1       /* increment quotient if dividend >= divisor */
  .Lreturn:
-       retw
+       leaf_return
+
+.Lle_one:
+       beqz    a3, .Lerror     /* if divisor == 1, return the dividend */
+       leaf_return
  
  .Lspecial:
-       # return dividend >= divisor
-       movi    a2, 0
-       bltu    a6, a3, .Lreturn2
+       /* return dividend >= divisor */
+       bltu    a6, a3, .Lreturn0
         movi    a2, 1
-.Lreturn2:
-       retw
+       leaf_return
  
-.Lle_one:
-       beqz    a3, .Lerror     # if divisor == 1, return the dividend
-       retw
  .Lerror:
-       movi    a2, 0           # just return 0; could throw an exception
-       retw
-.Lfe2:
-       .size   __udivsi3,.Lfe2-__udivsi3
+       /* Divide by zero: Use an illegal instruction to force an exception.
+          The subsequent "DIV0" string can be recognized by the exception
+          handler to identify the real cause of the exception.  */
+       ill
+       .ascii  "DIV0"
+
+.Lreturn0:
+       movi    a2, 0
+#endif /* XCHAL_HAVE_DIV32 */
+       leaf_return
+       .size   __udivsi3, . - __udivsi3
  
  #endif /* L_udivsi3 */
  
@@ -264,28 +576,26 @@ __udivsi3:
  #ifdef L_divsi3
         .align  4
         .global __divsi3
-       .type   __divsi3,@function
+       .type   __divsi3, @function
  __divsi3:
-       entry   sp, 16
-       xor     a7, a2, a3      # sign = dividend ^ divisor
-       abs     a6, a2          # udividend = abs(dividend)
-       abs     a3, a3          # udivisor = abs(divisor)
-       bltui   a3, 2, .Lle_one # check if udivisor <= 1
-#if XCHAL_HAVE_NSA
-       nsau    a5, a6          # udividend_shift = nsau(udividend)
-       nsau    a4, a3          # udivisor_shift = nsau(udivisor)
-#else /* !XCHAL_HAVE_NSA */
-       nsau    a5, a6, a2, a8  # udividend_shift = nsau(udividend)
-       nsau    a4, a3, a2, a8  # udivisor_shift = nsau(udivisor)
-#endif /* !XCHAL_HAVE_NSA */
+       leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+       quos    a2, a2, a3
+#else
+       xor     a7, a2, a3      /* sign = dividend ^ divisor */
+       do_abs  a6, a2, a4      /* udividend = abs (dividend) */
+       do_abs  a3, a3, a4      /* udivisor = abs (divisor) */
+       bltui   a3, 2, .Lle_one /* check if udivisor <= 1 */
+       do_nsau a5, a6, a2, a8  /* udividend_shift = nsau (udividend) */
+       do_nsau a4, a3, a2, a8  /* udivisor_shift = nsau (udivisor) */
         bgeu    a5, a4, .Lspecial
  
-       sub     a4, a4, a5      # count = udivisor_shift - udividend_shift
+       sub     a4, a4, a5      /* count = udivisor_shift - udividend_shift */
         ssl     a4
-       sll     a3, a3          # udivisor <<= count
-       movi    a2, 0           # quotient = 0
+       sll     a3, a3          /* udivisor <<= count */
+       movi    a2, 0           /* quotient = 0 */
  
-       # test-subtract-and-shift loop; one quotient bit on each iteration
+       /* test-subtract-and-shift loop; one quotient bit on each iteration */
  #if XCHAL_HAVE_LOOPS
         loopnez a4, .Lloopend
  #endif /* XCHAL_HAVE_LOOPS */
@@ -303,31 +613,37 @@ __divsi3:
  .Lloopend:
  
         bltu    a6, a3, .Lreturn
-       addi    a2, a2, 1       # increment quotient if udividend >= udivisor
+       addi    a2, a2, 1       /* increment if udividend >= udivisor */
  .Lreturn:
         neg     a5, a2
-       movltz  a2, a5, a7      # return (sign < 0) ? -quotient : quotient
-       retw
+       movltz  a2, a5, a7      /* return (sign < 0) ? -quotient : quotient */
+       leaf_return
+
+.Lle_one:
+       beqz    a3, .Lerror
+       neg     a2, a6          /* if udivisor == 1, then return... */
+       movgez  a2, a6, a7      /* (sign < 0) ? -udividend : udividend */
+       leaf_return
  
  .Lspecial:
-       movi    a2, 0
-       bltu    a6, a3, .Lreturn2 #  if dividend < divisor, return 0
+       bltu    a6, a3, .Lreturn0 /* if dividend < divisor, return 0 */
         movi    a2, 1
         movi    a4, -1
-       movltz  a2, a4, a7      # else return (sign < 0) ? -1 :  1 
-.Lreturn2:
-       retw
+       movltz  a2, a4, a7      /* else return (sign < 0) ? -1 : 1 */
+       leaf_return
  
-.Lle_one:
-       beqz    a3, .Lerror
-       neg     a2, a6          # if udivisor == 1, then return...
-       movgez  a2, a6, a7      # (sign < 0) ? -udividend : udividend
-       retw
  .Lerror:
-       movi    a2, 0           # just return 0; could throw an exception
-       retw
-.Lfe3:
-       .size   __divsi3,.Lfe3-__divsi3
+       /* Divide by zero: Use an illegal instruction to force an exception.
+          The subsequent "DIV0" string can be recognized by the exception
+          handler to identify the real cause of the exception.  */
+       ill
+       .ascii  "DIV0"
+
+.Lreturn0:
+       movi    a2, 0
+#endif /* XCHAL_HAVE_DIV32 */
+       leaf_return
+       .size   __divsi3, . - __divsi3
  
  #endif /* L_divsi3 */
  
@@ -335,25 +651,23 @@ __divsi3:
  #ifdef L_umodsi3
         .align  4
         .global __umodsi3
-       .type   __umodsi3,@function
+       .type   __umodsi3, @function
  __umodsi3:
-       entry   sp, 16
-       bltui   a3, 2, .Lle_one # check if the divisor is <= 1
-
-#if XCHAL_HAVE_NSA
-       nsau    a5, a2          # dividend_shift = nsau(dividend)
-       nsau    a4, a3          # divisor_shift = nsau(divisor)
-#else /* !XCHAL_HAVE_NSA */
-       nsau    a5, a2, a6, a7  # dividend_shift = nsau(dividend)
-       nsau    a4, a3, a6, a7  # divisor_shift = nsau(divisor)
-#endif /* !XCHAL_HAVE_NSA */
+       leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+       remu    a2, a2, a3
+#else
+       bltui   a3, 2, .Lle_one /* check if the divisor is <= 1 */
+
+       do_nsau a5, a2, a6, a7  /* dividend_shift = nsau (dividend) */
+       do_nsau a4, a3, a6, a7  /* divisor_shift = nsau (divisor) */
         bgeu    a5, a4, .Lspecial
  
-       sub     a4, a4, a5      # count = divisor_shift - dividend_shift
+       sub     a4, a4, a5      /* count = divisor_shift - dividend_shift */
         ssl     a4
-       sll     a3, a3          # divisor <<= count
+       sll     a3, a3          /* divisor <<= count */
  
-       # test-subtract-and-shift loop
+       /* test-subtract-and-shift loop */
  #if XCHAL_HAVE_LOOPS
         loopnez a4, .Lloopend
  #endif /* XCHAL_HAVE_LOOPS */
@@ -368,24 +682,26 @@ __umodsi3:
  #endif /* !XCHAL_HAVE_LOOPS */
  .Lloopend:
  
+.Lspecial:
         bltu    a2, a3, .Lreturn
-       sub     a2, a2, a3      # subtract once more if dividend >= divisor
+       sub     a2, a2, a3      /* subtract once more if dividend >= divisor */
  .Lreturn:
-       retw
-
-.Lspecial:
-       bltu    a2, a3, .Lreturn2
-       sub     a2, a2, a3      # subtract once if dividend >= divisor
-.Lreturn2:
-       retw
+       leaf_return
  
  .Lle_one:
-       # the divisor is either 0 or 1, so just return 0.
-       # someday we may want to throw an exception if the divisor is 0.
+       bnez    a3, .Lreturn0
+
+       /* Divide by zero: Use an illegal instruction to force an exception.
+          The subsequent "DIV0" string can be recognized by the exception
+          handler to identify the real cause of the exception.  */
+       ill
+       .ascii  "DIV0"
+
+.Lreturn0:
         movi    a2, 0
-       retw
-.Lfe4:
-       .size   __umodsi3,.Lfe4-__umodsi3
+#endif /* XCHAL_HAVE_DIV32 */
+       leaf_return
+       .size   __umodsi3, . - __umodsi3
  
  #endif /* L_umodsi3 */
  
@@ -393,27 +709,25 @@ __umodsi3:
  #ifdef L_modsi3
         .align  4
         .global __modsi3
-       .type   __modsi3,@function
+       .type   __modsi3, @function
  __modsi3:
-       entry   sp, 16
-       mov     a7, a2          # save original (signed) dividend
-       abs     a2, a2          # udividend = abs(dividend)
-       abs     a3, a3          # udivisor = abs(divisor)
-       bltui   a3, 2, .Lle_one # check if udivisor <= 1
-#if XCHAL_HAVE_NSA
-       nsau    a5, a2          # udividend_shift = nsau(udividend)
-       nsau    a4, a3          # udivisor_shift = nsau(udivisor)
-#else /* !XCHAL_HAVE_NSA */
-       nsau    a5, a2, a6, a8  # udividend_shift = nsau(udividend)
-       nsau    a4, a3, a6, a8  # udivisor_shift = nsau(udivisor)
-#endif /* !XCHAL_HAVE_NSA */
+       leaf_entry sp, 16
+#if XCHAL_HAVE_DIV32
+       rems    a2, a2, a3
+#else
+       mov     a7, a2          /* save original (signed) dividend */
+       do_abs  a2, a2, a4      /* udividend = abs (dividend) */
+       do_abs  a3, a3, a4      /* udivisor = abs (divisor) */
+       bltui   a3, 2, .Lle_one /* check if udivisor <= 1 */
+       do_nsau a5, a2, a6, a8  /* udividend_shift = nsau (udividend) */
+       do_nsau a4, a3, a6, a8  /* udivisor_shift = nsau (udivisor) */
         bgeu    a5, a4, .Lspecial
  
-       sub     a4, a4, a5      # count = udivisor_shift - udividend_shift
+       sub     a4, a4, a5      /* count = udivisor_shift - udividend_shift */
         ssl     a4
-       sll     a3, a3          # udivisor <<= count
+       sll     a3, a3          /* udivisor <<= count */
  
-       # test-subtract-and-shift loop
+       /* test-subtract-and-shift loop */
  #if XCHAL_HAVE_LOOPS
         loopnez a4, .Lloopend
  #endif /* XCHAL_HAVE_LOOPS */
@@ -428,29 +742,104 @@ __modsi3:
  #endif /* !XCHAL_HAVE_LOOPS */
  .Lloopend:
  
+.Lspecial:
         bltu    a2, a3, .Lreturn
-       sub     a2, a2, a3      # subtract once more if udividend >= udivisor
+       sub     a2, a2, a3      /* subtract again if udividend >= udivisor */
  .Lreturn:
         bgez    a7, .Lpositive
-       neg     a2, a2          # if (dividend < 0), return -udividend
-.Lpositive:    
-       retw
-
-.Lspecial:
-       bltu    a2, a3, .Lreturn2
-       sub     a2, a2, a3      # subtract once if dividend >= divisor
-.Lreturn2:
-       bgez    a7, .Lpositive2
-       neg     a2, a2          # if (dividend < 0), return -udividend
-.Lpositive2:   
-       retw
+       neg     a2, a2          /* if (dividend < 0), return -udividend */
+.Lpositive:
+       leaf_return
  
  .Lle_one:
-       # udivisor is either 0 or 1, so just return 0.
-       # someday we may want to throw an exception if udivisor is 0.
+       bnez    a3, .Lreturn0
+
+       /* Divide by zero: Use an illegal instruction to force an exception.
+          The subsequent "DIV0" string can be recognized by the exception
+          handler to identify the real cause of the exception.  */
+       ill
+       .ascii  "DIV0"
+
+.Lreturn0:
         movi    a2, 0
-       retw
-.Lfe5:
-       .size   __modsi3,.Lfe5-__modsi3
+#endif /* XCHAL_HAVE_DIV32 */
+       leaf_return
+       .size   __modsi3, . - __modsi3
  
  #endif /* L_modsi3 */
+
+
+#ifdef __XTENSA_EB__
+#define uh a2
+#define ul a3
+#else
+#define uh a3
+#define ul a2
+#endif /* __XTENSA_EB__ */
+
+
+#ifdef L_ashldi3
+       .align  4
+       .global __ashldi3
+       .type   __ashldi3, @function
+__ashldi3:
+       leaf_entry sp, 16
+       ssl     a4
+       bgei    a4, 32, .Llow_only
+       src     uh, uh, ul
+       sll     ul, ul
+       leaf_return
+
+.Llow_only:
+       sll     uh, ul
+       movi    ul, 0
+       leaf_return
+       .size   __ashldi3, . - __ashldi3
+
+#endif /* L_ashldi3 */
+
+
+#ifdef L_ashrdi3
+       .align  4
+       .global __ashrdi3
+       .type   __ashrdi3, @function
+__ashrdi3:
+       leaf_entry sp, 16
+       ssr     a4
+       bgei    a4, 32, .Lhigh_only
+       src     ul, uh, ul
+       sra     uh, uh
+       leaf_return
+
+.Lhigh_only:
+       sra     ul, uh
+       srai    uh, uh, 31
+       leaf_return
+       .size   __ashrdi3, . - __ashrdi3
+
+#endif /* L_ashrdi3 */
+
+
+#ifdef L_lshrdi3
+       .align  4
+       .global __lshrdi3
+       .type   __lshrdi3, @function
+__lshrdi3:
+       leaf_entry sp, 16
+       ssr     a4
+       bgei    a4, 32, .Lhigh_only1
+       src     ul, uh, ul
+       srl     uh, uh
+       leaf_return
+
+.Lhigh_only1:
+       srl     ul, uh
+       movi    uh, 0
+       leaf_return
+       .size   __lshrdi3, . - __lshrdi3
+
+#endif /* L_lshrdi3 */
+
+
+#include "ieee754-df.S"
+#include "ieee754-sf.S"