-;; libgcc routines for the Hitachi H8/300 CPU.
+;; libgcc routines for the Renesas H8/300 CPU.
;; Contributed by Steve Chamberlain <sac@cygnus.com>
+;; Optimizations by Toshiyasu Morita <toshiyasu.morita@renesas.com>
-/* Copyright (C) 1994, 2000, 2001 Free Software Foundation, Inc.
+/* Copyright (C) 1994, 2000, 2001, 2002, 2003, 2004, 2009
+ Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
-Free Software Foundation; either version 2, or (at your option) any
+Free Software Foundation; either version 3, or (at your option) any
later version.
-In addition to the permissions in the GNU General Public License, the
-Free Software Foundation gives you unlimited permission to link the
-compiled version of this file into combinations with other programs,
-and to distribute those combinations without any restriction coming
-from the use of this file. (The General Public License restrictions
-do apply in other respects; for example, they cover modification of
-the file, and distribution when not linked into a combine
-executable.)
-
This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
-You should have received a copy of the GNU General Public License
-along with this program; see the file COPYING. If not, write to
-the Free Software Foundation, 59 Temple Place - Suite 330,
-Boston, MA 02111-1307, USA. */
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+<http://www.gnu.org/licenses/>. */
/* Assembler register definitions. */
#define S2H r6h
#ifdef __H8300__
-#define MOVP mov.w /* pointers are 16 bits */
-#define ADDP add.w
-#define CMPP cmp.w
#define PUSHP push
#define POPP pop
#define S2P r6
#endif
-#if defined (__H8300H__) || defined (__H8300S__)
-#define MOVP mov.l /* pointers are 32 bits */
-#define ADDP add.l
-#define CMPP cmp.l
+#if defined (__H8300H__) || defined (__H8300S__) || defined (__H8300SX__)
#define PUSHP push.l
#define POPP pop.l
#endif
/* Emit one CPU-variant directive for the assembler.  For each of
   __H8300H__, __H8300S__ and __H8300SX__ the "n"-suffixed directive is
   emitted when __NORMAL_MODE__ is defined and the plain one otherwise.
   (Presumably these select the instruction set / addressing mode the
   assembler accepts -- confirm against the GNU as H8/300 directives.)  */
#ifdef __H8300H__
+#ifdef __NORMAL_MODE__
+ .h8300hn
+#else
.h8300h
#endif
+#endif
#ifdef __H8300S__
+#ifdef __NORMAL_MODE__
+ .h8300sn
+#else
.h8300s
#endif
+#endif
+#ifdef __H8300SX__
+#ifdef __NORMAL_MODE__
+ .h8300sxn
+#else
+ .h8300sx
+#endif
+#endif
#ifdef L_cmpsi2
#ifdef __H8300__
.align 2
.global ___cmpsi2
;; ___cmpsi2: three-way signed comparison of two 32-bit values held as the
;; word pairs A0:A1 and A2:A3.  A0/A2 are compared as signed high words
;; (bgt) and A1/A3 as unsigned low words (bls).
;; Result in A0: 0 if A0:A1 < A2:A3, 1 if equal, 2 if A0:A1 > A2:A3.
;; The "+" version branches directly on the flags of each cmp.w (which are
;; set from destination - source, i.e. A2 - A0 and A3 - A1) instead of
;; repeating the comparison at .L2 as the "-" version did.
___cmpsi2:
- cmp.w A2,A0
- bne .L2
- cmp.w A3,A1
+ cmp.w A0,A2 ; flags from A2 - A0 (high words)
bne .L2 ; high words differ: they alone decide the result
+ cmp.w A1,A3 ; flags from A3 - A1 (low words)
+ bne .L4 ; low words differ
mov.w #1,A0 ; both words equal
rts
.L2:
- cmp.w A0,A2
- bgt .L4
- bne .L3
- cmp.w A1,A3
- bls .L3
-.L4:
- sub.w A0,A0
- rts
+ bgt .L5 ; signed: A2 > A0, so the answer is 0
.L3:
mov.w #2,A0 ; A0:A1 is the larger value
+ rts
+.L4:
+ bls .L3 ; unsigned: A3 below A1, so the answer is 2
.L5:
+ sub.w A0,A0 ; A0:A1 is the smaller value: return 0
rts
.end
#endif
.align 2
.global ___ucmpsi2
;; ___ucmpsi2: three-way unsigned comparison of two 32-bit values held as
;; the word pairs A0:A1 and A2:A3 (A0/A2 are compared first, so they act as
;; the high words).  Both word comparisons use unsigned branches (bhi/bls).
;; Result in A0: 0 if A0:A1 < A2:A3, 1 if equal, 2 if A0:A1 > A2:A3.
;; The "+" version branches directly on the flags of each cmp.w (set from
;; destination - source) instead of repeating the comparison at .L2.
___ucmpsi2:
- cmp.w A2,A0
- bne .L2
- cmp.w A3,A1
+ cmp.w A0,A2 ; flags from A2 - A0 (high words)
bne .L2 ; high words differ: they alone decide the result
+ cmp.w A1,A3 ; flags from A3 - A1 (low words)
+ bne .L4 ; low words differ
mov.w #1,A0 ; both words equal
rts
.L2:
- cmp.w A0,A2
- bhi .L4
- bne .L3
- cmp.w A1,A3
- bls .L3
-.L4:
- sub.w A0,A0
- rts
+ bhi .L5 ; unsigned: A2 above A0, so the answer is 0
.L3:
mov.w #2,A0 ; A0:A1 is the larger value
+ rts
+.L4:
+ bls .L3 ; unsigned: A3 below A1, so the answer is 2
.L5:
+ sub.w A0,A0 ; A0:A1 is the smaller value: return 0
rts
.end
#endif
;; "supporting routines".
; general purpose normalize routine
-;
+;
; divisor in A0
; dividend in A1
; turns both into +ve numbers, and leaves what the answer sign
.section .text
.align 2
;; divnorm (16-bit): make A0 and A1 non-negative and leave the sign of the
;; result in A2L.  In the "+" version A2L receives a copy of CCR taken right
;; after A0's high byte is tested, so bit 3 (the bit negans checks with
;; btst #3) reflects A0's sign; a negative A1 then flips that bit.  The other
;; bits of A2L are leftover CCR contents and carry no meaning here.
;; The "-" lines are the old scheme, which kept a 0/1 flag in A2L instead.
divnorm:
- mov.b #0x0,A2L
or A0H,A0H ; is divisor > 0
- bge _lab1
+ stc ccr,A2L ; snapshot the flags; the bge below still acts on the or result
+ bge _lab1
not A0H ; no - then make it +ve
not A0L
- adds #1,A0
- xor #0x1,A2L ; and remember that in A2L
+ adds #1,A0 ; complement plus one = two's-complement negate
_lab1: or A1H,A1H ; look at dividend
- bge _lab2
+ bge _lab2
not A1H ; it is -ve, make it positive
not A1L
adds #1,A1
- xor #0x1,A2L; and toggle sign of result
+ xor #0x8,A2L; and toggle sign of result (bit 3 of the saved CCR copy)
_lab2: rts
;; Basically the same, except that the sign of the divisor determines
;; the sign.
modnorm:
- mov.b #0x0,A2L
or A0H,A0H ; is divisor > 0
- bge _lab7
+ stc ccr,A2L
+ bge _lab7
not A0H ; no - then make it +ve
not A0L
- adds #1,A0
- xor #0x1,A2L ; and remember that in A2L
+ adds #1,A0
_lab7: or A1H,A1H ; look at dividend
- bge _lab8
+ bge _lab8
not A1H ; it is -ve, make it positive
not A1L
adds #1,A1
;; ___divhi3: signed 16-bit divide built from the unsigned one.
;; divnorm makes both operands non-negative and records the result sign in
;; A2L, ___udivhi3 does the division, and negans negates A0 when the result
;; should be negative.  The "+" version tests bit 3 of A2L (the saved CCR
;; copy written by divnorm); the "-" version tested A2L for non-zero.
;; NOTE(review): this relies on ___udivhi3 leaving A2L untouched -- the parts
;; of it visible here only use A2H as a loop counter; confirm.
___divhi3:
bsr divnorm
bsr ___udivhi3
-negans: or A2L,A2L ; should answer be negative ?
+negans: btst #3,A2L ; should answer be negative ?
beq _lab4 ; bit clear: keep the non-negative result
not A0H ; yes, so make it so
not A0L
adds #1,A0 ; complement plus one = two's-complement negate
-_lab4: rts
+_lab4: rts
; A0=A0%A1 signed
.global ___udivhi3
___udivhi3:
- ; A0 A1 A2 A3
+ ; A0 A1 A2 A3
; Nn Dd P
- sub.w A3,A3 ; Nn Dd xP 00
- or A1H,A1H
+ sub.w A3,A3 ; Nn Dd xP 00
+ or A1H,A1H
bne divlongway
- or A0H,A0H
- beq _lab6
+ or A0H,A0H
+ beq _lab6
; we know that D == 0 and N is != 0
mov.b A0H,A3L ; Nn Dd xP 0N
mov.b A3L,A0L ; Qq
mov.b A3H,A3L ; m
mov.b #0x0,A3H ; Qq 0m
- rts
+ rts
; D != 0 - which means the denominator is
; loop around to get the result.
mov.b #0x8,A2H ; 8
div8: add.b A0L,A0L ; n*=2
rotxl A3L ; Make remainder bigger
- rotxl A3H
+ rotxl A3H
sub.w A1,A3 ; Q-=N
bhs setbit ; set a bit ?
add.w A1,A3 ; no : too far , Q+=N
- dec A2H
- bne div8 ; next bit
- rts
+ dec A2H
+ bne div8 ; next bit
+ rts
setbit: inc A0L ; do insert bit
- dec A2H
- bne div8 ; next bit
- rts
+ dec A2H
+ bne div8 ; next bit
+ rts
#endif /* __H8300__ */
#endif /* L_divhi3 */
;; 4 byte integer divides for the H8/300.
;;
-;; We have one routine which does all the work and lots of
+;; We have one routine which does all the work and lots of
;; little ones which prepare the args and massage the sign.
;; We bunch all of this into one object file since there are several
;; "supporting routines".
#ifdef __H8300__
divnorm:
- mov.b #0,S2L ; keep the sign in S2
mov.b A0H,A0H ; is the numerator -ve
+ stc ccr,S2L ; keep the sign in bit 3 of S2L
bge postive
; negate arg
addx #0,A1H
addx #0,A0L
addx #0,A0H
-
- mov.b #1,S2L ; the sign will be -ve
postive:
mov.b A2H,A2H ; is the denominator -ve
bge postive2
- not A2L
+ not A2L
not A2H
not A3L
not A3H
- add.b #1,A3L
+ add.b #1,A3L
addx #0,A3H
addx #0,A2L
addx #0,A2H
- xor #1,S2L ; toggle result sign
+ xor.b #0x08,S2L ; toggle the result sign
postive2:
rts
;; Basically the same, except that the sign of the divisor determines
;; the sign.
modnorm:
- mov.b #0,S2L ; keep the sign in S2
mov.b A0H,A0H ; is the numerator -ve
+ stc ccr,S2L ; keep the sign in bit 3 of S2L
bge mpostive
; negate arg
addx #0,A1H
addx #0,A0L
addx #0,A0H
-
- mov.b #1,S2L ; the sign will be -ve
mpostive:
mov.b A2H,A2H ; is the denominator -ve
bge mpostive2
- not A2L
+ not A2L
not A2H
not A3L
not A3H
- add.b #1,A3L
+ add.b #1,A3L
addx #0,A3H
addx #0,A2L
addx #0,A2H
#else /* __H8300H__ */
;; divnorm (32-bit, non-H8/300 build): make A0P and A1P non-negative and
;; leave the sign of the result in S2L.  In the "+" version S2L receives a
;; copy of CCR taken right after A0P is tested, so bit 3 reflects the
;; numerator's sign; a negative denominator then flips that bit.  exitdiv
;; later checks it with btst #3.  The other bits of S2L are leftover CCR
;; contents.  The "-" lines are the old scheme, which kept a 0/1 flag.
divnorm:
- mov.b #0,S2L ; keep the sign in S2
mov.l A0P,A0P ; is the numerator -ve
+ stc ccr,S2L ; keep the sign in bit 3 of S2L
bge postive ; flags still come from the mov.l test above
neg.l A0P ; negate arg
- mov.b #1,S2L ; the sign will be -ve
postive:
mov.l A1P,A1P ; is the denominator -ve
bge postive2
neg.l A1P ; negate arg
- xor.b #1,S2L ; toggle result sign
+ xor.b #0x08,S2L ; toggle the result sign
postive2:
rts
;; Basically the same, except that the sign of the divisor determines
;; the sign.
modnorm:
- mov.b #0,S2L ; keep the sign in S2
mov.l A0P,A0P ; is the numerator -ve
+ stc ccr,S2L ; keep the sign in bit 3 of S2L
bge mpostive
neg.l A0P ; negate arg
- mov.b #1,S2L ; the sign will be -ve
mpostive:
mov.l A1P,A1P ; is the denominator -ve
; denominator in A2/A3
;; ___modsi3: signed 32-bit remainder.
;; H8/300: save S2, S0 and S1, normalise the operand signs with modnorm,
;; run divmodsi4, then copy the remainder it leaves in S0/S1 into A0/A1.
;; Other CPUs ("+" lines): only S2 is saved; ___udivsi3 leaves the remainder
;; in er3, which is copied to er0.
;; Both paths end at exitdiv, which applies the sign recorded in S2L and
;; restores the saved registers before returning.
.global ___modsi3
___modsi3:
- PUSHP S2P
+#ifdef __H8300__
+ PUSHP S2P
PUSHP S0P
PUSHP S1P
-
bsr modnorm
bsr divmodsi4
-#ifdef __H8300__
mov S0,A0 ; remainder (S0/S1) becomes the return value
mov S1,A1
+ bra exitdiv
#else
- mov.l S0P,A0P
-#endif
+ PUSHP S2P
+ bsr modnorm
+ bsr ___udivsi3
+ mov.l er3,er0 ; remainder (er3) becomes the return value
bra exitdiv
+#endif
+ ;; H8/300H and H8S version of ___udivsi3 is defined later in
+ ;; the file.
;; ___udivsi3 (H8/300 only): unsigned 32-bit divide.
;; Saves S2, S0 and S1, lets divmodsi4 compute the quotient (left in A0/A1)
;; and returns through reti, which restores the saved registers.
;; The "+" version branches to reti rather than exitdiv: no sign fix-up is
;; wanted for an unsigned result, so S2L no longer has to be cleared first.
+#ifdef __H8300__
.global ___udivsi3
___udivsi3:
PUSHP S2P
PUSHP S0P
PUSHP S1P
- mov.b #0,S2L ; keep sign low
bsr divmodsi4
- bra exitdiv
+ bra reti ; skip the sign test at exitdiv
+#endif
;; ___umodsi3: unsigned 32-bit remainder.
;; H8/300: save S2, S0 and S1, run divmodsi4, copy the remainder it leaves
;; in S0/S1 into A0/A1 and return through reti (register restore only, no
;; sign fix-up).
;; Other CPUs ("+" lines): call ___udivsi3, which leaves the remainder in
;; er3, and return it in er0; nothing is pushed on this path.
.global ___umodsi3
___umodsi3:
+#ifdef __H8300__
PUSHP S2P
PUSHP S0P
PUSHP S1P
- mov.b #0,S2L ; keep sign low
bsr divmodsi4
-#ifdef __H8300__
mov S0,A0 ; remainder (S0/S1) becomes the return value
mov S1,A1
+ bra reti
#else
- mov.l S0P,A0P
+ bsr ___udivsi3
+ mov.l er3,er0 ; remainder (er3) becomes the return value
+ rts
#endif
- bra exitdiv
-
+
.global ___divsi3
___divsi3:
+#ifdef __H8300__
PUSHP S2P
PUSHP S0P
PUSHP S1P
jsr divnorm
jsr divmodsi4
+#else
+ PUSHP S2P
+ jsr divnorm
+ bsr ___udivsi3
+#endif
; examine what the sign should be
exitdiv:
- POPP S1P
- POPP S0P
-
- or S2L,S2L
+ btst #3,S2L
beq reti
-
+
; should be -ve
#ifdef __H8300__
not A0H
#endif
reti:
+#ifdef __H8300__
+ POPP S1P
+ POPP S0P
+#endif
POPP S2P
- rts
+ rts
- ; takes A0/A1 numerator (A0P for 300H)
- ; A2/A3 denominator (A1P for 300H)
- ; returns A0/A1 quotient (A0P for 300H)
- ; S0/S1 remainder (S0P for 300H)
- ; trashes S2
+ ; takes A0/A1 numerator (A0P for H8/300H)
+ ; A2/A3 denominator (A1P for H8/300H)
+ ; returns A0/A1 quotient (A0P for H8/300H)
+ ; S0/S1 remainder (S0P for H8/300H)
+ ; trashes S2H
#ifdef __H8300__
mov.b A2H,S2H
or A2L,S2H
or A3H,S2H
- bne DenHighZero
+ bne DenHighNonZero
mov.b A0H,A0H
bne NumByte0Zero
mov.b A0L,A0L
mov.b S1H,S1L
mov.b #0x0,S1H
- rts
+ rts
; have to do the divide by shift and test
-DenHighZero:
+DenHighNonZero:
mov.b A0H,S1L
mov.b A0L,A0H
mov.b A1H,A0L
sub.w A3,S1 ; does it all fit
subx A2L,S0L
subx A2H,S0H
- bhs setone
+ bhs setone
add.w A3,S1 ; no, restore mistake
addx A2L,S0L
dec S2H
bne nextbit
- rts
-
+ rts
+
setone:
inc A1L
dec S2H
bne nextbit
- rts
+ rts
#else /* __H8300H__ */
;; ___udivsi3 (non-H8/300 build): unsigned 32-bit divide that also yields
;; the remainder.
;;   in:  er0 = dividend, er1 = divisor
;;   out: er0 = quotient, er3 = remainder
;; Two paths: when the divisor's top word is zero, two chained 32/16
;; divxu.w steps are used; otherwise (DenHighNonZero) an approximate
;; quotient is formed from shifted operands and then corrected.
;; A0P/A0E/A1/A1E/A2/A2P/A2E/A3/A3P are aliases defined outside this view,
;; presumably for er0..er3 and their halves -- confirm.
;; The "-" lines are the previous shift-and-subtract divmodsi4 body.
-divmodsi4:
- sub.l S0P,S0P ; zero play area
+ ;; This function also computes the remainder and stores it in er3.
+ .global ___udivsi3
+___udivsi3:
mov.w A1E,A1E ; denominator top word 0?
- bne DenHighZero
+ bne DenHighNonZero
; do it the easy way, see page 107 in manual
mov.w A0E,A2 ; high word of the dividend is divided first
;; NOTE(review): divxu.w divides the whole 32-bit A2P, but no instruction
;; visible here clears its upper word first -- confirm the full file has an
;; extu.l A2P (or equivalent) at this point.
divxu.w A1,A2P ; A2 = high quotient word, A2E = partial remainder
mov.w A2E,A0E ; partial remainder : low dividend word is the next dividend
divxu.w A1,A0P ; A0 = low quotient word, A0E = final remainder
- mov.w A0E,S0
+ mov.w A0E,A3 ; remainder goes to A3
mov.w A2,A0E ; high quotient word goes to the top of the result
- extu.l S0P
+ extu.l A3P ; zero-extend the 16-bit remainder
rts
-DenHighZero:
- mov.w A0E,A2
- mov.b A2H,S0L
- mov.b A2L,A2H
- mov.b A0H,A2L
- mov.w A2,A0E
- mov.b A0L,A0H
- mov.b #0,A0L
- mov.b #24,S2H ; only do 24 iterations
-
-nextbit:
- shll.l A0P ; double the answer guess
- rotxl.l S0P ; double remainder
- sub.l A1P,S0P ; does it all fit?
- bhs setone
-
- add.l A1P,S0P ; no, restore mistake
- dec S2H
- bne nextbit
- rts
-
-setone:
- inc A0L
- dec S2H
- bne nextbit
+ ; er0 = er0 / er1
+ ; er3 = er0 % er1
+ ; trashes er1 er2
+ ; expects er1 >= 2^16
+DenHighNonZero:
+ mov.l er0,er3 ; keep the unshifted dividend for the remainder
+ mov.l er1,er2 ; working copy of the divisor
+#ifdef __H8300H__
;; One bit per iteration: shift dividend and divisor right together until
;; the divisor's top word is zero.
+divmod_L21:
+ shlr.l er0
+ shlr.l er2 ; make divisor < 2^16
+ mov.w e2,e2
+ bne divmod_L21
+#else
;; Two bits per iteration; the dividend shift lags one step behind the
;; divisor shift and is settled at divmod_L22A..divmod_L24 below.
+ shlr.l #2,er2 ; make divisor < 2^16
+ mov.w e2,e2
+ beq divmod_L22A
+divmod_L21:
+ shlr.l #2,er0
+divmod_L22:
+ shlr.l #2,er2 ; make divisor < 2^16
+ mov.w e2,e2
+ bne divmod_L21
+divmod_L22A:
;; The last 2-bit shift may have gone one bit further than needed: try
;; rotating one bit back into r2.  If that overflows 16 bits (carry set),
;; undo it and shift the dividend by the full two bits; otherwise keep the
;; wider divisor and shift the dividend by one bit only.
+ rotxl.w r2
+ bcs divmod_L23
+ shlr.l er0
+ bra divmod_L24
+divmod_L23:
+ rotxr.w r2
+ shlr.l #2,er0
+divmod_L24:
+#endif
+ ;; At this point,
+ ;; er0 contains shifted dividend
+ ;; er1 contains divisor
+ ;; er2 contains shifted divisor
+ ;; er3 contains dividend, later remainder
+ divxu.w r2,er0 ; r0 now contains the approximate quotient (AQ)
+ extu.l er0 ; drop the divxu remainder half, keep AQ
+ beq divmod_L25 ; AQ == 0: er3 already holds the tentative remainder
+ subs #1,er0 ; er0 = AQ - 1
+ mov.w e1,r2
+ mulxu.w r0,er2 ; er2 = upper (AQ - 1) * divisor
+ sub.w r2,e3 ; dividend - 65536 * er2
+ mov.w r1,r2
+ mulxu.w r0,er2 ; compute er3 = remainder (tentative)
+ sub.l er2,er3 ; er3 = dividend - (AQ - 1) * divisor
+divmod_L25:
+ cmp.l er1,er3 ; is remainder >= divisor? (blo skips the fix-up if not)
+ blo divmod_L26
+ adds #1,er0 ; quotient was one too small
+ sub.l er1,er3 ; correct the remainder
+divmod_L26:
rts
#endif
;; HImode multiply.
; The H8/300 only has an 8*8->16 multiply.
; The answer is the same as:
-;
+;
; product = (srca.l * srcb.l) + ((srca.h * srcb.l) + (srcb.h * srca.l)) * 256
; (we can ignore A1.h * A0.h because that product falls entirely off the top)
; A0 in
-; A1 in
+; A1 in
; A0 answer
#ifdef __H8300__
.global ___mulhi3
___mulhi3:
mov.b A1L,A2L ; A2l gets srcb.l
- mulxu A0L,A2 ; A2 gets first sub product
+ mulxu A0L,A2 ; A2 gets first sub product
mov.b A0H,A3L ; prepare for
mulxu A1L,A3 ; second sub product
add.b A3L,A2H ; sum first two terms
mov.b A1H,A3L ; third sub product
- mulxu A0L,A3
+ mulxu A0L,A3
add.b A3L,A2H ; almost there
mov.w A2,A0 ; that is
#ifdef L_mulsi3
;; SImode multiply.
-;;
+;;
;; I think that shift and add may be sufficient for this. Using the
;; supplied 8x8->16 would need 10 ops of 14 cycles each + overhead. This way
;; the inner loop uses maybe 20 cycles + overhead, but terminates
;; A0/A1 src_a
;; A2/A3 src_b
;;
-;; while (a)
+;; while (a)
;; {
;; if (a & 1)
;; r += b;
___mulsi3:
PUSHP S0P
PUSHP S1P
- PUSHP S2P
-
+
sub.w S0,S0
sub.w S1,S1
-
+
; while (a)
_top: mov.w A0,A0
bne _more
rotxr A0L
rotxr A1H
rotxr A1L
-
+
; b <<= 1
add.w A3,A3
addx A2L,A2L
bra _top
_done:
- mov.w S0,A0
+ mov.w S0,A0
mov.w S1,A1
- POPP S2P
POPP S1P
POPP S0P
rts
#else /* __H8300H__ */
+;
+; mulsi3 for H8/300H - based on Renesas SH implementation
+;
+; by Toshiyasu Morita
+;
+; Old code:
+;
+; 16b * 16b = 372 states (worst case)
+; 32b * 32b = 724 states (worst case)
+;
+; New code:
+;
+; 16b * 16b = 48 states
+; 16b * 32b = 72 states
+; 32b * 32b = 92 states
+;
+
;; ___mulsi3 (non-H8/300 build): 32 x 32 -> low 32 bits multiply.
;;   in:  er0 = a:b (e0 = a, r0 = b), er1 = c:d (e1 = c, r1 = d)
;;   out: er0 = b*d + ((a*d + c*b) << 16), truncated to 32 bits
;; Only the low word of each cross product is added (into e2); the a*c term
;; is never formed because it cannot reach the low 32 bits.  A cross
;; product is skipped when its high-word factor (a or c) is zero.
;; The "-" lines are the previous shift-and-add loop.
.global ___mulsi3
___mulsi3:
- sub.l A2P,A2P
+ mov.w r1,r2 ; ( 2 states) b * d
+ mulxu r0,er2 ; (22 states)
- ; while (a)
-_top: mov.l A0P,A0P
- beq _done
+ mov.w e0,r3 ; ( 2 states) a * d
+ beq L_skip1 ; ( 4 states)
+ mulxu r1,er3 ; (22 states)
+ add.w r3,e2 ; ( 2 states)
- ; if (a & 1)
- bld #0,A0L
- bcc _nobit
+L_skip1:
+ mov.w e1,r3 ; ( 2 states) c * b
+ beq L_skip2 ; ( 4 states)
+ mulxu r0,er3 ; (22 states)
+ add.w r3,e2 ; ( 2 states)
- ; r += b
- add.l A1P,A2P
-
-_nobit:
- ; a >>= 1
- shlr.l A0P
-
- ; b <<= 1
- shll.l A1P
- bra _top
-
-_done:
- mov.l A2P,A0P
- rts
+L_skip2:
+ mov.l er2,er0 ; ( 2 states)
+ rts ; (10 states)
#endif
#endif /* L_mulsi3 */
space. For the H8/300H and H8S, the C version is good enough. */
#ifdef __H8300__
/* We still treat NANs different than libgcc2.c, but then, the
- behaviour is undefined anyways. */
+ behavior is undefined anyways. */
.global ___fixunssfsi
___fixunssfsi:
- cmp.b #0x47,r0h
+ cmp.b #0x4f,r0h
bge Large_num
jmp @___fixsfsi
Large_num: