X-Git-Url: https://oss.titaniummirror.com/gitweb?p=msp430-binutils.git;a=blobdiff_plain;f=ld%2Femultempl%2Fspu_ovl.S;h=509397a6090731a53e5f01ce49ad8a3d13143b5d;hp=96601d744e2f41dac380fe1ee7769bd8b023ad32;hb=88750007d7869f178f0ba528f41efd3b74c424cf;hpb=6df9443a374e2b81278c61b8afc0a1eef7db280b diff --git a/ld/emultempl/spu_ovl.S b/ld/emultempl/spu_ovl.S index 96601d7..509397a 100644 --- a/ld/emultempl/spu_ovl.S +++ b/ld/emultempl/spu_ovl.S @@ -1,6 +1,6 @@ /* Overlay manager for SPU. - Copyright 2006, 2007 Free Software Foundation, Inc. + Copyright 2006, 2007, 2008 Free Software Foundation, Inc. This file is part of the GNU Binutils. @@ -19,164 +19,258 @@ Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA. */ -/** - * MFC DMA defn's. - */ +/* MFC DMA defn's. */ #define MFC_GET_CMD 0x40 #define MFC_MAX_DMA_SIZE 0x4000 #define MFC_TAG_UPDATE_ALL 2 #define MFC_TAG_ID 0 - -/** - * Temporary register allocations. - * These are saved/restored here. - */ -#define tab $75 -#define cgbits $75 -#define add64 $75 -#define ealo $75 -#define newmask $75 -#define tagstat $75 -#define bchn $75 -#define rv1 $75 - -#define off $76 -#define off64 $76 -#define maxsize $76 -#define oldmask $76 -#define sz $76 -#define lnkr $76 -#define rv2 $76 - -#define cur $77 -#define cmp $77 -#define buf $77 -#define genwi $77 -#define tagid $77 -#define cmd $77 -#define rv3 $77 - -#define cgshuf $78 - -#define vma $6 - -#define map $7 -#define osize $7 -#define cmp2 $7 - -#define ea64 $8 -#define retval $8 - -#ifdef OVLY_IRQ_SAVE -#define irqtmp $8 -#define irq_stat $9 -#endif - - .extern _ovly_table - .extern _ovly_buf_table +/* Register usage. 
*/ +#define reserved1 $75 +#define parm $75 +#define tab1 reserved1 +#define tab2 reserved1 +#define vma reserved1 +#define oldvma reserved1 +#define newmask reserved1 +#define map reserved1 + +#define reserved2 $76 +#define off1 reserved2 +#define off2 reserved2 +#define present1 reserved2 +#define present2 reserved2 +#define sz reserved2 +#define cmp reserved2 +#define add64 reserved2 +#define cgbits reserved2 +#define off3 reserved2 +#define off4 reserved2 +#define addr4 reserved2 +#define off5 reserved2 +#define tagstat reserved2 + +#define reserved3 $77 +#define size1 reserved3 +#define size2 reserved3 +#define rv3 reserved3 +#define ealo reserved3 +#define cmd reserved3 +#define off64 reserved3 +#define tab3 reserved3 +#define tab4 reserved3 +#define tab5 reserved3 + +#define reserved4 $78 +#define ovl reserved4 +#define rv2 reserved4 +#define rv5 reserved4 +#define cgshuf reserved4 +#define newovl reserved4 +#define irqtmp1 reserved4 +#define irqtmp2 reserved4 + +#define reserved5 $79 +#define target reserved5 + +#define save1 $74 +#define rv4 save1 +#define rv7 save1 +#define tagid save1 +#define maxsize save1 +#define pbyte save1 +#define pbit save1 + +#define save2 $73 +#define cur save2 +#define rv6 save2 +#define osize save2 +#define zovl save2 +#define oldovl save2 +#define newvma save2 + +#define save3 $72 +#define rv1 save3 +#define ea64 save3 +#define buf3 save3 +#define genwi save3 +#define newmap save3 +#define oldmask save3 + +#define save4 $71 +#define irq_stat save4 .text - .align 4 - .type __rv_pattern, @object - .size __rv_pattern, 16 + .align 4 + .type __rv_pattern, @object + .size __rv_pattern, 16 __rv_pattern: - .word 0x00010203, 0x1c1d1e1f, 0x00010203, 0x10111213 - .type __cg_pattern, @object - .size __cg_pattern, 16 + .word 0x00010203, 0x10111213, 0x80808080, 0x80808080 + + .type __cg_pattern, @object + .size __cg_pattern, 16 __cg_pattern: - .word 0x04050607, 0x80808080, 0x80808080, 0x80808080 + .word 0x04050607, 0x80808080, 0x80808080, 
0x80808080 -/** + .type __ovly_current, @object + .size __ovly_current, 16 +__ovly_current: + .space 16 + +/* * __ovly_return - stub for returning from overlay functions. * - * inputs: - * $lr link register + * On entry the four slots of $lr are: + * __ovly_return, prev ovl index, caller return addr, undefined. * - * outputs: - * $78 old partition number, to be reloaded - * $79 return address in old partion number + * Load the previous overlay and jump to the caller return address. + * Updates __ovly_current. */ - .global __ovly_return - .type __ovly_return, @function - - .word 0 + .align 4 + .global __ovly_return + .type __ovly_return, @function __ovly_return: - shlqbyi $78, $lr, 4 - shlqbyi $79, $lr, 8 - biz $78, $79 - .size __ovly_return, . - __ovly_return - -/** + ila tab1, _ovly_table - 16 # 0,2 0 + shlqbyi ovl, $lr, 4 # 1,4 0 +#nop + shlqbyi target, $lr, 8 # 1,4 1 +#nop; lnop +#nop; lnop + shli off1, ovl, 4 # 0,4 4 +#lnop +#nop + hbr ovly_ret9, target # 1,15 5 +#nop; lnop +#nop; lnop +#nop + lqx vma, tab1, off1 # 1,6 8 +#ifdef OVLY_IRQ_SAVE + nop + stqd save4, -64($sp) # 1,6 9 +#else +#nop; lnop +#endif +#nop; lnop +#nop; lnop +#nop; lnop +#nop; lnop +#nop + rotqbyi size1, vma, 4 # 1,4 14 +#nop + stqd save3, -48($sp) # 1,6 15 +#nop + stqd save2, -32($sp) # 1,6 16 +#nop + stqd save1, -16($sp) # 1,6 17 + andi present1, size1, 1 # 0,2 18 + stqr ovl, __ovly_current # 1,6 18 +#nop; lnop +#nop + brz present1, do_load # 1,4 20 +ovly_ret9: +#nop + bi target # 1,4 21 + +/* * __ovly_load - copy an overlay partion to local store. * - * inputs: - * $78 partition number to be loaded. - * $79 branch target in new partition. - * $lr link register, containing return addr. - * - * outputs: - * $lr new link register, returning through __ovly_return. + * On entry $75 points to a word consisting of the overlay index in + * the top 14 bits, and the target address in the bottom 18 bits. 
* - * Copy a new overlay partition into local store, or return - * immediately if the partition is already resident. + * Sets up $lr to return via __ovly_return. If $lr is already set + * to return via __ovly_return, don't change it. In that case we + * have a tail call from one overlay function to another. + * Updates __ovly_current. */ - .global __ovly_load - .type __ovly_load, @function - + .align 3 + .global __ovly_load + .type __ovly_load, @function __ovly_load: -/* Save temporary registers to stack. */ - stqd $6, -16($sp) - stqd $7, -32($sp) - stqd $8, -48($sp) +#if OVL_STUB_SIZE == 8 +######## +#nop + lqd target, 0(parm) # 1,6 -11 +#nop; lnop +#nop; lnop +#nop; lnop +#nop; lnop +#nop; lnop +#nop + rotqby target, target, parm # 1,4 -5 + ila tab2, _ovly_table - 16 # 0,2 -4 + stqd save3, -48($sp) # 1,6 -4 +#nop + stqd save2, -32($sp) # 1,6 -3 +#nop + stqd save1, -16($sp) # 1,6 -2 + rotmi ovl, target, -18 # 0,4 -1 + hbr ovly_load9, target # 1,15 -1 + ila rv1, __ovly_return # 0,2 0 +#lnop +#nop; lnop +#nop + lqr cur, __ovly_current # 1,6 2 + shli off2, ovl, 4 # 0,4 3 + stqr ovl, __ovly_current # 1,6 3 + ceq rv2, $lr, rv1 # 0,2 4 + lqr rv3, __rv_pattern # 1,6 4 +#nop; lnop +#nop; lnop +#nop + lqx vma, tab2, off2 # 1,6 7 +######## +#else /* OVL_STUB_SIZE == 16 */ +######## + ila tab2, _ovly_table - 16 # 0,2 0 + stqd save3, -48($sp) # 1,6 0 + ila rv1, __ovly_return # 0,2 1 + stqd save2, -32($sp) # 1,6 1 + shli off2, ovl, 4 # 0,4 2 + lqr cur, __ovly_current # 1,6 2 + nop + stqr ovl, __ovly_current # 1,6 3 + ceq rv2, $lr, rv1 # 0,2 4 + lqr rv3, __rv_pattern # 1,6 4 +#nop + hbr ovly_load9, target # 1,15 5 +#nop + lqx vma, tab2, off2 # 1,6 6 +#nop + stqd save1, -16($sp) # 1,6 7 +######## +#endif +#nop; lnop +#nop; lnop +#nop + shufb rv4, rv1, cur, rv3 # 1,4 10 +#nop + fsmb rv5, rv2 # 1,4 11 +#nop + rotqmbyi rv6, $lr, -8 # 1,4 12 +#nop + rotqbyi size2, vma, 4 # 1,4 13 +#nop + lqd save3, -48($sp) # 1,6 14 +#nop; lnop + or rv7, rv4, rv6 # 0,2 16 + lqd save2, -32($sp) # 1,6 
16 + andi present2, size2, 1 # 0,2 17 #ifdef OVLY_IRQ_SAVE -/* Save irq state, then disable interrupts. */ - stqd $9, -64($sp) - ila irqtmp, __ovly_irq_save - rdch irq_stat, $SPU_RdMachStat - bid irqtmp -__ovly_irq_save: + stqd save4, -64($sp) # 1,6 17 +#else + lnop # 1,0 17 #endif - -/* Set branch hint to overlay target. */ - hbr __ovly_load_ret, $79 - -/* Get caller's overlay index by back chaining through stack frames. - * Loop until end of stack (back chain all-zeros) or - * encountered a link register we set here. */ - lqd bchn, 0($sp) - ila retval, __ovly_return - -__ovly_backchain_loop: - lqd lnkr, 16(bchn) - lqd bchn, 0(bchn) - ceq cmp, lnkr, retval - ceqi cmp2, bchn, 0 - or cmp, cmp, cmp2 - brz cmp, __ovly_backchain_loop - -/* If we reached the zero back-chain, then lnkr is bogus. Clear the - * part of lnkr that we use later (slot 3). */ - rotqbyi cmp2, cmp2, 4 - andc lnkr, lnkr, cmp2 - -/* Set lr = {__ovly_return, prev ovl ndx, caller return adr, callee ovl ndx}. */ - lqd rv1, (__rv_pattern-__ovly_return+4)(retval) - shufb rv2, retval, lnkr, rv1 - shufb rv3, $lr, $78, rv1 - fsmbi rv1, 0xff - selb rv2, rv2, rv3, rv1 -/* If we have a tail call from one overlay function to another overlay, - then lr is already set up. Don't change it. */ - ceq rv1, $lr, retval - fsmb rv1, rv1 - selb $lr, rv2, $lr, rv1 - -/* Branch to $79 if non-overlay */ - brz $78, __ovly_load_restore - -/* Load values from _ovly_table[$78]. + selb $lr, rv7, $lr, rv5 # 0,2 18 + lqd save1, -16($sp) # 1,6 18 +#nop + brz present2, do_load # 1,4 19 +ovly_load9: +#nop + bi target # 1,4 20 + +/* If we get here, we are about to load a new overlay. + * "vma" contains the relevant entry from _ovly_table[]. * extern struct { * u32 vma; * u32 size; @@ -184,111 +278,194 @@ __ovly_backchain_loop: * u32 buf; * } _ovly_table[]; */ - shli off, $78, 4 - ila tab, _ovly_table - 16 - lqx vma, tab, off - rotqbyi buf, vma, 12 - -/* Load values from _ovly_buf_table[buf]. 
- * extern struct { - * u32 mapped; - * } _ovly_buf_table[]; - */ - ila tab, _ovly_buf_table - ai off, buf, -1 - shli off, off, 2 - lqx map, tab, off - rotqby cur, map, off - -/* Branch to $79 now if overlay is already mapped. */ - ceq cmp, $78, cur - brnz cmp, __ovly_load_restore - -/* Marker for profiling code. If we get here, we are about to load - * a new overlay. - */ - .global __ovly_load_event - .type __ovly_load_event, @function + .align 3 + .global __ovly_load_event + .type __ovly_load_event, @function __ovly_load_event: - -/* Set _ovly_buf_table[buf].mapped = $78. */ - cwx genwi, tab, off - shufb map, $78, map, genwi - stqx map, tab, off - -/* A new partition needs to be loaded. Prepare for DMA loop. - * _EAR_ is the 64b base EA, filled in at run time by the - * loader, and indicating the value for SPU executable image start. - */ - lqd cgshuf, (__cg_pattern-__ovly_return+4)(retval) - rotqbyi osize, vma, 4 - rotqbyi sz, vma, 8 - lqa ea64, _EAR_ - +do_load: +#ifdef OVLY_IRQ_SAVE + ila irqtmp1, do_load10 # 0,2 -5 + rotqbyi sz, vma, 8 # 1,4 -5 +#nop + rdch irq_stat, $SPU_RdMachStat # 1,6 -4 +#nop + bid irqtmp1 # 1,4 -3 +do_load10: + nop +#else +#nop + rotqbyi sz, vma, 8 # 1,4 0 +#endif + rotqbyi osize, vma, 4 # 1,4 1 +#nop + lqa ea64, _EAR_ # 1,6 2 +#nop + lqr cgshuf, __cg_pattern # 1,6 3 + +/* We could predict the branch at the end of this loop by adding a few + instructions, and there are plenty of free cycles to do so without + impacting loop execution time. However, it doesn't make a great + deal of sense since we need to wait for the dma to complete anyway. */ __ovly_xfer_loop: -/* 64b add to compute next ea64. */ - rotqmbyi off64, sz, -4 - cg cgbits, ea64, off64 - shufb add64, cgbits, cgbits, cgshuf - addx add64, ea64, off64 - ori ea64, add64, 0 - -/* Setup DMA parameters, then issue DMA request. 
*/ - rotqbyi ealo, add64, 4 - ila maxsize, MFC_MAX_DMA_SIZE - cgt cmp, osize, maxsize - selb sz, osize, maxsize, cmp - ila tagid, MFC_TAG_ID - wrch $MFC_LSA, vma - wrch $MFC_EAH, ea64 - wrch $MFC_EAL, ealo - wrch $MFC_Size, sz - wrch $MFC_TagId, tagid - ila cmd, MFC_GET_CMD - wrch $MFC_Cmd, cmd - -/* Increment vma, decrement size, branch back as needed. */ - a vma, vma, sz - sf osize, sz, osize - brnz osize, __ovly_xfer_loop - -/* Save app's tagmask, wait for DMA complete, restore mask. */ - rdch oldmask, $MFC_RdTagMask +#nop + rotqmbyi off64, sz, -4 # 1,4 4 +#nop; lnop +#nop; lnop +#nop; lnop + cg cgbits, ea64, off64 # 0,2 8 +#lnop +#nop; lnop +#nop + shufb add64, cgbits, cgbits, cgshuf # 1,4 10 +#nop; lnop +#nop; lnop +#nop; lnop + addx add64, ea64, off64 # 0,2 14 +#lnop + ila maxsize, MFC_MAX_DMA_SIZE # 0,2 15 + lnop + ori ea64, add64, 0 # 0,2 16 + rotqbyi ealo, add64, 4 # 1,4 16 + cgt cmp, osize, maxsize # 0,2 17 + wrch $MFC_LSA, vma # 1,6 17 +#nop; lnop + selb sz, osize, maxsize, cmp # 0,2 19 + wrch $MFC_EAH, ea64 # 1,6 19 + ila tagid, MFC_TAG_ID # 0,2 20 + wrch $MFC_EAL, ealo # 1,6 20 + ila cmd, MFC_GET_CMD # 0,2 21 + wrch $MFC_Size, sz # 1,6 21 + sf osize, sz, osize # 0,2 22 + wrch $MFC_TagId, tagid # 1,6 22 + a vma, vma, sz # 0,2 23 + wrch $MFC_Cmd, cmd # 1,6 23 +#nop + brnz osize, __ovly_xfer_loop # 1,4 24 + +/* Now update our data structures while waiting for DMA to complete. + Low bit of .size needs to be cleared on the _ovly_table entry + corresponding to the evicted overlay, and set on the entry for the + newly loaded overlay. Note that no overlay may in fact be evicted + as _ovly_buf_table[] starts with all zeros. Don't zap .size entry + for zero index! Also of course update the _ovly_buf_table entry. 
*/ +#nop + lqr newovl, __ovly_current # 1,6 25 +#nop; lnop +#nop; lnop +#nop; lnop +#nop; lnop +#nop; lnop + shli off3, newovl, 4 # 0,4 31 +#lnop + ila tab3, _ovly_table - 16 # 0,2 32 +#lnop +#nop + fsmbi pbyte, 0x100 # 1,4 33 +#nop; lnop +#nop + lqx vma, tab3, off3 # 1,6 35 +#nop; lnop + andi pbit, pbyte, 1 # 0,2 37 + lnop +#nop; lnop +#nop; lnop +#nop; lnop + or newvma, vma, pbit # 0,2 41 + rotqbyi buf3, vma, 12 # 1,4 41 +#nop; lnop +#nop + stqx newvma, tab3, off3 # 1,6 43 +#nop; lnop + shli off4, buf3, 2 # 1,4 45 +#lnop + ila tab4, _ovly_buf_table - 4 # 0,2 46 +#lnop +#nop; lnop +#nop; lnop +#nop + lqx map, tab4, off4 # 1,6 49 +#nop + cwx genwi, tab4, off4 # 1,4 50 + a addr4, tab4, off4 # 0,2 51 +#lnop +#nop; lnop +#nop; lnop +#nop; lnop +#nop + rotqby oldovl, map, addr4 # 1,4 55 +#nop + shufb newmap, newovl, map, genwi # 0,4 56 #if MFC_TAG_ID < 16 - ilh newmask, 1 << MFC_TAG_ID + ila newmask, 1 << MFC_TAG_ID # 0,2 57 #else - ilhu newmask, 1 << (MFC_TAG_ID - 16) + ilhu newmask, 1 << (MFC_TAG_ID - 16) # 0,2 57 #endif - wrch $MFC_WrTagMask, newmask - ila tagstat, MFC_TAG_UPDATE_ALL - wrch $MFC_WrTagUpdate, tagstat - rdch tagstat, $MFC_RdTagStat - sync - wrch $MFC_WrTagMask, oldmask - - .global _ovly_debug_event - .type _ovly_debug_event, @function -_ovly_debug_event: -/* GDB inserts debugger trap here. */ - nop - -__ovly_load_restore: +#lnop +#nop; lnop +#nop; lnop + stqd newmap, 0(addr4) # 1,6 60 + +/* Save app's tagmask, wait for DMA complete, restore mask. */ + ila tagstat, MFC_TAG_UPDATE_ALL # 0,2 61 + rdch oldmask, $MFC_RdTagMask # 1,6 61 +#nop + wrch $MFC_WrTagMask, newmask # 1,6 62 +#nop + wrch $MFC_WrTagUpdate, tagstat # 1,6 63 +#nop + rdch tagstat, $MFC_RdTagStat # 1,6 64 +#nop + sync # 1,4 65 +/* Any hint prior to the sync is lost. A hint here allows the branch + to complete 15 cycles after the hint. With no hint the branch will + take 18 or 19 cycles. 
*/ + ila tab5, _ovly_table - 16 # 0,2 66 + hbr do_load99, target # 1,15 66 + shli off5, oldovl, 4 # 0,4 67 + wrch $MFC_WrTagMask, oldmask # 1,6 67 + ceqi zovl, oldovl, 0 # 0,2 68 +#lnop +#nop; lnop +#nop + fsm zovl, zovl # 1,4 70 +#nop + lqx oldvma, tab5, off5 # 1,6 71 +#nop + lqd save3, -48($sp) # 1,6 72 +#nop; lnop + andc pbit, pbit, zovl # 0,2 74 + lqd save2, -32($sp) # 1,6 74 #ifdef OVLY_IRQ_SAVE -/* Conditionally re-enable interrupts. */ - andi irq_stat, irq_stat, 1 - ila irqtmp, __ovly_irq_restore - binze irq_stat, irqtmp -__ovly_irq_restore: - lqd $9, -64($sp) + ila irqtmp2, do_load90 # 0,2 75 +#lnop + andi irq_stat, irq_stat, 1 # 0,2 76 +#lnop +#else +#nop; lnop +#nop; lnop +#endif + andc oldvma, oldvma, pbit # 0,2 77 + lqd save1, -16($sp) # 1,6 77 + nop # 0,0 78 +#lnop +#nop + stqx oldvma, tab5, off5 # 1,6 79 +#nop +#ifdef OVLY_IRQ_SAVE + binze irq_stat, irqtmp2 # 1,4 80 +do_load90: +#nop + lqd save4, -64($sp) # 1,6 84 +#else +#nop; lnop #endif -/* Restore saved registers. */ - lqd $8, -48($sp) - lqd $7, -32($sp) - lqd $6, -16($sp) - -__ovly_load_ret: + .global _ovly_debug_event + .type _ovly_debug_event, @function +_ovly_debug_event: + nop /* Branch to target address. */ - bi $79 +do_load99: + bi target # 1,4 81/85 - .size __ovly_load, . - __ovly_load + .size __ovly_load, . - __ovly_load