+ int i;
+ rtx note = NULL_RTX;
+
+ /* In .mbb and .bbb bundles, check if CALL_INSN isn't in the
+ first or second slot. If it is and has REG_EH_NOTE set, copy it
+ to following nops, as br.call sets rp to the address of following
+ bundle and therefore an EH region end must be on a bundle
+ boundary. */
+ insn = PREV_INSN (insn);
+ for (i = 0; i < 3; i++)
+ {
+ do
+ insn = next_active_insn (insn);
+ while (GET_CODE (insn) == INSN
+ && get_attr_empty (insn) == EMPTY_YES);
+ if (GET_CODE (insn) == CALL_INSN)
+ note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
+ else if (note)
+ {
+ int code;
+
+ gcc_assert ((code = recog_memoized (insn)) == CODE_FOR_nop
+ || code == CODE_FOR_nop_b);
+ if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
+ note = NULL_RTX;
+ else
+ REG_NOTES (insn)
+ = gen_rtx_EXPR_LIST (REG_EH_REGION, XEXP (note, 0),
+ REG_NOTES (insn));
+ }
+ }
+ }
+#endif
+}
+
+/* The following function does insn bundling. Bundling means
+ inserting templates and nop insns to fit insn groups into permitted
+ templates. Instruction scheduling uses NDFA (non-deterministic
+ finite automata) encoding informations about the templates and the
+ inserted nops. Nondeterminism of the automata permits follows
+ all possible insn sequences very fast.
+
+ Unfortunately it is not possible to get information about inserting
+ nop insns and used templates from the automata states. The
+ automata only says that we can issue an insn possibly inserting
+ some nops before it and using some template. Therefore insn
+ bundling in this function is implemented by using DFA
+ (deterministic finite automata). We follow all possible insn
+ sequences by inserting 0-2 nops (that is what the NDFA describe for
+ insn scheduling) before/after each insn being bundled. We know the
+ start of simulated processor cycle from insn scheduling (insn
+ starting a new cycle has TImode).
+
+ Simple implementation of insn bundling would create enormous
+ number of possible insn sequences satisfying information about new
+ cycle ticks taken from the insn scheduling. To make the algorithm
+ practical we use dynamic programming. Each decision (about
+ inserting nops and implicitly about previous decisions) is described
+ by structure bundle_state (see above). If we generate the same
+ bundle state (key is automaton state after issuing the insns and
+ nops for it), we reuse already generated one. As consequence we
+ reject some decisions which cannot improve the solution and
+ reduce memory for the algorithm.
+
+ When we reach the end of EBB (extended basic block), we choose the
+ best sequence and then, moving back in EBB, insert templates for
+ the best alternative. The templates are taken from querying
+ automaton state for each insn in chosen bundle states.
+
+ So the algorithm makes two (forward and backward) passes through
+ EBB. There is an additional forward pass through EBB for Itanium1
+ processor. This pass inserts more nops to make dependency between
+ a producer insn and MMMUL/MMSHF at least 4 cycles long. */
+
+static void
+bundling (FILE *dump, int verbose, rtx prev_head_insn, rtx tail)
+{
+ struct bundle_state *curr_state, *next_state, *best_state;
+ rtx insn, next_insn;
+ int insn_num;
+ int i, bundle_end_p, only_bundle_end_p, asm_p;
+ int pos = 0, max_pos, template0, template1;
+ rtx b;
+ rtx nop;
+ enum attr_type type;
+
+ insn_num = 0;
+ /* Count insns in the EBB. */
+ for (insn = NEXT_INSN (prev_head_insn);
+ insn && insn != tail;
+ insn = NEXT_INSN (insn))
+ if (INSN_P (insn))
+ insn_num++;
+ if (insn_num == 0)
+ return;
+ bundling_p = 1;
+ dfa_clean_insn_cache ();
+ initiate_bundle_state_table ();
+ index_to_bundle_states = XNEWVEC (struct bundle_state *, insn_num + 2);
+ /* First (forward) pass -- generation of bundle states. */
+ curr_state = get_free_bundle_state ();
+ curr_state->insn = NULL;
+ curr_state->before_nops_num = 0;
+ curr_state->after_nops_num = 0;
+ curr_state->insn_num = 0;
+ curr_state->cost = 0;
+ curr_state->accumulated_insns_num = 0;
+ curr_state->branch_deviation = 0;
+ curr_state->middle_bundle_stops = 0;
+ curr_state->next = NULL;
+ curr_state->originator = NULL;
+ state_reset (curr_state->dfa_state);
+ index_to_bundle_states [0] = curr_state;
+ insn_num = 0;
+ /* Shift cycle mark if it is put on insn which could be ignored. */
+ for (insn = NEXT_INSN (prev_head_insn);
+ insn != tail;
+ insn = NEXT_INSN (insn))
+ if (INSN_P (insn)
+ && (ia64_safe_itanium_class (insn) == ITANIUM_CLASS_IGNORE
+ || GET_CODE (PATTERN (insn)) == USE
+ || GET_CODE (PATTERN (insn)) == CLOBBER)
+ && GET_MODE (insn) == TImode)
+ {
+ PUT_MODE (insn, VOIDmode);
+ for (next_insn = NEXT_INSN (insn);
+ next_insn != tail;
+ next_insn = NEXT_INSN (next_insn))
+ if (INSN_P (next_insn)
+ && ia64_safe_itanium_class (next_insn) != ITANIUM_CLASS_IGNORE
+ && GET_CODE (PATTERN (next_insn)) != USE
+ && GET_CODE (PATTERN (next_insn)) != CLOBBER
+ && INSN_CODE (next_insn) != CODE_FOR_insn_group_barrier)
+ {
+ PUT_MODE (next_insn, TImode);
+ break;
+ }
+ }
+ /* Forward pass: generation of bundle states. */
+ for (insn = get_next_important_insn (NEXT_INSN (prev_head_insn), tail);
+ insn != NULL_RTX;
+ insn = next_insn)
+ {
+ gcc_assert (INSN_P (insn)
+ && ia64_safe_itanium_class (insn) != ITANIUM_CLASS_IGNORE
+ && GET_CODE (PATTERN (insn)) != USE
+ && GET_CODE (PATTERN (insn)) != CLOBBER);
+ type = ia64_safe_type (insn);
+ next_insn = get_next_important_insn (NEXT_INSN (insn), tail);
+ insn_num++;
+ index_to_bundle_states [insn_num] = NULL;
+ for (curr_state = index_to_bundle_states [insn_num - 1];
+ curr_state != NULL;
+ curr_state = next_state)
+ {
+ pos = curr_state->accumulated_insns_num % 3;
+ next_state = curr_state->next;
+ /* We must fill up the current bundle in order to start a
+ subsequent asm insn in a new bundle. Asm insn is always
+ placed in a separate bundle. */
+ only_bundle_end_p
+ = (next_insn != NULL_RTX
+ && INSN_CODE (insn) == CODE_FOR_insn_group_barrier
+ && ia64_safe_type (next_insn) == TYPE_UNKNOWN);
+ /* We may fill up the current bundle if it is the cycle end
+ without a group barrier. */
+ bundle_end_p
+ = (only_bundle_end_p || next_insn == NULL_RTX
+ || (GET_MODE (next_insn) == TImode
+ && INSN_CODE (insn) != CODE_FOR_insn_group_barrier));
+ if (type == TYPE_F || type == TYPE_B || type == TYPE_L
+ || type == TYPE_S
+ /* We need to insert 2 nops for cases like M_MII. To
+ guarantee issuing all insns on the same cycle for
+ Itanium 1, we need to issue 2 nops after the first M
+ insn (MnnMII where n is a nop insn). */
+ || ((type == TYPE_M || type == TYPE_A)
+ && ia64_tune == PROCESSOR_ITANIUM
+ && !bundle_end_p && pos == 1))
+ issue_nops_and_insn (curr_state, 2, insn, bundle_end_p,
+ only_bundle_end_p);
+ issue_nops_and_insn (curr_state, 1, insn, bundle_end_p,
+ only_bundle_end_p);
+ issue_nops_and_insn (curr_state, 0, insn, bundle_end_p,
+ only_bundle_end_p);
+ }
+ gcc_assert (index_to_bundle_states [insn_num]);
+ for (curr_state = index_to_bundle_states [insn_num];
+ curr_state != NULL;
+ curr_state = curr_state->next)
+ if (verbose >= 2 && dump)
+ {
+ /* This structure is taken from generated code of the
+ pipeline hazard recognizer (see file insn-attrtab.c).
+ Please don't forget to change the structure if a new
+ automaton is added to .md file. */
+ struct DFA_chip
+ {
+ unsigned short one_automaton_state;
+ unsigned short oneb_automaton_state;
+ unsigned short two_automaton_state;
+ unsigned short twob_automaton_state;
+ };
+
+ fprintf
+ (dump,
+ "// Bundle state %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, mid.stops %d state %d) for %d\n",
+ curr_state->unique_num,
+ (curr_state->originator == NULL
+ ? -1 : curr_state->originator->unique_num),
+ curr_state->cost,
+ curr_state->before_nops_num, curr_state->after_nops_num,
+ curr_state->accumulated_insns_num, curr_state->branch_deviation,
+ curr_state->middle_bundle_stops,
+ (ia64_tune == PROCESSOR_ITANIUM
+ ? ((struct DFA_chip *) curr_state->dfa_state)->oneb_automaton_state
+ : ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state),
+ INSN_UID (insn));
+ }
+ }
+
+ /* We should find a solution because the 2nd insn scheduling has
+ found one. */
+ gcc_assert (index_to_bundle_states [insn_num]);
+ /* Find a state corresponding to the best insn sequence. */
+ best_state = NULL;
+ for (curr_state = index_to_bundle_states [insn_num];
+ curr_state != NULL;
+ curr_state = curr_state->next)
+ /* We are just looking at the states with fully filled up last
+ bundle. The first we prefer insn sequences with minimal cost
+ then with minimal inserted nops and finally with branch insns
+ placed in the 3rd slots. */
+ if (curr_state->accumulated_insns_num % 3 == 0
+ && (best_state == NULL || best_state->cost > curr_state->cost
+ || (best_state->cost == curr_state->cost
+ && (curr_state->accumulated_insns_num
+ < best_state->accumulated_insns_num
+ || (curr_state->accumulated_insns_num
+ == best_state->accumulated_insns_num
+ && (curr_state->branch_deviation
+ < best_state->branch_deviation
+ || (curr_state->branch_deviation
+ == best_state->branch_deviation
+ && curr_state->middle_bundle_stops
+ < best_state->middle_bundle_stops)))))))
+ best_state = curr_state;
+ /* Second (backward) pass: adding nops and templates. */
+ gcc_assert (best_state);
+ insn_num = best_state->before_nops_num;
+ template0 = template1 = -1;
+ for (curr_state = best_state;
+ curr_state->originator != NULL;
+ curr_state = curr_state->originator)
+ {
+ insn = curr_state->insn;
+ asm_p = (GET_CODE (PATTERN (insn)) == ASM_INPUT
+ || asm_noperands (PATTERN (insn)) >= 0);
+ insn_num++;
+ if (verbose >= 2 && dump)
+ {
+ struct DFA_chip
+ {
+ unsigned short one_automaton_state;
+ unsigned short oneb_automaton_state;
+ unsigned short two_automaton_state;
+ unsigned short twob_automaton_state;
+ };
+
+ fprintf
+ (dump,
+ "// Best %d (orig %d, cost %d, nops %d/%d, insns %d, branch %d, mid.stops %d, state %d) for %d\n",
+ curr_state->unique_num,
+ (curr_state->originator == NULL
+ ? -1 : curr_state->originator->unique_num),
+ curr_state->cost,
+ curr_state->before_nops_num, curr_state->after_nops_num,
+ curr_state->accumulated_insns_num, curr_state->branch_deviation,
+ curr_state->middle_bundle_stops,
+ (ia64_tune == PROCESSOR_ITANIUM
+ ? ((struct DFA_chip *) curr_state->dfa_state)->oneb_automaton_state
+ : ((struct DFA_chip *) curr_state->dfa_state)->twob_automaton_state),
+ INSN_UID (insn));
+ }
+ /* Find the position in the current bundle window. The window can
+ contain at most two bundles. Two bundle window means that
+ the processor will make two bundle rotation. */
+ max_pos = get_max_pos (curr_state->dfa_state);
+ if (max_pos == 6
+ /* The following (negative template number) means that the
+ processor did one bundle rotation. */
+ || (max_pos == 3 && template0 < 0))
+ {
+ /* We are at the end of the window -- find template(s) for
+ its bundle(s). */
+ pos = max_pos;
+ if (max_pos == 3)
+ template0 = get_template (curr_state->dfa_state, 3);
+ else
+ {
+ template1 = get_template (curr_state->dfa_state, 3);
+ template0 = get_template (curr_state->dfa_state, 6);
+ }
+ }
+ if (max_pos > 3 && template1 < 0)
+ /* It may happen when we have the stop inside a bundle. */