C++程序  |  1615行  |  59.22 KB


/*---------------------------------------------------------------*/
/*--- begin                                 host_reg_alloc2.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex.h"

#include "main_util.h"
#include "host_generic_regs.h"

/* Set to 1 for lots of debugging output. */
#define DEBUG_REGALLOC 0


/* TODO 27 Oct 04:

   Better consistency checking from what isMove tells us.

   We can possibly do V-V coalescing even when the src is spilled,
   providing we can arrange for the dst to have the same spill slot.

   Note that state[].hreg is the same as the available real regs.

   Generally rationalise data structures.  */


/* Records information on virtual register live ranges.  Computed once
   and remains unchanged after that. */
typedef
   struct {
      /* Becomes live for the first time after this insn ... */
      Short live_after;
      /* Becomes dead for the last time before this insn ... */
      Short dead_before;
      /* The "home" spill slot, if needed.  Never changes. */
      Short spill_offset;
      Short spill_size;
      /* What kind of register this is. */
      HRegClass reg_class;
   }
   VRegLR;


/* Records information on real-register live ranges.  Computed once
   and remains unchanged after that. */
typedef
   struct {
      HReg rreg;
      /* Becomes live after this insn ... */
      Short live_after;
      /* Becomes dead before this insn ... */
      Short dead_before;
   }
   RRegLR;


/* An array of the following structs (rreg_state) comprises the
   running state of the allocator.  It indicates what the current
   disposition of each allocatable real register is.  The array gets
   updated as the allocator processes instructions.  The identity of
   the register is not recorded here, because the index of this
   structure in doRegisterAllocation()'s |rreg_state| is the index
   number of the register, and the register itself can be extracted
   from the RRegUniverse supplied to doRegisterAllocation(). */
typedef
   struct {
      /* ------ FIELDS WHICH DO NOT CHANGE ------ */
      /* Is this involved in any HLRs?  (only an optimisation hint) */
      Bool has_hlrs;
      /* ------ FIELDS WHICH DO CHANGE ------ */
      /* 6 May 07: rearranged fields below so the whole struct fits
         into 16 bytes on both x86 and amd64. */
      /* Used when .disp == Bound and we are looking for vregs to
         spill. */
      Bool is_spill_cand;
      /* Optimisation: used when .disp == Bound.  Indicates when the
         rreg has the same value as the spill slot for the associated
         vreg.  Is safely left at False, and becomes True after a
         spill store or reload for this rreg. */
      Bool eq_spill_slot;
      /* What's it's current disposition? */
      enum { Free,     /* available for use */
             Unavail,  /* in a real-reg live range */
             Bound     /* in use (holding value of some vreg) */
           }
           disp;
      /* If .disp == Bound, what vreg is it bound to? */
      HReg vreg;
   }
   RRegState;


/* The allocator also maintains a redundant array of indexes
   (vreg_state) from vreg numbers back to entries in rreg_state.  It
   is redundant because iff vreg_state[i] == j then
   hregNumber(rreg_state[j].vreg) == i -- that is, the two entries
   point at each other.  The purpose of this is to speed up activities
   which involve looking for a particular vreg: there is no need to
   scan the rreg_state looking for it, just index directly into
   vreg_state.  The FAQ "does this vreg already have an associated
   rreg" is the main beneficiary.  

   To indicate, in vreg_state[i], that a given vreg is not currently
   associated with any rreg, that entry can be set to INVALID_RREG_NO.

   Because the vreg_state entries are signed Shorts, the max number
   of vregs that can be handed by regalloc is 32767.
*/

#define INVALID_RREG_NO ((Short)(-1))

#define IS_VALID_VREGNO(_zz) ((_zz) >= 0 && (_zz) < n_vregs)
#define IS_VALID_RREGNO(_zz) ((_zz) >= 0 && (_zz) < n_rregs)


/* Search forward from some given point in the incoming instruction
   sequence.  Point is to select a virtual register to spill, by
   finding the vreg which is mentioned as far ahead as possible, in
   the hope that this will minimise the number of consequent reloads.

   Only do the search for vregs which are Bound in the running state,
   and for which the .is_spill_cand field is set.  This allows the
   caller to arbitrarily restrict the set of spill candidates to be
   considered.

   To do this we don't actually need to see the incoming instruction
   stream.  Rather, what we need us the HRegUsage records for the
   incoming instruction stream.  Hence that is passed in.

   Returns an index into the state array indicating the (v,r) pair to
   spill, or -1 if none was found.  */
static
Int findMostDistantlyMentionedVReg ( 
   HRegUsage*   reg_usages_in,
   Int          search_from_instr,
   Int          num_instrs,
   RRegState*   state,
   Int          n_state
)
{
   Int k, m;
   Int furthest_k = -1;
   Int furthest   = -1;
   vassert(search_from_instr >= 0);
   for (k = 0; k < n_state; k++) {
      if (!state[k].is_spill_cand)
         continue;
      vassert(state[k].disp == Bound);
      for (m = search_from_instr; m < num_instrs; m++) {
         if (HRegUsage__contains(&reg_usages_in[m], state[k].vreg))
            break;
      }
      if (m > furthest) {
         furthest   = m;
         furthest_k = k;
      }
   }
   return furthest_k;
}


/* Check that this vreg has been assigned a sane spill offset. */
inline
static void sanity_check_spill_offset ( VRegLR* vreg )
{
   switch (vreg->reg_class) {
      case HRcVec128: case HRcFlt64:
         vassert(0 == ((UShort)vreg->spill_offset % 16)); break;
      default:
         vassert(0 == ((UShort)vreg->spill_offset % 8)); break;
   }
}


/* Double the size of the real-reg live-range array, if needed. */
__attribute__((noinline)) 
static void ensureRRLRspace_SLOW ( RRegLR** info, Int* size, Int used )
{
   Int     k;
   RRegLR* arr2;
   if (0)
      vex_printf("ensureRRISpace: %d -> %d\n", *size, 2 * *size);
   vassert(used == *size);
   arr2 = LibVEX_Alloc_inline(2 * *size * sizeof(RRegLR));
   for (k = 0; k < *size; k++)
      arr2[k] = (*info)[k];
   *size *= 2;
   *info = arr2;
}
inline
static void ensureRRLRspace ( RRegLR** info, Int* size, Int used )
{
   if (LIKELY(used < *size)) return;
   ensureRRLRspace_SLOW(info, size, used);
}


/* Sort an array of RRegLR entries by either the .live_after or
   .dead_before fields.  This is performance-critical. */
static void sortRRLRarray ( RRegLR* arr, 
                            Int size, Bool by_live_after )
{
   Int    incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
                       9841, 29524, 88573, 265720,
                       797161, 2391484 };
   Int    lo = 0;
   Int    hi = size-1;
   Int    i, j, h, bigN, hp;
   RRegLR v;

   vassert(size >= 0);
   if (size == 0)
      return;

   bigN = hi - lo + 1; if (bigN < 2) return;
   hp = 0; while (hp < 14 && incs[hp] < bigN) hp++; hp--;

   if (by_live_after) {

      for ( ; hp >= 0; hp--) {
         h = incs[hp];
         for (i = lo + h; i <= hi; i++) {
            v = arr[i];
            j = i;
            while (arr[j-h].live_after > v.live_after) {
               arr[j] = arr[j-h];
               j = j - h;
               if (j <= (lo + h - 1)) break;
            }
            arr[j] = v;
         }
      }

   } else {

      for ( ; hp >= 0; hp--) {
         h = incs[hp];
         for (i = lo + h; i <= hi; i++) {
            v = arr[i];
            j = i;
            while (arr[j-h].dead_before > v.dead_before) {
               arr[j] = arr[j-h];
               j = j - h;
               if (j <= (lo + h - 1)) break;
            }
            arr[j] = v;
         }
      }

   }
}


/* Compute the index of the highest and lowest 1 in a ULong,
   respectively.  Results are undefined if the argument is zero.
   Don't pass it zero :) */
static inline UInt ULong__maxIndex ( ULong w64 ) {
   return 63 - __builtin_clzll(w64);
}

static inline UInt ULong__minIndex ( ULong w64 ) {
   return __builtin_ctzll(w64);
}


/* Vectorised memset, copied from Valgrind's m_libcbase.c. */
static void* local_memset ( void *destV, Int c, SizeT sz )
{
#  define IS_4_ALIGNED(aaa_p) (0 == (((HWord)(aaa_p)) & ((HWord)0x3)))

   UInt   c4;
   UChar* d = destV;
   UChar  uc = c;

   while ((!IS_4_ALIGNED(d)) && sz >= 1) {
      d[0] = uc;
      d++;
      sz--;
   }
   if (sz == 0)
      return destV;
   c4 = uc;
   c4 |= (c4 << 8);
   c4 |= (c4 << 16);
   while (sz >= 16) {
      ((UInt*)d)[0] = c4;
      ((UInt*)d)[1] = c4;
      ((UInt*)d)[2] = c4;
      ((UInt*)d)[3] = c4;
      d += 16;
      sz -= 16;
   }
   while (sz >= 4) {
      ((UInt*)d)[0] = c4;
      d += 4;
      sz -= 4;
   }
   while (sz >= 1) {
      d[0] = c;
      d++;
      sz--;
   }
   return destV;

#  undef IS_4_ALIGNED
}


/* A target-independent register allocator.  Requires various
   functions which it uses to deal abstractly with instructions and
   registers, since it cannot have any target-specific knowledge.

   Returns a new list of instructions, which, as a result of the
   behaviour of mapRegs, will be in-place modifications of the
   original instructions.

   Requires that the incoming code has been generated using
   vreg numbers 0, 1 .. n_vregs-1.  Appearance of a vreg outside
   that range is a checked run-time error.

   Takes an expandable array of pointers to unallocated insns.
   Returns an expandable array of pointers to allocated insns.
*/
HInstrArray* doRegisterAllocation (

   /* Incoming virtual-registerised code. */ 
   HInstrArray* instrs_in,

   /* The real-register universe to use.  This contains facts about
      real registers, one of which is the set of registers available
      for allocation. */
   const RRegUniverse* univ,

   /* Return True iff the given insn is a reg-reg move, in which
      case also return the src and dst regs. */
   Bool (*isMove) ( const HInstr*, HReg*, HReg* ),

   /* Get info about register usage in this insn. */
   void (*getRegUsage) ( HRegUsage*, const HInstr*, Bool ),

   /* Apply a reg-reg mapping to an insn. */
   void (*mapRegs) ( HRegRemap*, HInstr*, Bool ),

   /* Return one, or, if we're unlucky, two insn(s) to spill/restore a
      real reg to a spill slot byte offset.  The two leading HInstr**
      args are out parameters, through which the generated insns are
      returned.  Also (optionally) a 'directReload' function, which
      attempts to replace a given instruction by one which reads
      directly from a specified spill slot.  May be NULL, in which
      case the optimisation is not attempted. */
   void    (*genSpill)  ( HInstr**, HInstr**, HReg, Int, Bool ),
   void    (*genReload) ( HInstr**, HInstr**, HReg, Int, Bool ),
   HInstr* (*directReload) ( HInstr*, HReg, Short ),
   Int     guest_sizeB,

   /* For debug printing only. */
   void (*ppInstr) ( const HInstr*, Bool ),
   void (*ppReg) ( HReg ),

   /* 32/64bit mode */
   Bool mode64
)
{
#  define N_SPILL64S  (LibVEX_N_SPILL_BYTES / 8)

   const Bool eq_spill_opt = True;

   /* Info on vregs and rregs.  Computed once and remains
      unchanged. */
   Int     n_vregs;
   VRegLR* vreg_lrs; /* [0 .. n_vregs-1] */

   /* We keep two copies of the real-reg live range info, one sorted
      by .live_after and the other by .dead_before.  First the
      unsorted info is created in the _la variant is copied into the
      _db variant.  Once that's done both of them are sorted. 
      We also need two integer cursors which record the next
      location in the two arrays to consider. */
   RRegLR* rreg_lrs_la;
   RRegLR* rreg_lrs_db;
   Int     rreg_lrs_size;
   Int     rreg_lrs_used;
   Int     rreg_lrs_la_next;
   Int     rreg_lrs_db_next;

   /* Info on register usage in the incoming instruction array.
      Computed once and remains unchanged, more or less; updated
      sometimes by the direct-reload optimisation. */
   HRegUsage* reg_usage_arr; /* [0 .. instrs_in->arr_used-1] */

   /* Used when constructing vreg_lrs (for allocating stack
      slots). */
   Short ss_busy_until_before[N_SPILL64S];

   /* Used when constructing rreg_lrs. */
   Int* rreg_live_after;
   Int* rreg_dead_before;

   /* Running state of the core allocation algorithm. */
   RRegState* rreg_state;  /* [0 .. n_rregs-1] */
   Int        n_rregs;

   /* .. and the redundant backward map */
   /* Each value is 0 .. n_rregs-1 or is INVALID_RREG_NO.
      This inplies n_rregs must be <= 32768. */
   Short*     vreg_state;  /* [0 .. n_vregs-1] */

   /* The vreg -> rreg map constructed and then applied to each
      instr. */
   HRegRemap remap;

   /* The output array of instructions. */
   HInstrArray* instrs_out;

   /* Sanity checks are expensive.  They are only done periodically,
      not at each insn processed. */
   Bool do_sanity_check;

   vassert(0 == (guest_sizeB % LibVEX_GUEST_STATE_ALIGN));
   vassert(0 == (LibVEX_N_SPILL_BYTES % LibVEX_GUEST_STATE_ALIGN));
   vassert(0 == (N_SPILL64S % 2));

   /* The live range numbers are signed shorts, and so limiting the
      number of insns to 15000 comfortably guards against them
      overflowing 32k. */
   vassert(instrs_in->arr_used <= 15000);

#  define INVALID_INSTRNO (-2)

#  define EMIT_INSTR(_instr)                  \
      do {                                    \
        HInstr* _tmp = (_instr);              \
        if (DEBUG_REGALLOC) {                 \
           vex_printf("**  ");                \
           (*ppInstr)(_tmp, mode64);          \
           vex_printf("\n\n");                \
        }                                     \
        addHInstr ( instrs_out, _tmp );       \
      } while (0)

#   define PRINT_STATE						   \
      do {							   \
         Int z, q;						   \
         for (z = 0; z < n_rregs; z++) {			   \
            vex_printf("  rreg_state[%2d] = ", z);		   \
            (*ppReg)(univ->regs[z]);	       			   \
            vex_printf("  \t");					   \
            switch (rreg_state[z].disp) {			   \
               case Free:    vex_printf("Free\n"); break;	   \
               case Unavail: vex_printf("Unavail\n"); break;	   \
               case Bound:   vex_printf("BoundTo "); 		   \
                             (*ppReg)(rreg_state[z].vreg);	   \
                             vex_printf("\n"); break;		   \
            }							   \
         }							   \
         vex_printf("\n  vreg_state[0 .. %d]:\n    ", n_vregs-1);  \
         q = 0;                                                    \
         for (z = 0; z < n_vregs; z++) {                           \
            if (vreg_state[z] == INVALID_RREG_NO)                  \
               continue;                                           \
            vex_printf("[%d] -> %d   ", z, vreg_state[z]);         \
            q++;                                                   \
            if (q > 0 && (q % 6) == 0)                             \
               vex_printf("\n    ");                               \
         }                                                         \
         vex_printf("\n");                                         \
      } while (0)


   /* --------- Stage 0: set up output array --------- */
   /* --------- and allocate/initialise running state. --------- */

   instrs_out = newHInstrArray();

   /* ... and initialise running state. */
   /* n_rregs is no more than a short name for n_available_real_regs. */
   n_rregs = univ->allocable;
   n_vregs = instrs_in->n_vregs;

   /* If this is not so, vreg_state entries will overflow. */
   vassert(n_vregs < 32767);

   /* If this is not so, the universe we have is nonsensical. */
   vassert(n_rregs > 0);

   rreg_state = LibVEX_Alloc_inline(n_rregs * sizeof(RRegState));
   vreg_state = LibVEX_Alloc_inline(n_vregs * sizeof(Short));

   for (Int j = 0; j < n_rregs; j++) {
      rreg_state[j].has_hlrs      = False;
      rreg_state[j].disp          = Free;
      rreg_state[j].vreg          = INVALID_HREG;
      rreg_state[j].is_spill_cand = False;
      rreg_state[j].eq_spill_slot = False;
   }

   for (Int j = 0; j < n_vregs; j++)
      vreg_state[j] = INVALID_RREG_NO;


   /* --------- Stage 1: compute vreg live ranges. --------- */
   /* --------- Stage 2: compute rreg live ranges. --------- */

   /* ------ start of SET UP TO COMPUTE VREG LIVE RANGES ------ */

   /* This is relatively simple, because (1) we only seek the complete
      end-to-end live range of each vreg, and are not interested in
      any holes in it, and (2) the vregs are conveniently numbered 0
      .. n_vregs-1, so we can just dump the results in a
      pre-allocated array. */

   vreg_lrs = NULL;
   if (n_vregs > 0)
      vreg_lrs = LibVEX_Alloc_inline(sizeof(VRegLR) * n_vregs);

   for (Int j = 0; j < n_vregs; j++) {
      vreg_lrs[j].live_after     = INVALID_INSTRNO;
      vreg_lrs[j].dead_before    = INVALID_INSTRNO;
      vreg_lrs[j].spill_offset   = 0;
      vreg_lrs[j].spill_size     = 0;
      vreg_lrs[j].reg_class      = HRcINVALID;
   }

   /* An array to hold the reg-usage info for the incoming
      instructions. */
   reg_usage_arr
      = LibVEX_Alloc_inline(sizeof(HRegUsage) * instrs_in->arr_used-1);

   /* ------ end of SET UP TO COMPUTE VREG LIVE RANGES ------ */

   /* ------ start of SET UP TO COMPUTE RREG LIVE RANGES ------ */

   /* This is more complex than Stage 1, because we need to compute
      exactly all the live ranges of all the allocatable real regs,
      and we don't know in advance how many there will be. */

   rreg_lrs_used = 0;
   rreg_lrs_size = 4;
   rreg_lrs_la = LibVEX_Alloc_inline(rreg_lrs_size * sizeof(RRegLR));
   rreg_lrs_db = NULL; /* we'll create this later */

   /* We'll need to track live range start/end points seperately for
      each rreg.  Sigh. */
   vassert(n_rregs > 0);
   rreg_live_after  = LibVEX_Alloc_inline(n_rregs * sizeof(Int));
   rreg_dead_before = LibVEX_Alloc_inline(n_rregs * sizeof(Int));

   for (Int j = 0; j < n_rregs; j++) {
      rreg_live_after[j] = 
      rreg_dead_before[j] = INVALID_INSTRNO;
   }

   /* ------ end of SET UP TO COMPUTE RREG LIVE RANGES ------ */

   /* ------ start of ITERATE OVER INSNS ------ */

   for (Int ii = 0; ii < instrs_in->arr_used; ii++) {

      (*getRegUsage)( &reg_usage_arr[ii], instrs_in->arr[ii], mode64 );

      if (0) {
         vex_printf("\n%d  stage1: ", ii);
         (*ppInstr)(instrs_in->arr[ii], mode64);
         vex_printf("\n");
         ppHRegUsage(univ, &reg_usage_arr[ii]);
      }

      /* ------ start of DEAL WITH VREG LIVE RANGES ------ */

      /* for each virtual reg mentioned in the insn ... */
      for (Int j = 0; j < reg_usage_arr[ii].n_vRegs; j++) {

         HReg vreg = reg_usage_arr[ii].vRegs[j];
         vassert(hregIsVirtual(vreg));

         Int k = hregIndex(vreg);
         if (k < 0 || k >= n_vregs) {
            vex_printf("\n");
            (*ppInstr)(instrs_in->arr[ii], mode64);
            vex_printf("\n");
            vex_printf("vreg %d, n_vregs %d\n", k, n_vregs);
            vpanic("doRegisterAllocation: out-of-range vreg");
         }

         /* Take the opportunity to note its regclass.  We'll need
            that when allocating spill slots. */
         if (vreg_lrs[k].reg_class == HRcINVALID) {
            /* First mention of this vreg. */
            vreg_lrs[k].reg_class = hregClass(vreg);
         } else {
            /* Seen it before, so check for consistency. */
            vassert(vreg_lrs[k].reg_class == hregClass(vreg));
         }

         /* Now consider live ranges. */
         switch (reg_usage_arr[ii].vMode[j]) {
            case HRmRead: 
               if (vreg_lrs[k].live_after == INVALID_INSTRNO) {
                  vex_printf("\n\nOFFENDING VREG = %d\n", k);
                  vpanic("doRegisterAllocation: "
                         "first event for vreg is Read");
               }
               vreg_lrs[k].dead_before = toShort(ii + 1);
               break;
            case HRmWrite:
               if (vreg_lrs[k].live_after == INVALID_INSTRNO)
                  vreg_lrs[k].live_after = toShort(ii);
               vreg_lrs[k].dead_before = toShort(ii + 1);
               break;
            case HRmModify:
               if (vreg_lrs[k].live_after == INVALID_INSTRNO) {
                  vex_printf("\n\nOFFENDING VREG = %d\n", k);
                  vpanic("doRegisterAllocation: "
                         "first event for vreg is Modify");
               }
               vreg_lrs[k].dead_before = toShort(ii + 1);
               break;
            default:
               vpanic("doRegisterAllocation(1)");
         } /* switch */

      } /* iterate over virtual registers */

      /* ------ end of DEAL WITH VREG LIVE RANGES ------ */

      /* ------ start of DEAL WITH RREG LIVE RANGES ------ */

      /* If this doesn't hold, the following iteration over real registers
         will fail miserably. */
      vassert(N_RREGUNIVERSE_REGS == 64);

      const ULong rRead      = reg_usage_arr[ii].rRead;
      const ULong rWritten   = reg_usage_arr[ii].rWritten;
      const ULong rMentioned = rRead | rWritten;

      UInt rReg_minIndex;
      UInt rReg_maxIndex;
      if (rMentioned == 0) {
         /* There are no real register uses in this insn.  Set
            rReg_{min,max}Index so that the following loop doesn't iterate
            at all, so as to avoid wasting time. */
         rReg_minIndex = 1;
         rReg_maxIndex = 0;
      } else {
         rReg_minIndex = ULong__minIndex(rMentioned);
         rReg_maxIndex = ULong__maxIndex(rMentioned);
         /* Don't bother to look at registers which are not available
            to the allocator.  We asserted above that n_rregs > 0, so
            n_rregs-1 is safe. */
         if (rReg_maxIndex >= n_rregs)
            rReg_maxIndex = n_rregs-1;
      }

      /* for each allocator-available real reg mentioned in the insn ... */
      /* Note.  We are allocating only over the real regs available to
         the allocator.  Others, eg the stack or baseblock pointers,
         are unavailable to allocation and so we never visit them.
         Hence the iteration is cut off at n_rregs-1, since n_rregs ==
         univ->allocable. */
      for (Int j = rReg_minIndex; j <= rReg_maxIndex; j++) {

         const ULong jMask = 1ULL << j;
         if (LIKELY((rMentioned & jMask) == 0))
            continue;

         const Bool isR = (rRead    & jMask) != 0;
         const Bool isW = (rWritten & jMask) != 0;

         /* Dummy initialisations of flush_la and flush_db to avoid
            possible bogus uninit-var warnings from gcc. */
         Int  flush_la = INVALID_INSTRNO, flush_db = INVALID_INSTRNO;
         Bool flush = False;

         if (isW && !isR) {
            flush_la = rreg_live_after[j];
            flush_db = rreg_dead_before[j];
            if (flush_la != INVALID_INSTRNO && flush_db != INVALID_INSTRNO)
               flush = True;
            rreg_live_after[j]  = ii;
            rreg_dead_before[j] = ii+1;
         } else if (!isW && isR) {
            if (rreg_live_after[j] == INVALID_INSTRNO) {
               vex_printf("\nOFFENDING RREG = ");
               (*ppReg)(univ->regs[j]);
               vex_printf("\n");
               vex_printf("\nOFFENDING instr = ");
               (*ppInstr)(instrs_in->arr[ii], mode64);
               vex_printf("\n");
               vpanic("doRegisterAllocation: "
                      "first event for rreg is Read");
            }
            rreg_dead_before[j] = ii+1;
         } else {
            vassert(isR && isW);
            if (rreg_live_after[j] == INVALID_INSTRNO) {
               vex_printf("\nOFFENDING RREG = ");
               (*ppReg)(univ->regs[j]);
               vex_printf("\n");
               vex_printf("\nOFFENDING instr = ");
               (*ppInstr)(instrs_in->arr[ii], mode64);
               vex_printf("\n");
               vpanic("doRegisterAllocation: "
                      "first event for rreg is Modify");
            }
            rreg_dead_before[j] = ii+1;
         }

         if (flush) {
            vassert(flush_la != INVALID_INSTRNO);
            vassert(flush_db != INVALID_INSTRNO);
            ensureRRLRspace(&rreg_lrs_la, &rreg_lrs_size, rreg_lrs_used);
            if (0) 
               vex_printf("FLUSH 1 (%d,%d)\n", flush_la, flush_db);
            rreg_lrs_la[rreg_lrs_used].rreg        = univ->regs[j];
            rreg_lrs_la[rreg_lrs_used].live_after  = toShort(flush_la);
            rreg_lrs_la[rreg_lrs_used].dead_before = toShort(flush_db);
            rreg_lrs_used++;
         }

      } /* iterate over rregs in the instr */

      /* ------ end of DEAL WITH RREG LIVE RANGES ------ */

   } /* iterate over insns */

   /* ------ end of ITERATE OVER INSNS ------ */

   /* ------ start of FINALISE RREG LIVE RANGES ------ */

   /* Now finish up any live ranges left over. */
   for (Int j = 0; j < n_rregs; j++) {

      if (0) {
         vex_printf("residual %d:  %d %d\n", j, rreg_live_after[j],
                                                rreg_dead_before[j]);
      }
      vassert( (rreg_live_after[j] == INVALID_INSTRNO 
                && rreg_dead_before[j] == INVALID_INSTRNO)
              ||
               (rreg_live_after[j] != INVALID_INSTRNO 
                && rreg_dead_before[j] != INVALID_INSTRNO)
            );

      if (rreg_live_after[j] == INVALID_INSTRNO)
         continue;

      ensureRRLRspace(&rreg_lrs_la, &rreg_lrs_size, rreg_lrs_used);
      if (0)
         vex_printf("FLUSH 2 (%d,%d)\n", 
                    rreg_live_after[j], rreg_dead_before[j]);
      rreg_lrs_la[rreg_lrs_used].rreg        = univ->regs[j];
      rreg_lrs_la[rreg_lrs_used].live_after  = toShort(rreg_live_after[j]);
      rreg_lrs_la[rreg_lrs_used].dead_before = toShort(rreg_dead_before[j]);
      rreg_lrs_used++;
   }

   /* Compute summary hints for choosing real regs.  If a real reg is
      involved in a hard live range, record that fact in the fixed
      part of the running rreg_state.  Later, when offered a choice between
      rregs, it's better to choose one which is not marked as having
      any HLRs, since ones with HLRs may need to be spilled around
      their HLRs.  Correctness of final assignment is unaffected by
      this mechanism -- it is only an optimisation. */

   for (Int j = 0; j < rreg_lrs_used; j++) {
      HReg rreg = rreg_lrs_la[j].rreg;
      vassert(!hregIsVirtual(rreg));
      /* rreg is involved in a HLR.  Record this info in the array, if
         there is space. */
      UInt ix = hregIndex(rreg);
      vassert(ix < n_rregs);
      rreg_state[ix].has_hlrs = True;
   }
   if (0) {
      for (Int j = 0; j < n_rregs; j++) {
         if (!rreg_state[j].has_hlrs)
            continue;
         ppReg(univ->regs[j]);
         vex_printf(" hinted\n");
      }
   }

   /* Finally, copy the _la variant into the _db variant and
      sort both by their respective fields. */
   rreg_lrs_db = LibVEX_Alloc_inline(rreg_lrs_used * sizeof(RRegLR));
   for (Int j = 0; j < rreg_lrs_used; j++)
      rreg_lrs_db[j] = rreg_lrs_la[j];

   sortRRLRarray( rreg_lrs_la, rreg_lrs_used, True /* by .live_after*/  );
   sortRRLRarray( rreg_lrs_db, rreg_lrs_used, False/* by .dead_before*/ );

   /* And set up the cursors. */
   rreg_lrs_la_next = 0;
   rreg_lrs_db_next = 0;

   for (Int j = 1; j < rreg_lrs_used; j++) {
      vassert(rreg_lrs_la[j-1].live_after  <= rreg_lrs_la[j].live_after);
      vassert(rreg_lrs_db[j-1].dead_before <= rreg_lrs_db[j].dead_before);
   }

   /* ------ end of FINALISE RREG LIVE RANGES ------ */

   if (DEBUG_REGALLOC) {
      for (Int j = 0; j < n_vregs; j++) {
         vex_printf("vreg %d:  la = %d,  db = %d\n", 
                    j, vreg_lrs[j].live_after, vreg_lrs[j].dead_before );
      }
   }

   if (DEBUG_REGALLOC) {
      vex_printf("RRegLRs by LA:\n");
      for (Int j = 0; j < rreg_lrs_used; j++) {
         vex_printf("  ");
         (*ppReg)(rreg_lrs_la[j].rreg);
         vex_printf("      la = %d,  db = %d\n",
                    rreg_lrs_la[j].live_after, rreg_lrs_la[j].dead_before );
      }
      vex_printf("RRegLRs by DB:\n");
      for (Int j = 0; j < rreg_lrs_used; j++) {
         vex_printf("  ");
         (*ppReg)(rreg_lrs_db[j].rreg);
         vex_printf("      la = %d,  db = %d\n",
                    rreg_lrs_db[j].live_after, rreg_lrs_db[j].dead_before );
      }
   }

   /* --------- Stage 3: allocate spill slots. --------- */

   /* Each spill slot is 8 bytes long.  For vregs which take more than
      64 bits to spill (classes Flt64 and Vec128), we have to allocate
      two consecutive spill slots.  For 256 bit registers (class
      Vec256), we have to allocate four consecutive spill slots.

      For Vec128-class on PowerPC, the spill slot's actual address
      must be 16-byte aligned.  Since the spill slot's address is
      computed as an offset from the guest state pointer, and since
      the user of the generated code must set that pointer to a
      32-aligned value, we have the residual obligation here of
      choosing a 16-aligned spill slot offset for Vec128-class values.
      Since each spill slot is 8 bytes long, that means for
      Vec128-class values we must allocated a spill slot number which
      is zero mod 2.

      Similarly, for Vec256 class on amd64, find a spill slot number
      which is zero mod 4.  This guarantees it will be 32 byte
      aligned, which isn't actually necessary on amd64 (we use movUpd
      etc to spill), but seems like good practice.

      Do a rank-based allocation of vregs to spill slot numbers.  We
      put as few values as possible in spill slots, but nevertheless
      need to have a spill slot available for all vregs, just in case.
   */
   /* Int max_ss_no = -1; */

   local_memset(ss_busy_until_before, 0, sizeof(ss_busy_until_before));

   for (Int j = 0; j < n_vregs; j++) {

      /* True iff this vreg is unused.  In which case we also expect
         that the reg_class field for it has not been set.  */
      if (vreg_lrs[j].live_after == INVALID_INSTRNO) {
         vassert(vreg_lrs[j].reg_class == HRcINVALID);
         continue;
      }

      /* The spill slots are 64 bits in size.  As per the comment on
         definition of HRegClass in host_generic_regs.h, that means,
         to spill a vreg of class Flt64 or Vec128, we'll need to find
         two adjacent spill slots to use.  For Vec256, we'll need to
         find four adjacent slots to use.  Note, this logic needs to
         kept in sync with the size info on the definition of
         HRegClass. */
      Int ss_no = -1;
      switch (vreg_lrs[j].reg_class) {

         case HRcVec128: case HRcFlt64:
            /* Find two adjacent free slots in which between them
               provide up to 128 bits in which to spill the vreg.
               Since we are trying to find an even:odd pair, move
               along in steps of 2 (slots). */
            for (ss_no = 0; ss_no < N_SPILL64S-1; ss_no += 2)
               if (ss_busy_until_before[ss_no+0] <= vreg_lrs[j].live_after
                   && ss_busy_until_before[ss_no+1] <= vreg_lrs[j].live_after)
                  break;
            if (ss_no >= N_SPILL64S-1) {
               vpanic("LibVEX_N_SPILL_BYTES is too low.  " 
                      "Increase and recompile.");
            }
            ss_busy_until_before[ss_no+0] = vreg_lrs[j].dead_before;
            ss_busy_until_before[ss_no+1] = vreg_lrs[j].dead_before;
            break;

         default:
            /* The ordinary case -- just find a single spill slot. */
            /* Find the lowest-numbered spill slot which is available
               at the start point of this interval, and assign the
               interval to it. */
            for (ss_no = 0; ss_no < N_SPILL64S; ss_no++)
               if (ss_busy_until_before[ss_no] <= vreg_lrs[j].live_after)
                  break;
            if (ss_no == N_SPILL64S) {
               vpanic("LibVEX_N_SPILL_BYTES is too low.  " 
                      "Increase and recompile.");
            }
            ss_busy_until_before[ss_no] = vreg_lrs[j].dead_before;
            break;

      } /* switch (vreg_lrs[j].reg_class) */

      /* This reflects LibVEX's hard-wired knowledge of the baseBlock
         layout: the guest state, then two equal sized areas following
         it for two sets of shadow state, and then the spill area. */
      vreg_lrs[j].spill_offset = toShort(guest_sizeB * 3 + ss_no * 8);

      /* Independent check that we've made a sane choice of slot */
      sanity_check_spill_offset( &vreg_lrs[j] );
      /* if (j > max_ss_no) */
      /*    max_ss_no = j; */
   }

   if (0) {
      vex_printf("\n\n");
      for (Int j = 0; j < n_vregs; j++)
         vex_printf("vreg %d    --> spill offset %d\n",
                    j, vreg_lrs[j].spill_offset);
   }

   /* --------- Stage 4: establish rreg preferences --------- */

   /* It may be advantageous to allocating certain vregs to specific
      rregs, as a way of avoiding reg-reg moves later.  Here we
      establish which, if any, rreg each vreg would prefer to be in.
      Note that this constrains the allocator -- ideally we end up
      with as few as possible vregs expressing a preference.  

      This is an optimisation: if the .preferred_rreg field is never
      set to anything different from INVALID_HREG, the allocator still
      works. */

   /* 30 Dec 04: removed this mechanism as it does not seem to
      help. */

   /* --------- Stage 5: process instructions --------- */

   /* This is the main loop of the allocator.  First, we need to
      correctly set up our running state, which tracks the status of
      each real register. */

   /* ------ BEGIN: Process each insn in turn. ------ */

   for (Int ii = 0; ii < instrs_in->arr_used; ii++) {

      if (DEBUG_REGALLOC) {
         vex_printf("\n====----====---- Insn %d ----====----====\n", ii);
         vex_printf("---- ");
         (*ppInstr)(instrs_in->arr[ii], mode64);
         vex_printf("\n\nInitial state:\n");
         PRINT_STATE;
         vex_printf("\n");
      }

      /* ------------ Sanity checks ------------ */

      /* Sanity checks are expensive.  So they are done only once
         every 17 instructions, and just before the last
         instruction. */
      do_sanity_check
         = toBool(
              False /* Set to True for sanity checking of all insns. */
              || ii == instrs_in->arr_used-1
              || (ii > 0 && (ii % 17) == 0)
           );

      if (do_sanity_check) {

         /* Sanity check 1: all rregs with a hard live range crossing
            this insn must be marked as unavailable in the running
            state. */
         for (Int j = 0; j < rreg_lrs_used; j++) {
            if (rreg_lrs_la[j].live_after < ii
                && ii < rreg_lrs_la[j].dead_before) {
               /* ii is the middle of a hard live range for some real
                  reg.  Check it's marked as such in the running
                  state. */
               HReg reg = rreg_lrs_la[j].rreg;

               if (0) {
                  vex_printf("considering la %d .. db %d   reg = ", 
                             rreg_lrs_la[j].live_after, 
                             rreg_lrs_la[j].dead_before);
                  (*ppReg)(reg);
                  vex_printf("\n");
               }

               /* assert that this rreg is marked as unavailable */
               vassert(!hregIsVirtual(reg));
               vassert(rreg_state[hregIndex(reg)].disp == Unavail);
            }
         }

         /* Sanity check 2: conversely, all rregs marked as
            unavailable in the running rreg_state must have a
            corresponding hard live range entry in the rreg_lrs
            array. */
         for (Int j = 0; j < n_rregs; j++) {
            vassert(rreg_state[j].disp == Bound
                    || rreg_state[j].disp == Free
                    || rreg_state[j].disp == Unavail);
            if (rreg_state[j].disp != Unavail)
               continue;
            Int k;
            for (k = 0; k < rreg_lrs_used; k++) {
               HReg reg = rreg_lrs_la[k].rreg;
               vassert(!hregIsVirtual(reg));
               if (hregIndex(reg) == j
                   && rreg_lrs_la[k].live_after < ii 
                   && ii < rreg_lrs_la[k].dead_before) 
                  break;
            }
            /* If this vassertion fails, we couldn't find a
               corresponding HLR. */
            vassert(k < rreg_lrs_used);
         }

         /* Sanity check 3: all vreg-rreg bindings must bind registers
            of the same class. */
         for (Int j = 0; j < n_rregs; j++) {
            if (rreg_state[j].disp != Bound) {
               vassert(rreg_state[j].eq_spill_slot == False);
               continue;
            }
            vassert(hregClass(univ->regs[j]) 
                    == hregClass(rreg_state[j].vreg));
            vassert( hregIsVirtual(rreg_state[j].vreg));
         }

         /* Sanity check 4: the vreg_state and rreg_state
            mutually-redundant mappings are consistent.  If
            rreg_state[j].vreg points at some vreg_state entry then
            that vreg_state entry should point back at
            rreg_state[j]. */
         for (Int j = 0; j < n_rregs; j++) {
            if (rreg_state[j].disp != Bound)
               continue;
            Int k = hregIndex(rreg_state[j].vreg);
            vassert(IS_VALID_VREGNO(k));
            vassert(vreg_state[k] == j);
         }
         for (Int j = 0; j < n_vregs; j++) {
            Int k = vreg_state[j];
            if (k == INVALID_RREG_NO)
               continue;
            vassert(IS_VALID_RREGNO(k));
            vassert(rreg_state[k].disp == Bound);
            vassert(hregIndex(rreg_state[k].vreg) == j);
         }

      } /* if (do_sanity_check) */

      /* ------------ end of Sanity checks ------------ */

      /* Do various optimisations pertaining to register coalescing
         and preferencing:
            MOV  v <-> v   coalescing (done here).
            MOV  v <-> r   coalescing (not yet, if ever)
      */
      /* If doing a reg-reg move between two vregs, and the src's live
         range ends here and the dst's live range starts here, bind
         the dst to the src's rreg, and that's all. */
      HReg vregS = INVALID_HREG;
      HReg vregD = INVALID_HREG;
      if ( (*isMove)( instrs_in->arr[ii], &vregS, &vregD ) ) {
         if (!hregIsVirtual(vregS)) goto cannot_coalesce;
         if (!hregIsVirtual(vregD)) goto cannot_coalesce;
         /* Check that *isMove is not telling us a bunch of lies ... */
         vassert(hregClass(vregS) == hregClass(vregD));
         Int k = hregIndex(vregS);
         Int m = hregIndex(vregD);
         vassert(IS_VALID_VREGNO(k));
         vassert(IS_VALID_VREGNO(m));
         if (vreg_lrs[k].dead_before != ii + 1) goto cannot_coalesce;
         if (vreg_lrs[m].live_after != ii) goto cannot_coalesce;
         if (DEBUG_REGALLOC) {
         vex_printf("COALESCE ");
            (*ppReg)(vregS);
            vex_printf(" -> ");
            (*ppReg)(vregD);
            vex_printf("\n\n");
         }
         /* Find the state entry for vregS. */
         Int n = vreg_state[k]; /* k is the index of vregS */
         if (n == INVALID_RREG_NO) {
            /* vregS is not currently in a real register.  So we can't
               do the coalescing.  Give up. */
            goto cannot_coalesce;
         }
         vassert(IS_VALID_RREGNO(n));

         /* Finally, we can do the coalescing.  It's trivial -- merely
            claim vregS's register for vregD. */
         rreg_state[n].vreg = vregD;
         vassert(IS_VALID_VREGNO(hregIndex(vregD)));
         vassert(IS_VALID_VREGNO(hregIndex(vregS)));
         vreg_state[hregIndex(vregD)] = toShort(n);
         vreg_state[hregIndex(vregS)] = INVALID_RREG_NO;

         /* This rreg has become associated with a different vreg and
            hence with a different spill slot.  Play safe. */
         rreg_state[n].eq_spill_slot = False;

         /* Move on to the next insn.  We skip the post-insn stuff for
            fixed registers, since this move should not interact with
            them in any way. */
         continue;
      }
     cannot_coalesce:

      /* ------ Free up rregs bound to dead vregs ------ */

      /* Look for vregs whose live range has just ended, and 
	 mark the associated rreg as free. */

      for (Int j = 0; j < n_rregs; j++) {
         if (rreg_state[j].disp != Bound)
            continue;
         UInt vregno = hregIndex(rreg_state[j].vreg);
         vassert(IS_VALID_VREGNO(vregno));
         if (vreg_lrs[vregno].dead_before <= ii) {
            rreg_state[j].disp = Free;
            rreg_state[j].eq_spill_slot = False;
            Int m = hregIndex(rreg_state[j].vreg);
            vassert(IS_VALID_VREGNO(m));
            vreg_state[m] = INVALID_RREG_NO;
            if (DEBUG_REGALLOC) {
               vex_printf("free up "); 
               (*ppReg)(univ->regs[j]); 
               vex_printf("\n");
            }
         }
      }

      /* ------ Pre-instruction actions for fixed rreg uses ------ */

      /* Now we have to deal with rregs which are about to be made
         live by this instruction -- in other words, are entering into
         one of their live ranges.  If any such rreg holds a vreg, we
         will have to free up the rreg.  The simplest solution which
         is correct is to spill the rreg.

         Note we could do better:
         * Could move it into some other free rreg, if one is available 

         Do this efficiently, by incrementally stepping along an array
         of rreg HLRs that are known to be sorted by start point
         (their .live_after field).
      */
      while (True) {
         vassert(rreg_lrs_la_next >= 0);
         vassert(rreg_lrs_la_next <= rreg_lrs_used);
         if (rreg_lrs_la_next == rreg_lrs_used)
            break; /* no more real reg live ranges to consider */
         if (ii < rreg_lrs_la[rreg_lrs_la_next].live_after)
            break; /* next live range does not yet start */
         vassert(ii == rreg_lrs_la[rreg_lrs_la_next].live_after);
         /* rreg_lrs_la[rreg_lrs_la_next].rreg needs to be freed up.
            Find the associated rreg_state entry. */
         /* Note, re ii == rreg_lrs_la[rreg_lrs_la_next].live_after.
            Real register live ranges are guaranteed to be well-formed
            in that they start with a write to the register -- Stage 2
            rejects any code not satisfying this.  So the correct
            question to ask is whether
            rreg_lrs_la[rreg_lrs_la_next].live_after == ii, that is,
            whether the reg becomes live after this insn -- rather
            than before it. */
         if (DEBUG_REGALLOC) {
            vex_printf("need to free up rreg: ");
            (*ppReg)(rreg_lrs_la[rreg_lrs_la_next].rreg);
            vex_printf("\n\n");
         }
         Int k = hregIndex(rreg_lrs_la[rreg_lrs_la_next].rreg);

         /* If this fails, we don't have an entry for this rreg.
            Which we should. */
         vassert(IS_VALID_RREGNO(k));
         Int m = hregIndex(rreg_state[k].vreg);
         if (rreg_state[k].disp == Bound) {
            /* Yes, there is an associated vreg.  Spill it if it's
               still live. */
            vassert(IS_VALID_VREGNO(m));
            vreg_state[m] = INVALID_RREG_NO;
            if (vreg_lrs[m].dead_before > ii) {
               vassert(vreg_lrs[m].reg_class != HRcINVALID);
               if ((!eq_spill_opt) || !rreg_state[k].eq_spill_slot) {
                  HInstr* spill1 = NULL;
                  HInstr* spill2 = NULL;
                  (*genSpill)( &spill1, &spill2, univ->regs[k],
                               vreg_lrs[m].spill_offset, mode64 );
                  vassert(spill1 || spill2); /* can't both be NULL */
                  if (spill1)
                     EMIT_INSTR(spill1);
                  if (spill2)
                     EMIT_INSTR(spill2);
               }
               rreg_state[k].eq_spill_slot = True;
            }
         }
         rreg_state[k].disp = Unavail;
         rreg_state[k].vreg = INVALID_HREG;
         rreg_state[k].eq_spill_slot = False;

         /* check for further rregs entering HLRs at this point */
         rreg_lrs_la_next++;
      }

      if (DEBUG_REGALLOC) {
         vex_printf("After pre-insn actions for fixed regs:\n");
         PRINT_STATE;
         vex_printf("\n");
      }

      /* ------ Deal with the current instruction. ------ */

      /* Finally we can begin the processing of this instruction
         itself.  The aim is to free up enough rregs for this insn.
         This may generate spill stores since we may have to evict
         some vregs currently in rregs.  Also generates spill loads.
         We also build up the final vreg->rreg mapping to be applied
         to the insn. */

      initHRegRemap(&remap);

      /* ------------ BEGIN directReload optimisation ----------- */

      /* If the instruction reads exactly one vreg which is currently
         in a spill slot, and this is last use of that vreg, see if we
         can convert the instruction into one that reads directly from
         the spill slot.  This is clearly only possible for x86 and
         amd64 targets, since ppc and arm are load-store
         architectures.  If successful, replace instrs_in->arr[ii]
         with this new instruction, and recompute its reg usage, so
         that the change is invisible to the standard-case handling
         that follows. */
      
      if (directReload && reg_usage_arr[ii].n_vRegs <= 2) {
         Bool  debug_direct_reload = False;
         HReg  cand     = INVALID_HREG;
         Bool  nreads   = 0;
         Short spilloff = 0;

         for (Int j = 0; j < reg_usage_arr[ii].n_vRegs; j++) {

            HReg vreg = reg_usage_arr[ii].vRegs[j];
            vassert(hregIsVirtual(vreg));

            if (reg_usage_arr[ii].vMode[j] == HRmRead) {
               nreads++;
               Int m = hregIndex(vreg);
               vassert(IS_VALID_VREGNO(m));
               Int k = vreg_state[m];
               if (!IS_VALID_RREGNO(k)) {
                  /* ok, it is spilled.  Now, is this its last use? */
                  vassert(vreg_lrs[m].dead_before >= ii+1);
                  if (vreg_lrs[m].dead_before == ii+1
                      && hregIsInvalid(cand)) {
                     spilloff = vreg_lrs[m].spill_offset;
                     cand = vreg;
                  }
               }
            }
         }

         if (nreads == 1 && ! hregIsInvalid(cand)) {
            HInstr* reloaded;
            if (reg_usage_arr[ii].n_vRegs == 2)
               vassert(! sameHReg(reg_usage_arr[ii].vRegs[0],
                                  reg_usage_arr[ii].vRegs[1]));

            reloaded = directReload ( instrs_in->arr[ii], cand, spilloff );
            if (debug_direct_reload && !reloaded) {
               vex_printf("[%3d] ", spilloff); ppHReg(cand); vex_printf(" "); 
               ppInstr(instrs_in->arr[ii], mode64); 
            }
            if (reloaded) {
               /* Update info about the insn, so it looks as if it had
                  been in this form all along. */
               instrs_in->arr[ii] = reloaded;
               (*getRegUsage)( &reg_usage_arr[ii], instrs_in->arr[ii], mode64 );
               if (debug_direct_reload && !reloaded) {
                  vex_printf("  -->  ");
                  ppInstr(reloaded, mode64);
               }
            }

            if (debug_direct_reload && !reloaded)
               vex_printf("\n");
         }

      }

      /* ------------ END directReload optimisation ------------ */

      /* for each virtual reg mentioned in the insn ... */
      for (Int j = 0; j < reg_usage_arr[ii].n_vRegs; j++) {

         HReg vreg = reg_usage_arr[ii].vRegs[j];
         vassert(hregIsVirtual(vreg));

         if (0) {
            vex_printf("considering "); (*ppReg)(vreg); vex_printf("\n");
         }

         /* Now we're trying to find a rreg for "vreg".  First of all,
            if it already has an rreg assigned, we don't need to do
            anything more.  Inspect the current state to find out. */
         Int m = hregIndex(vreg);
         vassert(IS_VALID_VREGNO(m));
         Int n = vreg_state[m];
         if (IS_VALID_RREGNO(n)) {
            vassert(rreg_state[n].disp == Bound);
            addToHRegRemap(&remap, vreg, univ->regs[n]);
            /* If this rreg is written or modified, mark it as different
               from any spill slot value. */
            if (reg_usage_arr[ii].vMode[j] != HRmRead)
               rreg_state[n].eq_spill_slot = False;
            continue;
         } else {
            vassert(n == INVALID_RREG_NO);
         }

         /* No luck.  The next thing to do is see if there is a
            currently free rreg available, of the correct class.  If
            so, bag it.  NOTE, we could improve this by selecting an
            rreg for which the next live-range event is as far ahead
            as possible. */
         Int k_suboptimal = -1;
         Int k;
         for (k = 0; k < n_rregs; k++) {
            if (rreg_state[k].disp != Free
                || hregClass(univ->regs[k]) != hregClass(vreg))
               continue;
            if (rreg_state[k].has_hlrs) {
               /* Well, at least we can use k_suboptimal if we really
                  have to.  Keep on looking for a better candidate. */
               k_suboptimal = k;
            } else {
               /* Found a preferable reg.  Use it. */
               k_suboptimal = -1;
               break;
            }
         }
         if (k_suboptimal >= 0)
            k = k_suboptimal;

         if (k < n_rregs) {
            rreg_state[k].disp = Bound;
            rreg_state[k].vreg = vreg;
            Int p = hregIndex(vreg);
            vassert(IS_VALID_VREGNO(p));
            vreg_state[p] = toShort(k);
            addToHRegRemap(&remap, vreg, univ->regs[k]);
            /* Generate a reload if needed.  This only creates needed
               reloads because the live range builder for vregs will
               guarantee that the first event for a vreg is a write.
               Hence, if this reference is not a write, it cannot be
               the first reference for this vreg, and so a reload is
               indeed needed. */
            if (reg_usage_arr[ii].vMode[j] != HRmWrite) {
               vassert(vreg_lrs[p].reg_class != HRcINVALID);
               HInstr* reload1 = NULL;
               HInstr* reload2 = NULL;
               (*genReload)( &reload1, &reload2, univ->regs[k],
                             vreg_lrs[p].spill_offset, mode64 );
               vassert(reload1 || reload2); /* can't both be NULL */
               if (reload1)
                  EMIT_INSTR(reload1);
               if (reload2)
                  EMIT_INSTR(reload2);
               /* This rreg is read or modified by the instruction.
                  If it's merely read we can claim it now equals the
                  spill slot, but not so if it is modified. */
               if (reg_usage_arr[ii].vMode[j] == HRmRead) {
                  rreg_state[k].eq_spill_slot = True;
               } else {
                  vassert(reg_usage_arr[ii].vMode[j] == HRmModify);
                  rreg_state[k].eq_spill_slot = False;
               }
            } else {
               rreg_state[k].eq_spill_slot = False;
            }

            continue;
         }

         /* Well, now we have no option but to spill a vreg.  It's
            important to make a good choice of vreg to spill, and of
            course we need to be careful not to spill a vreg which is
            needed by this insn. */

         /* First, mark in the rreg_state, those rregs which are not spill
            candidates, due to holding a vreg mentioned by this
            instruction.  Or being of the wrong class. */
         for (k = 0; k < n_rregs; k++) {
            rreg_state[k].is_spill_cand = False;
            if (rreg_state[k].disp != Bound)
               continue;
            if (hregClass(univ->regs[k]) != hregClass(vreg))
               continue;
            rreg_state[k].is_spill_cand = True;
            /* Note, the following loop visits only the virtual regs
               mentioned by the instruction. */
            for (m = 0; m < reg_usage_arr[ii].n_vRegs; m++) {
               if (sameHReg(rreg_state[k].vreg, reg_usage_arr[ii].vRegs[m])) {
                  rreg_state[k].is_spill_cand = False;
                  break;
               }
            }
         }

         /* We can choose to spill any rreg satisfying
            rreg_state[r].is_spill_cand (so to speak).  Choose r so that
            the next use of its associated vreg is as far ahead as
            possible, in the hope that this will minimise the number
            of consequent reloads required. */
         Int spillee
            = findMostDistantlyMentionedVReg ( 
                 reg_usage_arr, ii+1, instrs_in->arr_used, rreg_state, n_rregs );

         if (spillee == -1) {
            /* Hmmmmm.  There don't appear to be any spill candidates.
               We're hosed. */
            vex_printf("reg_alloc: can't find a register in class: ");
            ppHRegClass(hregClass(vreg));
            vex_printf("\n");
            vpanic("reg_alloc: can't create a free register.");
         }

         /* Right.  So we're going to spill rreg_state[spillee]. */
         vassert(IS_VALID_RREGNO(spillee));
         vassert(rreg_state[spillee].disp == Bound);
         /* check it's the right class */
         vassert(hregClass(univ->regs[spillee]) == hregClass(vreg));
         /* check we're not ejecting the vreg for which we are trying
            to free up a register. */
         vassert(! sameHReg(rreg_state[spillee].vreg, vreg));

         m = hregIndex(rreg_state[spillee].vreg);
         vassert(IS_VALID_VREGNO(m));

         /* So here's the spill store.  Assert that we're spilling a
            live vreg. */
         vassert(vreg_lrs[m].dead_before > ii);
         vassert(vreg_lrs[m].reg_class != HRcINVALID);
         if ((!eq_spill_opt) || !rreg_state[spillee].eq_spill_slot) {
            HInstr* spill1 = NULL;
            HInstr* spill2 = NULL;
            (*genSpill)( &spill1, &spill2, univ->regs[spillee],
                         vreg_lrs[m].spill_offset, mode64 );
            vassert(spill1 || spill2); /* can't both be NULL */
            if (spill1)
               EMIT_INSTR(spill1);
            if (spill2)
               EMIT_INSTR(spill2);
         }

         /* Update the rreg_state to reflect the new assignment for this
            rreg. */
         rreg_state[spillee].vreg = vreg;
         vreg_state[m] = INVALID_RREG_NO;

         rreg_state[spillee].eq_spill_slot = False; /* be safe */

         m = hregIndex(vreg);
         vassert(IS_VALID_VREGNO(m));
         vreg_state[m] = toShort(spillee);

         /* Now, if this vreg is being read or modified (as opposed to
            written), we have to generate a reload for it. */
         if (reg_usage_arr[ii].vMode[j] != HRmWrite) {
            vassert(vreg_lrs[m].reg_class != HRcINVALID);
            HInstr* reload1 = NULL;
            HInstr* reload2 = NULL;
            (*genReload)( &reload1, &reload2, univ->regs[spillee],
                          vreg_lrs[m].spill_offset, mode64 );
            vassert(reload1 || reload2); /* can't both be NULL */
            if (reload1)
               EMIT_INSTR(reload1);
            if (reload2)
               EMIT_INSTR(reload2);
            /* This rreg is read or modified by the instruction.
               If it's merely read we can claim it now equals the
               spill slot, but not so if it is modified. */
            if (reg_usage_arr[ii].vMode[j] == HRmRead) {
               rreg_state[spillee].eq_spill_slot = True;
            } else {
               vassert(reg_usage_arr[ii].vMode[j] == HRmModify);
               rreg_state[spillee].eq_spill_slot = False;
            }
         }

         /* So after much twisting and turning, we have vreg mapped to
            rreg_state[spillee].rreg.  Note that in the map. */
         addToHRegRemap(&remap, vreg, univ->regs[spillee]);

      } /* iterate over virtual registers in this instruction. */

      /* We've finished clowning around with registers in this instruction.
         Three results:
         - the running rreg_state[] has been updated
         - a suitable vreg->rreg mapping for this instruction has been 
           constructed
         - spill and reload instructions may have been emitted.

        The final step is to apply the mapping to the instruction, 
        and emit that.
      */

      /* NOTE, DESTRUCTIVELY MODIFIES instrs_in->arr[ii]. */
      (*mapRegs)( &remap, instrs_in->arr[ii], mode64 );
      EMIT_INSTR( instrs_in->arr[ii] );

      if (DEBUG_REGALLOC) {
         vex_printf("After dealing with current insn:\n");
         PRINT_STATE;
         vex_printf("\n");
      }

      /* ------ Post-instruction actions for fixed rreg uses ------ */

      /* Now we need to check for rregs exiting fixed live ranges
         after this instruction, and if so mark them as free. */
      while (True) {
         vassert(rreg_lrs_db_next >= 0);
         vassert(rreg_lrs_db_next <= rreg_lrs_used);
         if (rreg_lrs_db_next == rreg_lrs_used)
            break; /* no more real reg live ranges to consider */
         if (ii+1 < rreg_lrs_db[rreg_lrs_db_next].dead_before)
            break; /* next live range does not yet start */
         vassert(ii+1 == rreg_lrs_db[rreg_lrs_db_next].dead_before);
         /* rreg_lrs_db[[rreg_lrs_db_next].rreg is exiting a hard live
            range.  Mark it as such in the main rreg_state array. */
         HReg reg = rreg_lrs_db[rreg_lrs_db_next].rreg;
         vassert(!hregIsVirtual(reg));
         Int k = hregIndex(reg);
         vassert(IS_VALID_RREGNO(k));
         vassert(rreg_state[k].disp == Unavail);
         rreg_state[k].disp = Free;
         rreg_state[k].vreg = INVALID_HREG;
         rreg_state[k].eq_spill_slot = False;

         /* check for further rregs leaving HLRs at this point */
         rreg_lrs_db_next++;
      }

      if (DEBUG_REGALLOC) {
         vex_printf("After post-insn actions for fixed regs:\n");
         PRINT_STATE;
         vex_printf("\n");
      }

   } /* iterate over insns */

   /* ------ END: Process each insn in turn. ------ */

   /* free(rreg_state); */
   /* free(rreg_lrs); */
   /* if (vreg_lrs) free(vreg_lrs); */

   /* Paranoia */
   vassert(rreg_lrs_la_next == rreg_lrs_used);
   vassert(rreg_lrs_db_next == rreg_lrs_used);

   return instrs_out;

#  undef INVALID_INSTRNO
#  undef EMIT_INSTR
#  undef PRINT_STATE
}



/*---------------------------------------------------------------*/
/*---                                       host_reg_alloc2.c ---*/
/*---------------------------------------------------------------*/