/* -*- mode: C; c-basic-offset: 3; -*- */

/*---------------------------------------------------------------*/
/*--- begin                                          ir_opt.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2015 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"

#include "main_util.h"
#include "main_globals.h"
#include "ir_opt.h"


/* Set to 1 for lots of debugging output. */
#define DEBUG_IROPT 0

/* Set to 1 to gather some statistics. Currently only for sameIRExprs. */
#define STATS_IROPT 0


/* What iropt does, 29 Dec 04.

   It takes an IRSB and produces a new one with the same meaning,
   defined thus:

   After execution of the new BB, all guest state and guest memory is
   the same as after execution of the original.  This is true
   regardless of how the block was exited (at the end vs side exit).

   In addition, parts of the guest state will be identical to that
   created by execution of the original at the following observation
   points:

   * In a dirty helper call, any parts of the guest state that the
     helper states that it reads or modifies will be up to date.
     Also, guest memory will be up to date.  Parts of the guest state
     not marked as being read or modified by the helper cannot be
     assumed to be up-to-date at the point where the helper is called.

   * If iropt_register_updates == VexRegUpdSpAtMemAccess :
     The guest state is only up to date only as explained above
     (i.e. at SB exits and as specified by dirty helper call).
     Also, the stack pointer register is up to date at memory
     exception points (as this is needed for the stack extension
     logic in m_signals.c).

   * If iropt_register_updates == VexRegUpdUnwindregsAtMemAccess :
     Immediately prior to any load or store, those parts of the guest
     state marked as requiring precise exceptions will be up to date.
     Also, guest memory will be up to date.  Parts of the guest state
     not marked as requiring precise exceptions cannot be assumed to
     be up-to-date at the point of the load/store.

   * If iropt_register_updates == VexRegUpdAllregsAtMemAccess:
     Same as minimal, but all the guest state is up to date at memory
     exception points.

   * If iropt_register_updates == VexRegUpdAllregsAtEachInsn :
     Guest state is up to date at each instruction.

   The relative order of loads and stores (including loads/stores of
   guest memory done by dirty helpers annotated as such) is not
   changed.  However, the relative order of loads with no intervening
   stores/modifies may be changed.  

   Transformation order
   ~~~~~~~~~~~~~~~~~~~~

   There are three levels of optimisation, controlled by
   vex_control.iropt_level.  Define first:

   "Cheap transformations" are the following sequence:
      * Redundant-Get removal
      * Redundant-Put removal
      * Constant propagation/folding
      * Dead code removal
      * Specialisation of clean helper functions
      * Dead code removal

   "Expensive transformations" are the following sequence:
      * CSE
      * Folding of add/sub chains
      * Redundant-GetI removal
      * Redundant-PutI removal
      * Dead code removal

   Then the transformations are as follows, as defined by
   vex_control.iropt_level:

   Level 0: 
      * Flatten into atomic form.

   Level 1: the following sequence:
      * Flatten into atomic form.
      * Cheap transformations.

   Level 2: the following sequence
      * Flatten into atomic form.
      * Cheap transformations.
      * If block contains any floating or vector types, CSE.
      * If block contains GetI or PutI, Expensive transformations.
      * Try unrolling loops.  Three possible outcomes:
        - No effect: do nothing more.
        - Unrolled a loop, and block does not contain GetI or PutI:
          Do: * CSE
              * Dead code removal
        - Unrolled a loop, and block contains GetI or PutI:
          Do: * Expensive transformations
              * Cheap transformations
*/

/* Implementation notes, 29 Dec 04.

   TODO (important): I think rPutI removal ignores precise exceptions
   and is therefore in a sense, wrong.  In the sense that PutIs are
   assumed not to write parts of the guest state that we need to have
   up-to-date at loads/stores.  So far on x86 guest that has not
   mattered since indeed only the x87 FP registers and tags are
   accessed using GetI/PutI, and there is no need so far for them to
   be up to date at mem exception points.  The rPutI pass should be
   fixed.

   TODO: improve pessimistic handling of precise exceptions
     in the tree builder.

   TODO: check interaction of rGetI and dirty helpers. 
   
   F64i constants are treated differently from other constants.
   They are not regarded as atoms, and instead lifted off and
   bound to temps.  This allows them to participate in CSE, which
   is important for getting good performance for x86 guest code.

   CSE up F64 literals (already doing F64is)

   CSE: consider carefully the requirement for precise exns
        prior to making CSE any more aggressive.  */


/*---------------------------------------------------------------*/
/*--- Finite mappery, of a sort                               ---*/
/*---------------------------------------------------------------*/

/* General map from HWord-sized thing HWord-sized thing.  Could be by
   hashing, but it's not clear whether or not this would really be any
   faster. */

typedef
   struct {
      Bool*  inuse;
      HWord* key;
      HWord* val;
      Int    size;
      Int    used;
   }
   HashHW;

static HashHW* newHHW ( void )
{
   HashHW* h = LibVEX_Alloc_inline(sizeof(HashHW));
   h->size   = 8;
   h->used   = 0;
   h->inuse  = LibVEX_Alloc_inline(h->size * sizeof(Bool));
   h->key    = LibVEX_Alloc_inline(h->size * sizeof(HWord));
   h->val    = LibVEX_Alloc_inline(h->size * sizeof(HWord));
   return h;
}


/* Look up key in the map. */

static Bool lookupHHW ( HashHW* h, /*OUT*/HWord* val, HWord key )
{
   Int i;
   /* vex_printf("lookupHHW(%llx)\n", key ); */
   for (i = 0; i < h->used; i++) {
      if (h->inuse[i] && h->key[i] == key) {
         if (val)
            *val = h->val[i];
         return True;
      }
   }
   return False;
}


/* Add key->val to the map.  Replaces any existing binding for key. */

static void addToHHW ( HashHW* h, HWord key, HWord val )
{
   Int i, j;
   /* vex_printf("addToHHW(%llx, %llx)\n", key, val); */

   /* Find and replace existing binding, if any. */
   for (i = 0; i < h->used; i++) {
      if (h->inuse[i] && h->key[i] == key) {
         h->val[i] = val;
         return;
      }
   }

   /* Ensure a space is available. */
   if (h->used == h->size) {
      /* Copy into arrays twice the size. */
      Bool*  inuse2 = LibVEX_Alloc_inline(2 * h->size * sizeof(Bool));
      HWord* key2   = LibVEX_Alloc_inline(2 * h->size * sizeof(HWord));
      HWord* val2   = LibVEX_Alloc_inline(2 * h->size * sizeof(HWord));
      for (i = j = 0; i < h->size; i++) {
         if (!h->inuse[i]) continue;
         inuse2[j] = True;
         key2[j] = h->key[i];
         val2[j] = h->val[i];
         j++;
      }
      h->used = j;
      h->size *= 2;
      h->inuse = inuse2;
      h->key = key2;
      h->val = val2;
   }

   /* Finally, add it. */
   vassert(h->used < h->size);
   h->inuse[h->used] = True;
   h->key[h->used] = key;
   h->val[h->used] = val;
   h->used++;
}


/*---------------------------------------------------------------*/
/*--- Flattening out a BB into atomic SSA form                ---*/
/*---------------------------------------------------------------*/

/* Non-critical helper, heuristic for reducing the number of tmp-tmp
   copies made by flattening.  If in doubt return False. */

static Bool isFlat ( IRExpr* e )
{
   if (e->tag == Iex_Get) 
      return True;
   if (e->tag == Iex_Binop)
      return toBool( isIRAtom(e->Iex.Binop.arg1) 
                     && isIRAtom(e->Iex.Binop.arg2) );
   if (e->tag == Iex_Load)
      return isIRAtom(e->Iex.Load.addr);
   return False;
}

/* Flatten out 'ex' so it is atomic, returning a new expression with
   the same value, after having appended extra IRTemp assignments to
   the end of 'bb'. */

static IRExpr* flatten_Expr ( IRSB* bb, IRExpr* ex )
{
   Int i;
   IRExpr** newargs;
   IRType ty = typeOfIRExpr(bb->tyenv, ex);
   IRTemp t1;

   switch (ex->tag) {

      case Iex_GetI:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_GetI(ex->Iex.GetI.descr,
                        flatten_Expr(bb, ex->Iex.GetI.ix),
                        ex->Iex.GetI.bias)));
         return IRExpr_RdTmp(t1);

      case Iex_Get:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, 
            IRStmt_WrTmp(t1, ex));
         return IRExpr_RdTmp(t1);

      case Iex_Qop: {
         IRQop* qop = ex->Iex.Qop.details;
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1, 
            IRExpr_Qop(qop->op,
                         flatten_Expr(bb, qop->arg1),
                         flatten_Expr(bb, qop->arg2),
                         flatten_Expr(bb, qop->arg3),
                         flatten_Expr(bb, qop->arg4))));
         return IRExpr_RdTmp(t1);
      }

      case Iex_Triop: {
         IRTriop* triop = ex->Iex.Triop.details;
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1, 
            IRExpr_Triop(triop->op,
                         flatten_Expr(bb, triop->arg1),
                         flatten_Expr(bb, triop->arg2),
                         flatten_Expr(bb, triop->arg3))));
         return IRExpr_RdTmp(t1);
      }

      case Iex_Binop:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1, 
            IRExpr_Binop(ex->Iex.Binop.op,
                         flatten_Expr(bb, ex->Iex.Binop.arg1),
                         flatten_Expr(bb, ex->Iex.Binop.arg2))));
         return IRExpr_RdTmp(t1);

      case Iex_Unop:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1, 
            IRExpr_Unop(ex->Iex.Unop.op,
                        flatten_Expr(bb, ex->Iex.Unop.arg))));
         return IRExpr_RdTmp(t1);

      case Iex_Load:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_Load(ex->Iex.Load.end,
                        ex->Iex.Load.ty, 
                        flatten_Expr(bb, ex->Iex.Load.addr))));
         return IRExpr_RdTmp(t1);

      case Iex_CCall:
         newargs = shallowCopyIRExprVec(ex->Iex.CCall.args);
         for (i = 0; newargs[i]; i++)
            newargs[i] = flatten_Expr(bb, newargs[i]);
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_CCall(ex->Iex.CCall.cee,
                         ex->Iex.CCall.retty,
                         newargs)));
         return IRExpr_RdTmp(t1);

      case Iex_ITE:
         t1 = newIRTemp(bb->tyenv, ty);
         addStmtToIRSB(bb, IRStmt_WrTmp(t1,
            IRExpr_ITE(flatten_Expr(bb, ex->Iex.ITE.cond),
                       flatten_Expr(bb, ex->Iex.ITE.iftrue),
                       flatten_Expr(bb, ex->Iex.ITE.iffalse))));
         return IRExpr_RdTmp(t1);

      case Iex_Const:
         /* Lift F64i constants out onto temps so they can be CSEd
            later. */
         if (ex->Iex.Const.con->tag == Ico_F64i) {
            t1 = newIRTemp(bb->tyenv, ty);
            addStmtToIRSB(bb, IRStmt_WrTmp(t1,
               IRExpr_Const(ex->Iex.Const.con)));
            return IRExpr_RdTmp(t1);
         } else {
            /* Leave all other constants alone. */
            return ex;
         }

      case Iex_RdTmp:
         return ex;

      default:
         vex_printf("\n");
         ppIRExpr(ex); 
         vex_printf("\n");
         vpanic("flatten_Expr");
   }
}


/* Append a completely flattened form of 'st' to the end of 'bb'. */

static void flatten_Stmt ( IRSB* bb, IRStmt* st )
{
   Int i;
   IRExpr   *e1, *e2, *e3, *e4, *e5;
   IRDirty  *d,  *d2;
   IRCAS    *cas, *cas2;
   IRPutI   *puti, *puti2;
   IRLoadG  *lg;
   IRStoreG *sg;
   switch (st->tag) {
      case Ist_Put:
         if (isIRAtom(st->Ist.Put.data)) {
            /* optimisation to reduce the amount of heap wasted
               by the flattener */
            addStmtToIRSB(bb, st);
         } else {
            /* general case, always correct */
            e1 = flatten_Expr(bb, st->Ist.Put.data);
            addStmtToIRSB(bb, IRStmt_Put(st->Ist.Put.offset, e1));
         }
         break;
      case Ist_PutI:
         puti = st->Ist.PutI.details;
         e1 = flatten_Expr(bb, puti->ix);
         e2 = flatten_Expr(bb, puti->data);
         puti2 = mkIRPutI(puti->descr, e1, puti->bias, e2);
         addStmtToIRSB(bb, IRStmt_PutI(puti2));
         break;
      case Ist_WrTmp:
         if (isFlat(st->Ist.WrTmp.data)) {
            /* optimisation, to reduce the number of tmp-tmp
               copies generated */
            addStmtToIRSB(bb, st);
         } else {
            /* general case, always correct */
            e1 = flatten_Expr(bb, st->Ist.WrTmp.data);
            addStmtToIRSB(bb, IRStmt_WrTmp(st->Ist.WrTmp.tmp, e1));
         }
         break;
      case Ist_Store:
         e1 = flatten_Expr(bb, st->Ist.Store.addr);
         e2 = flatten_Expr(bb, st->Ist.Store.data);
         addStmtToIRSB(bb, IRStmt_Store(st->Ist.Store.end, e1,e2));
         break;
      case Ist_StoreG:
         sg = st->Ist.StoreG.details;
         e1 = flatten_Expr(bb, sg->addr);
         e2 = flatten_Expr(bb, sg->data);
         e3 = flatten_Expr(bb, sg->guard);
         addStmtToIRSB(bb, IRStmt_StoreG(sg->end, e1, e2, e3));
         break;
      case Ist_LoadG:
         lg = st->Ist.LoadG.details;
         e1 = flatten_Expr(bb, lg->addr);
         e2 = flatten_Expr(bb, lg->alt);
         e3 = flatten_Expr(bb, lg->guard);
         addStmtToIRSB(bb, IRStmt_LoadG(lg->end, lg->cvt, lg->dst,
                                        e1, e2, e3));
         break;
      case Ist_CAS:
         cas  = st->Ist.CAS.details;
         e1   = flatten_Expr(bb, cas->addr);
         e2   = cas->expdHi ? flatten_Expr(bb, cas->expdHi) : NULL;
         e3   = flatten_Expr(bb, cas->expdLo);
         e4   = cas->dataHi ? flatten_Expr(bb, cas->dataHi) : NULL;
         e5   = flatten_Expr(bb, cas->dataLo);
         cas2 = mkIRCAS( cas->oldHi, cas->oldLo, cas->end,
                         e1, e2, e3, e4, e5 );
         addStmtToIRSB(bb, IRStmt_CAS(cas2));
         break;
      case Ist_LLSC:
         e1 = flatten_Expr(bb, st->Ist.LLSC.addr);
         e2 = st->Ist.LLSC.storedata
                 ? flatten_Expr(bb, st->Ist.LLSC.storedata)
                 : NULL;
         addStmtToIRSB(bb, IRStmt_LLSC(st->Ist.LLSC.end,
                                       st->Ist.LLSC.result, e1, e2));
         break;
      case Ist_Dirty:
         d = st->Ist.Dirty.details;
         d2 = emptyIRDirty();
         *d2 = *d;
         d2->args = shallowCopyIRExprVec(d2->args);
         if (d2->mFx != Ifx_None) {
            d2->mAddr = flatten_Expr(bb, d2->mAddr);
         } else {
            vassert(d2->mAddr == NULL);
         }
         d2->guard = flatten_Expr(bb, d2->guard);
         for (i = 0; d2->args[i]; i++) {
            IRExpr* arg = d2->args[i];
            if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg)))
               d2->args[i] = flatten_Expr(bb, arg);
         }
         addStmtToIRSB(bb, IRStmt_Dirty(d2));
         break;
      case Ist_NoOp:
      case Ist_MBE:
      case Ist_IMark:
         addStmtToIRSB(bb, st);
         break;
      case Ist_AbiHint:
         e1 = flatten_Expr(bb, st->Ist.AbiHint.base);
         e2 = flatten_Expr(bb, st->Ist.AbiHint.nia);
         addStmtToIRSB(bb, IRStmt_AbiHint(e1, st->Ist.AbiHint.len, e2));
         break;
      case Ist_Exit:
         e1 = flatten_Expr(bb, st->Ist.Exit.guard);
         addStmtToIRSB(bb, IRStmt_Exit(e1, st->Ist.Exit.jk,
                                       st->Ist.Exit.dst,
                                       st->Ist.Exit.offsIP));
         break;
      default:
         vex_printf("\n");
         ppIRStmt(st); 
         vex_printf("\n");
         vpanic("flatten_Stmt");
   }
}


static IRSB* flatten_BB ( IRSB* in )
{
   Int   i;
   IRSB* out;
   out = emptyIRSB();
   out->tyenv = deepCopyIRTypeEnv( in->tyenv );
   for (i = 0; i < in->stmts_used; i++)
      if (in->stmts[i])
         flatten_Stmt( out, in->stmts[i] );
   out->next     = flatten_Expr( out, in->next );
   out->jumpkind = in->jumpkind;
   out->offsIP   = in->offsIP;
   return out;
}


/*---------------------------------------------------------------*/
/*--- In-place removal of redundant GETs                      ---*/
/*---------------------------------------------------------------*/

/* Scan forwards, building up an environment binding (min offset, max
   offset) pairs to values, which will either be temps or constants.

   On seeing 't = Get(minoff,maxoff)', look up (minoff,maxoff) in the
   env and if it matches, replace the Get with the stored value.  If
   there is no match, add a (minoff,maxoff) :-> t binding.

   On seeing 'Put (minoff,maxoff) = t or c', first remove in the env
   any binding which fully or partially overlaps with (minoff,maxoff).
   Then add a new (minoff,maxoff) :-> t or c binding.  */

/* Extract the min/max offsets from a guest state array descriptor. */

inline
static void getArrayBounds ( IRRegArray* descr, 
                             UInt* minoff, UInt* maxoff )
{
   *minoff = descr->base;
   *maxoff = *minoff + descr->nElems*sizeofIRType(descr->elemTy) - 1;
   vassert((*minoff & ~0xFFFF) == 0);
   vassert((*maxoff & ~0xFFFF) == 0);
   vassert(*minoff <= *maxoff);
}

/* Create keys, of the form ((minoffset << 16) | maxoffset). */

static UInt mk_key_GetPut ( Int offset, IRType ty )
{
   /* offset should fit in 16 bits. */
   UInt minoff = offset;
   UInt maxoff = minoff + sizeofIRType(ty) - 1;
   vassert((minoff & ~0xFFFF) == 0);
   vassert((maxoff & ~0xFFFF) == 0);
   return (minoff << 16) | maxoff;
}

static UInt mk_key_GetIPutI ( IRRegArray* descr )
{
   UInt minoff, maxoff;
   getArrayBounds( descr, &minoff, &maxoff );
   vassert((minoff & ~0xFFFF) == 0);
   vassert((maxoff & ~0xFFFF) == 0);
   return (minoff << 16) | maxoff;
}

/* Supposing h has keys of the form generated by mk_key_GetPut and
   mk_key_GetIPutI, invalidate any key which overlaps (k_lo
   .. k_hi). 
*/
static void invalidateOverlaps ( HashHW* h, UInt k_lo, UInt k_hi )
{
   Int  j;
   UInt e_lo, e_hi;
   vassert(k_lo <= k_hi);
   /* invalidate any env entries which in any way overlap (k_lo
      .. k_hi) */
   /* vex_printf("invalidate %d .. %d\n", k_lo, k_hi ); */

   for (j = 0; j < h->used; j++) {
      if (!h->inuse[j]) 
         continue;
      e_lo = (((UInt)h->key[j]) >> 16) & 0xFFFF;
      e_hi = ((UInt)h->key[j]) & 0xFFFF;
      vassert(e_lo <= e_hi);
      if (e_hi < k_lo || k_hi < e_lo)
         continue; /* no overlap possible */
      else
         /* overlap; invalidate */
         h->inuse[j] = False;
   }
}


static void redundant_get_removal_BB ( IRSB* bb )
{
   HashHW* env = newHHW();
   UInt    key = 0; /* keep gcc -O happy */
   Int     i, j;
   HWord   val;

   for (i = 0; i < bb->stmts_used; i++) {
      IRStmt* st = bb->stmts[i];

      if (st->tag == Ist_NoOp)
         continue;

      /* Deal with Gets */
      if (st->tag == Ist_WrTmp
          && st->Ist.WrTmp.data->tag == Iex_Get) {
         /* st is 't = Get(...)'.  Look up in the environment and see
            if the Get can be replaced. */
         IRExpr* get = st->Ist.WrTmp.data;
         key = (HWord)mk_key_GetPut( get->Iex.Get.offset, 
                                     get->Iex.Get.ty );
         if (lookupHHW(env, &val, (HWord)key)) {
            /* found it */
            /* Note, we could do better here.  If the types are
               different we don't do the substitution, since doing so
               could lead to invalidly-typed IR.  An improvement would
               be to stick in a reinterpret-style cast, although that
               would make maintaining flatness more difficult. */
            IRExpr* valE    = (IRExpr*)val;
            Bool    typesOK = toBool( typeOfIRExpr(bb->tyenv,valE) 
                                      == st->Ist.WrTmp.data->Iex.Get.ty );
            if (typesOK && DEBUG_IROPT) {
               vex_printf("rGET: "); ppIRExpr(get);
               vex_printf("  ->  "); ppIRExpr(valE);
               vex_printf("\n");
            }
            if (typesOK)
               bb->stmts[i] = IRStmt_WrTmp(st->Ist.WrTmp.tmp, valE);
         } else {
            /* Not found, but at least we know that t and the Get(...)
               are now associated.  So add a binding to reflect that
               fact. */
            addToHHW( env, (HWord)key, 
                           (HWord)(void*)(IRExpr_RdTmp(st->Ist.WrTmp.tmp)) );
         }
      }

      /* Deal with Puts: invalidate any env entries overlapped by this
         Put */
      if (st->tag == Ist_Put || st->tag == Ist_PutI) {
         UInt k_lo, k_hi;
         if (st->tag == Ist_Put) {
            key = mk_key_GetPut( st->Ist.Put.offset, 
                                 typeOfIRExpr(bb->tyenv,st->Ist.Put.data) );
         } else {
            vassert(st->tag == Ist_PutI);
            key = mk_key_GetIPutI( st->Ist.PutI.details->descr );
         }

         k_lo = (key >> 16) & 0xFFFF;
         k_hi = key & 0xFFFF;
         invalidateOverlaps(env, k_lo, k_hi);
      }
      else
      if (st->tag == Ist_Dirty) {
         /* Deal with dirty helpers which write or modify guest state.
            Invalidate the entire env.  We could do a lot better
            here. */
         IRDirty* d      = st->Ist.Dirty.details;
         Bool     writes = False;
         for (j = 0; j < d->nFxState; j++) {
            if (d->fxState[j].fx == Ifx_Modify 
                || d->fxState[j].fx == Ifx_Write)
            writes = True;
         }
         if (writes) {
            /* dump the entire env (not clever, but correct ...) */
            for (j = 0; j < env->used; j++)
               env->inuse[j] = False;
            if (0) vex_printf("rGET: trash env due to dirty helper\n");
         }
      }

      /* add this one to the env, if appropriate */
      if (st->tag == Ist_Put) {
         vassert(isIRAtom(st->Ist.Put.data));
         addToHHW( env, (HWord)key, (HWord)(st->Ist.Put.data));
      }

   } /* for (i = 0; i < bb->stmts_used; i++) */

}


/*---------------------------------------------------------------*/
/*--- In-place removal of redundant PUTs                      ---*/
/*---------------------------------------------------------------*/

/* Find any Get uses in st and invalidate any partially or fully
   overlapping ranges listed in env.  Due to the flattening phase, the
   only stmt kind we expect to find a Get on is IRStmt_WrTmp. */

static void handle_gets_Stmt ( 
               HashHW* env, 
               IRStmt* st,
               Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
               VexRegisterUpdates pxControl
            )
{
   Int     j;
   UInt    key = 0; /* keep gcc -O happy */
   Bool    isGet;
   Bool    memRW = False;
   IRExpr* e;

   switch (st->tag) {

      /* This is the only interesting case.  Deal with Gets in the RHS
         expression. */
      case Ist_WrTmp:
         e = st->Ist.WrTmp.data;
         switch (e->tag) {
            case Iex_Get:
               isGet = True;
               key = mk_key_GetPut ( e->Iex.Get.offset, e->Iex.Get.ty );
               break;
            case Iex_GetI:
               isGet = True;
               key = mk_key_GetIPutI ( e->Iex.GetI.descr );
               break;
            case Iex_Load:
               isGet = False;
               memRW = True;
               break;
            default: 
               isGet = False;
         }
         if (isGet) {
            UInt k_lo, k_hi;
            k_lo = (key >> 16) & 0xFFFF;
            k_hi = key & 0xFFFF;
            invalidateOverlaps(env, k_lo, k_hi);
         }
         break;

      /* Be very conservative for dirty helper calls; dump the entire
         environment.  The helper might read guest state, in which
         case it needs to be flushed first.  Also, the helper might
         access guest memory, in which case all parts of the guest
         state requiring precise exceptions needs to be flushed.  The
         crude solution is just to flush everything; we could easily
         enough do a lot better if needed. */
      /* Probably also overly-conservative, but also dump everything
         if we hit a memory bus event (fence, lock, unlock).  Ditto
         AbiHints, CASs, LLs and SCs. */
      case Ist_AbiHint:
         vassert(isIRAtom(st->Ist.AbiHint.base));
         vassert(isIRAtom(st->Ist.AbiHint.nia));
         /* fall through */
      case Ist_MBE:
      case Ist_Dirty:
      case Ist_CAS:
      case Ist_LLSC:
         for (j = 0; j < env->used; j++)
            env->inuse[j] = False;
         break;

      /* all other cases are boring. */
      case Ist_Store:
         vassert(isIRAtom(st->Ist.Store.addr));
         vassert(isIRAtom(st->Ist.Store.data));
         memRW = True;
         break;
      case Ist_StoreG: {
         IRStoreG* sg = st->Ist.StoreG.details;
         vassert(isIRAtom(sg->addr));
         vassert(isIRAtom(sg->data));
         vassert(isIRAtom(sg->guard));
         memRW = True;
         break;
      }
      case Ist_LoadG: {
         IRLoadG* lg = st->Ist.LoadG.details;
         vassert(isIRAtom(lg->addr));
         vassert(isIRAtom(lg->alt));
         vassert(isIRAtom(lg->guard));
         memRW = True;
         break;
      }
      case Ist_Exit:
         vassert(isIRAtom(st->Ist.Exit.guard));
         break;

      case Ist_Put:
         vassert(isIRAtom(st->Ist.Put.data));
         break;

      case Ist_PutI:
         vassert(isIRAtom(st->Ist.PutI.details->ix));
         vassert(isIRAtom(st->Ist.PutI.details->data));
         break;

      case Ist_NoOp:
      case Ist_IMark:
         break;

      default:
         vex_printf("\n");
         ppIRStmt(st);
         vex_printf("\n");
         vpanic("handle_gets_Stmt");
   }

   if (memRW) {
      /* This statement accesses memory.  So we might need to dump all parts
         of the environment corresponding to guest state that may not
         be reordered with respect to memory references.  That means
         at least the stack pointer. */
      switch (pxControl) {
         case VexRegUpdAllregsAtMemAccess:
            /* Precise exceptions required at mem access.
               Flush all guest state. */
            for (j = 0; j < env->used; j++)
               env->inuse[j] = False;
            break;
         case VexRegUpdSpAtMemAccess:
            /* We need to dump the stack pointer
               (needed for stack extension in m_signals.c).
               preciseMemExnsFn will use vex_control.iropt_register_updates
               to verify only the sp is to be checked. */
            /* fallthrough */
         case VexRegUpdUnwindregsAtMemAccess:
            for (j = 0; j < env->used; j++) {
               if (!env->inuse[j])
                  continue;
               /* Just flush the minimal amount required, as computed by
                  preciseMemExnsFn. */
               HWord k_lo = (env->key[j] >> 16) & 0xFFFF;
               HWord k_hi = env->key[j] & 0xFFFF;
               if (preciseMemExnsFn( k_lo, k_hi, pxControl ))
                  env->inuse[j] = False;
            }
            break;
         case VexRegUpdAllregsAtEachInsn:
            // VexRegUpdAllregsAtEachInsn cannot happen here.
            // fall through
         case VexRegUpd_INVALID:
         default:
            vassert(0);
      }
   } /* if (memRW) */

}


/* Scan backwards, building up a set of (min offset, max
   offset) pairs, indicating those parts of the guest state
   for which the next event is a write.

   On seeing a conditional exit, empty the set.

   On seeing 'Put (minoff,maxoff) = t or c', if (minoff,maxoff) is
   completely within the set, remove the Put.  Otherwise, add
   (minoff,maxoff) to the set.

   On seeing 'Get (minoff,maxoff)', remove any part of the set
   overlapping (minoff,maxoff).  The same has to happen for any events
   which implicitly read parts of the guest state: dirty helper calls
   and loads/stores.
*/

static void redundant_put_removal_BB ( 
               IRSB* bb,
               Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
               VexRegisterUpdates pxControl
            )
{
   Int     i, j;
   Bool    isPut;
   IRStmt* st;
   UInt    key = 0; /* keep gcc -O happy */

   vassert(pxControl < VexRegUpdAllregsAtEachInsn);

   HashHW* env = newHHW();

   /* Initialise the running env with the fact that the final exit
      writes the IP (or, whatever it claims to write.  We don't
      care.) */
   key = mk_key_GetPut(bb->offsIP, typeOfIRExpr(bb->tyenv, bb->next));
   addToHHW(env, (HWord)key, 0);

   /* And now scan backwards through the statements. */
   for (i = bb->stmts_used-1; i >= 0; i--) {
      st = bb->stmts[i];

      if (st->tag == Ist_NoOp)
         continue;

      /* Deal with conditional exits. */
      if (st->tag == Ist_Exit) {
         //Bool re_add;
         /* Need to throw out from the env, any part of it which
            doesn't overlap with the guest state written by this exit.
            Since the exit only writes one section, it's simplest to
            do this: (1) check whether env contains a write that
            completely overlaps the write done by this exit; (2) empty
            out env; and (3) if (1) was true, add the write done by
            this exit.

            To make (1) a bit simpler, merely search for a write that
            exactly matches the one done by this exit.  That's safe
            because it will fail as often or more often than a full
            overlap check, and failure to find an overlapping write in
            env is the safe case (we just nuke env if that
            happens). */
         //vassert(isIRAtom(st->Ist.Exit.guard));
         /* (1) */
         //key = mk_key_GetPut(st->Ist.Exit.offsIP,
         //                    typeOfIRConst(st->Ist.Exit.dst));
         //re_add = lookupHHW(env, NULL, key);
         /* (2) */
         for (j = 0; j < env->used; j++)
            env->inuse[j] = False;
         /* (3) */
         //if (0 && re_add) 
         //   addToHHW(env, (HWord)key, 0);
         continue;
      }

      /* Deal with Puts */
      switch (st->tag) {
         case Ist_Put: 
            isPut = True;
            key = mk_key_GetPut( st->Ist.Put.offset, 
                                 typeOfIRExpr(bb->tyenv,st->Ist.Put.data) );
            vassert(isIRAtom(st->Ist.Put.data));
            break;
         case Ist_PutI:
            isPut = True;
            key = mk_key_GetIPutI( st->Ist.PutI.details->descr );
            vassert(isIRAtom(st->Ist.PutI.details->ix));
            vassert(isIRAtom(st->Ist.PutI.details->data));
            break;
         default: 
            isPut = False;
      }
      if (isPut && st->tag != Ist_PutI) {
         /* See if any single entry in env overlaps this Put.  This is
            simplistic in that the transformation is valid if, say, two
            or more entries in the env overlap this Put, but the use of
            lookupHHW will only find a single entry which exactly
            overlaps this Put.  This is suboptimal but safe. */
         if (lookupHHW(env, NULL, (HWord)key)) {
            /* This Put is redundant because a later one will overwrite
               it.  So NULL (nop) it out. */
            if (DEBUG_IROPT) {
               vex_printf("rPUT: "); ppIRStmt(st);
               vex_printf("\n");
            }
            bb->stmts[i] = IRStmt_NoOp();
         } else {
            /* We can't demonstrate that this Put is redundant, so add it
               to the running collection. */
            addToHHW(env, (HWord)key, 0);
         }
         continue;
      }

      /* Deal with Gets.  These remove bits of the environment since
         appearance of a Get means that the next event for that slice
         of the guest state is no longer a write, but a read.  Also
         deals with implicit reads of guest state needed to maintain
         precise exceptions. */
      handle_gets_Stmt( env, st, preciseMemExnsFn, pxControl );
   }
}


/*---------------------------------------------------------------*/
/*--- Constant propagation and folding                        ---*/
/*---------------------------------------------------------------*/

#if STATS_IROPT
/* How often sameIRExprs was invoked */
static UInt invocation_count;
/* How often sameIRExprs recursed through IRTemp assignments */
static UInt recursion_count;
/* How often sameIRExprs found identical IRExprs */
static UInt success_count;
/* How often recursing through assignments to IRTemps helped
   establishing equality. */
static UInt recursion_success_count;
/* Whether or not recursing through an IRTemp assignment helped 
   establishing IRExpr equality for a given sameIRExprs invocation. */
static Bool recursion_helped;
/* Whether or not a given sameIRExprs invocation recursed through an
   IRTemp assignment */
static Bool recursed;
/* Maximum number of nodes ever visited when comparing two IRExprs. */
static UInt max_nodes_visited;
#endif /* STATS_IROPT */

/* Count the number of nodes visited for a given sameIRExprs invocation. */
static UInt num_nodes_visited;

/* Do not visit more than NODE_LIMIT nodes when comparing two IRExprs.
   This is to guard against performance degradation by visiting large
   trees without success. */
#define NODE_LIMIT 30


/* The env in this section is a map from IRTemp to IRExpr*,
   that is, an array indexed by IRTemp. */

/* Do both expressions compute the same value? The answer is generally
   conservative, i.e. it will report that the expressions do not compute
   the same value when in fact they do. The reason is that we do not
   keep track of changes in the guest state and memory. Thusly, two
   Get's, GetI's or Load's, even when accessing the same location, will be
   assumed to compute different values. After all the accesses may happen
   at different times and the guest state / memory can have changed in
   the meantime.

   XXX IMPORTANT XXX the two expressions must have the same IR type.
   DO NOT CALL HERE WITH DIFFERENTLY-TYPED EXPRESSIONS. */

/* JRS 20-Mar-2012: split sameIRExprs_aux into a fast inlineable
   wrapper that deals with the common tags-don't-match case, and a
   slower out of line general case.  Saves a few insns. */

__attribute__((noinline))
static Bool sameIRExprs_aux2 ( IRExpr** env, IRExpr* e1, IRExpr* e2 );

inline
static Bool sameIRExprs_aux ( IRExpr** env, IRExpr* e1, IRExpr* e2 )
{
   if (e1->tag != e2->tag) return False;
   return sameIRExprs_aux2(env, e1, e2);
}

__attribute__((noinline))
static Bool sameIRExprs_aux2 ( IRExpr** env, IRExpr* e1, IRExpr* e2 )
{
   if (num_nodes_visited++ > NODE_LIMIT) return False;

   switch (e1->tag) {
      case Iex_RdTmp:
         if (e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp) return True;

         if (env[e1->Iex.RdTmp.tmp] && env[e2->Iex.RdTmp.tmp]) {
            Bool same = sameIRExprs_aux(env, env[e1->Iex.RdTmp.tmp],
                                        env[e2->Iex.RdTmp.tmp]);
#if STATS_IROPT
            recursed = True;
            if (same) recursion_helped = True;
#endif
            return same;
         }
         return False;

      case Iex_Get:
      case Iex_GetI:
      case Iex_Load:
         /* Guest state / memory could have changed in the meantime. */
         return False;

      case Iex_Binop:
         return toBool( e1->Iex.Binop.op == e2->Iex.Binop.op
                        && sameIRExprs_aux( env, e1->Iex.Binop.arg1,
                                                 e2->Iex.Binop.arg1 )
                        && sameIRExprs_aux( env, e1->Iex.Binop.arg2,
                                                 e2->Iex.Binop.arg2 ));

      case Iex_Unop:
         return toBool( e1->Iex.Unop.op == e2->Iex.Unop.op
                        && sameIRExprs_aux( env, e1->Iex.Unop.arg,
                                                 e2->Iex.Unop.arg ));

      case Iex_Const: {
         IRConst *c1 = e1->Iex.Const.con;
         IRConst *c2 = e2->Iex.Const.con;
         vassert(c1->tag == c2->tag);
         switch (c1->tag) {
            case Ico_U1:   return toBool( c1->Ico.U1  == c2->Ico.U1 );
            case Ico_U8:   return toBool( c1->Ico.U8  == c2->Ico.U8 );
            case Ico_U16:  return toBool( c1->Ico.U16 == c2->Ico.U16 );
            case Ico_U32:  return toBool( c1->Ico.U32 == c2->Ico.U32 );
            case Ico_U64:  return toBool( c1->Ico.U64 == c2->Ico.U64 );
            default: break;
         }
         return False;
      }

      case Iex_Triop: {
         IRTriop *tri1 = e1->Iex.Triop.details;
         IRTriop *tri2 = e2->Iex.Triop.details;
         return toBool( tri1->op == tri2->op
                        && sameIRExprs_aux( env, tri1->arg1, tri2->arg1 )
                        && sameIRExprs_aux( env, tri1->arg2, tri2->arg2 )
                        && sameIRExprs_aux( env, tri1->arg3, tri2->arg3 ));
      }

      case Iex_ITE:
         return toBool(    sameIRExprs_aux( env, e1->Iex.ITE.cond,
                                                 e2->Iex.ITE.cond )
                        && sameIRExprs_aux( env, e1->Iex.ITE.iftrue,
                                                 e2->Iex.ITE.iftrue )
                        && sameIRExprs_aux( env, e1->Iex.ITE.iffalse,
                                                 e2->Iex.ITE.iffalse ));

      default:
         /* Not very likely to be "same". */
         break;
   }

   return False;
}

inline
static Bool sameIRExprs ( IRExpr** env, IRExpr* e1, IRExpr* e2 )
{
   Bool same;

   num_nodes_visited = 0;
   same = sameIRExprs_aux(env, e1, e2);

#if STATS_IROPT
   ++invocation_count;
   if (recursed) ++recursion_count;
   success_count += same;
   if (same && recursion_helped)
      ++recursion_success_count;
   if (num_nodes_visited > max_nodes_visited)
      max_nodes_visited = num_nodes_visited;
   recursed = False; /* reset */
   recursion_helped = False;  /* reset */
#endif /* STATS_IROPT */

   return same;
}


/* Debugging-only hack (not used in production runs): make a guess
   whether sameIRExprs might assert due to the two args being of
   different types.  If in doubt return False.  Is only used when
   --vex-iropt-level > 0, that is, vex_control.iropt_verbosity > 0.
   Bad because it duplicates functionality from typeOfIRExpr.  See
   comment on the single use point below for rationale. */
static
Bool debug_only_hack_sameIRExprs_might_assert ( IRExpr* e1, IRExpr* e2 )
{
   if (e1->tag != e2->tag) return False;
   switch (e1->tag) {
      case Iex_Const: {
         /* The only interesting case */
         IRConst *c1 = e1->Iex.Const.con;
         IRConst *c2 = e2->Iex.Const.con;
         return c1->tag != c2->tag;
      }
      default:
         break;
   }
   return False;
}


/* Is this literally IRExpr_Const(IRConst_U32(0)) ? */
static Bool isZeroU32 ( IRExpr* e )
{
   return toBool( e->tag == Iex_Const 
                  && e->Iex.Const.con->tag == Ico_U32
                  && e->Iex.Const.con->Ico.U32 == 0);
}

/* Is this literally IRExpr_Const(IRConst_U64(0)) ?
   Currently unused; commented out to avoid compiler warning */
#if 0
static Bool isZeroU64 ( IRExpr* e )
{
   return toBool( e->tag == Iex_Const 
                  && e->Iex.Const.con->tag == Ico_U64
                  && e->Iex.Const.con->Ico.U64 == 0);
}
#endif

/* Is this literally IRExpr_Const(IRConst_V128(0)) ? */
static Bool isZeroV128 ( IRExpr* e )
{
   return toBool( e->tag == Iex_Const 
                  && e->Iex.Const.con->tag == Ico_V128
                  && e->Iex.Const.con->Ico.V128 == 0x0000);
}

/* Is this literally IRExpr_Const(IRConst_V256(0)) ? */
static Bool isZeroV256 ( IRExpr* e )
{
   return toBool( e->tag == Iex_Const 
                  && e->Iex.Const.con->tag == Ico_V256
                  && e->Iex.Const.con->Ico.V256 == 0x00000000);
}

/* Is this an integer constant with value 0 ? */
static Bool isZeroU ( IRExpr* e )
{
   if (e->tag != Iex_Const) return False;
   switch (e->Iex.Const.con->tag) {
      case Ico_U1:    return toBool( e->Iex.Const.con->Ico.U1  == 0);
      case Ico_U8:    return toBool( e->Iex.Const.con->Ico.U8  == 0);
      case Ico_U16:   return toBool( e->Iex.Const.con->Ico.U16 == 0);
      case Ico_U32:   return toBool( e->Iex.Const.con->Ico.U32 == 0);
      case Ico_U64:   return toBool( e->Iex.Const.con->Ico.U64 == 0);
      default: vpanic("isZeroU");
   }
}

/* Is this an integer constant with value 1---1b ? */
static Bool isOnesU ( IRExpr* e )
{
   if (e->tag != Iex_Const) return False;
   switch (e->Iex.Const.con->tag) {
      case Ico_U8:    return toBool( e->Iex.Const.con->Ico.U8  == 0xFF);
      case Ico_U16:   return toBool( e->Iex.Const.con->Ico.U16 == 0xFFFF);
      case Ico_U32:   return toBool( e->Iex.Const.con->Ico.U32
                                     == 0xFFFFFFFF);
      case Ico_U64:   return toBool( e->Iex.Const.con->Ico.U64
                                     == 0xFFFFFFFFFFFFFFFFULL);
      default: ppIRExpr(e); vpanic("isOnesU");
   }
}

static Bool notBool ( Bool b )
{
   if (b == True) return False;
   if (b == False) return True;
   vpanic("notBool");
}

/* Make a zero which has the same type as the result of the given
   primop. */
static IRExpr* mkZeroOfPrimopResultType ( IROp op )
{
   switch (op) {
      case Iop_CmpNE32: return IRExpr_Const(IRConst_U1(toBool(0)));
      case Iop_Xor8:  return IRExpr_Const(IRConst_U8(0));
      case Iop_Xor16: return IRExpr_Const(IRConst_U16(0));
      case Iop_Sub32:
      case Iop_Xor32: return IRExpr_Const(IRConst_U32(0));
      case Iop_And64:
      case Iop_Sub64:
      case Iop_Xor64: return IRExpr_Const(IRConst_U64(0));
      case Iop_XorV128:
      case Iop_AndV128: return IRExpr_Const(IRConst_V128(0));
      case Iop_XorV256:
      case Iop_AndV256: return IRExpr_Const(IRConst_V256(0));
      default: vpanic("mkZeroOfPrimopResultType: bad primop");
   }
}

/* Make a value containing all 1-bits, which has the same type as the
   result of the given primop. */
static IRExpr* mkOnesOfPrimopResultType ( IROp op )
{
   switch (op) {
      case Iop_CmpEQ32:
      case Iop_CmpEQ64:
         return IRExpr_Const(IRConst_U1(toBool(1)));
      case Iop_Or8:
         return IRExpr_Const(IRConst_U8(0xFF));
      case Iop_Or16:
         return IRExpr_Const(IRConst_U16(0xFFFF));
      case Iop_Or32:
         return IRExpr_Const(IRConst_U32(0xFFFFFFFF));
      case Iop_CmpEQ8x8:
      case Iop_Or64:
         return IRExpr_Const(IRConst_U64(0xFFFFFFFFFFFFFFFFULL));
      case Iop_CmpEQ8x16:
      case Iop_CmpEQ16x8:
      case Iop_CmpEQ32x4:
         return IRExpr_Const(IRConst_V128(0xFFFF));
      default:
         ppIROp(op);
         vpanic("mkOnesOfPrimopResultType: bad primop");
   }
}

/* Helpers for folding Clz32/64. */
static UInt fold_Clz64 ( ULong value )
{
   UInt i;
   vassert(value != 0ULL); /* no defined semantics for arg==0 */
   for (i = 0; i < 64; ++i) {
      if (0ULL != (value & (((ULong)1) << (63 - i)))) return i;
   }
   vassert(0);
   /*NOTREACHED*/
   return 0;
}

static UInt fold_Clz32 ( UInt value )
{
   UInt i;
   vassert(value != 0); /* no defined semantics for arg==0 */
   for (i = 0; i < 32; ++i) {
      if (0 != (value & (((UInt)1) << (31 - i)))) return i;
   }
   vassert(0);
   /*NOTREACHED*/
   return 0;
}

/* V64 holds 8 summary-constant bits in V128/V256 style.  Convert to
   the corresponding real constant. */
//XXX re-check this before use
//static ULong de_summarise_V64 ( UChar v64 )
//{
//   ULong r = 0;
//   if (v64 & (1<<0)) r |= 0x00000000000000FFULL;
//   if (v64 & (1<<1)) r |= 0x000000000000FF00ULL;
//   if (v64 & (1<<2)) r |= 0x0000000000FF0000ULL;
//   if (v64 & (1<<3)) r |= 0x00000000FF000000ULL;
//   if (v64 & (1<<4)) r |= 0x000000FF00000000ULL;
//   if (v64 & (1<<5)) r |= 0x0000FF0000000000ULL;
//   if (v64 & (1<<6)) r |= 0x00FF000000000000ULL;
//   if (v64 & (1<<7)) r |= 0xFF00000000000000ULL;
//   return r;
//}

/* Helper for arbitrary expression pattern matching in flat IR.  If
   'e' is a reference to a tmp, look it up in env -- repeatedly, if
   necessary -- until it resolves to a non-tmp.  Note that this can
   return NULL if it can't resolve 'e' to a new expression, which will
   be the case if 'e' is instead defined by an IRStmt (IRDirty or
   LLSC). */
static IRExpr* chase ( IRExpr** env, IRExpr* e )
{
   /* Why is this loop guaranteed to terminate?  Because all tmps must
      have definitions before use, hence a tmp cannot be bound
      (directly or indirectly) to itself. */
   while (e->tag == Iex_RdTmp) {
      if (0) { vex_printf("chase "); ppIRExpr(e); vex_printf("\n"); }
      e = env[(Int)e->Iex.RdTmp.tmp];
      if (e == NULL) break;
   }
   return e;
}

/* Similar to |chase|, but follows at most one level of tmp reference. */
static IRExpr* chase1 ( IRExpr** env, IRExpr* e )
{
   if (e == NULL || e->tag != Iex_RdTmp)
      return e;
   else
      return env[(Int)e->Iex.RdTmp.tmp];
}

static IRExpr* fold_Expr ( IRExpr** env, IRExpr* e )
{
   Int     shift;
   IRExpr* e2 = e; /* e2 is the result of folding e, if possible */

   switch (e->tag) {
   case Iex_Unop:
      /* UNARY ops */
      if (e->Iex.Unop.arg->tag == Iex_Const) {
         switch (e->Iex.Unop.op) {
         case Iop_1Uto8:
            e2 = IRExpr_Const(IRConst_U8(toUChar(
                    e->Iex.Unop.arg->Iex.Const.con->Ico.U1
                    ? 1 : 0)));
            break;
         case Iop_1Uto32:
            e2 = IRExpr_Const(IRConst_U32(
                    e->Iex.Unop.arg->Iex.Const.con->Ico.U1
                    ? 1 : 0));
            break;
         case Iop_1Uto64:
            e2 = IRExpr_Const(IRConst_U64(
                    e->Iex.Unop.arg->Iex.Const.con->Ico.U1
                    ? 1 : 0));
            break;

         case Iop_1Sto8:
            e2 = IRExpr_Const(IRConst_U8(toUChar(
                    e->Iex.Unop.arg->Iex.Const.con->Ico.U1
                    ? 0xFF : 0)));
            break;
         case Iop_1Sto16:
            e2 = IRExpr_Const(IRConst_U16(toUShort(
                    e->Iex.Unop.arg->Iex.Const.con->Ico.U1
                    ? 0xFFFF : 0)));
            break;
         case Iop_1Sto32:
            e2 = IRExpr_Const(IRConst_U32(
                    e->Iex.Unop.arg->Iex.Const.con->Ico.U1
                    ? 0xFFFFFFFF : 0));
            break;
         case Iop_1Sto64:
            e2 = IRExpr_Const(IRConst_U64(
                    e->Iex.Unop.arg->Iex.Const.con->Ico.U1
                    ? 0xFFFFFFFFFFFFFFFFULL : 0));
            break;

         case Iop_8Sto32: {
            UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U8;
            u32 <<= 24;
            u32 = (Int)u32 >> 24;   /* signed shift */
            e2 = IRExpr_Const(IRConst_U32(u32));
            break;
         }
         case Iop_16Sto32: {
            UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U16;
            u32 <<= 16;
            u32 = (Int)u32 >> 16;   /* signed shift */
            e2 = IRExpr_Const(IRConst_U32(u32));
            break;
         }
         case Iop_8Uto64:
            e2 = IRExpr_Const(IRConst_U64(
                    0xFFULL & e->Iex.Unop.arg->Iex.Const.con->Ico.U8));
            break;
         case Iop_16Uto64:
            e2 = IRExpr_Const(IRConst_U64(
                    0xFFFFULL & e->Iex.Unop.arg->Iex.Const.con->Ico.U16));
            break;
         case Iop_8Uto32:
            e2 = IRExpr_Const(IRConst_U32(
                    0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U8));
            break;
         case Iop_8Sto16: {
            UShort u16 = e->Iex.Unop.arg->Iex.Const.con->Ico.U8;
            u16 <<= 8;
            u16 = (Short)u16 >> 8;  /* signed shift */
            e2 = IRExpr_Const(IRConst_U16(u16));
            break;
         }
         case Iop_8Uto16:
            e2 = IRExpr_Const(IRConst_U16(
                    0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U8));
            break;
         case Iop_16Uto32:
            e2 = IRExpr_Const(IRConst_U32(
                    0xFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U16));
            break;
         case Iop_32to16:
            e2 = IRExpr_Const(IRConst_U16(toUShort(
                    0xFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)));
            break;
         case Iop_32to8:
            e2 = IRExpr_Const(IRConst_U8(toUChar(
                    0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)));
            break;
         case Iop_32to1:
            e2 = IRExpr_Const(IRConst_U1(toBool(
                    1 == (1 & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)
                 )));
            break;
         case Iop_64to1:
            e2 = IRExpr_Const(IRConst_U1(toBool(
                    1 == (1 & e->Iex.Unop.arg->Iex.Const.con->Ico.U64)
                 )));
            break;

         case Iop_NotV128:
            e2 = IRExpr_Const(IRConst_V128(
                    ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.V128)));
            break;
         case Iop_Not64:
            e2 = IRExpr_Const(IRConst_U64(
                    ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U64)));
            break;
         case Iop_Not32:
            e2 = IRExpr_Const(IRConst_U32(
                    ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U32)));
            break;
         case Iop_Not16:
            e2 = IRExpr_Const(IRConst_U16(toUShort(
                    ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U16))));
            break;
         case Iop_Not8:
            e2 = IRExpr_Const(IRConst_U8(toUChar(
                    ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U8))));
            break;

         case Iop_Not1:
            e2 = IRExpr_Const(IRConst_U1(
                    notBool(e->Iex.Unop.arg->Iex.Const.con->Ico.U1)));
            break;

         case Iop_64to8: {
            ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
            w64 &= 0xFFULL;
            e2 = IRExpr_Const(IRConst_U8( (UChar)w64 ));
            break;
         }
         case Iop_64to16: {
            ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
            w64 &= 0xFFFFULL;
            e2 = IRExpr_Const(IRConst_U16( (UShort)w64 ));
            break;
         }
         case Iop_64to32: {
            ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
            w64 &= 0x00000000FFFFFFFFULL;
            e2 = IRExpr_Const(IRConst_U32( (UInt)w64 ));
            break;
         }
         case Iop_64HIto32: {
            ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
            w64 >>= 32;
            e2 = IRExpr_Const(IRConst_U32( (UInt)w64 ));
            break;
         }
         case Iop_32Uto64:
            e2 = IRExpr_Const(IRConst_U64(
                    0xFFFFFFFFULL 
                    & e->Iex.Unop.arg->Iex.Const.con->Ico.U32));
            break;
         case Iop_16Sto64: {
            ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U16;
            u64 <<= 48;
            u64 = (Long)u64 >> 48;   /* signed shift */
            e2 = IRExpr_Const(IRConst_U64(u64));
            break;
         }
         case Iop_32Sto64: {
            ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
            u64 <<= 32;
            u64 = (Long)u64 >> 32;   /* signed shift */
            e2 = IRExpr_Const(IRConst_U64(u64));
            break;
         }

         case Iop_16to8: {
            UShort w16 = e->Iex.Unop.arg->Iex.Const.con->Ico.U16;
            w16 &= 0xFF;
            e2 = IRExpr_Const(IRConst_U8( (UChar)w16 ));
            break;
         }
         case Iop_16HIto8: {
            UShort w16 = e->Iex.Unop.arg->Iex.Const.con->Ico.U16;
            w16 >>= 8;
            w16 &= 0xFF;
            e2 = IRExpr_Const(IRConst_U8( (UChar)w16 ));
            break;
         }

         case Iop_CmpNEZ8:
            e2 = IRExpr_Const(IRConst_U1(toBool(
                    0 != 
                    (0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U8)
                 )));
            break;
         case Iop_CmpNEZ32:
            e2 = IRExpr_Const(IRConst_U1(toBool(
                    0 != 
                    (0xFFFFFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)
                 )));
            break;
         case Iop_CmpNEZ64:
            e2 = IRExpr_Const(IRConst_U1(toBool(
                    0ULL != e->Iex.Unop.arg->Iex.Const.con->Ico.U64
                 )));
            break;

         case Iop_CmpwNEZ32: {
            UInt w32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
            if (w32 == 0)
               e2 = IRExpr_Const(IRConst_U32( 0 ));
            else
               e2 = IRExpr_Const(IRConst_U32( 0xFFFFFFFF ));
            break;
         }
         case Iop_CmpwNEZ64: {
            ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
            if (w64 == 0)
               e2 = IRExpr_Const(IRConst_U64( 0 ));
            else
               e2 = IRExpr_Const(IRConst_U64( 0xFFFFFFFFFFFFFFFFULL ));
            break;
         }

         case Iop_Left32: {
            UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
            Int  s32 = (Int)(u32 & 0xFFFFFFFF);
            s32 = (s32 | (-s32));
            e2 = IRExpr_Const( IRConst_U32( (UInt)s32 ));
            break;
         }

         case Iop_Left64: {
            ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
            Long  s64 = (Long)u64;
            s64 = (s64 | (-s64));
            e2 = IRExpr_Const( IRConst_U64( (ULong)s64 ));
            break;
         }

         case Iop_Clz32: {
            UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
            if (u32 != 0)
               e2 = IRExpr_Const(IRConst_U32(fold_Clz32(u32)));
            break;
         }
         case Iop_Clz64: {
            ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
            if (u64 != 0ULL)
               e2 = IRExpr_Const(IRConst_U64(fold_Clz64(u64)));
            break;
         }

         /* For these vector ones, can't fold all cases, but at least
            do the most obvious one.  Could do better here using
            summarise/desummarise of vector constants, but too
            difficult to verify; hence just handle the zero cases. */
         case Iop_32UtoV128: {
            UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
            if (0 == u32) {
               e2 = IRExpr_Const(IRConst_V128(0x0000));
            } else {
               goto unhandled;
            }
            break;
         }
         case Iop_V128to64: {
            UShort v128 = e->Iex.Unop.arg->Iex.Const.con->Ico.V128;
            if (0 == ((v128 >> 0) & 0xFF)) {
               e2 = IRExpr_Const(IRConst_U64(0));
            } else {
               goto unhandled;
            }
            break;
         }
         case Iop_V128HIto64: {
            UShort v128 = e->Iex.Unop.arg->Iex.Const.con->Ico.V128;
            if (0 == ((v128 >> 8) & 0xFF)) {
               e2 = IRExpr_Const(IRConst_U64(0));
            } else {
               goto unhandled;
            }
            break;
         }
         case Iop_64UtoV128: {
            ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
            if (0 == u64) {
               e2 = IRExpr_Const(IRConst_V128(0x0000));
            } else {
               goto unhandled;
            }
            break;
         }

         /* Even stupider (although still correct ..) */
         case Iop_V256to64_0: case Iop_V256to64_1:
         case Iop_V256to64_2: case Iop_V256to64_3: {
            UInt v256 = e->Iex.Unop.arg->Iex.Const.con->Ico.V256;
            if (v256 == 0x00000000) {
               e2 = IRExpr_Const(IRConst_U64(0));
            } else {
               goto unhandled;
            }
            break;
         }

         case Iop_ZeroHI64ofV128: {
            /* Could do better here -- only need to look at the bottom 64 bits
               of the argument, really. */
            UShort v128 = e->Iex.Unop.arg->Iex.Const.con->Ico.V128;
            if (v128 == 0x0000) {
               e2 = IRExpr_Const(IRConst_V128(0x0000));
            } else {
               goto unhandled;
            }
            break;
         }

         default: 
            goto unhandled;
      }
      }
      break;

   case Iex_Binop:
      /* BINARY ops */
      if (e->Iex.Binop.arg1->tag == Iex_Const
          && e->Iex.Binop.arg2->tag == Iex_Const) {
         /* cases where both args are consts */
         switch (e->Iex.Binop.op) {

            /* -- Or -- */
            case Iop_Or8:
               e2 = IRExpr_Const(IRConst_U8(toUChar( 
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
                        | e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
               break;
            case Iop_Or16:
               e2 = IRExpr_Const(IRConst_U16(toUShort(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U16
                        | e->Iex.Binop.arg2->Iex.Const.con->Ico.U16))));
               break;
            case Iop_Or32:
               e2 = IRExpr_Const(IRConst_U32(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
                        | e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
               break;
            case Iop_Or64:
               e2 = IRExpr_Const(IRConst_U64(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
                        | e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
               break;
            case Iop_OrV128:
               e2 = IRExpr_Const(IRConst_V128(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.V128
                        | e->Iex.Binop.arg2->Iex.Const.con->Ico.V128)));
               break;

            /* -- Xor -- */
            case Iop_Xor8:
               e2 = IRExpr_Const(IRConst_U8(toUChar( 
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
                        ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
               break;
            case Iop_Xor16:
               e2 = IRExpr_Const(IRConst_U16(toUShort(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U16
                        ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U16))));
               break;
            case Iop_Xor32:
               e2 = IRExpr_Const(IRConst_U32(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
                        ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
               break;
            case Iop_Xor64:
               e2 = IRExpr_Const(IRConst_U64(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
                        ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
               break;
            case Iop_XorV128:
               e2 = IRExpr_Const(IRConst_V128(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.V128
                        ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.V128)));
               break;

            /* -- And -- */
            case Iop_And8:
               e2 = IRExpr_Const(IRConst_U8(toUChar( 
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
                        & e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
               break;
            case Iop_And16:
               e2 = IRExpr_Const(IRConst_U16(toUShort(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U16
                        & e->Iex.Binop.arg2->Iex.Const.con->Ico.U16))));
               break;
            case Iop_And32:
               e2 = IRExpr_Const(IRConst_U32(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
                        & e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
               break;
            case Iop_And64:
               e2 = IRExpr_Const(IRConst_U64(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
                        & e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
               break;
            case Iop_AndV128:
               e2 = IRExpr_Const(IRConst_V128(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.V128
                        & e->Iex.Binop.arg2->Iex.Const.con->Ico.V128)));
               break;

            /* -- Add -- */
            case Iop_Add8:
               e2 = IRExpr_Const(IRConst_U8(toUChar( 
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
                        + e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
               break;
            case Iop_Add32:
               e2 = IRExpr_Const(IRConst_U32(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
                        + e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
               break;
            case Iop_Add64:
               e2 = IRExpr_Const(IRConst_U64(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
                        + e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
               break;

            /* -- Sub -- */
            case Iop_Sub8:
               e2 = IRExpr_Const(IRConst_U8(toUChar( 
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
                        - e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
               break;
            case Iop_Sub32:
               e2 = IRExpr_Const(IRConst_U32(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
                        - e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
               break;
            case Iop_Sub64:
               e2 = IRExpr_Const(IRConst_U64(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
                        - e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
               break;

            /* -- Max32U -- */
            case Iop_Max32U: {
               UInt u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32;
               UInt u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
               UInt res  = u32a > u32b ? u32a : u32b;
               e2 = IRExpr_Const(IRConst_U32(res));
               break;
            }

            /* -- Mul -- */
            case Iop_Mul32:
               e2 = IRExpr_Const(IRConst_U32(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
                        * e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
               break;
            case Iop_Mul64:
               e2 = IRExpr_Const(IRConst_U64(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
                        * e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
               break;

            case Iop_MullS32: {
               /* very paranoid */
               UInt  u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32;
               UInt  u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
               Int   s32a = (Int)u32a;
               Int   s32b = (Int)u32b;
               Long  s64a = (Long)s32a;
               Long  s64b = (Long)s32b;
               Long  sres = s64a * s64b;
               ULong ures = (ULong)sres;
               e2 = IRExpr_Const(IRConst_U64(ures));
               break;
            }

            /* -- Shl -- */
            case Iop_Shl32:
               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
               shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
               if (shift >= 0 && shift <= 31)
                  e2 = IRExpr_Const(IRConst_U32(
                          (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
                           << shift)));
               break;
            case Iop_Shl64:
               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
               shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
               if (shift >= 0 && shift <= 63)
                  e2 = IRExpr_Const(IRConst_U64(
                          (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
                           << shift)));
               break;

            /* -- Sar -- */
            case Iop_Sar32: {
               /* paranoid ... */
               /*signed*/ Int s32;
               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
               s32   = (Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32);
               shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
               if (shift >= 0 && shift <= 31) {
                  s32 >>=/*signed*/ shift;
                  e2 = IRExpr_Const(IRConst_U32((UInt)s32));
               }
               break;
            }
            case Iop_Sar64: {
               /* paranoid ... */
               /*signed*/ Long s64;
               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
               s64   = (Long)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64);
               shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
               if (shift >= 0 && shift <= 63) {
                  s64 >>=/*signed*/ shift;
                  e2 = IRExpr_Const(IRConst_U64((ULong)s64));
               }
               break;
            }

            /* -- Shr -- */
            case Iop_Shr32: {
               /* paranoid ... */
               /*unsigned*/ UInt u32;
               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
               u32   = (UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32);
               shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
               if (shift >= 0 && shift <= 31) {
                  u32 >>=/*unsigned*/ shift;
                  e2 = IRExpr_Const(IRConst_U32(u32));
               }
               break;
            }
            case Iop_Shr64: {
               /* paranoid ... */
               /*unsigned*/ ULong u64;
               vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
               u64   = (ULong)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64);
               shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
               if (shift >= 0 && shift <= 63) {
                  u64 >>=/*unsigned*/ shift;
                  e2 = IRExpr_Const(IRConst_U64(u64));
               }
               break;
            }

            /* -- CmpEQ -- */
            case Iop_CmpEQ32:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
                        == e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))));
               break;
            case Iop_CmpEQ64:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
                        == e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))));
               break;

            /* -- CmpNE -- */
            case Iop_CmpNE8:
            case Iop_CasCmpNE8:
            case Iop_ExpCmpNE8:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       ((0xFF & e->Iex.Binop.arg1->Iex.Const.con->Ico.U8)
                        != (0xFF & e->Iex.Binop.arg2->Iex.Const.con->Ico.U8)))));
               break;
            case Iop_CmpNE32:
            case Iop_CasCmpNE32:
            case Iop_ExpCmpNE32:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
                        != e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))));
               break;
            case Iop_CmpNE64:
            case Iop_CasCmpNE64:
            case Iop_ExpCmpNE64:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
                        != e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))));
               break;

            /* -- CmpLEU -- */
            case Iop_CmpLE32U:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       ((UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
                        <= (UInt)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
               break;
            case Iop_CmpLE64U:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       ((ULong)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64)
                        <= (ULong)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)))));
               break;

            /* -- CmpLES -- */
            case Iop_CmpLE32S:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       ((Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
                        <= (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
               break;
            case Iop_CmpLE64S:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       ((Long)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64)
                        <= (Long)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)))));
               break;

            /* -- CmpLTS -- */
            case Iop_CmpLT32S:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       ((Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
                        < (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
               break;
            case Iop_CmpLT64S:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       ((Long)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64)
                        < (Long)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)))));
               break;

            /* -- CmpLTU -- */
            case Iop_CmpLT32U:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       ((UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
                        < (UInt)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
               break;
            case Iop_CmpLT64U:
               e2 = IRExpr_Const(IRConst_U1(toBool(
                       ((ULong)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64)
                        < (ULong)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)))));
               break;

            /* -- CmpORD -- */
            case Iop_CmpORD32S: {
               /* very paranoid */
               UInt  u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32;
               UInt  u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
               Int   s32a = (Int)u32a;
               Int   s32b = (Int)u32b;
               Int   r = 0x2; /* EQ */
               if (s32a < s32b) {
                  r = 0x8; /* LT */
               } 
               else if (s32a > s32b) {
                  r = 0x4; /* GT */
               }
               e2 = IRExpr_Const(IRConst_U32(r));
               break;
            }

            /* -- nHLto2n -- */
            case Iop_32HLto64:
               e2 = IRExpr_Const(IRConst_U64(
                       (((ULong)(e->Iex.Binop.arg1
                                  ->Iex.Const.con->Ico.U32)) << 32)
                       | ((ULong)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)) 
                    ));
               break;
            case Iop_64HLto128:
               /* We can't fold this, because there is no way to
                  express he result in IR, but at least pretend to
                  handle it, so as to stop getting blasted with
                  no-rule-for-this-primop messages. */
               break;
            /* For this vector one, can't fold all cases, but at
               least do the most obvious one.  Could do better here
               using summarise/desummarise of vector constants, but
               too difficult to verify; hence just handle the zero
               cases. */
            case Iop_64HLtoV128: {
               ULong argHi = e->Iex.Binop.arg1->Iex.Const.con->Ico.U64;
               ULong argLo = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
               if (0 == argHi && 0 == argLo) {
                  e2 = IRExpr_Const(IRConst_V128(0));
               } else {
                  goto unhandled;
               }
               break;
            }
            /* Same reasoning for the 256-bit version. */
            case Iop_V128HLtoV256: {
               IRExpr* argHi = e->Iex.Binop.arg1;
               IRExpr* argLo = e->Iex.Binop.arg2;
               if (isZeroV128(argHi) && isZeroV128(argLo)) {
                  e2 = IRExpr_Const(IRConst_V256(0));
               } else {
                  goto unhandled;
               }
               break;
            }

            /* -- V128 stuff -- */
            case Iop_InterleaveLO8x16: {
               /* This turns up a lot in Memcheck instrumentation of
                  Icc generated code.  I don't know why. */
               UShort arg1 =  e->Iex.Binop.arg1->Iex.Const.con->Ico.V128;
               UShort arg2 =  e->Iex.Binop.arg2->Iex.Const.con->Ico.V128;
               if (0 == arg1 && 0 == arg2) {
                  e2 = IRExpr_Const(IRConst_V128(0));
               } else {
                  goto unhandled;
               }
               break;
            }

            default:
               goto unhandled;
         }

      } else {

         /* other cases (identities, etc) */
         switch (e->Iex.Binop.op) {

            case Iop_Shl32:
            case Iop_Shl64:
            case Iop_Shr64:
            case Iop_Sar64:
               /* Shl32/Shl64/Shr64/Sar64(x,0) ==> x */
               if (isZeroU(e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               /* Shl32/Shl64/Shr64(0,x) ==> 0 */
               if (isZeroU(e->Iex.Binop.arg1)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               break;

            case Iop_Sar32:
            case Iop_Shr32:
               /* Shr32/Sar32(x,0) ==> x */
               if (isZeroU(e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               break;

            case Iop_Or8:
            case Iop_Or16:
            case Iop_Or32:
            case Iop_Or64:
            case Iop_Max32U:
               /* Or8/Or16/Or32/Or64/Max32U(x,0) ==> x */
               if (isZeroU(e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               /* Or8/Or16/Or32/Or64/Max32U(0,x) ==> x */
               if (isZeroU(e->Iex.Binop.arg1)) {
                  e2 = e->Iex.Binop.arg2;
                  break;
               }
               /* Or8/Or16/Or32/Or64/Max32U(x,1---1b) ==> 1---1b */
               /* Or8/Or16/Or32/Or64/Max32U(1---1b,x) ==> 1---1b */
               if (isOnesU(e->Iex.Binop.arg1) || isOnesU(e->Iex.Binop.arg2)) {
                  e2 = mkOnesOfPrimopResultType(e->Iex.Binop.op);
                  break;
               }
               /* Or8/Or16/Or32/Or64/Max32U(t,t) ==> t, for some IRTemp t */
               if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               break;

            case Iop_Add8:
               /* Add8(t,t) ==> t << 1.
                  Memcheck doesn't understand that
                  x+x produces a defined least significant bit, and it seems
                  simplest just to get rid of the problem by rewriting it
                  out, since the opportunity to do so exists. */
               if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                  e2 = IRExpr_Binop(Iop_Shl8, e->Iex.Binop.arg1,
                                    IRExpr_Const(IRConst_U8(1)));
                  break;
               }
               break;

               /* NB no Add16(t,t) case yet as no known test case exists */

            case Iop_Add32:
            case Iop_Add64:
               /* Add32/Add64(x,0) ==> x */
               if (isZeroU(e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               /* Add32/Add64(0,x) ==> x */
               if (isZeroU(e->Iex.Binop.arg1)) {
                  e2 = e->Iex.Binop.arg2;
                  break;
               }
               /* Add32/Add64(t,t) ==> t << 1. Same rationale as for Add8. */
               if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                  e2 = IRExpr_Binop(
                          e->Iex.Binop.op == Iop_Add32 ? Iop_Shl32 : Iop_Shl64,
                          e->Iex.Binop.arg1, IRExpr_Const(IRConst_U8(1)));
                  break;
               }
               break;

            case Iop_Sub32:
            case Iop_Sub64:
               /* Sub32/Sub64(x,0) ==> x */
               if (isZeroU(e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               /* Sub32/Sub64(t,t) ==> 0, for some IRTemp t */
               if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                  e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
                  break;
               }
               break;
            case Iop_Sub8x16:
               /* Sub8x16(x,0) ==> x */
               if (isZeroV128(e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               break;

            case Iop_And8:
            case Iop_And16:
            case Iop_And32:
            case Iop_And64:
               /* And8/And16/And32/And64(x,1---1b) ==> x */
               if (isOnesU(e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               /* And8/And16/And32/And64(1---1b,x) ==> x */
               if (isOnesU(e->Iex.Binop.arg1)) {
                  e2 = e->Iex.Binop.arg2;
                  break;
               }
               /* And8/And16/And32/And64(x,0) ==> 0 */
               if (isZeroU(e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg2;
                  break;
               }
               /* And8/And16/And32/And64(0,x) ==> 0 */
               if (isZeroU(e->Iex.Binop.arg1)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               /* And8/And16/And32/And64(t,t) ==> t, for some IRTemp t */
               if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               break;

            case Iop_AndV128:
            case Iop_AndV256:
               /* And8/And16/AndV128/AndV256(t,t) 
                  ==> t, for some IRTemp t */
               if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               /* Deal with either arg zero.  Could handle other And
                  cases here too. */
               if (e->Iex.Binop.op == Iop_AndV256
                   && (isZeroV256(e->Iex.Binop.arg1)
                       || isZeroV256(e->Iex.Binop.arg2))) {
                  e2 =  mkZeroOfPrimopResultType(e->Iex.Binop.op);
                  break;
               } else if (e->Iex.Binop.op == Iop_AndV128
                          && (isZeroV128(e->Iex.Binop.arg1)
                              || isZeroV128(e->Iex.Binop.arg2))) {
                  e2 =  mkZeroOfPrimopResultType(e->Iex.Binop.op);
                  break;
               }
               break;

            case Iop_OrV128:
            case Iop_OrV256:
               /* V128/V256(t,t) ==> t, for some IRTemp t */
               if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                  e2 = e->Iex.Binop.arg1;
                  break;
               }
               /* OrV128(t,0) ==> t */
               if (e->Iex.Binop.op == Iop_OrV128) {
                  if (isZeroV128(e->Iex.Binop.arg2)) {
                     e2 = e->Iex.Binop.arg1;
                     break;
                  }
                  if (isZeroV128(e->Iex.Binop.arg1)) {
                     e2 = e->Iex.Binop.arg2;
                     break;
                  }
               }
               /* OrV256(t,0) ==> t */
               if (e->Iex.Binop.op == Iop_OrV256) {
                  if (isZeroV256(e->Iex.Binop.arg2)) {
                     e2 = e->Iex.Binop.arg1;
                     break;
                  }
                  //Disabled because there's no known test case right now.
                  //if (isZeroV256(e->Iex.Binop.arg1)) {
                  //   e2 = e->Iex.Binop.arg2;
                  //   break;
                  //}
               }
               break;

            case Iop_Xor8:
            case Iop_Xor16:
            case Iop_Xor32:
            case Iop_Xor64:
            case Iop_XorV128:
            case Iop_XorV256:
               /* Xor8/16/32/64/V128(t,t) ==> 0, for some IRTemp t */
               if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                  e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
                  break;
               }
               /* XorV128(t,0) ==> t */
               if (e->Iex.Binop.op == Iop_XorV128) {
                  if (isZeroV128(e->Iex.Binop.arg2)) {
                     e2 = e->Iex.Binop.arg1;
                     break;
                  }
                  //Disabled because there's no known test case right now.
                  //if (isZeroV128(e->Iex.Binop.arg1)) {
                  //   e2 = e->Iex.Binop.arg2;
                  //   break;
                  //}
               } else {
                  /* Xor8/16/32/64(0,t) ==> t */
                  if (isZeroU(e->Iex.Binop.arg1)) {
                     e2 = e->Iex.Binop.arg2;
                     break;
                  }
                  /* Xor8/16/32/64(t,0) ==> t */
                  if (isZeroU(e->Iex.Binop.arg2)) {
                     e2 = e->Iex.Binop.arg1;
                     break;
                  }
               }
               break;

            case Iop_CmpNE32:
               /* CmpNE32(t,t) ==> 0, for some IRTemp t */
               if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                  e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
                  break;
               }
               /* CmpNE32(1Uto32(b), 0) ==> b */
               if (isZeroU32(e->Iex.Binop.arg2)) {
                  IRExpr* a1 = chase(env, e->Iex.Binop.arg1);
                  if (a1 && a1->tag == Iex_Unop 
                         && a1->Iex.Unop.op == Iop_1Uto32) {
                     e2 = a1->Iex.Unop.arg;
                     break;
                  }
               }
               break;

            case Iop_CmpEQ32:
            case Iop_CmpEQ64:
            case Iop_CmpEQ8x8:
            case Iop_CmpEQ8x16:
            case Iop_CmpEQ16x8:
            case Iop_CmpEQ32x4:
               if (sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
                  e2 = mkOnesOfPrimopResultType(e->Iex.Binop.op);
                  break;
               }
               break;

            default:
               break;
         }
      }
      break;

   case Iex_ITE:
      /* ITE */
      /* is the discriminant is a constant? */
      if (e->Iex.ITE.cond->tag == Iex_Const) {
         /* assured us by the IR type rules */
         vassert(e->Iex.ITE.cond->Iex.Const.con->tag == Ico_U1);
         e2 = e->Iex.ITE.cond->Iex.Const.con->Ico.U1
                 ? e->Iex.ITE.iftrue : e->Iex.ITE.iffalse;
      }
      else
      /* are the arms identical? (pretty weedy test) */
      if (sameIRExprs(env, e->Iex.ITE.iftrue,
                           e->Iex.ITE.iffalse)) {
         e2 = e->Iex.ITE.iffalse;
      }
      break;

   default:
      /* not considered */
      break;
   }

   /* Show cases where we've found but not folded 'op(t,t)'.  Be
      careful not to call sameIRExprs with values of different types,
      though, else it will assert (and so it should!).  We can't
      conveniently call typeOfIRExpr on the two args without a whole
      bunch of extra plumbing to pass in a type env, so just use a
      hacky test to check the arguments are not anything that might
      sameIRExprs to assert.  This is only OK because this kludge is
      only used for debug printing, not for "real" operation.  For
      "real" operation (ie, all other calls to sameIRExprs), it is
      essential that the to args have the same type.
 
      The "right" solution is to plumb the containing block's
      IRTypeEnv through to here and use typeOfIRExpr to be sure.  But
      that's a bunch of extra parameter passing which will just slow
      down the normal case, for no purpose. */
   if (vex_control.iropt_verbosity > 0 
       && e == e2 
       && e->tag == Iex_Binop
       && !debug_only_hack_sameIRExprs_might_assert(e->Iex.Binop.arg1,
                                                    e->Iex.Binop.arg2)
       && sameIRExprs(env, e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
      vex_printf("vex iropt: fold_Expr: no ident rule for: ");
      ppIRExpr(e);
      vex_printf("\n");
   }

   /* Show the overall results of folding. */
   if (DEBUG_IROPT && e2 != e) {
      vex_printf("FOLD: "); 
      ppIRExpr(e); vex_printf("  ->  ");
      ppIRExpr(e2); vex_printf("\n");
   }

   return e2;

 unhandled:
#  if 0
   vex_printf("\n\n");
   ppIRExpr(e);
   vpanic("fold_Expr: no rule for the above");
#  else
   if (vex_control.iropt_verbosity > 0) {
      vex_printf("vex iropt: fold_Expr: no const rule for: ");
      ppIRExpr(e);
      vex_printf("\n");
   }
   return e2;
#  endif
}


/* Apply the subst to a simple 1-level expression -- guaranteed to be
   1-level due to previous flattening pass. */

static IRExpr* subst_Expr ( IRExpr** env, IRExpr* ex )
{
   switch (ex->tag) {
      case Iex_RdTmp:
         if (env[(Int)ex->Iex.RdTmp.tmp] != NULL) {
            IRExpr *rhs = env[(Int)ex->Iex.RdTmp.tmp];
            if (rhs->tag == Iex_RdTmp)
               return rhs;
            if (rhs->tag == Iex_Const
                && rhs->Iex.Const.con->tag != Ico_F64i)
               return rhs;
         }
         /* not bound in env */
         return ex;

      case Iex_Const:
      case Iex_Get:
         return ex;

      case Iex_GetI:
         vassert(isIRAtom(ex->Iex.GetI.ix));
         return IRExpr_GetI(
            ex->Iex.GetI.descr,
            subst_Expr(env, ex->Iex.GetI.ix),
            ex->Iex.GetI.bias
         );

      case Iex_Qop: {
         IRQop* qop = ex->Iex.Qop.details;
         vassert(isIRAtom(qop->arg1));
         vassert(isIRAtom(qop->arg2));
         vassert(isIRAtom(qop->arg3));
         vassert(isIRAtom(qop->arg4));
         return IRExpr_Qop(
                   qop->op,
                   subst_Expr(env, qop->arg1),
                   subst_Expr(env, qop->arg2),
                   subst_Expr(env, qop->arg3),
                   subst_Expr(env, qop->arg4)
                );
      }

      case Iex_Triop: {
         IRTriop* triop = ex->Iex.Triop.details;
         vassert(isIRAtom(triop->arg1));
         vassert(isIRAtom(triop->arg2));
         vassert(isIRAtom(triop->arg3));
         return IRExpr_Triop(
                   triop->op,
                   subst_Expr(env, triop->arg1),
                   subst_Expr(env, triop->arg2),
                   subst_Expr(env, triop->arg3)
                );
      }

      case Iex_Binop:
         vassert(isIRAtom(ex->Iex.Binop.arg1));
         vassert(isIRAtom(ex->Iex.Binop.arg2));
         return IRExpr_Binop(
                   ex->Iex.Binop.op,
                   subst_Expr(env, ex->Iex.Binop.arg1),
                   subst_Expr(env, ex->Iex.Binop.arg2)
                );

      case Iex_Unop:
         vassert(isIRAtom(ex->Iex.Unop.arg));
         return IRExpr_Unop(
                   ex->Iex.Unop.op,
                   subst_Expr(env, ex->Iex.Unop.arg)
                );

      case Iex_Load:
         vassert(isIRAtom(ex->Iex.Load.addr));
         return IRExpr_Load(
                   ex->Iex.Load.end,
                   ex->Iex.Load.ty,
                   subst_Expr(env, ex->Iex.Load.addr)
                );

      case Iex_CCall: {
         Int      i;
         IRExpr** args2 = shallowCopyIRExprVec(ex->Iex.CCall.args);
         for (i = 0; args2[i]; i++) {
            vassert(isIRAtom(args2[i]));
            args2[i] = subst_Expr(env, args2[i]);
         }
         return IRExpr_CCall(
                   ex->Iex.CCall.cee,
                   ex->Iex.CCall.retty,
                   args2 
                );
      }

      case Iex_ITE:
         vassert(isIRAtom(ex->Iex.ITE.cond));
         vassert(isIRAtom(ex->Iex.ITE.iftrue));
         vassert(isIRAtom(ex->Iex.ITE.iffalse));
         return IRExpr_ITE(
                   subst_Expr(env, ex->Iex.ITE.cond),
                   subst_Expr(env, ex->Iex.ITE.iftrue),
                   subst_Expr(env, ex->Iex.ITE.iffalse)
                );

      default:
         vex_printf("\n\n"); ppIRExpr(ex);
         vpanic("subst_Expr");
      
   }
}


/* Apply the subst to stmt, then fold the result as much as possible.
   Much simplified due to stmt being previously flattened.  As a
   result of this, the stmt may wind up being turned into a no-op.  
*/
static IRStmt* subst_and_fold_Stmt ( IRExpr** env, IRStmt* st )
{
#  if 0
   vex_printf("\nsubst and fold stmt\n");
   ppIRStmt(st);
   vex_printf("\n");
#  endif

   switch (st->tag) {
      case Ist_AbiHint:
         vassert(isIRAtom(st->Ist.AbiHint.base));
         vassert(isIRAtom(st->Ist.AbiHint.nia));
         return IRStmt_AbiHint(
                   fold_Expr(env, subst_Expr(env, st->Ist.AbiHint.base)),
                   st->Ist.AbiHint.len,
                   fold_Expr(env, subst_Expr(env, st->Ist.AbiHint.nia))
                );
      case Ist_Put:
         vassert(isIRAtom(st->Ist.Put.data));
         return IRStmt_Put(
                   st->Ist.Put.offset, 
                   fold_Expr(env, subst_Expr(env, st->Ist.Put.data)) 
                );

      case Ist_PutI: {
         IRPutI *puti, *puti2;
         puti = st->Ist.PutI.details;
         vassert(isIRAtom(puti->ix));
         vassert(isIRAtom(puti->data));
         puti2 = mkIRPutI(puti->descr,
                          fold_Expr(env, subst_Expr(env, puti->ix)),
                          puti->bias,
                          fold_Expr(env, subst_Expr(env, puti->data)));
         return IRStmt_PutI(puti2);
      }

      case Ist_WrTmp:
         /* This is the one place where an expr (st->Ist.WrTmp.data) is
            allowed to be more than just a constant or a tmp. */
         return IRStmt_WrTmp(
                   st->Ist.WrTmp.tmp,
                   fold_Expr(env, subst_Expr(env, st->Ist.WrTmp.data))
                );

      case Ist_Store:
         vassert(isIRAtom(st->Ist.Store.addr));
         vassert(isIRAtom(st->Ist.Store.data));
         return IRStmt_Store(
                   st->Ist.Store.end,
                   fold_Expr(env, subst_Expr(env, st->Ist.Store.addr)),
                   fold_Expr(env, subst_Expr(env, st->Ist.Store.data))
                );

      case Ist_StoreG: {
         IRStoreG* sg = st->Ist.StoreG.details;
         vassert(isIRAtom(sg->addr));
         vassert(isIRAtom(sg->data));
         vassert(isIRAtom(sg->guard));
         IRExpr* faddr  = fold_Expr(env, subst_Expr(env, sg->addr));
         IRExpr* fdata  = fold_Expr(env, subst_Expr(env, sg->data));
         IRExpr* fguard = fold_Expr(env, subst_Expr(env, sg->guard));
         if (fguard->tag == Iex_Const) {
            /* The condition on this store has folded down to a constant. */
            vassert(fguard->Iex.Const.con->tag == Ico_U1);
            if (fguard->Iex.Const.con->Ico.U1 == False) {
               return IRStmt_NoOp();
            } else {
               vassert(fguard->Iex.Const.con->Ico.U1 == True);
               return IRStmt_Store(sg->end, faddr, fdata);
            }
         }
         return IRStmt_StoreG(sg->end, faddr, fdata, fguard);
      }

      case Ist_LoadG: {
         /* This is complicated.  If the guard folds down to 'false',
            we can replace it with an assignment 'dst := alt', but if
            the guard folds down to 'true', we can't conveniently
            replace it with an unconditional load, because doing so
            requires generating a new temporary, and that is not easy
            to do at this point. */
         IRLoadG* lg = st->Ist.LoadG.details;
         vassert(isIRAtom(lg->addr));
         vassert(isIRAtom(lg->alt));
         vassert(isIRAtom(lg->guard));
         IRExpr* faddr  = fold_Expr(env, subst_Expr(env, lg->addr));
         IRExpr* falt   = fold_Expr(env, subst_Expr(env, lg->alt));
         IRExpr* fguard = fold_Expr(env, subst_Expr(env, lg->guard));
         if (fguard->tag == Iex_Const) {
            /* The condition on this load has folded down to a constant. */
            vassert(fguard->Iex.Const.con->tag == Ico_U1);
            if (fguard->Iex.Const.con->Ico.U1 == False) {
               /* The load is not going to happen -- instead 'alt' is
                  assigned to 'dst'.  */
               return IRStmt_WrTmp(lg->dst, falt);
            } else {
               vassert(fguard->Iex.Const.con->Ico.U1 == True);
               /* The load is always going to happen.  We want to
                  convert to an unconditional load and assign to 'dst'
                  (IRStmt_WrTmp).  Problem is we need an extra temp to
                  hold the loaded value, but none is available.
                  Instead, reconstitute the conditional load (with
                  folded args, of course) and let the caller of this
                  routine deal with the problem. */
            }
         }
         return IRStmt_LoadG(lg->end, lg->cvt, lg->dst, faddr, falt, fguard);
      }

      case Ist_CAS: {
         IRCAS *cas, *cas2;
         cas = st->Ist.CAS.details;
         vassert(isIRAtom(cas->addr));
         vassert(cas->expdHi == NULL || isIRAtom(cas->expdHi));
         vassert(isIRAtom(cas->expdLo));
         vassert(cas->dataHi == NULL || isIRAtom(cas->dataHi));
         vassert(isIRAtom(cas->dataLo));
         cas2 = mkIRCAS(
                   cas->oldHi, cas->oldLo, cas->end, 
                   fold_Expr(env, subst_Expr(env, cas->addr)),
                   cas->expdHi ? fold_Expr(env, subst_Expr(env, cas->expdHi))
                               : NULL,
                   fold_Expr(env, subst_Expr(env, cas->expdLo)),
                   cas->dataHi ? fold_Expr(env, subst_Expr(env, cas->dataHi))
                               : NULL,
                   fold_Expr(env, subst_Expr(env, cas->dataLo))
                );
         return IRStmt_CAS(cas2);
      }

      case Ist_LLSC:
         vassert(isIRAtom(st->Ist.LLSC.addr));
         if (st->Ist.LLSC.storedata)
            vassert(isIRAtom(st->Ist.LLSC.storedata));
         return IRStmt_LLSC(
                   st->Ist.LLSC.end,
                   st->Ist.LLSC.result,
                   fold_Expr(env, subst_Expr(env, st->Ist.LLSC.addr)),
                   st->Ist.LLSC.storedata
                      ? fold_Expr(env, subst_Expr(env, st->Ist.LLSC.storedata))
                      : NULL
                );

      case Ist_Dirty: {
         Int     i;
         IRDirty *d, *d2;
         d = st->Ist.Dirty.details;
         d2 = emptyIRDirty();
         *d2 = *d;
         d2->args = shallowCopyIRExprVec(d2->args);
         if (d2->mFx != Ifx_None) {
            vassert(isIRAtom(d2->mAddr));
            d2->mAddr = fold_Expr(env, subst_Expr(env, d2->mAddr));
         }
         vassert(isIRAtom(d2->guard));
         d2->guard = fold_Expr(env, subst_Expr(env, d2->guard));
         for (i = 0; d2->args[i]; i++) {
            IRExpr* arg = d2->args[i];
            if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg))) {
               vassert(isIRAtom(arg));
               d2->args[i] = fold_Expr(env, subst_Expr(env, arg));
            }
         }
         return IRStmt_Dirty(d2);
      }

      case Ist_IMark:
         return IRStmt_IMark(st->Ist.IMark.addr,
                             st->Ist.IMark.len,
                             st->Ist.IMark.delta);

      case Ist_NoOp:
         return IRStmt_NoOp();

      case Ist_MBE:
         return IRStmt_MBE(st->Ist.MBE.event);

      case Ist_Exit: {
         IRExpr* fcond;
         vassert(isIRAtom(st->Ist.Exit.guard));
         fcond = fold_Expr(env, subst_Expr(env, st->Ist.Exit.guard));
         if (fcond->tag == Iex_Const) {
            /* Interesting.  The condition on this exit has folded down to
               a constant. */
            vassert(fcond->Iex.Const.con->tag == Ico_U1);
            if (fcond->Iex.Const.con->Ico.U1 == False) {
               /* exit is never going to happen, so dump the statement. */
               return IRStmt_NoOp();
            } else {
               vassert(fcond->Iex.Const.con->Ico.U1 == True);
               /* Hmmm.  The exit has become unconditional.  Leave it
                  as it is for now, since we'd have to truncate the BB
                  at this point, which is tricky.  Such truncation is
                  done later by the dead-code elimination pass. */
               /* fall out into the reconstruct-the-exit code. */
               if (vex_control.iropt_verbosity > 0) 
                  /* really a misuse of vex_control.iropt_verbosity */
                  vex_printf("vex iropt: IRStmt_Exit became unconditional\n");
            }
         }
         return IRStmt_Exit(fcond, st->Ist.Exit.jk,
                                   st->Ist.Exit.dst, st->Ist.Exit.offsIP);
      }

   default:
      vex_printf("\n"); ppIRStmt(st);
      vpanic("subst_and_fold_Stmt");
   }
}


IRSB* cprop_BB ( IRSB* in )
{
   Int      i;
   IRSB*    out;
   IRStmt*  st2;
   Int      n_tmps = in->tyenv->types_used;
   IRExpr** env = LibVEX_Alloc_inline(n_tmps * sizeof(IRExpr*));
   /* Keep track of IRStmt_LoadGs that we need to revisit after
      processing all the other statements. */
   const Int N_FIXUPS = 16;
   Int fixups[N_FIXUPS]; /* indices in the stmt array of 'out' */
   Int n_fixups = 0;

   out = emptyIRSB();
   out->tyenv = deepCopyIRTypeEnv( in->tyenv );

   /* Set up the env with which travels forward.  This holds a
      substitution, mapping IRTemps to IRExprs. The environment 
      is to be applied as we move along.  Keys are IRTemps.
      Values are IRExpr*s.
   */
   for (i = 0; i < n_tmps; i++)
      env[i] = NULL;

   /* For each original SSA-form stmt ... */
   for (i = 0; i < in->stmts_used; i++) {

      /* First apply the substitution to the current stmt.  This
         propagates in any constants and tmp-tmp assignments
         accumulated prior to this point.  As part of the subst_Stmt
         call, also then fold any constant expressions resulting. */

      st2 = in->stmts[i];

      /* perhaps st2 is already a no-op? */
      if (st2->tag == Ist_NoOp) continue;

      st2 = subst_and_fold_Stmt( env, st2 );

      /* Deal with some post-folding special cases. */
      switch (st2->tag) {

         /* If the statement has been folded into a no-op, forget
            it. */
         case Ist_NoOp:
            continue;

         /* If the statement assigns to an IRTemp add it to the
            running environment. This is for the benefit of copy
            propagation and to allow sameIRExpr look through
            IRTemps. */
         case Ist_WrTmp: {
            vassert(env[(Int)(st2->Ist.WrTmp.tmp)] == NULL);
            env[(Int)(st2->Ist.WrTmp.tmp)] = st2->Ist.WrTmp.data;

            /* 't1 = t2' -- don't add to BB; will be optimized out */
            if (st2->Ist.WrTmp.data->tag == Iex_RdTmp)
               continue;

            /* 't = const' && 'const != F64i' -- don't add to BB 
               Note, we choose not to propagate const when const is an
               F64i, so that F64i literals can be CSE'd later.  This
               helps x86 floating point code generation. */
            if (st2->Ist.WrTmp.data->tag == Iex_Const
                && st2->Ist.WrTmp.data->Iex.Const.con->tag != Ico_F64i) {
               continue;
            }
            /* else add it to the output, as normal */
            break;
         }

         case Ist_LoadG: {
            IRLoadG* lg    = st2->Ist.LoadG.details;
            IRExpr*  guard = lg->guard;
            if (guard->tag == Iex_Const) {
               /* The guard has folded to a constant, and that
                  constant must be 1:I1, since subst_and_fold_Stmt
                  folds out the case 0:I1 by itself. */
               vassert(guard->Iex.Const.con->tag == Ico_U1);
               vassert(guard->Iex.Const.con->Ico.U1 == True);
               /* Add a NoOp here as a placeholder, and make a note of
                  where it is in the output block.  Afterwards we'll
                  come back here and transform the NoOp and the LoadG
                  into a load-convert pair.  The fixups[] entry
                  refers to the inserted NoOp, and we expect to find
                  the relevant LoadG immediately after it. */
               vassert(n_fixups >= 0 && n_fixups <= N_FIXUPS);
               if (n_fixups < N_FIXUPS) {
                  fixups[n_fixups++] = out->stmts_used;
                  addStmtToIRSB( out, IRStmt_NoOp() );
               }
            }
            /* And always add the LoadG to the output, regardless. */
            break;
         }

      default:
         break;
      }

      /* Not interesting, copy st2 into the output block. */
      addStmtToIRSB( out, st2 );
   }

#  if STATS_IROPT
   vex_printf("sameIRExpr: invoked = %u/%u  equal = %u/%u max_nodes = %u\n",
              invocation_count, recursion_count, success_count,
              recursion_success_count, max_nodes_visited);
#  endif

   out->next     = subst_Expr( env, in->next );
   out->jumpkind = in->jumpkind;
   out->offsIP   = in->offsIP;

   /* Process any leftover unconditional LoadGs that we noticed
      in the main pass. */
   vassert(n_fixups >= 0 && n_fixups <= N_FIXUPS);
   for (i = 0; i < n_fixups; i++) {
      Int ix = fixups[i];
      /* Carefully verify that the LoadG has the expected form. */
      vassert(ix >= 0 && ix+1 < out->stmts_used);
      IRStmt* nop = out->stmts[ix];
      IRStmt* lgu = out->stmts[ix+1];
      vassert(nop->tag == Ist_NoOp);
      vassert(lgu->tag == Ist_LoadG);
      IRLoadG* lg    = lgu->Ist.LoadG.details;
      IRExpr*  guard = lg->guard;
      vassert(guard->Iex.Const.con->tag == Ico_U1);
      vassert(guard->Iex.Const.con->Ico.U1 == True);
      /* Figure out the load and result types, and the implied
         conversion operation. */
      IRType cvtRes = Ity_INVALID, cvtArg = Ity_INVALID;
      typeOfIRLoadGOp(lg->cvt, &cvtRes, &cvtArg);
      IROp cvtOp = Iop_INVALID;
      switch (lg->cvt) {
         case ILGop_IdentV128:
         case ILGop_Ident64:
         case ILGop_Ident32: break;
         case ILGop_8Uto32:  cvtOp = Iop_8Uto32;  break;
         case ILGop_8Sto32:  cvtOp = Iop_8Sto32;  break;
         case ILGop_16Uto32: cvtOp = Iop_16Uto32; break;
         case ILGop_16Sto32: cvtOp = Iop_16Sto32; break;
         default: vpanic("cprop_BB: unhandled ILGOp");
      }
      /* Replace the placeholder NoOp by the required unconditional
         load. */
      IRTemp tLoaded = newIRTemp(out->tyenv, cvtArg);
      out->stmts[ix] 
         = IRStmt_WrTmp(tLoaded,
                        IRExpr_Load(lg->end, cvtArg, lg->addr));
      /* Replace the LoadG by a conversion from the loaded value's
         type to the required result type. */
      out->stmts[ix+1]
         = IRStmt_WrTmp(
              lg->dst, cvtOp == Iop_INVALID
                          ? IRExpr_RdTmp(tLoaded)
                          : IRExpr_Unop(cvtOp, IRExpr_RdTmp(tLoaded)));
   }

   return out;
}


/*---------------------------------------------------------------*/
/*--- Dead code (t = E) removal                               ---*/
/*---------------------------------------------------------------*/

/* As a side effect, also removes all code following an unconditional
   side exit. */

/* The type of the HashHW map is: a map from IRTemp to nothing
   -- really just operating a set or IRTemps.
*/

inline
static void addUses_Temp ( Bool* set, IRTemp tmp )
{
   set[(Int)tmp] = True;
}

static void addUses_Expr ( Bool* set, IRExpr* e )
{
   Int i;
   switch (e->tag) {
      case Iex_GetI:
         addUses_Expr(set, e->Iex.GetI.ix);
         return;
      case Iex_ITE:
         addUses_Expr(set, e->Iex.ITE.cond);
         addUses_Expr(set, e->Iex.ITE.iftrue);
         addUses_Expr(set, e->Iex.ITE.iffalse);
         return;
      case Iex_CCall:
         for (i = 0; e->Iex.CCall.args[i]; i++)
            addUses_Expr(set, e->Iex.CCall.args[i]);
         return;
      case Iex_Load:
         addUses_Expr(set, e->Iex.Load.addr);
         return;
      case Iex_Qop:
         addUses_Expr(set, e->Iex.Qop.details->arg1);
         addUses_Expr(set, e->Iex.Qop.details->arg2);
         addUses_Expr(set, e->Iex.Qop.details->arg3);
         addUses_Expr(set, e->Iex.Qop.details->arg4);
         return;
      case Iex_Triop:
         addUses_Expr(set, e->Iex.Triop.details->arg1);
         addUses_Expr(set, e->Iex.Triop.details->arg2);
         addUses_Expr(set, e->Iex.Triop.details->arg3);
         return;
      case Iex_Binop:
         addUses_Expr(set, e->Iex.Binop.arg1);
         addUses_Expr(set, e->Iex.Binop.arg2);
         return;
      case Iex_Unop:
         addUses_Expr(set, e->Iex.Unop.arg);
         return;
      case Iex_RdTmp:
         addUses_Temp(set, e->Iex.RdTmp.tmp);
         return;
      case Iex_Const:
      case Iex_Get:
         return;
      default:
         vex_printf("\n");
         ppIRExpr(e);
         vpanic("addUses_Expr");
   }
}

static void addUses_Stmt ( Bool* set, IRStmt* st )
{
   Int      i;
   IRDirty* d;
   IRCAS*   cas;
   switch (st->tag) {
      case Ist_AbiHint:
         addUses_Expr(set, st->Ist.AbiHint.base);
         addUses_Expr(set, st->Ist.AbiHint.nia);
         return;
      case Ist_PutI:
         addUses_Expr(set, st->Ist.PutI.details->ix);
         addUses_Expr(set, st->Ist.PutI.details->data);
         return;
      case Ist_WrTmp:
         addUses_Expr(set, st->Ist.WrTmp.data);
         return;
      case Ist_Put:
         addUses_Expr(set, st->Ist.Put.data);
         return;
      case Ist_Store:
         addUses_Expr(set, st->Ist.Store.addr);
         addUses_Expr(set, st->Ist.Store.data);
         return;
      case Ist_StoreG: {
         IRStoreG* sg = st->Ist.StoreG.details;
         addUses_Expr(set, sg->addr);
         addUses_Expr(set, sg->data);
         addUses_Expr(set, sg->guard);
         return;
      }
      case Ist_LoadG: {
         IRLoadG* lg = st->Ist.LoadG.details;
         addUses_Expr(set, lg->addr);
         addUses_Expr(set, lg->alt);
         addUses_Expr(set, lg->guard);
         return;
      }
      case Ist_CAS:
         cas = st->Ist.CAS.details;
         addUses_Expr(set, cas->addr);
         if (cas->expdHi)
            addUses_Expr(set, cas->expdHi);
         addUses_Expr(set, cas->expdLo);
         if (cas->dataHi)
            addUses_Expr(set, cas->dataHi);
         addUses_Expr(set, cas->dataLo);
         return;
      case Ist_LLSC:
         addUses_Expr(set, st->Ist.LLSC.addr);
         if (st->Ist.LLSC.storedata)
            addUses_Expr(set, st->Ist.LLSC.storedata);
         return;
      case Ist_Dirty:
         d = st->Ist.Dirty.details;
         if (d->mFx != Ifx_None)
            addUses_Expr(set, d->mAddr);
         addUses_Expr(set, d->guard);
         for (i = 0; d->args[i] != NULL; i++) {
            IRExpr* arg = d->args[i];
            if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg)))
               addUses_Expr(set, arg);
         }
         return;
      case Ist_NoOp:
      case Ist_IMark:
      case Ist_MBE:
         return;
      case Ist_Exit:
         addUses_Expr(set, st->Ist.Exit.guard);
         return;
      default:
         vex_printf("\n");
         ppIRStmt(st);
         vpanic("addUses_Stmt");
   }
}


/* Is this literally IRExpr_Const(IRConst_U1(False)) ? */
static Bool isZeroU1 ( IRExpr* e )
{
   return toBool( e->tag == Iex_Const
                  && e->Iex.Const.con->tag == Ico_U1
                  && e->Iex.Const.con->Ico.U1 == False );
}

/* Is this literally IRExpr_Const(IRConst_U1(True)) ? */
static Bool isOneU1 ( IRExpr* e )
{
   return toBool( e->tag == Iex_Const
                  && e->Iex.Const.con->tag == Ico_U1
                  && e->Iex.Const.con->Ico.U1 == True );
}


/* Note, this destructively modifies the given IRSB. */

/* Scan backwards through statements, carrying a set of IRTemps which
   are known to be used after the current point.  On encountering 't =
   E', delete the binding if it is not used.  Otherwise, add any temp
   uses to the set and keep on moving backwards.

   As an enhancement, the first (backwards) pass searches for IR exits
   with always-taken conditions and notes the location of the earliest
   one in the block.  If any such are found, a second pass copies the
   exit destination and jump kind to the bb-end.  Then, the exit and
   all statements following it are turned into no-ops.
*/

/* notstatic */ void do_deadcode_BB ( IRSB* bb )
{
   Int     i, i_unconditional_exit;
   Int     n_tmps = bb->tyenv->types_used;
   Bool*   set = LibVEX_Alloc_inline(n_tmps * sizeof(Bool));
   IRStmt* st;

   for (i = 0; i < n_tmps; i++)
      set[i] = False;

   /* start off by recording IRTemp uses in the next field. */
   addUses_Expr(set, bb->next);

   /* First pass */

   /* Work backwards through the stmts */
   i_unconditional_exit = -1;
   for (i = bb->stmts_used-1; i >= 0; i--) {
      st = bb->stmts[i];
      if (st->tag == Ist_NoOp)
         continue;
      /* take note of any unconditional exits */
      if (st->tag == Ist_Exit
          && isOneU1(st->Ist.Exit.guard))
         i_unconditional_exit = i;
      if (st->tag == Ist_WrTmp
          && set[(Int)(st->Ist.WrTmp.tmp)] == False) {
          /* it's an IRTemp which never got used.  Delete it. */
         if (DEBUG_IROPT) {
            vex_printf("DEAD: ");
            ppIRStmt(st);
            vex_printf("\n");
         }
         bb->stmts[i] = IRStmt_NoOp();
      }
      else
      if (st->tag == Ist_Dirty
          && st->Ist.Dirty.details->guard
          && isZeroU1(st->Ist.Dirty.details->guard)) {
         /* This is a dirty helper which will never get called.
            Delete it. */
         bb->stmts[i] = IRStmt_NoOp();
       }
       else {
         /* Note any IRTemp uses made by the current statement. */
         addUses_Stmt(set, st);
      }
   }

   /* Optional second pass: if any unconditional exits were found, 
      delete them and all following statements. */

   if (i_unconditional_exit != -1) {
      if (0) vex_printf("ZAPPING ALL FORWARDS from %d\n", 
                        i_unconditional_exit);
      vassert(i_unconditional_exit >= 0 
              && i_unconditional_exit < bb->stmts_used);
      bb->next 
         = IRExpr_Const( bb->stmts[i_unconditional_exit]->Ist.Exit.dst );
      bb->jumpkind
         = bb->stmts[i_unconditional_exit]->Ist.Exit.jk;
      bb->offsIP
         = bb->stmts[i_unconditional_exit]->Ist.Exit.offsIP;
      for (i = i_unconditional_exit; i < bb->stmts_used; i++)
         bb->stmts[i] = IRStmt_NoOp();
   }
}


/*---------------------------------------------------------------*/
/*--- Specialisation of helper function calls, in             ---*/
/*--- collaboration with the front end                        ---*/
/*---------------------------------------------------------------*/

static 
IRSB* spec_helpers_BB(
         IRSB* bb,
         IRExpr* (*specHelper) (const HChar*, IRExpr**, IRStmt**, Int)
      )
{
   Int     i;
   IRStmt* st;
   IRExpr* ex;
   Bool    any = False;

   for (i = bb->stmts_used-1; i >= 0; i--) {
      st = bb->stmts[i];

      if (st->tag != Ist_WrTmp
          || st->Ist.WrTmp.data->tag != Iex_CCall)
         continue;

      ex = (*specHelper)( st->Ist.WrTmp.data->Iex.CCall.cee->name,
                          st->Ist.WrTmp.data->Iex.CCall.args,
                          &bb->stmts[0], i );
      if (!ex)
        /* the front end can't think of a suitable replacement */
        continue;

      /* We got something better.  Install it in the bb. */
      any = True;
      bb->stmts[i]
         = IRStmt_WrTmp(st->Ist.WrTmp.tmp, ex);

      if (0) {
         vex_printf("SPEC: ");
         ppIRExpr(st->Ist.WrTmp.data);
         vex_printf("  -->  ");
         ppIRExpr(ex);
         vex_printf("\n");
      }
   }

   if (any)
      bb = flatten_BB(bb);
   return bb;
}


/*---------------------------------------------------------------*/
/*--- Determination of guest state aliasing relationships     ---*/
/*---------------------------------------------------------------*/

/* These are helper functions for CSE and GetI/PutI transformations.

   Determine, to the extent possible, the relationship between two
   guest state accesses.  The possible outcomes are:

   * Exact alias.  These two accesses denote precisely the same
     piece of the guest state.

   * Definitely no alias.  These two accesses are guaranteed not to
     overlap any part of the guest state.

   * Unknown -- if neither of the above can be established.

   If in doubt, return Unknown.  */

typedef
   enum { ExactAlias, NoAlias, UnknownAlias }
   GSAliasing;


/* Produces the alias relation between an indexed guest
   state access and a non-indexed access. */

static
GSAliasing getAliasingRelation_IC ( IRRegArray* descr1, IRExpr* ix1,
                                    Int offset2, IRType ty2 )
{
   UInt minoff1, maxoff1, minoff2, maxoff2;

   getArrayBounds( descr1, &minoff1, &maxoff1 );
   minoff2 = offset2;
   maxoff2 = minoff2 + sizeofIRType(ty2) - 1;

   if (maxoff1 < minoff2 || maxoff2 < minoff1)
      return NoAlias;

   /* Could probably do better here if required.  For the moment
      however just claim not to know anything more. */
   return UnknownAlias;
}


/* Produces the alias relation between two indexed guest state
   accesses. */

static
GSAliasing getAliasingRelation_II ( 
              IRRegArray* descr1, IRExpr* ix1, Int bias1,
              IRRegArray* descr2, IRExpr* ix2, Int bias2
           )
{
   UInt minoff1, maxoff1, minoff2, maxoff2;
   Int  iters;

   /* First try hard to show they don't alias. */
   getArrayBounds( descr1, &minoff1, &maxoff1 );
   getArrayBounds( descr2, &minoff2, &maxoff2 );
   if (maxoff1 < minoff2 || maxoff2 < minoff1)
      return NoAlias;

   /* So the two arrays at least partially overlap.  To get any
      further we'll have to be sure that the descriptors are
      identical. */
   if (!eqIRRegArray(descr1, descr2))
      return UnknownAlias;

   /* The descriptors are identical.  Now the only difference can be
      in the index expressions.  If they cannot be shown to be
      identical, we have to say we don't know what the aliasing
      relation will be.  Now, since the IR is flattened, the index
      expressions should be atoms -- either consts or tmps.  So that
      makes the comparison simple. */
   vassert(isIRAtom(ix1));
   vassert(isIRAtom(ix2));
   if (!eqIRAtom(ix1,ix2))
      return UnknownAlias;

   /* Ok, the index expressions are identical.  So now the only way
      they can be different is in the bias.  Normalise this
      paranoidly, to reliably establish equality/non-equality. */

   /* So now we know that the GetI and PutI index the same array
      with the same base.  Are the offsets the same, modulo the
      array size?  Do this paranoidly. */
   vassert(descr1->nElems == descr2->nElems);
   vassert(descr1->elemTy == descr2->elemTy);
   vassert(descr1->base   == descr2->base);
   iters = 0;
   while (bias1 < 0 || bias2 < 0) {
      bias1 += descr1->nElems;
      bias2 += descr1->nElems;
      iters++;
      if (iters > 10)
         vpanic("getAliasingRelation: iters");
   }
   vassert(bias1 >= 0 && bias2 >= 0);
   bias1 %= descr1->nElems;
   bias2 %= descr1->nElems;
   vassert(bias1 >= 0 && bias1 < descr1->nElems);
   vassert(bias2 >= 0 && bias2 < descr1->nElems);

   /* Finally, biasP and biasG are normalised into the range 
      0 .. descrP/G->nElems - 1.  And so we can establish
      equality/non-equality. */

   return bias1==bias2 ? ExactAlias : NoAlias;
}


/*---------------------------------------------------------------*/
/*--- Common Subexpression Elimination                        ---*/
/*---------------------------------------------------------------*/

/* Expensive in time and space. */

/* Uses two environments: 
   a IRTemp -> IRTemp mapping 
   a mapping from AvailExpr* to IRTemp 
*/

typedef
   struct {
      enum { TCc, TCt } tag;
      union { IRTemp tmp; IRConst* con; } u;
   }
   TmpOrConst;

static Bool eqTmpOrConst ( TmpOrConst* tc1, TmpOrConst* tc2 )
{
   if (tc1->tag != tc2->tag)
      return False;
   switch (tc1->tag) {
      case TCc:
         return eqIRConst(tc1->u.con, tc2->u.con);
      case TCt:
         return tc1->u.tmp == tc2->u.tmp;
      default:
         vpanic("eqTmpOrConst");
   }
}

static Bool eqIRCallee ( IRCallee* cee1, IRCallee* cee2 )
{
   Bool eq = cee1->addr == cee2->addr;
   if (eq) {
      vassert(cee1->regparms == cee2->regparms);
      vassert(cee1->mcx_mask == cee2->mcx_mask);
      /* Names should be the same too, but we don't bother to
         check. */
   }
   return eq;
}

/* Convert an atomic IRExpr* to a TmpOrConst. */
static void irExpr_to_TmpOrConst ( /*OUT*/TmpOrConst* tc, IRExpr* e )
{
   switch (e->tag) {
      case Iex_RdTmp:
         tc->tag   = TCt;
         tc->u.tmp = e->Iex.RdTmp.tmp;
         break;
      case Iex_Const:
         tc->tag   = TCc;
         tc->u.con = e->Iex.Const.con;
         break;
      default:
         /* Getting here is a serious error.  It means that the
            presented arg isn't an IR atom, as it should be. */
         vpanic("irExpr_to_TmpOrConst");
   }
}

/* Convert a TmpOrConst to an atomic IRExpr*. */
static IRExpr* tmpOrConst_to_IRExpr ( TmpOrConst* tc )
{
   switch (tc->tag) {
      case TCc: return IRExpr_Const(tc->u.con);
      case TCt: return IRExpr_RdTmp(tc->u.tmp);
      default:  vpanic("tmpOrConst_to_IRExpr");
   }
}

/* Convert a NULL terminated IRExpr* vector to an array of
   TmpOrConsts, and a length. */
static void irExprVec_to_TmpOrConsts ( /*OUT*/TmpOrConst** outs,
                                       /*OUT*/Int* nOuts,
                                       IRExpr** ins )
{
   Int i, n;
   /* We have to make two passes, one to count, one to copy. */
   for (n = 0; ins[n]; n++)
      ;
   *outs  = LibVEX_Alloc_inline(n * sizeof(TmpOrConst));
   *nOuts = n;
   /* and now copy .. */
   for (i = 0; i < n; i++) {
      IRExpr*     arg = ins[i];
      TmpOrConst* dst = &(*outs)[i];
      irExpr_to_TmpOrConst(dst, arg);
   }
}

typedef
   struct {
      enum { Ut, Btt, Btc, Bct, Cf64i, Ittt, Itct, Ittc, Itcc, GetIt,
             CCall, Load
      } tag;
      union {
         /* unop(tmp) */
         struct {
            IROp   op;
            IRTemp arg;
         } Ut;
         /* binop(tmp,tmp) */
         struct {
            IROp   op;
            IRTemp arg1;
            IRTemp arg2;
         } Btt;
         /* binop(tmp,const) */
         struct {
            IROp    op;
            IRTemp  arg1;
            IRConst con2;
         } Btc;
         /* binop(const,tmp) */
         struct {
            IROp    op;
            IRConst con1;
            IRTemp  arg2;
         } Bct;
         /* F64i-style const */
         struct {
            ULong f64i;
         } Cf64i;
         /* ITE(tmp,tmp,tmp) */
         struct {
            IRTemp co;
            IRTemp e1;
            IRTemp e0;
         } Ittt;
         /* ITE(tmp,tmp,const) */
         struct {
            IRTemp  co;
            IRTemp  e1;
            IRConst con0;
         } Ittc;
         /* ITE(tmp,const,tmp) */
         struct {
            IRTemp  co;
            IRConst con1;
            IRTemp  e0;
         } Itct;
         /* ITE(tmp,const,const) */
         struct {
            IRTemp  co;
            IRConst con1;
            IRConst con0;
         } Itcc;
         /* GetI(descr,tmp,bias)*/
         struct {
            IRRegArray* descr;
            IRTemp      ix;
            Int         bias;
         } GetIt;
         /* Clean helper call */
         struct {
            IRCallee*   cee;
            TmpOrConst* args;
            Int         nArgs;
            IRType      retty;
         } CCall;
         /* Load(end,ty,addr) */
         struct {
            IREndness  end;
            IRType     ty;
            TmpOrConst addr;
         } Load;
      } u;
   }
   AvailExpr;

static Bool eq_AvailExpr ( AvailExpr* a1, AvailExpr* a2 )
{
   if (LIKELY(a1->tag != a2->tag))
      return False;
   switch (a1->tag) {
      case Ut: 
         return toBool(
                a1->u.Ut.op == a2->u.Ut.op 
                && a1->u.Ut.arg == a2->u.Ut.arg);
      case Btt: 
         return toBool(
                a1->u.Btt.op == a2->u.Btt.op
                && a1->u.Btt.arg1 == a2->u.Btt.arg1
                && a1->u.Btt.arg2 == a2->u.Btt.arg2);
      case Btc: 
         return toBool(
                a1->u.Btc.op == a2->u.Btc.op
                && a1->u.Btc.arg1 == a2->u.Btc.arg1
                && eqIRConst(&a1->u.Btc.con2, &a2->u.Btc.con2));
      case Bct: 
         return toBool(
                a1->u.Bct.op == a2->u.Bct.op
                && a1->u.Bct.arg2 == a2->u.Bct.arg2
                && eqIRConst(&a1->u.Bct.con1, &a2->u.Bct.con1));
      case Cf64i: 
         return toBool(a1->u.Cf64i.f64i == a2->u.Cf64i.f64i);
      case Ittt:
         return toBool(a1->u.Ittt.co == a2->u.Ittt.co
                       && a1->u.Ittt.e1 == a2->u.Ittt.e1
                       && a1->u.Ittt.e0 == a2->u.Ittt.e0);
      case Ittc:
         return toBool(a1->u.Ittc.co == a2->u.Ittc.co
                       && a1->u.Ittc.e1 == a2->u.Ittc.e1
                       && eqIRConst(&a1->u.Ittc.con0, &a2->u.Ittc.con0));
      case Itct:
         return toBool(a1->u.Itct.co == a2->u.Itct.co
                       && eqIRConst(&a1->u.Itct.con1, &a2->u.Itct.con1)
                       && a1->u.Itct.e0 == a2->u.Itct.e0);
      case Itcc:
         return toBool(a1->u.Itcc.co == a2->u.Itcc.co
                       && eqIRConst(&a1->u.Itcc.con1, &a2->u.Itcc.con1)
                       && eqIRConst(&a1->u.Itcc.con0, &a2->u.Itcc.con0));
      case GetIt:
         return toBool(eqIRRegArray(a1->u.GetIt.descr, a2->u.GetIt.descr) 
                       && a1->u.GetIt.ix == a2->u.GetIt.ix
                       && a1->u.GetIt.bias == a2->u.GetIt.bias);
      case CCall: {
         Int  i, n;
         Bool eq = a1->u.CCall.nArgs == a2->u.CCall.nArgs
                   && eqIRCallee(a1->u.CCall.cee, a2->u.CCall.cee);
         if (eq) {
            n = a1->u.CCall.nArgs;
            for (i = 0; i < n; i++) {
               if (!eqTmpOrConst( &a1->u.CCall.args[i],
                                  &a2->u.CCall.args[i] )) {
                  eq = False;
                  break;
               }
            }
         }
         if (eq) vassert(a1->u.CCall.retty == a2->u.CCall.retty);
         return eq;  
      }
      case Load: {
         Bool eq = toBool(a1->u.Load.end == a2->u.Load.end
                          && a1->u.Load.ty == a2->u.Load.ty
                          && eqTmpOrConst(&a1->u.Load.addr, &a2->u.Load.addr));
         return eq;
      }
      default:
         vpanic("eq_AvailExpr");
   }
}

static IRExpr* availExpr_to_IRExpr ( AvailExpr* ae ) 
{
   IRConst *con, *con0, *con1;
   switch (ae->tag) {
      case Ut:
         return IRExpr_Unop( ae->u.Ut.op, IRExpr_RdTmp(ae->u.Ut.arg) );
      case Btt:
         return IRExpr_Binop( ae->u.Btt.op,
                              IRExpr_RdTmp(ae->u.Btt.arg1),
                              IRExpr_RdTmp(ae->u.Btt.arg2) );
      case Btc:
         con = LibVEX_Alloc_inline(sizeof(IRConst));
         *con = ae->u.Btc.con2;
         return IRExpr_Binop( ae->u.Btc.op,
                              IRExpr_RdTmp(ae->u.Btc.arg1), 
                              IRExpr_Const(con) );
      case Bct:
         con = LibVEX_Alloc_inline(sizeof(IRConst));
         *con = ae->u.Bct.con1;
         return IRExpr_Binop( ae->u.Bct.op,
                              IRExpr_Const(con), 
                              IRExpr_RdTmp(ae->u.Bct.arg2) );
      case Cf64i:
         return IRExpr_Const(IRConst_F64i(ae->u.Cf64i.f64i));
      case Ittt:
         return IRExpr_ITE(IRExpr_RdTmp(ae->u.Ittt.co), 
                           IRExpr_RdTmp(ae->u.Ittt.e1), 
                           IRExpr_RdTmp(ae->u.Ittt.e0));
      case Ittc:
         con0 = LibVEX_Alloc_inline(sizeof(IRConst));
         *con0 = ae->u.Ittc.con0;
         return IRExpr_ITE(IRExpr_RdTmp(ae->u.Ittc.co), 
                           IRExpr_RdTmp(ae->u.Ittc.e1),
                           IRExpr_Const(con0));
      case Itct:
         con1 = LibVEX_Alloc_inline(sizeof(IRConst));
         *con1 = ae->u.Itct.con1;
         return IRExpr_ITE(IRExpr_RdTmp(ae->u.Itct.co), 
                           IRExpr_Const(con1),
                           IRExpr_RdTmp(ae->u.Itct.e0));

      case Itcc:
         con0 = LibVEX_Alloc_inline(sizeof(IRConst));
         con1 = LibVEX_Alloc_inline(sizeof(IRConst));
         *con0 = ae->u.Itcc.con0;
         *con1 = ae->u.Itcc.con1;
         return IRExpr_ITE(IRExpr_RdTmp(ae->u.Itcc.co), 
                           IRExpr_Const(con1),
                           IRExpr_Const(con0));
      case GetIt:
         return IRExpr_GetI(ae->u.GetIt.descr,
                            IRExpr_RdTmp(ae->u.GetIt.ix),
                            ae->u.GetIt.bias);
      case CCall: {
         Int i, n = ae->u.CCall.nArgs;
         vassert(n >= 0);
         IRExpr** vec = LibVEX_Alloc_inline((n+1) * sizeof(IRExpr*));
         vec[n] = NULL;
         for (i = 0; i < n; i++) {
            vec[i] = tmpOrConst_to_IRExpr(&ae->u.CCall.args[i]);
         }
         return IRExpr_CCall(ae->u.CCall.cee,
                             ae->u.CCall.retty,
                             vec);
      }
      case Load:
         return IRExpr_Load(ae->u.Load.end, ae->u.Load.ty,
                            tmpOrConst_to_IRExpr(&ae->u.Load.addr));
      default:
         vpanic("availExpr_to_IRExpr");
   }
}

inline
static IRTemp subst_AvailExpr_Temp ( HashHW* env, IRTemp tmp )
{
   HWord res;
   /* env :: IRTemp -> IRTemp */
   if (lookupHHW( env, &res, (HWord)tmp ))
      return (IRTemp)res;
   else
      return tmp;
}

inline
static void subst_AvailExpr_TmpOrConst ( /*MB_MOD*/TmpOrConst* tc,
                                          HashHW* env )
{
   /* env :: IRTemp -> IRTemp */
   if (tc->tag == TCt) {
      tc->u.tmp = subst_AvailExpr_Temp( env, tc->u.tmp );
   }
}

static void subst_AvailExpr ( HashHW* env, AvailExpr* ae )
{
   /* env :: IRTemp -> IRTemp */
   switch (ae->tag) {
      case Ut:
         ae->u.Ut.arg = subst_AvailExpr_Temp( env, ae->u.Ut.arg );
         break;
      case Btt:
         ae->u.Btt.arg1 = subst_AvailExpr_Temp( env, ae->u.Btt.arg1 );
         ae->u.Btt.arg2 = subst_AvailExpr_Temp( env, ae->u.Btt.arg2 );
         break;
      case Btc:
         ae->u.Btc.arg1 = subst_AvailExpr_Temp( env, ae->u.Btc.arg1 );
         break;
      case Bct:
         ae->u.Bct.arg2 = subst_AvailExpr_Temp( env, ae->u.Bct.arg2 );
         break;
      case Cf64i:
         break;
      case Ittt:
         ae->u.Ittt.co = subst_AvailExpr_Temp( env, ae->u.Ittt.co );
         ae->u.Ittt.e1 = subst_AvailExpr_Temp( env, ae->u.Ittt.e1 );
         ae->u.Ittt.e0 = subst_AvailExpr_Temp( env, ae->u.Ittt.e0 );
         break;
      case Ittc:
         ae->u.Ittc.co = subst_AvailExpr_Temp( env, ae->u.Ittc.co );
         ae->u.Ittc.e1 = subst_AvailExpr_Temp( env, ae->u.Ittc.e1 );
         break;
      case Itct:
         ae->u.Itct.co = subst_AvailExpr_Temp( env, ae->u.Itct.co );
         ae->u.Itct.e0 = subst_AvailExpr_Temp( env, ae->u.Itct.e0 );
         break;
      case Itcc:
         ae->u.Itcc.co = subst_AvailExpr_Temp( env, ae->u.Itcc.co );
         break;
      case GetIt:
         ae->u.GetIt.ix = subst_AvailExpr_Temp( env, ae->u.GetIt.ix );
         break;
      case CCall: {
         Int i, n = ae->u.CCall.nArgs;;
         for (i = 0; i < n; i++) {
            subst_AvailExpr_TmpOrConst(&ae->u.CCall.args[i], env);
         }
         break;
      }
      case Load:
         subst_AvailExpr_TmpOrConst(&ae->u.Load.addr, env);
         break;
      default: 
         vpanic("subst_AvailExpr");
   }
}

static AvailExpr* irExpr_to_AvailExpr ( IRExpr* e, Bool allowLoadsToBeCSEd )
{
   AvailExpr* ae;

   switch (e->tag) {
      case Iex_Unop:
         if (e->Iex.Unop.arg->tag == Iex_RdTmp) {
            ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
            ae->tag      = Ut;
            ae->u.Ut.op  = e->Iex.Unop.op;
            ae->u.Ut.arg = e->Iex.Unop.arg->Iex.RdTmp.tmp;
            return ae;
         }
         break;

      case Iex_Binop:
         if (e->Iex.Binop.arg1->tag == Iex_RdTmp) {
            if (e->Iex.Binop.arg2->tag == Iex_RdTmp) {
               ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
               ae->tag        = Btt;
               ae->u.Btt.op   = e->Iex.Binop.op;
               ae->u.Btt.arg1 = e->Iex.Binop.arg1->Iex.RdTmp.tmp;
               ae->u.Btt.arg2 = e->Iex.Binop.arg2->Iex.RdTmp.tmp;
               return ae;
            }
            if (e->Iex.Binop.arg2->tag == Iex_Const) {
               ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
               ae->tag        = Btc;
               ae->u.Btc.op   = e->Iex.Binop.op;
               ae->u.Btc.arg1 = e->Iex.Binop.arg1->Iex.RdTmp.tmp;
               ae->u.Btc.con2 = *(e->Iex.Binop.arg2->Iex.Const.con);
               return ae;
            }
         } else if (e->Iex.Binop.arg1->tag == Iex_Const
                    && e->Iex.Binop.arg2->tag == Iex_RdTmp) {
            ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
            ae->tag        = Bct;
            ae->u.Bct.op   = e->Iex.Binop.op;
            ae->u.Bct.arg2 = e->Iex.Binop.arg2->Iex.RdTmp.tmp;
            ae->u.Bct.con1 = *(e->Iex.Binop.arg1->Iex.Const.con);
            return ae;
         }
         break;

      case Iex_Const:
         if (e->Iex.Const.con->tag == Ico_F64i) {
            ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
            ae->tag          = Cf64i;
            ae->u.Cf64i.f64i = e->Iex.Const.con->Ico.F64i;
            return ae;
         }
         break;

      case Iex_ITE:
         if (e->Iex.ITE.cond->tag == Iex_RdTmp) {
            if (e->Iex.ITE.iffalse->tag == Iex_RdTmp) {
               if (e->Iex.ITE.iftrue->tag == Iex_RdTmp) {
                  ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
                  ae->tag       = Ittt;
                  ae->u.Ittt.co = e->Iex.ITE.cond->Iex.RdTmp.tmp;
                  ae->u.Ittt.e1 = e->Iex.ITE.iftrue->Iex.RdTmp.tmp;
                  ae->u.Ittt.e0 = e->Iex.ITE.iffalse->Iex.RdTmp.tmp;
                  return ae;
               }
               if (e->Iex.ITE.iftrue->tag == Iex_Const) {
                  ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
                  ae->tag       = Itct;
                  ae->u.Itct.co = e->Iex.ITE.cond->Iex.RdTmp.tmp;
                  ae->u.Itct.con1 = *(e->Iex.ITE.iftrue->Iex.Const.con);
                  ae->u.Itct.e0 = e->Iex.ITE.iffalse->Iex.RdTmp.tmp;
                  return ae;
               }
            } else if (e->Iex.ITE.iffalse->tag == Iex_Const) {
               if (e->Iex.ITE.iftrue->tag == Iex_RdTmp) {
                  ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
                  ae->tag       = Ittc;
                  ae->u.Ittc.co = e->Iex.ITE.cond->Iex.RdTmp.tmp;
                  ae->u.Ittc.e1 = e->Iex.ITE.iftrue->Iex.RdTmp.tmp;
                  ae->u.Ittc.con0 = *(e->Iex.ITE.iffalse->Iex.Const.con);
                  return ae;
               }
               if (e->Iex.ITE.iftrue->tag == Iex_Const) {
                  ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
                  ae->tag       = Itcc;
                  ae->u.Itcc.co = e->Iex.ITE.cond->Iex.RdTmp.tmp;
                  ae->u.Itcc.con1 = *(e->Iex.ITE.iftrue->Iex.Const.con);
                  ae->u.Itcc.con0 = *(e->Iex.ITE.iffalse->Iex.Const.con);
                  return ae;
               }
            }
         }
         break;

      case Iex_GetI:
         if (e->Iex.GetI.ix->tag == Iex_RdTmp) {
            ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
            ae->tag           = GetIt;
            ae->u.GetIt.descr = e->Iex.GetI.descr;
            ae->u.GetIt.ix    = e->Iex.GetI.ix->Iex.RdTmp.tmp;
            ae->u.GetIt.bias  = e->Iex.GetI.bias;
            return ae;
         }
         break;

      case Iex_CCall:
         ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
         ae->tag = CCall;
         /* Ok to share only the cee, since it is immutable. */
         ae->u.CCall.cee   = e->Iex.CCall.cee;
         ae->u.CCall.retty = e->Iex.CCall.retty;
         /* irExprVec_to_TmpOrConsts will assert if the args are
            neither tmps nor constants, but that's ok .. that's all they
            should be. */
         irExprVec_to_TmpOrConsts(
                                  &ae->u.CCall.args, &ae->u.CCall.nArgs,
                                  e->Iex.CCall.args
                                 );
         return ae;

      case Iex_Load:
         /* If the caller of do_cse_BB has requested that loads also
            be CSEd, convert them into AvailExprs.  If not, we'll just
            return NULL here, and the load never becomes considered
            "available", which effectively disables CSEing of them, as
            desired. */
         if (allowLoadsToBeCSEd) {
            ae = LibVEX_Alloc_inline(sizeof(AvailExpr));
            ae->tag        = Load;
            ae->u.Load.end = e->Iex.Load.end;
            ae->u.Load.ty  = e->Iex.Load.ty;
            irExpr_to_TmpOrConst(&ae->u.Load.addr, e->Iex.Load.addr);
            return ae;
         }
         break;

      default:
         break;
   }

   return NULL;
}


/* The BB is modified in-place.  Returns True if any changes were
   made.  The caller can choose whether or not loads should be CSEd.
   In the normal course of things we don't do that, since CSEing loads
   is something of a dodgy proposition if the guest program is doing
   some screwy stuff to do with races and spinloops. */

static Bool do_cse_BB ( IRSB* bb, Bool allowLoadsToBeCSEd )
{
   Int        i, j, paranoia;
   IRTemp     t, q;
   IRStmt*    st;
   AvailExpr* eprime;
   AvailExpr* ae;
   Bool       invalidate;
   Bool       anyDone = False;

   HashHW* tenv = newHHW(); /* :: IRTemp -> IRTemp */
   HashHW* aenv = newHHW(); /* :: AvailExpr* -> IRTemp */

   vassert(sizeof(IRTemp) <= sizeof(HWord));

   if (0) { ppIRSB(bb); vex_printf("\n\n"); }

   /* Iterate forwards over the stmts.  
      On seeing "t = E", where E is one of the AvailExpr forms:
         let E' = apply tenv substitution to E
         search aenv for E'
            if a mapping E' -> q is found, 
               replace this stmt by "t = q"
               and add binding t -> q to tenv
            else
               add binding E' -> t to aenv
               replace this stmt by "t = E'"

      Other statements are only interesting to the extent that they
      might invalidate some of the expressions in aenv.  So there is
      an invalidate-bindings check for each statement seen.
   */
   for (i = 0; i < bb->stmts_used; i++) {
      st = bb->stmts[i];

      /* ------ BEGIN invalidate aenv bindings ------ */
      /* This is critical: remove from aenv any E' -> .. bindings
         which might be invalidated by this statement.  The only
         vulnerable kind of bindings are the GetI and Load kinds.
            Dirty call - dump (paranoia level -> 2) 
            Store      - dump (ditto) 
            Put, PutI  - dump unless no-overlap is proven (.. -> 1)
         Uses getAliasingRelation_IC and getAliasingRelation_II
         to do the no-overlap assessments needed for Put/PutI.
      */
      switch (st->tag) {
         case Ist_Dirty: case Ist_Store: case Ist_MBE:
         case Ist_CAS: case Ist_LLSC:
         case Ist_StoreG:
            paranoia = 2; break;
         case Ist_Put: case Ist_PutI: 
            paranoia = 1; break;
         case Ist_NoOp: case Ist_IMark: case Ist_AbiHint: 
         case Ist_WrTmp: case Ist_Exit: case Ist_LoadG:
            paranoia = 0; break;
         default: 
            vpanic("do_cse_BB(1)");
      }

      if (paranoia > 0) {
         for (j = 0; j < aenv->used; j++) {
            if (!aenv->inuse[j])
               continue;
            ae = (AvailExpr*)aenv->key[j];
            if (ae->tag != GetIt && ae->tag != Load) 
               continue;
            invalidate = False;
            if (paranoia >= 2) {
               invalidate = True;
            } else {
               vassert(paranoia == 1);
               if (ae->tag == Load) {
                  /* Loads can be invalidated by anything that could
                     possibly touch memory.  But in that case we
                     should have |paranoia| == 2 and we won't get
                     here.  So there's nothing to do; we don't have to
                     invalidate the load. */
               }
               else
               if (st->tag == Ist_Put) {
                  if (getAliasingRelation_IC(
                         ae->u.GetIt.descr, 
                         IRExpr_RdTmp(ae->u.GetIt.ix), 
                         st->Ist.Put.offset, 
                         typeOfIRExpr(bb->tyenv,st->Ist.Put.data) 
                      ) != NoAlias) 
                     invalidate = True;
               }
               else 
               if (st->tag == Ist_PutI) {
                  IRPutI *puti = st->Ist.PutI.details;
                  if (getAliasingRelation_II(
                         ae->u.GetIt.descr, 
                         IRExpr_RdTmp(ae->u.GetIt.ix), 
                         ae->u.GetIt.bias,
                         puti->descr,
                         puti->ix,
                         puti->bias
                      ) != NoAlias)
                     invalidate = True;
               }
               else 
                  vpanic("do_cse_BB(2)");
            }

            if (invalidate) {
               aenv->inuse[j] = False;
               aenv->key[j]   = (HWord)NULL;  /* be sure */
            }
         } /* for j */
      } /* paranoia > 0 */

      /* ------ ENV invalidate aenv bindings ------ */

      /* ignore not-interestings */
      if (st->tag != Ist_WrTmp)
         continue;

      t = st->Ist.WrTmp.tmp;
      eprime = irExpr_to_AvailExpr(st->Ist.WrTmp.data, allowLoadsToBeCSEd);
      /* ignore if not of AvailExpr form */
      if (!eprime)
         continue;

      /* vex_printf("considering: " ); ppIRStmt(st); vex_printf("\n"); */

      /* apply tenv */
      subst_AvailExpr( tenv, eprime );

      /* search aenv for eprime, unfortunately the hard way */
      for (j = 0; j < aenv->used; j++)
         if (aenv->inuse[j] && eq_AvailExpr(eprime, (AvailExpr*)aenv->key[j]))
            break;

      if (j < aenv->used) {
         /* A binding E' -> q was found.  Replace stmt by "t = q" and
            note the t->q binding in tenv. */
         /* (this is the core of the CSE action) */
         q = (IRTemp)aenv->val[j];
         bb->stmts[i] = IRStmt_WrTmp( t, IRExpr_RdTmp(q) );
         addToHHW( tenv, (HWord)t, (HWord)q );
         anyDone = True;
      } else {
         /* No binding was found, so instead we add E' -> t to our
            collection of available expressions, replace this stmt
            with "t = E'", and move on. */
         bb->stmts[i] = IRStmt_WrTmp( t, availExpr_to_IRExpr(eprime) );
         addToHHW( aenv, (HWord)eprime, (HWord)t );
      }
   }

   /*
   ppIRSB(bb);
   sanityCheckIRSB(bb, Ity_I32);
   vex_printf("\n\n");
   */
   return anyDone;
}


/*---------------------------------------------------------------*/
/*--- Add32/Sub32 chain collapsing                            ---*/
/*---------------------------------------------------------------*/

/* ----- Helper functions for Add32/Sub32 chain collapsing ----- */

/* Is this expression "Add32(tmp,const)" or "Sub32(tmp,const)" ?  If
   yes, set *tmp and *i32 appropriately.  *i32 is set as if the
   root node is Add32, not Sub32. */

static Bool isAdd32OrSub32 ( IRExpr* e, IRTemp* tmp, Int* i32 )
{ 
   if (e->tag != Iex_Binop)
      return False;
   if (e->Iex.Binop.op != Iop_Add32 && e->Iex.Binop.op != Iop_Sub32)
      return False;
   if (e->Iex.Binop.arg1->tag != Iex_RdTmp)
      return False;
   if (e->Iex.Binop.arg2->tag != Iex_Const)
      return False;
   *tmp = e->Iex.Binop.arg1->Iex.RdTmp.tmp;
   *i32 = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32);
   if (e->Iex.Binop.op == Iop_Sub32)
      *i32 = -*i32;
   return True;
}


/* Figure out if tmp can be expressed as tmp2 +32 const, for some
   other tmp2.  Scan backwards from the specified start point -- an
   optimisation. */

static Bool collapseChain ( IRSB* bb, Int startHere,
                            IRTemp tmp,
                            IRTemp* tmp2, Int* i32 )
{
   Int     j, ii;
   IRTemp  vv;
   IRStmt* st;
   IRExpr* e;

   /* the (var, con) pair contain the current 'representation' for
      'tmp'.  We start with 'tmp + 0'.  */
   IRTemp var = tmp;
   Int    con = 0;

   /* Scan backwards to see if tmp can be replaced by some other tmp
     +/- a constant. */
   for (j = startHere; j >= 0; j--) {
      st = bb->stmts[j];
      if (st->tag != Ist_WrTmp) 
         continue;
      if (st->Ist.WrTmp.tmp != var)
         continue;
      e = st->Ist.WrTmp.data;
      if (!isAdd32OrSub32(e, &vv, &ii))
         break;
      var = vv;
      con += ii;
   }
   if (j == -1)
      /* no earlier binding for var .. ill-formed IR */
      vpanic("collapseChain");

   /* so, did we find anything interesting? */
   if (var == tmp)
      return False; /* no .. */
      
   *tmp2 = var;
   *i32  = con;
   return True;
}


/* ------- Main function for Add32/Sub32 chain collapsing ------ */

static void collapse_AddSub_chains_BB ( IRSB* bb )
{
   IRStmt *st;
   IRTemp var, var2;
   Int    i, con, con2;

   for (i = bb->stmts_used-1; i >= 0; i--) {
      st = bb->stmts[i];
      if (st->tag == Ist_NoOp)
         continue;

      /* Try to collapse 't1 = Add32/Sub32(t2, con)'. */

      if (st->tag == Ist_WrTmp
          && isAdd32OrSub32(st->Ist.WrTmp.data, &var, &con)) {

         /* So e1 is of the form Add32(var,con) or Sub32(var,-con).
            Find out if var can be expressed as var2 + con2. */
         if (collapseChain(bb, i-1, var, &var2, &con2)) {
            if (DEBUG_IROPT) {
               vex_printf("replacing1 ");
               ppIRStmt(st);
               vex_printf(" with ");
            }
            con2 += con;
            bb->stmts[i] 
               = IRStmt_WrTmp(
                    st->Ist.WrTmp.tmp,
                    (con2 >= 0) 
                      ? IRExpr_Binop(Iop_Add32, 
                                     IRExpr_RdTmp(var2),
                                     IRExpr_Const(IRConst_U32(con2)))
                      : IRExpr_Binop(Iop_Sub32, 
                                     IRExpr_RdTmp(var2),
                                     IRExpr_Const(IRConst_U32(-con2)))
                 );
            if (DEBUG_IROPT) {
               ppIRStmt(bb->stmts[i]);
               vex_printf("\n");
            }
         }

         continue;
      }

      /* Try to collapse 't1 = GetI[t2, con]'. */

      if (st->tag == Ist_WrTmp
          && st->Ist.WrTmp.data->tag == Iex_GetI
          && st->Ist.WrTmp.data->Iex.GetI.ix->tag == Iex_RdTmp
          && collapseChain(bb, i-1, st->Ist.WrTmp.data->Iex.GetI.ix
                                      ->Iex.RdTmp.tmp, &var2, &con2)) {
         if (DEBUG_IROPT) {
            vex_printf("replacing3 ");
            ppIRStmt(st);
            vex_printf(" with ");
         }
         con2 += st->Ist.WrTmp.data->Iex.GetI.bias;
         bb->stmts[i]
            = IRStmt_WrTmp(
                 st->Ist.WrTmp.tmp,
                 IRExpr_GetI(st->Ist.WrTmp.data->Iex.GetI.descr,
                             IRExpr_RdTmp(var2),
                             con2));
         if (DEBUG_IROPT) {
            ppIRStmt(bb->stmts[i]);
            vex_printf("\n");
         }
         continue;
      }

      /* Perhaps st is PutI[t, con] ? */
      IRPutI *puti = st->Ist.PutI.details;
      if (st->tag == Ist_PutI
          && puti->ix->tag == Iex_RdTmp
          && collapseChain(bb, i-1, puti->ix->Iex.RdTmp.tmp, 
                               &var2, &con2)) {
         if (DEBUG_IROPT) {
            vex_printf("replacing2 ");
            ppIRStmt(st);
            vex_printf(" with ");
         }
         con2 += puti->bias;
         bb->stmts[i]
            = IRStmt_PutI(mkIRPutI(puti->descr,
                                   IRExpr_RdTmp(var2),
                                   con2,
                                   puti->data));
         if (DEBUG_IROPT) {
            ppIRStmt(bb->stmts[i]);
            vex_printf("\n");
         }
         continue;
      }

   } /* for */
}


/*---------------------------------------------------------------*/
/*--- PutI/GetI transformations                               ---*/
/*---------------------------------------------------------------*/

/* Given the parts (descr, tmp, bias) for a GetI, scan backwards from
   the given starting point to find, if any, a PutI which writes
   exactly the same piece of guest state, and so return the expression
   that the PutI writes.  This is the core of PutI-GetI forwarding. */

static 
IRExpr* findPutI ( IRSB* bb, Int startHere,
                   IRRegArray* descrG, IRExpr* ixG, Int biasG )
{
   Int        j;
   IRStmt*    st;
   GSAliasing relation;

   if (0) {
      vex_printf("\nfindPutI ");
      ppIRRegArray(descrG);
      vex_printf(" ");
      ppIRExpr(ixG);
      vex_printf(" %d\n", biasG);
   }

   /* Scan backwards in bb from startHere to find a suitable PutI
      binding for (descrG, ixG, biasG), if any. */

   for (j = startHere; j >= 0; j--) {
      st = bb->stmts[j];
      if (st->tag == Ist_NoOp) 
         continue;

      if (st->tag == Ist_Put) {
         /* Non-indexed Put.  This can't give a binding, but we do
            need to check it doesn't invalidate the search by
            overlapping any part of the indexed guest state. */

         relation
            = getAliasingRelation_IC(
                 descrG, ixG,
                 st->Ist.Put.offset,
                 typeOfIRExpr(bb->tyenv,st->Ist.Put.data) );

         if (relation == NoAlias) {
            /* we're OK; keep going */
            continue;
         } else {
            /* relation == UnknownAlias || relation == ExactAlias */
            /* If this assertion fails, we've found a Put which writes
               an area of guest state which is read by a GetI.  Which
               is unlikely (although not per se wrong). */
            vassert(relation != ExactAlias);
            /* This Put potentially writes guest state that the GetI
               reads; we must fail. */
            return NULL;
         }
      }

      if (st->tag == Ist_PutI) {
         IRPutI *puti = st->Ist.PutI.details;

         relation = getAliasingRelation_II(
                       descrG, ixG, biasG,
                       puti->descr,
                       puti->ix,
                       puti->bias
                    );

         if (relation == NoAlias) {
            /* This PutI definitely doesn't overlap.  Ignore it and
               keep going. */
            continue; /* the for j loop */
         }

         if (relation == UnknownAlias) {
            /* We don't know if this PutI writes to the same guest
               state that the GetI, or not.  So we have to give up. */
            return NULL;
         }

         /* Otherwise, we've found what we're looking for.  */
         vassert(relation == ExactAlias);
         return puti->data;

      } /* if (st->tag == Ist_PutI) */

      if (st->tag == Ist_Dirty) {
         /* Be conservative.  If the dirty call has any guest effects at
            all, give up.  We could do better -- only give up if there
            are any guest writes/modifies. */
         if (st->Ist.Dirty.details->nFxState > 0)
            return NULL;
      }

   } /* for */

   /* No valid replacement was found. */
   return NULL;
}



/* Assuming pi is a PutI stmt, is s2 identical to it (in the sense
   that it writes exactly the same piece of guest state) ?  Safe
   answer: False. */

static Bool identicalPutIs ( IRStmt* pi, IRStmt* s2 )
{
   vassert(pi->tag == Ist_PutI);
   if (s2->tag != Ist_PutI)
      return False;

   IRPutI *p1 = pi->Ist.PutI.details;
   IRPutI *p2 = s2->Ist.PutI.details;

   return toBool(
          getAliasingRelation_II( 
             p1->descr, p1->ix, p1->bias, 
             p2->descr, p2->ix, p2->bias
          )
          == ExactAlias
          );
}


/* Assuming pi is a PutI stmt, is s2 a Get/GetI/Put/PutI which might
   overlap it?  Safe answer: True.  Note, we could do a lot better
   than this if needed. */

static 
Bool guestAccessWhichMightOverlapPutI ( 
        IRTypeEnv* tyenv, IRStmt* pi, IRStmt* s2 
     )
{
   GSAliasing relation;
   UInt       minoffP, maxoffP;

   vassert(pi->tag == Ist_PutI);

   IRPutI *p1 = pi->Ist.PutI.details;

   getArrayBounds(p1->descr, &minoffP, &maxoffP);
   switch (s2->tag) {

      case Ist_NoOp:
      case Ist_IMark:
         return False;

      case Ist_MBE:
      case Ist_AbiHint:
         /* just be paranoid ... these should be rare. */
         return True;

      case Ist_CAS:
         /* This is unbelievably lame, but it's probably not
            significant from a performance point of view.  Really, a
            CAS is a load-store op, so it should be safe to say False.
            However .. */
         return True;

      case Ist_Dirty:
         /* If the dirty call has any guest effects at all, give up.
            Probably could do better. */
         if (s2->Ist.Dirty.details->nFxState > 0)
            return True;
         return False;

      case Ist_Put:
         vassert(isIRAtom(s2->Ist.Put.data));
         relation 
            = getAliasingRelation_IC(
                 p1->descr, p1->ix,
                 s2->Ist.Put.offset, 
                 typeOfIRExpr(tyenv,s2->Ist.Put.data)
              );
         goto have_relation;

      case Ist_PutI: {
         IRPutI *p2 = s2->Ist.PutI.details;

         vassert(isIRAtom(p2->ix));
         vassert(isIRAtom(p2->data));
         relation
            = getAliasingRelation_II(
                 p1->descr, p1->ix, p1->bias, 
                 p2->descr, p2->ix, p2->bias
              );
         goto have_relation;
      }

      case Ist_WrTmp:
         if (s2->Ist.WrTmp.data->tag == Iex_GetI) {
            relation
               = getAliasingRelation_II(
                    p1->descr, p1->ix, p1->bias, 
                    s2->Ist.WrTmp.data->Iex.GetI.descr,
                    s2->Ist.WrTmp.data->Iex.GetI.ix,
                    s2->Ist.WrTmp.data->Iex.GetI.bias
                 );
            goto have_relation;
         }
         if (s2->Ist.WrTmp.data->tag == Iex_Get) {
            relation
               = getAliasingRelation_IC(
                    p1->descr, p1->ix,
                    s2->Ist.WrTmp.data->Iex.Get.offset,
                    s2->Ist.WrTmp.data->Iex.Get.ty
                 );
            goto have_relation;
         }
         return False;

      case Ist_Store:
         vassert(isIRAtom(s2->Ist.Store.addr));
         vassert(isIRAtom(s2->Ist.Store.data));
         return False;

      default:
         vex_printf("\n"); ppIRStmt(s2); vex_printf("\n");
         vpanic("guestAccessWhichMightOverlapPutI");
   }

  have_relation:
   if (relation == NoAlias)
      return False;
   else
      return True; /* ExactAlias or UnknownAlias */
}



/* ---------- PutI/GetI transformations main functions --------- */

/* Remove redundant GetIs, to the extent that they can be detected.
   bb is modified in-place. */

static
void do_redundant_GetI_elimination ( IRSB* bb )
{
   Int     i;
   IRStmt* st;

   for (i = bb->stmts_used-1; i >= 0; i--) {
      st = bb->stmts[i];
      if (st->tag == Ist_NoOp)
         continue;

      if (st->tag == Ist_WrTmp
          && st->Ist.WrTmp.data->tag == Iex_GetI
          && st->Ist.WrTmp.data->Iex.GetI.ix->tag == Iex_RdTmp) {
         IRRegArray* descr = st->Ist.WrTmp.data->Iex.GetI.descr;
         IRExpr*     ix    = st->Ist.WrTmp.data->Iex.GetI.ix;
         Int         bias  = st->Ist.WrTmp.data->Iex.GetI.bias;
         IRExpr*     replacement = findPutI(bb, i-1, descr, ix, bias);
         if (replacement 
             && isIRAtom(replacement)
             /* Make sure we're doing a type-safe transformation! */
             && typeOfIRExpr(bb->tyenv, replacement) == descr->elemTy) {
            if (DEBUG_IROPT) {
               vex_printf("rGI:  "); 
               ppIRExpr(st->Ist.WrTmp.data);
               vex_printf(" -> ");
               ppIRExpr(replacement);
               vex_printf("\n");
            }
            bb->stmts[i] = IRStmt_WrTmp(st->Ist.WrTmp.tmp, replacement);
         }
      }
   }

}


/* Remove redundant PutIs, to the extent which they can be detected.
   bb is modified in-place. */

static
void do_redundant_PutI_elimination ( IRSB* bb, VexRegisterUpdates pxControl )
{
   Int    i, j;
   Bool   delete;
   IRStmt *st, *stj;

   vassert(pxControl < VexRegUpdAllregsAtEachInsn);

   for (i = 0; i < bb->stmts_used; i++) {
      st = bb->stmts[i];
      if (st->tag != Ist_PutI)
         continue;
      /* Ok, search forwards from here to see if we can find another
         PutI which makes this one redundant, and dodging various 
         hazards.  Search forwards:
         * If conditional exit, give up (because anything after that 
           does not postdominate this put).
         * If a Get which might overlap, give up (because this PutI 
           not necessarily dead).
         * If a Put which is identical, stop with success.
         * If a Put which might overlap, but is not identical, give up.
         * If a dirty helper call which might write guest state, give up.
         * If a Put which definitely doesn't overlap, or any other 
           kind of stmt, continue.
      */
      delete = False;
      for (j = i+1; j < bb->stmts_used; j++) {
         stj = bb->stmts[j];
         if (stj->tag == Ist_NoOp) 
            continue;
         if (identicalPutIs(st, stj)) {
            /* success! */
            delete = True;
            break;
         }
         if (stj->tag == Ist_Exit)
            /* give up */
            break;
         if (st->tag == Ist_Dirty)
            /* give up; could do better here */
            break;
         if (guestAccessWhichMightOverlapPutI(bb->tyenv, st, stj))
            /* give up */
           break;
      }

      if (delete) {
         if (DEBUG_IROPT) {
            vex_printf("rPI:  "); 
            ppIRStmt(st); 
            vex_printf("\n");
         }
         bb->stmts[i] = IRStmt_NoOp();
      }

   }
}


/*---------------------------------------------------------------*/
/*--- Loop unrolling                                          ---*/
/*---------------------------------------------------------------*/

/* Adjust all tmp values (names) in e by delta.  e is destructively
   modified. */

static void deltaIRExpr ( IRExpr* e, Int delta )
{
   Int i;
   switch (e->tag) {
      case Iex_RdTmp:
         e->Iex.RdTmp.tmp += delta;
         break;
      case Iex_Get:
      case Iex_Const:
         break;
      case Iex_GetI:
         deltaIRExpr(e->Iex.GetI.ix, delta);
         break;
      case Iex_Qop:
         deltaIRExpr(e->Iex.Qop.details->arg1, delta);
         deltaIRExpr(e->Iex.Qop.details->arg2, delta);
         deltaIRExpr(e->Iex.Qop.details->arg3, delta);
         deltaIRExpr(e->Iex.Qop.details->arg4, delta);
         break;
      case Iex_Triop:
         deltaIRExpr(e->Iex.Triop.details->arg1, delta);
         deltaIRExpr(e->Iex.Triop.details->arg2, delta);
         deltaIRExpr(e->Iex.Triop.details->arg3, delta);
         break;
      case Iex_Binop:
         deltaIRExpr(e->Iex.Binop.arg1, delta);
         deltaIRExpr(e->Iex.Binop.arg2, delta);
         break;
      case Iex_Unop:
         deltaIRExpr(e->Iex.Unop.arg, delta);
         break;
      case Iex_Load:
         deltaIRExpr(e->Iex.Load.addr, delta);
         break;
      case Iex_CCall:
         for (i = 0; e->Iex.CCall.args[i]; i++)
            deltaIRExpr(e->Iex.CCall.args[i], delta);
         break;
      case Iex_ITE:
         deltaIRExpr(e->Iex.ITE.cond, delta);
         deltaIRExpr(e->Iex.ITE.iftrue, delta);
         deltaIRExpr(e->Iex.ITE.iffalse, delta);
         break;
      default: 
         vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
         vpanic("deltaIRExpr");
   }
}

/* Adjust all tmp values (names) in st by delta.  st is destructively
   modified. */

static void deltaIRStmt ( IRStmt* st, Int delta )
{
   Int      i;
   IRDirty* d;
   switch (st->tag) {
      case Ist_NoOp:
      case Ist_IMark:
      case Ist_MBE:
         break;
      case Ist_AbiHint:
         deltaIRExpr(st->Ist.AbiHint.base, delta);
         deltaIRExpr(st->Ist.AbiHint.nia, delta);
         break;
      case Ist_Put:
         deltaIRExpr(st->Ist.Put.data, delta);
         break;
      case Ist_PutI:
         deltaIRExpr(st->Ist.PutI.details->ix, delta);
         deltaIRExpr(st->Ist.PutI.details->data, delta);
         break;
      case Ist_WrTmp: 
         st->Ist.WrTmp.tmp += delta;
         deltaIRExpr(st->Ist.WrTmp.data, delta);
         break;
      case Ist_Exit:
         deltaIRExpr(st->Ist.Exit.guard, delta);
         break;
      case Ist_Store:
         deltaIRExpr(st->Ist.Store.addr, delta);
         deltaIRExpr(st->Ist.Store.data, delta);
         break;
      case Ist_StoreG: {
         IRStoreG* sg = st->Ist.StoreG.details;
         deltaIRExpr(sg->addr, delta);
         deltaIRExpr(sg->data, delta);
         deltaIRExpr(sg->guard, delta);
         break;
      }
      case Ist_LoadG: {
         IRLoadG* lg = st->Ist.LoadG.details;
         lg->dst += delta;
         deltaIRExpr(lg->addr, delta);
         deltaIRExpr(lg->alt, delta);
         deltaIRExpr(lg->guard, delta);
         break;
      }
      case Ist_CAS:
         if (st->Ist.CAS.details->oldHi != IRTemp_INVALID)
            st->Ist.CAS.details->oldHi += delta;
         st->Ist.CAS.details->oldLo += delta;
         deltaIRExpr(st->Ist.CAS.details->addr, delta);
         if (st->Ist.CAS.details->expdHi)
            deltaIRExpr(st->Ist.CAS.details->expdHi, delta);
         deltaIRExpr(st->Ist.CAS.details->expdLo, delta);
         if (st->Ist.CAS.details->dataHi)
            deltaIRExpr(st->Ist.CAS.details->dataHi, delta);
         deltaIRExpr(st->Ist.CAS.details->dataLo, delta);
         break;
      case Ist_LLSC:
         st->Ist.LLSC.result += delta;
         deltaIRExpr(st->Ist.LLSC.addr, delta);
         if (st->Ist.LLSC.storedata)
            deltaIRExpr(st->Ist.LLSC.storedata, delta);
         break;
      case Ist_Dirty:
         d = st->Ist.Dirty.details;
         deltaIRExpr(d->guard, delta);
         for (i = 0; d->args[i]; i++) {
            IRExpr* arg = d->args[i];
            if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg)))
               deltaIRExpr(arg, delta);
         }
         if (d->tmp != IRTemp_INVALID)
            d->tmp += delta;
         if (d->mAddr)
            deltaIRExpr(d->mAddr, delta);
         break;
      default: 
         vex_printf("\n"); ppIRStmt(st); vex_printf("\n");
         vpanic("deltaIRStmt");
   }
}


/* If possible, return a loop-unrolled version of bb0.  The original
   is changed.  If not possible, return NULL.  */

/* The two schemas considered are:

     X: BODY; goto X

     which unrolls to (eg)  X: BODY;BODY; goto X

   and

       X: BODY; if (c) goto X; goto Y
   which trivially transforms to
       X: BODY; if (!c) goto Y; goto X;
   so it falls in the scope of the first case.  

   X and Y must be literal (guest) addresses.
*/

static Int calc_unroll_factor( IRSB* bb )
{
   Int n_stmts, i;

   n_stmts = 0;
   for (i = 0; i < bb->stmts_used; i++) {
      if (bb->stmts[i]->tag != Ist_NoOp)
         n_stmts++;
   }

   if (n_stmts <= vex_control.iropt_unroll_thresh/8) {
      if (vex_control.iropt_verbosity > 0)
         vex_printf("vex iropt: 8 x unrolling (%d sts -> %d sts)\n",
                    n_stmts, 8* n_stmts);
      return 8;
   }
   if (n_stmts <= vex_control.iropt_unroll_thresh/4) {
      if (vex_control.iropt_verbosity > 0)
         vex_printf("vex iropt: 4 x unrolling (%d sts -> %d sts)\n",
                    n_stmts, 4* n_stmts);
      return 4;
   }

   if (n_stmts <= vex_control.iropt_unroll_thresh/2) {
      if (vex_control.iropt_verbosity > 0)
         vex_printf("vex iropt: 2 x unrolling (%d sts -> %d sts)\n",
                    n_stmts, 2* n_stmts);
      return 2;
   }

   if (vex_control.iropt_verbosity > 0)
      vex_printf("vex iropt: not unrolling (%d sts)\n", n_stmts);

   return 1;
}


static IRSB* maybe_loop_unroll_BB ( IRSB* bb0, Addr my_addr )
{
   Int      i, j, jmax, n_vars;
   Bool     xxx_known;
   Addr64   xxx_value, yyy_value;
   IRExpr*  udst;
   IRStmt*  st;
   IRConst* con;
   IRSB     *bb1, *bb2;
   Int      unroll_factor;

   if (vex_control.iropt_unroll_thresh <= 0)
      return NULL;

   /* First off, figure out if we can unroll this loop.  Do this
      without modifying bb0. */

   if (bb0->jumpkind != Ijk_Boring)
      return NULL;

   xxx_known = False;
   xxx_value = 0;

   /* Extract the next-guest address.  If it isn't a literal, we 
      have to give up. */

   udst = bb0->next;
   if (udst->tag == Iex_Const
       && (udst->Iex.Const.con->tag == Ico_U32
           || udst->Iex.Const.con->tag == Ico_U64)) {
      /* The BB ends in a jump to a literal location. */
      xxx_known = True;
      xxx_value = udst->Iex.Const.con->tag == Ico_U64
                    ?  udst->Iex.Const.con->Ico.U64
                    : (Addr64)(udst->Iex.Const.con->Ico.U32);
   }

   if (!xxx_known)
      return NULL;

   /* Now we know the BB ends to a jump to a literal location.  If
      it's a jump to itself (viz, idiom #1), move directly to the
      unrolling stage, first cloning the bb so the original isn't
      modified. */
   if (xxx_value == my_addr) {
      unroll_factor = calc_unroll_factor( bb0 );
      if (unroll_factor < 2)
         return NULL;
      bb1 = deepCopyIRSB( bb0 );
      bb0 = NULL;
      udst = NULL; /* is now invalid */
      goto do_unroll;
   }

   /* Search for the second idiomatic form:
        X: BODY; if (c) goto X; goto Y
      We know Y, but need to establish that the last stmt
      is 'if (c) goto X'.
   */
   yyy_value = xxx_value;
   for (i = bb0->stmts_used-1; i >= 0; i--)
      if (bb0->stmts[i])
         break;

   if (i < 0)
      return NULL; /* block with no stmts.  Strange. */

   st = bb0->stmts[i];
   if (st->tag != Ist_Exit)
      return NULL;
   if (st->Ist.Exit.jk != Ijk_Boring)
      return NULL;

   con = st->Ist.Exit.dst;
   vassert(con->tag == Ico_U32 || con->tag == Ico_U64);

   xxx_value = con->tag == Ico_U64 
                  ? st->Ist.Exit.dst->Ico.U64
                  : (Addr64)(st->Ist.Exit.dst->Ico.U32);

   /* If this assertion fails, we have some kind of type error. */
   vassert(con->tag == udst->Iex.Const.con->tag);

   if (xxx_value != my_addr)
      /* We didn't find either idiom.  Give up. */
      return NULL;

   /* Ok, we found idiom #2.  Copy the BB, switch around the xxx and
      yyy values (which makes it look like idiom #1), and go into
      unrolling proper.  This means finding (again) the last stmt, in
      the copied BB. */

   unroll_factor = calc_unroll_factor( bb0 );
   if (unroll_factor < 2)
      return NULL;

   bb1 = deepCopyIRSB( bb0 );
   bb0 = NULL;
   udst = NULL; /* is now invalid */
   for (i = bb1->stmts_used-1; i >= 0; i--)
      if (bb1->stmts[i])
         break;

   /* The next bunch of assertions should be true since we already
      found and checked the last stmt in the original bb. */

   vassert(i >= 0);

   st = bb1->stmts[i];
   vassert(st->tag == Ist_Exit);

   con = st->Ist.Exit.dst;
   vassert(con->tag == Ico_U32 || con->tag == Ico_U64);

   udst = bb1->next;
   vassert(udst->tag == Iex_Const);
   vassert(udst->Iex.Const.con->tag == Ico_U32
          || udst->Iex.Const.con->tag == Ico_U64);
   vassert(con->tag == udst->Iex.Const.con->tag);

   /* switch the xxx and yyy fields around */
   if (con->tag == Ico_U64) {
      udst->Iex.Const.con->Ico.U64 = xxx_value;
      con->Ico.U64 = yyy_value;
   } else {
      udst->Iex.Const.con->Ico.U32 = (UInt)xxx_value;
      con->Ico.U32 = (UInt)yyy_value;
   }

   /* negate the test condition */
   st->Ist.Exit.guard 
      = IRExpr_Unop(Iop_Not1,deepCopyIRExpr(st->Ist.Exit.guard));

   /* --- The unroller proper.  Both idioms are by now --- */
   /* --- now converted to idiom 1. --- */

  do_unroll:

   vassert(unroll_factor == 2 
           || unroll_factor == 4
           || unroll_factor == 8);

   jmax = unroll_factor==8 ? 3 : (unroll_factor==4 ? 2 : 1);
   for (j = 1; j <= jmax; j++) {

      n_vars = bb1->tyenv->types_used;

      bb2 = deepCopyIRSB(bb1);
      for (i = 0; i < n_vars; i++)
         (void)newIRTemp(bb1->tyenv, bb2->tyenv->types[i]);

      for (i = 0; i < bb2->stmts_used; i++) {
         /* deltaIRStmt destructively modifies the stmt, but 
            that's OK since bb2 is a complete fresh copy of bb1. */
         deltaIRStmt(bb2->stmts[i], n_vars);
         addStmtToIRSB(bb1, bb2->stmts[i]);
      }
   }

   if (DEBUG_IROPT) {
      vex_printf("\nUNROLLED (%lx)\n", my_addr);
      ppIRSB(bb1);
      vex_printf("\n");
   }

   /* Flattening; sigh.  The unroller succeeds in breaking flatness
      by negating the test condition.  This should be fixed properly.
      For the moment use this shotgun approach.  */
   return flatten_BB(bb1);
}


/*---------------------------------------------------------------*/
/*--- The tree builder                                        ---*/
/*---------------------------------------------------------------*/

/* This isn't part of IR optimisation.  Really it's a pass done prior
   to instruction selection, which improves the code that the
   instruction selector can produce. */

/* --- The 'tmp' environment is the central data structure here --- */

/* The number of outstanding bindings we're prepared to track.
   The number of times the env becomes full and we have to dump
   the oldest binding (hence reducing code quality) falls very
   rapidly as the env size increases.  8 gives reasonable performance 
   under most circumstances. */
#define A_NENV 10

/* An interval. Used to record the bytes in the guest state accessed
   by a Put[I] statement or by (one or more) Get[I] expression(s). In 
   case of several Get[I] expressions, the lower/upper bounds are recorded.
   This is conservative but cheap.
   E.g. a Put of 8 bytes at address 100 would be recorded as [100,107].
   E.g. an expression that reads 8 bytes at offset 100 and 4 bytes at
   offset 200 would be recorded as [100,203] */
typedef
   struct {
      Bool present;
      Int  low;
      Int  high;
   }
   Interval;

static inline Bool
intervals_overlap(Interval i1, Interval i2)
{
   return (i1.low >= i2.low && i1.low <= i2.high) ||
          (i2.low >= i1.low && i2.low <= i1.high);
}

static inline void
update_interval(Interval *i, Int low, Int high)
{
   vassert(low <= high);

   if (i->present) {
      if (low  < i->low)  i->low  = low;
      if (high > i->high) i->high = high;
   } else {
      i->present = True;
      i->low  = low;
      i->high = high;
   }
}


/* bindee == NULL   ===  slot is not in use
   bindee != NULL   ===  slot is in use
*/
typedef
   struct {
      IRTemp  binder;
      IRExpr* bindee;
      Bool    doesLoad;
      /* Record the bytes of the guest state BINDEE reads from. */
      Interval getInterval;
   }
   ATmpInfo;

__attribute__((unused))
static void ppAEnv ( ATmpInfo* env )
{
   Int i;
   for (i = 0; i < A_NENV; i++) {
      vex_printf("%d  tmp %d  val ", i, (Int)env[i].binder);
      if (env[i].bindee) 
         ppIRExpr(env[i].bindee);
      else 
         vex_printf("(null)");
      vex_printf("\n");
   }
}

/* --- Tree-traversal fns --- */

/* Traverse an expr, and detect if any part of it reads memory or does
   a Get.  Be careful ... this really controls how much the
   tree-builder can reorder the code, so getting it right is critical.
*/
static void setHints_Expr (Bool* doesLoad, Interval* getInterval, IRExpr* e )
{
   Int i;
   switch (e->tag) {
      case Iex_CCall:
         for (i = 0; e->Iex.CCall.args[i]; i++)
            setHints_Expr(doesLoad, getInterval, e->Iex.CCall.args[i]);
         return;
      case Iex_ITE:
         setHints_Expr(doesLoad, getInterval, e->Iex.ITE.cond);
         setHints_Expr(doesLoad, getInterval, e->Iex.ITE.iftrue);
         setHints_Expr(doesLoad, getInterval, e->Iex.ITE.iffalse);
         return;
      case Iex_Qop:
         setHints_Expr(doesLoad, getInterval, e->Iex.Qop.details->arg1);
         setHints_Expr(doesLoad, getInterval, e->Iex.Qop.details->arg2);
         setHints_Expr(doesLoad, getInterval, e->Iex.Qop.details->arg3);
         setHints_Expr(doesLoad, getInterval, e->Iex.Qop.details->arg4);
         return;
      case Iex_Triop:
         setHints_Expr(doesLoad, getInterval, e->Iex.Triop.details->arg1);
         setHints_Expr(doesLoad, getInterval, e->Iex.Triop.details->arg2);
         setHints_Expr(doesLoad, getInterval, e->Iex.Triop.details->arg3);
         return;
      case Iex_Binop:
         setHints_Expr(doesLoad, getInterval, e->Iex.Binop.arg1);
         setHints_Expr(doesLoad, getInterval, e->Iex.Binop.arg2);
         return;
      case Iex_Unop:
         setHints_Expr(doesLoad, getInterval, e->Iex.Unop.arg);
         return;
      case Iex_Load:
         *doesLoad = True;
         setHints_Expr(doesLoad, getInterval, e->Iex.Load.addr);
         return;
      case Iex_Get: {
         Int low = e->Iex.Get.offset;
         Int high = low + sizeofIRType(e->Iex.Get.ty) - 1;
         update_interval(getInterval, low, high);
         return;
      }
      case Iex_GetI: {
         IRRegArray *descr = e->Iex.GetI.descr;
         Int size = sizeofIRType(descr->elemTy);
         Int low  = descr->base;
         Int high = low + descr->nElems * size - 1;
         update_interval(getInterval, low, high);
         setHints_Expr(doesLoad, getInterval, e->Iex.GetI.ix);
         return;
      }
      case Iex_RdTmp:
      case Iex_Const:
         return;
      default: 
         vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
         vpanic("setHints_Expr");
   }
}


/* Add a binding to the front of the env and slide all the rest
   backwards.  It should be the case that the last slot is free. */
static void addToEnvFront ( ATmpInfo* env, IRTemp binder, IRExpr* bindee )
{
   Int i;
   vassert(env[A_NENV-1].bindee == NULL);
   for (i = A_NENV-1; i >= 1; i--)
      env[i] = env[i-1];
   env[0].binder   = binder;
   env[0].bindee   = bindee;
   env[0].doesLoad = False; /* filled in later */
   env[0].getInterval.present  = False; /* filled in later */
   env[0].getInterval.low  = -1; /* filled in later */
   env[0].getInterval.high = -1; /* filled in later */
}

/* Given uses :: array of UShort, indexed by IRTemp
   Add the use-occurrences of temps in this expression 
   to the env. 
*/
static void aoccCount_Expr ( UShort* uses, IRExpr* e )
{
   Int i;

   switch (e->tag) {

      case Iex_RdTmp: /* the only interesting case */
         uses[e->Iex.RdTmp.tmp]++;
         return;

      case Iex_ITE:
         aoccCount_Expr(uses, e->Iex.ITE.cond);
         aoccCount_Expr(uses, e->Iex.ITE.iftrue);
         aoccCount_Expr(uses, e->Iex.ITE.iffalse);
         return;

      case Iex_Qop: 
         aoccCount_Expr(uses, e->Iex.Qop.details->arg1);
         aoccCount_Expr(uses, e->Iex.Qop.details->arg2);
         aoccCount_Expr(uses, e->Iex.Qop.details->arg3);
         aoccCount_Expr(uses, e->Iex.Qop.details->arg4);
         return;

      case Iex_Triop: 
         aoccCount_Expr(uses, e->Iex.Triop.details->arg1);
         aoccCount_Expr(uses, e->Iex.Triop.details->arg2);
         aoccCount_Expr(uses, e->Iex.Triop.details->arg3);
         return;

      case Iex_Binop: 
         aoccCount_Expr(uses, e->Iex.Binop.arg1);
         aoccCount_Expr(uses, e->Iex.Binop.arg2);
         return;

      case Iex_Unop: 
         aoccCount_Expr(uses, e->Iex.Unop.arg);
         return;

      case Iex_Load:
         aoccCount_Expr(uses, e->Iex.Load.addr);
         return;

      case Iex_CCall:
         for (i = 0; e->Iex.CCall.args[i]; i++)
            aoccCount_Expr(uses, e->Iex.CCall.args[i]);
         return;

      case Iex_GetI:
         aoccCount_Expr(uses, e->Iex.GetI.ix);
         return;

      case Iex_Const:
      case Iex_Get:
         return;

      default: 
         vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
         vpanic("aoccCount_Expr");
    }
}


/* Given uses :: array of UShort, indexed by IRTemp
   Add the use-occurrences of temps in this statement 
   to the env. 
*/
static void aoccCount_Stmt ( UShort* uses, IRStmt* st )
{
   Int      i;
   IRDirty* d;
   IRCAS*   cas;
   switch (st->tag) {
      case Ist_AbiHint:
         aoccCount_Expr(uses, st->Ist.AbiHint.base);
         aoccCount_Expr(uses, st->Ist.AbiHint.nia);
         return;
      case Ist_WrTmp: 
         aoccCount_Expr(uses, st->Ist.WrTmp.data); 
         return; 
      case Ist_Put: 
         aoccCount_Expr(uses, st->Ist.Put.data);
         return;
      case Ist_PutI:
         aoccCount_Expr(uses, st->Ist.PutI.details->ix);
         aoccCount_Expr(uses, st->Ist.PutI.details->data);
         return;
      case Ist_Store:
         aoccCount_Expr(uses, st->Ist.Store.addr);
         aoccCount_Expr(uses, st->Ist.Store.data);
         return;
      case Ist_StoreG: {
         IRStoreG* sg = st->Ist.StoreG.details;
         aoccCount_Expr(uses, sg->addr);
         aoccCount_Expr(uses, sg->data);
         aoccCount_Expr(uses, sg->guard);
         return;
      }
      case Ist_LoadG: {
         IRLoadG* lg = st->Ist.LoadG.details;
         aoccCount_Expr(uses, lg->addr);
         aoccCount_Expr(uses, lg->alt);
         aoccCount_Expr(uses, lg->guard);
         return;
      }
      case Ist_CAS:
         cas = st->Ist.CAS.details;
         aoccCount_Expr(uses, cas->addr);
         if (cas->expdHi)
            aoccCount_Expr(uses, cas->expdHi);
         aoccCount_Expr(uses, cas->expdLo);
         if (cas->dataHi)
            aoccCount_Expr(uses, cas->dataHi);
         aoccCount_Expr(uses, cas->dataLo);
         return;
      case Ist_LLSC:
         aoccCount_Expr(uses, st->Ist.LLSC.addr);
         if (st->Ist.LLSC.storedata)
            aoccCount_Expr(uses, st->Ist.LLSC.storedata);
         return;
      case Ist_Dirty:
         d = st->Ist.Dirty.details;
         if (d->mFx != Ifx_None)
            aoccCount_Expr(uses, d->mAddr);
         aoccCount_Expr(uses, d->guard);
         for (i = 0; d->args[i]; i++) {
            IRExpr* arg = d->args[i];
            if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg)))
               aoccCount_Expr(uses, arg);
         }
         return;
      case Ist_NoOp:
      case Ist_IMark:
      case Ist_MBE:
         return;
      case Ist_Exit:
         aoccCount_Expr(uses, st->Ist.Exit.guard);
         return;
      default: 
         vex_printf("\n"); ppIRStmt(st); vex_printf("\n");
         vpanic("aoccCount_Stmt");
   }
}

/* Look up a binding for tmp in the env.  If found, return the bound
   expression, and set the env's binding to NULL so it is marked as
   used.  If not found, return NULL. */

static IRExpr* atbSubst_Temp ( ATmpInfo* env, IRTemp tmp )
{
   Int i;
   for (i = 0; i < A_NENV; i++) {
      if (env[i].binder == tmp && env[i].bindee != NULL) {
         IRExpr* bindee = env[i].bindee;
         env[i].bindee = NULL;
         return bindee;
      }
   }
   return NULL;
}

/* Traverse e, looking for temps.  For each observed temp, see if env
   contains a binding for the temp, and if so return the bound value.
   The env has the property that any binding it holds is
   'single-shot', so once a binding is used, it is marked as no longer
   available, by setting its .bindee field to NULL. */

static inline Bool is_Unop ( IRExpr* e, IROp op ) {
   return e->tag == Iex_Unop && e->Iex.Unop.op == op;
}
static inline Bool is_Binop ( IRExpr* e, IROp op ) {
   return e->tag == Iex_Binop && e->Iex.Binop.op == op;
}

static IRExpr* fold_IRExpr_Binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   switch (op) {
   case Iop_Or32:
      /* Or32( CmpwNEZ32(x), CmpwNEZ32(y) ) --> CmpwNEZ32( Or32( x, y ) )  */
      if (is_Unop(a1, Iop_CmpwNEZ32) && is_Unop(a2, Iop_CmpwNEZ32))
         return IRExpr_Unop( Iop_CmpwNEZ32,
                             IRExpr_Binop( Iop_Or32, a1->Iex.Unop.arg, 
                                                     a2->Iex.Unop.arg ) );
      break;

   case Iop_CmpNE32:
      /* Since X has type Ity_I1 we can simplify:
         CmpNE32(1Uto32(X),0)) ==> X */
      if (is_Unop(a1, Iop_1Uto32) && isZeroU32(a2))
         return a1->Iex.Unop.arg;
      break;

   default:
      break;
   }
   /* no reduction rule applies */
   return IRExpr_Binop( op, a1, a2 );
}

static IRExpr* fold_IRExpr_Unop ( IROp op, IRExpr* aa )
{
   switch (op) {
   case Iop_CmpwNEZ64:
      /* CmpwNEZ64( CmpwNEZ64 ( x ) ) --> CmpwNEZ64 ( x ) */
      if (is_Unop(aa, Iop_CmpwNEZ64))
         return IRExpr_Unop( Iop_CmpwNEZ64, aa->Iex.Unop.arg );
      /* CmpwNEZ64( Or64 ( CmpwNEZ64(x), y ) ) --> CmpwNEZ64( Or64( x, y ) ) */
      if (is_Binop(aa, Iop_Or64) 
          && is_Unop(aa->Iex.Binop.arg1, Iop_CmpwNEZ64))
         return fold_IRExpr_Unop(
                   Iop_CmpwNEZ64,
                   IRExpr_Binop(Iop_Or64, 
                                aa->Iex.Binop.arg1->Iex.Unop.arg, 
                                aa->Iex.Binop.arg2));
      /* CmpwNEZ64( Or64 ( x, CmpwNEZ64(y) ) ) --> CmpwNEZ64( Or64( x, y ) ) */
      if (is_Binop(aa, Iop_Or64)
          && is_Unop(aa->Iex.Binop.arg2, Iop_CmpwNEZ64))
         return fold_IRExpr_Unop(
                   Iop_CmpwNEZ64,
                   IRExpr_Binop(Iop_Or64, 
                                aa->Iex.Binop.arg1, 
                                aa->Iex.Binop.arg2->Iex.Unop.arg));
      break;
   case Iop_CmpNEZ64:
      /* CmpNEZ64( Left64(x) ) --> CmpNEZ64(x) */
      if (is_Unop(aa, Iop_Left64)) 
         return IRExpr_Unop(Iop_CmpNEZ64, aa->Iex.Unop.arg);
      /* CmpNEZ64( 1Uto64(X) ) --> X */
      if (is_Unop(aa, Iop_1Uto64))
         return aa->Iex.Unop.arg;
      break;
   case Iop_CmpwNEZ32:
      /* CmpwNEZ32( CmpwNEZ32 ( x ) ) --> CmpwNEZ32 ( x ) */
      if (is_Unop(aa, Iop_CmpwNEZ32))
         return IRExpr_Unop( Iop_CmpwNEZ32, aa->Iex.Unop.arg );
      break;
   case Iop_CmpNEZ32:
      /* CmpNEZ32( Left32(x) ) --> CmpNEZ32(x) */
      if (is_Unop(aa, Iop_Left32)) 
         return IRExpr_Unop(Iop_CmpNEZ32, aa->Iex.Unop.arg);
      /* CmpNEZ32( 1Uto32(X) ) --> X */
      if (is_Unop(aa, Iop_1Uto32))
         return aa->Iex.Unop.arg;
      /* CmpNEZ32( 64to32( CmpwNEZ64(X) ) ) --> CmpNEZ64(X) */
      if (is_Unop(aa, Iop_64to32) && is_Unop(aa->Iex.Unop.arg, Iop_CmpwNEZ64))
         return IRExpr_Unop(Iop_CmpNEZ64, aa->Iex.Unop.arg->Iex.Unop.arg);
      break;
   case Iop_CmpNEZ8:
      /* CmpNEZ8( 1Uto8(X) ) --> X */
      if (is_Unop(aa, Iop_1Uto8))
         return aa->Iex.Unop.arg;
      break;
   case Iop_Left32:
      /* Left32( Left32(x) ) --> Left32(x) */
      if (is_Unop(aa, Iop_Left32))
         return IRExpr_Unop( Iop_Left32, aa->Iex.Unop.arg );
      break;
   case Iop_Left64:
      /* Left64( Left64(x) ) --> Left64(x) */
      if (is_Unop(aa, Iop_Left64))
         return IRExpr_Unop( Iop_Left64, aa->Iex.Unop.arg );
      break;
   case Iop_ZeroHI64ofV128:
      /* ZeroHI64ofV128( ZeroHI64ofV128(x) ) --> ZeroHI64ofV128(x) */
      if (is_Unop(aa, Iop_ZeroHI64ofV128))
         return IRExpr_Unop( Iop_ZeroHI64ofV128, aa->Iex.Unop.arg );
      break;
   case Iop_32to1:
      /* 32to1( 1Uto32 ( x ) ) --> x */
      if (is_Unop(aa, Iop_1Uto32))
         return aa->Iex.Unop.arg;
      /* 32to1( CmpwNEZ32 ( x )) --> CmpNEZ32(x) */
      if (is_Unop(aa, Iop_CmpwNEZ32))
         return IRExpr_Unop( Iop_CmpNEZ32, aa->Iex.Unop.arg );
      break;
   case Iop_64to1:
      /* 64to1( 1Uto64 ( x ) ) --> x */
      if (is_Unop(aa, Iop_1Uto64))
         return aa->Iex.Unop.arg;
      /* 64to1( CmpwNEZ64 ( x )) --> CmpNEZ64(x) */
      if (is_Unop(aa, Iop_CmpwNEZ64))
         return IRExpr_Unop( Iop_CmpNEZ64, aa->Iex.Unop.arg );
      break;
   case Iop_64to32:
      /* 64to32( 32Uto64 ( x )) --> x */
      if (is_Unop(aa, Iop_32Uto64))
         return aa->Iex.Unop.arg;
      /* 64to32( 8Uto64 ( x )) --> 8Uto32(x) */
      if (is_Unop(aa, Iop_8Uto64))
         return IRExpr_Unop(Iop_8Uto32, aa->Iex.Unop.arg);
      break;

   case Iop_32Uto64:
      /* 32Uto64( 8Uto32( x )) --> 8Uto64(x) */
      if (is_Unop(aa, Iop_8Uto32))
         return IRExpr_Unop(Iop_8Uto64, aa->Iex.Unop.arg);
      /* 32Uto64( 16Uto32( x )) --> 16Uto64(x) */
      if (is_Unop(aa, Iop_16Uto32))
         return IRExpr_Unop(Iop_16Uto64, aa->Iex.Unop.arg);
      /* 32Uto64(64to32( Shr64( 32Uto64(64to32(x)), sh ))
                     --> Shr64( 32Uto64(64to32(x)), sh )) */
      if (is_Unop(aa, Iop_64to32)
          && is_Binop(aa->Iex.Unop.arg, Iop_Shr64)
          && is_Unop(aa->Iex.Unop.arg->Iex.Binop.arg1, Iop_32Uto64)
          && is_Unop(aa->Iex.Unop.arg->Iex.Binop.arg1->Iex.Unop.arg,
                     Iop_64to32)) {
         return aa->Iex.Unop.arg;
      }
      /*     32Uto64(64to32( Shl64( 32Uto64(64to32(x)), sh ))
         --> 32Uto64(64to32( Shl64(                x,   sh )) */
      if (is_Unop(aa, Iop_64to32)
          && is_Binop(aa->Iex.Unop.arg, Iop_Shl64)
          && is_Unop(aa->Iex.Unop.arg->Iex.Binop.arg1, Iop_32Uto64)
          && is_Unop(aa->Iex.Unop.arg->Iex.Binop.arg1->Iex.Unop.arg,
                     Iop_64to32)) {
         return
            IRExpr_Unop(
               Iop_32Uto64,
               IRExpr_Unop(
                  Iop_64to32,
                  IRExpr_Binop(
                     Iop_Shl64, 
                     aa->Iex.Unop.arg->Iex.Binop.arg1->Iex.Unop.arg->Iex.Unop.arg,
                     aa->Iex.Unop.arg->Iex.Binop.arg2
            )));
      }
      break;

   case Iop_1Sto32:
      /* 1Sto32( CmpNEZ8( 32to8( 1Uto32( CmpNEZ32( x ))))) -> CmpwNEZ32(x) */
      if (is_Unop(aa, Iop_CmpNEZ8)
          && is_Unop(aa->Iex.Unop.arg, Iop_32to8)
          && is_Unop(aa->Iex.Unop.arg->Iex.Unop.arg, Iop_1Uto32)
          && is_Unop(aa->Iex.Unop.arg->Iex.Unop.arg->Iex.Unop.arg,
                     Iop_CmpNEZ32)) {
         return IRExpr_Unop( Iop_CmpwNEZ32,
                             aa->Iex.Unop.arg->Iex.Unop.arg
                               ->Iex.Unop.arg->Iex.Unop.arg);
      }
      break;

   default:
      break;
   }
   /* no reduction rule applies */
   return IRExpr_Unop( op, aa );
}

static IRExpr* atbSubst_Expr ( ATmpInfo* env, IRExpr* e )
{
   IRExpr*  e2;
   IRExpr** args2;
   Int      i;

   switch (e->tag) {

      case Iex_CCall:
         args2 = shallowCopyIRExprVec(e->Iex.CCall.args);
         for (i = 0; args2[i]; i++)
            args2[i] = atbSubst_Expr(env,args2[i]);
         return IRExpr_CCall(
                   e->Iex.CCall.cee,
                   e->Iex.CCall.retty,
                   args2
                );
      case Iex_RdTmp:
         e2 = atbSubst_Temp(env, e->Iex.RdTmp.tmp);
         return e2 ? e2 : e;
      case Iex_ITE:
         return IRExpr_ITE(
                   atbSubst_Expr(env, e->Iex.ITE.cond),
                   atbSubst_Expr(env, e->Iex.ITE.iftrue),
                   atbSubst_Expr(env, e->Iex.ITE.iffalse)
                );
      case Iex_Qop:
         return IRExpr_Qop(
                   e->Iex.Qop.details->op,
                   atbSubst_Expr(env, e->Iex.Qop.details->arg1),
                   atbSubst_Expr(env, e->Iex.Qop.details->arg2),
                   atbSubst_Expr(env, e->Iex.Qop.details->arg3),
                   atbSubst_Expr(env, e->Iex.Qop.details->arg4)
                );
      case Iex_Triop:
         return IRExpr_Triop(
                   e->Iex.Triop.details->op,
                   atbSubst_Expr(env, e->Iex.Triop.details->arg1),
                   atbSubst_Expr(env, e->Iex.Triop.details->arg2),
                   atbSubst_Expr(env, e->Iex.Triop.details->arg3)
                );
      case Iex_Binop:
         return fold_IRExpr_Binop(
                   e->Iex.Binop.op,
                   atbSubst_Expr(env, e->Iex.Binop.arg1),
                   atbSubst_Expr(env, e->Iex.Binop.arg2)
                );
      case Iex_Unop:
         return fold_IRExpr_Unop(
                   e->Iex.Unop.op,
                   atbSubst_Expr(env, e->Iex.Unop.arg)
                );
      case Iex_Load:
         return IRExpr_Load(
                   e->Iex.Load.end,
                   e->Iex.Load.ty,
                   atbSubst_Expr(env, e->Iex.Load.addr)
                );
      case Iex_GetI:
         return IRExpr_GetI(
                   e->Iex.GetI.descr,
                   atbSubst_Expr(env, e->Iex.GetI.ix),
                   e->Iex.GetI.bias
                );
      case Iex_Const:
      case Iex_Get:
         return e;
      default: 
         vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
         vpanic("atbSubst_Expr");
   }
}

/* Same deal as atbSubst_Expr, except for stmts. */

static IRStmt* atbSubst_Stmt ( ATmpInfo* env, IRStmt* st )
{
   Int     i;
   IRDirty *d, *d2;
   IRCAS   *cas, *cas2;
   IRPutI  *puti, *puti2;

   switch (st->tag) {
      case Ist_AbiHint:
         return IRStmt_AbiHint(
                   atbSubst_Expr(env, st->Ist.AbiHint.base),
                   st->Ist.AbiHint.len,
                   atbSubst_Expr(env, st->Ist.AbiHint.nia)
                );
      case Ist_Store:
         return IRStmt_Store(
                   st->Ist.Store.end,
                   atbSubst_Expr(env, st->Ist.Store.addr),
                   atbSubst_Expr(env, st->Ist.Store.data)
                );
      case Ist_StoreG: {
         IRStoreG* sg = st->Ist.StoreG.details;
         return IRStmt_StoreG(sg->end,
                              atbSubst_Expr(env, sg->addr),
                              atbSubst_Expr(env, sg->data),
                              atbSubst_Expr(env, sg->guard));
      }
      case Ist_LoadG: {
         IRLoadG* lg = st->Ist.LoadG.details;
         return IRStmt_LoadG(lg->end, lg->cvt, lg->dst,
                             atbSubst_Expr(env, lg->addr),
                             atbSubst_Expr(env, lg->alt),
                             atbSubst_Expr(env, lg->guard));
      }
      case Ist_WrTmp:
         return IRStmt_WrTmp(
                   st->Ist.WrTmp.tmp,
                   atbSubst_Expr(env, st->Ist.WrTmp.data)
                );
      case Ist_Put:
         return IRStmt_Put(
                   st->Ist.Put.offset,
                   atbSubst_Expr(env, st->Ist.Put.data)
                );
      case Ist_PutI:
         puti  = st->Ist.PutI.details;
         puti2 = mkIRPutI(puti->descr, 
                          atbSubst_Expr(env, puti->ix),
                          puti->bias,
                          atbSubst_Expr(env, puti->data));
         return IRStmt_PutI(puti2);

      case Ist_Exit:
         return IRStmt_Exit(
                   atbSubst_Expr(env, st->Ist.Exit.guard),
                   st->Ist.Exit.jk,
                   st->Ist.Exit.dst,
                   st->Ist.Exit.offsIP
                );
      case Ist_IMark:
         return IRStmt_IMark(st->Ist.IMark.addr,
                             st->Ist.IMark.len,
                             st->Ist.IMark.delta);
      case Ist_NoOp:
         return IRStmt_NoOp();
      case Ist_MBE:
         return IRStmt_MBE(st->Ist.MBE.event);
      case Ist_CAS:
         cas  = st->Ist.CAS.details;
         cas2 = mkIRCAS(
                   cas->oldHi, cas->oldLo, cas->end, 
                   atbSubst_Expr(env, cas->addr),
                   cas->expdHi ? atbSubst_Expr(env, cas->expdHi) : NULL,
                   atbSubst_Expr(env, cas->expdLo),
                   cas->dataHi ? atbSubst_Expr(env, cas->dataHi) : NULL,
                   atbSubst_Expr(env, cas->dataLo)
                );
         return IRStmt_CAS(cas2);
      case Ist_LLSC:
         return IRStmt_LLSC(
                   st->Ist.LLSC.end,
                   st->Ist.LLSC.result,
                   atbSubst_Expr(env, st->Ist.LLSC.addr),
                   st->Ist.LLSC.storedata
                      ? atbSubst_Expr(env, st->Ist.LLSC.storedata) : NULL
                );
      case Ist_Dirty:
         d  = st->Ist.Dirty.details;
         d2 = emptyIRDirty();
         *d2 = *d;
         if (d2->mFx != Ifx_None)
            d2->mAddr = atbSubst_Expr(env, d2->mAddr);
         d2->guard = atbSubst_Expr(env, d2->guard);
         for (i = 0; d2->args[i]; i++) {
            IRExpr* arg = d2->args[i];
            if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg)))
               d2->args[i] = atbSubst_Expr(env, arg);
         }
         return IRStmt_Dirty(d2);
      default: 
         vex_printf("\n"); ppIRStmt(st); vex_printf("\n");
         vpanic("atbSubst_Stmt");
   }
}

inline
static Bool dirty_helper_stores ( const IRDirty *d )
{
   return d->mFx == Ifx_Write || d->mFx == Ifx_Modify;
}

inline
static Interval dirty_helper_puts (
                   const IRDirty *d,
                   Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
                   VexRegisterUpdates pxControl,
                   /*OUT*/Bool *requiresPreciseMemExns
                )
{
   Int i;
   Interval interval;

   /* Passing the guest state pointer opens the door to modifying the
      guest state under the covers.  It's not allowed, but let's be
      extra conservative and assume the worst. */
   for (i = 0; d->args[i]; i++) {
      if (UNLIKELY(d->args[i]->tag == Iex_BBPTR)) {
         *requiresPreciseMemExns = True;
         /* Assume all guest state is written. */
         interval.present = True;
         interval.low  = 0;
         interval.high = 0x7FFFFFFF;
         return interval;
      }
   }

   /* Check the side effects on the guest state */
   interval.present = False;
   interval.low = interval.high = -1;
   *requiresPreciseMemExns = False;

   for (i = 0; i < d->nFxState; ++i) {
      if (d->fxState[i].fx != Ifx_Read) {
         Int offset = d->fxState[i].offset;
         Int size = d->fxState[i].size;
         Int nRepeats = d->fxState[i].nRepeats;
         Int repeatLen = d->fxState[i].repeatLen;

         if (preciseMemExnsFn(offset,
                              offset + nRepeats * repeatLen + size - 1,
                              pxControl)) {
            *requiresPreciseMemExns = True;
         }
         update_interval(&interval, offset,
                         offset + nRepeats * repeatLen + size - 1);
      }
   }

   return interval;
}

/* Return an interval if st modifies the guest state.  Via
   requiresPreciseMemExns return whether or not that modification
   requires precise exceptions. */
static Interval stmt_modifies_guest_state ( 
                   IRSB *bb, const IRStmt *st,
                   Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
                   VexRegisterUpdates pxControl,
                   /*OUT*/Bool *requiresPreciseMemExns
                )
{
   Interval interval;

   switch (st->tag) {
   case Ist_Put: {
      Int offset = st->Ist.Put.offset;
      Int size = sizeofIRType(typeOfIRExpr(bb->tyenv, st->Ist.Put.data));

      *requiresPreciseMemExns
         = preciseMemExnsFn(offset, offset + size - 1, pxControl);
      interval.present = True;
      interval.low  = offset;
      interval.high = offset + size - 1;
      return interval;
   }

   case Ist_PutI: {
      IRRegArray *descr = st->Ist.PutI.details->descr;
      Int offset = descr->base;
      Int size = sizeofIRType(descr->elemTy);

      /* We quietly assume here that all segments are contiguous and there
         are no holes. This is to avoid a loop. The assumption is conservative
         in the sense that we might report that precise memory exceptions are
         needed when in fact they are not. */
      *requiresPreciseMemExns
         = preciseMemExnsFn(offset, offset + descr->nElems * size - 1,
                            pxControl);
      interval.present = True;
      interval.low  = offset;
      interval.high = offset + descr->nElems * size - 1;
      return interval;
   }

   case Ist_Dirty:
      return dirty_helper_puts(st->Ist.Dirty.details,
                               preciseMemExnsFn, pxControl,
                               requiresPreciseMemExns);

   default:
      *requiresPreciseMemExns = False;
      interval.present = False;
      interval.low  = -1;
      interval.high = -1;
      return interval;
   }
}

/* notstatic */ Addr ado_treebuild_BB (
                        IRSB* bb,
                        Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
                        VexRegisterUpdates pxControl
                     )
{
   Int      i, j, k, m;
   Bool     stmtStores, invalidateMe;
   Interval putInterval;
   IRStmt*  st;
   IRStmt*  st2;
   ATmpInfo env[A_NENV];

   Bool   max_ga_known = False;
   Addr   max_ga       = 0;

   Int       n_tmps = bb->tyenv->types_used;
   UShort*   uses   = LibVEX_Alloc_inline(n_tmps * sizeof(UShort));

   /* Phase 1.  Scan forwards in bb, counting use occurrences of each
      temp.  Also count occurrences in the bb->next field.  Take the
      opportunity to also find the maximum guest address in the block,
      since that will be needed later for deciding when we can safely
      elide event checks. */

   for (i = 0; i < n_tmps; i++)
      uses[i] = 0;

   for (i = 0; i < bb->stmts_used; i++) {
      st = bb->stmts[i];
      switch (st->tag) {
         case Ist_NoOp:
            continue;
         case Ist_IMark: {
            UInt len = st->Ist.IMark.len;
            Addr mga = st->Ist.IMark.addr + (len < 1 ? 1 : len) - 1;
            max_ga_known = True;
            if (mga > max_ga)
               max_ga = mga;
            break;
         }
         default:
            break;
      }
      aoccCount_Stmt( uses, st );
   }
   aoccCount_Expr(uses, bb->next );

#  if 0
   for (i = 0; i < n_tmps; i++) {
      if (uses[i] == 0)
        continue;
      ppIRTemp( (IRTemp)i );
      vex_printf("  used %d\n", (Int)uses[i] );
   }
#  endif

   /* Phase 2.  Scan forwards in bb.  For each statement in turn:

         If the env is full, emit the end element.  This guarantees
         there is at least one free slot in the following.

         On seeing 't = E', occ(t)==1,  
            let E'=env(E)
            delete this stmt
            add t -> E' to the front of the env
            Examine E' and set the hints for E' appropriately
              (doesLoad? doesGet?)

         On seeing any other stmt, 
            let stmt' = env(stmt)
            remove from env any 't=E' binds invalidated by stmt
                emit the invalidated stmts
            emit stmt'
            compact any holes in env 
              by sliding entries towards the front

      Finally, apply env to bb->next.  
   */

   for (i = 0; i < A_NENV; i++) {
      env[i].bindee = NULL;
      env[i].binder = IRTemp_INVALID;
   }

   /* The stmts in bb are being reordered, and we are guaranteed to
      end up with no more than the number we started with.  Use i to
      be the cursor of the current stmt examined and j <= i to be that
      for the current stmt being written. 
   */
   j = 0;
   for (i = 0; i < bb->stmts_used; i++) {

      st = bb->stmts[i];
      if (st->tag == Ist_NoOp)
         continue;
     
      /* Ensure there's at least one space in the env, by emitting
         the oldest binding if necessary. */
      if (env[A_NENV-1].bindee != NULL) {
         bb->stmts[j] = IRStmt_WrTmp( env[A_NENV-1].binder, 
                                      env[A_NENV-1].bindee );
         j++;
         vassert(j <= i);
         env[A_NENV-1].bindee = NULL;
      }

      /* Consider current stmt. */
      if (st->tag == Ist_WrTmp && uses[st->Ist.WrTmp.tmp] <= 1) {
         IRExpr *e, *e2;

         /* optional extra: dump dead bindings as we find them.
            Removes the need for a prior dead-code removal pass. */
         if (uses[st->Ist.WrTmp.tmp] == 0) {
	    if (0) vex_printf("DEAD binding\n");
            continue; /* for (i = 0; i < bb->stmts_used; i++) loop */
         }
         vassert(uses[st->Ist.WrTmp.tmp] == 1);

         /* ok, we have 't = E', occ(t)==1.  Do the abovementioned
            actions. */
         e  = st->Ist.WrTmp.data;
         e2 = atbSubst_Expr(env, e);
         addToEnvFront(env, st->Ist.WrTmp.tmp, e2);
         setHints_Expr(&env[0].doesLoad, &env[0].getInterval, e2);
         /* don't advance j, as we are deleting this stmt and instead
            holding it temporarily in the env. */
         continue; /* for (i = 0; i < bb->stmts_used; i++) loop */
      }

      /* we get here for any other kind of statement. */
      /* 'use up' any bindings required by the current statement. */
      st2 = atbSubst_Stmt(env, st);

      /* Now, before this stmt, dump any bindings in env that it
         invalidates.  These need to be dumped in the order in which
         they originally entered env -- that means from oldest to
         youngest. */

      /* putInterval/stmtStores characterise what the stmt under
         consideration does, or might do (sidely safe @ True). */

      Bool putRequiresPreciseMemExns;
      putInterval = stmt_modifies_guest_state(
                       bb, st, preciseMemExnsFn, pxControl,
                       &putRequiresPreciseMemExns
                    );

      /* be True if this stmt writes memory or might do (==> we don't
         want to reorder other loads or stores relative to it).  Also,
         both LL and SC fall under this classification, since we
         really ought to be conservative and not reorder any other
         memory transactions relative to them. */
      stmtStores
         = toBool( st->tag == Ist_Store
                   || (st->tag == Ist_Dirty
                       && dirty_helper_stores(st->Ist.Dirty.details))
                   || st->tag == Ist_LLSC
                   || st->tag == Ist_CAS );

      for (k = A_NENV-1; k >= 0; k--) {
         if (env[k].bindee == NULL)
            continue;
         /* Compare the actions of this stmt with the actions of
            binding 'k', to see if they invalidate the binding. */
         invalidateMe
            = toBool(
              /* a store invalidates loaded data */
              (env[k].doesLoad && stmtStores)
              /* a put invalidates get'd data, if they overlap */
              || ((env[k].getInterval.present && putInterval.present) &&
                  intervals_overlap(env[k].getInterval, putInterval))
              /* a put invalidates loaded data. That means, in essense, that
                 a load expression cannot be substituted into a statement
                 that follows the put. But there is nothing wrong doing so
                 except when the put statement requries precise exceptions.
                 Think of a load that is moved past a put where the put
                 updates the IP in the guest state. If the load generates
                 a segfault, the wrong address (line number) would be
                 reported. */
              || (env[k].doesLoad && putInterval.present &&
                  putRequiresPreciseMemExns)
              /* probably overly conservative: a memory bus event
                 invalidates absolutely everything, so that all
                 computation prior to it is forced to complete before
                 proceeding with the event (fence,lock,unlock). */
              || st->tag == Ist_MBE
              /* also be (probably overly) paranoid re AbiHints */
              || st->tag == Ist_AbiHint
              );
         if (invalidateMe) {
            bb->stmts[j] = IRStmt_WrTmp( env[k].binder, env[k].bindee );
            j++;
            vassert(j <= i);
            env[k].bindee = NULL;
         }
      }

      /* Slide in-use entries in env up to the front */
      m = 0;
      for (k = 0; k < A_NENV; k++) {
         if (env[k].bindee != NULL) {
            env[m] = env[k];
            m++;
	 }
      }
      for (m = m; m < A_NENV; m++) {
         env[m].bindee = NULL;
      }

      /* finally, emit the substituted statement */
      bb->stmts[j] = st2;
      /* vex_printf("**2  "); ppIRStmt(bb->stmts[j]); vex_printf("\n"); */
      j++;

      vassert(j <= i+1);
   } /* for each stmt in the original bb ... */

   /* Finally ... substitute the ->next field as much as possible, and
      dump any left-over bindings.  Hmm.  Perhaps there should be no
      left over bindings?  Or any left-over bindings are
      by definition dead? */
   bb->next = atbSubst_Expr(env, bb->next);
   bb->stmts_used = j;

   return max_ga_known ? max_ga : ~(Addr)0;
}


/*---------------------------------------------------------------*/
/*--- MSVC specific transformation hacks                      ---*/
/*---------------------------------------------------------------*/

/* The purpose of all this is to find MSVC's idiom for non-constant
   bitfield assignment, "a ^ ((a ^ b) & c)", and transform it into
   gcc's idiom "(a & ~c) | (b & c)".  Motivation is that Memcheck has
   generates a lot of false positives from the MSVC version because it
   doesn't understand that XORing an undefined bit with itself gives a
   defined result.

   This isn't a problem for the simple case "x ^ x", because iropt
   folds it to a constant zero before Memcheck ever sees it.  But in
   this case we have an intervening "& c" which defeats the simple
   case.  So we have to carefully inspect all expressions rooted at an
   XOR to see if any of them match "a ^ ((a ^ b) & c)", or any of the
   7 other variants resulting from swapping the order of arguments to
   the three binary operations.  If we get a match, we then replace
   the tree with "(a & ~c) | (b & c)", and Memcheck is happy.

   The key difficulty is to spot the two uses of "a".  To normalise
   the IR to maximise the chances of success, we first do a CSE pass,
   with CSEing of loads enabled, since the two "a" expressions may be
   loads, which need to be commoned up.  Then we do a constant folding
   pass, so as to remove any tmp-to-tmp assignment chains that would
   make matching the original expression more difficult.
*/


/* Helper function for debug printing */
__attribute__((unused))
static void print_flat_expr ( IRExpr** env, IRExpr* e )
{
   if (e == NULL) {
      vex_printf("?");
      return;
   }
   switch (e->tag) {
      case Iex_Binop: {
         ppIROp(e->Iex.Binop.op);
         vex_printf("(");
         print_flat_expr(env, e->Iex.Binop.arg1);
         vex_printf(",");
         print_flat_expr(env, e->Iex.Binop.arg2);
         vex_printf(")");
         break;
      }
      case Iex_Unop: {
         ppIROp(e->Iex.Unop.op);
         vex_printf("(");
         print_flat_expr(env, e->Iex.Unop.arg);
         vex_printf(")");
         break;
      }
      case Iex_RdTmp:
         ppIRTemp(e->Iex.RdTmp.tmp);
         vex_printf("=");
         print_flat_expr(env, chase(env, e));
         break;
      case Iex_Const:
      case Iex_CCall:
      case Iex_Load:
      case Iex_ITE:
      case Iex_Get:
         ppIRExpr(e);
         break;
      default:
         vex_printf("FAIL: "); ppIRExpr(e); vex_printf("\n");
         vassert(0);
   }
}

/* Spot   a ^ ((a ^ b) & c)   for a,b and c tmp-or-const (atoms)
   or any of the other 7 variants generated by switching the order
   of arguments to the outer ^, the inner ^ and the &.
*/
static UInt spotBitfieldAssignment ( /*OUT*/IRExpr** aa, /*OUT*/IRExpr** bb,
                                     /*OUT*/IRExpr** cc,
                                     IRExpr** env, IRExpr* e,
                                     IROp opAND, IROp opXOR)
{
#  define ISBIN(_e,_op) ((_e) && (_e)->tag == Iex_Binop \
                              && (_e)->Iex.Binop.op == (_op))
#  define ISATOM(_e)    isIRAtom(_e)
#  define STEP(_e)      chase1(env, (_e))
#  define LL(_e)        ((_e)->Iex.Binop.arg1)
#  define RR(_e)        ((_e)->Iex.Binop.arg2)

   IRExpr *a1, *and, *xor, *c, *a2bL, *a2bR;

   /* This is common to all 8 cases */
   if (!ISBIN(e, opXOR)) goto fail;

   /*                        -----and------ */
   /*                         --xor---      */
   /* find variant 1:   a1 ^ ((a2 ^ b) & c) */
   /* find variant 2:   a1 ^ ((b ^ a2) & c) */
   a1 = and = xor = c = a2bL = a2bR = NULL;

   a1   = LL(e);
   and  = STEP(RR(e));
   if (!ISBIN(and, opAND)) goto v34;
   xor  = STEP(LL(and));
   c    = RR(and);
   if (!ISBIN(xor, opXOR)) goto v34;
   a2bL = LL(xor);
   a2bR = RR(xor);

   if (eqIRAtom(a1, a2bL) && !eqIRAtom(a1, a2bR)) {
      *aa = a1;
      *bb = a2bR;
      *cc = c;
      return 1;
   }
   if (eqIRAtom(a1, a2bR) && !eqIRAtom(a1, a2bL)) {
      *aa = a1;
      *bb = a2bL;
      *cc = c;
      return 2;
   }

  v34:
   /*                   -----and------      */
   /*                    --xor---           */
   /* find variant 3:   ((a2 ^ b) & c) ^ a1 */
   /* find variant 4:   ((b ^ a2) & c) ^ a1 */
   a1 = and = xor = c = a2bL = a2bR = NULL;

   a1   = RR(e);
   and  = STEP(LL(e));
   if (!ISBIN(and, opAND)) goto v56;
   xor  = STEP(LL(and));
   c    = RR(and);
   if (!ISBIN(xor, opXOR)) goto v56;
   a2bL = LL(xor);
   a2bR = RR(xor);

   if (eqIRAtom(a1, a2bL) && !eqIRAtom(a1, a2bR)) {
      *aa = a1;
      *bb = a2bR;
      *cc = c;
      return 3;
   }
   if (eqIRAtom(a1, a2bR) && !eqIRAtom(a1, a2bL)) {
      *aa = a1;
      *bb = a2bL;
      *cc = c;
      return 4;
   }

  v56:
   /*                        -----and------ */
   /*                         --xor---      */
   /* find variant 5:   a1 ^ (c & (a2 ^ b)) */
   /* find variant 6:   a1 ^ (c & (b ^ a2)) */
   a1 = and = xor = c = a2bL = a2bR = NULL;

   a1   = LL(e);
   and  = STEP(RR(e));
   if (!ISBIN(and, opAND)) goto v78;
   xor  = STEP(RR(and));
   c    = LL(and);
   if (!ISBIN(xor, opXOR)) goto v78;
   a2bL = LL(xor);
   a2bR = RR(xor);

   if (eqIRAtom(a1, a2bL) && !eqIRAtom(a1, a2bR)) {
      *aa = a1;
      *bb = a2bR;
      *cc = c;
      vassert(5-5); // ATC
      return 5;
   }
   if (eqIRAtom(a1, a2bR) && !eqIRAtom(a1, a2bL)) {
      *aa = a1;
      *bb = a2bL;
      *cc = c;
      vassert(6-6); // ATC
      return 6;
   }

 v78:
   /*                   -----and------      */
   /*                    --xor---           */
   /* find variant 7:   (c & (a2 ^ b)) ^ a1 */
   /* find variant 8:   (c & (b ^ a2)) ^ a1 */
   a1 = and = xor = c = a2bL = a2bR = NULL;

   a1   = RR(e);
   and  = STEP(LL(e));
   if (!ISBIN(and, opAND)) goto fail;
   xor  = STEP(RR(and));
   c    = LL(and);
   if (!ISBIN(xor, opXOR)) goto fail;
   a2bL = LL(xor);
   a2bR = RR(xor);

   if (eqIRAtom(a1, a2bL) && !eqIRAtom(a1, a2bR)) {
      *aa = a1;
      *bb = a2bR;
      *cc = c;
      return 7;
   }
   if (eqIRAtom(a1, a2bR) && !eqIRAtom(a1, a2bL)) {
      *aa = a1;
      *bb = a2bL;
      *cc = c;
      return 8;
   }

 fail:
   return 0;

#  undef ISBIN
#  undef ISATOM
#  undef STEP
#  undef LL
#  undef RR
}

/* If |e| is of the form a ^ ((a ^ b) & c) (or any of the 7 other
   variants thereof generated by switching arguments around), return
   the IRExpr* for (a & ~c) | (b & c).  Else return NULL. */
static IRExpr* do_XOR_TRANSFORMS_IRExpr ( IRExpr** env, IRExpr* e )
{
   if (e->tag != Iex_Binop)
      return NULL;

   const HChar* tyNm = NULL;
   IROp   opOR  = Iop_INVALID;
   IROp   opAND = Iop_INVALID;
   IROp   opNOT = Iop_INVALID;
   IROp   opXOR = Iop_INVALID;
   switch (e->Iex.Binop.op) {
      case Iop_Xor32:
         tyNm  = "I32";
         opOR  = Iop_Or32;  opAND = Iop_And32;
         opNOT = Iop_Not32; opXOR = Iop_Xor32;
         break;
      case Iop_Xor16:
         tyNm  = "I16";
         opOR  = Iop_Or16;  opAND = Iop_And16;
         opNOT = Iop_Not16; opXOR = Iop_Xor16;
         break;
      case Iop_Xor8:
         tyNm  = "I8";
         opOR  = Iop_Or8;  opAND = Iop_And8;
         opNOT = Iop_Not8; opXOR = Iop_Xor8;
         break;
      default:
         return NULL;
   }

   IRExpr* aa = NULL;
   IRExpr* bb = NULL;
   IRExpr* cc = NULL;
   UInt variant = spotBitfieldAssignment(&aa, &bb, &cc, env, e, opAND, opXOR);
   if (variant > 0) {
      static UInt ctr = 0;
      if (0)
         vex_printf("XXXXXXXXXX Bitfield Assignment number %u, "
                    "type %s, variant %u\n",
                    ++ctr, tyNm, variant);
      /* It's vitally important that the returned aa, bb and cc are
         atoms -- either constants or tmps.  If it's anything else
         (eg, a GET) then incorporating them in a tree at this point
         in the SB may erroneously pull them forwards (eg of a PUT
         that originally was after the GET) and so transform the IR
         wrongly.  spotBitfieldAssignment should guarantee only to
         give us atoms, but we check here anyway. */
      vassert(aa && isIRAtom(aa));
      vassert(bb && isIRAtom(bb));
      vassert(cc && isIRAtom(cc));
      return IRExpr_Binop(
                opOR,
                IRExpr_Binop(opAND, aa, IRExpr_Unop(opNOT, cc)),
                IRExpr_Binop(opAND, bb, cc)
             );
   }
   return NULL;
}


/* SB is modified in-place.  Visit all the IRExprs and, for those
   which are allowed to be non-atomic, perform the XOR transform if
   possible.  This makes |sb| be non-flat, but that's ok, the caller
   can re-flatten it.  Returns True iff any changes were made. */
static Bool do_XOR_TRANSFORM_IRSB ( IRSB* sb )
{
   Int  i;
   Bool changed = False;

   /* Make the tmp->expr environment, so we can use it for
      chasing expressions. */
   Int      n_tmps = sb->tyenv->types_used;
   IRExpr** env = LibVEX_Alloc_inline(n_tmps * sizeof(IRExpr*));
   for (i = 0; i < n_tmps; i++)
      env[i] = NULL;

   for (i = 0; i < sb->stmts_used; i++) {
      IRStmt* st = sb->stmts[i];
      if (st->tag != Ist_WrTmp)
         continue;
      IRTemp t = st->Ist.WrTmp.tmp;
      vassert(t >= 0 && t < n_tmps);
      env[t] = st->Ist.WrTmp.data;
   }

   for (i = 0; i < sb->stmts_used; i++) {
      IRStmt* st = sb->stmts[i];

      switch (st->tag) {
         case Ist_AbiHint:
            vassert(isIRAtom(st->Ist.AbiHint.base));
            vassert(isIRAtom(st->Ist.AbiHint.nia));
            break;
         case Ist_Put:
            vassert(isIRAtom(st->Ist.Put.data));
            break;
         case Ist_PutI: {
            IRPutI* puti = st->Ist.PutI.details;
            vassert(isIRAtom(puti->ix));
            vassert(isIRAtom(puti->data));
            break;
         }
         case Ist_WrTmp: {
            /* This is the one place where an expr (st->Ist.WrTmp.data) is
               allowed to be more than just a constant or a tmp. */
            IRExpr* mb_new_data
               = do_XOR_TRANSFORMS_IRExpr(env, st->Ist.WrTmp.data);
            if (mb_new_data) {
               //ppIRSB(sb);
               st->Ist.WrTmp.data = mb_new_data;
               //ppIRSB(sb);
               changed = True;
            }
            break;
         }
         case Ist_Store:
            vassert(isIRAtom(st->Ist.Store.addr));
            vassert(isIRAtom(st->Ist.Store.data));
            break;
         case Ist_StoreG: {
            IRStoreG* sg = st->Ist.StoreG.details;
            vassert(isIRAtom(sg->addr));
            vassert(isIRAtom(sg->data));
            vassert(isIRAtom(sg->guard));
            break;
         }
         case Ist_LoadG: {
            IRLoadG* lg = st->Ist.LoadG.details;
            vassert(isIRAtom(lg->addr));
            vassert(isIRAtom(lg->alt));
            vassert(isIRAtom(lg->guard));
            break;
         }
         case Ist_CAS: {
            IRCAS* cas = st->Ist.CAS.details;
            vassert(isIRAtom(cas->addr));
            vassert(cas->expdHi == NULL || isIRAtom(cas->expdHi));
            vassert(isIRAtom(cas->expdLo));
            vassert(cas->dataHi == NULL || isIRAtom(cas->dataHi));
            vassert(isIRAtom(cas->dataLo));
            break;
         }
         case Ist_LLSC:
            vassert(isIRAtom(st->Ist.LLSC.addr));
            if (st->Ist.LLSC.storedata)
               vassert(isIRAtom(st->Ist.LLSC.storedata));
            break;
         case Ist_Dirty: {
            IRDirty* d = st->Ist.Dirty.details;
            if (d->mFx != Ifx_None) {
               vassert(isIRAtom(d->mAddr));
            }
            vassert(isIRAtom(d->guard));
            for (Int j = 0; d->args[j]; j++) {
               IRExpr* arg = d->args[j];
               if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg))) {
                  vassert(isIRAtom(arg));
               }
            }
            break;
         }
         case Ist_IMark:
         case Ist_NoOp:
         case Ist_MBE:
            break;
         case Ist_Exit:
            vassert(isIRAtom(st->Ist.Exit.guard));
            break;
         default:
            vex_printf("\n"); ppIRStmt(st);
            vpanic("do_XOR_TRANSFORMS_IRSB");
      }
   }

   vassert(isIRAtom(sb->next));
   return changed;
}


static IRSB* do_MSVC_HACKS ( IRSB* sb )
{
   // Normalise as much as we can.  This is the one-and-only place
   // where we call do_cse_BB with allowLoadsToBeCSEd set to True.
   Bool any_cse_changes = do_cse_BB( sb, True/*allowLoadsToBeCSEd*/ );
   if (any_cse_changes) {
      // CSEing might have created dead code.  Remove it.
      sb = cprop_BB ( sb );
      do_deadcode_BB(sb);
   }

   // Visit all atoms, do the transformation proper.  bb is modified
   // in-place.
   Bool changed = do_XOR_TRANSFORM_IRSB(sb);

   if (changed) {
      // The transformation generates non-flat expressions, so we now
      // need to re-flatten the block.
      sb = flatten_BB(sb);
   }

   return sb;
}


/*---------------------------------------------------------------*/
/*--- iropt main                                              ---*/
/*---------------------------------------------------------------*/

static Bool iropt_verbose = False; /* True; */


/* Do a simple cleanup pass on bb.  This is: redundant Get removal,
   redundant Put removal, constant propagation, dead code removal,
   clean helper specialisation, and dead code removal (again).
*/


static 
IRSB* cheap_transformations ( 
         IRSB* bb,
         IRExpr* (*specHelper) (const HChar*, IRExpr**, IRStmt**, Int),
         Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
         VexRegisterUpdates pxControl
      )
{
   redundant_get_removal_BB ( bb );
   if (iropt_verbose) {
      vex_printf("\n========= REDUNDANT GET\n\n" );
      ppIRSB(bb);
   }

   if (pxControl < VexRegUpdAllregsAtEachInsn) {
      redundant_put_removal_BB ( bb, preciseMemExnsFn, pxControl );
   }
   if (iropt_verbose) {
      vex_printf("\n========= REDUNDANT PUT\n\n" );
      ppIRSB(bb);
   }

   bb = cprop_BB ( bb );
   if (iropt_verbose) {
      vex_printf("\n========= CPROPD\n\n" );
      ppIRSB(bb);
   }

   do_deadcode_BB ( bb );
   if (iropt_verbose) {
      vex_printf("\n========= DEAD\n\n" );
      ppIRSB(bb);
   }

   bb = spec_helpers_BB ( bb, specHelper );
   do_deadcode_BB ( bb );
   if (iropt_verbose) {
      vex_printf("\n========= SPECd \n\n" );
      ppIRSB(bb);
   }

   return bb;
}


/* Do some more expensive transformations on bb, which are aimed at
   optimising as much as possible in the presence of GetI and PutI.  */

static
IRSB* expensive_transformations( IRSB* bb, VexRegisterUpdates pxControl )
{
   (void)do_cse_BB( bb, False/*!allowLoadsToBeCSEd*/ );
   collapse_AddSub_chains_BB( bb );
   do_redundant_GetI_elimination( bb );
   if (pxControl < VexRegUpdAllregsAtEachInsn) {
      do_redundant_PutI_elimination( bb, pxControl );
   }
   do_deadcode_BB( bb );
   return bb;
}


/* Scan a flattened BB to look for signs that more expensive
   optimisations might be useful:
   - find out if there are any GetIs and PutIs
   - find out if there are any floating or vector-typed temporaries
*/

static void considerExpensives ( /*OUT*/Bool* hasGetIorPutI,
                                 /*OUT*/Bool* hasVorFtemps,
                                 IRSB* bb )
{
   Int      i, j;
   IRStmt*  st;
   IRDirty* d;
   IRCAS*   cas;

   *hasGetIorPutI = False;
   *hasVorFtemps  = False;

   for (i = 0; i < bb->stmts_used; i++) {
      st = bb->stmts[i];
      switch (st->tag) {
         case Ist_AbiHint:
            vassert(isIRAtom(st->Ist.AbiHint.base));
            vassert(isIRAtom(st->Ist.AbiHint.nia));
            break;
         case Ist_PutI: 
            *hasGetIorPutI = True;
            break;
         case Ist_WrTmp:  
            if (st->Ist.WrTmp.data->tag == Iex_GetI)
               *hasGetIorPutI = True;
            switch (typeOfIRTemp(bb->tyenv, st->Ist.WrTmp.tmp)) {
               case Ity_I1: case Ity_I8: case Ity_I16: 
               case Ity_I32: case Ity_I64: case Ity_I128: 
                  break;
               case Ity_F16: case Ity_F32: case Ity_F64: case Ity_F128:
               case Ity_V128: case Ity_V256:
                  *hasVorFtemps = True;
                  break;
               case Ity_D32: case Ity_D64: case Ity_D128:
                  *hasVorFtemps = True;
                  break;
               default: 
                  goto bad;
            }
            break;
         case Ist_Put:
            vassert(isIRAtom(st->Ist.Put.data));
            break;
         case Ist_Store:
            vassert(isIRAtom(st->Ist.Store.addr));
            vassert(isIRAtom(st->Ist.Store.data));
            break;
         case Ist_StoreG: {
            IRStoreG* sg = st->Ist.StoreG.details;
            vassert(isIRAtom(sg->addr));
            vassert(isIRAtom(sg->data));
            vassert(isIRAtom(sg->guard));
            break;
         }
         case Ist_LoadG: {
            IRLoadG* lg = st->Ist.LoadG.details;
            vassert(isIRAtom(lg->addr));
            vassert(isIRAtom(lg->alt));
            vassert(isIRAtom(lg->guard));
            break;
         }
         case Ist_CAS:
            cas = st->Ist.CAS.details;
            vassert(isIRAtom(cas->addr));
            vassert(cas->expdHi == NULL || isIRAtom(cas->expdHi));
            vassert(isIRAtom(cas->expdLo));
            vassert(cas->dataHi == NULL || isIRAtom(cas->dataHi));
            vassert(isIRAtom(cas->dataLo));
            break;
         case Ist_LLSC:
            vassert(isIRAtom(st->Ist.LLSC.addr));
            if (st->Ist.LLSC.storedata)
               vassert(isIRAtom(st->Ist.LLSC.storedata));
            break;
         case Ist_Dirty:
            d = st->Ist.Dirty.details;
            vassert(isIRAtom(d->guard));
            for (j = 0; d->args[j]; j++) {
               IRExpr* arg = d->args[j];
               if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(arg)))
                  vassert(isIRAtom(arg));
            }
            if (d->mFx != Ifx_None)
               vassert(isIRAtom(d->mAddr));
            break;
         case Ist_NoOp:
         case Ist_IMark:
         case Ist_MBE:
            break;
         case Ist_Exit:
            vassert(isIRAtom(st->Ist.Exit.guard));
            break;
         default: 
         bad:
            ppIRStmt(st);
            vpanic("considerExpensives");
      }
   }
}


/* ---------------- The main iropt entry point. ---------------- */

/* exported from this file */
/* Rules of the game:

   - IRExpr/IRStmt trees should be treated as immutable, as they
     may get shared.  So never change a field of such a tree node;
     instead construct and return a new one if needed.
*/


IRSB* do_iropt_BB(
         IRSB* bb0,
         IRExpr* (*specHelper) (const HChar*, IRExpr**, IRStmt**, Int),
         Bool (*preciseMemExnsFn)(Int,Int,VexRegisterUpdates),
         VexRegisterUpdates pxControl,
         Addr    guest_addr,
         VexArch guest_arch
      )
{
   static Int n_total     = 0;
   static Int n_expensive = 0;

   Bool hasGetIorPutI, hasVorFtemps;
   IRSB *bb, *bb2;

   n_total++;

   /* First flatten the block out, since all other
      phases assume flat code. */

   bb = flatten_BB ( bb0 );

   if (iropt_verbose) {
      vex_printf("\n========= FLAT\n\n" );
      ppIRSB(bb);
   }

   /* If at level 0, stop now. */
   if (vex_control.iropt_level <= 0) return bb;

   /* Now do a preliminary cleanup pass, and figure out if we also
      need to do 'expensive' optimisations.  Expensive optimisations
      are deemed necessary if the block contains any GetIs or PutIs.
      If needed, do expensive transformations and then another cheap
      cleanup pass. */

   bb = cheap_transformations( bb, specHelper, preciseMemExnsFn, pxControl );

   if (guest_arch == VexArchARM) {
      /* Translating Thumb2 code produces a lot of chaff.  We have to
         work extra hard to get rid of it. */
      bb = cprop_BB(bb);
      bb = spec_helpers_BB ( bb, specHelper );
      if (pxControl < VexRegUpdAllregsAtEachInsn) {
         redundant_put_removal_BB ( bb, preciseMemExnsFn, pxControl );
      }
      do_cse_BB( bb, False/*!allowLoadsToBeCSEd*/ );
      do_deadcode_BB( bb );
   }

   if (vex_control.iropt_level > 1) {

      /* Peer at what we have, to decide how much more effort to throw
         at it. */
      considerExpensives( &hasGetIorPutI, &hasVorFtemps, bb );

      if (hasVorFtemps && !hasGetIorPutI) {
         /* If any evidence of FP or Vector activity, CSE, as that
            tends to mop up all manner of lardy code to do with
            rounding modes.  Don't bother if hasGetIorPutI since that
            case leads into the expensive transformations, which do
            CSE anyway. */
         (void)do_cse_BB( bb, False/*!allowLoadsToBeCSEd*/ );
         do_deadcode_BB( bb );
      }

      if (hasGetIorPutI) {
         Bool cses;
         n_expensive++;
         if (DEBUG_IROPT)
            vex_printf("***** EXPENSIVE %d %d\n", n_total, n_expensive);
         bb = expensive_transformations( bb, pxControl );
         bb = cheap_transformations( bb, specHelper,
                                     preciseMemExnsFn, pxControl );
         /* Potentially common up GetIs */
         cses = do_cse_BB( bb, False/*!allowLoadsToBeCSEd*/ );
         if (cses)
            bb = cheap_transformations( bb, specHelper,
                                        preciseMemExnsFn, pxControl );
      }

      ///////////////////////////////////////////////////////////
      // BEGIN MSVC optimised code transformation hacks
      if (0)
         bb = do_MSVC_HACKS(bb);
      // END   MSVC optimised code transformation hacks
      ///////////////////////////////////////////////////////////

      /* Now have a go at unrolling simple (single-BB) loops.  If
         successful, clean up the results as much as possible. */

      bb2 = maybe_loop_unroll_BB( bb, guest_addr );
      if (bb2) {
         bb = cheap_transformations( bb2, specHelper,
                                     preciseMemExnsFn, pxControl );
         if (hasGetIorPutI) {
            bb = expensive_transformations( bb, pxControl );
            bb = cheap_transformations( bb, specHelper,
                                        preciseMemExnsFn, pxControl );
         } else {
            /* at least do CSE and dead code removal */
            do_cse_BB( bb, False/*!allowLoadsToBeCSEd*/ );
            do_deadcode_BB( bb );
         }
         if (0) vex_printf("vex iropt: unrolled a loop\n");
      }

   }

   return bb;
}


/*---------------------------------------------------------------*/
/*--- end                                            ir_opt.c ---*/
/*---------------------------------------------------------------*/