libhb_core.c - Android社区 - https://www.androidos.net.cn/

/*--------------------------------------------------------------------*/
/*--- LibHB: a library for implementing and checking               ---*/
/*--- the happens-before relationship in concurrent programs.      ---*/
/*---                                                 libhb_main.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of LibHB, a library for implementing and checking
   the happens-before relationship in concurrent programs.

This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

The GNU General Public License is contained in the file COPYING.
*/

#include "pub_tool_basics.h"
#include "pub_tool_libcassert.h"
#include "pub_tool_libcbase.h"
#include "pub_tool_libcprint.h"
#include "pub_tool_mallocfree.h"
#include "pub_tool_wordfm.h"
#include "pub_tool_sparsewa.h"
#include "pub_tool_xarray.h"
#include "pub_tool_oset.h"
#include "pub_tool_threadstate.h"
#include "pub_tool_aspacemgr.h"
#include "pub_tool_execontext.h"
#include "pub_tool_errormgr.h"
#include "pub_tool_options.h"        // VG_(clo_stats)
#include "hg_basics.h"
#include "hg_wordset.h"
#include "hg_lock_n_thread.h"
#include "hg_errors.h"

#include "libhb.h"

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// Debugging #defines                                          //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* Check the sanity of shadow values in the core memory state
   machine.  Change #if 0 to #if 1 to enable this. */
#if 0
#  define CHECK_MSM 1
#else
#  define CHECK_MSM 0
#endif

/* Check sanity (reference counts, etc) in the conflicting access
   machinery.  Change #if 0 to #if 1 to enable this. */
#if 0
#  define CHECK_CEM 1
#else
#  define CHECK_CEM 0
#endif

/* Check sanity in the compressed shadow memory machinery,
   particularly in its caching innards.  Unfortunately there's no
   almost-zero-cost way to make them selectable at run time.  Hence
   set the #if 0 to #if 1 and rebuild if you want them. */
#if 0
#  define CHECK_ZSM 1  /* do sanity-check CacheLine stuff */
#  define inline __attribute__((noinline))
   /* probably want to ditch -fomit-frame-pointer too */
#else
#  define CHECK_ZSM 0   /* don't sanity-check CacheLine stuff */
#endif

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: VtsID                                           //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* VtsIDs: Unique small-integer IDs for VTSs.  VtsIDs can't exceed 30
   bits, since they have to be packed into the lowest 30 bits of an
   SVal. */
typedef  UInt  VtsID;
#define VtsID_INVALID 0xFFFFFFFF

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: SVal                                            //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

typedef  ULong  SVal;

/* This value has special significance to the implementation, and callers
   may not store it in the shadow memory. */
#define SVal_INVALID (3ULL << 62)

/* This is the default value for shadow memory.  Initially the shadow
   memory contains no accessible areas and so all reads produce this
   value.  TODO: make this caller-defineable. */
#define SVal_NOACCESS (2ULL << 62)

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: ScalarTS                                        //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* Scalar Timestamp.  We have to store a lot of these, so there is
   some effort to make them as small as possible.  Logically they are
   a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
   We pack it into 64 bits by representing the Thr* using a ThrID, a
   small integer (18 bits), and a 46 bit integer for the timestamp
   number.  The 46/18 split is arbitary, but has the effect that
   Helgrind can only handle programs that create 2^18 or fewer threads
   over their entire lifetime, and have no more than 2^46 timestamp
   ticks (synchronisation operations on the same thread).

This doesn't seem like much of a limitation.  2^46 ticks is
   7.06e+13, and if each tick (optimistically) takes the machine 1000
   cycles to process, then the minimum time to process that many ticks
   at a clock rate of 5 GHz is 162.9 days.  And that's doing nothing
   but VTS ticks, which isn't realistic.

NB1: SCALARTS_N_THRBITS must be 29 or lower.  The obvious limit is
   32 since a ThrID is a UInt.  29 comes from the fact that
   'Thr_n_RCEC', which records information about old accesses, packs
   not only a ThrID but also 2+1 other bits (access size and
   writeness) in a UInt, hence limiting size to 32-(2+1) == 29.

NB2: thrid values are issued upwards from 1024, and values less
   than that aren't valid.  This isn't per se necessary (any order
   will do, so long as they are unique), but it does help ensure they
   are less likely to get confused with the various other kinds of
   small-integer thread ids drifting around (eg, TId).  See also NB5.

NB3: this probably also relies on the fact that Thr's are never
   deallocated -- they exist forever.  Hence the 1-1 mapping from
   Thr's to thrid values (set up in Thr__new) persists forever.

NB4: temp_max_sized_VTS is allocated at startup and never freed.
   It is a maximum sized VTS, so has (1 << SCALARTS_N_TYMBITS)
   ScalarTSs.  So we can't make SCALARTS_N_THRBITS too large without
   making the memory use for this go sky-high.  With
   SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
   like an OK tradeoff.  If more than 256k threads need to be
   supported, we could change SCALARTS_N_THRBITS to 20, which would
   facilitate supporting 1 million threads at the cost of 8MB storage
   for temp_max_sized_VTS.

NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
   ThrID == 0 to denote an empty Thr_n_RCEC record.  So ThrID == 0
   must never be a valid ThrID.  Given NB2 that's OK.
*/
#define SCALARTS_N_THRBITS 18  /* valid range: 11 to 29 inclusive */

#define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
typedef
   struct {
      ThrID thrid : SCALARTS_N_THRBITS;
      ULong tym   : SCALARTS_N_TYMBITS;
   }
   ScalarTS;

#define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: Filter                                          //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// baseline: 5, 9
#define FI_LINE_SZB_LOG2  5
#define FI_NUM_LINES_LOG2 10

#define FI_LINE_SZB       (1 << FI_LINE_SZB_LOG2)
#define FI_NUM_LINES      (1 << FI_NUM_LINES_LOG2)

#define FI_TAG_MASK        (~(Addr)(FI_LINE_SZB - 1))
#define FI_GET_TAG(_a)     ((_a) & FI_TAG_MASK)

#define FI_GET_LINENO(_a)  ( ((_a) >> FI_LINE_SZB_LOG2) \
                             & (Addr)(FI_NUM_LINES-1) )

/* In the lines, each 8 bytes are treated individually, and are mapped
   to a UShort.  Regardless of endianness of the underlying machine,
   bits 1 and 0 pertain to the lowest address and bits 15 and 14 to
   the highest address.

Of each bit pair, the higher numbered bit is set if a R has been
   seen, so the actual layout is:

15 14             ...  01 00

R  W  for addr+7  ...  R  W  for addr+0

So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
*/

/* tags are separated from lines.  tags are Addrs and are
   the base address of the line. */
typedef
   struct {
      UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
   }
   FiLine;

typedef
   struct {
      Addr   tags[FI_NUM_LINES];
      FiLine lines[FI_NUM_LINES];
   }
   Filter;

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: Thr, ULong_n_EC                                 //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// Records stacks for H1 history mechanism (DRD-style)
typedef
   struct { ULong ull; ExeContext* ec; }
   ULong_n_EC;

/* How many of the above records to collect for each thread?  Older
   ones are dumped when we run out of space.  62.5k requires 1MB per
   thread, since each ULong_n_EC record is 16 bytes long.  When more
   than N_KWs_N_STACKs_PER_THREAD are present, the older half are
   deleted to make space.  Hence in the worst case we will be able to
   produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
   Kw transitions (segments in this thread).  For the current setting
   that gives a guaranteed stack for at least the last 31.25k
   segments. */
#define N_KWs_N_STACKs_PER_THREAD 62500

struct _Thr {
   /* Current VTSs for this thread.  They change as we go along.  viR
      is the VTS to be used for reads, viW for writes.  Usually they
      are the same, but can differ when we deal with reader-writer
      locks.  It is always the case that 
         VtsID__cmpLEQ(viW,viR) == True
      that is, viW must be the same, or lagging behind, viR. */
   VtsID viR;
   VtsID viW;

/* Is initially False, and is set to True after the thread really
      has done a low-level exit.  When True, we expect to never see
      any more memory references done by this thread. */
   Bool llexit_done;

/* Is initially False, and is set to True after the thread has been
      joined with (reaped by some other thread).  After this point, we
      do not expect to see any uses of .viR or .viW, so it is safe to
      set them to VtsID_INVALID. */
   Bool joinedwith_done;

/* A small integer giving a unique identity to this Thr.  See
      comments on the definition of ScalarTS for details. */
   ThrID thrid : SCALARTS_N_THRBITS;

/* A filter that removes references for which we believe that
      msmcread/msmcwrite will not change the state, nor report a
      race. */
   Filter* filter;

/* A pointer back to the top level Thread structure.  There is a
      1-1 mapping between Thread and Thr structures -- each Thr points
      at its corresponding Thread, and vice versa.  Really, Thr and
      Thread should be merged into a single structure. */
   Thread* hgthread;

/* The ULongs (scalar Kws) in this accumulate in strictly
      increasing order, without duplicates.  This is important because
      we need to be able to find a given scalar Kw in this array
      later, by binary search. */
   XArray* /* ULong_n_EC */ local_Kws_n_stacks;
};

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: SO                                              //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// (UInt) `echo "Synchronisation object" | md5sum`
#define SO_MAGIC 0x56b3c5b0U

struct _SO {
   struct _SO* admin_prev;
   struct _SO* admin_next;
   VtsID viR; /* r-clock of sender */
   VtsID viW; /* w-clock of sender */
   UInt  magic;
};

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// Forward declarations                                        //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* fwds for
   Globals needed by other parts of the library.  These are set
   once at startup and then never changed. */
static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
static ExeContext* (*main_get_EC)( Thr* ) = NULL;

/* misc fn and data fwdses */
static void VtsID__rcinc ( VtsID ii );
static void VtsID__rcdec ( VtsID ii );

static inline Bool SVal__isC ( SVal s );
static inline VtsID SVal__unC_Rmin ( SVal s );
static inline VtsID SVal__unC_Wmin ( SVal s );
static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );

/* A double linked list of all the SO's. */
SO* admin_SO;

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION BEGIN compressed shadow memory                      //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

#ifndef __HB_ZSM_H
#define __HB_ZSM_H

/* Initialise the library.  Once initialised, it will (or may) call
   rcinc and rcdec in response to all the calls below, in order to
   allow the user to do reference counting on the SVals stored herein.
   It is important to understand, however, that due to internal
   caching, the reference counts are in general inaccurate, and can be
   both above or below the true reference count for an item.  In
   particular, the library may indicate that the reference count for
   an item is zero, when in fact it is not.

To make the reference counting exact and therefore non-pointless,
   call zsm_flush_cache.  Immediately after it returns, the reference
   counts for all items, as deduced by the caller by observing calls
   to rcinc and rcdec, will be correct, and so any items with a zero
   reference count may be freed (or at least considered to be
   unreferenced by this library).
*/
static void zsm_init ( void(*rcinc)(SVal), void(*rcdec)(SVal) );

static void zsm_sset_range  ( Addr, SizeT, SVal );
static void zsm_scopy_range ( Addr, Addr, SizeT );
static void zsm_flush_cache ( void );

#endif /* ! __HB_ZSM_H */

/* Round a up to the next multiple of N.  N must be a power of 2 */
#define ROUNDUP(a, N)   ((a + N - 1) & ~(N-1))
/* Round a down to the next multiple of N.  N must be a power of 2 */
#define ROUNDDN(a, N)   ((a) & ~(N-1))

/* ------ User-supplied RC functions ------ */
static void(*rcinc)(SVal) = NULL;
static void(*rcdec)(SVal) = NULL;

/* ------ CacheLine ------ */

#define N_LINE_BITS      6 /* must be >= 3 */
#define N_LINE_ARANGE    (1 << N_LINE_BITS)
#define N_LINE_TREES     (N_LINE_ARANGE >> 3)

typedef
   struct {
      UShort descrs[N_LINE_TREES];
      SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
   }
   CacheLine;

#define TREE_DESCR_16_0 (1<<0)
#define TREE_DESCR_32_0 (1<<1)
#define TREE_DESCR_16_1 (1<<2)
#define TREE_DESCR_64   (1<<3)
#define TREE_DESCR_16_2 (1<<4)
#define TREE_DESCR_32_1 (1<<5)
#define TREE_DESCR_16_3 (1<<6)
#define TREE_DESCR_8_0  (1<<7)
#define TREE_DESCR_8_1  (1<<8)
#define TREE_DESCR_8_2  (1<<9)
#define TREE_DESCR_8_3  (1<<10)
#define TREE_DESCR_8_4  (1<<11)
#define TREE_DESCR_8_5  (1<<12)
#define TREE_DESCR_8_6  (1<<13)
#define TREE_DESCR_8_7  (1<<14)
#define TREE_DESCR_DTY  (1<<15)

typedef
   struct {
      SVal  dict[4]; /* can represent up to 4 diff values in the line */
      UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
                                      dict indexes */
      /* if dict[0] == SVal_INVALID then dict[1] is the index of the
         LineF to use, and dict[2..] are also SVal_INVALID. */
   }
   LineZ; /* compressed rep for a cache line */

typedef
   struct {
      Bool inUse;
      SVal w64s[N_LINE_ARANGE];
   }
   LineF; /* full rep for a cache line */

/* Shadow memory.
   Primary map is a WordFM Addr SecMap*.  
   SecMaps cover some page-size-ish section of address space and hold
     a compressed representation.
   CacheLine-sized chunks of SecMaps are copied into a Cache, being
   decompressed when moved into the cache and recompressed on the
   way out.  Because of this, the cache must operate as a writeback
   cache, not a writethrough one.

Each SecMap must hold a power-of-2 number of CacheLines.  Hence
   N_SECMAP_BITS must >= N_LINE_BITS.
*/
#define N_SECMAP_BITS   13
#define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)

// # CacheLines held by a SecMap
#define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)

/* The data in the SecMap is held in the array of LineZs.  Each LineZ
   either carries the required data directly, in a compressed
   representation, or it holds (in .dict[0]) an index to the LineF in
   .linesF that holds the full representation.

Currently-unused LineF's have their .inUse bit set to zero.
   Since each in-use LineF is referred to be exactly one LineZ,
   the number of .linesZ[] that refer to .linesF should equal
   the number of .linesF[] that have .inUse == True.

RC obligations: the RCs presented to the user include exactly
   the values in:
   * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
   * F reps that are in use (.inUse == True)

Hence the following actions at the following transitions are required:

F rep: .inUse==True  -> .inUse==False        -- rcdec_LineF
   F rep: .inUse==False -> .inUse==True         -- rcinc_LineF
   Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
   Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
*/
typedef
   struct {
      UInt   magic;
      LineZ  linesZ[N_SECMAP_ZLINES];
      LineF* linesF;
      UInt   linesF_size;
   }
   SecMap;

#define SecMap_MAGIC   0x571e58cbU

static inline Bool is_sane_SecMap ( SecMap* sm ) {
   return sm != NULL && sm->magic == SecMap_MAGIC;
}

/* ------ Cache ------ */

#define N_WAY_BITS 16
#define N_WAY_NENT (1 << N_WAY_BITS)

/* Each tag is the address of the associated CacheLine, rounded down
   to a CacheLine address boundary.  A CacheLine size must be a power
   of 2 and must be 8 or more.  Hence an easy way to initialise the
   cache so it is empty is to set all the tag values to any value % 8
   != 0, eg 1.  This means all queries in the cache initially miss.
   It does however require us to detect and not writeback, any line
   with a bogus tag. */
typedef
   struct {
      CacheLine lyns0[N_WAY_NENT];
      Addr      tags0[N_WAY_NENT];
   }
   Cache;

static inline Bool is_valid_scache_tag ( Addr tag ) {
   /* a valid tag should be naturally aligned to the start of
      a CacheLine. */
   return 0 == (tag & (N_LINE_ARANGE - 1));
}

/* --------- Primary data structures --------- */

/* Shadow memory primary map */
static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
static Cache   cache_shmem;

static UWord stats__secmaps_search       = 0; // # SM finds
static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
static UWord stats__secmap_linesF_allocd = 0; // # LineF's issued
static UWord stats__secmap_linesF_bytes  = 0; //  .. using this much storage
static UWord stats__secmap_iterator_steppings = 0; // # calls to stepSMIter
static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
static UWord stats__cache_F_fetches      = 0; // # F lines fetched
static UWord stats__cache_F_wbacks       = 0; // # F lines written back
static UWord stats__cache_invals         = 0; // # cache invals
static UWord stats__cache_flushes        = 0; // # cache flushes
static UWord stats__cache_totrefs        = 0; // # total accesses
static UWord stats__cache_totmisses      = 0; // # misses
static ULong stats__cache_make_New_arange = 0; // total arange made New
static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
static UWord stats__cline_cread64s       = 0; // # calls to s_m_read64
static UWord stats__cline_cread32s       = 0; // # calls to s_m_read32
static UWord stats__cline_cread16s       = 0; // # calls to s_m_read16
static UWord stats__cline_cread08s       = 0; // # calls to s_m_read8
static UWord stats__cline_cwrite64s      = 0; // # calls to s_m_write64
static UWord stats__cline_cwrite32s      = 0; // # calls to s_m_write32
static UWord stats__cline_cwrite16s      = 0; // # calls to s_m_write16
static UWord stats__cline_cwrite08s      = 0; // # calls to s_m_write8
static UWord stats__cline_sread08s       = 0; // # calls to s_m_set8
static UWord stats__cline_swrite08s      = 0; // # calls to s_m_get8
static UWord stats__cline_swrite16s      = 0; // # calls to s_m_get8
static UWord stats__cline_swrite32s      = 0; // # calls to s_m_get8
static UWord stats__cline_swrite64s      = 0; // # calls to s_m_get8
static UWord stats__cline_scopy08s       = 0; // # calls to s_m_copy8
static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
static UWord stats__vts__tick            = 0; // # calls to VTS__tick
static UWord stats__vts__join            = 0; // # calls to VTS__join
static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural

// # calls to VTS__cmp_structural w/ slow case
static UWord stats__vts__cmp_structural_slow = 0;

// # calls to VTS__indexAt_SLOW
static UWord stats__vts__indexat_slow = 0;

// # calls to vts_set__find__or__clone_and_add
static UWord stats__vts_set__focaa    = 0;

// # calls to vts_set__find__or__clone_and_add that lead to an
// allocation
static UWord stats__vts_set__focaa_a  = 0;

static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
   return a & ~(N_SECMAP_ARANGE - 1);
}
static inline UWord shmem__get_SecMap_offset ( Addr a ) {
   return a & (N_SECMAP_ARANGE - 1);
}

/*----------------------------------------------------------------*/
/*--- map_shmem :: WordFM Addr SecMap                          ---*/
/*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
/*----------------------------------------------------------------*/

/*--------------- SecMap allocation --------------- */

static HChar* shmem__bigchunk_next = NULL;
static HChar* shmem__bigchunk_end1 = NULL;

static void* shmem__bigchunk_alloc ( SizeT n )
{
   const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
   tl_assert(n > 0);
   n = VG_ROUNDUP(n, 16);
   tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
   tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
             <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
   if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
      if (0)
      VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
                  (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
      shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
      if (shmem__bigchunk_next == NULL)
         VG_(out_of_memory_NORETURN)(
            "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
      shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
   }
   tl_assert(shmem__bigchunk_next);
   tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
   tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
   shmem__bigchunk_next += n;
   return shmem__bigchunk_next - n;
}

static SecMap* shmem__alloc_SecMap ( void )
{
   Word    i, j;
   SecMap* sm = shmem__bigchunk_alloc( sizeof(SecMap) );
   if (0) VG_(printf)("alloc_SecMap %p\n",sm);
   tl_assert(sm);
   sm->magic = SecMap_MAGIC;
   for (i = 0; i < N_SECMAP_ZLINES; i++) {
      sm->linesZ[i].dict[0] = SVal_NOACCESS;
      sm->linesZ[i].dict[1] = SVal_INVALID;
      sm->linesZ[i].dict[2] = SVal_INVALID;
      sm->linesZ[i].dict[3] = SVal_INVALID;
      for (j = 0; j < N_LINE_ARANGE/4; j++)
         sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
   }
   sm->linesF      = NULL;
   sm->linesF_size = 0;
   stats__secmaps_allocd++;
   stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
   stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
   stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
   return sm;
}

typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };

static SecMap* shmem__find_SecMap ( Addr ga ) 
{
   SecMap* sm    = NULL;
   Addr    gaKey = shmem__round_to_SecMap_base(ga);
   // Cache
   stats__secmaps_search++;
   if (LIKELY(gaKey == smCache[0].gaKey))
      return smCache[0].sm;
   if (LIKELY(gaKey == smCache[1].gaKey)) {
      SMCacheEnt tmp = smCache[0];
      smCache[0] = smCache[1];
      smCache[1] = tmp;
      return smCache[0].sm;
   }
   if (gaKey == smCache[2].gaKey) {
      SMCacheEnt tmp = smCache[1];
      smCache[1] = smCache[2];
      smCache[2] = tmp;
      return smCache[1].sm;
   }
   // end Cache
   stats__secmaps_search_slow++;
   if (VG_(lookupFM)( map_shmem,
                      NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
      tl_assert(sm != NULL);
      smCache[2] = smCache[1];
      smCache[1] = smCache[0];
      smCache[0].gaKey = gaKey;
      smCache[0].sm    = sm;
   } else {
      tl_assert(sm == NULL);
   }
   return sm;
}

static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
{
   SecMap* sm = shmem__find_SecMap ( ga );
   if (LIKELY(sm)) {
      return sm;
   } else {
      /* create a new one */
      Addr gaKey = shmem__round_to_SecMap_base(ga);
      sm = shmem__alloc_SecMap();
      tl_assert(sm);
      VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
      return sm;
   }
}

/* ------------ LineF and LineZ related ------------ */

static void rcinc_LineF ( LineF* lineF ) {
   UWord i;
   tl_assert(lineF->inUse);
   for (i = 0; i < N_LINE_ARANGE; i++)
      rcinc(lineF->w64s[i]);
}

static void rcdec_LineF ( LineF* lineF ) {
   UWord i;
   tl_assert(lineF->inUse);
   for (i = 0; i < N_LINE_ARANGE; i++)
      rcdec(lineF->w64s[i]);
}

static void rcinc_LineZ ( LineZ* lineZ ) {
   tl_assert(lineZ->dict[0] != SVal_INVALID);
   rcinc(lineZ->dict[0]);
   if (lineZ->dict[1] != SVal_INVALID) rcinc(lineZ->dict[1]);
   if (lineZ->dict[2] != SVal_INVALID) rcinc(lineZ->dict[2]);
   if (lineZ->dict[3] != SVal_INVALID) rcinc(lineZ->dict[3]);
}

static void rcdec_LineZ ( LineZ* lineZ ) {
   tl_assert(lineZ->dict[0] != SVal_INVALID);
   rcdec(lineZ->dict[0]);
   if (lineZ->dict[1] != SVal_INVALID) rcdec(lineZ->dict[1]);
   if (lineZ->dict[2] != SVal_INVALID) rcdec(lineZ->dict[2]);
   if (lineZ->dict[3] != SVal_INVALID) rcdec(lineZ->dict[3]);
}

inline
static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
   Word bix, shft, mask, prep;
   tl_assert(ix >= 0);
   bix  = ix >> 2;
   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
   mask = 3 << shft;
   prep = b2 << shft;
   arr[bix] = (arr[bix] & ~mask) | prep;
}

inline
static UWord read_twobit_array ( UChar* arr, UWord ix ) {
   Word bix, shft;
   tl_assert(ix >= 0);
   bix  = ix >> 2;
   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
   return (arr[bix] >> shft) & 3;
}

/* Given address 'tag', find either the Z or F line containing relevant
   data, so it can be read into the cache.
*/
static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
                                  /*OUT*/LineF** fp, Addr tag ) {
   LineZ* lineZ;
   LineF* lineF;
   UWord   zix;
   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
   UWord   smoff = shmem__get_SecMap_offset(tag);
   /* since smoff is derived from a valid tag, it should be
      cacheline-aligned. */
   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
   zix = smoff >> N_LINE_BITS;
   tl_assert(zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];
   lineF = NULL;
   if (lineZ->dict[0] == SVal_INVALID) {
      UInt fix = (UInt)lineZ->dict[1];
      tl_assert(sm->linesF);
      tl_assert(sm->linesF_size > 0);
      tl_assert(fix >= 0 && fix < sm->linesF_size);
      lineF = &sm->linesF[fix];
      tl_assert(lineF->inUse);
      lineZ = NULL;
   }
   *zp = lineZ;
   *fp = lineF;
}

/* Given address 'tag', return the relevant SecMap and the index of
   the LineZ within it, in the expectation that the line is to be
   overwritten.  Regardless of whether 'tag' is currently associated
   with a Z or F representation, to rcdec on the current
   representation, in recognition of the fact that the contents are
   just about to be overwritten. */
static __attribute__((noinline))
void find_Z_for_writing ( /*OUT*/SecMap** smp,
                          /*OUT*/Word* zixp,
                          Addr tag ) {
   LineZ* lineZ;
   LineF* lineF;
   UWord   zix;
   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
   UWord   smoff = shmem__get_SecMap_offset(tag);
   /* since smoff is derived from a valid tag, it should be
      cacheline-aligned. */
   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
   zix = smoff >> N_LINE_BITS;
   tl_assert(zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];
   lineF = NULL;
   /* re RCs, we are freeing up this LineZ/LineF so that new data can
      be parked in it.  Hence have to rcdec it accordingly. */
   /* If lineZ has an associated lineF, free it up. */
   if (lineZ->dict[0] == SVal_INVALID) {
      UInt fix = (UInt)lineZ->dict[1];
      tl_assert(sm->linesF);
      tl_assert(sm->linesF_size > 0);
      tl_assert(fix >= 0 && fix < sm->linesF_size);
      lineF = &sm->linesF[fix];
      tl_assert(lineF->inUse);
      rcdec_LineF(lineF);
      lineF->inUse = False;
   } else {
      rcdec_LineZ(lineZ);
   }
   *smp  = sm;
   *zixp = zix;
}

static __attribute__((noinline))
void alloc_F_for_writing ( /*MOD*/SecMap* sm, /*OUT*/Word* fixp ) {
   UInt        i, new_size;
   LineF* nyu;

if (sm->linesF) {
      tl_assert(sm->linesF_size > 0);
   } else {
      tl_assert(sm->linesF_size == 0);
   }

if (sm->linesF) {
      for (i = 0; i < sm->linesF_size; i++) {
         if (!sm->linesF[i].inUse) {
            *fixp = (Word)i;
            return;
         }
      }
   }

/* No free F line found.  Expand existing array and try again. */
   new_size = sm->linesF_size==0 ? 1 : 2 * sm->linesF_size;
   nyu      = HG_(zalloc)( "libhb.aFfw.1 (LineF storage)",
                           new_size * sizeof(LineF) );
   tl_assert(nyu);

stats__secmap_linesF_allocd += (new_size - sm->linesF_size);
   stats__secmap_linesF_bytes  += (new_size - sm->linesF_size)
                                  * sizeof(LineF);

if (0)
   VG_(printf)("SM %p: expand F array from %d to %d\n", 
               sm, (Int)sm->linesF_size, new_size);

for (i = 0; i < new_size; i++)
      nyu[i].inUse = False;

if (sm->linesF) {
      for (i = 0; i < sm->linesF_size; i++) {
         tl_assert(sm->linesF[i].inUse);
         nyu[i] = sm->linesF[i];
      }
      VG_(memset)(sm->linesF, 0, sm->linesF_size * sizeof(LineF) );
      HG_(free)(sm->linesF);
   }

sm->linesF      = nyu;
   sm->linesF_size = new_size;

for (i = 0; i < sm->linesF_size; i++) {
      if (!sm->linesF[i].inUse) {
         *fixp = (Word)i;
         return;
      }
    }

/*NOTREACHED*/
    tl_assert(0);
}

/* ------------ CacheLine and implicit-tree related ------------ */

__attribute__((unused))
static void pp_CacheLine ( CacheLine* cl ) {
   Word i;
   if (!cl) {
      VG_(printf)("%s","pp_CacheLine(NULL)\n");
      return;
   }
   for (i = 0; i < N_LINE_TREES; i++) 
      VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
   for (i = 0; i < N_LINE_ARANGE; i++) 
      VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
}

static UChar descr_to_validbits ( UShort descr )
{
   /* a.k.a Party Time for gcc's constant folder */
#  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
                b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
             ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
                          ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
                          ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
                          ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
                          ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
                          ( (b16_2) << 4)  | ( (b64)   << 3)  | \
                          ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
                          ( (b16_0) << 0) ) )

#  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
             ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
                         ( (bit5) << 5) | ( (bit4) << 4) | \
                         ( (bit3) << 3) | ( (bit2) << 2) | \
                         ( (bit1) << 1) | ( (bit0) << 0) ) )

/* these should all get folded out at compile time */
   tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
   tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);

switch (descr) {
   /*
              +--------------------------------- TREE_DESCR_8_7
              |             +------------------- TREE_DESCR_8_0
              |             |  +---------------- TREE_DESCR_16_3
              |             |  | +-------------- TREE_DESCR_32_1
              |             |  | | +------------ TREE_DESCR_16_2
              |             |  | | |  +--------- TREE_DESCR_64
              |             |  | | |  |  +------ TREE_DESCR_16_1
              |             |  | | |  |  | +---- TREE_DESCR_32_0
              |             |  | | |  |  | | +-- TREE_DESCR_16_0
              |             |  | | |  |  | | |
              |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
   case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
                                                 return BYTE(1,1,1,1,1,1,1,1);
   case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
                                                 return BYTE(1,1,0,1,1,1,1,1);
   case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */ 
                                                 return BYTE(0,1,1,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
                                                 return BYTE(0,1,0,1,1,1,1,1);

case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */ 
                                                 return BYTE(1,1,1,1,1,1,0,1);
   case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
                                                 return BYTE(1,1,0,1,1,1,0,1);
   case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
                                                 return BYTE(0,1,1,1,1,1,0,1);
   case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
                                                 return BYTE(0,1,0,1,1,1,0,1);

case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
                                                 return BYTE(1,1,1,1,0,1,1,1);
   case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
                                                 return BYTE(1,1,0,1,0,1,1,1);
   case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
                                                 return BYTE(0,1,1,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
                                                 return BYTE(0,1,0,1,0,1,1,1);

case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
                                                 return BYTE(1,1,1,1,0,1,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
                                                 return BYTE(1,1,0,1,0,1,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
                                                 return BYTE(0,1,1,1,0,1,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
                                                 return BYTE(0,1,0,1,0,1,0,1);

case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
                                                 return BYTE(0,0,0,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
                                                 return BYTE(0,0,0,1,1,1,0,1);
   case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
                                                 return BYTE(0,0,0,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
                                                 return BYTE(0,0,0,1,0,1,0,1);

case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
                                                 return BYTE(1,1,1,1,0,0,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
                                                 return BYTE(1,1,0,1,0,0,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
                                                 return BYTE(0,1,1,1,0,0,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
                                                 return BYTE(0,1,0,1,0,0,0,1);

case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
                                                 return BYTE(0,0,0,1,0,0,0,1);

case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
                                                 return BYTE(0,0,0,0,0,0,0,1);

default: return BYTE(0,0,0,0,0,0,0,0); 
                   /* INVALID - any valid descr produces at least one
                      valid bit in tree[0..7]*/
   }
   /* NOTREACHED*/
   tl_assert(0);

#  undef DESCR
#  undef BYTE
}

__attribute__((unused))
static Bool is_sane_Descr ( UShort descr ) {
   return descr_to_validbits(descr) != 0;
}

static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
   VG_(sprintf)(dst, 
                "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
                (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
                (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
                (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
   );
}
static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
   VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
                     (Int)((byte & 128) ? 1 : 0),
                     (Int)((byte &  64) ? 1 : 0),
                     (Int)((byte &  32) ? 1 : 0),
                     (Int)((byte &  16) ? 1 : 0),
                     (Int)((byte &   8) ? 1 : 0),
                     (Int)((byte &   4) ? 1 : 0),
                     (Int)((byte &   2) ? 1 : 0),
                     (Int)((byte &   1) ? 1 : 0)
   );
}

static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
   Word  i;
   UChar validbits = descr_to_validbits(descr);
   HChar buf[128], buf2[128];
   if (validbits == 0)
      goto bad;
   for (i = 0; i < 8; i++) {
      if (validbits & (1<<i)) {
         if (tree[i] == SVal_INVALID)
            goto bad;
      } else {
         if (tree[i] != SVal_INVALID)
            goto bad;
      }
   }
   return True;
  bad:
   sprintf_Descr( buf, descr );
   sprintf_Byte( buf2, validbits );
   VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
   VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
   VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
   for (i = 0; i < 8; i++)
      VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
   VG_(printf)("%s","}\n");
   return 0;
}

static Bool is_sane_CacheLine ( CacheLine* cl )
{
   Word tno, cloff;

if (!cl) goto bad;

for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      UShort descr = cl->descrs[tno];
      SVal*  tree  = &cl->svals[cloff];
      if (!is_sane_Descr_and_Tree(descr, tree))
         goto bad;
   }
   tl_assert(cloff == N_LINE_ARANGE);
   return True;
  bad:
   pp_CacheLine(cl);
   return False;
}

static UShort normalise_tree ( /*MOD*/SVal* tree )
{
   UShort descr;
   /* pre: incoming tree[0..7] does not have any invalid shvals, in
      particular no zeroes. */
   if (UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
                || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
                || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
                || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
      tl_assert(0);
   
   descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
           | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
           | TREE_DESCR_8_1 | TREE_DESCR_8_0;
   /* build 16-bit layer */
   if (tree[1] == tree[0]) {
      tree[1] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
      descr |= TREE_DESCR_16_0;
   }
   if (tree[3] == tree[2]) {
      tree[3] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
      descr |= TREE_DESCR_16_1;
   }
   if (tree[5] == tree[4]) {
      tree[5] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
      descr |= TREE_DESCR_16_2;
   }
   if (tree[7] == tree[6]) {
      tree[7] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
      descr |= TREE_DESCR_16_3;
   }
   /* build 32-bit layer */
   if (tree[2] == tree[0]
       && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
      tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
      descr |= TREE_DESCR_32_0;
   }
   if (tree[6] == tree[4]
       && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
      tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
      descr |= TREE_DESCR_32_1;
   }
   /* build 64-bit layer */
   if (tree[4] == tree[0]
       && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
      tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
      descr |= TREE_DESCR_64;
   }
   return descr;
}

/* This takes a cacheline where all the data is at the leaves
   (w8[..]) and builds a correctly normalised tree. */
static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
{
   Word tno, cloff;
   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      SVal* tree = &cl->svals[cloff];
      cl->descrs[tno] = normalise_tree( tree );
   }
   tl_assert(cloff == N_LINE_ARANGE);
   if (CHECK_ZSM)
      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   stats__cline_normalises++;
}

typedef struct { UChar count; SVal sval; } CountedSVal;

static
void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
                               /*OUT*/Word* dstUsedP,
                               Word nDst, CacheLine* src )
{
   Word  tno, cloff, dstUsed;

tl_assert(nDst == N_LINE_ARANGE);
   dstUsed = 0;

for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      UShort descr = src->descrs[tno];
      SVal*  tree  = &src->svals[cloff];

/* sequentialise the tree described by (descr,tree). */
#     define PUT(_n,_v)                                \
         do { dst[dstUsed  ].count = (_n);             \
              dst[dstUsed++].sval  = (_v);             \
         } while (0)

/* byte 0 */
      if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
      if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
      if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
      if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
      /* byte 1 */
      if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
      /* byte 2 */
      if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
      if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
      /* byte 3 */
      if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
      /* byte 4 */
      if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
      if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
      if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
      /* byte 5 */
      if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
      /* byte 6 */
      if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
      if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
      /* byte 7 */
      if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);

#     undef PUT
      /* END sequentialise the tree described by (descr,tree). */

}
   tl_assert(cloff == N_LINE_ARANGE);
   tl_assert(dstUsed <= nDst);

*dstUsedP = dstUsed;
}

/* Write the cacheline 'wix' to backing store.  Where it ends up
   is determined by its tag field. */
static __attribute__((noinline)) void cacheline_wback ( UWord wix )
{
   Word        i, j, k, m;
   Addr        tag;
   SecMap*     sm;
   CacheLine*  cl;
   LineZ* lineZ;
   LineF* lineF;
   Word        zix, fix, csvalsUsed;
   CountedSVal csvals[N_LINE_ARANGE];
   SVal        sv;

if (0)
   VG_(printf)("scache wback line %d\n", (Int)wix);

tl_assert(wix >= 0 && wix < N_WAY_NENT);

tag =  cache_shmem.tags0[wix];
   cl  = &cache_shmem.lyns0[wix];

/* The cache line may have been invalidated; if so, ignore it. */
   if (!is_valid_scache_tag(tag))
      return;

/* Where are we going to put it? */
   sm         = NULL;
   lineZ      = NULL;
   lineF      = NULL;
   zix = fix = -1;

/* find the Z line to write in and rcdec it or the associated F
      line. */
   find_Z_for_writing( &sm, &zix, tag );

tl_assert(sm);
   tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];

/* Generate the data to be stored */
   if (CHECK_ZSM)
      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */

csvalsUsed = -1;
   sequentialise_CacheLine( csvals, &csvalsUsed, 
                            N_LINE_ARANGE, cl );
   tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
   if (0) VG_(printf)("%lu ", csvalsUsed);

lineZ->dict[0] = lineZ->dict[1] 
                  = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;

/* i indexes actual shadow values, k is cursor in csvals */
   i = 0;
   for (k = 0; k < csvalsUsed; k++) {

sv = csvals[k].sval;
      if (CHECK_ZSM)
         tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
      /* do we already have it? */
      if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
      if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
      if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
      if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
      /* no.  look for a free slot. */
      if (CHECK_ZSM)
         tl_assert(sv != SVal_INVALID);
      if (lineZ->dict[0] 
          == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
      if (lineZ->dict[1]
          == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
      if (lineZ->dict[2]
          == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
      if (lineZ->dict[3]
          == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
      break; /* we'll have to use the f rep */
     dict_ok:
      m = csvals[k].count;
      if (m == 8) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         write_twobit_array( lineZ->ix2s, i+1, j );
         write_twobit_array( lineZ->ix2s, i+2, j );
         write_twobit_array( lineZ->ix2s, i+3, j );
         write_twobit_array( lineZ->ix2s, i+4, j );
         write_twobit_array( lineZ->ix2s, i+5, j );
         write_twobit_array( lineZ->ix2s, i+6, j );
         write_twobit_array( lineZ->ix2s, i+7, j );
         i += 8;
      }
      else if (m == 4) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         write_twobit_array( lineZ->ix2s, i+1, j );
         write_twobit_array( lineZ->ix2s, i+2, j );
         write_twobit_array( lineZ->ix2s, i+3, j );
         i += 4;
      }
      else if (m == 1) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         i += 1;
      }
      else if (m == 2) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         write_twobit_array( lineZ->ix2s, i+1, j );
         i += 2;
      }
      else {
         tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
      }

}

if (LIKELY(i == N_LINE_ARANGE)) {
      /* Construction of the compressed representation was
         successful. */
      rcinc_LineZ(lineZ);
      stats__cache_Z_wbacks++;
   } else {
      /* Cannot use the compressed(z) representation.  Use the full(f)
         rep instead. */
      tl_assert(i >= 0 && i < N_LINE_ARANGE);
      alloc_F_for_writing( sm, &fix );
      tl_assert(sm->linesF);
      tl_assert(sm->linesF_size > 0);
      tl_assert(fix >= 0 && fix < (Word)sm->linesF_size);
      lineF = &sm->linesF[fix];
      tl_assert(!lineF->inUse);
      lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
      lineZ->dict[1] = (SVal)fix;
      lineF->inUse = True;
      i = 0;
      for (k = 0; k < csvalsUsed; k++) {
         if (CHECK_ZSM)
            tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
         sv = csvals[k].sval;
         if (CHECK_ZSM)
            tl_assert(sv != SVal_INVALID);
         for (m = csvals[k].count; m > 0; m--) {
            lineF->w64s[i] = sv;
            i++;
         }
      }
      tl_assert(i == N_LINE_ARANGE);
      rcinc_LineF(lineF);
      stats__cache_F_wbacks++;
   }
}

/* Fetch the cacheline 'wix' from the backing store.  The tag
   associated with 'wix' is assumed to have already been filled in;
   hence that is used to determine where in the backing store to read
   from. */
static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
{
   Word       i;
   Addr       tag;
   CacheLine* cl;
   LineZ*     lineZ;
   LineF*     lineF;

if (0)
   VG_(printf)("scache fetch line %d\n", (Int)wix);

tl_assert(wix >= 0 && wix < N_WAY_NENT);

tag =  cache_shmem.tags0[wix];
   cl  = &cache_shmem.lyns0[wix];

/* reject nonsense requests */
   tl_assert(is_valid_scache_tag(tag));

lineZ = NULL;
   lineF = NULL;
   find_ZF_for_reading( &lineZ, &lineF, tag );
   tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );

/* expand the data into the bottom layer of the tree, then get
      cacheline_normalise to build the descriptor array. */
   if (lineF) {
      tl_assert(lineF->inUse);
      for (i = 0; i < N_LINE_ARANGE; i++) {
         cl->svals[i] = lineF->w64s[i];
      }
      stats__cache_F_fetches++;
   } else {
      for (i = 0; i < N_LINE_ARANGE; i++) {
         SVal sv;
         UWord ix = read_twobit_array( lineZ->ix2s, i );
         /* correct, but expensive: tl_assert(ix >= 0 && ix <= 3); */
         sv = lineZ->dict[ix];
         tl_assert(sv != SVal_INVALID);
         cl->svals[i] = sv;
      }
      stats__cache_Z_fetches++;
   }
   normalise_CacheLine( cl );
}

static void shmem__invalidate_scache ( void ) {
   Word wix;
   if (0) VG_(printf)("%s","scache inval\n");
   tl_assert(!is_valid_scache_tag(1));
   for (wix = 0; wix < N_WAY_NENT; wix++) {
      cache_shmem.tags0[wix] = 1/*INVALID*/;
   }
   stats__cache_invals++;
}

static void shmem__flush_and_invalidate_scache ( void ) {
   Word wix;
   Addr tag;
   if (0) VG_(printf)("%s","scache flush and invalidate\n");
   tl_assert(!is_valid_scache_tag(1));
   for (wix = 0; wix < N_WAY_NENT; wix++) {
      tag = cache_shmem.tags0[wix];
      if (tag == 1/*INVALID*/) {
         /* already invalid; nothing to do */
      } else {
         tl_assert(is_valid_scache_tag(tag));
         cacheline_wback( wix );
      }
      cache_shmem.tags0[wix] = 1/*INVALID*/;
   }
   stats__cache_flushes++;
   stats__cache_invals++;
}

static inline Bool aligned16 ( Addr a ) {
   return 0 == (a & 1);
}
static inline Bool aligned32 ( Addr a ) {
   return 0 == (a & 3);
}
static inline Bool aligned64 ( Addr a ) {
   return 0 == (a & 7);
}
static inline UWord get_cacheline_offset ( Addr a ) {
   return (UWord)(a & (N_LINE_ARANGE - 1));
}
static inline Addr cacheline_ROUNDUP ( Addr a ) {
   return ROUNDUP(a, N_LINE_ARANGE);
}
static inline Addr cacheline_ROUNDDN ( Addr a ) {
   return ROUNDDN(a, N_LINE_ARANGE);
}
static inline UWord get_treeno ( Addr a ) {
   return get_cacheline_offset(a) >> 3;
}
static inline UWord get_tree_offset ( Addr a ) {
   return a & 7;
}

static __attribute__((noinline))
       CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
static inline CacheLine* get_cacheline ( Addr a )
{
   /* tag is 'a' with the in-line offset masked out, 
      eg a[31]..a[4] 0000 */
   Addr       tag = a & ~(N_LINE_ARANGE - 1);
   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
   stats__cache_totrefs++;
   if (LIKELY(tag == cache_shmem.tags0[wix])) {
      return &cache_shmem.lyns0[wix];
   } else {
      return get_cacheline_MISS( a );
   }
}

static __attribute__((noinline))
       CacheLine* get_cacheline_MISS ( Addr a )
{
   /* tag is 'a' with the in-line offset masked out, 
      eg a[31]..a[4] 0000 */

CacheLine* cl;
   Addr*      tag_old_p;
   Addr       tag = a & ~(N_LINE_ARANGE - 1);
   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);

tl_assert(tag != cache_shmem.tags0[wix]);

/* Dump the old line into the backing store. */
   stats__cache_totmisses++;

cl        = &cache_shmem.lyns0[wix];
   tag_old_p = &cache_shmem.tags0[wix];

if (is_valid_scache_tag( *tag_old_p )) {
      /* EXPENSIVE and REDUNDANT: callee does it */
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
      cacheline_wback( wix );
   }
   /* and reload the new one */
   *tag_old_p = tag;
   cacheline_fetch( wix );
   if (CHECK_ZSM)
      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   return cl;
}

static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   stats__cline_64to32pulldown++;
   switch (toff) {
      case 0: case 4:
         tl_assert(descr & TREE_DESCR_64);
         tree[4] = tree[0];
         descr &= ~TREE_DESCR_64;
         descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
         break;
      default:
         tl_assert(0);
   }
   return descr;
}

static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   stats__cline_32to16pulldown++;
   switch (toff) {
      case 0: case 2:
         if (!(descr & TREE_DESCR_32_0)) {
            descr = pulldown_to_32(tree, 0, descr);
         }
         tl_assert(descr & TREE_DESCR_32_0);
         tree[2] = tree[0];
         descr &= ~TREE_DESCR_32_0;
         descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
         break;
      case 4: case 6:
         if (!(descr & TREE_DESCR_32_1)) {
            descr = pulldown_to_32(tree, 4, descr);
         }
         tl_assert(descr & TREE_DESCR_32_1);
         tree[6] = tree[4];
         descr &= ~TREE_DESCR_32_1;
         descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
         break;
      default:
         tl_assert(0);
   }
   return descr;
}

static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   stats__cline_16to8pulldown++;
   switch (toff) {
      case 0: case 1:
         if (!(descr & TREE_DESCR_16_0)) {
            descr = pulldown_to_16(tree, 0, descr);
         }
         tl_assert(descr & TREE_DESCR_16_0);
         tree[1] = tree[0];
         descr &= ~TREE_DESCR_16_0;
         descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
         break;
      case 2: case 3:
         if (!(descr & TREE_DESCR_16_1)) {
            descr = pulldown_to_16(tree, 2, descr);
         }
         tl_assert(descr & TREE_DESCR_16_1);
         tree[3] = tree[2];
         descr &= ~TREE_DESCR_16_1;
         descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
         break;
      case 4: case 5:
         if (!(descr & TREE_DESCR_16_2)) {
            descr = pulldown_to_16(tree, 4, descr);
         }
         tl_assert(descr & TREE_DESCR_16_2);
         tree[5] = tree[4];
         descr &= ~TREE_DESCR_16_2;
         descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
         break;
      case 6: case 7:
         if (!(descr & TREE_DESCR_16_3)) {
            descr = pulldown_to_16(tree, 6, descr);
         }
         tl_assert(descr & TREE_DESCR_16_3);
         tree[7] = tree[6];
         descr &= ~TREE_DESCR_16_3;
         descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
         break;
      default:
         tl_assert(0);
   }
   return descr;
}

static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
   UShort mask;
   switch (toff) {
      case 0:
         mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_16_0;
         break;
      case 2:
         mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_16_1;
         break;
      case 4:
         mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_16_2;
         break;
      case 6:
         mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_16_3;
         break;
      default:
         tl_assert(0);
   }
   return descr;
}

static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
   UShort mask;
   switch (toff) {
      case 0:
         if (!(descr & TREE_DESCR_16_0))
            descr = pullup_descr_to_16(descr, 0);
         if (!(descr & TREE_DESCR_16_1))
            descr = pullup_descr_to_16(descr, 2);
         mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_32_0;
         break;
      case 4:
         if (!(descr & TREE_DESCR_16_2))
            descr = pullup_descr_to_16(descr, 4);
         if (!(descr & TREE_DESCR_16_3))
            descr = pullup_descr_to_16(descr, 6);
         mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_32_1;
         break;
      default:
         tl_assert(0);
   }
   return descr;
}

static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
   switch (toff) {
      case 0: case 4:
         return 0 != (descr & TREE_DESCR_64);
      default:
         tl_assert(0);
   }
}

static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
   switch (toff) {
      case 0:
         return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
      case 2:
         return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
      case 4:
         return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
      case 6:
         return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
      default:
         tl_assert(0);
   }
}

/* ------------ Cache management ------------ */

static void zsm_flush_cache ( void )
{
   shmem__flush_and_invalidate_scache();
}

static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) )
{
   tl_assert( sizeof(UWord) == sizeof(Addr) );

rcinc = p_rcinc;
   rcdec = p_rcdec;

tl_assert(map_shmem == NULL);
   map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
                           HG_(free), 
                           NULL/*unboxed UWord cmp*/);
   tl_assert(map_shmem != NULL);
   shmem__invalidate_scache();

/* a SecMap must contain an integral number of CacheLines */
   tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
   /* also ... a CacheLine holds an integral number of trees */
   tl_assert(0 == (N_LINE_ARANGE % 8));
}

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION END compressed shadow memory                        //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION BEGIN vts primitives                                //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* There's a 1-1 mapping between Thr and ThrIDs -- the latter merely
   being compact stand-ins for Thr*'s.  Use these functions to map
   between them. */
static ThrID Thr__to_ThrID   ( Thr*  thr   ); /* fwds */
static Thr*  Thr__from_ThrID ( ThrID thrid ); /* fwds */

__attribute__((noreturn))
static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
{
   if (due_to_nThrs) {
      HChar* s =
         "\n"
         "Helgrind: cannot continue, run aborted: too many threads.\n"
         "Sorry.  Helgrind can only handle programs that create\n"
         "%'llu or fewer threads over their entire lifetime.\n"
         "\n";
      VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
   } else {
      HChar* s =
         "\n"
         "Helgrind: cannot continue, run aborted: too many\n"
         "synchronisation events.  Sorry. Helgrind can only handle\n"
         "programs which perform %'llu or fewer\n"
         "inter-thread synchronisation events (locks, unlocks, etc).\n"
         "\n";
      VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
   }
   VG_(exit)(1);
   /*NOTREACHED*/
   tl_assert(0); /*wtf?!*/
}

/* The dead thread (ThrID, actually) table.  A thread may only be
   listed here if we have been notified thereof by libhb_async_exit.
   New entries are added at the end.  The order isn't important, but
   the ThrID values must be unique.  This table lists the identity of
   all threads that have ever died -- none are ever removed.  We keep
   this table so as to be able to prune entries from VTSs.  We don't
   actually need to keep the set of threads that have ever died --
   only the threads that have died since the previous round of
   pruning.  But it's useful for sanity check purposes to keep the
   entire set, so we do. */
static XArray* /* of ThrID */ verydead_thread_table = NULL;

/* Arbitrary total ordering on ThrIDs. */
static Int cmp__ThrID ( void* v1, void* v2 ) {
   ThrID id1 = *(ThrID*)v1;
   ThrID id2 = *(ThrID*)v2;
   if (id1 < id2) return -1;
   if (id1 > id2) return 1;
   return 0;
}

static void verydead_thread_table_init ( void )
{
   tl_assert(!verydead_thread_table);
   verydead_thread_table
     = VG_(newXA)( HG_(zalloc),
                   "libhb.verydead_thread_table_init.1",
                   HG_(free), sizeof(ThrID) );
   tl_assert(verydead_thread_table);
   VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
}

/* A VTS contains .ts, its vector clock, and also .id, a field to hold
   a backlink for the caller's convenience.  Since we have no idea
   what to set that to in the library, it always gets set to
   VtsID_INVALID. */
typedef
   struct {
      VtsID    id;
      UInt     usedTS;
      UInt     sizeTS;
      ScalarTS ts[0];
   }
   VTS;

/* Allocate a VTS capable of storing 'sizeTS' entries. */
static VTS* VTS__new ( HChar* who, UInt sizeTS );

/* Make a clone of 'vts', sizing the new array to exactly match the
   number of ScalarTSs present. */
static VTS* VTS__clone ( HChar* who, VTS* vts );

/* Make a clone of 'vts' with the thrids in 'thrids' removed.  The new
   array is sized exactly to hold the number of required elements.
   'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
   must be in strictly increasing order. */
static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel );

/* Delete this VTS in its entirety. */
static void VTS__delete ( VTS* vts );

/* Create a new singleton VTS in 'out'.  Caller must have
   pre-allocated 'out' sufficiently big to hold the result in all
   possible cases. */
static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );

/* Create in 'out' a VTS which is the same as 'vts' except with
   vts[me]++, so to speak.  Caller must have pre-allocated 'out'
   sufficiently big to hold the result in all possible cases. */
static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );

/* Create in 'out' a VTS which is the join (max) of 'a' and
   'b'. Caller must have pre-allocated 'out' sufficiently big to hold
   the result in all possible cases. */
static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );

/* Compute the partial ordering relation of the two args.  Although we
   could be completely general and return an enumeration value (EQ,
   LT, GT, UN), in fact we only need LEQ, and so we may as well
   hardwire that fact.

Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
   invald ThrID).  In the latter case, the returned ThrID indicates
   the discovered point for which they are not.  There may be more
   than one such point, but we only care about seeing one of them, not
   all of them.  This rather strange convention is used because
   sometimes we want to know the actual index at which they first
   differ. */
static UInt VTS__cmpLEQ ( VTS* a, VTS* b );

/* Compute an arbitrary structural (total) ordering on the two args,
   based on their VCs, so they can be looked up in a table, tree, etc.
   Returns -1, 0 or 1. */
static Word VTS__cmp_structural ( VTS* a, VTS* b );

/* Debugging only.  Display the given VTS in the buffer. */
static void VTS__show ( HChar* buf, Int nBuf, VTS* vts );

/* Debugging only.  Return vts[index], so to speak. */
static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );

/* Notify the VTS machinery that a thread has been declared
   comprehensively dead: that is, it has done an async exit AND it has
   been joined with.  This should ensure that its local clocks (.viR
   and .viW) will never again change, and so all mentions of this
   thread from all VTSs in the system may be removed. */
static void VTS__declare_thread_very_dead ( Thr* idx );

/*--------------- to do with Vector Timestamps ---------------*/

static Bool is_sane_VTS ( VTS* vts )
{
   UWord     i, n;
   ScalarTS  *st1, *st2;
   if (!vts) return False;
   if (!vts->ts) return False;
   if (vts->usedTS > vts->sizeTS) return False;
   n = vts->usedTS;
   if (n == 1) {
      st1 = &vts->ts[0];
      if (st1->tym == 0)
         return False;
   }
   else
   if (n >= 2) {
      for (i = 0; i < n-1; i++) {
         st1 = &vts->ts[i];
         st2 = &vts->ts[i+1];
         if (st1->thrid >= st2->thrid)
            return False;
         if (st1->tym == 0 || st2->tym == 0)
            return False;
      }
   }
   return True;
}

/* Create a new, empty VTS.
*/
static VTS* VTS__new ( HChar* who, UInt sizeTS )
{
   VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
   tl_assert(vts->usedTS == 0);
   vts->sizeTS = sizeTS;
   *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
   return vts;
}

/* Clone this VTS.
*/
static VTS* VTS__clone ( HChar* who, VTS* vts )
{
   tl_assert(vts);
   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   UInt nTS = vts->usedTS;
   VTS* clone = VTS__new(who, nTS);
   clone->id = vts->id;
   clone->sizeTS = nTS;
   clone->usedTS = nTS;
   UInt i;
   for (i = 0; i < nTS; i++) {
      clone->ts[i] = vts->ts[i];
   }
   tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   return clone;
}

/* Make a clone of a VTS with specified ThrIDs removed.  'thridsToDel'
   must be in strictly increasing order.  We could obviously do this
   much more efficiently (in linear time) if necessary.
*/
static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel )
{
   UInt i, j;
   tl_assert(vts);
   tl_assert(thridsToDel);
   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   UInt nTS = vts->usedTS;
   /* Figure out how many ScalarTSs will remain in the output. */
   UInt nReq = nTS;
   for (i = 0; i < nTS; i++) {
      ThrID thrid = vts->ts[i].thrid;
      if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
         nReq--;
   }
   tl_assert(nReq <= nTS);
   /* Copy the ones that will remain. */
   VTS* res = VTS__new(who, nReq);
   j = 0;
   for (i = 0; i < nTS; i++) {
      ThrID thrid = vts->ts[i].thrid;
      if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
         continue;
      res->ts[j++] = vts->ts[i];
   }
   tl_assert(j == nReq);
   tl_assert(j == res->sizeTS);
   res->usedTS = j;
   tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
   return res;
}

/* Delete this VTS in its entirety.
*/
static void VTS__delete ( VTS* vts )
{
   tl_assert(vts);
   tl_assert(vts->usedTS <= vts->sizeTS);
   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   HG_(free)(vts);
}

/* Create a new singleton VTS. 
*/
static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
{
   tl_assert(thr);
   tl_assert(tym >= 1);
   tl_assert(out);
   tl_assert(out->usedTS == 0);
   tl_assert(out->sizeTS >= 1);
   UInt hi = out->usedTS++;
   out->ts[hi].thrid = Thr__to_ThrID(thr);
   out->ts[hi].tym   = tym;
}

/* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
   not modified.
*/
static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
{
   UInt      i, n;
   ThrID     me_thrid;
   Bool      found = False;

stats__vts__tick++;

tl_assert(out);
   tl_assert(out->usedTS == 0);
   if (vts->usedTS >= ThrID_MAX_VALID)
      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   tl_assert(out->sizeTS >= 1 + vts->usedTS);

tl_assert(me);
   me_thrid = Thr__to_ThrID(me);
   tl_assert(is_sane_VTS(vts));
   n = vts->usedTS;

/* Copy all entries which precede 'me'. */
   for (i = 0; i < n; i++) {
      ScalarTS* here = &vts->ts[i];
      if (UNLIKELY(here->thrid >= me_thrid))
         break;
      UInt hi = out->usedTS++;
      out->ts[hi] = *here;
   }

/* 'i' now indicates the next entry to copy, if any.
       There are 3 possibilities:
       (a) there is no next entry (we used them all up already):
           add (me_thrid,1) to the output, and quit
       (b) there is a next entry, and its thrid > me_thrid:
           add (me_thrid,1) to the output, then copy the remaining entries
       (c) there is a next entry, and its thrid == me_thrid:
           copy it to the output but increment its timestamp value.
           Then copy the remaining entries.  (c) is the common case.
   */
   tl_assert(i >= 0 && i <= n);
   if (i == n) { /* case (a) */
      UInt hi = out->usedTS++;
      out->ts[hi].thrid = me_thrid;
      out->ts[hi].tym   = 1;
   } else {
      /* cases (b) and (c) */
      ScalarTS* here = &vts->ts[i];
      if (me_thrid == here->thrid) { /* case (c) */
         if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
            /* We're hosed.  We have to stop. */
            scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
         }
         UInt hi = out->usedTS++;
         out->ts[hi].thrid = here->thrid;
         out->ts[hi].tym   = here->tym + 1;
         i++;
         found = True;
      } else { /* case (b) */
         UInt hi = out->usedTS++;
         out->ts[hi].thrid = me_thrid;
         out->ts[hi].tym   = 1;
      }
      /* And copy any remaining entries. */
      for (/*keepgoing*/; i < n; i++) {
         ScalarTS* here2 = &vts->ts[i];
         UInt hi = out->usedTS++;
         out->ts[hi] = *here2;
      }
   }

tl_assert(is_sane_VTS(out));
   tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
   tl_assert(out->usedTS <= out->sizeTS);
}

/* Return a new VTS constructed as the join (max) of the 2 args.
   Neither arg is modified.
*/
static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
{
   UInt     ia, ib, useda, usedb;
   ULong    tyma, tymb, tymMax;
   ThrID    thrid;
   UInt     ncommon = 0;

stats__vts__join++;

tl_assert(a);
   tl_assert(b);
   useda = a->usedTS;
   usedb = b->usedTS;

tl_assert(out);
   tl_assert(out->usedTS == 0);
   /* overly conservative test, but doing better involves comparing
      the two VTSs, which we don't want to do at this point. */
   if (useda + usedb >= ThrID_MAX_VALID)
      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   tl_assert(out->sizeTS >= useda + usedb);

ia = ib = 0;

while (1) {

/* This logic is to enumerate triples (thrid, tyma, tymb) drawn
         from a and b in order, where thrid is the next ThrID
         occurring in either a or b, and tyma/b are the relevant
         scalar timestamps, taking into account implicit zeroes. */
      tl_assert(ia >= 0 && ia <= useda);
      tl_assert(ib >= 0 && ib <= usedb);

if        (ia == useda && ib == usedb) {
         /* both empty - done */
         break;

} else if (ia == useda && ib != usedb) {
         /* a empty, use up b */
         ScalarTS* tmpb = &b->ts[ib];
         thrid = tmpb->thrid;
         tyma  = 0;
         tymb  = tmpb->tym;
         ib++;

} else if (ia != useda && ib == usedb) {
         /* b empty, use up a */
         ScalarTS* tmpa = &a->ts[ia];
         thrid = tmpa->thrid;
         tyma  = tmpa->tym;
         tymb  = 0;
         ia++;

} else {
         /* both not empty; extract lowest-ThrID'd triple */
         ScalarTS* tmpa = &a->ts[ia];
         ScalarTS* tmpb = &b->ts[ib];
         if (tmpa->thrid < tmpb->thrid) {
            /* a has the lowest unconsidered ThrID */
            thrid = tmpa->thrid;
            tyma  = tmpa->tym;
            tymb  = 0;
            ia++;
         } else if (tmpa->thrid > tmpb->thrid) {
            /* b has the lowest unconsidered ThrID */
            thrid = tmpb->thrid;
            tyma  = 0;
            tymb  = tmpb->tym;
            ib++;
         } else {
            /* they both next mention the same ThrID */
            tl_assert(tmpa->thrid == tmpb->thrid);
            thrid = tmpa->thrid; /* == tmpb->thrid */
            tyma  = tmpa->tym;
            tymb  = tmpb->tym;
            ia++;
            ib++;
            ncommon++;
         }
      }

/* having laboriously determined (thr, tyma, tymb), do something
         useful with it. */
      tymMax = tyma > tymb ? tyma : tymb;
      if (tymMax > 0) {
         UInt hi = out->usedTS++;
         out->ts[hi].thrid = thrid;
         out->ts[hi].tym   = tymMax;
      }

}

tl_assert(is_sane_VTS(out));
   tl_assert(out->usedTS <= out->sizeTS);
   tl_assert(out->usedTS == useda + usedb - ncommon);
}

/* Determine if 'a' <= 'b', in the partial ordering.  Returns zero if
   they are, or the first ThrID for which they are not (no valid ThrID
   has the value zero).  This rather strange convention is used
   because sometimes we want to know the actual index at which they
   first differ. */
static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
{
   Word  ia, ib, useda, usedb;
   ULong tyma, tymb;

stats__vts__cmpLEQ++;

tl_assert(a);
   tl_assert(b);
   useda = a->usedTS;
   usedb = b->usedTS;

ia = ib = 0;

while (1) {

/* This logic is to enumerate doubles (tyma, tymb) drawn
         from a and b in order, and tyma/b are the relevant
         scalar timestamps, taking into account implicit zeroes. */
      ThrID thrid;

tl_assert(ia >= 0 && ia <= useda);
      tl_assert(ib >= 0 && ib <= usedb);

if        (ia == useda && ib == usedb) {
         /* both empty - done */
         break;

} else if (ia == useda && ib != usedb) {
         /* a empty, use up b */
         ScalarTS* tmpb = &b->ts[ib];
         tyma  = 0;
         tymb  = tmpb->tym;
         thrid = tmpb->thrid;
         ib++;

} else if (ia != useda && ib == usedb) {
         /* b empty, use up a */
         ScalarTS* tmpa = &a->ts[ia];
         tyma  = tmpa->tym;
         thrid = tmpa->thrid;
         tymb  = 0;
         ia++;

} else {
         /* both not empty; extract lowest-ThrID'd triple */
         ScalarTS* tmpa = &a->ts[ia];
         ScalarTS* tmpb = &b->ts[ib];
         if (tmpa->thrid < tmpb->thrid) {
            /* a has the lowest unconsidered ThrID */
            tyma  = tmpa->tym;
            thrid = tmpa->thrid;
            tymb  = 0;
            ia++;
         }
         else
         if (tmpa->thrid > tmpb->thrid) {
            /* b has the lowest unconsidered ThrID */
            tyma  = 0;
            tymb  = tmpb->tym;
            thrid = tmpb->thrid;
            ib++;
         } else {
            /* they both next mention the same ThrID */
            tl_assert(tmpa->thrid == tmpb->thrid);
            tyma  = tmpa->tym;
            thrid = tmpa->thrid;
            tymb  = tmpb->tym;
            ia++;
            ib++;
         }
      }

/* having laboriously determined (tyma, tymb), do something
         useful with it. */
      if (tyma > tymb) {
         /* not LEQ at this index.  Quit, since the answer is
            determined already. */
         tl_assert(thrid >= 1024);
         return thrid;
      }
   }

return 0; /* all points are LEQ => return an invalid ThrID */
}

/* Compute an arbitrary structural (total) ordering on the two args,
   based on their VCs, so they can be looked up in a table, tree, etc.
   Returns -1, 0 or 1.  (really just 'deriving Ord' :-) This can be
   performance critical so there is some effort expended to make it sa
   fast as possible.
*/
Word VTS__cmp_structural ( VTS* a, VTS* b )
{
   /* We just need to generate an arbitrary total ordering based on
      a->ts and b->ts.  Preferably do it in a way which comes across likely
      differences relatively quickly. */
   Word     i;
   Word     useda = 0,    usedb = 0;
   ScalarTS *ctsa = NULL, *ctsb = NULL;

stats__vts__cmp_structural++;

tl_assert(a);
   tl_assert(b);

ctsa = &a->ts[0]; useda = a->usedTS;
   ctsb = &b->ts[0]; usedb = b->usedTS;

if (LIKELY(useda == usedb)) {
      ScalarTS *tmpa = NULL, *tmpb = NULL;
      stats__vts__cmp_structural_slow++;
      /* Same length vectors.  Find the first difference, if any, as
         fast as possible. */
      for (i = 0; i < useda; i++) {
         tmpa = &ctsa[i];
         tmpb = &ctsb[i];
         if (LIKELY(tmpa->tym == tmpb->tym
                    && tmpa->thrid == tmpb->thrid))
            continue;
         else
            break;
      }
      if (UNLIKELY(i == useda)) {
         /* They're identical. */
         return 0;
      } else {
         tl_assert(i >= 0 && i < useda);
         if (tmpa->tym < tmpb->tym) return -1;
         if (tmpa->tym > tmpb->tym) return 1;
         if (tmpa->thrid < tmpb->thrid) return -1;
         if (tmpa->thrid > tmpb->thrid) return 1;
         /* we just established them as non-identical, hence: */
      }
      /*NOTREACHED*/
      tl_assert(0);
   }

if (useda < usedb) return -1;
   if (useda > usedb) return 1;
   /*NOTREACHED*/
   tl_assert(0);
}

/* Debugging only.  Display the given VTS in the buffer.
*/
void VTS__show ( HChar* buf, Int nBuf, VTS* vts )
{
   ScalarTS* st;
   HChar     unit[64];
   Word      i, n;
   Int       avail = nBuf;
   tl_assert(vts && vts->ts);
   tl_assert(nBuf > 16);
   buf[0] = '[';
   buf[1] = 0;
   n =  vts->usedTS;
   for (i = 0; i < n; i++) {
      tl_assert(avail >= 40);
      st = &vts->ts[i];
      VG_(memset)(unit, 0, sizeof(unit));
      VG_(sprintf)(unit, i < n-1 ? "%u:%llu " : "%u:%llu",
                         st->thrid, (ULong)st->tym);
      if (avail < VG_(strlen)(unit) + 40/*let's say*/) {
         VG_(strcat)(buf, " ...]");
         buf[nBuf-1] = 0;
         return;
      }
      VG_(strcat)(buf, unit);
      avail -= VG_(strlen)(unit);
   }
   VG_(strcat)(buf, "]");
   buf[nBuf-1] = 0;
}

/* Debugging only.  Return vts[index], so to speak.
*/
ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
{
   UWord i, n;
   ThrID idx_thrid = Thr__to_ThrID(idx);
   stats__vts__indexat_slow++;
   tl_assert(vts && vts->ts);
   n = vts->usedTS;
   for (i = 0; i < n; i++) {
      ScalarTS* st = &vts->ts[i];
      if (st->thrid == idx_thrid)
         return st->tym;
   }
   return 0;
}

/* See comment on prototype above.
*/
static void VTS__declare_thread_very_dead ( Thr* thr )
{
   if (0) VG_(printf)("VTQ:  tae %p\n", thr);

tl_assert(thr->llexit_done);
   tl_assert(thr->joinedwith_done);

ThrID nyu;
   nyu = Thr__to_ThrID(thr);
   VG_(addToXA)( verydead_thread_table, &nyu );

/* We can only get here if we're assured that we'll never again
      need to look at this thread's ::viR or ::viW.  Set them to
      VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
      mostly so that we don't wind up pruning them (as that would be
      nonsensical: the only interesting ScalarTS entry for a dead
      thread is its own index, and the pruning will remove that.). */
   VtsID__rcdec(thr->viR);
   VtsID__rcdec(thr->viW);
   thr->viR = VtsID_INVALID;
   thr->viW = VtsID_INVALID;
}

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION END vts primitives                                  //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION BEGIN main library                                  //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/////////////////////////////////////////////////////////
//                                                     //
// VTS set                                             //
//                                                     //
/////////////////////////////////////////////////////////

static WordFM* /* WordFM VTS* void */ vts_set = NULL;

static void vts_set_init ( void )
{
   tl_assert(!vts_set);
   vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
                         HG_(free),
                         (Word(*)(UWord,UWord))VTS__cmp_structural );
   tl_assert(vts_set);
}

/* Given a VTS, look in vts_set to see if we already have a
   structurally identical one.  If yes, return the pair (True, pointer
   to the existing one).  If no, clone this one, add the clone to the
   set, and return (False, pointer to the clone). */
static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
{
   UWord keyW, valW;
   stats__vts_set__focaa++;
   tl_assert(cand->id == VtsID_INVALID);
   /* lookup cand (by value) */
   if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
      /* found it */
      tl_assert(valW == 0);
      /* if this fails, cand (by ref) was already present (!) */
      tl_assert(keyW != (UWord)cand);
      *res = (VTS*)keyW;
      return True;
   } else {
      /* not present.  Clone, add and return address of clone. */
      stats__vts_set__focaa_a++;
      VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
      tl_assert(clone != cand);
      VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
      *res = clone;
      return False;
   }
}

/////////////////////////////////////////////////////////
//                                                     //
// VTS table                                           //
//                                                     //
/////////////////////////////////////////////////////////

static void VtsID__invalidate_caches ( void ); /* fwds */

/* A type to hold VTS table entries.  Invariants:
   If .vts == NULL, then this entry is not in use, so:
   - .rc == 0
   - this entry is on the freelist (unfortunately, does not imply
     any constraints on value for .freelink)
   If .vts != NULL, then this entry is in use:
   - .vts is findable in vts_set
   - .vts->id == this entry number
   - no specific value for .rc (even 0 is OK)
   - this entry is not on freelist, so .freelink == VtsID_INVALID
*/
typedef
   struct {
      VTS*  vts;      /* vts, in vts_set */
      UWord rc;       /* reference count - enough for entire aspace */
      VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
      VtsID remap;    /* used only during pruning */
   }
   VtsTE;

/* The VTS table. */
static XArray* /* of VtsTE */ vts_tab = NULL;

/* An index into the VTS table, indicating the start of the list of
   free (available for use) entries.  If the list is empty, this is
   VtsID_INVALID. */
static VtsID vts_tab_freelist = VtsID_INVALID;

/* Do a GC of vts_tab when the freelist becomes empty AND the size of
   vts_tab equals or exceeds this size.  After GC, the value here is
   set appropriately so as to check for the next GC point. */
static Word vts_next_GC_at = 1000;

static void vts_tab_init ( void )
{
   vts_tab
      = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
                    HG_(free), sizeof(VtsTE) );
   vts_tab_freelist
      = VtsID_INVALID;
   tl_assert(vts_tab);
}

/* Add ii to the free list, checking that it looks out-of-use. */
static void add_to_free_list ( VtsID ii )
{
   VtsTE* ie = VG_(indexXA)( vts_tab, ii );
   tl_assert(ie->vts == NULL);
   tl_assert(ie->rc == 0);
   tl_assert(ie->freelink == VtsID_INVALID);
   ie->freelink = vts_tab_freelist;
   vts_tab_freelist = ii;
}

/* Get an entry from the free list.  This will return VtsID_INVALID if
   the free list is empty. */
static VtsID get_from_free_list ( void )
{
   VtsID  ii;
   VtsTE* ie;
   if (vts_tab_freelist == VtsID_INVALID)
      return VtsID_INVALID;
   ii = vts_tab_freelist;
   ie = VG_(indexXA)( vts_tab, ii );
   tl_assert(ie->vts == NULL);
   tl_assert(ie->rc == 0);
   vts_tab_freelist = ie->freelink;
   return ii;
}

/* Produce a new VtsID that can be used, either by getting it from
   the freelist, or, if that is empty, by expanding vts_tab. */
static VtsID get_new_VtsID ( void )
{
   VtsID ii;
   VtsTE te;
   ii = get_from_free_list();
   if (ii != VtsID_INVALID)
      return ii;
   te.vts = NULL;
   te.rc = 0;
   te.freelink = VtsID_INVALID;
   te.remap    = VtsID_INVALID;
   ii = (VtsID)VG_(addToXA)( vts_tab, &te );
   return ii;
}

/* Indirect callback from lib_zsm. */
static void VtsID__rcinc ( VtsID ii )
{
   VtsTE* ie;
   /* VG_(indexXA) does a range check for us */
   ie = VG_(indexXA)( vts_tab, ii );
   tl_assert(ie->vts); /* else it's not in use */
   tl_assert(ie->rc < ~0UL); /* else we can't continue */
   tl_assert(ie->vts->id == ii);
   ie->rc++;
}

/* Indirect callback from lib_zsm. */
static void VtsID__rcdec ( VtsID ii )
{
   VtsTE* ie;
   /* VG_(indexXA) does a range check for us */
   ie = VG_(indexXA)( vts_tab, ii );
   tl_assert(ie->vts); /* else it's not in use */
   tl_assert(ie->rc > 0); /* else RC snafu */
   tl_assert(ie->vts->id == ii);
   ie->rc--;
}

/* Look up 'cand' in our collection of VTSs.  If present, return the
   VtsID for the pre-existing version.  If not present, clone it, add
   the clone to both vts_tab and vts_set, allocate a fresh VtsID for
   it, and return that. */
static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
{
   VTS* in_tab = NULL;
   tl_assert(cand->id == VtsID_INVALID);
   Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
   tl_assert(in_tab);
   if (already_have) {
      /* We already have a copy of 'cand'.  Use that. */
      VtsTE* ie;
      tl_assert(in_tab->id != VtsID_INVALID);
      ie = VG_(indexXA)( vts_tab, in_tab->id );
      tl_assert(ie->vts == in_tab);
      return in_tab->id;
   } else {
      VtsID  ii = get_new_VtsID();
      VtsTE* ie = VG_(indexXA)( vts_tab, ii );
      ie->vts = in_tab;
      ie->rc = 0;
      ie->freelink = VtsID_INVALID;
      in_tab->id = ii;
      return ii;
   }
}

static void show_vts_stats ( HChar* caller )
{
   UWord nSet, nTab, nLive;
   ULong totrc;
   UWord n, i;
   nSet = VG_(sizeFM)( vts_set );
   nTab = VG_(sizeXA)( vts_tab );
   totrc = 0;
   nLive = 0;
   n = VG_(sizeXA)( vts_tab );
   for (i = 0; i < n; i++) {
      VtsTE* ie = VG_(indexXA)( vts_tab, i );
      if (ie->vts) {
         nLive++;
         totrc += (ULong)ie->rc;
      } else {
         tl_assert(ie->rc == 0);
      }
   }
   VG_(printf)("  show_vts_stats %s\n", caller);
   VG_(printf)("    vts_tab size %4lu\n", nTab);
   VG_(printf)("    vts_tab live %4lu\n", nLive);
   VG_(printf)("    vts_set size %4lu\n", nSet);
   VG_(printf)("        total rc %4llu\n", totrc);
}

/* --- Helpers for VtsID pruning --- */

static
void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
                   /*MOD*/XArray* /* of VtsTE */ new_tab,
                   VtsID* ii )
{
   VtsTE *old_te, *new_te;
   VtsID old_id, new_id;
   /* We're relying here on VG_(indexXA)'s range checking to assert on
      any stupid values, in particular *ii == VtsID_INVALID. */
   old_id = *ii;
   old_te = VG_(indexXA)( old_tab, old_id );
   old_te->rc--;
   new_id = old_te->remap;
   new_te = VG_(indexXA)( new_tab, new_id );
   new_te->rc++;
   *ii = new_id;
}

static
void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
                            /*MOD*/XArray* /* of VtsTE */ new_tab,
                            SVal* s )
{
   SVal old_sv, new_sv;
   old_sv = *s;
   if (SVal__isC(old_sv)) {
      VtsID rMin, wMin;
      rMin = SVal__unC_Rmin(old_sv);
      wMin = SVal__unC_Wmin(old_sv);
      remap_VtsID( old_tab, new_tab, &rMin );
      remap_VtsID( old_tab, new_tab, &wMin );
      new_sv = SVal__mkC( rMin, wMin );
      *s = new_sv;
  }
}

/* NOT TO BE CALLED FROM WITHIN libzsm. */
__attribute__((noinline))
static void vts_tab__do_GC ( Bool show_stats )
{
   UWord i, nTab, nLive, nFreed;

/* ---------- BEGIN VTS GC ---------- */
   /* check this is actually necessary. */
   tl_assert(vts_tab_freelist == VtsID_INVALID);

/* empty the caches for partial order checks and binary joins.  We
      could do better and prune out the entries to be deleted, but it
      ain't worth the hassle. */
   VtsID__invalidate_caches();

/* First, make the reference counts up to date. */
   zsm_flush_cache();

nTab = VG_(sizeXA)( vts_tab );

if (show_stats) {
      VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
      show_vts_stats("before GC");
   }

/* Now we can inspect the entire vts_tab.  Any entries with zero
      .rc fields are now no longer in use and can be put back on the
      free list, removed from vts_set, and deleted. */
   nFreed = 0;
   for (i = 0; i < nTab; i++) {
      Bool present;
      UWord oldK = 0, oldV = 12345;
      VtsTE* te = VG_(indexXA)( vts_tab, i );
      if (te->vts == NULL) {
         tl_assert(te->rc == 0);
         continue; /* already on the free list (presumably) */
      }
      if (te->rc > 0)
         continue; /* in use */
      /* Ok, we got one we can free. */
      tl_assert(te->vts->id == i);
      /* first, remove it from vts_set. */
      present = VG_(delFromFM)( vts_set,
                                &oldK, &oldV, (UWord)te->vts );
      tl_assert(present); /* else it isn't in vts_set ?! */
      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
      tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
      /* now free the VTS itself */
      VTS__delete(te->vts);
      te->vts = NULL;
      /* and finally put this entry on the free list */
      tl_assert(te->freelink == VtsID_INVALID); /* can't already be on it */
      add_to_free_list( i );
      nFreed++;
   }

/* Now figure out when the next GC should be.  We'll allow the
      number of VTSs to double before GCing again.  Except of course
      that since we can't (or, at least, don't) shrink vts_tab, we
      can't set the threshhold value smaller than it. */
   tl_assert(nFreed <= nTab);
   nLive = nTab - nFreed;
   tl_assert(nLive >= 0 && nLive <= nTab);
   vts_next_GC_at = 2 * nLive;
   if (vts_next_GC_at < nTab)
      vts_next_GC_at = nTab;

if (show_stats) {
      show_vts_stats("after GC");
      VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
   }

if (VG_(clo_stats)) {
      static UInt ctr = 1;
      tl_assert(nTab > 0);
      VG_(message)(Vg_DebugMsg,
                  "libhb: VTS GC: #%u  old size %lu  live %lu  (%2llu%%)\n",
                  ctr++, nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
   }
   /* ---------- END VTS GC ---------- */

/* Decide whether to do VTS pruning.  We have one of three
      settings. */
   static UInt pruning_auto_ctr = 0; /* do not make non-static */

Bool do_pruning = False;
   switch (HG_(clo_vts_pruning)) {
      case 0: /* never */
         break;
      case 1: /* auto */
         do_pruning = (++pruning_auto_ctr % 5) == 0;
         break;
      case 2: /* always */
         do_pruning = True;
         break;
      default:
         tl_assert(0);
   }

/* The rest of this routine only handles pruning, so we can
      quit at this point if it is not to be done. */
   if (!do_pruning)
      return;

/* ---------- BEGIN VTS PRUNING ---------- */
   /* We begin by sorting the backing table on its .thr values, so as
      to (1) check they are unique [else something has gone wrong,
      since it means we must have seen some Thr* exiting more than
      once, which can't happen], and (2) so that we can quickly look
      up the dead-thread entries as we work through the VTSs. */
   VG_(sortXA)( verydead_thread_table );
   /* Sanity check: check for unique .sts.thr values. */
   UWord nBT = VG_(sizeXA)( verydead_thread_table );
   if (nBT > 0) {
      ThrID thrid1, thrid2;
      thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, 0 );
      for (i = 1; i < nBT; i++) {
         thrid1 = thrid2;
         thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, i );
         tl_assert(thrid1 < thrid2);
      }
   }
   /* Ok, so the dead thread table has unique and in-order keys. */

/* We will run through the old table, and create a new table and
      set, at the same time setting the .remap entries in the old
      table to point to the new entries.  Then, visit every VtsID in
      the system, and replace all of them with new ones, using the
      .remap entries in the old table.  Finally, we can delete the old
      table and set. */

XArray* /* of VtsTE */ new_tab
      = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
                    HG_(free), sizeof(VtsTE) );

/* WordFM VTS* void */
   WordFM* new_set
      = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
                    HG_(free),
                    (Word(*)(UWord,UWord))VTS__cmp_structural );

/* Visit each old VTS.  For each one:

* make a pruned version

* search new_set for the pruned version, yielding either
        Nothing (not present) or the new VtsID for it.

* if not present, allocate a new VtsID for it, insert (pruned
        VTS, new VtsID) in the tree, and set
        remap_table[old VtsID] = new VtsID.

* if present, set remap_table[old VtsID] = new VtsID, where
        new VtsID was determined by the tree lookup.  Then free up
        the clone.
   */

UWord nBeforePruning = 0, nAfterPruning = 0;
   UWord nSTSsBefore = 0, nSTSsAfter = 0;
   VtsID new_VtsID_ctr = 0;

for (i = 0; i < nTab; i++) {

/* For each old VTS .. */
      VtsTE* old_te  = VG_(indexXA)( vts_tab, i );
      VTS*   old_vts = old_te->vts;
      tl_assert(old_te->remap == VtsID_INVALID);

/* Skip it if not in use */
      if (old_te->rc == 0) {
         tl_assert(old_vts == NULL);
         continue;
      }
      tl_assert(old_vts != NULL);
      tl_assert(old_vts->id == i);
      tl_assert(old_vts->ts != NULL);

/* It is in use. Make a pruned version. */
      nBeforePruning++;
      nSTSsBefore += old_vts->usedTS;
      VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
                                   old_vts, verydead_thread_table);
      tl_assert(new_vts->sizeTS == new_vts->usedTS);
      tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
                == 0x0ddC0ffeeBadF00dULL);

/* Get rid of the old VTS and the tree entry.  It's a bit more
         complex to incrementally delete the VTSs now than to nuke
         them all after we're done, but the upside is that we don't
         wind up temporarily storing potentially two complete copies
         of each VTS and hence spiking memory use. */
      UWord oldK = 0, oldV = 12345;
      Bool  present = VG_(delFromFM)( vts_set,
                                      &oldK, &oldV, (UWord)old_vts );
      tl_assert(present); /* else it isn't in vts_set ?! */
      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
      tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
      /* now free the VTS itself */
      VTS__delete(old_vts);
      old_te->vts = NULL;
      old_vts = NULL;

/* NO MENTIONS of old_vts allowed beyond this point. */

/* Ok, we have the pruned copy in new_vts.  See if a
         structurally identical version is already present in new_set.
         If so, delete the one we just made and move on; if not, add
         it. */
      VTS*  identical_version = NULL;
      UWord valW = 12345;
      if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
                        (UWord)new_vts)) {
         // already have it
         tl_assert(valW == 0);
         tl_assert(identical_version != NULL);
         tl_assert(identical_version != new_vts);
         VTS__delete(new_vts);
         new_vts = identical_version;
         tl_assert(new_vts->id != VtsID_INVALID);
      } else {
         tl_assert(valW == 12345);
         tl_assert(identical_version == NULL);
         new_vts->id = new_VtsID_ctr++;
         Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
         tl_assert(!b);
         VtsTE new_te;
         new_te.vts      = new_vts;
         new_te.rc       = 0;
         new_te.freelink = VtsID_INVALID;
         new_te.remap    = VtsID_INVALID;
         Word j = VG_(addToXA)( new_tab, &new_te );
         tl_assert(j <= i);
         tl_assert(j == new_VtsID_ctr - 1);
         // stats
         nAfterPruning++;
         nSTSsAfter += new_vts->usedTS;
      }
      old_te->remap = new_vts->id;

} /* for (i = 0; i < nTab; i++) */

/* At this point, we have:
      * the old VTS table, with its .remap entries set,
        and with all .vts == NULL.
      * the old VTS tree should be empty, since it and the old VTSs
        it contained have been incrementally deleted was we worked
        through the old table.
      * the new VTS table, with all .rc == 0, all .freelink and .remap
        == VtsID_INVALID. 
      * the new VTS tree.
   */
   tl_assert( VG_(sizeFM)(vts_set) == 0 );

/* Now actually apply the mapping. */
   /* Visit all the VtsIDs in the entire system.  Where do we expect
      to find them?
      (a) in shadow memory -- the LineZs and LineFs
      (b) in our collection of struct _Thrs.
      (c) in our collection of struct _SOs.
      Nowhere else, AFAICS.  Not in the zsm cache, because that just
      got invalidated.

Using the .remap fields in vts_tab, map each old VtsID to a new
      VtsID.  For each old VtsID, dec its rc; and for each new one,
      inc it.  This sets up the new refcounts, and it also gives a
      cheap sanity check of the old ones: all old refcounts should be
      zero after this operation.
   */

/* Do the mappings for (a) above: iterate over the Primary shadow
      mem map (WordFM Addr SecMap*). */
   UWord secmapW = 0;
   VG_(initIterFM)( map_shmem );
   while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
      UWord   j;
      SecMap* sm = (SecMap*)secmapW;
      tl_assert(sm->magic == SecMap_MAGIC);
      /* Deal with the LineZs */
      for (i = 0; i < N_SECMAP_ZLINES; i++) {
         LineZ* lineZ = &sm->linesZ[i];
         if (lineZ->dict[0] == SVal_INVALID)
            continue; /* not in use -- data is in F rep instead */
         for (j = 0; j < 4; j++)
            remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
      }
      /* Deal with the LineFs */
      for (i = 0; i < sm->linesF_size; i++) {
         LineF* lineF = &sm->linesF[i];
         if (!lineF->inUse)
            continue;
         for (j = 0; j < N_LINE_ARANGE; j++)
            remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
      }
   }
   VG_(doneIterFM)( map_shmem );

/* Do the mappings for (b) above: visit our collection of struct
      _Thrs. */
   Thread* hgthread = get_admin_threads();
   tl_assert(hgthread);
   while (hgthread) {
      Thr* hbthr = hgthread->hbthr;
      tl_assert(hbthr);
      /* Threads that are listed in the prunable set have their viR
         and viW set to VtsID_INVALID, so we can't mess with them. */
      if (hbthr->llexit_done && hbthr->joinedwith_done) {
         tl_assert(hbthr->viR == VtsID_INVALID);
         tl_assert(hbthr->viW == VtsID_INVALID);
         hgthread = hgthread->admin;
         continue;
      }
      remap_VtsID( vts_tab, new_tab, &hbthr->viR );
      remap_VtsID( vts_tab, new_tab, &hbthr->viW );
      hgthread = hgthread->admin;
   }

/* Do the mappings for (c) above: visit the struct _SOs. */
   SO* so = admin_SO;
   while (so) {
      if (so->viR != VtsID_INVALID)
         remap_VtsID( vts_tab, new_tab, &so->viR );
      if (so->viW != VtsID_INVALID)
         remap_VtsID( vts_tab, new_tab, &so->viW );
      so = so->admin_next;
   }

/* So, we're nearly done (with this incredibly complex operation).
      Check the refcounts for the old VtsIDs all fell to zero, as
      expected.  Any failure is serious. */
   for (i = 0; i < nTab; i++) {
      VtsTE* te = VG_(indexXA)( vts_tab, i );
      tl_assert(te->vts == NULL);
      /* This is the assert proper.  Note we're also asserting
         zeroness for old entries which are unmapped (hence have
         .remap == VtsID_INVALID).  That's OK. */
      tl_assert(te->rc == 0);
   }

/* Install the new table and set. */
   VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
   vts_set = new_set;
   VG_(deleteXA)( vts_tab );
   vts_tab = new_tab;

/* The freelist of vts_tab entries is empty now, because we've
      compacted all of the live entries at the low end of the
      table. */
   vts_tab_freelist = VtsID_INVALID;

/* Sanity check vts_set and vts_tab. */

/* Because all the live entries got slid down to the bottom of vts_tab: */
   tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));

/* Assert that the vts_tab and vts_set entries point at each other
      in the required way */
   UWord wordK = 0, wordV = 0;
   VG_(initIterFM)( vts_set );
   while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
      tl_assert(wordK != 0);
      tl_assert(wordV == 0);
      VTS* vts = (VTS*)wordK;
      tl_assert(vts->id != VtsID_INVALID);
      VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
      tl_assert(te->vts == vts);
   }
   VG_(doneIterFM)( vts_set );

/* Also iterate over the table, and check each entry is
      plausible. */
   nTab = VG_(sizeXA)( vts_tab );
   for (i = 0; i < nTab; i++) {
      VtsTE* te = VG_(indexXA)( vts_tab, i );
      tl_assert(te->vts);
      tl_assert(te->vts->id == i);
      tl_assert(te->rc > 0); /* 'cos we just GC'd */
      tl_assert(te->freelink == VtsID_INVALID); /* in use */
      tl_assert(te->remap == VtsID_INVALID); /* not relevant */
   }

/* And we're done.  Bwahahaha. Ha. Ha. Ha. */
   if (VG_(clo_stats)) {
      static UInt ctr = 1;
      tl_assert(nTab > 0);
      VG_(message)(
         Vg_DebugMsg,
         "libhb: VTS PR: #%u  before %lu (avg sz %lu)  "
            "after %lu (avg sz %lu)\n",
         ctr++,
         nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
         nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
      );
   }
   if (0)
   VG_(printf)("VTQ: before pruning %lu (avg sz %lu), "
               "after pruning %lu (avg sz %lu)\n",
               nBeforePruning, nSTSsBefore / nBeforePruning,
               nAfterPruning, nSTSsAfter / nAfterPruning);
   /* ---------- END VTS PRUNING ---------- */
}

/////////////////////////////////////////////////////////
//                                                     //
// Vts IDs                                             //
//                                                     //
/////////////////////////////////////////////////////////

//////////////////////////
/* A temporary, max-sized VTS which is used as a temporary (the first
   argument) in VTS__singleton, VTS__tick and VTS__join operations. */
static VTS* temp_max_sized_VTS = NULL;

//////////////////////////
static ULong stats__cmpLEQ_queries = 0;
static ULong stats__cmpLEQ_misses  = 0;
static ULong stats__join2_queries  = 0;
static ULong stats__join2_misses   = 0;

static inline UInt ROL32 ( UInt w, Int n ) {
   w = (w << n) | (w >> (32-n));
   return w;
}
static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
   UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
   return hash % nTab;
}

#define N_CMPLEQ_CACHE 1023
static
   struct { VtsID vi1; VtsID vi2; Bool leq; }
   cmpLEQ_cache[N_CMPLEQ_CACHE];

#define N_JOIN2_CACHE 1023
static
   struct { VtsID vi1; VtsID vi2; VtsID res; }
   join2_cache[N_JOIN2_CACHE];

static void VtsID__invalidate_caches ( void ) {
   Int i;
   for (i = 0; i < N_CMPLEQ_CACHE; i++) {
      cmpLEQ_cache[i].vi1 = VtsID_INVALID;
      cmpLEQ_cache[i].vi2 = VtsID_INVALID;
      cmpLEQ_cache[i].leq = False;
   }
   for (i = 0; i < N_JOIN2_CACHE; i++) {
     join2_cache[i].vi1 = VtsID_INVALID;
     join2_cache[i].vi2 = VtsID_INVALID;
     join2_cache[i].res = VtsID_INVALID;
   }
}
//////////////////////////

//static Bool VtsID__is_valid ( VtsID vi ) {
//   VtsTE* ve;
//   if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
//      return False;
//   ve = VG_(indexXA)( vts_tab, vi );
//   if (!ve->vts)
//      return False;
//   tl_assert(ve->vts->id == vi);
//   return True;
//}

static VTS* VtsID__to_VTS ( VtsID vi ) {
   VtsTE* te = VG_(indexXA)( vts_tab, vi );
   tl_assert(te->vts);
   return te->vts;
}

static void VtsID__pp ( VtsID vi ) {
   HChar buf[100];
   VTS* vts = VtsID__to_VTS(vi);
   VTS__show( buf, sizeof(buf)-1, vts );
   buf[sizeof(buf)-1] = 0;
   VG_(printf)("%s", buf);
}

/* compute partial ordering relation of vi1 and vi2. */
__attribute__((noinline))
static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
   UInt hash;
   Bool leq;
   VTS  *v1, *v2;
   //if (vi1 == vi2) return True;
   tl_assert(vi1 != vi2);
   ////++
   stats__cmpLEQ_queries++;
   hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
   if (cmpLEQ_cache[hash].vi1 == vi1
       && cmpLEQ_cache[hash].vi2 == vi2)
      return cmpLEQ_cache[hash].leq;
   stats__cmpLEQ_misses++;
   ////--
   v1  = VtsID__to_VTS(vi1);
   v2  = VtsID__to_VTS(vi2);
   leq = VTS__cmpLEQ( v1, v2 ) == 0;
   ////++
   cmpLEQ_cache[hash].vi1 = vi1;
   cmpLEQ_cache[hash].vi2 = vi2;
   cmpLEQ_cache[hash].leq = leq;
   ////--
   return leq;
}
static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
   return LIKELY(vi1 == vi2)  ? True  : VtsID__cmpLEQ_WRK(vi1, vi2);
}

/* compute binary join */
__attribute__((noinline))
static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
   UInt  hash;
   VtsID res;
   VTS   *vts1, *vts2;
   //if (vi1 == vi2) return vi1;
   tl_assert(vi1 != vi2);
   ////++
   stats__join2_queries++;
   hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
   if (join2_cache[hash].vi1 == vi1
       && join2_cache[hash].vi2 == vi2)
      return join2_cache[hash].res;
   stats__join2_misses++;
   ////--
   vts1 = VtsID__to_VTS(vi1);
   vts2 = VtsID__to_VTS(vi2);
   temp_max_sized_VTS->usedTS = 0;
   VTS__join(temp_max_sized_VTS, vts1,vts2);
   res = vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
   ////++
   join2_cache[hash].vi1 = vi1;
   join2_cache[hash].vi2 = vi2;
   join2_cache[hash].res = res;
   ////--
   return res;
}
static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
   return LIKELY(vi1 == vi2)  ? vi1  : VtsID__join2_WRK(vi1, vi2);
}

/* create a singleton VTS, namely [thr:1] */
static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
   temp_max_sized_VTS->usedTS = 0;
   VTS__singleton(temp_max_sized_VTS, thr,tym);
   return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
}

/* tick operation, creates value 1 if specified index is absent */
static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
   VTS* vts = VtsID__to_VTS(vi);
   temp_max_sized_VTS->usedTS = 0;
   VTS__tick(temp_max_sized_VTS, idx,vts);
   return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
}

/* index into a VTS (only for assertions) */
static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
   VTS* vts = VtsID__to_VTS(vi);
   return VTS__indexAt_SLOW( vts, idx );
}

/* Assuming that !cmpLEQ(vi1, vi2), find the index of the first (or
   any, really) element in vi1 which is pointwise greater-than the
   corresponding element in vi2.  If no such element exists, return
   NULL.  This needs to be fairly quick since it is called every time
   a race is detected. */
static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
{
   VTS  *vts1, *vts2;
   Thr*  diffthr;
   ThrID diffthrid;
   tl_assert(vi1 != vi2);
   vts1 = VtsID__to_VTS(vi1);
   vts2 = VtsID__to_VTS(vi2);
   tl_assert(vts1 != vts2);
   diffthrid = VTS__cmpLEQ(vts1, vts2);
   diffthr = Thr__from_ThrID(diffthrid);
   tl_assert(diffthr); /* else they are LEQ ! */
   return diffthr;
}

/////////////////////////////////////////////////////////
//                                                     //
// Filters                                             //
//                                                     //
/////////////////////////////////////////////////////////

/* Forget everything we know -- clear the filter and let everything
   through.  This needs to be as fast as possible, since it is called
   every time the running thread changes, and every time a thread's
   vector clocks change, which can be quite frequent.  The obvious
   fast way to do this is simply to stuff in tags which we know are
   not going to match anything, since they're not aligned to the start
   of a line. */
static void Filter__clear ( Filter* fi, HChar* who )
{
   UWord i;
   if (0) VG_(printf)("  Filter__clear(%p, %s)\n", fi, who);
   for (i = 0; i < FI_NUM_LINES; i += 8) {
      fi->tags[i+0] = 1; /* impossible value -- cannot match */
      fi->tags[i+1] = 1;
      fi->tags[i+2] = 1;
      fi->tags[i+3] = 1;
      fi->tags[i+4] = 1;
      fi->tags[i+5] = 1;
      fi->tags[i+6] = 1;
      fi->tags[i+7] = 1;
   }
   tl_assert(i == FI_NUM_LINES);
}

/* Clearing an arbitrary range in the filter.  Unfortunately
   we have to do this due to core-supplied new/die-mem events. */

static void Filter__clear_1byte ( Filter* fi, Addr a )
{
   Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   FiLine* line   = &fi->lines[lineno];
   UWord   loff   = (a - atag) / 8;
   UShort  mask   = 0x3 << (2 * (a & 7));
   /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
   if (LIKELY( fi->tags[lineno] == atag )) {
      /* hit.  clear the bits. */
      UShort  u16  = line->u16s[loff];
      line->u16s[loff] = u16 & ~mask; /* clear them */
   } else {
      /* miss.  The filter doesn't hold this address, so ignore. */
   }
}

static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
{
   Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   FiLine* line   = &fi->lines[lineno];
   UWord   loff   = (a - atag) / 8;
   if (LIKELY( fi->tags[lineno] == atag )) {
      line->u16s[loff] = 0;
   } else {
    /* miss.  The filter doesn't hold this address, so ignore. */
   }
}

static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
{
  //VG_(printf)("%lu ", len);
   /* slowly do part preceding 8-alignment */
   while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
      Filter__clear_1byte( fi, a );
      a++;
      len--;
   }
   /* vector loop */
   while (len >= 8) {
      Filter__clear_8bytes_aligned( fi, a );
      a += 8;
      len -= 8;
   }
   /* slowly do tail */
   while (UNLIKELY(len > 0)) {
      Filter__clear_1byte( fi, a );
      a++;
      len--;
   }
}

/* ------ Read handlers for the filter. ------ */

static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
      return False;
   { 
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xAAAA;
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort u16  = line->u16s[loff];
        Bool   ok   = (u16 & mask) == mask; /* all R bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
      return False;
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 4 x R bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
      return False;
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xA << (2 * (a & 6));
     /* mask is A000, 0A00, 00A0 or 000A */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 2 x R bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
{
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0x2 << (2 * (a & 7));
     /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

/* ------ Write handlers for the filter. ------ */

static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
      return False;
   { 
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xFFFF;
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort u16  = line->u16s[loff];
        Bool   ok   = (u16 & mask) == mask; /* all R & W bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
      return False;
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 4 x R & W bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
      return False;
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xF << (2 * (a & 6));
     /* mask is F000, 0F00, 00F0 or 000F */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 2 x R & W bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
{
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0x3 << (2 * (a & 7));
     /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

/////////////////////////////////////////////////////////
//                                                     //
// Threads                                             //
//                                                     //
/////////////////////////////////////////////////////////

/* Maps ThrID values to their Thr*s (which contain ThrID values that
   should point back to the relevant slot in the array.  Lowest
   numbered slot (0) is for thrid = 1024, (1) is for 1025, etc. */
static XArray* /* of Thr* */ thrid_to_thr_map = NULL;

/* And a counter to dole out ThrID values.  For rationale/background,
   see comments on definition of ScalarTS (far) above. */
static ThrID thrid_counter = 1024; /* runs up to ThrID_MAX_VALID */

static ThrID Thr__to_ThrID ( Thr* thr ) {
   return thr->thrid;
}
static Thr* Thr__from_ThrID ( UInt thrid ) {
   Thr* thr = *(Thr**)VG_(indexXA)( thrid_to_thr_map, thrid - 1024 );
   tl_assert(thr->thrid == thrid);
   return thr;
}

static Thr* Thr__new ( void )
{
   Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
   thr->viR = VtsID_INVALID;
   thr->viW = VtsID_INVALID;
   thr->llexit_done = False;
   thr->joinedwith_done = False;
   thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
   /* We only really need this at history level 1, but unfortunately
      this routine is called before the command line processing is
      done (sigh), so we can't rely on HG_(clo_history_level) at this
      point.  Hence always allocate it.  Bah. */
   thr->local_Kws_n_stacks
      = VG_(newXA)( HG_(zalloc),
                    "libhb.Thr__new.3 (local_Kws_and_stacks)",
                    HG_(free), sizeof(ULong_n_EC) );

/* Add this Thr* <-> ThrID binding to the mapping, and
      cross-check */
   if (!thrid_to_thr_map) {
      thrid_to_thr_map = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.4",
                                     HG_(free), sizeof(Thr*) );
      tl_assert(thrid_to_thr_map);
   }

if (thrid_counter >= ThrID_MAX_VALID) {
      /* We're hosed.  We have to stop. */
      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   }

thr->thrid = thrid_counter++;
   Word ix = VG_(addToXA)( thrid_to_thr_map, &thr );
   tl_assert(ix + 1024 == thr->thrid);

return thr;
}

static void note_local_Kw_n_stack_for ( Thr* thr )
{
   Word       nPresent;
   ULong_n_EC pair;
   tl_assert(thr);

// We only collect this info at history level 1 (approx)
   if (HG_(clo_history_level) != 1) 
      return;

/* This is the scalar Kw for thr. */
   pair.ull = VtsID__indexAt( thr->viW, thr );
   pair.ec  = main_get_EC( thr );
   tl_assert(pair.ec);
   tl_assert(thr->local_Kws_n_stacks);

/* check that we're not adding duplicates */
   nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );

/* Throw away old stacks, if necessary.  We can't accumulate stuff
      indefinitely. */
   if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
      VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
      nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
      if (0)
         VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p (!!! gc !!!)\n",
                     thr, pair.ull, pair.ec );
   }

if (nPresent > 0) {
      ULong_n_EC* prevPair
         = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
      tl_assert( prevPair->ull <= pair.ull );
   }

if (nPresent == 0)
      pair.ec = NULL;

VG_(addToXA)( thr->local_Kws_n_stacks, &pair );

if (0)
      VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p\n",
                  thr, pair.ull, pair.ec );
   if (0)
      VG_(pp_ExeContext)(pair.ec);
}

static Int cmp__ULong_n_EC__by_ULong ( ULong_n_EC* pair1, ULong_n_EC* pair2 )
{
   if (pair1->ull < pair2->ull) return -1;
   if (pair1->ull > pair2->ull) return 1;
   return 0;
}

/////////////////////////////////////////////////////////
//                                                     //
// Shadow Values                                       //
//                                                     //
/////////////////////////////////////////////////////////

// type SVal, SVal_INVALID and SVal_NOACCESS are defined by
// hb_zsm.h.  We have to do everything else here.

/* SVal is 64 bit unsigned int.

<---------30--------->    <---------30--------->
   00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X   C(Rmin,Wmin)
   10 X--------------------X XX X--------------------X   A: SVal_NOACCESS
   11 0--------------------0 00 0--------------------0   A: SVal_INVALID

*/
#define SVAL_TAGMASK (3ULL << 62)

static inline Bool SVal__isC ( SVal s ) {
   return (0ULL << 62) == (s & SVAL_TAGMASK);
}
static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
   //tl_assert(VtsID__is_valid(rmini));
   //tl_assert(VtsID__is_valid(wmini));
   return (((ULong)rmini) << 32) | ((ULong)wmini);
}
static inline VtsID SVal__unC_Rmin ( SVal s ) {
   tl_assert(SVal__isC(s));
   return (VtsID)(s >> 32);
}
static inline VtsID SVal__unC_Wmin ( SVal s ) {
   tl_assert(SVal__isC(s));
   return (VtsID)(s & 0xFFFFFFFFULL);
}

static inline Bool SVal__isA ( SVal s ) {
   return (2ULL << 62) == (s & SVAL_TAGMASK);
}
static inline SVal SVal__mkA ( void ) {
   return 2ULL << 62;
}

/* Direct callback from lib_zsm. */
static void SVal__rcinc ( SVal s ) {
   if (SVal__isC(s)) {
      VtsID__rcinc( SVal__unC_Rmin(s) );
      VtsID__rcinc( SVal__unC_Wmin(s) );
   }
}

/* Direct callback from lib_zsm. */
static void SVal__rcdec ( SVal s ) {
   if (SVal__isC(s)) {
      VtsID__rcdec( SVal__unC_Rmin(s) );
      VtsID__rcdec( SVal__unC_Wmin(s) );
   }
}

/////////////////////////////////////////////////////////
//                                                     //
// A simple group (memory) allocator                   //
//                                                     //
/////////////////////////////////////////////////////////

//////////////// BEGIN general group allocator
typedef
   struct {
      UWord   elemSzB;        /* element size */
      UWord   nPerGroup;      /* # elems per group */
      void*   (*alloc)(HChar*, SizeT); /* group allocator */
      HChar*  cc; /* group allocator's cc */
      void    (*free)(void*); /* group allocator's free-er (unused) */
      /* XArray of void* (pointers to groups).  The groups themselves.
         Each element is a pointer to a block of size (elemSzB *
         nPerGroup) bytes. */
      XArray* groups;
      /* next free element.  Is a pointer to an element in one of the
         groups pointed to by .groups. */
      void* nextFree;
   }
   GroupAlloc;

static void init_GroupAlloc ( /*MOD*/GroupAlloc* ga,
                              UWord  elemSzB,
                              UWord  nPerGroup,
                              void*  (*alloc)(HChar*, SizeT),
                              HChar* cc,
                              void   (*free)(void*) )
{
   tl_assert(0 == (elemSzB % sizeof(UWord)));
   tl_assert(elemSzB >= sizeof(UWord));
   tl_assert(nPerGroup >= 100); /* let's say */
   tl_assert(alloc);
   tl_assert(cc);
   tl_assert(free);
   tl_assert(ga);
   VG_(memset)(ga, 0, sizeof(*ga));
   ga->elemSzB   = elemSzB;
   ga->nPerGroup = nPerGroup;
   ga->groups    = NULL;
   ga->alloc     = alloc;
   ga->cc        = cc;
   ga->free      = free;
   ga->groups    = VG_(newXA)( alloc, cc, free, sizeof(void*) );
   ga->nextFree  = NULL;
   tl_assert(ga->groups);
}

/* The freelist is empty.  Allocate a new group and put all the new
   elements in it onto the freelist. */
__attribute__((noinline))
static void gal_add_new_group ( GroupAlloc* ga ) 
{
   Word   i;
   UWord* group;
   tl_assert(ga);
   tl_assert(ga->nextFree == NULL);
   group = ga->alloc( ga->cc, ga->elemSzB * ga->nPerGroup );
   tl_assert(group);
   /* extend the freelist through the new group.  Place the freelist
      pointer in the first word of each element.  That's why the
      element size must be at least one word. */
   for (i = ga->nPerGroup-1; i >= 0; i--) {
      UChar* elemC = ((UChar*)group) + i * ga->elemSzB;
      UWord* elem  = (UWord*)elemC;
      tl_assert(0 == (((UWord)elem) % sizeof(UWord)));
      *elem = (UWord)ga->nextFree;
      ga->nextFree = elem;
   }
   /* and add to our collection of groups */
   VG_(addToXA)( ga->groups, &group );
}

inline static void* gal_Alloc ( GroupAlloc* ga )
{
   UWord* elem;
   if (UNLIKELY(ga->nextFree == NULL)) {
      gal_add_new_group(ga);
   }
   elem = ga->nextFree;
   ga->nextFree = (void*)*elem;
   *elem = 0; /* unnecessary, but just to be on the safe side */
   return elem;
}

inline static void* gal_Alloc_w_size_check ( GroupAlloc* ga, SizeT n )
{
   tl_assert(n == ga->elemSzB);
   return gal_Alloc( ga );
}

inline static void gal_Free ( GroupAlloc* ga, void* p )
{
   UWord* elem = (UWord*)p;
   *elem = (UWord)ga->nextFree;
   ga->nextFree = elem;
}
//////////////// END general group allocator

/////////////////////////////////////////////////////////
//                                                     //
// Change-event map2                                   //
//                                                     //
/////////////////////////////////////////////////////////

#define EVENT_MAP_GC_DISCARD_FRACTION  0.5

/* This is in two parts:

1. A hash table of RCECs.  This is a set of reference-counted stack
      traces.  When the reference count of a stack trace becomes zero,
      it is removed from the set and freed up.  The intent is to have
      a set of stack traces which can be referred to from (2), but to
      only represent each one once.  The set is indexed/searched by
      ordering on the stack trace vectors.

2. A SparseWA of OldRefs.  These store information about each old
      ref that we need to record.  It is indexed by address of the
      location for which the information is recorded.  For LRU
      purposes, each OldRef also contains a generation number,
      indicating when it was most recently accessed.

The important part of an OldRef is, however, its accs[] array.
      This is an array of N_OLDREF_ACCS which binds (thread, R/W,
      size) triples to RCECs.  This allows us to collect the last
      access-traceback by up to N_OLDREF_ACCS different triples for
      this location.  The accs[] array is a MTF-array.  If a binding
      falls off the end, that's too bad -- we will lose info about
      that triple's access to this location.

When the SparseWA becomes too big, we can throw away the OldRefs
      whose generation numbers are below some threshold; hence doing
      approximate LRU discarding.  For each discarded OldRef we must
      of course decrement the reference count on the all RCECs it
      refers to, in order that entries from (1) eventually get
      discarded too.

A major improvement in reliability of this mechanism would be to
   have a dynamically sized OldRef.accs[] array, so no entries ever
   fall off the end.  In investigations (Dec 08) it appears that a
   major cause for the non-availability of conflicting-access traces
   in race reports is caused by the fixed size of this array.  I
   suspect for most OldRefs, only a few entries are used, but for a
   minority of cases there is an overflow, leading to info lossage.
   Investigations also suggest this is very workload and scheduling
   sensitive.  Therefore a dynamic sizing would be better.

However, dynamic sizing would defeat the use of a GroupAllocator
   for OldRef structures.  And that's important for performance.  So
   it's not straightforward to do.
*/

static UWord stats__ctxt_rcdec1 = 0;
static UWord stats__ctxt_rcdec2 = 0;
static UWord stats__ctxt_rcdec3 = 0;
static UWord stats__ctxt_rcdec_calls = 0;
static UWord stats__ctxt_rcdec_discards = 0;
static UWord stats__ctxt_rcdec1_eq = 0;

static UWord stats__ctxt_tab_curr = 0;
static UWord stats__ctxt_tab_max  = 0;

static UWord stats__ctxt_tab_qs   = 0;
static UWord stats__ctxt_tab_cmps = 0;

///////////////////////////////////////////////////////
//// Part (1): A hash table of RCECs
///

#define N_FRAMES 8

// (UInt) `echo "Reference Counted Execution Context" | md5sum`
#define RCEC_MAGIC 0xab88abb2UL

//#define N_RCEC_TAB 98317 /* prime */
#define N_RCEC_TAB 196613 /* prime */

typedef
   struct _RCEC {
      UWord magic;  /* sanity check only */
      struct _RCEC* next;
      UWord rc;
      UWord rcX; /* used for crosschecking */
      UWord frames_hash;          /* hash of all the frames */
      UWord frames[N_FRAMES];
   }
   RCEC;

static RCEC** contextTab = NULL; /* hash table of RCEC*s */

/* Gives an arbitrary total order on RCEC .frames fields */
static Word RCEC__cmp_by_frames ( RCEC* ec1, RCEC* ec2 ) {
   Word i;
   tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
   tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
   if (ec1->frames_hash < ec2->frames_hash) return -1;
   if (ec1->frames_hash > ec2->frames_hash) return  1;
   for (i = 0; i < N_FRAMES; i++) {
      if (ec1->frames[i] < ec2->frames[i]) return -1;
      if (ec1->frames[i] > ec2->frames[i]) return  1;
   }
   return 0;
}

/* Dec the ref of this RCEC. */
static void ctxt__rcdec ( RCEC* ec )
{
   stats__ctxt_rcdec_calls++;
   tl_assert(ec && ec->magic == RCEC_MAGIC);
   tl_assert(ec->rc > 0);
   ec->rc--;
}

static void ctxt__rcinc ( RCEC* ec )
{
   tl_assert(ec && ec->magic == RCEC_MAGIC);
   ec->rc++;
}

//////////// BEGIN RCEC group allocator
static GroupAlloc rcec_group_allocator;

static RCEC* alloc_RCEC ( void ) {
   return gal_Alloc ( &rcec_group_allocator );
}

static void free_RCEC ( RCEC* rcec ) {
   tl_assert(rcec->magic == RCEC_MAGIC);
   gal_Free( &rcec_group_allocator, rcec );
}
//////////// END RCEC group allocator

/* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
   move it one step closer the the front of the list, so as to make
   subsequent searches for it cheaper. */
static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
{
   RCEC *ec0, *ec1, *ec2;
   if (ec == *headp)
      tl_assert(0); /* already at head of list */
   tl_assert(ec != NULL);
   ec0 = *headp;
   ec1 = NULL;
   ec2 = NULL;
   while (True) {
      if (ec0 == NULL || ec0 == ec) break;
      ec2 = ec1;
      ec1 = ec0;
      ec0 = ec0->next;
   }
   tl_assert(ec0 == ec);
   if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
      RCEC* tmp;
      /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
         predecessor.  Swap ec0 and ec1, that is, move ec0 one step
         closer to the start of the list. */
      tl_assert(ec2->next == ec1);
      tl_assert(ec1->next == ec0);
      tmp = ec0->next;
      ec2->next = ec0;
      ec0->next = ec1;
      ec1->next = tmp;
   }
   else
   if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
      /* it's second in the list. */
      tl_assert(*headp == ec1);
      tl_assert(ec1->next == ec0);
      ec1->next = ec0->next;
      ec0->next = ec1;
      *headp = ec0;
   }
}

/* Find the given RCEC in the tree, and return a pointer to it.  Or,
   if not present, add the given one to the tree (by making a copy of
   it, so the caller can immediately deallocate the original) and
   return a pointer to the copy.  The caller can safely have 'example'
   on its stack, since we will always return a pointer to a copy of
   it, not to the original.  Note that the inserted node will have .rc
   of zero and so the caller must immediatly increment it. */
__attribute__((noinline))
static RCEC* ctxt__find_or_add ( RCEC* example )
{
   UWord hent;
   RCEC* copy;
   tl_assert(example && example->magic == RCEC_MAGIC);
   tl_assert(example->rc == 0);

/* Search the hash table to see if we already have it. */
   stats__ctxt_tab_qs++;
   hent = example->frames_hash % N_RCEC_TAB;
   copy = contextTab[hent];
   while (1) {
      if (!copy) break;
      tl_assert(copy->magic == RCEC_MAGIC);
      stats__ctxt_tab_cmps++;
      if (0 == RCEC__cmp_by_frames(copy, example)) break;
      copy = copy->next;
   }

if (copy) {
      tl_assert(copy != example);
      /* optimisation: if it's not at the head of its list, move 1
         step fwds, to make future searches cheaper */
      if (copy != contextTab[hent]) {
         move_RCEC_one_step_forward( &contextTab[hent], copy );
      }
   } else {
      copy = alloc_RCEC();
      tl_assert(copy != example);
      *copy = *example;
      copy->next = contextTab[hent];
      contextTab[hent] = copy;
      stats__ctxt_tab_curr++;
      if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
         stats__ctxt_tab_max = stats__ctxt_tab_curr;
   }
   return copy;
}

static inline UWord ROLW ( UWord w, Int n )
{
   Int bpw = 8 * sizeof(UWord);
   w = (w << n) | (w >> (bpw-n));
   return w;
}

__attribute__((noinline))
static RCEC* get_RCEC ( Thr* thr )
{
   UWord hash, i;
   RCEC  example;
   example.magic = RCEC_MAGIC;
   example.rc = 0;
   example.rcX = 0;
   main_get_stacktrace( thr, &example.frames[0], N_FRAMES );
   hash = 0;
   for (i = 0; i < N_FRAMES; i++) {
      hash ^= example.frames[i];
      hash = ROLW(hash, 19);
   }
   example.frames_hash = hash;
   return ctxt__find_or_add( &example );
}

///////////////////////////////////////////////////////
//// Part (2):
///  A SparseWA guest-addr -> OldRef, that refers to (1)
///

// (UInt) `echo "Old Reference Information" | md5sum`
#define OldRef_MAGIC 0x30b1f075UL

/* Records an access: a thread, a context (size & writeness) and the
   number of held locks. The size (1,2,4,8) is encoded as 00 = 1, 01 =
   2, 10 = 4, 11 = 8. 
*/
typedef
   struct { 
      RCEC*     rcec;
      WordSetID locksHeldW;
      UInt      thrid  : SCALARTS_N_THRBITS;
      UInt      szLg2B : 2;
      UInt      isW    : 1;
   }
   Thr_n_RCEC;

#define N_OLDREF_ACCS 5

typedef
   struct {
      UWord magic;  /* sanity check only */
      UWord gen;    /* when most recently accessed */
                    /* or free list when not in use */
      /* unused slots in this array have .thrid == 0, which is invalid */
      Thr_n_RCEC accs[N_OLDREF_ACCS];
   }
   OldRef;

//////////// BEGIN OldRef group allocator
static GroupAlloc oldref_group_allocator;

static OldRef* alloc_OldRef ( void ) {
   return gal_Alloc ( &oldref_group_allocator );
}

static void free_OldRef ( OldRef* r ) {
   tl_assert(r->magic == OldRef_MAGIC);
   gal_Free( &oldref_group_allocator, r );
}
//////////// END OldRef group allocator

static SparseWA* oldrefTree     = NULL; /* SparseWA* OldRef* */
static UWord     oldrefGen      = 0;    /* current LRU generation # */
static UWord     oldrefTreeN    = 0;    /* # elems in oldrefTree */
static UWord     oldrefGenIncAt = 0;    /* inc gen # when size hits this */

inline static UInt min_UInt ( UInt a, UInt b ) {
   return a < b ? a : b;
}

/* Compare the intervals [a1,a1+n1) and [a2,a2+n2).  Return -1 if the
   first interval is lower, 1 if the first interval is higher, and 0
   if there is any overlap.  Redundant paranoia with casting is there
   following what looked distinctly like a bug in gcc-4.1.2, in which
   some of the comparisons were done signedly instead of
   unsignedly. */
/* Copied from exp-ptrcheck/sg_main.c */
static Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
                                     Addr a2, SizeT n2 ) {
   UWord a1w = (UWord)a1;
   UWord n1w = (UWord)n1;
   UWord a2w = (UWord)a2;
   UWord n2w = (UWord)n2;
   tl_assert(n1w > 0 && n2w > 0);
   if (a1w + n1w <= a2w) return -1L;
   if (a2w + n2w <= a1w) return 1L;
   return 0;
}

static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
{
   OldRef* ref;
   RCEC*   rcec;
   Word    i, j;
   UWord   keyW, valW;
   Bool    b;

tl_assert(thr);
   ThrID thrid = thr->thrid;
   tl_assert(thrid != 0); /* zero is used to denote an empty slot. */

WordSetID locksHeldW = thr->hgthread->locksetW;

rcec = get_RCEC( thr );
   ctxt__rcinc(rcec);

UInt szLg2B = 0;
   switch (szB) {
      /* This doesn't look particularly branch-predictor friendly. */
      case 1:  szLg2B = 0; break;
      case 2:  szLg2B = 1; break;
      case 4:  szLg2B = 2; break;
      case 8:  szLg2B = 3; break;
      default: tl_assert(0);
   }

/* Look in the map to see if we already have a record for this
      address. */
   b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, a );

if (b) {

/* We already have a record for this address.  We now need to
         see if we have a stack trace pertaining to this (thrid, R/W,
         size) triple. */
      tl_assert(keyW == a);
      ref = (OldRef*)valW;
      tl_assert(ref->magic == OldRef_MAGIC);

for (i = 0; i < N_OLDREF_ACCS; i++) {
         if (ref->accs[i].thrid != thrid)
            continue;
         if (ref->accs[i].szLg2B != szLg2B)
            continue;
         if (ref->accs[i].isW != (UInt)(isW & 1))
            continue;
         /* else we have a match, so stop looking. */
         break;
      }

if (i < N_OLDREF_ACCS) {
         /* thread 'thr' has an entry at index 'i'.  Update its RCEC. */
         if (i > 0) {
            Thr_n_RCEC tmp = ref->accs[i-1];
            ref->accs[i-1] = ref->accs[i];
            ref->accs[i] = tmp;
            i--;
         }
         if (rcec == ref->accs[i].rcec) stats__ctxt_rcdec1_eq++;
         stats__ctxt_rcdec1++;
         ctxt__rcdec( ref->accs[i].rcec );
         tl_assert(ref->accs[i].thrid == thrid);
         /* Update the RCEC and the W-held lockset. */
         ref->accs[i].rcec       = rcec;
         ref->accs[i].locksHeldW = locksHeldW;
      } else {
         /* No entry for this (thread, R/W, size, nWHeld) quad.
            Shuffle all of them down one slot, and put the new entry
            at the start of the array. */
         if (ref->accs[N_OLDREF_ACCS-1].thrid != 0) {
            /* the last slot is in use.  We must dec the rc on the
               associated rcec. */
            tl_assert(ref->accs[N_OLDREF_ACCS-1].rcec);
            stats__ctxt_rcdec2++;
            if (0 && 0 == (stats__ctxt_rcdec2 & 0xFFF))
               VG_(printf)("QQQQ %lu overflows\n",stats__ctxt_rcdec2);
            ctxt__rcdec( ref->accs[N_OLDREF_ACCS-1].rcec );
         } else {
            tl_assert(!ref->accs[N_OLDREF_ACCS-1].rcec);
         }
         for (j = N_OLDREF_ACCS-1; j >= 1; j--)
            ref->accs[j] = ref->accs[j-1];
         ref->accs[0].thrid      = thrid;
         ref->accs[0].szLg2B     = szLg2B;
         ref->accs[0].isW        = (UInt)(isW & 1);
         ref->accs[0].locksHeldW = locksHeldW;
         ref->accs[0].rcec       = rcec;
         /* thrid==0 is used to signify an empty slot, so we can't
            add zero thrid (such a ThrID is invalid anyway). */
         /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
      }

ref->gen = oldrefGen;

} else {

/* We don't have a record for this address.  Create a new one. */
      if (oldrefTreeN >= oldrefGenIncAt) {
         oldrefGen++;
         oldrefGenIncAt = oldrefTreeN + 50000;
         if (0) VG_(printf)("oldrefTree: new gen %lu at size %lu\n",
                            oldrefGen, oldrefTreeN );
      }

ref = alloc_OldRef();
      ref->magic = OldRef_MAGIC;
      ref->gen   = oldrefGen;
      ref->accs[0].thrid      = thrid;
      ref->accs[0].szLg2B     = szLg2B;
      ref->accs[0].isW        = (UInt)(isW & 1);
      ref->accs[0].locksHeldW = locksHeldW;
      ref->accs[0].rcec       = rcec;

/* thrid==0 is used to signify an empty slot, so we can't
         add zero thrid (such a ThrID is invalid anyway). */
      /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */

/* Clear out the rest of the entries */
      for (j = 1; j < N_OLDREF_ACCS; j++) {
         ref->accs[j].rcec       = NULL;
         ref->accs[j].thrid      = 0;
         ref->accs[j].szLg2B     = 0;
         ref->accs[j].isW        = 0;
         ref->accs[j].locksHeldW = 0;
      }
      VG_(addToSWA)( oldrefTree, a, (UWord)ref );
      oldrefTreeN++;

}
}

/* Extract info from the conflicting-access machinery. */
Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
                              /*OUT*/Thr**        resThr,
                              /*OUT*/SizeT*       resSzB,
                              /*OUT*/Bool*        resIsW,
                              /*OUT*/WordSetID*   locksHeldW,
                              Thr* thr, Addr a, SizeT szB, Bool isW )
{
   Word    i, j;
   OldRef* ref;
   UWord   keyW, valW;
   Bool    b;

ThrID     cand_thrid;
   RCEC*     cand_rcec;
   Bool      cand_isW;
   SizeT     cand_szB;
   WordSetID cand_locksHeldW;
   Addr      cand_a;

Addr toCheck[15];
   Int  nToCheck = 0;

tl_assert(thr);
   tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);

ThrID thrid = thr->thrid;

toCheck[nToCheck++] = a;
   for (i = -7; i < (Word)szB; i++) {
      if (i != 0)
         toCheck[nToCheck++] = a + i;
   }
   tl_assert(nToCheck <= 15);

/* Now see if we can find a suitable matching event for
      any of the addresses in toCheck[0 .. nToCheck-1]. */
   for (j = 0; j < nToCheck; j++) {

cand_a = toCheck[j];
      //      VG_(printf)("test %ld %p\n", j, cand_a);

b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, cand_a );
      if (!b)
         continue;

ref = (OldRef*)valW;
      tl_assert(keyW == cand_a);
      tl_assert(ref->magic == OldRef_MAGIC);
      tl_assert(ref->accs[0].thrid != 0); /* first slot must always be used */

cand_thrid      = 0; /* invalid; see comments in event_map_bind */
      cand_rcec       = NULL;
      cand_isW        = False;
      cand_szB        = 0;
      cand_locksHeldW = 0; /* always valid; see initialise_data_structures() */

for (i = 0; i < N_OLDREF_ACCS; i++) {
         Thr_n_RCEC* cand = &ref->accs[i];
         cand_rcec       = cand->rcec;
         cand_thrid      = cand->thrid;
         cand_isW        = (Bool)cand->isW;
         cand_szB        = 1 << cand->szLg2B;
         cand_locksHeldW = cand->locksHeldW;

if (cand_thrid == 0) 
            /* This slot isn't in use.  Ignore it. */
            continue;

if (cand_thrid == thrid)
            /* This is an access by the same thread, but we're only
               interested in accesses from other threads.  Ignore. */
            continue;

if ((!cand_isW) && (!isW))
            /* We don't want to report a read racing against another
               read; that's stupid.  So in this case move on. */
            continue;

if (cmp_nonempty_intervals(a, szB, cand_a, cand_szB) != 0)
            /* No overlap with the access we're asking about.  Ignore. */
            continue;

/* We have a match.  Stop searching. */
         break;
      }

tl_assert(i >= 0 && i <= N_OLDREF_ACCS);

if (i < N_OLDREF_ACCS) {
         Int n, maxNFrames;
         /* return with success */
         tl_assert(cand_thrid);
         tl_assert(cand_rcec);
         tl_assert(cand_rcec->magic == RCEC_MAGIC);
         tl_assert(cand_szB >= 1);
         /* Count how many non-zero frames we have. */
         maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size));
         for (n = 0; n < maxNFrames; n++) {
            if (0 == cand_rcec->frames[n]) break;
         }
         *resEC      = VG_(make_ExeContext_from_StackTrace)
                          (cand_rcec->frames, n);
         *resThr     = Thr__from_ThrID(cand_thrid);
         *resSzB     = cand_szB;
         *resIsW     = cand_isW;
         *locksHeldW = cand_locksHeldW;
         return True;
      }

/* consider next address in toCheck[] */
   } /* for (j = 0; j < nToCheck; j++) */

/* really didn't find anything. */
   return False;
}

static void event_map_init ( void )
{
   Word i;

/* Context (RCEC) group allocator */
   init_GroupAlloc ( &rcec_group_allocator,
                     sizeof(RCEC),
                     1000 /* RCECs per group */,
                     HG_(zalloc),
                     "libhb.event_map_init.1 (RCEC groups)",
                     HG_(free) );

/* Context table */
   tl_assert(!contextTab);
   contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
                             N_RCEC_TAB * sizeof(RCEC*) );
   tl_assert(contextTab);
   for (i = 0; i < N_RCEC_TAB; i++)
      contextTab[i] = NULL;

/* Oldref group allocator */
   init_GroupAlloc ( &oldref_group_allocator,
                     sizeof(OldRef),
                     1000 /* OldRefs per group */,
                     HG_(zalloc),
                     "libhb.event_map_init.3 (OldRef groups)",
                     HG_(free) );

/* Oldref tree */
   tl_assert(!oldrefTree);
   oldrefTree = VG_(newSWA)(
                   HG_(zalloc),
                   "libhb.event_map_init.4 (oldref tree)", 
                   HG_(free)
                );
   tl_assert(oldrefTree);

oldrefGen = 0;
   oldrefGenIncAt = 0;
   oldrefTreeN = 0;
}

static void event_map__check_reference_counts ( Bool before )
{
   RCEC*   rcec;
   OldRef* oldref;
   Word    i;
   UWord   nEnts = 0;
   UWord   keyW, valW;

/* Set the 'check' reference counts to zero.  Also, optionally
      check that the real reference counts are non-zero.  We allow
      these to fall to zero before a GC, but the GC must get rid of
      all those that are zero, hence none should be zero after a
      GC. */
   for (i = 0; i < N_RCEC_TAB; i++) {
      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
         nEnts++;
         tl_assert(rcec);
         tl_assert(rcec->magic == RCEC_MAGIC);
         if (!before)
            tl_assert(rcec->rc > 0);
         rcec->rcX = 0;
      }
   }

/* check that the stats are sane */
   tl_assert(nEnts == stats__ctxt_tab_curr);
   tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);

/* visit all the referencing points, inc check ref counts */
   VG_(initIterSWA)( oldrefTree );
   while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
      oldref = (OldRef*)valW;
      tl_assert(oldref->magic == OldRef_MAGIC);
      for (i = 0; i < N_OLDREF_ACCS; i++) {
         ThrID aThrID = oldref->accs[i].thrid;
         RCEC* aRef   = oldref->accs[i].rcec;
         if (aThrID != 0) {
            tl_assert(aRef);
            tl_assert(aRef->magic == RCEC_MAGIC);
            aRef->rcX++;
         } else {
            tl_assert(!aRef);
         }
      }
   }

/* compare check ref counts with actual */
   for (i = 0; i < N_RCEC_TAB; i++) {
      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
         tl_assert(rcec->rc == rcec->rcX);
      }
   }
}

__attribute__((noinline))
static void event_map_maybe_GC ( void )
{
   OldRef* oldref;
   UWord   keyW, valW, retained, maxGen;
   XArray* refs2del;
   Word    i, j, n2del;

UWord* genMap      = NULL;
   UWord  genMap_min  = 0;
   UWord  genMap_size = 0;

if (LIKELY(oldrefTreeN < HG_(clo_conflict_cache_size)))
      return;

if (0)
      VG_(printf)("libhb: event_map GC at size %lu\n", oldrefTreeN);

/* Check for sane command line params.  Limit values must match
      those in hg_process_cmd_line_option. */
   tl_assert( HG_(clo_conflict_cache_size) >= 10*1000 );
   tl_assert( HG_(clo_conflict_cache_size) <= 30*1000*1000 );

/* Check our counting is sane (expensive) */
   if (CHECK_CEM)
      tl_assert(oldrefTreeN == VG_(sizeSWA)( oldrefTree ));

/* Check the reference counts (expensive) */
   if (CHECK_CEM)
      event_map__check_reference_counts( True/*before*/ );

/* Compute the distribution of generation values in the ref tree.
      There are likely only to be a few different generation numbers
      in the whole tree, but we don't know what they are.  Hence use a
      dynamically resized array of counters.  The array is genMap[0
      .. genMap_size-1], where genMap[0] is the count for the
      generation number genMap_min, genMap[1] is the count for
      genMap_min+1, etc.  If a new number is seen outside the range
      [genMap_min .. genMap_min + genMap_size - 1] then the array is
      copied into a larger array, and genMap_min and genMap_size are
      adjusted accordingly. */

/* genMap :: generation-number -> count-of-nodes-with-that-number */

VG_(initIterSWA)( oldrefTree );
   while ( VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {

UWord ea, key;
       oldref = (OldRef*)valW;
       key = oldref->gen;

/* BEGIN find 'ea', which is the index in genMap holding the
         count for generation number 'key'. */
      if (UNLIKELY(genMap == NULL)) {
         /* deal with the first key to be seen, so that the following
            cases don't need to handle the complexity of a NULL count
            array. */
         genMap_min  = key;
         genMap_size = 1;
         genMap = HG_(zalloc)( "libhb.emmG.1a",
                                genMap_size * sizeof(UWord) );
         ea = 0;
         if (0) VG_(printf)("(%lu) case 1 [%lu .. %lu]\n",
                            key, genMap_min, genMap_min+genMap_size- 1 );
      }
      else
      if (LIKELY(key >= genMap_min && key < genMap_min + genMap_size)) {
         /* this is the expected (almost-always-happens) case: 'key'
            is already mapped in the array. */
         ea = key - genMap_min;
      }
      else
      if (key < genMap_min) {
         /* 'key' appears before the start of the current array.
            Extend the current array by allocating a larger one and
            copying the current one to the upper end of it. */
         Word   more;
         UWord* map2;
         more = genMap_min - key;
         tl_assert(more > 0);
         map2 = HG_(zalloc)( "libhb.emmG.1b",
                             (genMap_size + more) * sizeof(UWord) );
         VG_(memcpy)( &map2[more], genMap, genMap_size * sizeof(UWord) );
         HG_(free)( genMap );
         genMap = map2;
         genMap_size += more;
         genMap_min -= more;
         ea = 0;
         tl_assert(genMap_min == key);
         if (0) VG_(printf)("(%lu) case 2 [%lu .. %lu]\n",
                            key, genMap_min,  genMap_min+genMap_size- 1 );
      }
      else {
         /* 'key' appears after the end of the current array.  Extend
            the current array by allocating a larger one and copying
            the current one to the lower end of it. */
         Word   more;
         UWord* map2;
         tl_assert(key >= genMap_min + genMap_size);
         more = key - (genMap_min + genMap_size) + 1;
         tl_assert(more > 0);
         map2 = HG_(zalloc)( "libhb.emmG.1c",
                             (genMap_size + more) * sizeof(UWord) );
         VG_(memcpy)( &map2[0], genMap, genMap_size * sizeof(UWord) );
         HG_(free)( genMap );
         genMap = map2;
         genMap_size += more;
         ea = genMap_size - 1;;
         tl_assert(genMap_min + genMap_size - 1 == key);
         if (0) VG_(printf)("(%lu) case 3 [%lu .. %lu]\n",
                            key, genMap_min, genMap_min+genMap_size- 1 );
      }
      /* END find 'ea' from 'key' */

tl_assert(ea >= 0 && ea < genMap_size);
      /* and the whole point of this elaborate computation of 'ea' is .. */
      genMap[ea]++;
   }

tl_assert(genMap);
   tl_assert(genMap_size > 0);

/* Sanity check what we just computed */
   { UWord sum = 0;
     for (i = 0; i < genMap_size; i++) {
        if (0) VG_(printf)("  xxx: gen %ld has %lu\n",
                           i + genMap_min, genMap[i] );
        sum += genMap[i];
     }
     tl_assert(sum == oldrefTreeN);
   }

/* Figure out how many generations to throw away */
   retained = oldrefTreeN;
   maxGen = 0;

for (i = 0; i < genMap_size; i++) {
      keyW = i + genMap_min;
      valW = genMap[i];
      tl_assert(keyW > 0); /* can't allow a generation # 0 */
      if (0) VG_(printf)("  XXX: gen %lu has %lu\n", keyW, valW );
      tl_assert(keyW >= maxGen);
      tl_assert(retained >= valW);
      if (retained - valW
          > (UWord)(HG_(clo_conflict_cache_size) 
                    * EVENT_MAP_GC_DISCARD_FRACTION)) {
         retained -= valW;
         maxGen = keyW;
      } else {
         break;
      }
   }

HG_(free)(genMap);

tl_assert(retained >= 0 && retained <= oldrefTreeN);

/* Now make up a big list of the oldrefTree entries we want to
      delete.  We can't simultaneously traverse the tree and delete
      stuff from it, so first we need to copy them off somewhere
      else. (sigh) */
   refs2del = VG_(newXA)( HG_(zalloc), "libhb.emmG.2",
                          HG_(free), sizeof(Addr) );

if (retained < oldrefTreeN) {

/* This is the normal (expected) case.  We discard any ref whose
         generation number <= maxGen. */
      VG_(initIterSWA)( oldrefTree );
      while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
         oldref = (OldRef*)valW;
         tl_assert(oldref->magic == OldRef_MAGIC);
         if (oldref->gen <= maxGen) {
            VG_(addToXA)( refs2del, &keyW );
         }
      }
      if (VG_(clo_stats)) {
         VG_(message)(Vg_DebugMsg,
            "libhb: EvM GC: delete generations %lu and below, "
            "retaining %lu entries\n",
            maxGen, retained );
      }

} else {

static UInt rand_seed = 0; /* leave as static */

/* Degenerate case: there's only one generation in the entire
         tree, so we need to have some other way of deciding which
         refs to throw away.  Just throw out half of them randomly. */
      tl_assert(retained == oldrefTreeN);
      VG_(initIterSWA)( oldrefTree );
      while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
         UInt n;
         oldref = (OldRef*)valW;
         tl_assert(oldref->magic == OldRef_MAGIC);
         n = VG_(random)( &rand_seed );
         if ((n & 0xFFF) < 0x800) {
            VG_(addToXA)( refs2del, &keyW );
            retained--;
         }
      }
      if (VG_(clo_stats)) {
         VG_(message)(Vg_DebugMsg,
            "libhb: EvM GC: randomly delete half the entries, "
            "retaining %lu entries\n",
            retained );
      }

}

n2del = VG_(sizeXA)( refs2del );
   tl_assert(n2del == (Word)(oldrefTreeN - retained));

if (0) VG_(printf)("%s","deleting entries\n");
   for (i = 0; i < n2del; i++) {
      Bool  b;
      Addr  ga2del = *(Addr*)VG_(indexXA)( refs2del, i );
      b = VG_(delFromSWA)( oldrefTree, &keyW, &valW, ga2del );
      tl_assert(b);
      tl_assert(keyW == ga2del);
      oldref = (OldRef*)valW;
      for (j = 0; j < N_OLDREF_ACCS; j++) {
         ThrID aThrID = oldref->accs[j].thrid;
         RCEC* aRef   = oldref->accs[j].rcec;
         if (aRef) {
            tl_assert(aThrID != 0);
            stats__ctxt_rcdec3++;
            ctxt__rcdec( aRef );
         } else {
            tl_assert(aThrID == 0);
         }
      }

free_OldRef( oldref );
   }

VG_(deleteXA)( refs2del );

tl_assert( VG_(sizeSWA)( oldrefTree ) == retained );

oldrefTreeN = retained;
   oldrefGenIncAt = oldrefTreeN; /* start new gen right away */

/* Throw away all RCECs with zero reference counts */
   for (i = 0; i < N_RCEC_TAB; i++) {
      RCEC** pp = &contextTab[i];
      RCEC*  p  = *pp;
      while (p) {
         if (p->rc == 0) {
            *pp = p->next;
            free_RCEC(p);
            p = *pp;
            tl_assert(stats__ctxt_tab_curr > 0);
            stats__ctxt_tab_curr--;
         } else {
            pp = &p->next;
            p = p->next;
         }
      }
   }

/* Check the reference counts (expensive) */
   if (CHECK_CEM)
      event_map__check_reference_counts( False/*after*/ );

//if (0)
   //VG_(printf)("XXXX final sizes: oldrefTree %ld, contextTree %ld\n\n",
   //            VG_(OSetGen_Size)(oldrefTree), VG_(OSetGen_Size)(contextTree));

}

/////////////////////////////////////////////////////////
//                                                     //
// Core MSM                                            //
//                                                     //
/////////////////////////////////////////////////////////

/* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
   Nov 08, and again after [...],
   June 09. */

static ULong stats__msmcread         = 0;
static ULong stats__msmcread_change  = 0;
static ULong stats__msmcwrite        = 0;
static ULong stats__msmcwrite_change = 0;

/* Some notes on the H1 history mechanism:

Transition rules are:

read_{Kr,Kw}(Cr,Cw)  = (Cr,           Cr `join` Kw)
   write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)

After any access by a thread T to a location L, L's constraint pair
   (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.

After a race by thread T conflicting with some previous access by
   some other thread U, for a location with constraint (before
   processing the later access) (Cr,Cw), then Cw[U] is the segment in
   which the previously access lies.

Hence in record_race_info, we pass in Cfailed and Kfailed, which
   are compared so as to find out which thread(s) this access
   conflicts with.  Once that is established, we also require the
   pre-update Cw for the location, so we can index into it for those
   threads, to get the scalar clock values for the point at which the
   former accesses were made.  (In fact we only bother to do any of
   this for an arbitrarily chosen one of the conflicting threads, as
   that's simpler, it avoids flooding the user with vast amounts of
   mostly useless information, and because the program is wrong if it
   contains any races at all -- so we don't really need to show all
   conflicting access pairs initially, so long as we only show none if
   none exist).

---

That requires the auxiliary proof that

(Cr `join` Kw)[T] == Kw[T]

Why should that be true?  Because for any thread T, Kw[T] >= the
   scalar clock value for T known by any other thread.  In other
   words, because T's value for its own scalar clock is at least as up
   to date as the value for it known by any other thread (that is true
   for both the R- and W- scalar clocks).  Hence no other thread will
   be able to feed in a value for that element (indirectly via a
   constraint) which will exceed Kw[T], and hence the join cannot
   cause that particular element to advance.
*/

__attribute__((noinline))
static void record_race_info ( Thr* acc_thr, 
                               Addr acc_addr, SizeT szB, Bool isWrite,
                               VtsID Cfailed,
                               VtsID Kfailed,
                               VtsID Cw )
{
   /* Call here to report a race.  We just hand it onwards to
      HG_(record_error_Race).  If that in turn discovers that the
      error is going to be collected, then, at history_level 2, that
      queries the conflicting-event map.  The alternative would be to
      query it right here.  But that causes a lot of pointless queries
      for errors which will shortly be discarded as duplicates, and
      can become a performance overhead; so we defer the query until
      we know the error is not a duplicate. */

/* Stacks for the bounds of the (or one of the) conflicting
      segment(s).  These are only set at history_level 1. */
   ExeContext* hist1_seg_start = NULL;
   ExeContext* hist1_seg_end   = NULL;
   Thread*     hist1_conf_thr  = NULL;

tl_assert(acc_thr);
   tl_assert(acc_thr->hgthread);
   tl_assert(acc_thr->hgthread->hbthr == acc_thr);
   tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);

if (HG_(clo_history_level) == 1) {
      Bool found;
      Word firstIx, lastIx;
      ULong_n_EC key;

/* At history_level 1, we must round up the relevant stack-pair
         for the conflicting segment right now.  This is because
         deferring it is complex; we can't (easily) put Kfailed and
         Cfailed into the XError and wait for later without
         getting tied up in difficulties with VtsID reference
         counting.  So just do it now. */
      Thr*  confThr;
      ULong confTym = 0;
      /* Which thread are we in conflict with?  There may be more than
         one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
         (in fact it's the one with the lowest Thr* value). */
      confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
      /* This must exist!  since if it was NULL then there's no
         conflict (semantics of return value of
         VtsID__findFirst_notLEQ), and msmc{read,write}, which has
         called us, just checked exactly this -- that there was in
         fact a race. */
      tl_assert(confThr);

/* Get the scalar clock value that the conflicting thread
         introduced into the constraint.  A careful examination of the
         base machine rules shows that this must be the same as the
         conflicting thread's scalar clock when it created this
         constraint.  Hence we know the scalar clock of the
         conflicting thread when the conflicting access was made. */
      confTym = VtsID__indexAt( Cfailed, confThr );

/* Using this scalar clock, index into the conflicting thread's
         collection of stack traces made each time its vector clock
         (hence its scalar clock) changed.  This gives the stack
         traces at the start and end of the conflicting segment (well,
         as per comment just above, of one of the conflicting
         segments, if there are more than one). */
      key.ull = confTym;
      key.ec  = NULL;
      /* tl_assert(confThr); -- asserted just above */
      tl_assert(confThr->local_Kws_n_stacks);
      firstIx = lastIx = 0;
      found = VG_(lookupXA_UNSAFE)(
                 confThr->local_Kws_n_stacks,
                 &key, &firstIx, &lastIx,
                 (Int(*)(void*,void*))cmp__ULong_n_EC__by_ULong
              );
      if (0) VG_(printf)("record_race_info %u %u %u  confThr %p "
                         "confTym %llu found %d (%lu,%lu)\n", 
                         Cfailed, Kfailed, Cw,
                         confThr, confTym, found, firstIx, lastIx);
      /* We can't indefinitely collect stack traces at VTS
         transitions, since we'd eventually run out of memory.  Hence
         note_local_Kw_n_stack_for will eventually throw away old
         ones, which in turn means we might fail to find index value
         confTym in the array. */
      if (found) {
         ULong_n_EC *pair_start, *pair_end;
         pair_start 
            = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
         hist1_seg_start = pair_start->ec;
         if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
            pair_end
               = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
                                            lastIx+1 );
            /* from properties of VG_(lookupXA) and the comparison fn used: */
            tl_assert(pair_start->ull < pair_end->ull);
            hist1_seg_end = pair_end->ec;
            /* Could do a bit better here.  It may be that pair_end
               doesn't have a stack, but the following entries in the
               array have the same scalar Kw and to have a stack.  So
               we should search a bit further along the array than
               lastIx+1 if hist1_seg_end is NULL. */
         } else {
            if (!confThr->llexit_done)
               hist1_seg_end = main_get_EC( confThr );
         }
         // seg_start could be NULL iff this is the first stack in the thread
         //if (seg_start) VG_(pp_ExeContext)(seg_start);
         //if (seg_end)   VG_(pp_ExeContext)(seg_end);
         hist1_conf_thr = confThr->hgthread;
      }
   }

HG_(record_error_Race)( acc_thr->hgthread, acc_addr,
                           szB, isWrite,
                           hist1_conf_thr, hist1_seg_start, hist1_seg_end );
}

static Bool is_sane_SVal_C ( SVal sv ) {
   Bool leq;
   if (!SVal__isC(sv)) return True;
   leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
   return leq;
}

/* Compute new state following a read */
static inline SVal msmcread ( SVal svOld,
                              /* The following are only needed for 
                                 creating error reports. */
                              Thr* acc_thr,
                              Addr acc_addr, SizeT szB )
{
   SVal svNew = SVal_INVALID;
   stats__msmcread++;

/* Redundant sanity check on the constraints */
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svOld));
   }

if (LIKELY(SVal__isC(svOld))) {
      VtsID tviR  = acc_thr->viR;
      VtsID tviW  = acc_thr->viW;
      VtsID rmini = SVal__unC_Rmin(svOld);
      VtsID wmini = SVal__unC_Wmin(svOld);
      Bool  leq   = VtsID__cmpLEQ(rmini,tviR);
      if (LIKELY(leq)) {
         /* no race */
         /* Note: RWLOCK subtlety: use tviW, not tviR */
         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
         goto out;
      } else {
         /* assert on sanity of constraints. */
         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
         tl_assert(leqxx);
         // same as in non-race case
         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
         record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
                           rmini, /* Cfailed */
                           tviR,  /* Kfailed */
                           wmini  /* Cw */ );
         goto out;
      }
   }
   if (SVal__isA(svOld)) {
      /* reading no-access memory (sigh); leave unchanged */
      /* check for no pollution */
      tl_assert(svOld == SVal_NOACCESS);
      svNew = SVal_NOACCESS;
      goto out;
   }
   if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
   tl_assert(0);

out:
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svNew));
   }
   if (UNLIKELY(svNew != svOld)) {
      tl_assert(svNew != SVal_INVALID);
      if (HG_(clo_history_level) >= 2
          && SVal__isC(svOld) && SVal__isC(svNew)) {
         event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
         stats__msmcread_change++;
      }
   }
   return svNew;
}

/* Compute new state following a write */
static inline SVal msmcwrite ( SVal svOld,
                              /* The following are only needed for 
                                 creating error reports. */
                              Thr* acc_thr,
                              Addr acc_addr, SizeT szB )
{
   SVal svNew = SVal_INVALID;
   stats__msmcwrite++;

/* Redundant sanity check on the constraints */
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svOld));
   }

if (LIKELY(SVal__isC(svOld))) {
      VtsID tviW  = acc_thr->viW;
      VtsID wmini = SVal__unC_Wmin(svOld);
      Bool  leq   = VtsID__cmpLEQ(wmini,tviW);
      if (LIKELY(leq)) {
         /* no race */
         svNew = SVal__mkC( tviW, tviW );
         goto out;
      } else {
         VtsID rmini = SVal__unC_Rmin(svOld);
         /* assert on sanity of constraints. */
         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
         tl_assert(leqxx);
         // same as in non-race case
         // proof: in the non-race case, we have
         //    rmini <= wmini (invar on constraints)
         //    tviW <= tviR (invar on thread clocks)
         //    wmini <= tviW (from run-time check)
         // hence from transitivity of <= we have
         //    rmini <= wmini <= tviW
         // and so join(rmini,tviW) == tviW
         // and    join(wmini,tviW) == tviW
         // qed.
         svNew = SVal__mkC( VtsID__join2(rmini, tviW),
                            VtsID__join2(wmini, tviW) );
         record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
                           wmini, /* Cfailed */
                           tviW,  /* Kfailed */
                           wmini  /* Cw */ );
         goto out;
      }
   }
   if (SVal__isA(svOld)) {
      /* writing no-access memory (sigh); leave unchanged */
      /* check for no pollution */
      tl_assert(svOld == SVal_NOACCESS);
      svNew = SVal_NOACCESS;
      goto out;
   }
   if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
   tl_assert(0);

out:
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svNew));
   }
   if (UNLIKELY(svNew != svOld)) {
      tl_assert(svNew != SVal_INVALID);
      if (HG_(clo_history_level) >= 2
          && SVal__isC(svOld) && SVal__isC(svNew)) {
         event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
         stats__msmcwrite_change++;
      }
   }
   return svNew;
}

/////////////////////////////////////////////////////////
//                                                     //
// Apply core MSM to specific memory locations         //
//                                                     //
/////////////////////////////////////////////////////////

/*------------- ZSM accesses: 8 bit sapply ------------- */

static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cread08s++;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 .. 7 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
      SVal* tree = &cl->svals[tno << 3];
      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcread( svOld, thr,a,1 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
}

static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cwrite08s++;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 .. 7 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
      SVal* tree = &cl->svals[tno << 3];
      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcwrite( svOld, thr,a,1 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
}

/*------------- ZSM accesses: 16 bit sapply ------------- */

static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cread16s++;
   if (UNLIKELY(!aligned16(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
      if (valid_value_is_below_me_16(descr, toff)) {
         goto slowcase;
      } else {
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
      }
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcread( svOld, thr,a,2 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_16to8splits++;
   zsm_sapply08__msmcread( thr, a + 0 );
   zsm_sapply08__msmcread( thr, a + 1 );
}

static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cwrite16s++;
   if (UNLIKELY(!aligned16(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
      if (valid_value_is_below_me_16(descr, toff)) {
         goto slowcase;
      } else {
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
      }
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcwrite( svOld, thr,a,2 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_16to8splits++;
   zsm_sapply08__msmcwrite( thr, a + 0 );
   zsm_sapply08__msmcwrite( thr, a + 1 );
}

/*------------- ZSM accesses: 32 bit sapply ------------- */

static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cread32s++;
   if (UNLIKELY(!aligned32(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 or 4 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
      if (valid_value_is_above_me_32(descr, toff)) {
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
      } else {
         goto slowcase;
      }
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcread( svOld, thr,a,4 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_32to16splits++;
   zsm_sapply16__msmcread( thr, a + 0 );
   zsm_sapply16__msmcread( thr, a + 2 );
}

static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cwrite32s++;
   if (UNLIKELY(!aligned32(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 or 4 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
      if (valid_value_is_above_me_32(descr, toff)) {
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
      } else {
         goto slowcase;
      }
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcwrite( svOld, thr,a,4 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_32to16splits++;
   zsm_sapply16__msmcwrite( thr, a + 0 );
   zsm_sapply16__msmcwrite( thr, a + 2 );
}

/*------------- ZSM accesses: 64 bit sapply ------------- */

static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno;
   //UWord      toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cread64s++;
   if (UNLIKELY(!aligned64(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   //toff  = get_tree_offset(a); /* == 0, unused */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
      goto slowcase;
   }
   svOld = cl->svals[cloff];
   svNew = msmcread( svOld, thr,a,8 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_64to32splits++;
   zsm_sapply32__msmcread( thr, a + 0 );
   zsm_sapply32__msmcread( thr, a + 4 );
}

static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno;
   //UWord      toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cwrite64s++;
   if (UNLIKELY(!aligned64(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   //toff  = get_tree_offset(a); /* == 0, unused */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
      goto slowcase;
   }
   svOld = cl->svals[cloff];
   svNew = msmcwrite( svOld, thr,a,8 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_64to32splits++;
   zsm_sapply32__msmcwrite( thr, a + 0 );
   zsm_sapply32__msmcwrite( thr, a + 4 );
}

/*--------------- ZSM accesses: 8 bit swrite --------------- */

static
void zsm_swrite08 ( Addr a, SVal svNew ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   UShort     descr;
   stats__cline_swrite08s++;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 .. 7 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
      SVal* tree = &cl->svals[tno << 3];
      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
}

/*--------------- ZSM accesses: 16 bit swrite --------------- */

static
void zsm_swrite16 ( Addr a, SVal svNew ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   UShort     descr;
   stats__cline_swrite16s++;
   if (UNLIKELY(!aligned16(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
      if (valid_value_is_below_me_16(descr, toff)) {
         /* Writing at this level.  Need to fix up 'descr'. */
         cl->descrs[tno] = pullup_descr_to_16(descr, toff);
         /* At this point, the tree does not match cl->descr[tno] any
            more.  The assignments below will fix it up. */
      } else {
         /* We can't indiscriminately write on the w16 node as in the
            w64 case, as that might make the node inconsistent with
            its parent.  So first, pull down to this level. */
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
      }
   }
   tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff + 0] = svNew;
   cl->svals[cloff + 1] = SVal_INVALID;
   return;
  slowcase: /* misaligned */
   stats__cline_16to8splits++;
   zsm_swrite08( a + 0, svNew );
   zsm_swrite08( a + 1, svNew );
}

/*--------------- ZSM accesses: 32 bit swrite --------------- */

static
void zsm_swrite32 ( Addr a, SVal svNew ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   UShort     descr;
   stats__cline_swrite32s++;
   if (UNLIKELY(!aligned32(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 or 4 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
      if (valid_value_is_above_me_32(descr, toff)) {
         /* We can't indiscriminately write on the w32 node as in the
            w64 case, as that might make the node inconsistent with
            its parent.  So first, pull down to this level. */
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
         if (CHECK_ZSM)
            tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
      } else {
         /* Writing at this level.  Need to fix up 'descr'. */
         cl->descrs[tno] = pullup_descr_to_32(descr, toff);
         /* At this point, the tree does not match cl->descr[tno] any
            more.  The assignments below will fix it up. */
      }
   }
   tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff + 0] = svNew;
   cl->svals[cloff + 1] = SVal_INVALID;
   cl->svals[cloff + 2] = SVal_INVALID;
   cl->svals[cloff + 3] = SVal_INVALID;
   return;
  slowcase: /* misaligned */
   stats__cline_32to16splits++;
   zsm_swrite16( a + 0, svNew );
   zsm_swrite16( a + 2, svNew );
}

/*--------------- ZSM accesses: 64 bit swrite --------------- */

static
void zsm_swrite64 ( Addr a, SVal svNew ) {
   CacheLine* cl; 
   UWord      cloff, tno;
   //UWord    toff;
   stats__cline_swrite64s++;
   if (UNLIKELY(!aligned64(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   //toff  = get_tree_offset(a); /* == 0, unused */
   cl->descrs[tno] = TREE_DESCR_64;
   tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff + 0] = svNew;
   cl->svals[cloff + 1] = SVal_INVALID;
   cl->svals[cloff + 2] = SVal_INVALID;
   cl->svals[cloff + 3] = SVal_INVALID;
   cl->svals[cloff + 4] = SVal_INVALID;
   cl->svals[cloff + 5] = SVal_INVALID;
   cl->svals[cloff + 6] = SVal_INVALID;
   cl->svals[cloff + 7] = SVal_INVALID;
   return;
  slowcase: /* misaligned */
   stats__cline_64to32splits++;
   zsm_swrite32( a + 0, svNew );
   zsm_swrite32( a + 4, svNew );
}

/*------------- ZSM accesses: 8 bit sread/scopy ------------- */

static
SVal zsm_sread08 ( Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   UShort     descr;
   stats__cline_sread08s++;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 .. 7 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
      SVal* tree = &cl->svals[tno << 3];
      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
   }
   return cl->svals[cloff];
}

static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
   SVal       sv;
   stats__cline_scopy08s++;
   sv = zsm_sread08( src );
   zsm_swrite08( dst, sv );
}

/* Block-copy states (needed for implementing realloc()).  Note this
   doesn't change the filtering arrangements.  The caller of
   zsm_scopy_range needs to attend to that. */

static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
{
   SizeT i;
   if (len == 0)
      return;

/* assert for non-overlappingness */
   tl_assert(src+len <= dst || dst+len <= src);

/* To be simple, just copy byte by byte.  But so as not to wreck
      performance for later accesses to dst[0 .. len-1], normalise
      destination lines as we finish with them, and also normalise the
      line containing the first and last address. */
   for (i = 0; i < len; i++) {
      Bool normalise
         = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
           || i == 0       /* first in range */
           || i == len-1;  /* last in range */
      zsm_scopy08( src+i, dst+i, normalise );
   }
}

/* For setting address ranges to a given value.  Has considerable
   sophistication so as to avoid generating large numbers of pointless
   cache loads/writebacks for large ranges. */

/* Do small ranges in-cache, in the obvious way. */
static
void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
{
   /* fast track a couple of common cases */
   if (len == 4 && aligned32(a)) {
      zsm_swrite32( a, svNew );
      return;
   }
   if (len == 8 && aligned64(a)) {
      zsm_swrite64( a, svNew );
      return;
   }

/* be completely general (but as efficient as possible) */
   if (len == 0) return;

if (!aligned16(a) && len >= 1) {
      zsm_swrite08( a, svNew );
      a += 1;
      len -= 1;
      tl_assert(aligned16(a));
   }
   if (len == 0) return;

if (!aligned32(a) && len >= 2) {
      zsm_swrite16( a, svNew );
      a += 2;
      len -= 2;
      tl_assert(aligned32(a));
   }
   if (len == 0) return;

if (!aligned64(a) && len >= 4) {
      zsm_swrite32( a, svNew );
      a += 4;
      len -= 4;
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

if (len >= 8) {
      tl_assert(aligned64(a));
      while (len >= 8) {
         zsm_swrite64( a, svNew );
         a += 8;
         len -= 8;
      }
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

if (len >= 4)
      tl_assert(aligned32(a));
   if (len >= 4) {
      zsm_swrite32( a, svNew );
      a += 4;
      len -= 4;
   }
   if (len == 0) return;

if (len >= 2)
      tl_assert(aligned16(a));
   if (len >= 2) {
      zsm_swrite16( a, svNew );
      a += 2;
      len -= 2;
   }
   if (len == 0) return;

if (len >= 1) {
      zsm_swrite08( a, svNew );
      //a += 1;
      len -= 1;
   }
   tl_assert(len == 0);
}

/* If we're doing a small range, hand off to zsm_sset_range_SMALL.  But
   for larger ranges, try to operate directly on the out-of-cache
   representation, rather than dragging lines into the cache,
   overwriting them, and forcing them out.  This turns out to be an
   important performance optimisation.

Note that this doesn't change the filtering arrangements.  The
   caller of zsm_sset_range needs to attend to that. */

static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
{
   tl_assert(svNew != SVal_INVALID);
   stats__cache_make_New_arange += (ULong)len;

if (0 && len > 500)
      VG_(printf)("make New      ( %#lx, %ld )\n", a, len );

if (0) {
      static UWord n_New_in_cache = 0;
      static UWord n_New_not_in_cache = 0;
      /* tag is 'a' with the in-line offset masked out, 
         eg a[31]..a[4] 0000 */
      Addr       tag = a & ~(N_LINE_ARANGE - 1);
      UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
      if (LIKELY(tag == cache_shmem.tags0[wix])) {
         n_New_in_cache++;
      } else {
         n_New_not_in_cache++;
      }
      if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
         VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
                     n_New_in_cache, n_New_not_in_cache );
   }

if (LIKELY(len < 2 * N_LINE_ARANGE)) {
      zsm_sset_range_SMALL( a, len, svNew );
   } else {
      Addr  before_start  = a;
      Addr  aligned_start = cacheline_ROUNDUP(a);
      Addr  after_start   = cacheline_ROUNDDN(a + len);
      UWord before_len    = aligned_start - before_start;
      UWord aligned_len   = after_start - aligned_start;
      UWord after_len     = a + len - after_start;
      tl_assert(before_start <= aligned_start);
      tl_assert(aligned_start <= after_start);
      tl_assert(before_len < N_LINE_ARANGE);
      tl_assert(after_len < N_LINE_ARANGE);
      tl_assert(get_cacheline_offset(aligned_start) == 0);
      if (get_cacheline_offset(a) == 0) {
         tl_assert(before_len == 0);
         tl_assert(a == aligned_start);
      }
      if (get_cacheline_offset(a+len) == 0) {
         tl_assert(after_len == 0);
         tl_assert(after_start == a+len);
      }
      if (before_len > 0) {
         zsm_sset_range_SMALL( before_start, before_len, svNew );
      }
      if (after_len > 0) {
         zsm_sset_range_SMALL( after_start, after_len, svNew );
      }
      stats__cache_make_New_inZrep += (ULong)aligned_len;

while (1) {
         Addr tag;
         UWord wix;
         if (aligned_start >= after_start)
            break;
         tl_assert(get_cacheline_offset(aligned_start) == 0);
         tag = aligned_start & ~(N_LINE_ARANGE - 1);
         wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
         if (tag == cache_shmem.tags0[wix]) {
            UWord i;
            for (i = 0; i < N_LINE_ARANGE / 8; i++)
               zsm_swrite64( aligned_start + i * 8, svNew );
         } else {
            UWord i;
            Word zix;
            SecMap* sm;
            LineZ* lineZ;
            /* This line is not in the cache.  Do not force it in; instead
               modify it in-place. */
            /* find the Z line to write in and rcdec it or the
               associated F line. */
            find_Z_for_writing( &sm, &zix, tag );
            tl_assert(sm);
            tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
            lineZ = &sm->linesZ[zix];
            lineZ->dict[0] = svNew;
            lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
            for (i = 0; i < N_LINE_ARANGE/4; i++)
               lineZ->ix2s[i] = 0; /* all refer to dict[0] */
            rcinc_LineZ(lineZ);
         }
         aligned_start += N_LINE_ARANGE;
         aligned_len -= N_LINE_ARANGE;
      }
      tl_assert(aligned_start == after_start);
      tl_assert(aligned_len == 0);
   }
}

/////////////////////////////////////////////////////////
//                                                     //
// Front-filtering accesses                            //
//                                                     //
/////////////////////////////////////////////////////////

static UWord stats__f_ac = 0;
static UWord stats__f_sk = 0;

#if 0
#  define STATS__F_SHOW \
     do { \
        if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
           VG_(printf)("filters: ac %lu sk %lu\n",   \
           stats__f_ac, stats__f_sk); \
     } while (0)
#else
#  define STATS__F_SHOW /* */
#endif

void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply08__msmcwrite(thr, a);
}

void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply16__msmcwrite(thr, a);
}

void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply32__msmcwrite(thr, a);
}

void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply64__msmcwrite(thr, a);
}

void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
{
   /* fast track a couple of common cases */
   if (len == 4 && aligned32(a)) {
      zsm_sapply32_f__msmcwrite( thr, a );
      return;
   }
   if (len == 8 && aligned64(a)) {
      zsm_sapply64_f__msmcwrite( thr, a );
      return;
   }

/* be completely general (but as efficient as possible) */
   if (len == 0) return;

if (!aligned16(a) && len >= 1) {
      zsm_sapply08_f__msmcwrite( thr, a );
      a += 1;
      len -= 1;
      tl_assert(aligned16(a));
   }
   if (len == 0) return;

if (!aligned32(a) && len >= 2) {
      zsm_sapply16_f__msmcwrite( thr, a );
      a += 2;
      len -= 2;
      tl_assert(aligned32(a));
   }
   if (len == 0) return;

if (!aligned64(a) && len >= 4) {
      zsm_sapply32_f__msmcwrite( thr, a );
      a += 4;
      len -= 4;
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

if (len >= 8) {
      tl_assert(aligned64(a));
      while (len >= 8) {
         zsm_sapply64_f__msmcwrite( thr, a );
         a += 8;
         len -= 8;
      }
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

if (len >= 4)
      tl_assert(aligned32(a));
   if (len >= 4) {
      zsm_sapply32_f__msmcwrite( thr, a );
      a += 4;
      len -= 4;
   }
   if (len == 0) return;

if (len >= 2)
      tl_assert(aligned16(a));
   if (len >= 2) {
      zsm_sapply16_f__msmcwrite( thr, a );
      a += 2;
      len -= 2;
   }
   if (len == 0) return;

if (len >= 1) {
      zsm_sapply08_f__msmcwrite( thr, a );
      //a += 1;
      len -= 1;
   }
   tl_assert(len == 0);
}

void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply08__msmcread(thr, a);
}

void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply16__msmcread(thr, a);
}

void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply32__msmcread(thr, a);
}

void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply64__msmcread(thr, a);
}

void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
{
   /* fast track a couple of common cases */
   if (len == 4 && aligned32(a)) {
      zsm_sapply32_f__msmcread( thr, a );
      return;
   }
   if (len == 8 && aligned64(a)) {
      zsm_sapply64_f__msmcread( thr, a );
      return;
   }

/* be completely general (but as efficient as possible) */
   if (len == 0) return;

if (!aligned16(a) && len >= 1) {
      zsm_sapply08_f__msmcread( thr, a );
      a += 1;
      len -= 1;
      tl_assert(aligned16(a));
   }
   if (len == 0) return;

if (!aligned32(a) && len >= 2) {
      zsm_sapply16_f__msmcread( thr, a );
      a += 2;
      len -= 2;
      tl_assert(aligned32(a));
   }
   if (len == 0) return;

if (!aligned64(a) && len >= 4) {
      zsm_sapply32_f__msmcread( thr, a );
      a += 4;
      len -= 4;
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

if (len >= 8) {
      tl_assert(aligned64(a));
      while (len >= 8) {
         zsm_sapply64_f__msmcread( thr, a );
         a += 8;
         len -= 8;
      }
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

if (len >= 4)
      tl_assert(aligned32(a));
   if (len >= 4) {
      zsm_sapply32_f__msmcread( thr, a );
      a += 4;
      len -= 4;
   }
   if (len == 0) return;

if (len >= 2)
      tl_assert(aligned16(a));
   if (len >= 2) {
      zsm_sapply16_f__msmcread( thr, a );
      a += 2;
      len -= 2;
   }
   if (len == 0) return;

if (len >= 1) {
      zsm_sapply08_f__msmcread( thr, a );
      //a += 1;
      len -= 1;
   }
   tl_assert(len == 0);
}

void libhb_Thr_resumes ( Thr* thr )
{
   if (0) VG_(printf)("resume %p\n", thr);
   tl_assert(thr);
   tl_assert(!thr->llexit_done);
   Filter__clear(thr->filter, "libhb_Thr_resumes");
   /* A kludge, but .. if this thread doesn't have any marker stacks
      at all, get one right now.  This is easier than figuring out
      exactly when at thread startup we can and can't take a stack
      snapshot. */
   if (HG_(clo_history_level) == 1) {
      tl_assert(thr->local_Kws_n_stacks);
      if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
         note_local_Kw_n_stack_for(thr);
   }
}

/////////////////////////////////////////////////////////
//                                                     //
// Synchronisation objects                             //
//                                                     //
/////////////////////////////////////////////////////////

/* A double linked list of all the SO's. */
SO* admin_SO = NULL;

static SO* SO__Alloc ( void )
{
   SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
   so->viR   = VtsID_INVALID;
   so->viW   = VtsID_INVALID;
   so->magic = SO_MAGIC;
   /* Add to double linked list */
   if (admin_SO) {
      tl_assert(admin_SO->admin_prev == NULL);
      admin_SO->admin_prev = so;
      so->admin_next = admin_SO;
   } else {
      so->admin_next = NULL;
   }
   so->admin_prev = NULL;
   admin_SO = so;
   /* */
   return so;
}

static void SO__Dealloc ( SO* so )
{
   tl_assert(so);
   tl_assert(so->magic == SO_MAGIC);
   if (so->viR == VtsID_INVALID) {
      tl_assert(so->viW == VtsID_INVALID);
   } else {
      tl_assert(so->viW != VtsID_INVALID);
      VtsID__rcdec(so->viR);
      VtsID__rcdec(so->viW);
   }
   so->magic = 0;
   /* Del from double linked list */
   if (so->admin_prev)
      so->admin_prev->admin_next = so->admin_next;
   if (so->admin_next)
      so->admin_next->admin_prev = so->admin_prev;
   if (so == admin_SO)
      admin_SO = so->admin_next;
   /* */
   HG_(free)( so );
}

/////////////////////////////////////////////////////////
//                                                     //
// Top Level API                                       //
//                                                     //
/////////////////////////////////////////////////////////

static void show_thread_state ( HChar* str, Thr* t ) 
{
   if (1) return;
   if (t->viR == t->viW) {
      VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
      VtsID__pp( t->viR );
      VG_(printf)("%s","\n");
   } else {
      VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
      VtsID__pp( t->viR );
      VG_(printf)(" viW %u==", t->viW);
      VtsID__pp( t->viW );
      VG_(printf)("%s","\n");
   }
}

Thr* libhb_init (
        void        (*get_stacktrace)( Thr*, Addr*, UWord ),
        ExeContext* (*get_EC)( Thr* )
     )
{
   Thr*  thr;
   VtsID vi;

// We will have to have to store a large number of these,
   // so make sure they're the size we expect them to be.
   tl_assert(sizeof(ScalarTS) == 8);

/* because first 1024 unusable */
   tl_assert(SCALARTS_N_THRBITS >= 11);
   /* so as to fit in a UInt w/ 3 bits to spare (see defn of
      Thr_n_RCEC). */
   tl_assert(SCALARTS_N_THRBITS <= 29);

/* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
      (32-bit).  It's not correctness-critical, but there are a lot of
      them, so it's important from a space viewpoint.  Unfortunately
      we simply can't pack it into 2 words on a 32-bit target. */
   if (sizeof(UWord) == 8) {
      tl_assert(sizeof(Thr_n_RCEC) == 16);
   } else {
      tl_assert(sizeof(Thr_n_RCEC) == 12);
   }

/* Word sets really are 32 bits.  Even on a 64 bit target. */
   tl_assert(sizeof(WordSetID) == 4);
   tl_assert(sizeof(WordSet) == sizeof(WordSetID));

tl_assert(get_stacktrace);
   tl_assert(get_EC);
   main_get_stacktrace   = get_stacktrace;
   main_get_EC           = get_EC;

// No need to initialise hg_wordfm.
   // No need to initialise hg_wordset.

/* Allocated once and never deallocated.  Used as a temporary in
      VTS singleton, tick and join operations. */
   temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
   temp_max_sized_VTS->id = VtsID_INVALID;
   verydead_thread_table_init();
   vts_set_init();
   vts_tab_init();
   event_map_init();
   VtsID__invalidate_caches();

// initialise shadow memory
   zsm_init( SVal__rcinc, SVal__rcdec );

thr = Thr__new();
   vi  = VtsID__mk_Singleton( thr, 1 );
   thr->viR = vi;
   thr->viW = vi;
   VtsID__rcinc(thr->viR);
   VtsID__rcinc(thr->viW);

show_thread_state("  root", thr);
   return thr;
}

Thr* libhb_create ( Thr* parent )
{
   /* The child's VTSs are copies of the parent's VTSs, but ticked at
      the child's index.  Since the child's index is guaranteed
      unique, it has never been seen before, so the implicit value
      before the tick is zero and after that is one. */
   Thr* child = Thr__new();

child->viR = VtsID__tick( parent->viR, child );
   child->viW = VtsID__tick( parent->viW, child );
   Filter__clear(child->filter, "libhb_create(child)");
   VtsID__rcinc(child->viR);
   VtsID__rcinc(child->viW);
   /* We need to do note_local_Kw_n_stack_for( child ), but it's too
      early for that - it may not have a valid TId yet.  So, let
      libhb_Thr_resumes pick it up the first time the thread runs. */

tl_assert(VtsID__indexAt( child->viR, child ) == 1);
   tl_assert(VtsID__indexAt( child->viW, child ) == 1);

/* and the parent has to move along too */
   VtsID__rcdec(parent->viR);
   VtsID__rcdec(parent->viW);
   parent->viR = VtsID__tick( parent->viR, parent );
   parent->viW = VtsID__tick( parent->viW, parent );
   Filter__clear(parent->filter, "libhb_create(parent)");
   VtsID__rcinc(parent->viR);
   VtsID__rcinc(parent->viW);
   note_local_Kw_n_stack_for( parent );

show_thread_state(" child", child);
   show_thread_state("parent", parent);

return child;
}

/* Shut down the library, and print stats (in fact that's _all_
   this is for. */
void libhb_shutdown ( Bool show_stats )
{
   if (show_stats) {
      VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");
      VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
                  stats__secmaps_allocd,
                  stats__secmap_ga_space_covered);
      VG_(printf)("  linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
                  stats__secmap_linesZ_allocd,
                  stats__secmap_linesZ_bytes);
      VG_(printf)("  linesF: %'10lu allocd (%'12lu bytes occupied)\n",
                  stats__secmap_linesF_allocd,
                  stats__secmap_linesF_bytes);
      VG_(printf)(" secmaps: %'10lu iterator steppings\n",
                  stats__secmap_iterator_steppings);
      VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
                  stats__secmaps_search, stats__secmaps_search_slow);

VG_(printf)("%s","\n");
      VG_(printf)("   cache: %'lu totrefs (%'lu misses)\n",
                  stats__cache_totrefs, stats__cache_totmisses );
      VG_(printf)("   cache: %'14lu Z-fetch,    %'14lu F-fetch\n",
                  stats__cache_Z_fetches, stats__cache_F_fetches );
      VG_(printf)("   cache: %'14lu Z-wback,    %'14lu F-wback\n",
                  stats__cache_Z_wbacks, stats__cache_F_wbacks );
      VG_(printf)("   cache: %'14lu invals,     %'14lu flushes\n",
                  stats__cache_invals, stats__cache_flushes );
      VG_(printf)("   cache: %'14llu arange_New  %'14llu direct-to-Zreps\n",
                  stats__cache_make_New_arange,
                  stats__cache_make_New_inZrep);

VG_(printf)("%s","\n");
      VG_(printf)("   cline: %'10lu normalises\n",
                  stats__cline_normalises );
      VG_(printf)("   cline: c rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
                  stats__cline_cread64s,
                  stats__cline_cread32s,
                  stats__cline_cread16s,
                  stats__cline_cread08s );
      VG_(printf)("   cline: c wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
                  stats__cline_cwrite64s,
                  stats__cline_cwrite32s,
                  stats__cline_cwrite16s,
                  stats__cline_cwrite08s );
      VG_(printf)("   cline: s wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
                  stats__cline_swrite64s,
                  stats__cline_swrite32s,
                  stats__cline_swrite16s,
                  stats__cline_swrite08s );
      VG_(printf)("   cline: s rd1s %'lu, s copy1s %'lu\n",
                  stats__cline_sread08s, stats__cline_scopy08s );
      VG_(printf)("   cline:    splits: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
                 stats__cline_64to32splits,
                 stats__cline_32to16splits,
                 stats__cline_16to8splits );
      VG_(printf)("   cline: pulldowns: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
                 stats__cline_64to32pulldown,
                 stats__cline_32to16pulldown,
                 stats__cline_16to8pulldown );
      if (0)
      VG_(printf)("   cline: sizeof(CacheLineZ) %ld, covers %ld bytes of arange\n",
                  (Word)sizeof(LineZ), (Word)N_LINE_ARANGE);

VG_(printf)("%s","\n");

VG_(printf)("   libhb: %'13llu msmcread  (%'llu dragovers)\n",
                  stats__msmcread, stats__msmcread_change);
      VG_(printf)("   libhb: %'13llu msmcwrite (%'llu dragovers)\n",
                  stats__msmcwrite, stats__msmcwrite_change);
      VG_(printf)("   libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
                  stats__cmpLEQ_queries, stats__cmpLEQ_misses);
      VG_(printf)("   libhb: %'13llu join2  queries (%'llu misses)\n",
                  stats__join2_queries, stats__join2_misses);

VG_(printf)("%s","\n");
      VG_(printf)( "   libhb: VTSops: tick %'lu,  join %'lu,  cmpLEQ %'lu\n",
                   stats__vts__tick, stats__vts__join,  stats__vts__cmpLEQ );
      VG_(printf)( "   libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
                   stats__vts__cmp_structural, stats__vts__cmp_structural_slow );
      VG_(printf)( "   libhb: VTSset: find__or__clone_and_add %'lu (%'lu allocd)\n",
                   stats__vts_set__focaa, stats__vts_set__focaa_a );
      VG_(printf)( "   libhb: VTSops: indexAt_SLOW %'lu\n",
                   stats__vts__indexat_slow );

VG_(printf)("%s","\n");
      VG_(printf)(
         "   libhb: %ld entries in vts_table (approximately %lu bytes)\n",
         VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
      );
      VG_(printf)( "   libhb: %lu entries in vts_set\n",
                   VG_(sizeFM)( vts_set ) );

VG_(printf)("%s","\n");
      VG_(printf)( "   libhb: ctxt__rcdec: 1=%lu(%lu eq), 2=%lu, 3=%lu\n",
                   stats__ctxt_rcdec1, stats__ctxt_rcdec1_eq,
                   stats__ctxt_rcdec2,
                   stats__ctxt_rcdec3 );
      VG_(printf)( "   libhb: ctxt__rcdec: calls %lu, discards %lu\n",
                   stats__ctxt_rcdec_calls, stats__ctxt_rcdec_discards);
      VG_(printf)( "   libhb: contextTab: %lu slots, %lu max ents\n",
                   (UWord)N_RCEC_TAB,
                   stats__ctxt_tab_curr );
      VG_(printf)( "   libhb: contextTab: %lu queries, %lu cmps\n",
                   stats__ctxt_tab_qs,
                   stats__ctxt_tab_cmps );
#if 0
      VG_(printf)("sizeof(AvlNode)     = %lu\n", sizeof(AvlNode));
      VG_(printf)("sizeof(WordBag)     = %lu\n", sizeof(WordBag));
      VG_(printf)("sizeof(MaybeWord)   = %lu\n", sizeof(MaybeWord));
      VG_(printf)("sizeof(CacheLine)   = %lu\n", sizeof(CacheLine));
      VG_(printf)("sizeof(LineZ)       = %lu\n", sizeof(LineZ));
      VG_(printf)("sizeof(LineF)       = %lu\n", sizeof(LineF));
      VG_(printf)("sizeof(SecMap)      = %lu\n", sizeof(SecMap));
      VG_(printf)("sizeof(Cache)       = %lu\n", sizeof(Cache));
      VG_(printf)("sizeof(SMCacheEnt)  = %lu\n", sizeof(SMCacheEnt));
      VG_(printf)("sizeof(CountedSVal) = %lu\n", sizeof(CountedSVal));
      VG_(printf)("sizeof(VTS)         = %lu\n", sizeof(VTS));
      VG_(printf)("sizeof(ScalarTS)    = %lu\n", sizeof(ScalarTS));
      VG_(printf)("sizeof(VtsTE)       = %lu\n", sizeof(VtsTE));
      VG_(printf)("sizeof(MSMInfo)     = %lu\n", sizeof(MSMInfo));

VG_(printf)("sizeof(struct _XArray)     = %lu\n", sizeof(struct _XArray));
      VG_(printf)("sizeof(struct _WordFM)     = %lu\n", sizeof(struct _WordFM));
      VG_(printf)("sizeof(struct _Thr)     = %lu\n", sizeof(struct _Thr));
      VG_(printf)("sizeof(struct _SO)     = %lu\n", sizeof(struct _SO));
#endif

VG_(printf)("%s","<<< END libhb stats >>>\n");
      VG_(printf)("%s","\n");

}
}

/* Receive notification that a thread has low level exited.  The
   significance here is that we do not expect to see any more memory
   references from it. */
void libhb_async_exit ( Thr* thr )
{
   tl_assert(thr);
   tl_assert(!thr->llexit_done);
   thr->llexit_done = True;

/* free up Filter and local_Kws_n_stacks (well, actually not the
      latter ..) */
   tl_assert(thr->filter);
   HG_(free)(thr->filter);
   thr->filter = NULL;

/* Tell the VTS mechanism this thread has exited, so it can
      participate in VTS pruning.  Note this can only happen if the
      thread has both ll_exited and has been joined with. */
   if (thr->joinedwith_done)
      VTS__declare_thread_very_dead(thr);

/* Another space-accuracy tradeoff.  Do we want to be able to show
      H1 history for conflicts in threads which have since exited?  If
      yes, then we better not free up thr->local_Kws_n_stacks.  The
      downside is a potential per-thread leak of up to
      N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) * whatever the
      XArray average overcommit factor is (1.5 I'd guess). */
   // hence:
   // VG_(deleteXA)(thr->local_Kws_n_stacks);
   // thr->local_Kws_n_stacks = NULL;
}

/* Receive notification that a thread has been joined with.  The
   significance here is that we do not expect to see any further
   references to its vector clocks (Thr::viR and Thr::viW). */
void libhb_joinedwith_done ( Thr* thr )
{
   tl_assert(thr);
   /* Caller must ensure that this is only ever called once per Thr. */
   tl_assert(!thr->joinedwith_done);
   thr->joinedwith_done = True;
   if (thr->llexit_done)
      VTS__declare_thread_very_dead(thr);
}

/* Both Segs and SOs point to VTSs.  However, there is no sharing, so
   a Seg that points at a VTS is its one-and-only owner, and ditto for
   a SO that points at a VTS. */

SO* libhb_so_alloc ( void )
{
   return SO__Alloc();
}

void libhb_so_dealloc ( SO* so )
{
   tl_assert(so);
   tl_assert(so->magic == SO_MAGIC);
   SO__Dealloc(so);
}

/* See comments in libhb.h for details on the meaning of 
   strong vs weak sends and strong vs weak receives. */
void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
{
   /* Copy the VTSs from 'thr' into the sync object, and then move
      the thread along one step. */

tl_assert(so);
   tl_assert(so->magic == SO_MAGIC);

/* stay sane .. a thread's read-clock must always lead or be the
      same as its write-clock */
   { Bool leq = VtsID__cmpLEQ(thr->viW, thr->viR);
     tl_assert(leq);
   }

/* since we're overwriting the VtsIDs in the SO, we need to drop
      any references made by the previous contents thereof */
   if (so->viR == VtsID_INVALID) {
      tl_assert(so->viW == VtsID_INVALID);
      so->viR = thr->viR;
      so->viW = thr->viW;
      VtsID__rcinc(so->viR);
      VtsID__rcinc(so->viW);
   } else {
      /* In a strong send, we dump any previous VC in the SO and
         install the sending thread's VC instead.  For a weak send we
         must join2 with what's already there. */
      tl_assert(so->viW != VtsID_INVALID);
      VtsID__rcdec(so->viR);
      VtsID__rcdec(so->viW);
      so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
      so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
      VtsID__rcinc(so->viR);
      VtsID__rcinc(so->viW);
   }

/* move both parent clocks along */
   VtsID__rcdec(thr->viR);
   VtsID__rcdec(thr->viW);
   thr->viR = VtsID__tick( thr->viR, thr );
   thr->viW = VtsID__tick( thr->viW, thr );
   if (!thr->llexit_done) {
      Filter__clear(thr->filter, "libhb_so_send");
      note_local_Kw_n_stack_for(thr);
   }
   VtsID__rcinc(thr->viR);
   VtsID__rcinc(thr->viW);

if (strong_send)
      show_thread_state("s-send", thr);
   else
      show_thread_state("w-send", thr);
}

void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
{
   tl_assert(so);
   tl_assert(so->magic == SO_MAGIC);

if (so->viR != VtsID_INVALID) {
      tl_assert(so->viW != VtsID_INVALID);

/* Weak receive (basically, an R-acquisition of a R-W lock).
         This advances the read-clock of the receiver, but not the
         write-clock. */
      VtsID__rcdec(thr->viR);
      thr->viR = VtsID__join2( thr->viR, so->viR );
      VtsID__rcinc(thr->viR);

/* At one point (r10589) it seemed safest to tick the clocks for
         the receiving thread after the join.  But on reflection, I
         wonder if that might cause it to 'overtake' constraints,
         which could lead to missing races.  So, back out that part of
         r10589. */
      //VtsID__rcdec(thr->viR);
      //thr->viR = VtsID__tick( thr->viR, thr );
      //VtsID__rcinc(thr->viR);

/* For a strong receive, we also advance the receiver's write
         clock, which means the receive as a whole is essentially
         equivalent to a W-acquisition of a R-W lock. */
      if (strong_recv) {
         VtsID__rcdec(thr->viW);
         thr->viW = VtsID__join2( thr->viW, so->viW );
         VtsID__rcinc(thr->viW);

/* See comment just above, re r10589. */
         //VtsID__rcdec(thr->viW);
         //thr->viW = VtsID__tick( thr->viW, thr );
         //VtsID__rcinc(thr->viW);
      }

if (thr->filter)
         Filter__clear(thr->filter, "libhb_so_recv");
      note_local_Kw_n_stack_for(thr);

if (strong_recv) 
         show_thread_state("s-recv", thr);
      else 
         show_thread_state("w-recv", thr);

} else {
      tl_assert(so->viW == VtsID_INVALID);
      /* Deal with degenerate case: 'so' has no vts, so there has been
         no message posted to it.  Just ignore this case. */
      show_thread_state("d-recv", thr);
   }
}

Bool libhb_so_everSent ( SO* so )
{
   if (so->viR == VtsID_INVALID) {
      tl_assert(so->viW == VtsID_INVALID);
      return False;
   } else {
      tl_assert(so->viW != VtsID_INVALID);
      return True;
   }
}

#define XXX1 0 // 0x67a106c
#define XXX2 0

static inline Bool TRACEME(Addr a, SizeT szB) {
   if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
   if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
   return False;
}
static void trace ( Thr* thr, Addr a, SizeT szB, HChar* s ) {
  SVal sv = zsm_sread08(a);
  VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
  show_thread_state("", thr);
  VG_(printf)("%s","\n");
}

void libhb_srange_new ( Thr* thr, Addr a, SizeT szB )
{
   SVal sv = SVal__mkC(thr->viW, thr->viW);
   tl_assert(is_sane_SVal_C(sv));
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
   zsm_sset_range( a, szB, sv );
   Filter__clear_range( thr->filter, a, szB );
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
}

void libhb_srange_noaccess_NoFX ( Thr* thr, Addr a, SizeT szB )
{
   /* do nothing */
}

void libhb_srange_noaccess_AHAE ( Thr* thr, Addr a, SizeT szB )
{
   /* This really does put the requested range in NoAccess.  It's
      expensive though. */
   SVal sv = SVal_NOACCESS;
   tl_assert(is_sane_SVal_C(sv));
   zsm_sset_range( a, szB, sv );
   Filter__clear_range( thr->filter, a, szB );
}

void libhb_srange_untrack ( Thr* thr, Addr a, SizeT szB )
{
   SVal sv = SVal_NOACCESS;
   tl_assert(is_sane_SVal_C(sv));
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-before");
   zsm_sset_range( a, szB, sv );
   Filter__clear_range( thr->filter, a, szB );
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-after ");
}

Thread* libhb_get_Thr_hgthread ( Thr* thr ) {
   tl_assert(thr);
   return thr->hgthread;
}

void libhb_set_Thr_hgthread ( Thr* thr, Thread* hgthread ) {
   tl_assert(thr);
   thr->hgthread = hgthread;
}

void libhb_copy_shadow_state ( Thr* thr, Addr src, Addr dst, SizeT len )
{
   zsm_scopy_range(src, dst, len);
   Filter__clear_range( thr->filter, dst, len ); 
}

void libhb_maybe_GC ( void )
{
   event_map_maybe_GC();
   /* If there are still freelist entries available, no need for a
      GC. */
   if (vts_tab_freelist != VtsID_INVALID)
      return;
   /* So all the table entries are full, and we're having to expand
      the table.  But did we hit the threshhold point yet? */
   if (VG_(sizeXA)( vts_tab ) < vts_next_GC_at)
      return;
   vts_tab__do_GC( False/*don't show stats*/ );
}

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION END main library                                    //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/*--------------------------------------------------------------------*/
/*--- end                                             libhb_main.c ---*/
/*--------------------------------------------------------------------*/

C++程序 | 6677行 | 218.14 KB


/*--------------------------------------------------------------------*/
/*--- LibHB: a library for implementing and checking               ---*/
/*--- the happens-before relationship in concurrent programs.      ---*/
/*---                                                 libhb_main.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of LibHB, a library for implementing and checking
   the happens-before relationship in concurrent programs.

   Copyright (C) 2008-2011 OpenWorks Ltd
      info@open-works.co.uk

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "pub_tool_basics.h"
#include "pub_tool_libcassert.h"
#include "pub_tool_libcbase.h"
#include "pub_tool_libcprint.h"
#include "pub_tool_mallocfree.h"
#include "pub_tool_wordfm.h"
#include "pub_tool_sparsewa.h"
#include "pub_tool_xarray.h"
#include "pub_tool_oset.h"
#include "pub_tool_threadstate.h"
#include "pub_tool_aspacemgr.h"
#include "pub_tool_execontext.h"
#include "pub_tool_errormgr.h"
#include "pub_tool_options.h"        // VG_(clo_stats)
#include "hg_basics.h"
#include "hg_wordset.h"
#include "hg_lock_n_thread.h"
#include "hg_errors.h"

#include "libhb.h"


/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// Debugging #defines                                          //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* Check the sanity of shadow values in the core memory state
   machine.  Change #if 0 to #if 1 to enable this. */
#if 0
#  define CHECK_MSM 1
#else
#  define CHECK_MSM 0
#endif


/* Check sanity (reference counts, etc) in the conflicting access
   machinery.  Change #if 0 to #if 1 to enable this. */
#if 0
#  define CHECK_CEM 1
#else
#  define CHECK_CEM 0
#endif


/* Check sanity in the compressed shadow memory machinery,
   particularly in its caching innards.  Unfortunately there's no
   almost-zero-cost way to make them selectable at run time.  Hence
   set the #if 0 to #if 1 and rebuild if you want them. */
#if 0
#  define CHECK_ZSM 1  /* do sanity-check CacheLine stuff */
#  define inline __attribute__((noinline))
   /* probably want to ditch -fomit-frame-pointer too */
#else
#  define CHECK_ZSM 0   /* don't sanity-check CacheLine stuff */
#endif


/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: VtsID                                           //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* VtsIDs: Unique small-integer IDs for VTSs.  VtsIDs can't exceed 30
   bits, since they have to be packed into the lowest 30 bits of an
   SVal. */
typedef  UInt  VtsID;
#define VtsID_INVALID 0xFFFFFFFF



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: SVal                                            //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

typedef  ULong  SVal;

/* This value has special significance to the implementation, and callers
   may not store it in the shadow memory. */
#define SVal_INVALID (3ULL << 62)

/* This is the default value for shadow memory.  Initially the shadow
   memory contains no accessible areas and so all reads produce this
   value.  TODO: make this caller-defineable. */
#define SVal_NOACCESS (2ULL << 62)



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: ScalarTS                                        //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* Scalar Timestamp.  We have to store a lot of these, so there is
   some effort to make them as small as possible.  Logically they are
   a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
   We pack it into 64 bits by representing the Thr* using a ThrID, a
   small integer (18 bits), and a 46 bit integer for the timestamp
   number.  The 46/18 split is arbitary, but has the effect that
   Helgrind can only handle programs that create 2^18 or fewer threads
   over their entire lifetime, and have no more than 2^46 timestamp
   ticks (synchronisation operations on the same thread).

   This doesn't seem like much of a limitation.  2^46 ticks is
   7.06e+13, and if each tick (optimistically) takes the machine 1000
   cycles to process, then the minimum time to process that many ticks
   at a clock rate of 5 GHz is 162.9 days.  And that's doing nothing
   but VTS ticks, which isn't realistic.

   NB1: SCALARTS_N_THRBITS must be 29 or lower.  The obvious limit is
   32 since a ThrID is a UInt.  29 comes from the fact that
   'Thr_n_RCEC', which records information about old accesses, packs
   not only a ThrID but also 2+1 other bits (access size and
   writeness) in a UInt, hence limiting size to 32-(2+1) == 29.

   NB2: thrid values are issued upwards from 1024, and values less
   than that aren't valid.  This isn't per se necessary (any order
   will do, so long as they are unique), but it does help ensure they
   are less likely to get confused with the various other kinds of
   small-integer thread ids drifting around (eg, TId).  See also NB5.

   NB3: this probably also relies on the fact that Thr's are never
   deallocated -- they exist forever.  Hence the 1-1 mapping from
   Thr's to thrid values (set up in Thr__new) persists forever.

   NB4: temp_max_sized_VTS is allocated at startup and never freed.
   It is a maximum sized VTS, so has (1 << SCALARTS_N_TYMBITS)
   ScalarTSs.  So we can't make SCALARTS_N_THRBITS too large without
   making the memory use for this go sky-high.  With
   SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
   like an OK tradeoff.  If more than 256k threads need to be
   supported, we could change SCALARTS_N_THRBITS to 20, which would
   facilitate supporting 1 million threads at the cost of 8MB storage
   for temp_max_sized_VTS.

   NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
   ThrID == 0 to denote an empty Thr_n_RCEC record.  So ThrID == 0
   must never be a valid ThrID.  Given NB2 that's OK.
*/
#define SCALARTS_N_THRBITS 18  /* valid range: 11 to 29 inclusive */

#define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
typedef
   struct {
      ThrID thrid : SCALARTS_N_THRBITS;
      ULong tym   : SCALARTS_N_TYMBITS;
   }
   ScalarTS;

#define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: Filter                                          //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// baseline: 5, 9
#define FI_LINE_SZB_LOG2  5
#define FI_NUM_LINES_LOG2 10

#define FI_LINE_SZB       (1 << FI_LINE_SZB_LOG2)
#define FI_NUM_LINES      (1 << FI_NUM_LINES_LOG2)

#define FI_TAG_MASK        (~(Addr)(FI_LINE_SZB - 1))
#define FI_GET_TAG(_a)     ((_a) & FI_TAG_MASK)

#define FI_GET_LINENO(_a)  ( ((_a) >> FI_LINE_SZB_LOG2) \
                             & (Addr)(FI_NUM_LINES-1) )


/* In the lines, each 8 bytes are treated individually, and are mapped
   to a UShort.  Regardless of endianness of the underlying machine,
   bits 1 and 0 pertain to the lowest address and bits 15 and 14 to
   the highest address.

   Of each bit pair, the higher numbered bit is set if a R has been
   seen, so the actual layout is:

   15 14             ...  01 00

   R  W  for addr+7  ...  R  W  for addr+0

   So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
*/

/* tags are separated from lines.  tags are Addrs and are
   the base address of the line. */
typedef
   struct {
      UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
   }
   FiLine;

typedef
   struct {
      Addr   tags[FI_NUM_LINES];
      FiLine lines[FI_NUM_LINES];
   }
   Filter;



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: Thr, ULong_n_EC                                 //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// Records stacks for H1 history mechanism (DRD-style)
typedef
   struct { ULong ull; ExeContext* ec; }
   ULong_n_EC;


/* How many of the above records to collect for each thread?  Older
   ones are dumped when we run out of space.  62.5k requires 1MB per
   thread, since each ULong_n_EC record is 16 bytes long.  When more
   than N_KWs_N_STACKs_PER_THREAD are present, the older half are
   deleted to make space.  Hence in the worst case we will be able to
   produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
   Kw transitions (segments in this thread).  For the current setting
   that gives a guaranteed stack for at least the last 31.25k
   segments. */
#define N_KWs_N_STACKs_PER_THREAD 62500


struct _Thr {
   /* Current VTSs for this thread.  They change as we go along.  viR
      is the VTS to be used for reads, viW for writes.  Usually they
      are the same, but can differ when we deal with reader-writer
      locks.  It is always the case that 
         VtsID__cmpLEQ(viW,viR) == True
      that is, viW must be the same, or lagging behind, viR. */
   VtsID viR;
   VtsID viW;

   /* Is initially False, and is set to True after the thread really
      has done a low-level exit.  When True, we expect to never see
      any more memory references done by this thread. */
   Bool llexit_done;

   /* Is initially False, and is set to True after the thread has been
      joined with (reaped by some other thread).  After this point, we
      do not expect to see any uses of .viR or .viW, so it is safe to
      set them to VtsID_INVALID. */
   Bool joinedwith_done;

   /* A small integer giving a unique identity to this Thr.  See
      comments on the definition of ScalarTS for details. */
   ThrID thrid : SCALARTS_N_THRBITS;

   /* A filter that removes references for which we believe that
      msmcread/msmcwrite will not change the state, nor report a
      race. */
   Filter* filter;

   /* A pointer back to the top level Thread structure.  There is a
      1-1 mapping between Thread and Thr structures -- each Thr points
      at its corresponding Thread, and vice versa.  Really, Thr and
      Thread should be merged into a single structure. */
   Thread* hgthread;

   /* The ULongs (scalar Kws) in this accumulate in strictly
      increasing order, without duplicates.  This is important because
      we need to be able to find a given scalar Kw in this array
      later, by binary search. */
   XArray* /* ULong_n_EC */ local_Kws_n_stacks;
};



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: SO                                              //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// (UInt) `echo "Synchronisation object" | md5sum`
#define SO_MAGIC 0x56b3c5b0U

struct _SO {
   struct _SO* admin_prev;
   struct _SO* admin_next;
   VtsID viR; /* r-clock of sender */
   VtsID viW; /* w-clock of sender */
   UInt  magic;
};



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// Forward declarations                                        //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* fwds for
   Globals needed by other parts of the library.  These are set
   once at startup and then never changed. */
static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
static ExeContext* (*main_get_EC)( Thr* ) = NULL;

/* misc fn and data fwdses */
static void VtsID__rcinc ( VtsID ii );
static void VtsID__rcdec ( VtsID ii );

static inline Bool SVal__isC ( SVal s );
static inline VtsID SVal__unC_Rmin ( SVal s );
static inline VtsID SVal__unC_Wmin ( SVal s );
static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );

/* A double linked list of all the SO's. */
SO* admin_SO;



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION BEGIN compressed shadow memory                      //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

#ifndef __HB_ZSM_H
#define __HB_ZSM_H

/* Initialise the library.  Once initialised, it will (or may) call
   rcinc and rcdec in response to all the calls below, in order to
   allow the user to do reference counting on the SVals stored herein.
   It is important to understand, however, that due to internal
   caching, the reference counts are in general inaccurate, and can be
   both above or below the true reference count for an item.  In
   particular, the library may indicate that the reference count for
   an item is zero, when in fact it is not.

   To make the reference counting exact and therefore non-pointless,
   call zsm_flush_cache.  Immediately after it returns, the reference
   counts for all items, as deduced by the caller by observing calls
   to rcinc and rcdec, will be correct, and so any items with a zero
   reference count may be freed (or at least considered to be
   unreferenced by this library).
*/
static void zsm_init ( void(*rcinc)(SVal), void(*rcdec)(SVal) );

static void zsm_sset_range  ( Addr, SizeT, SVal );
static void zsm_scopy_range ( Addr, Addr, SizeT );
static void zsm_flush_cache ( void );

#endif /* ! __HB_ZSM_H */


/* Round a up to the next multiple of N.  N must be a power of 2 */
#define ROUNDUP(a, N)   ((a + N - 1) & ~(N-1))
/* Round a down to the next multiple of N.  N must be a power of 2 */
#define ROUNDDN(a, N)   ((a) & ~(N-1))



/* ------ User-supplied RC functions ------ */
static void(*rcinc)(SVal) = NULL;
static void(*rcdec)(SVal) = NULL;


/* ------ CacheLine ------ */

#define N_LINE_BITS      6 /* must be >= 3 */
#define N_LINE_ARANGE    (1 << N_LINE_BITS)
#define N_LINE_TREES     (N_LINE_ARANGE >> 3)

typedef
   struct {
      UShort descrs[N_LINE_TREES];
      SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
   }
   CacheLine;

#define TREE_DESCR_16_0 (1<<0)
#define TREE_DESCR_32_0 (1<<1)
#define TREE_DESCR_16_1 (1<<2)
#define TREE_DESCR_64   (1<<3)
#define TREE_DESCR_16_2 (1<<4)
#define TREE_DESCR_32_1 (1<<5)
#define TREE_DESCR_16_3 (1<<6)
#define TREE_DESCR_8_0  (1<<7)
#define TREE_DESCR_8_1  (1<<8)
#define TREE_DESCR_8_2  (1<<9)
#define TREE_DESCR_8_3  (1<<10)
#define TREE_DESCR_8_4  (1<<11)
#define TREE_DESCR_8_5  (1<<12)
#define TREE_DESCR_8_6  (1<<13)
#define TREE_DESCR_8_7  (1<<14)
#define TREE_DESCR_DTY  (1<<15)

typedef
   struct {
      SVal  dict[4]; /* can represent up to 4 diff values in the line */
      UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
                                      dict indexes */
      /* if dict[0] == SVal_INVALID then dict[1] is the index of the
         LineF to use, and dict[2..] are also SVal_INVALID. */
   }
   LineZ; /* compressed rep for a cache line */

typedef
   struct {
      Bool inUse;
      SVal w64s[N_LINE_ARANGE];
   }
   LineF; /* full rep for a cache line */

/* Shadow memory.
   Primary map is a WordFM Addr SecMap*.  
   SecMaps cover some page-size-ish section of address space and hold
     a compressed representation.
   CacheLine-sized chunks of SecMaps are copied into a Cache, being
   decompressed when moved into the cache and recompressed on the
   way out.  Because of this, the cache must operate as a writeback
   cache, not a writethrough one.

   Each SecMap must hold a power-of-2 number of CacheLines.  Hence
   N_SECMAP_BITS must >= N_LINE_BITS.
*/
#define N_SECMAP_BITS   13
#define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)

// # CacheLines held by a SecMap
#define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)

/* The data in the SecMap is held in the array of LineZs.  Each LineZ
   either carries the required data directly, in a compressed
   representation, or it holds (in .dict[0]) an index to the LineF in
   .linesF that holds the full representation.

   Currently-unused LineF's have their .inUse bit set to zero.
   Since each in-use LineF is referred to be exactly one LineZ,
   the number of .linesZ[] that refer to .linesF should equal
   the number of .linesF[] that have .inUse == True.

   RC obligations: the RCs presented to the user include exactly
   the values in:
   * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
   * F reps that are in use (.inUse == True)

   Hence the following actions at the following transitions are required:

   F rep: .inUse==True  -> .inUse==False        -- rcdec_LineF
   F rep: .inUse==False -> .inUse==True         -- rcinc_LineF
   Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
   Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
*/
typedef
   struct {
      UInt   magic;
      LineZ  linesZ[N_SECMAP_ZLINES];
      LineF* linesF;
      UInt   linesF_size;
   }
   SecMap;

#define SecMap_MAGIC   0x571e58cbU

static inline Bool is_sane_SecMap ( SecMap* sm ) {
   return sm != NULL && sm->magic == SecMap_MAGIC;
}

/* ------ Cache ------ */

#define N_WAY_BITS 16
#define N_WAY_NENT (1 << N_WAY_BITS)

/* Each tag is the address of the associated CacheLine, rounded down
   to a CacheLine address boundary.  A CacheLine size must be a power
   of 2 and must be 8 or more.  Hence an easy way to initialise the
   cache so it is empty is to set all the tag values to any value % 8
   != 0, eg 1.  This means all queries in the cache initially miss.
   It does however require us to detect and not writeback, any line
   with a bogus tag. */
typedef
   struct {
      CacheLine lyns0[N_WAY_NENT];
      Addr      tags0[N_WAY_NENT];
   }
   Cache;

static inline Bool is_valid_scache_tag ( Addr tag ) {
   /* a valid tag should be naturally aligned to the start of
      a CacheLine. */
   return 0 == (tag & (N_LINE_ARANGE - 1));
}


/* --------- Primary data structures --------- */

/* Shadow memory primary map */
static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
static Cache   cache_shmem;


static UWord stats__secmaps_search       = 0; // # SM finds
static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
static UWord stats__secmap_linesF_allocd = 0; // # LineF's issued
static UWord stats__secmap_linesF_bytes  = 0; //  .. using this much storage
static UWord stats__secmap_iterator_steppings = 0; // # calls to stepSMIter
static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
static UWord stats__cache_F_fetches      = 0; // # F lines fetched
static UWord stats__cache_F_wbacks       = 0; // # F lines written back
static UWord stats__cache_invals         = 0; // # cache invals
static UWord stats__cache_flushes        = 0; // # cache flushes
static UWord stats__cache_totrefs        = 0; // # total accesses
static UWord stats__cache_totmisses      = 0; // # misses
static ULong stats__cache_make_New_arange = 0; // total arange made New
static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
static UWord stats__cline_cread64s       = 0; // # calls to s_m_read64
static UWord stats__cline_cread32s       = 0; // # calls to s_m_read32
static UWord stats__cline_cread16s       = 0; // # calls to s_m_read16
static UWord stats__cline_cread08s       = 0; // # calls to s_m_read8
static UWord stats__cline_cwrite64s      = 0; // # calls to s_m_write64
static UWord stats__cline_cwrite32s      = 0; // # calls to s_m_write32
static UWord stats__cline_cwrite16s      = 0; // # calls to s_m_write16
static UWord stats__cline_cwrite08s      = 0; // # calls to s_m_write8
static UWord stats__cline_sread08s       = 0; // # calls to s_m_set8
static UWord stats__cline_swrite08s      = 0; // # calls to s_m_get8
static UWord stats__cline_swrite16s      = 0; // # calls to s_m_get8
static UWord stats__cline_swrite32s      = 0; // # calls to s_m_get8
static UWord stats__cline_swrite64s      = 0; // # calls to s_m_get8
static UWord stats__cline_scopy08s       = 0; // # calls to s_m_copy8
static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
static UWord stats__vts__tick            = 0; // # calls to VTS__tick
static UWord stats__vts__join            = 0; // # calls to VTS__join
static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural

// # calls to VTS__cmp_structural w/ slow case
static UWord stats__vts__cmp_structural_slow = 0;

// # calls to VTS__indexAt_SLOW
static UWord stats__vts__indexat_slow = 0;

// # calls to vts_set__find__or__clone_and_add
static UWord stats__vts_set__focaa    = 0;

// # calls to vts_set__find__or__clone_and_add that lead to an
// allocation
static UWord stats__vts_set__focaa_a  = 0;


static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
   return a & ~(N_SECMAP_ARANGE - 1);
}
static inline UWord shmem__get_SecMap_offset ( Addr a ) {
   return a & (N_SECMAP_ARANGE - 1);
}


/*----------------------------------------------------------------*/
/*--- map_shmem :: WordFM Addr SecMap                          ---*/
/*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
/*----------------------------------------------------------------*/

/*--------------- SecMap allocation --------------- */

static HChar* shmem__bigchunk_next = NULL;
static HChar* shmem__bigchunk_end1 = NULL;

static void* shmem__bigchunk_alloc ( SizeT n )
{
   const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
   tl_assert(n > 0);
   n = VG_ROUNDUP(n, 16);
   tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
   tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
             <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
   if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
      if (0)
      VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
                  (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
      shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
      if (shmem__bigchunk_next == NULL)
         VG_(out_of_memory_NORETURN)(
            "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
      shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
   }
   tl_assert(shmem__bigchunk_next);
   tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
   tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
   shmem__bigchunk_next += n;
   return shmem__bigchunk_next - n;
}

static SecMap* shmem__alloc_SecMap ( void )
{
   Word    i, j;
   SecMap* sm = shmem__bigchunk_alloc( sizeof(SecMap) );
   if (0) VG_(printf)("alloc_SecMap %p\n",sm);
   tl_assert(sm);
   sm->magic = SecMap_MAGIC;
   for (i = 0; i < N_SECMAP_ZLINES; i++) {
      sm->linesZ[i].dict[0] = SVal_NOACCESS;
      sm->linesZ[i].dict[1] = SVal_INVALID;
      sm->linesZ[i].dict[2] = SVal_INVALID;
      sm->linesZ[i].dict[3] = SVal_INVALID;
      for (j = 0; j < N_LINE_ARANGE/4; j++)
         sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
   }
   sm->linesF      = NULL;
   sm->linesF_size = 0;
   stats__secmaps_allocd++;
   stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
   stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
   stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
   return sm;
}

typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };

static SecMap* shmem__find_SecMap ( Addr ga ) 
{
   SecMap* sm    = NULL;
   Addr    gaKey = shmem__round_to_SecMap_base(ga);
   // Cache
   stats__secmaps_search++;
   if (LIKELY(gaKey == smCache[0].gaKey))
      return smCache[0].sm;
   if (LIKELY(gaKey == smCache[1].gaKey)) {
      SMCacheEnt tmp = smCache[0];
      smCache[0] = smCache[1];
      smCache[1] = tmp;
      return smCache[0].sm;
   }
   if (gaKey == smCache[2].gaKey) {
      SMCacheEnt tmp = smCache[1];
      smCache[1] = smCache[2];
      smCache[2] = tmp;
      return smCache[1].sm;
   }
   // end Cache
   stats__secmaps_search_slow++;
   if (VG_(lookupFM)( map_shmem,
                      NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
      tl_assert(sm != NULL);
      smCache[2] = smCache[1];
      smCache[1] = smCache[0];
      smCache[0].gaKey = gaKey;
      smCache[0].sm    = sm;
   } else {
      tl_assert(sm == NULL);
   }
   return sm;
}

static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
{
   SecMap* sm = shmem__find_SecMap ( ga );
   if (LIKELY(sm)) {
      return sm;
   } else {
      /* create a new one */
      Addr gaKey = shmem__round_to_SecMap_base(ga);
      sm = shmem__alloc_SecMap();
      tl_assert(sm);
      VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
      return sm;
   }
}


/* ------------ LineF and LineZ related ------------ */

static void rcinc_LineF ( LineF* lineF ) {
   UWord i;
   tl_assert(lineF->inUse);
   for (i = 0; i < N_LINE_ARANGE; i++)
      rcinc(lineF->w64s[i]);
}

static void rcdec_LineF ( LineF* lineF ) {
   UWord i;
   tl_assert(lineF->inUse);
   for (i = 0; i < N_LINE_ARANGE; i++)
      rcdec(lineF->w64s[i]);
}

static void rcinc_LineZ ( LineZ* lineZ ) {
   tl_assert(lineZ->dict[0] != SVal_INVALID);
   rcinc(lineZ->dict[0]);
   if (lineZ->dict[1] != SVal_INVALID) rcinc(lineZ->dict[1]);
   if (lineZ->dict[2] != SVal_INVALID) rcinc(lineZ->dict[2]);
   if (lineZ->dict[3] != SVal_INVALID) rcinc(lineZ->dict[3]);
}

static void rcdec_LineZ ( LineZ* lineZ ) {
   tl_assert(lineZ->dict[0] != SVal_INVALID);
   rcdec(lineZ->dict[0]);
   if (lineZ->dict[1] != SVal_INVALID) rcdec(lineZ->dict[1]);
   if (lineZ->dict[2] != SVal_INVALID) rcdec(lineZ->dict[2]);
   if (lineZ->dict[3] != SVal_INVALID) rcdec(lineZ->dict[3]);
}

inline
static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
   Word bix, shft, mask, prep;
   tl_assert(ix >= 0);
   bix  = ix >> 2;
   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
   mask = 3 << shft;
   prep = b2 << shft;
   arr[bix] = (arr[bix] & ~mask) | prep;
}

inline
static UWord read_twobit_array ( UChar* arr, UWord ix ) {
   Word bix, shft;
   tl_assert(ix >= 0);
   bix  = ix >> 2;
   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
   return (arr[bix] >> shft) & 3;
}

/* Given address 'tag', find either the Z or F line containing relevant
   data, so it can be read into the cache.
*/
static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
                                  /*OUT*/LineF** fp, Addr tag ) {
   LineZ* lineZ;
   LineF* lineF;
   UWord   zix;
   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
   UWord   smoff = shmem__get_SecMap_offset(tag);
   /* since smoff is derived from a valid tag, it should be
      cacheline-aligned. */
   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
   zix = smoff >> N_LINE_BITS;
   tl_assert(zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];
   lineF = NULL;
   if (lineZ->dict[0] == SVal_INVALID) {
      UInt fix = (UInt)lineZ->dict[1];
      tl_assert(sm->linesF);
      tl_assert(sm->linesF_size > 0);
      tl_assert(fix >= 0 && fix < sm->linesF_size);
      lineF = &sm->linesF[fix];
      tl_assert(lineF->inUse);
      lineZ = NULL;
   }
   *zp = lineZ;
   *fp = lineF;
}

/* Given address 'tag', return the relevant SecMap and the index of
   the LineZ within it, in the expectation that the line is to be
   overwritten.  Regardless of whether 'tag' is currently associated
   with a Z or F representation, to rcdec on the current
   representation, in recognition of the fact that the contents are
   just about to be overwritten. */
static __attribute__((noinline))
void find_Z_for_writing ( /*OUT*/SecMap** smp,
                          /*OUT*/Word* zixp,
                          Addr tag ) {
   LineZ* lineZ;
   LineF* lineF;
   UWord   zix;
   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
   UWord   smoff = shmem__get_SecMap_offset(tag);
   /* since smoff is derived from a valid tag, it should be
      cacheline-aligned. */
   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
   zix = smoff >> N_LINE_BITS;
   tl_assert(zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];
   lineF = NULL;
   /* re RCs, we are freeing up this LineZ/LineF so that new data can
      be parked in it.  Hence have to rcdec it accordingly. */
   /* If lineZ has an associated lineF, free it up. */
   if (lineZ->dict[0] == SVal_INVALID) {
      UInt fix = (UInt)lineZ->dict[1];
      tl_assert(sm->linesF);
      tl_assert(sm->linesF_size > 0);
      tl_assert(fix >= 0 && fix < sm->linesF_size);
      lineF = &sm->linesF[fix];
      tl_assert(lineF->inUse);
      rcdec_LineF(lineF);
      lineF->inUse = False;
   } else {
      rcdec_LineZ(lineZ);
   }
   *smp  = sm;
   *zixp = zix;
}

static __attribute__((noinline))
void alloc_F_for_writing ( /*MOD*/SecMap* sm, /*OUT*/Word* fixp ) {
   UInt        i, new_size;
   LineF* nyu;

   if (sm->linesF) {
      tl_assert(sm->linesF_size > 0);
   } else {
      tl_assert(sm->linesF_size == 0);
   }

   if (sm->linesF) {
      for (i = 0; i < sm->linesF_size; i++) {
         if (!sm->linesF[i].inUse) {
            *fixp = (Word)i;
            return;
         }
      }
   }

   /* No free F line found.  Expand existing array and try again. */
   new_size = sm->linesF_size==0 ? 1 : 2 * sm->linesF_size;
   nyu      = HG_(zalloc)( "libhb.aFfw.1 (LineF storage)",
                           new_size * sizeof(LineF) );
   tl_assert(nyu);

   stats__secmap_linesF_allocd += (new_size - sm->linesF_size);
   stats__secmap_linesF_bytes  += (new_size - sm->linesF_size)
                                  * sizeof(LineF);

   if (0)
   VG_(printf)("SM %p: expand F array from %d to %d\n", 
               sm, (Int)sm->linesF_size, new_size);

   for (i = 0; i < new_size; i++)
      nyu[i].inUse = False;

   if (sm->linesF) {
      for (i = 0; i < sm->linesF_size; i++) {
         tl_assert(sm->linesF[i].inUse);
         nyu[i] = sm->linesF[i];
      }
      VG_(memset)(sm->linesF, 0, sm->linesF_size * sizeof(LineF) );
      HG_(free)(sm->linesF);
   }

   sm->linesF      = nyu;
   sm->linesF_size = new_size;

   for (i = 0; i < sm->linesF_size; i++) {
      if (!sm->linesF[i].inUse) {
         *fixp = (Word)i;
         return;
      }
    }

    /*NOTREACHED*/
    tl_assert(0);
}


/* ------------ CacheLine and implicit-tree related ------------ */

__attribute__((unused))
static void pp_CacheLine ( CacheLine* cl ) {
   Word i;
   if (!cl) {
      VG_(printf)("%s","pp_CacheLine(NULL)\n");
      return;
   }
   for (i = 0; i < N_LINE_TREES; i++) 
      VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
   for (i = 0; i < N_LINE_ARANGE; i++) 
      VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
}

static UChar descr_to_validbits ( UShort descr )
{
   /* a.k.a Party Time for gcc's constant folder */
#  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
                b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
             ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
                          ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
                          ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
                          ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
                          ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
                          ( (b16_2) << 4)  | ( (b64)   << 3)  | \
                          ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
                          ( (b16_0) << 0) ) )

#  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
             ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
                         ( (bit5) << 5) | ( (bit4) << 4) | \
                         ( (bit3) << 3) | ( (bit2) << 2) | \
                         ( (bit1) << 1) | ( (bit0) << 0) ) )

   /* these should all get folded out at compile time */
   tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
   tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);

   switch (descr) {
   /*
              +--------------------------------- TREE_DESCR_8_7
              |             +------------------- TREE_DESCR_8_0
              |             |  +---------------- TREE_DESCR_16_3
              |             |  | +-------------- TREE_DESCR_32_1
              |             |  | | +------------ TREE_DESCR_16_2
              |             |  | | |  +--------- TREE_DESCR_64
              |             |  | | |  |  +------ TREE_DESCR_16_1
              |             |  | | |  |  | +---- TREE_DESCR_32_0
              |             |  | | |  |  | | +-- TREE_DESCR_16_0
              |             |  | | |  |  | | |
              |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
   case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
                                                 return BYTE(1,1,1,1,1,1,1,1);
   case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
                                                 return BYTE(1,1,0,1,1,1,1,1);
   case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */ 
                                                 return BYTE(0,1,1,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
                                                 return BYTE(0,1,0,1,1,1,1,1);

   case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */ 
                                                 return BYTE(1,1,1,1,1,1,0,1);
   case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
                                                 return BYTE(1,1,0,1,1,1,0,1);
   case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
                                                 return BYTE(0,1,1,1,1,1,0,1);
   case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
                                                 return BYTE(0,1,0,1,1,1,0,1);

   case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
                                                 return BYTE(1,1,1,1,0,1,1,1);
   case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
                                                 return BYTE(1,1,0,1,0,1,1,1);
   case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
                                                 return BYTE(0,1,1,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
                                                 return BYTE(0,1,0,1,0,1,1,1);

   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
                                                 return BYTE(1,1,1,1,0,1,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
                                                 return BYTE(1,1,0,1,0,1,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
                                                 return BYTE(0,1,1,1,0,1,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
                                                 return BYTE(0,1,0,1,0,1,0,1);

   case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
                                                 return BYTE(0,0,0,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
                                                 return BYTE(0,0,0,1,1,1,0,1);
   case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
                                                 return BYTE(0,0,0,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
                                                 return BYTE(0,0,0,1,0,1,0,1);

   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
                                                 return BYTE(1,1,1,1,0,0,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
                                                 return BYTE(1,1,0,1,0,0,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
                                                 return BYTE(0,1,1,1,0,0,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
                                                 return BYTE(0,1,0,1,0,0,0,1);

   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
                                                 return BYTE(0,0,0,1,0,0,0,1);

   case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
                                                 return BYTE(0,0,0,0,0,0,0,1);

   default: return BYTE(0,0,0,0,0,0,0,0); 
                   /* INVALID - any valid descr produces at least one
                      valid bit in tree[0..7]*/
   }
   /* NOTREACHED*/
   tl_assert(0);

#  undef DESCR
#  undef BYTE
}

__attribute__((unused))
static Bool is_sane_Descr ( UShort descr ) {
   return descr_to_validbits(descr) != 0;
}

static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
   VG_(sprintf)(dst, 
                "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
                (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
                (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
                (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
   );
}
static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
   VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
                     (Int)((byte & 128) ? 1 : 0),
                     (Int)((byte &  64) ? 1 : 0),
                     (Int)((byte &  32) ? 1 : 0),
                     (Int)((byte &  16) ? 1 : 0),
                     (Int)((byte &   8) ? 1 : 0),
                     (Int)((byte &   4) ? 1 : 0),
                     (Int)((byte &   2) ? 1 : 0),
                     (Int)((byte &   1) ? 1 : 0)
   );
}

static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
   Word  i;
   UChar validbits = descr_to_validbits(descr);
   HChar buf[128], buf2[128];
   if (validbits == 0)
      goto bad;
   for (i = 0; i < 8; i++) {
      if (validbits & (1<<i)) {
         if (tree[i] == SVal_INVALID)
            goto bad;
      } else {
         if (tree[i] != SVal_INVALID)
            goto bad;
      }
   }
   return True;
  bad:
   sprintf_Descr( buf, descr );
   sprintf_Byte( buf2, validbits );
   VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
   VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
   VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
   for (i = 0; i < 8; i++)
      VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
   VG_(printf)("%s","}\n");
   return 0;
}

static Bool is_sane_CacheLine ( CacheLine* cl )
{
   Word tno, cloff;

   if (!cl) goto bad;

   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      UShort descr = cl->descrs[tno];
      SVal*  tree  = &cl->svals[cloff];
      if (!is_sane_Descr_and_Tree(descr, tree))
         goto bad;
   }
   tl_assert(cloff == N_LINE_ARANGE);
   return True;
  bad:
   pp_CacheLine(cl);
   return False;
}

static UShort normalise_tree ( /*MOD*/SVal* tree )
{
   UShort descr;
   /* pre: incoming tree[0..7] does not have any invalid shvals, in
      particular no zeroes. */
   if (UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
                || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
                || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
                || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
      tl_assert(0);
   
   descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
           | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
           | TREE_DESCR_8_1 | TREE_DESCR_8_0;
   /* build 16-bit layer */
   if (tree[1] == tree[0]) {
      tree[1] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
      descr |= TREE_DESCR_16_0;
   }
   if (tree[3] == tree[2]) {
      tree[3] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
      descr |= TREE_DESCR_16_1;
   }
   if (tree[5] == tree[4]) {
      tree[5] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
      descr |= TREE_DESCR_16_2;
   }
   if (tree[7] == tree[6]) {
      tree[7] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
      descr |= TREE_DESCR_16_3;
   }
   /* build 32-bit layer */
   if (tree[2] == tree[0]
       && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
      tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
      descr |= TREE_DESCR_32_0;
   }
   if (tree[6] == tree[4]
       && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
      tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
      descr |= TREE_DESCR_32_1;
   }
   /* build 64-bit layer */
   if (tree[4] == tree[0]
       && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
      tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
      descr |= TREE_DESCR_64;
   }
   return descr;
}

/* This takes a cacheline where all the data is at the leaves
   (w8[..]) and builds a correctly normalised tree. */
static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
{
   Word tno, cloff;
   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      SVal* tree = &cl->svals[cloff];
      cl->descrs[tno] = normalise_tree( tree );
   }
   tl_assert(cloff == N_LINE_ARANGE);
   if (CHECK_ZSM)
      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   stats__cline_normalises++;
}


typedef struct { UChar count; SVal sval; } CountedSVal;

static
void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
                               /*OUT*/Word* dstUsedP,
                               Word nDst, CacheLine* src )
{
   Word  tno, cloff, dstUsed;

   tl_assert(nDst == N_LINE_ARANGE);
   dstUsed = 0;

   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      UShort descr = src->descrs[tno];
      SVal*  tree  = &src->svals[cloff];

      /* sequentialise the tree described by (descr,tree). */
#     define PUT(_n,_v)                                \
         do { dst[dstUsed  ].count = (_n);             \
              dst[dstUsed++].sval  = (_v);             \
         } while (0)

      /* byte 0 */
      if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
      if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
      if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
      if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
      /* byte 1 */
      if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
      /* byte 2 */
      if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
      if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
      /* byte 3 */
      if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
      /* byte 4 */
      if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
      if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
      if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
      /* byte 5 */
      if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
      /* byte 6 */
      if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
      if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
      /* byte 7 */
      if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);

#     undef PUT
      /* END sequentialise the tree described by (descr,tree). */

   }
   tl_assert(cloff == N_LINE_ARANGE);
   tl_assert(dstUsed <= nDst);

   *dstUsedP = dstUsed;
}

/* Write the cacheline 'wix' to backing store.  Where it ends up
   is determined by its tag field. */
static __attribute__((noinline)) void cacheline_wback ( UWord wix )
{
   Word        i, j, k, m;
   Addr        tag;
   SecMap*     sm;
   CacheLine*  cl;
   LineZ* lineZ;
   LineF* lineF;
   Word        zix, fix, csvalsUsed;
   CountedSVal csvals[N_LINE_ARANGE];
   SVal        sv;

   if (0)
   VG_(printf)("scache wback line %d\n", (Int)wix);

   tl_assert(wix >= 0 && wix < N_WAY_NENT);

   tag =  cache_shmem.tags0[wix];
   cl  = &cache_shmem.lyns0[wix];

   /* The cache line may have been invalidated; if so, ignore it. */
   if (!is_valid_scache_tag(tag))
      return;

   /* Where are we going to put it? */
   sm         = NULL;
   lineZ      = NULL;
   lineF      = NULL;
   zix = fix = -1;

   /* find the Z line to write in and rcdec it or the associated F
      line. */
   find_Z_for_writing( &sm, &zix, tag );

   tl_assert(sm);
   tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];

   /* Generate the data to be stored */
   if (CHECK_ZSM)
      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */

   csvalsUsed = -1;
   sequentialise_CacheLine( csvals, &csvalsUsed, 
                            N_LINE_ARANGE, cl );
   tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
   if (0) VG_(printf)("%lu ", csvalsUsed);

   lineZ->dict[0] = lineZ->dict[1] 
                  = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;

   /* i indexes actual shadow values, k is cursor in csvals */
   i = 0;
   for (k = 0; k < csvalsUsed; k++) {

      sv = csvals[k].sval;
      if (CHECK_ZSM)
         tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
      /* do we already have it? */
      if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
      if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
      if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
      if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
      /* no.  look for a free slot. */
      if (CHECK_ZSM)
         tl_assert(sv != SVal_INVALID);
      if (lineZ->dict[0] 
          == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
      if (lineZ->dict[1]
          == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
      if (lineZ->dict[2]
          == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
      if (lineZ->dict[3]
          == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
      break; /* we'll have to use the f rep */
     dict_ok:
      m = csvals[k].count;
      if (m == 8) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         write_twobit_array( lineZ->ix2s, i+1, j );
         write_twobit_array( lineZ->ix2s, i+2, j );
         write_twobit_array( lineZ->ix2s, i+3, j );
         write_twobit_array( lineZ->ix2s, i+4, j );
         write_twobit_array( lineZ->ix2s, i+5, j );
         write_twobit_array( lineZ->ix2s, i+6, j );
         write_twobit_array( lineZ->ix2s, i+7, j );
         i += 8;
      }
      else if (m == 4) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         write_twobit_array( lineZ->ix2s, i+1, j );
         write_twobit_array( lineZ->ix2s, i+2, j );
         write_twobit_array( lineZ->ix2s, i+3, j );
         i += 4;
      }
      else if (m == 1) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         i += 1;
      }
      else if (m == 2) {
         write_twobit_array( lineZ->ix2s, i+0, j );
         write_twobit_array( lineZ->ix2s, i+1, j );
         i += 2;
      }
      else {
         tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
      }

   }

   if (LIKELY(i == N_LINE_ARANGE)) {
      /* Construction of the compressed representation was
         successful. */
      rcinc_LineZ(lineZ);
      stats__cache_Z_wbacks++;
   } else {
      /* Cannot use the compressed(z) representation.  Use the full(f)
         rep instead. */
      tl_assert(i >= 0 && i < N_LINE_ARANGE);
      alloc_F_for_writing( sm, &fix );
      tl_assert(sm->linesF);
      tl_assert(sm->linesF_size > 0);
      tl_assert(fix >= 0 && fix < (Word)sm->linesF_size);
      lineF = &sm->linesF[fix];
      tl_assert(!lineF->inUse);
      lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
      lineZ->dict[1] = (SVal)fix;
      lineF->inUse = True;
      i = 0;
      for (k = 0; k < csvalsUsed; k++) {
         if (CHECK_ZSM)
            tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
         sv = csvals[k].sval;
         if (CHECK_ZSM)
            tl_assert(sv != SVal_INVALID);
         for (m = csvals[k].count; m > 0; m--) {
            lineF->w64s[i] = sv;
            i++;
         }
      }
      tl_assert(i == N_LINE_ARANGE);
      rcinc_LineF(lineF);
      stats__cache_F_wbacks++;
   }
}

/* Fetch the cacheline 'wix' from the backing store.  The tag
   associated with 'wix' is assumed to have already been filled in;
   hence that is used to determine where in the backing store to read
   from. */
static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
{
   Word       i;
   Addr       tag;
   CacheLine* cl;
   LineZ*     lineZ;
   LineF*     lineF;

   if (0)
   VG_(printf)("scache fetch line %d\n", (Int)wix);

   tl_assert(wix >= 0 && wix < N_WAY_NENT);

   tag =  cache_shmem.tags0[wix];
   cl  = &cache_shmem.lyns0[wix];

   /* reject nonsense requests */
   tl_assert(is_valid_scache_tag(tag));

   lineZ = NULL;
   lineF = NULL;
   find_ZF_for_reading( &lineZ, &lineF, tag );
   tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );

   /* expand the data into the bottom layer of the tree, then get
      cacheline_normalise to build the descriptor array. */
   if (lineF) {
      tl_assert(lineF->inUse);
      for (i = 0; i < N_LINE_ARANGE; i++) {
         cl->svals[i] = lineF->w64s[i];
      }
      stats__cache_F_fetches++;
   } else {
      for (i = 0; i < N_LINE_ARANGE; i++) {
         SVal sv;
         UWord ix = read_twobit_array( lineZ->ix2s, i );
         /* correct, but expensive: tl_assert(ix >= 0 && ix <= 3); */
         sv = lineZ->dict[ix];
         tl_assert(sv != SVal_INVALID);
         cl->svals[i] = sv;
      }
      stats__cache_Z_fetches++;
   }
   normalise_CacheLine( cl );
}

static void shmem__invalidate_scache ( void ) {
   Word wix;
   if (0) VG_(printf)("%s","scache inval\n");
   tl_assert(!is_valid_scache_tag(1));
   for (wix = 0; wix < N_WAY_NENT; wix++) {
      cache_shmem.tags0[wix] = 1/*INVALID*/;
   }
   stats__cache_invals++;
}

static void shmem__flush_and_invalidate_scache ( void ) {
   Word wix;
   Addr tag;
   if (0) VG_(printf)("%s","scache flush and invalidate\n");
   tl_assert(!is_valid_scache_tag(1));
   for (wix = 0; wix < N_WAY_NENT; wix++) {
      tag = cache_shmem.tags0[wix];
      if (tag == 1/*INVALID*/) {
         /* already invalid; nothing to do */
      } else {
         tl_assert(is_valid_scache_tag(tag));
         cacheline_wback( wix );
      }
      cache_shmem.tags0[wix] = 1/*INVALID*/;
   }
   stats__cache_flushes++;
   stats__cache_invals++;
}


static inline Bool aligned16 ( Addr a ) {
   return 0 == (a & 1);
}
static inline Bool aligned32 ( Addr a ) {
   return 0 == (a & 3);
}
static inline Bool aligned64 ( Addr a ) {
   return 0 == (a & 7);
}
static inline UWord get_cacheline_offset ( Addr a ) {
   return (UWord)(a & (N_LINE_ARANGE - 1));
}
static inline Addr cacheline_ROUNDUP ( Addr a ) {
   return ROUNDUP(a, N_LINE_ARANGE);
}
static inline Addr cacheline_ROUNDDN ( Addr a ) {
   return ROUNDDN(a, N_LINE_ARANGE);
}
static inline UWord get_treeno ( Addr a ) {
   return get_cacheline_offset(a) >> 3;
}
static inline UWord get_tree_offset ( Addr a ) {
   return a & 7;
}

static __attribute__((noinline))
       CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
static inline CacheLine* get_cacheline ( Addr a )
{
   /* tag is 'a' with the in-line offset masked out, 
      eg a[31]..a[4] 0000 */
   Addr       tag = a & ~(N_LINE_ARANGE - 1);
   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
   stats__cache_totrefs++;
   if (LIKELY(tag == cache_shmem.tags0[wix])) {
      return &cache_shmem.lyns0[wix];
   } else {
      return get_cacheline_MISS( a );
   }
}

static __attribute__((noinline))
       CacheLine* get_cacheline_MISS ( Addr a )
{
   /* tag is 'a' with the in-line offset masked out, 
      eg a[31]..a[4] 0000 */

   CacheLine* cl;
   Addr*      tag_old_p;
   Addr       tag = a & ~(N_LINE_ARANGE - 1);
   UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);

   tl_assert(tag != cache_shmem.tags0[wix]);

   /* Dump the old line into the backing store. */
   stats__cache_totmisses++;

   cl        = &cache_shmem.lyns0[wix];
   tag_old_p = &cache_shmem.tags0[wix];

   if (is_valid_scache_tag( *tag_old_p )) {
      /* EXPENSIVE and REDUNDANT: callee does it */
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
      cacheline_wback( wix );
   }
   /* and reload the new one */
   *tag_old_p = tag;
   cacheline_fetch( wix );
   if (CHECK_ZSM)
      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   return cl;
}

static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   stats__cline_64to32pulldown++;
   switch (toff) {
      case 0: case 4:
         tl_assert(descr & TREE_DESCR_64);
         tree[4] = tree[0];
         descr &= ~TREE_DESCR_64;
         descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
         break;
      default:
         tl_assert(0);
   }
   return descr;
}

static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   stats__cline_32to16pulldown++;
   switch (toff) {
      case 0: case 2:
         if (!(descr & TREE_DESCR_32_0)) {
            descr = pulldown_to_32(tree, 0, descr);
         }
         tl_assert(descr & TREE_DESCR_32_0);
         tree[2] = tree[0];
         descr &= ~TREE_DESCR_32_0;
         descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
         break;
      case 4: case 6:
         if (!(descr & TREE_DESCR_32_1)) {
            descr = pulldown_to_32(tree, 4, descr);
         }
         tl_assert(descr & TREE_DESCR_32_1);
         tree[6] = tree[4];
         descr &= ~TREE_DESCR_32_1;
         descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
         break;
      default:
         tl_assert(0);
   }
   return descr;
}

static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   stats__cline_16to8pulldown++;
   switch (toff) {
      case 0: case 1:
         if (!(descr & TREE_DESCR_16_0)) {
            descr = pulldown_to_16(tree, 0, descr);
         }
         tl_assert(descr & TREE_DESCR_16_0);
         tree[1] = tree[0];
         descr &= ~TREE_DESCR_16_0;
         descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
         break;
      case 2: case 3:
         if (!(descr & TREE_DESCR_16_1)) {
            descr = pulldown_to_16(tree, 2, descr);
         }
         tl_assert(descr & TREE_DESCR_16_1);
         tree[3] = tree[2];
         descr &= ~TREE_DESCR_16_1;
         descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
         break;
      case 4: case 5:
         if (!(descr & TREE_DESCR_16_2)) {
            descr = pulldown_to_16(tree, 4, descr);
         }
         tl_assert(descr & TREE_DESCR_16_2);
         tree[5] = tree[4];
         descr &= ~TREE_DESCR_16_2;
         descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
         break;
      case 6: case 7:
         if (!(descr & TREE_DESCR_16_3)) {
            descr = pulldown_to_16(tree, 6, descr);
         }
         tl_assert(descr & TREE_DESCR_16_3);
         tree[7] = tree[6];
         descr &= ~TREE_DESCR_16_3;
         descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
         break;
      default:
         tl_assert(0);
   }
   return descr;
}


static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
   UShort mask;
   switch (toff) {
      case 0:
         mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_16_0;
         break;
      case 2:
         mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_16_1;
         break;
      case 4:
         mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_16_2;
         break;
      case 6:
         mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_16_3;
         break;
      default:
         tl_assert(0);
   }
   return descr;
}

static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
   UShort mask;
   switch (toff) {
      case 0:
         if (!(descr & TREE_DESCR_16_0))
            descr = pullup_descr_to_16(descr, 0);
         if (!(descr & TREE_DESCR_16_1))
            descr = pullup_descr_to_16(descr, 2);
         mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_32_0;
         break;
      case 4:
         if (!(descr & TREE_DESCR_16_2))
            descr = pullup_descr_to_16(descr, 4);
         if (!(descr & TREE_DESCR_16_3))
            descr = pullup_descr_to_16(descr, 6);
         mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
         tl_assert( (descr & mask) == mask );
         descr &= ~mask;
         descr |= TREE_DESCR_32_1;
         break;
      default:
         tl_assert(0);
   }
   return descr;
}

static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
   switch (toff) {
      case 0: case 4:
         return 0 != (descr & TREE_DESCR_64);
      default:
         tl_assert(0);
   }
}

static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
   switch (toff) {
      case 0:
         return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
      case 2:
         return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
      case 4:
         return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
      case 6:
         return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
      default:
         tl_assert(0);
   }
}

/* ------------ Cache management ------------ */

static void zsm_flush_cache ( void )
{
   shmem__flush_and_invalidate_scache();
}


static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) )
{
   tl_assert( sizeof(UWord) == sizeof(Addr) );

   rcinc = p_rcinc;
   rcdec = p_rcdec;

   tl_assert(map_shmem == NULL);
   map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
                           HG_(free), 
                           NULL/*unboxed UWord cmp*/);
   tl_assert(map_shmem != NULL);
   shmem__invalidate_scache();

   /* a SecMap must contain an integral number of CacheLines */
   tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
   /* also ... a CacheLine holds an integral number of trees */
   tl_assert(0 == (N_LINE_ARANGE % 8));
}

/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION END compressed shadow memory                        //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION BEGIN vts primitives                                //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////


/* There's a 1-1 mapping between Thr and ThrIDs -- the latter merely
   being compact stand-ins for Thr*'s.  Use these functions to map
   between them. */
static ThrID Thr__to_ThrID   ( Thr*  thr   ); /* fwds */
static Thr*  Thr__from_ThrID ( ThrID thrid ); /* fwds */

__attribute__((noreturn))
static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
{
   if (due_to_nThrs) {
      HChar* s =
         "\n"
         "Helgrind: cannot continue, run aborted: too many threads.\n"
         "Sorry.  Helgrind can only handle programs that create\n"
         "%'llu or fewer threads over their entire lifetime.\n"
         "\n";
      VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
   } else {
      HChar* s =
         "\n"
         "Helgrind: cannot continue, run aborted: too many\n"
         "synchronisation events.  Sorry. Helgrind can only handle\n"
         "programs which perform %'llu or fewer\n"
         "inter-thread synchronisation events (locks, unlocks, etc).\n"
         "\n";
      VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
   }
   VG_(exit)(1);
   /*NOTREACHED*/
   tl_assert(0); /*wtf?!*/
}


/* The dead thread (ThrID, actually) table.  A thread may only be
   listed here if we have been notified thereof by libhb_async_exit.
   New entries are added at the end.  The order isn't important, but
   the ThrID values must be unique.  This table lists the identity of
   all threads that have ever died -- none are ever removed.  We keep
   this table so as to be able to prune entries from VTSs.  We don't
   actually need to keep the set of threads that have ever died --
   only the threads that have died since the previous round of
   pruning.  But it's useful for sanity check purposes to keep the
   entire set, so we do. */
static XArray* /* of ThrID */ verydead_thread_table = NULL;

/* Arbitrary total ordering on ThrIDs. */
static Int cmp__ThrID ( void* v1, void* v2 ) {
   ThrID id1 = *(ThrID*)v1;
   ThrID id2 = *(ThrID*)v2;
   if (id1 < id2) return -1;
   if (id1 > id2) return 1;
   return 0;
}

static void verydead_thread_table_init ( void )
{
   tl_assert(!verydead_thread_table);
   verydead_thread_table
     = VG_(newXA)( HG_(zalloc),
                   "libhb.verydead_thread_table_init.1",
                   HG_(free), sizeof(ThrID) );
   tl_assert(verydead_thread_table);
   VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
}


/* A VTS contains .ts, its vector clock, and also .id, a field to hold
   a backlink for the caller's convenience.  Since we have no idea
   what to set that to in the library, it always gets set to
   VtsID_INVALID. */
typedef
   struct {
      VtsID    id;
      UInt     usedTS;
      UInt     sizeTS;
      ScalarTS ts[0];
   }
   VTS;

/* Allocate a VTS capable of storing 'sizeTS' entries. */
static VTS* VTS__new ( HChar* who, UInt sizeTS );

/* Make a clone of 'vts', sizing the new array to exactly match the
   number of ScalarTSs present. */
static VTS* VTS__clone ( HChar* who, VTS* vts );

/* Make a clone of 'vts' with the thrids in 'thrids' removed.  The new
   array is sized exactly to hold the number of required elements.
   'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
   must be in strictly increasing order. */
static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel );

/* Delete this VTS in its entirety. */
static void VTS__delete ( VTS* vts );

/* Create a new singleton VTS in 'out'.  Caller must have
   pre-allocated 'out' sufficiently big to hold the result in all
   possible cases. */
static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );

/* Create in 'out' a VTS which is the same as 'vts' except with
   vts[me]++, so to speak.  Caller must have pre-allocated 'out'
   sufficiently big to hold the result in all possible cases. */
static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );

/* Create in 'out' a VTS which is the join (max) of 'a' and
   'b'. Caller must have pre-allocated 'out' sufficiently big to hold
   the result in all possible cases. */
static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );

/* Compute the partial ordering relation of the two args.  Although we
   could be completely general and return an enumeration value (EQ,
   LT, GT, UN), in fact we only need LEQ, and so we may as well
   hardwire that fact.

   Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
   invald ThrID).  In the latter case, the returned ThrID indicates
   the discovered point for which they are not.  There may be more
   than one such point, but we only care about seeing one of them, not
   all of them.  This rather strange convention is used because
   sometimes we want to know the actual index at which they first
   differ. */
static UInt VTS__cmpLEQ ( VTS* a, VTS* b );

/* Compute an arbitrary structural (total) ordering on the two args,
   based on their VCs, so they can be looked up in a table, tree, etc.
   Returns -1, 0 or 1. */
static Word VTS__cmp_structural ( VTS* a, VTS* b );

/* Debugging only.  Display the given VTS in the buffer. */
static void VTS__show ( HChar* buf, Int nBuf, VTS* vts );

/* Debugging only.  Return vts[index], so to speak. */
static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );

/* Notify the VTS machinery that a thread has been declared
   comprehensively dead: that is, it has done an async exit AND it has
   been joined with.  This should ensure that its local clocks (.viR
   and .viW) will never again change, and so all mentions of this
   thread from all VTSs in the system may be removed. */
static void VTS__declare_thread_very_dead ( Thr* idx );

/*--------------- to do with Vector Timestamps ---------------*/

static Bool is_sane_VTS ( VTS* vts )
{
   UWord     i, n;
   ScalarTS  *st1, *st2;
   if (!vts) return False;
   if (!vts->ts) return False;
   if (vts->usedTS > vts->sizeTS) return False;
   n = vts->usedTS;
   if (n == 1) {
      st1 = &vts->ts[0];
      if (st1->tym == 0)
         return False;
   }
   else
   if (n >= 2) {
      for (i = 0; i < n-1; i++) {
         st1 = &vts->ts[i];
         st2 = &vts->ts[i+1];
         if (st1->thrid >= st2->thrid)
            return False;
         if (st1->tym == 0 || st2->tym == 0)
            return False;
      }
   }
   return True;
}


/* Create a new, empty VTS.
*/
static VTS* VTS__new ( HChar* who, UInt sizeTS )
{
   VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
   tl_assert(vts->usedTS == 0);
   vts->sizeTS = sizeTS;
   *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
   return vts;
}

/* Clone this VTS.
*/
static VTS* VTS__clone ( HChar* who, VTS* vts )
{
   tl_assert(vts);
   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   UInt nTS = vts->usedTS;
   VTS* clone = VTS__new(who, nTS);
   clone->id = vts->id;
   clone->sizeTS = nTS;
   clone->usedTS = nTS;
   UInt i;
   for (i = 0; i < nTS; i++) {
      clone->ts[i] = vts->ts[i];
   }
   tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   return clone;
}


/* Make a clone of a VTS with specified ThrIDs removed.  'thridsToDel'
   must be in strictly increasing order.  We could obviously do this
   much more efficiently (in linear time) if necessary.
*/
static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel )
{
   UInt i, j;
   tl_assert(vts);
   tl_assert(thridsToDel);
   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   UInt nTS = vts->usedTS;
   /* Figure out how many ScalarTSs will remain in the output. */
   UInt nReq = nTS;
   for (i = 0; i < nTS; i++) {
      ThrID thrid = vts->ts[i].thrid;
      if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
         nReq--;
   }
   tl_assert(nReq <= nTS);
   /* Copy the ones that will remain. */
   VTS* res = VTS__new(who, nReq);
   j = 0;
   for (i = 0; i < nTS; i++) {
      ThrID thrid = vts->ts[i].thrid;
      if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
         continue;
      res->ts[j++] = vts->ts[i];
   }
   tl_assert(j == nReq);
   tl_assert(j == res->sizeTS);
   res->usedTS = j;
   tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
   return res;
}


/* Delete this VTS in its entirety.
*/
static void VTS__delete ( VTS* vts )
{
   tl_assert(vts);
   tl_assert(vts->usedTS <= vts->sizeTS);
   tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   HG_(free)(vts);
}


/* Create a new singleton VTS. 
*/
static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
{
   tl_assert(thr);
   tl_assert(tym >= 1);
   tl_assert(out);
   tl_assert(out->usedTS == 0);
   tl_assert(out->sizeTS >= 1);
   UInt hi = out->usedTS++;
   out->ts[hi].thrid = Thr__to_ThrID(thr);
   out->ts[hi].tym   = tym;
}


/* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
   not modified.
*/
static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
{
   UInt      i, n;
   ThrID     me_thrid;
   Bool      found = False;

   stats__vts__tick++;

   tl_assert(out);
   tl_assert(out->usedTS == 0);
   if (vts->usedTS >= ThrID_MAX_VALID)
      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   tl_assert(out->sizeTS >= 1 + vts->usedTS);

   tl_assert(me);
   me_thrid = Thr__to_ThrID(me);
   tl_assert(is_sane_VTS(vts));
   n = vts->usedTS;

   /* Copy all entries which precede 'me'. */
   for (i = 0; i < n; i++) {
      ScalarTS* here = &vts->ts[i];
      if (UNLIKELY(here->thrid >= me_thrid))
         break;
      UInt hi = out->usedTS++;
      out->ts[hi] = *here;
   }

   /* 'i' now indicates the next entry to copy, if any.
       There are 3 possibilities:
       (a) there is no next entry (we used them all up already):
           add (me_thrid,1) to the output, and quit
       (b) there is a next entry, and its thrid > me_thrid:
           add (me_thrid,1) to the output, then copy the remaining entries
       (c) there is a next entry, and its thrid == me_thrid:
           copy it to the output but increment its timestamp value.
           Then copy the remaining entries.  (c) is the common case.
   */
   tl_assert(i >= 0 && i <= n);
   if (i == n) { /* case (a) */
      UInt hi = out->usedTS++;
      out->ts[hi].thrid = me_thrid;
      out->ts[hi].tym   = 1;
   } else {
      /* cases (b) and (c) */
      ScalarTS* here = &vts->ts[i];
      if (me_thrid == here->thrid) { /* case (c) */
         if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
            /* We're hosed.  We have to stop. */
            scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
         }
         UInt hi = out->usedTS++;
         out->ts[hi].thrid = here->thrid;
         out->ts[hi].tym   = here->tym + 1;
         i++;
         found = True;
      } else { /* case (b) */
         UInt hi = out->usedTS++;
         out->ts[hi].thrid = me_thrid;
         out->ts[hi].tym   = 1;
      }
      /* And copy any remaining entries. */
      for (/*keepgoing*/; i < n; i++) {
         ScalarTS* here2 = &vts->ts[i];
         UInt hi = out->usedTS++;
         out->ts[hi] = *here2;
      }
   }

   tl_assert(is_sane_VTS(out));
   tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
   tl_assert(out->usedTS <= out->sizeTS);
}


/* Return a new VTS constructed as the join (max) of the 2 args.
   Neither arg is modified.
*/
static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
{
   UInt     ia, ib, useda, usedb;
   ULong    tyma, tymb, tymMax;
   ThrID    thrid;
   UInt     ncommon = 0;

   stats__vts__join++;

   tl_assert(a);
   tl_assert(b);
   useda = a->usedTS;
   usedb = b->usedTS;

   tl_assert(out);
   tl_assert(out->usedTS == 0);
   /* overly conservative test, but doing better involves comparing
      the two VTSs, which we don't want to do at this point. */
   if (useda + usedb >= ThrID_MAX_VALID)
      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   tl_assert(out->sizeTS >= useda + usedb);

   ia = ib = 0;

   while (1) {

      /* This logic is to enumerate triples (thrid, tyma, tymb) drawn
         from a and b in order, where thrid is the next ThrID
         occurring in either a or b, and tyma/b are the relevant
         scalar timestamps, taking into account implicit zeroes. */
      tl_assert(ia >= 0 && ia <= useda);
      tl_assert(ib >= 0 && ib <= usedb);

      if        (ia == useda && ib == usedb) {
         /* both empty - done */
         break;

      } else if (ia == useda && ib != usedb) {
         /* a empty, use up b */
         ScalarTS* tmpb = &b->ts[ib];
         thrid = tmpb->thrid;
         tyma  = 0;
         tymb  = tmpb->tym;
         ib++;

      } else if (ia != useda && ib == usedb) {
         /* b empty, use up a */
         ScalarTS* tmpa = &a->ts[ia];
         thrid = tmpa->thrid;
         tyma  = tmpa->tym;
         tymb  = 0;
         ia++;

      } else {
         /* both not empty; extract lowest-ThrID'd triple */
         ScalarTS* tmpa = &a->ts[ia];
         ScalarTS* tmpb = &b->ts[ib];
         if (tmpa->thrid < tmpb->thrid) {
            /* a has the lowest unconsidered ThrID */
            thrid = tmpa->thrid;
            tyma  = tmpa->tym;
            tymb  = 0;
            ia++;
         } else if (tmpa->thrid > tmpb->thrid) {
            /* b has the lowest unconsidered ThrID */
            thrid = tmpb->thrid;
            tyma  = 0;
            tymb  = tmpb->tym;
            ib++;
         } else {
            /* they both next mention the same ThrID */
            tl_assert(tmpa->thrid == tmpb->thrid);
            thrid = tmpa->thrid; /* == tmpb->thrid */
            tyma  = tmpa->tym;
            tymb  = tmpb->tym;
            ia++;
            ib++;
            ncommon++;
         }
      }

      /* having laboriously determined (thr, tyma, tymb), do something
         useful with it. */
      tymMax = tyma > tymb ? tyma : tymb;
      if (tymMax > 0) {
         UInt hi = out->usedTS++;
         out->ts[hi].thrid = thrid;
         out->ts[hi].tym   = tymMax;
      }

   }

   tl_assert(is_sane_VTS(out));
   tl_assert(out->usedTS <= out->sizeTS);
   tl_assert(out->usedTS == useda + usedb - ncommon);
}


/* Determine if 'a' <= 'b', in the partial ordering.  Returns zero if
   they are, or the first ThrID for which they are not (no valid ThrID
   has the value zero).  This rather strange convention is used
   because sometimes we want to know the actual index at which they
   first differ. */
static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
{
   Word  ia, ib, useda, usedb;
   ULong tyma, tymb;

   stats__vts__cmpLEQ++;

   tl_assert(a);
   tl_assert(b);
   useda = a->usedTS;
   usedb = b->usedTS;

   ia = ib = 0;

   while (1) {

      /* This logic is to enumerate doubles (tyma, tymb) drawn
         from a and b in order, and tyma/b are the relevant
         scalar timestamps, taking into account implicit zeroes. */
      ThrID thrid;

      tl_assert(ia >= 0 && ia <= useda);
      tl_assert(ib >= 0 && ib <= usedb);

      if        (ia == useda && ib == usedb) {
         /* both empty - done */
         break;

      } else if (ia == useda && ib != usedb) {
         /* a empty, use up b */
         ScalarTS* tmpb = &b->ts[ib];
         tyma  = 0;
         tymb  = tmpb->tym;
         thrid = tmpb->thrid;
         ib++;

      } else if (ia != useda && ib == usedb) {
         /* b empty, use up a */
         ScalarTS* tmpa = &a->ts[ia];
         tyma  = tmpa->tym;
         thrid = tmpa->thrid;
         tymb  = 0;
         ia++;

      } else {
         /* both not empty; extract lowest-ThrID'd triple */
         ScalarTS* tmpa = &a->ts[ia];
         ScalarTS* tmpb = &b->ts[ib];
         if (tmpa->thrid < tmpb->thrid) {
            /* a has the lowest unconsidered ThrID */
            tyma  = tmpa->tym;
            thrid = tmpa->thrid;
            tymb  = 0;
            ia++;
         }
         else
         if (tmpa->thrid > tmpb->thrid) {
            /* b has the lowest unconsidered ThrID */
            tyma  = 0;
            tymb  = tmpb->tym;
            thrid = tmpb->thrid;
            ib++;
         } else {
            /* they both next mention the same ThrID */
            tl_assert(tmpa->thrid == tmpb->thrid);
            tyma  = tmpa->tym;
            thrid = tmpa->thrid;
            tymb  = tmpb->tym;
            ia++;
            ib++;
         }
      }

      /* having laboriously determined (tyma, tymb), do something
         useful with it. */
      if (tyma > tymb) {
         /* not LEQ at this index.  Quit, since the answer is
            determined already. */
         tl_assert(thrid >= 1024);
         return thrid;
      }
   }

   return 0; /* all points are LEQ => return an invalid ThrID */
}


/* Compute an arbitrary structural (total) ordering on the two args,
   based on their VCs, so they can be looked up in a table, tree, etc.
   Returns -1, 0 or 1.  (really just 'deriving Ord' :-) This can be
   performance critical so there is some effort expended to make it sa
   fast as possible.
*/
Word VTS__cmp_structural ( VTS* a, VTS* b )
{
   /* We just need to generate an arbitrary total ordering based on
      a->ts and b->ts.  Preferably do it in a way which comes across likely
      differences relatively quickly. */
   Word     i;
   Word     useda = 0,    usedb = 0;
   ScalarTS *ctsa = NULL, *ctsb = NULL;

   stats__vts__cmp_structural++;

   tl_assert(a);
   tl_assert(b);

   ctsa = &a->ts[0]; useda = a->usedTS;
   ctsb = &b->ts[0]; usedb = b->usedTS;

   if (LIKELY(useda == usedb)) {
      ScalarTS *tmpa = NULL, *tmpb = NULL;
      stats__vts__cmp_structural_slow++;
      /* Same length vectors.  Find the first difference, if any, as
         fast as possible. */
      for (i = 0; i < useda; i++) {
         tmpa = &ctsa[i];
         tmpb = &ctsb[i];
         if (LIKELY(tmpa->tym == tmpb->tym
                    && tmpa->thrid == tmpb->thrid))
            continue;
         else
            break;
      }
      if (UNLIKELY(i == useda)) {
         /* They're identical. */
         return 0;
      } else {
         tl_assert(i >= 0 && i < useda);
         if (tmpa->tym < tmpb->tym) return -1;
         if (tmpa->tym > tmpb->tym) return 1;
         if (tmpa->thrid < tmpb->thrid) return -1;
         if (tmpa->thrid > tmpb->thrid) return 1;
         /* we just established them as non-identical, hence: */
      }
      /*NOTREACHED*/
      tl_assert(0);
   }

   if (useda < usedb) return -1;
   if (useda > usedb) return 1;
   /*NOTREACHED*/
   tl_assert(0);
}


/* Debugging only.  Display the given VTS in the buffer.
*/
void VTS__show ( HChar* buf, Int nBuf, VTS* vts )
{
   ScalarTS* st;
   HChar     unit[64];
   Word      i, n;
   Int       avail = nBuf;
   tl_assert(vts && vts->ts);
   tl_assert(nBuf > 16);
   buf[0] = '[';
   buf[1] = 0;
   n =  vts->usedTS;
   for (i = 0; i < n; i++) {
      tl_assert(avail >= 40);
      st = &vts->ts[i];
      VG_(memset)(unit, 0, sizeof(unit));
      VG_(sprintf)(unit, i < n-1 ? "%u:%llu " : "%u:%llu",
                         st->thrid, (ULong)st->tym);
      if (avail < VG_(strlen)(unit) + 40/*let's say*/) {
         VG_(strcat)(buf, " ...]");
         buf[nBuf-1] = 0;
         return;
      }
      VG_(strcat)(buf, unit);
      avail -= VG_(strlen)(unit);
   }
   VG_(strcat)(buf, "]");
   buf[nBuf-1] = 0;
}


/* Debugging only.  Return vts[index], so to speak.
*/
ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
{
   UWord i, n;
   ThrID idx_thrid = Thr__to_ThrID(idx);
   stats__vts__indexat_slow++;
   tl_assert(vts && vts->ts);
   n = vts->usedTS;
   for (i = 0; i < n; i++) {
      ScalarTS* st = &vts->ts[i];
      if (st->thrid == idx_thrid)
         return st->tym;
   }
   return 0;
}


/* See comment on prototype above.
*/
static void VTS__declare_thread_very_dead ( Thr* thr )
{
   if (0) VG_(printf)("VTQ:  tae %p\n", thr);

   tl_assert(thr->llexit_done);
   tl_assert(thr->joinedwith_done);

   ThrID nyu;
   nyu = Thr__to_ThrID(thr);
   VG_(addToXA)( verydead_thread_table, &nyu );

   /* We can only get here if we're assured that we'll never again
      need to look at this thread's ::viR or ::viW.  Set them to
      VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
      mostly so that we don't wind up pruning them (as that would be
      nonsensical: the only interesting ScalarTS entry for a dead
      thread is its own index, and the pruning will remove that.). */
   VtsID__rcdec(thr->viR);
   VtsID__rcdec(thr->viW);
   thr->viR = VtsID_INVALID;
   thr->viW = VtsID_INVALID;
}


/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION END vts primitives                                  //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION BEGIN main library                                  //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////


/////////////////////////////////////////////////////////
//                                                     //
// VTS set                                             //
//                                                     //
/////////////////////////////////////////////////////////

static WordFM* /* WordFM VTS* void */ vts_set = NULL;

static void vts_set_init ( void )
{
   tl_assert(!vts_set);
   vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
                         HG_(free),
                         (Word(*)(UWord,UWord))VTS__cmp_structural );
   tl_assert(vts_set);
}

/* Given a VTS, look in vts_set to see if we already have a
   structurally identical one.  If yes, return the pair (True, pointer
   to the existing one).  If no, clone this one, add the clone to the
   set, and return (False, pointer to the clone). */
static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
{
   UWord keyW, valW;
   stats__vts_set__focaa++;
   tl_assert(cand->id == VtsID_INVALID);
   /* lookup cand (by value) */
   if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
      /* found it */
      tl_assert(valW == 0);
      /* if this fails, cand (by ref) was already present (!) */
      tl_assert(keyW != (UWord)cand);
      *res = (VTS*)keyW;
      return True;
   } else {
      /* not present.  Clone, add and return address of clone. */
      stats__vts_set__focaa_a++;
      VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
      tl_assert(clone != cand);
      VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
      *res = clone;
      return False;
   }
}


/////////////////////////////////////////////////////////
//                                                     //
// VTS table                                           //
//                                                     //
/////////////////////////////////////////////////////////

static void VtsID__invalidate_caches ( void ); /* fwds */

/* A type to hold VTS table entries.  Invariants:
   If .vts == NULL, then this entry is not in use, so:
   - .rc == 0
   - this entry is on the freelist (unfortunately, does not imply
     any constraints on value for .freelink)
   If .vts != NULL, then this entry is in use:
   - .vts is findable in vts_set
   - .vts->id == this entry number
   - no specific value for .rc (even 0 is OK)
   - this entry is not on freelist, so .freelink == VtsID_INVALID
*/
typedef
   struct {
      VTS*  vts;      /* vts, in vts_set */
      UWord rc;       /* reference count - enough for entire aspace */
      VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
      VtsID remap;    /* used only during pruning */
   }
   VtsTE;

/* The VTS table. */
static XArray* /* of VtsTE */ vts_tab = NULL;

/* An index into the VTS table, indicating the start of the list of
   free (available for use) entries.  If the list is empty, this is
   VtsID_INVALID. */
static VtsID vts_tab_freelist = VtsID_INVALID;

/* Do a GC of vts_tab when the freelist becomes empty AND the size of
   vts_tab equals or exceeds this size.  After GC, the value here is
   set appropriately so as to check for the next GC point. */
static Word vts_next_GC_at = 1000;

static void vts_tab_init ( void )
{
   vts_tab
      = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
                    HG_(free), sizeof(VtsTE) );
   vts_tab_freelist
      = VtsID_INVALID;
   tl_assert(vts_tab);
}

/* Add ii to the free list, checking that it looks out-of-use. */
static void add_to_free_list ( VtsID ii )
{
   VtsTE* ie = VG_(indexXA)( vts_tab, ii );
   tl_assert(ie->vts == NULL);
   tl_assert(ie->rc == 0);
   tl_assert(ie->freelink == VtsID_INVALID);
   ie->freelink = vts_tab_freelist;
   vts_tab_freelist = ii;
}

/* Get an entry from the free list.  This will return VtsID_INVALID if
   the free list is empty. */
static VtsID get_from_free_list ( void )
{
   VtsID  ii;
   VtsTE* ie;
   if (vts_tab_freelist == VtsID_INVALID)
      return VtsID_INVALID;
   ii = vts_tab_freelist;
   ie = VG_(indexXA)( vts_tab, ii );
   tl_assert(ie->vts == NULL);
   tl_assert(ie->rc == 0);
   vts_tab_freelist = ie->freelink;
   return ii;
}

/* Produce a new VtsID that can be used, either by getting it from
   the freelist, or, if that is empty, by expanding vts_tab. */
static VtsID get_new_VtsID ( void )
{
   VtsID ii;
   VtsTE te;
   ii = get_from_free_list();
   if (ii != VtsID_INVALID)
      return ii;
   te.vts = NULL;
   te.rc = 0;
   te.freelink = VtsID_INVALID;
   te.remap    = VtsID_INVALID;
   ii = (VtsID)VG_(addToXA)( vts_tab, &te );
   return ii;
}


/* Indirect callback from lib_zsm. */
static void VtsID__rcinc ( VtsID ii )
{
   VtsTE* ie;
   /* VG_(indexXA) does a range check for us */
   ie = VG_(indexXA)( vts_tab, ii );
   tl_assert(ie->vts); /* else it's not in use */
   tl_assert(ie->rc < ~0UL); /* else we can't continue */
   tl_assert(ie->vts->id == ii);
   ie->rc++;
}

/* Indirect callback from lib_zsm. */
static void VtsID__rcdec ( VtsID ii )
{
   VtsTE* ie;
   /* VG_(indexXA) does a range check for us */
   ie = VG_(indexXA)( vts_tab, ii );
   tl_assert(ie->vts); /* else it's not in use */
   tl_assert(ie->rc > 0); /* else RC snafu */
   tl_assert(ie->vts->id == ii);
   ie->rc--;
}


/* Look up 'cand' in our collection of VTSs.  If present, return the
   VtsID for the pre-existing version.  If not present, clone it, add
   the clone to both vts_tab and vts_set, allocate a fresh VtsID for
   it, and return that. */
static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
{
   VTS* in_tab = NULL;
   tl_assert(cand->id == VtsID_INVALID);
   Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
   tl_assert(in_tab);
   if (already_have) {
      /* We already have a copy of 'cand'.  Use that. */
      VtsTE* ie;
      tl_assert(in_tab->id != VtsID_INVALID);
      ie = VG_(indexXA)( vts_tab, in_tab->id );
      tl_assert(ie->vts == in_tab);
      return in_tab->id;
   } else {
      VtsID  ii = get_new_VtsID();
      VtsTE* ie = VG_(indexXA)( vts_tab, ii );
      ie->vts = in_tab;
      ie->rc = 0;
      ie->freelink = VtsID_INVALID;
      in_tab->id = ii;
      return ii;
   }
}


static void show_vts_stats ( HChar* caller )
{
   UWord nSet, nTab, nLive;
   ULong totrc;
   UWord n, i;
   nSet = VG_(sizeFM)( vts_set );
   nTab = VG_(sizeXA)( vts_tab );
   totrc = 0;
   nLive = 0;
   n = VG_(sizeXA)( vts_tab );
   for (i = 0; i < n; i++) {
      VtsTE* ie = VG_(indexXA)( vts_tab, i );
      if (ie->vts) {
         nLive++;
         totrc += (ULong)ie->rc;
      } else {
         tl_assert(ie->rc == 0);
      }
   }
   VG_(printf)("  show_vts_stats %s\n", caller);
   VG_(printf)("    vts_tab size %4lu\n", nTab);
   VG_(printf)("    vts_tab live %4lu\n", nLive);
   VG_(printf)("    vts_set size %4lu\n", nSet);
   VG_(printf)("        total rc %4llu\n", totrc);
}


/* --- Helpers for VtsID pruning --- */

static
void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
                   /*MOD*/XArray* /* of VtsTE */ new_tab,
                   VtsID* ii )
{
   VtsTE *old_te, *new_te;
   VtsID old_id, new_id;
   /* We're relying here on VG_(indexXA)'s range checking to assert on
      any stupid values, in particular *ii == VtsID_INVALID. */
   old_id = *ii;
   old_te = VG_(indexXA)( old_tab, old_id );
   old_te->rc--;
   new_id = old_te->remap;
   new_te = VG_(indexXA)( new_tab, new_id );
   new_te->rc++;
   *ii = new_id;
}

static
void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
                            /*MOD*/XArray* /* of VtsTE */ new_tab,
                            SVal* s )
{
   SVal old_sv, new_sv;
   old_sv = *s;
   if (SVal__isC(old_sv)) {
      VtsID rMin, wMin;
      rMin = SVal__unC_Rmin(old_sv);
      wMin = SVal__unC_Wmin(old_sv);
      remap_VtsID( old_tab, new_tab, &rMin );
      remap_VtsID( old_tab, new_tab, &wMin );
      new_sv = SVal__mkC( rMin, wMin );
      *s = new_sv;
  }
}


/* NOT TO BE CALLED FROM WITHIN libzsm. */
__attribute__((noinline))
static void vts_tab__do_GC ( Bool show_stats )
{
   UWord i, nTab, nLive, nFreed;

   /* ---------- BEGIN VTS GC ---------- */
   /* check this is actually necessary. */
   tl_assert(vts_tab_freelist == VtsID_INVALID);

   /* empty the caches for partial order checks and binary joins.  We
      could do better and prune out the entries to be deleted, but it
      ain't worth the hassle. */
   VtsID__invalidate_caches();

   /* First, make the reference counts up to date. */
   zsm_flush_cache();

   nTab = VG_(sizeXA)( vts_tab );

   if (show_stats) {
      VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
      show_vts_stats("before GC");
   }

   /* Now we can inspect the entire vts_tab.  Any entries with zero
      .rc fields are now no longer in use and can be put back on the
      free list, removed from vts_set, and deleted. */
   nFreed = 0;
   for (i = 0; i < nTab; i++) {
      Bool present;
      UWord oldK = 0, oldV = 12345;
      VtsTE* te = VG_(indexXA)( vts_tab, i );
      if (te->vts == NULL) {
         tl_assert(te->rc == 0);
         continue; /* already on the free list (presumably) */
      }
      if (te->rc > 0)
         continue; /* in use */
      /* Ok, we got one we can free. */
      tl_assert(te->vts->id == i);
      /* first, remove it from vts_set. */
      present = VG_(delFromFM)( vts_set,
                                &oldK, &oldV, (UWord)te->vts );
      tl_assert(present); /* else it isn't in vts_set ?! */
      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
      tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
      /* now free the VTS itself */
      VTS__delete(te->vts);
      te->vts = NULL;
      /* and finally put this entry on the free list */
      tl_assert(te->freelink == VtsID_INVALID); /* can't already be on it */
      add_to_free_list( i );
      nFreed++;
   }

   /* Now figure out when the next GC should be.  We'll allow the
      number of VTSs to double before GCing again.  Except of course
      that since we can't (or, at least, don't) shrink vts_tab, we
      can't set the threshhold value smaller than it. */
   tl_assert(nFreed <= nTab);
   nLive = nTab - nFreed;
   tl_assert(nLive >= 0 && nLive <= nTab);
   vts_next_GC_at = 2 * nLive;
   if (vts_next_GC_at < nTab)
      vts_next_GC_at = nTab;

   if (show_stats) {
      show_vts_stats("after GC");
      VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
   }

   if (VG_(clo_stats)) {
      static UInt ctr = 1;
      tl_assert(nTab > 0);
      VG_(message)(Vg_DebugMsg,
                  "libhb: VTS GC: #%u  old size %lu  live %lu  (%2llu%%)\n",
                  ctr++, nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
   }
   /* ---------- END VTS GC ---------- */

   /* Decide whether to do VTS pruning.  We have one of three
      settings. */
   static UInt pruning_auto_ctr = 0; /* do not make non-static */

   Bool do_pruning = False;
   switch (HG_(clo_vts_pruning)) {
      case 0: /* never */
         break;
      case 1: /* auto */
         do_pruning = (++pruning_auto_ctr % 5) == 0;
         break;
      case 2: /* always */
         do_pruning = True;
         break;
      default:
         tl_assert(0);
   }

   /* The rest of this routine only handles pruning, so we can
      quit at this point if it is not to be done. */
   if (!do_pruning)
      return;

   /* ---------- BEGIN VTS PRUNING ---------- */
   /* We begin by sorting the backing table on its .thr values, so as
      to (1) check they are unique [else something has gone wrong,
      since it means we must have seen some Thr* exiting more than
      once, which can't happen], and (2) so that we can quickly look
      up the dead-thread entries as we work through the VTSs. */
   VG_(sortXA)( verydead_thread_table );
   /* Sanity check: check for unique .sts.thr values. */
   UWord nBT = VG_(sizeXA)( verydead_thread_table );
   if (nBT > 0) {
      ThrID thrid1, thrid2;
      thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, 0 );
      for (i = 1; i < nBT; i++) {
         thrid1 = thrid2;
         thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, i );
         tl_assert(thrid1 < thrid2);
      }
   }
   /* Ok, so the dead thread table has unique and in-order keys. */

   /* We will run through the old table, and create a new table and
      set, at the same time setting the .remap entries in the old
      table to point to the new entries.  Then, visit every VtsID in
      the system, and replace all of them with new ones, using the
      .remap entries in the old table.  Finally, we can delete the old
      table and set. */

   XArray* /* of VtsTE */ new_tab
      = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
                    HG_(free), sizeof(VtsTE) );

   /* WordFM VTS* void */
   WordFM* new_set
      = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
                    HG_(free),
                    (Word(*)(UWord,UWord))VTS__cmp_structural );

   /* Visit each old VTS.  For each one:

      * make a pruned version

      * search new_set for the pruned version, yielding either
        Nothing (not present) or the new VtsID for it.

      * if not present, allocate a new VtsID for it, insert (pruned
        VTS, new VtsID) in the tree, and set
        remap_table[old VtsID] = new VtsID.

      * if present, set remap_table[old VtsID] = new VtsID, where
        new VtsID was determined by the tree lookup.  Then free up
        the clone.
   */

   UWord nBeforePruning = 0, nAfterPruning = 0;
   UWord nSTSsBefore = 0, nSTSsAfter = 0;
   VtsID new_VtsID_ctr = 0;

   for (i = 0; i < nTab; i++) {

      /* For each old VTS .. */
      VtsTE* old_te  = VG_(indexXA)( vts_tab, i );
      VTS*   old_vts = old_te->vts;
      tl_assert(old_te->remap == VtsID_INVALID);

      /* Skip it if not in use */
      if (old_te->rc == 0) {
         tl_assert(old_vts == NULL);
         continue;
      }
      tl_assert(old_vts != NULL);
      tl_assert(old_vts->id == i);
      tl_assert(old_vts->ts != NULL);

      /* It is in use. Make a pruned version. */
      nBeforePruning++;
      nSTSsBefore += old_vts->usedTS;
      VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
                                   old_vts, verydead_thread_table);
      tl_assert(new_vts->sizeTS == new_vts->usedTS);
      tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
                == 0x0ddC0ffeeBadF00dULL);

      /* Get rid of the old VTS and the tree entry.  It's a bit more
         complex to incrementally delete the VTSs now than to nuke
         them all after we're done, but the upside is that we don't
         wind up temporarily storing potentially two complete copies
         of each VTS and hence spiking memory use. */
      UWord oldK = 0, oldV = 12345;
      Bool  present = VG_(delFromFM)( vts_set,
                                      &oldK, &oldV, (UWord)old_vts );
      tl_assert(present); /* else it isn't in vts_set ?! */
      tl_assert(oldV == 0); /* no info stored in vts_set val fields */
      tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
      /* now free the VTS itself */
      VTS__delete(old_vts);
      old_te->vts = NULL;
      old_vts = NULL;

      /* NO MENTIONS of old_vts allowed beyond this point. */

      /* Ok, we have the pruned copy in new_vts.  See if a
         structurally identical version is already present in new_set.
         If so, delete the one we just made and move on; if not, add
         it. */
      VTS*  identical_version = NULL;
      UWord valW = 12345;
      if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
                        (UWord)new_vts)) {
         // already have it
         tl_assert(valW == 0);
         tl_assert(identical_version != NULL);
         tl_assert(identical_version != new_vts);
         VTS__delete(new_vts);
         new_vts = identical_version;
         tl_assert(new_vts->id != VtsID_INVALID);
      } else {
         tl_assert(valW == 12345);
         tl_assert(identical_version == NULL);
         new_vts->id = new_VtsID_ctr++;
         Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
         tl_assert(!b);
         VtsTE new_te;
         new_te.vts      = new_vts;
         new_te.rc       = 0;
         new_te.freelink = VtsID_INVALID;
         new_te.remap    = VtsID_INVALID;
         Word j = VG_(addToXA)( new_tab, &new_te );
         tl_assert(j <= i);
         tl_assert(j == new_VtsID_ctr - 1);
         // stats
         nAfterPruning++;
         nSTSsAfter += new_vts->usedTS;
      }
      old_te->remap = new_vts->id;

   } /* for (i = 0; i < nTab; i++) */

   /* At this point, we have:
      * the old VTS table, with its .remap entries set,
        and with all .vts == NULL.
      * the old VTS tree should be empty, since it and the old VTSs
        it contained have been incrementally deleted was we worked
        through the old table.
      * the new VTS table, with all .rc == 0, all .freelink and .remap
        == VtsID_INVALID. 
      * the new VTS tree.
   */
   tl_assert( VG_(sizeFM)(vts_set) == 0 );

   /* Now actually apply the mapping. */
   /* Visit all the VtsIDs in the entire system.  Where do we expect
      to find them?
      (a) in shadow memory -- the LineZs and LineFs
      (b) in our collection of struct _Thrs.
      (c) in our collection of struct _SOs.
      Nowhere else, AFAICS.  Not in the zsm cache, because that just
      got invalidated.

      Using the .remap fields in vts_tab, map each old VtsID to a new
      VtsID.  For each old VtsID, dec its rc; and for each new one,
      inc it.  This sets up the new refcounts, and it also gives a
      cheap sanity check of the old ones: all old refcounts should be
      zero after this operation.
   */

   /* Do the mappings for (a) above: iterate over the Primary shadow
      mem map (WordFM Addr SecMap*). */
   UWord secmapW = 0;
   VG_(initIterFM)( map_shmem );
   while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
      UWord   j;
      SecMap* sm = (SecMap*)secmapW;
      tl_assert(sm->magic == SecMap_MAGIC);
      /* Deal with the LineZs */
      for (i = 0; i < N_SECMAP_ZLINES; i++) {
         LineZ* lineZ = &sm->linesZ[i];
         if (lineZ->dict[0] == SVal_INVALID)
            continue; /* not in use -- data is in F rep instead */
         for (j = 0; j < 4; j++)
            remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
      }
      /* Deal with the LineFs */
      for (i = 0; i < sm->linesF_size; i++) {
         LineF* lineF = &sm->linesF[i];
         if (!lineF->inUse)
            continue;
         for (j = 0; j < N_LINE_ARANGE; j++)
            remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
      }
   }
   VG_(doneIterFM)( map_shmem );

   /* Do the mappings for (b) above: visit our collection of struct
      _Thrs. */
   Thread* hgthread = get_admin_threads();
   tl_assert(hgthread);
   while (hgthread) {
      Thr* hbthr = hgthread->hbthr;
      tl_assert(hbthr);
      /* Threads that are listed in the prunable set have their viR
         and viW set to VtsID_INVALID, so we can't mess with them. */
      if (hbthr->llexit_done && hbthr->joinedwith_done) {
         tl_assert(hbthr->viR == VtsID_INVALID);
         tl_assert(hbthr->viW == VtsID_INVALID);
         hgthread = hgthread->admin;
         continue;
      }
      remap_VtsID( vts_tab, new_tab, &hbthr->viR );
      remap_VtsID( vts_tab, new_tab, &hbthr->viW );
      hgthread = hgthread->admin;
   }

   /* Do the mappings for (c) above: visit the struct _SOs. */
   SO* so = admin_SO;
   while (so) {
      if (so->viR != VtsID_INVALID)
         remap_VtsID( vts_tab, new_tab, &so->viR );
      if (so->viW != VtsID_INVALID)
         remap_VtsID( vts_tab, new_tab, &so->viW );
      so = so->admin_next;
   }

   /* So, we're nearly done (with this incredibly complex operation).
      Check the refcounts for the old VtsIDs all fell to zero, as
      expected.  Any failure is serious. */
   for (i = 0; i < nTab; i++) {
      VtsTE* te = VG_(indexXA)( vts_tab, i );
      tl_assert(te->vts == NULL);
      /* This is the assert proper.  Note we're also asserting
         zeroness for old entries which are unmapped (hence have
         .remap == VtsID_INVALID).  That's OK. */
      tl_assert(te->rc == 0);
   }

   /* Install the new table and set. */
   VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
   vts_set = new_set;
   VG_(deleteXA)( vts_tab );
   vts_tab = new_tab;

   /* The freelist of vts_tab entries is empty now, because we've
      compacted all of the live entries at the low end of the
      table. */
   vts_tab_freelist = VtsID_INVALID;

   /* Sanity check vts_set and vts_tab. */

   /* Because all the live entries got slid down to the bottom of vts_tab: */
   tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));

   /* Assert that the vts_tab and vts_set entries point at each other
      in the required way */
   UWord wordK = 0, wordV = 0;
   VG_(initIterFM)( vts_set );
   while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
      tl_assert(wordK != 0);
      tl_assert(wordV == 0);
      VTS* vts = (VTS*)wordK;
      tl_assert(vts->id != VtsID_INVALID);
      VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
      tl_assert(te->vts == vts);
   }
   VG_(doneIterFM)( vts_set );

   /* Also iterate over the table, and check each entry is
      plausible. */
   nTab = VG_(sizeXA)( vts_tab );
   for (i = 0; i < nTab; i++) {
      VtsTE* te = VG_(indexXA)( vts_tab, i );
      tl_assert(te->vts);
      tl_assert(te->vts->id == i);
      tl_assert(te->rc > 0); /* 'cos we just GC'd */
      tl_assert(te->freelink == VtsID_INVALID); /* in use */
      tl_assert(te->remap == VtsID_INVALID); /* not relevant */
   }

   /* And we're done.  Bwahahaha. Ha. Ha. Ha. */
   if (VG_(clo_stats)) {
      static UInt ctr = 1;
      tl_assert(nTab > 0);
      VG_(message)(
         Vg_DebugMsg,
         "libhb: VTS PR: #%u  before %lu (avg sz %lu)  "
            "after %lu (avg sz %lu)\n",
         ctr++,
         nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
         nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
      );
   }
   if (0)
   VG_(printf)("VTQ: before pruning %lu (avg sz %lu), "
               "after pruning %lu (avg sz %lu)\n",
               nBeforePruning, nSTSsBefore / nBeforePruning,
               nAfterPruning, nSTSsAfter / nAfterPruning);
   /* ---------- END VTS PRUNING ---------- */
}


/////////////////////////////////////////////////////////
//                                                     //
// Vts IDs                                             //
//                                                     //
/////////////////////////////////////////////////////////

//////////////////////////
/* A temporary, max-sized VTS which is used as a temporary (the first
   argument) in VTS__singleton, VTS__tick and VTS__join operations. */
static VTS* temp_max_sized_VTS = NULL;

//////////////////////////
static ULong stats__cmpLEQ_queries = 0;
static ULong stats__cmpLEQ_misses  = 0;
static ULong stats__join2_queries  = 0;
static ULong stats__join2_misses   = 0;

static inline UInt ROL32 ( UInt w, Int n ) {
   w = (w << n) | (w >> (32-n));
   return w;
}
static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
   UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
   return hash % nTab;
}

#define N_CMPLEQ_CACHE 1023
static
   struct { VtsID vi1; VtsID vi2; Bool leq; }
   cmpLEQ_cache[N_CMPLEQ_CACHE];

#define N_JOIN2_CACHE 1023
static
   struct { VtsID vi1; VtsID vi2; VtsID res; }
   join2_cache[N_JOIN2_CACHE];

static void VtsID__invalidate_caches ( void ) {
   Int i;
   for (i = 0; i < N_CMPLEQ_CACHE; i++) {
      cmpLEQ_cache[i].vi1 = VtsID_INVALID;
      cmpLEQ_cache[i].vi2 = VtsID_INVALID;
      cmpLEQ_cache[i].leq = False;
   }
   for (i = 0; i < N_JOIN2_CACHE; i++) {
     join2_cache[i].vi1 = VtsID_INVALID;
     join2_cache[i].vi2 = VtsID_INVALID;
     join2_cache[i].res = VtsID_INVALID;
   }
}
//////////////////////////

//static Bool VtsID__is_valid ( VtsID vi ) {
//   VtsTE* ve;
//   if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
//      return False;
//   ve = VG_(indexXA)( vts_tab, vi );
//   if (!ve->vts)
//      return False;
//   tl_assert(ve->vts->id == vi);
//   return True;
//}

static VTS* VtsID__to_VTS ( VtsID vi ) {
   VtsTE* te = VG_(indexXA)( vts_tab, vi );
   tl_assert(te->vts);
   return te->vts;
}

static void VtsID__pp ( VtsID vi ) {
   HChar buf[100];
   VTS* vts = VtsID__to_VTS(vi);
   VTS__show( buf, sizeof(buf)-1, vts );
   buf[sizeof(buf)-1] = 0;
   VG_(printf)("%s", buf);
}

/* compute partial ordering relation of vi1 and vi2. */
__attribute__((noinline))
static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
   UInt hash;
   Bool leq;
   VTS  *v1, *v2;
   //if (vi1 == vi2) return True;
   tl_assert(vi1 != vi2);
   ////++
   stats__cmpLEQ_queries++;
   hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
   if (cmpLEQ_cache[hash].vi1 == vi1
       && cmpLEQ_cache[hash].vi2 == vi2)
      return cmpLEQ_cache[hash].leq;
   stats__cmpLEQ_misses++;
   ////--
   v1  = VtsID__to_VTS(vi1);
   v2  = VtsID__to_VTS(vi2);
   leq = VTS__cmpLEQ( v1, v2 ) == 0;
   ////++
   cmpLEQ_cache[hash].vi1 = vi1;
   cmpLEQ_cache[hash].vi2 = vi2;
   cmpLEQ_cache[hash].leq = leq;
   ////--
   return leq;
}
static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
   return LIKELY(vi1 == vi2)  ? True  : VtsID__cmpLEQ_WRK(vi1, vi2);
}

/* compute binary join */
__attribute__((noinline))
static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
   UInt  hash;
   VtsID res;
   VTS   *vts1, *vts2;
   //if (vi1 == vi2) return vi1;
   tl_assert(vi1 != vi2);
   ////++
   stats__join2_queries++;
   hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
   if (join2_cache[hash].vi1 == vi1
       && join2_cache[hash].vi2 == vi2)
      return join2_cache[hash].res;
   stats__join2_misses++;
   ////--
   vts1 = VtsID__to_VTS(vi1);
   vts2 = VtsID__to_VTS(vi2);
   temp_max_sized_VTS->usedTS = 0;
   VTS__join(temp_max_sized_VTS, vts1,vts2);
   res = vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
   ////++
   join2_cache[hash].vi1 = vi1;
   join2_cache[hash].vi2 = vi2;
   join2_cache[hash].res = res;
   ////--
   return res;
}
static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
   return LIKELY(vi1 == vi2)  ? vi1  : VtsID__join2_WRK(vi1, vi2);
}

/* create a singleton VTS, namely [thr:1] */
static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
   temp_max_sized_VTS->usedTS = 0;
   VTS__singleton(temp_max_sized_VTS, thr,tym);
   return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
}

/* tick operation, creates value 1 if specified index is absent */
static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
   VTS* vts = VtsID__to_VTS(vi);
   temp_max_sized_VTS->usedTS = 0;
   VTS__tick(temp_max_sized_VTS, idx,vts);
   return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
}

/* index into a VTS (only for assertions) */
static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
   VTS* vts = VtsID__to_VTS(vi);
   return VTS__indexAt_SLOW( vts, idx );
}

/* Assuming that !cmpLEQ(vi1, vi2), find the index of the first (or
   any, really) element in vi1 which is pointwise greater-than the
   corresponding element in vi2.  If no such element exists, return
   NULL.  This needs to be fairly quick since it is called every time
   a race is detected. */
static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
{
   VTS  *vts1, *vts2;
   Thr*  diffthr;
   ThrID diffthrid;
   tl_assert(vi1 != vi2);
   vts1 = VtsID__to_VTS(vi1);
   vts2 = VtsID__to_VTS(vi2);
   tl_assert(vts1 != vts2);
   diffthrid = VTS__cmpLEQ(vts1, vts2);
   diffthr = Thr__from_ThrID(diffthrid);
   tl_assert(diffthr); /* else they are LEQ ! */
   return diffthr;
}


/////////////////////////////////////////////////////////
//                                                     //
// Filters                                             //
//                                                     //
/////////////////////////////////////////////////////////

/* Forget everything we know -- clear the filter and let everything
   through.  This needs to be as fast as possible, since it is called
   every time the running thread changes, and every time a thread's
   vector clocks change, which can be quite frequent.  The obvious
   fast way to do this is simply to stuff in tags which we know are
   not going to match anything, since they're not aligned to the start
   of a line. */
static void Filter__clear ( Filter* fi, HChar* who )
{
   UWord i;
   if (0) VG_(printf)("  Filter__clear(%p, %s)\n", fi, who);
   for (i = 0; i < FI_NUM_LINES; i += 8) {
      fi->tags[i+0] = 1; /* impossible value -- cannot match */
      fi->tags[i+1] = 1;
      fi->tags[i+2] = 1;
      fi->tags[i+3] = 1;
      fi->tags[i+4] = 1;
      fi->tags[i+5] = 1;
      fi->tags[i+6] = 1;
      fi->tags[i+7] = 1;
   }
   tl_assert(i == FI_NUM_LINES);
}

/* Clearing an arbitrary range in the filter.  Unfortunately
   we have to do this due to core-supplied new/die-mem events. */

static void Filter__clear_1byte ( Filter* fi, Addr a )
{
   Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   FiLine* line   = &fi->lines[lineno];
   UWord   loff   = (a - atag) / 8;
   UShort  mask   = 0x3 << (2 * (a & 7));
   /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
   if (LIKELY( fi->tags[lineno] == atag )) {
      /* hit.  clear the bits. */
      UShort  u16  = line->u16s[loff];
      line->u16s[loff] = u16 & ~mask; /* clear them */
   } else {
      /* miss.  The filter doesn't hold this address, so ignore. */
   }
}

static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
{
   Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   FiLine* line   = &fi->lines[lineno];
   UWord   loff   = (a - atag) / 8;
   if (LIKELY( fi->tags[lineno] == atag )) {
      line->u16s[loff] = 0;
   } else {
    /* miss.  The filter doesn't hold this address, so ignore. */
   }
}

static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
{
  //VG_(printf)("%lu ", len);
   /* slowly do part preceding 8-alignment */
   while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
      Filter__clear_1byte( fi, a );
      a++;
      len--;
   }
   /* vector loop */
   while (len >= 8) {
      Filter__clear_8bytes_aligned( fi, a );
      a += 8;
      len -= 8;
   }
   /* slowly do tail */
   while (UNLIKELY(len > 0)) {
      Filter__clear_1byte( fi, a );
      a++;
      len--;
   }
}


/* ------ Read handlers for the filter. ------ */

static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
      return False;
   { 
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xAAAA;
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort u16  = line->u16s[loff];
        Bool   ok   = (u16 & mask) == mask; /* all R bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
      return False;
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 4 x R bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
      return False;
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xA << (2 * (a & 6));
     /* mask is A000, 0A00, 00A0 or 000A */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 2 x R bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
{
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0x2 << (2 * (a & 7));
     /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}


/* ------ Write handlers for the filter. ------ */

static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
      return False;
   { 
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xFFFF;
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort u16  = line->u16s[loff];
        Bool   ok   = (u16 & mask) == mask; /* all R & W bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
      return False;
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 4 x R & W bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
{
   if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
      return False;
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0xF << (2 * (a & 6));
     /* mask is F000, 0F00, 00F0 or 000F */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 2 x R & W bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}

static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
{
   {
     Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
     UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
     FiLine* line   = &fi->lines[lineno];
     UWord   loff   = (a - atag) / 8;
     UShort  mask   = 0x3 << (2 * (a & 7));
     /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
     if (LIKELY( fi->tags[lineno] == atag )) {
        /* hit.  check line and update. */
        UShort  u16  = line->u16s[loff];
        Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
        line->u16s[loff] = u16 | mask; /* set them */
        return ok;
     } else {
        /* miss.  nuke existing line and re-use it. */
        UWord   i;
        fi->tags[lineno] = atag;
        for (i = 0; i < FI_LINE_SZB / 8; i++)
           line->u16s[i] = 0;
        line->u16s[loff] = mask;
        return False;
     }
   }
}


/////////////////////////////////////////////////////////
//                                                     //
// Threads                                             //
//                                                     //
/////////////////////////////////////////////////////////

/* Maps ThrID values to their Thr*s (which contain ThrID values that
   should point back to the relevant slot in the array.  Lowest
   numbered slot (0) is for thrid = 1024, (1) is for 1025, etc. */
static XArray* /* of Thr* */ thrid_to_thr_map = NULL;

/* And a counter to dole out ThrID values.  For rationale/background,
   see comments on definition of ScalarTS (far) above. */
static ThrID thrid_counter = 1024; /* runs up to ThrID_MAX_VALID */

static ThrID Thr__to_ThrID ( Thr* thr ) {
   return thr->thrid;
}
static Thr* Thr__from_ThrID ( UInt thrid ) {
   Thr* thr = *(Thr**)VG_(indexXA)( thrid_to_thr_map, thrid - 1024 );
   tl_assert(thr->thrid == thrid);
   return thr;
}

static Thr* Thr__new ( void )
{
   Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
   thr->viR = VtsID_INVALID;
   thr->viW = VtsID_INVALID;
   thr->llexit_done = False;
   thr->joinedwith_done = False;
   thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
   /* We only really need this at history level 1, but unfortunately
      this routine is called before the command line processing is
      done (sigh), so we can't rely on HG_(clo_history_level) at this
      point.  Hence always allocate it.  Bah. */
   thr->local_Kws_n_stacks
      = VG_(newXA)( HG_(zalloc),
                    "libhb.Thr__new.3 (local_Kws_and_stacks)",
                    HG_(free), sizeof(ULong_n_EC) );

   /* Add this Thr* <-> ThrID binding to the mapping, and
      cross-check */
   if (!thrid_to_thr_map) {
      thrid_to_thr_map = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.4",
                                     HG_(free), sizeof(Thr*) );
      tl_assert(thrid_to_thr_map);
   }

   if (thrid_counter >= ThrID_MAX_VALID) {
      /* We're hosed.  We have to stop. */
      scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   }

   thr->thrid = thrid_counter++;
   Word ix = VG_(addToXA)( thrid_to_thr_map, &thr );
   tl_assert(ix + 1024 == thr->thrid);

   return thr;
}

static void note_local_Kw_n_stack_for ( Thr* thr )
{
   Word       nPresent;
   ULong_n_EC pair;
   tl_assert(thr);

   // We only collect this info at history level 1 (approx)
   if (HG_(clo_history_level) != 1) 
      return;

   /* This is the scalar Kw for thr. */
   pair.ull = VtsID__indexAt( thr->viW, thr );
   pair.ec  = main_get_EC( thr );
   tl_assert(pair.ec);
   tl_assert(thr->local_Kws_n_stacks);

   /* check that we're not adding duplicates */
   nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );

   /* Throw away old stacks, if necessary.  We can't accumulate stuff
      indefinitely. */
   if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
      VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
      nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
      if (0)
         VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p (!!! gc !!!)\n",
                     thr, pair.ull, pair.ec );
   }

   if (nPresent > 0) {
      ULong_n_EC* prevPair
         = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
      tl_assert( prevPair->ull <= pair.ull );
   }

   if (nPresent == 0)
      pair.ec = NULL;

   VG_(addToXA)( thr->local_Kws_n_stacks, &pair );

   if (0)
      VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p\n",
                  thr, pair.ull, pair.ec );
   if (0)
      VG_(pp_ExeContext)(pair.ec);
}

static Int cmp__ULong_n_EC__by_ULong ( ULong_n_EC* pair1, ULong_n_EC* pair2 )
{
   if (pair1->ull < pair2->ull) return -1;
   if (pair1->ull > pair2->ull) return 1;
   return 0;
}


/////////////////////////////////////////////////////////
//                                                     //
// Shadow Values                                       //
//                                                     //
/////////////////////////////////////////////////////////

// type SVal, SVal_INVALID and SVal_NOACCESS are defined by
// hb_zsm.h.  We have to do everything else here.

/* SVal is 64 bit unsigned int.

      <---------30--------->    <---------30--------->
   00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X   C(Rmin,Wmin)
   10 X--------------------X XX X--------------------X   A: SVal_NOACCESS
   11 0--------------------0 00 0--------------------0   A: SVal_INVALID

*/
#define SVAL_TAGMASK (3ULL << 62)

static inline Bool SVal__isC ( SVal s ) {
   return (0ULL << 62) == (s & SVAL_TAGMASK);
}
static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
   //tl_assert(VtsID__is_valid(rmini));
   //tl_assert(VtsID__is_valid(wmini));
   return (((ULong)rmini) << 32) | ((ULong)wmini);
}
static inline VtsID SVal__unC_Rmin ( SVal s ) {
   tl_assert(SVal__isC(s));
   return (VtsID)(s >> 32);
}
static inline VtsID SVal__unC_Wmin ( SVal s ) {
   tl_assert(SVal__isC(s));
   return (VtsID)(s & 0xFFFFFFFFULL);
}

static inline Bool SVal__isA ( SVal s ) {
   return (2ULL << 62) == (s & SVAL_TAGMASK);
}
static inline SVal SVal__mkA ( void ) {
   return 2ULL << 62;
}

/* Direct callback from lib_zsm. */
static void SVal__rcinc ( SVal s ) {
   if (SVal__isC(s)) {
      VtsID__rcinc( SVal__unC_Rmin(s) );
      VtsID__rcinc( SVal__unC_Wmin(s) );
   }
}

/* Direct callback from lib_zsm. */
static void SVal__rcdec ( SVal s ) {
   if (SVal__isC(s)) {
      VtsID__rcdec( SVal__unC_Rmin(s) );
      VtsID__rcdec( SVal__unC_Wmin(s) );
   }
}


/////////////////////////////////////////////////////////
//                                                     //
// A simple group (memory) allocator                   //
//                                                     //
/////////////////////////////////////////////////////////

//////////////// BEGIN general group allocator
typedef
   struct {
      UWord   elemSzB;        /* element size */
      UWord   nPerGroup;      /* # elems per group */
      void*   (*alloc)(HChar*, SizeT); /* group allocator */
      HChar*  cc; /* group allocator's cc */
      void    (*free)(void*); /* group allocator's free-er (unused) */
      /* XArray of void* (pointers to groups).  The groups themselves.
         Each element is a pointer to a block of size (elemSzB *
         nPerGroup) bytes. */
      XArray* groups;
      /* next free element.  Is a pointer to an element in one of the
         groups pointed to by .groups. */
      void* nextFree;
   }
   GroupAlloc;

static void init_GroupAlloc ( /*MOD*/GroupAlloc* ga,
                              UWord  elemSzB,
                              UWord  nPerGroup,
                              void*  (*alloc)(HChar*, SizeT),
                              HChar* cc,
                              void   (*free)(void*) )
{
   tl_assert(0 == (elemSzB % sizeof(UWord)));
   tl_assert(elemSzB >= sizeof(UWord));
   tl_assert(nPerGroup >= 100); /* let's say */
   tl_assert(alloc);
   tl_assert(cc);
   tl_assert(free);
   tl_assert(ga);
   VG_(memset)(ga, 0, sizeof(*ga));
   ga->elemSzB   = elemSzB;
   ga->nPerGroup = nPerGroup;
   ga->groups    = NULL;
   ga->alloc     = alloc;
   ga->cc        = cc;
   ga->free      = free;
   ga->groups    = VG_(newXA)( alloc, cc, free, sizeof(void*) );
   ga->nextFree  = NULL;
   tl_assert(ga->groups);
}

/* The freelist is empty.  Allocate a new group and put all the new
   elements in it onto the freelist. */
__attribute__((noinline))
static void gal_add_new_group ( GroupAlloc* ga ) 
{
   Word   i;
   UWord* group;
   tl_assert(ga);
   tl_assert(ga->nextFree == NULL);
   group = ga->alloc( ga->cc, ga->elemSzB * ga->nPerGroup );
   tl_assert(group);
   /* extend the freelist through the new group.  Place the freelist
      pointer in the first word of each element.  That's why the
      element size must be at least one word. */
   for (i = ga->nPerGroup-1; i >= 0; i--) {
      UChar* elemC = ((UChar*)group) + i * ga->elemSzB;
      UWord* elem  = (UWord*)elemC;
      tl_assert(0 == (((UWord)elem) % sizeof(UWord)));
      *elem = (UWord)ga->nextFree;
      ga->nextFree = elem;
   }
   /* and add to our collection of groups */
   VG_(addToXA)( ga->groups, &group );
}

inline static void* gal_Alloc ( GroupAlloc* ga )
{
   UWord* elem;
   if (UNLIKELY(ga->nextFree == NULL)) {
      gal_add_new_group(ga);
   }
   elem = ga->nextFree;
   ga->nextFree = (void*)*elem;
   *elem = 0; /* unnecessary, but just to be on the safe side */
   return elem;
}

inline static void* gal_Alloc_w_size_check ( GroupAlloc* ga, SizeT n )
{
   tl_assert(n == ga->elemSzB);
   return gal_Alloc( ga );
}

inline static void gal_Free ( GroupAlloc* ga, void* p )
{
   UWord* elem = (UWord*)p;
   *elem = (UWord)ga->nextFree;
   ga->nextFree = elem;
}
//////////////// END general group allocator


/////////////////////////////////////////////////////////
//                                                     //
// Change-event map2                                   //
//                                                     //
/////////////////////////////////////////////////////////

#define EVENT_MAP_GC_DISCARD_FRACTION  0.5

/* This is in two parts:

   1. A hash table of RCECs.  This is a set of reference-counted stack
      traces.  When the reference count of a stack trace becomes zero,
      it is removed from the set and freed up.  The intent is to have
      a set of stack traces which can be referred to from (2), but to
      only represent each one once.  The set is indexed/searched by
      ordering on the stack trace vectors.

   2. A SparseWA of OldRefs.  These store information about each old
      ref that we need to record.  It is indexed by address of the
      location for which the information is recorded.  For LRU
      purposes, each OldRef also contains a generation number,
      indicating when it was most recently accessed.

      The important part of an OldRef is, however, its accs[] array.
      This is an array of N_OLDREF_ACCS which binds (thread, R/W,
      size) triples to RCECs.  This allows us to collect the last
      access-traceback by up to N_OLDREF_ACCS different triples for
      this location.  The accs[] array is a MTF-array.  If a binding
      falls off the end, that's too bad -- we will lose info about
      that triple's access to this location.

      When the SparseWA becomes too big, we can throw away the OldRefs
      whose generation numbers are below some threshold; hence doing
      approximate LRU discarding.  For each discarded OldRef we must
      of course decrement the reference count on the all RCECs it
      refers to, in order that entries from (1) eventually get
      discarded too.

   A major improvement in reliability of this mechanism would be to
   have a dynamically sized OldRef.accs[] array, so no entries ever
   fall off the end.  In investigations (Dec 08) it appears that a
   major cause for the non-availability of conflicting-access traces
   in race reports is caused by the fixed size of this array.  I
   suspect for most OldRefs, only a few entries are used, but for a
   minority of cases there is an overflow, leading to info lossage.
   Investigations also suggest this is very workload and scheduling
   sensitive.  Therefore a dynamic sizing would be better.

   However, dynamic sizing would defeat the use of a GroupAllocator
   for OldRef structures.  And that's important for performance.  So
   it's not straightforward to do.
*/


static UWord stats__ctxt_rcdec1 = 0;
static UWord stats__ctxt_rcdec2 = 0;
static UWord stats__ctxt_rcdec3 = 0;
static UWord stats__ctxt_rcdec_calls = 0;
static UWord stats__ctxt_rcdec_discards = 0;
static UWord stats__ctxt_rcdec1_eq = 0;

static UWord stats__ctxt_tab_curr = 0;
static UWord stats__ctxt_tab_max  = 0;

static UWord stats__ctxt_tab_qs   = 0;
static UWord stats__ctxt_tab_cmps = 0;


///////////////////////////////////////////////////////
//// Part (1): A hash table of RCECs
///

#define N_FRAMES 8

// (UInt) `echo "Reference Counted Execution Context" | md5sum`
#define RCEC_MAGIC 0xab88abb2UL

//#define N_RCEC_TAB 98317 /* prime */
#define N_RCEC_TAB 196613 /* prime */

typedef
   struct _RCEC {
      UWord magic;  /* sanity check only */
      struct _RCEC* next;
      UWord rc;
      UWord rcX; /* used for crosschecking */
      UWord frames_hash;          /* hash of all the frames */
      UWord frames[N_FRAMES];
   }
   RCEC;

static RCEC** contextTab = NULL; /* hash table of RCEC*s */


/* Gives an arbitrary total order on RCEC .frames fields */
static Word RCEC__cmp_by_frames ( RCEC* ec1, RCEC* ec2 ) {
   Word i;
   tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
   tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
   if (ec1->frames_hash < ec2->frames_hash) return -1;
   if (ec1->frames_hash > ec2->frames_hash) return  1;
   for (i = 0; i < N_FRAMES; i++) {
      if (ec1->frames[i] < ec2->frames[i]) return -1;
      if (ec1->frames[i] > ec2->frames[i]) return  1;
   }
   return 0;
}


/* Dec the ref of this RCEC. */
static void ctxt__rcdec ( RCEC* ec )
{
   stats__ctxt_rcdec_calls++;
   tl_assert(ec && ec->magic == RCEC_MAGIC);
   tl_assert(ec->rc > 0);
   ec->rc--;
}

static void ctxt__rcinc ( RCEC* ec )
{
   tl_assert(ec && ec->magic == RCEC_MAGIC);
   ec->rc++;
}


//////////// BEGIN RCEC group allocator
static GroupAlloc rcec_group_allocator;

static RCEC* alloc_RCEC ( void ) {
   return gal_Alloc ( &rcec_group_allocator );
}

static void free_RCEC ( RCEC* rcec ) {
   tl_assert(rcec->magic == RCEC_MAGIC);
   gal_Free( &rcec_group_allocator, rcec );
}
//////////// END RCEC group allocator


/* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
   move it one step closer the the front of the list, so as to make
   subsequent searches for it cheaper. */
static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
{
   RCEC *ec0, *ec1, *ec2;
   if (ec == *headp)
      tl_assert(0); /* already at head of list */
   tl_assert(ec != NULL);
   ec0 = *headp;
   ec1 = NULL;
   ec2 = NULL;
   while (True) {
      if (ec0 == NULL || ec0 == ec) break;
      ec2 = ec1;
      ec1 = ec0;
      ec0 = ec0->next;
   }
   tl_assert(ec0 == ec);
   if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
      RCEC* tmp;
      /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
         predecessor.  Swap ec0 and ec1, that is, move ec0 one step
         closer to the start of the list. */
      tl_assert(ec2->next == ec1);
      tl_assert(ec1->next == ec0);
      tmp = ec0->next;
      ec2->next = ec0;
      ec0->next = ec1;
      ec1->next = tmp;
   }
   else
   if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
      /* it's second in the list. */
      tl_assert(*headp == ec1);
      tl_assert(ec1->next == ec0);
      ec1->next = ec0->next;
      ec0->next = ec1;
      *headp = ec0;
   }
}


/* Find the given RCEC in the tree, and return a pointer to it.  Or,
   if not present, add the given one to the tree (by making a copy of
   it, so the caller can immediately deallocate the original) and
   return a pointer to the copy.  The caller can safely have 'example'
   on its stack, since we will always return a pointer to a copy of
   it, not to the original.  Note that the inserted node will have .rc
   of zero and so the caller must immediatly increment it. */
__attribute__((noinline))
static RCEC* ctxt__find_or_add ( RCEC* example )
{
   UWord hent;
   RCEC* copy;
   tl_assert(example && example->magic == RCEC_MAGIC);
   tl_assert(example->rc == 0);

   /* Search the hash table to see if we already have it. */
   stats__ctxt_tab_qs++;
   hent = example->frames_hash % N_RCEC_TAB;
   copy = contextTab[hent];
   while (1) {
      if (!copy) break;
      tl_assert(copy->magic == RCEC_MAGIC);
      stats__ctxt_tab_cmps++;
      if (0 == RCEC__cmp_by_frames(copy, example)) break;
      copy = copy->next;
   }

   if (copy) {
      tl_assert(copy != example);
      /* optimisation: if it's not at the head of its list, move 1
         step fwds, to make future searches cheaper */
      if (copy != contextTab[hent]) {
         move_RCEC_one_step_forward( &contextTab[hent], copy );
      }
   } else {
      copy = alloc_RCEC();
      tl_assert(copy != example);
      *copy = *example;
      copy->next = contextTab[hent];
      contextTab[hent] = copy;
      stats__ctxt_tab_curr++;
      if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
         stats__ctxt_tab_max = stats__ctxt_tab_curr;
   }
   return copy;
}

static inline UWord ROLW ( UWord w, Int n )
{
   Int bpw = 8 * sizeof(UWord);
   w = (w << n) | (w >> (bpw-n));
   return w;
}

__attribute__((noinline))
static RCEC* get_RCEC ( Thr* thr )
{
   UWord hash, i;
   RCEC  example;
   example.magic = RCEC_MAGIC;
   example.rc = 0;
   example.rcX = 0;
   main_get_stacktrace( thr, &example.frames[0], N_FRAMES );
   hash = 0;
   for (i = 0; i < N_FRAMES; i++) {
      hash ^= example.frames[i];
      hash = ROLW(hash, 19);
   }
   example.frames_hash = hash;
   return ctxt__find_or_add( &example );
}

///////////////////////////////////////////////////////
//// Part (2):
///  A SparseWA guest-addr -> OldRef, that refers to (1)
///

// (UInt) `echo "Old Reference Information" | md5sum`
#define OldRef_MAGIC 0x30b1f075UL

/* Records an access: a thread, a context (size & writeness) and the
   number of held locks. The size (1,2,4,8) is encoded as 00 = 1, 01 =
   2, 10 = 4, 11 = 8. 
*/
typedef
   struct { 
      RCEC*     rcec;
      WordSetID locksHeldW;
      UInt      thrid  : SCALARTS_N_THRBITS;
      UInt      szLg2B : 2;
      UInt      isW    : 1;
   }
   Thr_n_RCEC;

#define N_OLDREF_ACCS 5

typedef
   struct {
      UWord magic;  /* sanity check only */
      UWord gen;    /* when most recently accessed */
                    /* or free list when not in use */
      /* unused slots in this array have .thrid == 0, which is invalid */
      Thr_n_RCEC accs[N_OLDREF_ACCS];
   }
   OldRef;


//////////// BEGIN OldRef group allocator
static GroupAlloc oldref_group_allocator;

static OldRef* alloc_OldRef ( void ) {
   return gal_Alloc ( &oldref_group_allocator );
}

static void free_OldRef ( OldRef* r ) {
   tl_assert(r->magic == OldRef_MAGIC);
   gal_Free( &oldref_group_allocator, r );
}
//////////// END OldRef group allocator


static SparseWA* oldrefTree     = NULL; /* SparseWA* OldRef* */
static UWord     oldrefGen      = 0;    /* current LRU generation # */
static UWord     oldrefTreeN    = 0;    /* # elems in oldrefTree */
static UWord     oldrefGenIncAt = 0;    /* inc gen # when size hits this */

inline static UInt min_UInt ( UInt a, UInt b ) {
   return a < b ? a : b;
}

/* Compare the intervals [a1,a1+n1) and [a2,a2+n2).  Return -1 if the
   first interval is lower, 1 if the first interval is higher, and 0
   if there is any overlap.  Redundant paranoia with casting is there
   following what looked distinctly like a bug in gcc-4.1.2, in which
   some of the comparisons were done signedly instead of
   unsignedly. */
/* Copied from exp-ptrcheck/sg_main.c */
static Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
                                     Addr a2, SizeT n2 ) {
   UWord a1w = (UWord)a1;
   UWord n1w = (UWord)n1;
   UWord a2w = (UWord)a2;
   UWord n2w = (UWord)n2;
   tl_assert(n1w > 0 && n2w > 0);
   if (a1w + n1w <= a2w) return -1L;
   if (a2w + n2w <= a1w) return 1L;
   return 0;
}

static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
{
   OldRef* ref;
   RCEC*   rcec;
   Word    i, j;
   UWord   keyW, valW;
   Bool    b;

   tl_assert(thr);
   ThrID thrid = thr->thrid;
   tl_assert(thrid != 0); /* zero is used to denote an empty slot. */

   WordSetID locksHeldW = thr->hgthread->locksetW;

   rcec = get_RCEC( thr );
   ctxt__rcinc(rcec);

   UInt szLg2B = 0;
   switch (szB) {
      /* This doesn't look particularly branch-predictor friendly. */
      case 1:  szLg2B = 0; break;
      case 2:  szLg2B = 1; break;
      case 4:  szLg2B = 2; break;
      case 8:  szLg2B = 3; break;
      default: tl_assert(0);
   }

   /* Look in the map to see if we already have a record for this
      address. */
   b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, a );

   if (b) {

      /* We already have a record for this address.  We now need to
         see if we have a stack trace pertaining to this (thrid, R/W,
         size) triple. */
      tl_assert(keyW == a);
      ref = (OldRef*)valW;
      tl_assert(ref->magic == OldRef_MAGIC);

      for (i = 0; i < N_OLDREF_ACCS; i++) {
         if (ref->accs[i].thrid != thrid)
            continue;
         if (ref->accs[i].szLg2B != szLg2B)
            continue;
         if (ref->accs[i].isW != (UInt)(isW & 1))
            continue;
         /* else we have a match, so stop looking. */
         break;
      }

      if (i < N_OLDREF_ACCS) {
         /* thread 'thr' has an entry at index 'i'.  Update its RCEC. */
         if (i > 0) {
            Thr_n_RCEC tmp = ref->accs[i-1];
            ref->accs[i-1] = ref->accs[i];
            ref->accs[i] = tmp;
            i--;
         }
         if (rcec == ref->accs[i].rcec) stats__ctxt_rcdec1_eq++;
         stats__ctxt_rcdec1++;
         ctxt__rcdec( ref->accs[i].rcec );
         tl_assert(ref->accs[i].thrid == thrid);
         /* Update the RCEC and the W-held lockset. */
         ref->accs[i].rcec       = rcec;
         ref->accs[i].locksHeldW = locksHeldW;
      } else {
         /* No entry for this (thread, R/W, size, nWHeld) quad.
            Shuffle all of them down one slot, and put the new entry
            at the start of the array. */
         if (ref->accs[N_OLDREF_ACCS-1].thrid != 0) {
            /* the last slot is in use.  We must dec the rc on the
               associated rcec. */
            tl_assert(ref->accs[N_OLDREF_ACCS-1].rcec);
            stats__ctxt_rcdec2++;
            if (0 && 0 == (stats__ctxt_rcdec2 & 0xFFF))
               VG_(printf)("QQQQ %lu overflows\n",stats__ctxt_rcdec2);
            ctxt__rcdec( ref->accs[N_OLDREF_ACCS-1].rcec );
         } else {
            tl_assert(!ref->accs[N_OLDREF_ACCS-1].rcec);
         }
         for (j = N_OLDREF_ACCS-1; j >= 1; j--)
            ref->accs[j] = ref->accs[j-1];
         ref->accs[0].thrid      = thrid;
         ref->accs[0].szLg2B     = szLg2B;
         ref->accs[0].isW        = (UInt)(isW & 1);
         ref->accs[0].locksHeldW = locksHeldW;
         ref->accs[0].rcec       = rcec;
         /* thrid==0 is used to signify an empty slot, so we can't
            add zero thrid (such a ThrID is invalid anyway). */
         /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
      }

      ref->gen = oldrefGen;

   } else {

      /* We don't have a record for this address.  Create a new one. */
      if (oldrefTreeN >= oldrefGenIncAt) {
         oldrefGen++;
         oldrefGenIncAt = oldrefTreeN + 50000;
         if (0) VG_(printf)("oldrefTree: new gen %lu at size %lu\n",
                            oldrefGen, oldrefTreeN );
      }

      ref = alloc_OldRef();
      ref->magic = OldRef_MAGIC;
      ref->gen   = oldrefGen;
      ref->accs[0].thrid      = thrid;
      ref->accs[0].szLg2B     = szLg2B;
      ref->accs[0].isW        = (UInt)(isW & 1);
      ref->accs[0].locksHeldW = locksHeldW;
      ref->accs[0].rcec       = rcec;

      /* thrid==0 is used to signify an empty slot, so we can't
         add zero thrid (such a ThrID is invalid anyway). */
      /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */

      /* Clear out the rest of the entries */
      for (j = 1; j < N_OLDREF_ACCS; j++) {
         ref->accs[j].rcec       = NULL;
         ref->accs[j].thrid      = 0;
         ref->accs[j].szLg2B     = 0;
         ref->accs[j].isW        = 0;
         ref->accs[j].locksHeldW = 0;
      }
      VG_(addToSWA)( oldrefTree, a, (UWord)ref );
      oldrefTreeN++;

   }
}


/* Extract info from the conflicting-access machinery. */
Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
                              /*OUT*/Thr**        resThr,
                              /*OUT*/SizeT*       resSzB,
                              /*OUT*/Bool*        resIsW,
                              /*OUT*/WordSetID*   locksHeldW,
                              Thr* thr, Addr a, SizeT szB, Bool isW )
{
   Word    i, j;
   OldRef* ref;
   UWord   keyW, valW;
   Bool    b;

   ThrID     cand_thrid;
   RCEC*     cand_rcec;
   Bool      cand_isW;
   SizeT     cand_szB;
   WordSetID cand_locksHeldW;
   Addr      cand_a;

   Addr toCheck[15];
   Int  nToCheck = 0;

   tl_assert(thr);
   tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);

   ThrID thrid = thr->thrid;

   toCheck[nToCheck++] = a;
   for (i = -7; i < (Word)szB; i++) {
      if (i != 0)
         toCheck[nToCheck++] = a + i;
   }
   tl_assert(nToCheck <= 15);

   /* Now see if we can find a suitable matching event for
      any of the addresses in toCheck[0 .. nToCheck-1]. */
   for (j = 0; j < nToCheck; j++) {

      cand_a = toCheck[j];
      //      VG_(printf)("test %ld %p\n", j, cand_a);

      b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, cand_a );
      if (!b)
         continue;

      ref = (OldRef*)valW;
      tl_assert(keyW == cand_a);
      tl_assert(ref->magic == OldRef_MAGIC);
      tl_assert(ref->accs[0].thrid != 0); /* first slot must always be used */

      cand_thrid      = 0; /* invalid; see comments in event_map_bind */
      cand_rcec       = NULL;
      cand_isW        = False;
      cand_szB        = 0;
      cand_locksHeldW = 0; /* always valid; see initialise_data_structures() */

      for (i = 0; i < N_OLDREF_ACCS; i++) {
         Thr_n_RCEC* cand = &ref->accs[i];
         cand_rcec       = cand->rcec;
         cand_thrid      = cand->thrid;
         cand_isW        = (Bool)cand->isW;
         cand_szB        = 1 << cand->szLg2B;
         cand_locksHeldW = cand->locksHeldW;

         if (cand_thrid == 0) 
            /* This slot isn't in use.  Ignore it. */
            continue;

         if (cand_thrid == thrid)
            /* This is an access by the same thread, but we're only
               interested in accesses from other threads.  Ignore. */
            continue;

         if ((!cand_isW) && (!isW))
            /* We don't want to report a read racing against another
               read; that's stupid.  So in this case move on. */
            continue;

         if (cmp_nonempty_intervals(a, szB, cand_a, cand_szB) != 0)
            /* No overlap with the access we're asking about.  Ignore. */
            continue;

         /* We have a match.  Stop searching. */
         break;
      }

      tl_assert(i >= 0 && i <= N_OLDREF_ACCS);

      if (i < N_OLDREF_ACCS) {
         Int n, maxNFrames;
         /* return with success */
         tl_assert(cand_thrid);
         tl_assert(cand_rcec);
         tl_assert(cand_rcec->magic == RCEC_MAGIC);
         tl_assert(cand_szB >= 1);
         /* Count how many non-zero frames we have. */
         maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size));
         for (n = 0; n < maxNFrames; n++) {
            if (0 == cand_rcec->frames[n]) break;
         }
         *resEC      = VG_(make_ExeContext_from_StackTrace)
                          (cand_rcec->frames, n);
         *resThr     = Thr__from_ThrID(cand_thrid);
         *resSzB     = cand_szB;
         *resIsW     = cand_isW;
         *locksHeldW = cand_locksHeldW;
         return True;
      }

      /* consider next address in toCheck[] */
   } /* for (j = 0; j < nToCheck; j++) */

   /* really didn't find anything. */
   return False;
}

static void event_map_init ( void )
{
   Word i;

   /* Context (RCEC) group allocator */
   init_GroupAlloc ( &rcec_group_allocator,
                     sizeof(RCEC),
                     1000 /* RCECs per group */,
                     HG_(zalloc),
                     "libhb.event_map_init.1 (RCEC groups)",
                     HG_(free) );

   /* Context table */
   tl_assert(!contextTab);
   contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
                             N_RCEC_TAB * sizeof(RCEC*) );
   tl_assert(contextTab);
   for (i = 0; i < N_RCEC_TAB; i++)
      contextTab[i] = NULL;

   /* Oldref group allocator */
   init_GroupAlloc ( &oldref_group_allocator,
                     sizeof(OldRef),
                     1000 /* OldRefs per group */,
                     HG_(zalloc),
                     "libhb.event_map_init.3 (OldRef groups)",
                     HG_(free) );

   /* Oldref tree */
   tl_assert(!oldrefTree);
   oldrefTree = VG_(newSWA)(
                   HG_(zalloc),
                   "libhb.event_map_init.4 (oldref tree)", 
                   HG_(free)
                );
   tl_assert(oldrefTree);

   oldrefGen = 0;
   oldrefGenIncAt = 0;
   oldrefTreeN = 0;
}

static void event_map__check_reference_counts ( Bool before )
{
   RCEC*   rcec;
   OldRef* oldref;
   Word    i;
   UWord   nEnts = 0;
   UWord   keyW, valW;

   /* Set the 'check' reference counts to zero.  Also, optionally
      check that the real reference counts are non-zero.  We allow
      these to fall to zero before a GC, but the GC must get rid of
      all those that are zero, hence none should be zero after a
      GC. */
   for (i = 0; i < N_RCEC_TAB; i++) {
      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
         nEnts++;
         tl_assert(rcec);
         tl_assert(rcec->magic == RCEC_MAGIC);
         if (!before)
            tl_assert(rcec->rc > 0);
         rcec->rcX = 0;
      }
   }

   /* check that the stats are sane */
   tl_assert(nEnts == stats__ctxt_tab_curr);
   tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);

   /* visit all the referencing points, inc check ref counts */
   VG_(initIterSWA)( oldrefTree );
   while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
      oldref = (OldRef*)valW;
      tl_assert(oldref->magic == OldRef_MAGIC);
      for (i = 0; i < N_OLDREF_ACCS; i++) {
         ThrID aThrID = oldref->accs[i].thrid;
         RCEC* aRef   = oldref->accs[i].rcec;
         if (aThrID != 0) {
            tl_assert(aRef);
            tl_assert(aRef->magic == RCEC_MAGIC);
            aRef->rcX++;
         } else {
            tl_assert(!aRef);
         }
      }
   }

   /* compare check ref counts with actual */
   for (i = 0; i < N_RCEC_TAB; i++) {
      for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
         tl_assert(rcec->rc == rcec->rcX);
      }
   }
}

__attribute__((noinline))
static void event_map_maybe_GC ( void )
{
   OldRef* oldref;
   UWord   keyW, valW, retained, maxGen;
   XArray* refs2del;
   Word    i, j, n2del;

   UWord* genMap      = NULL;
   UWord  genMap_min  = 0;
   UWord  genMap_size = 0;

   if (LIKELY(oldrefTreeN < HG_(clo_conflict_cache_size)))
      return;

   if (0)
      VG_(printf)("libhb: event_map GC at size %lu\n", oldrefTreeN);

   /* Check for sane command line params.  Limit values must match
      those in hg_process_cmd_line_option. */
   tl_assert( HG_(clo_conflict_cache_size) >= 10*1000 );
   tl_assert( HG_(clo_conflict_cache_size) <= 30*1000*1000 );

   /* Check our counting is sane (expensive) */
   if (CHECK_CEM)
      tl_assert(oldrefTreeN == VG_(sizeSWA)( oldrefTree ));

   /* Check the reference counts (expensive) */
   if (CHECK_CEM)
      event_map__check_reference_counts( True/*before*/ );

   /* Compute the distribution of generation values in the ref tree.
      There are likely only to be a few different generation numbers
      in the whole tree, but we don't know what they are.  Hence use a
      dynamically resized array of counters.  The array is genMap[0
      .. genMap_size-1], where genMap[0] is the count for the
      generation number genMap_min, genMap[1] is the count for
      genMap_min+1, etc.  If a new number is seen outside the range
      [genMap_min .. genMap_min + genMap_size - 1] then the array is
      copied into a larger array, and genMap_min and genMap_size are
      adjusted accordingly. */

   /* genMap :: generation-number -> count-of-nodes-with-that-number */

   VG_(initIterSWA)( oldrefTree );
   while ( VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {

       UWord ea, key;
       oldref = (OldRef*)valW;
       key = oldref->gen;

      /* BEGIN find 'ea', which is the index in genMap holding the
         count for generation number 'key'. */
      if (UNLIKELY(genMap == NULL)) {
         /* deal with the first key to be seen, so that the following
            cases don't need to handle the complexity of a NULL count
            array. */
         genMap_min  = key;
         genMap_size = 1;
         genMap = HG_(zalloc)( "libhb.emmG.1a",
                                genMap_size * sizeof(UWord) );
         ea = 0;
         if (0) VG_(printf)("(%lu) case 1 [%lu .. %lu]\n",
                            key, genMap_min, genMap_min+genMap_size- 1 );
      }
      else
      if (LIKELY(key >= genMap_min && key < genMap_min + genMap_size)) {
         /* this is the expected (almost-always-happens) case: 'key'
            is already mapped in the array. */
         ea = key - genMap_min;
      }
      else
      if (key < genMap_min) {
         /* 'key' appears before the start of the current array.
            Extend the current array by allocating a larger one and
            copying the current one to the upper end of it. */
         Word   more;
         UWord* map2;
         more = genMap_min - key;
         tl_assert(more > 0);
         map2 = HG_(zalloc)( "libhb.emmG.1b",
                             (genMap_size + more) * sizeof(UWord) );
         VG_(memcpy)( &map2[more], genMap, genMap_size * sizeof(UWord) );
         HG_(free)( genMap );
         genMap = map2;
         genMap_size += more;
         genMap_min -= more;
         ea = 0;
         tl_assert(genMap_min == key);
         if (0) VG_(printf)("(%lu) case 2 [%lu .. %lu]\n",
                            key, genMap_min,  genMap_min+genMap_size- 1 );
      }
      else {
         /* 'key' appears after the end of the current array.  Extend
            the current array by allocating a larger one and copying
            the current one to the lower end of it. */
         Word   more;
         UWord* map2;
         tl_assert(key >= genMap_min + genMap_size);
         more = key - (genMap_min + genMap_size) + 1;
         tl_assert(more > 0);
         map2 = HG_(zalloc)( "libhb.emmG.1c",
                             (genMap_size + more) * sizeof(UWord) );
         VG_(memcpy)( &map2[0], genMap, genMap_size * sizeof(UWord) );
         HG_(free)( genMap );
         genMap = map2;
         genMap_size += more;
         ea = genMap_size - 1;;
         tl_assert(genMap_min + genMap_size - 1 == key);
         if (0) VG_(printf)("(%lu) case 3 [%lu .. %lu]\n",
                            key, genMap_min, genMap_min+genMap_size- 1 );
      }
      /* END find 'ea' from 'key' */

      tl_assert(ea >= 0 && ea < genMap_size);
      /* and the whole point of this elaborate computation of 'ea' is .. */
      genMap[ea]++;
   }

   tl_assert(genMap);
   tl_assert(genMap_size > 0);

   /* Sanity check what we just computed */
   { UWord sum = 0;
     for (i = 0; i < genMap_size; i++) {
        if (0) VG_(printf)("  xxx: gen %ld has %lu\n",
                           i + genMap_min, genMap[i] );
        sum += genMap[i];
     }
     tl_assert(sum == oldrefTreeN);
   }

   /* Figure out how many generations to throw away */
   retained = oldrefTreeN;
   maxGen = 0;

   for (i = 0; i < genMap_size; i++) {
      keyW = i + genMap_min;
      valW = genMap[i];
      tl_assert(keyW > 0); /* can't allow a generation # 0 */
      if (0) VG_(printf)("  XXX: gen %lu has %lu\n", keyW, valW );
      tl_assert(keyW >= maxGen);
      tl_assert(retained >= valW);
      if (retained - valW
          > (UWord)(HG_(clo_conflict_cache_size) 
                    * EVENT_MAP_GC_DISCARD_FRACTION)) {
         retained -= valW;
         maxGen = keyW;
      } else {
         break;
      }
   }

   HG_(free)(genMap);

   tl_assert(retained >= 0 && retained <= oldrefTreeN);

   /* Now make up a big list of the oldrefTree entries we want to
      delete.  We can't simultaneously traverse the tree and delete
      stuff from it, so first we need to copy them off somewhere
      else. (sigh) */
   refs2del = VG_(newXA)( HG_(zalloc), "libhb.emmG.2",
                          HG_(free), sizeof(Addr) );

   if (retained < oldrefTreeN) {

      /* This is the normal (expected) case.  We discard any ref whose
         generation number <= maxGen. */
      VG_(initIterSWA)( oldrefTree );
      while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
         oldref = (OldRef*)valW;
         tl_assert(oldref->magic == OldRef_MAGIC);
         if (oldref->gen <= maxGen) {
            VG_(addToXA)( refs2del, &keyW );
         }
      }
      if (VG_(clo_stats)) {
         VG_(message)(Vg_DebugMsg,
            "libhb: EvM GC: delete generations %lu and below, "
            "retaining %lu entries\n",
            maxGen, retained );
      }

   } else {

      static UInt rand_seed = 0; /* leave as static */

      /* Degenerate case: there's only one generation in the entire
         tree, so we need to have some other way of deciding which
         refs to throw away.  Just throw out half of them randomly. */
      tl_assert(retained == oldrefTreeN);
      VG_(initIterSWA)( oldrefTree );
      while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
         UInt n;
         oldref = (OldRef*)valW;
         tl_assert(oldref->magic == OldRef_MAGIC);
         n = VG_(random)( &rand_seed );
         if ((n & 0xFFF) < 0x800) {
            VG_(addToXA)( refs2del, &keyW );
            retained--;
         }
      }
      if (VG_(clo_stats)) {
         VG_(message)(Vg_DebugMsg,
            "libhb: EvM GC: randomly delete half the entries, "
            "retaining %lu entries\n",
            retained );
      }

   }

   n2del = VG_(sizeXA)( refs2del );
   tl_assert(n2del == (Word)(oldrefTreeN - retained));

   if (0) VG_(printf)("%s","deleting entries\n");
   for (i = 0; i < n2del; i++) {
      Bool  b;
      Addr  ga2del = *(Addr*)VG_(indexXA)( refs2del, i );
      b = VG_(delFromSWA)( oldrefTree, &keyW, &valW, ga2del );
      tl_assert(b);
      tl_assert(keyW == ga2del);
      oldref = (OldRef*)valW;
      for (j = 0; j < N_OLDREF_ACCS; j++) {
         ThrID aThrID = oldref->accs[j].thrid;
         RCEC* aRef   = oldref->accs[j].rcec;
         if (aRef) {
            tl_assert(aThrID != 0);
            stats__ctxt_rcdec3++;
            ctxt__rcdec( aRef );
         } else {
            tl_assert(aThrID == 0);
         }
      }

      free_OldRef( oldref );
   }

   VG_(deleteXA)( refs2del );

   tl_assert( VG_(sizeSWA)( oldrefTree ) == retained );

   oldrefTreeN = retained;
   oldrefGenIncAt = oldrefTreeN; /* start new gen right away */

   /* Throw away all RCECs with zero reference counts */
   for (i = 0; i < N_RCEC_TAB; i++) {
      RCEC** pp = &contextTab[i];
      RCEC*  p  = *pp;
      while (p) {
         if (p->rc == 0) {
            *pp = p->next;
            free_RCEC(p);
            p = *pp;
            tl_assert(stats__ctxt_tab_curr > 0);
            stats__ctxt_tab_curr--;
         } else {
            pp = &p->next;
            p = p->next;
         }
      }
   }

   /* Check the reference counts (expensive) */
   if (CHECK_CEM)
      event_map__check_reference_counts( False/*after*/ );

   //if (0)
   //VG_(printf)("XXXX final sizes: oldrefTree %ld, contextTree %ld\n\n",
   //            VG_(OSetGen_Size)(oldrefTree), VG_(OSetGen_Size)(contextTree));

}


/////////////////////////////////////////////////////////
//                                                     //
// Core MSM                                            //
//                                                     //
/////////////////////////////////////////////////////////

/* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
   Nov 08, and again after [...],
   June 09. */

static ULong stats__msmcread         = 0;
static ULong stats__msmcread_change  = 0;
static ULong stats__msmcwrite        = 0;
static ULong stats__msmcwrite_change = 0;

/* Some notes on the H1 history mechanism:

   Transition rules are:

   read_{Kr,Kw}(Cr,Cw)  = (Cr,           Cr `join` Kw)
   write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)

   After any access by a thread T to a location L, L's constraint pair
   (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.

   After a race by thread T conflicting with some previous access by
   some other thread U, for a location with constraint (before
   processing the later access) (Cr,Cw), then Cw[U] is the segment in
   which the previously access lies.

   Hence in record_race_info, we pass in Cfailed and Kfailed, which
   are compared so as to find out which thread(s) this access
   conflicts with.  Once that is established, we also require the
   pre-update Cw for the location, so we can index into it for those
   threads, to get the scalar clock values for the point at which the
   former accesses were made.  (In fact we only bother to do any of
   this for an arbitrarily chosen one of the conflicting threads, as
   that's simpler, it avoids flooding the user with vast amounts of
   mostly useless information, and because the program is wrong if it
   contains any races at all -- so we don't really need to show all
   conflicting access pairs initially, so long as we only show none if
   none exist).

   ---

   That requires the auxiliary proof that 

      (Cr `join` Kw)[T] == Kw[T]

   Why should that be true?  Because for any thread T, Kw[T] >= the
   scalar clock value for T known by any other thread.  In other
   words, because T's value for its own scalar clock is at least as up
   to date as the value for it known by any other thread (that is true
   for both the R- and W- scalar clocks).  Hence no other thread will
   be able to feed in a value for that element (indirectly via a
   constraint) which will exceed Kw[T], and hence the join cannot
   cause that particular element to advance.
*/

__attribute__((noinline))
static void record_race_info ( Thr* acc_thr, 
                               Addr acc_addr, SizeT szB, Bool isWrite,
                               VtsID Cfailed,
                               VtsID Kfailed,
                               VtsID Cw )
{
   /* Call here to report a race.  We just hand it onwards to
      HG_(record_error_Race).  If that in turn discovers that the
      error is going to be collected, then, at history_level 2, that
      queries the conflicting-event map.  The alternative would be to
      query it right here.  But that causes a lot of pointless queries
      for errors which will shortly be discarded as duplicates, and
      can become a performance overhead; so we defer the query until
      we know the error is not a duplicate. */

   /* Stacks for the bounds of the (or one of the) conflicting
      segment(s).  These are only set at history_level 1. */
   ExeContext* hist1_seg_start = NULL;
   ExeContext* hist1_seg_end   = NULL;
   Thread*     hist1_conf_thr  = NULL;

   tl_assert(acc_thr);
   tl_assert(acc_thr->hgthread);
   tl_assert(acc_thr->hgthread->hbthr == acc_thr);
   tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);

   if (HG_(clo_history_level) == 1) {
      Bool found;
      Word firstIx, lastIx;
      ULong_n_EC key;

      /* At history_level 1, we must round up the relevant stack-pair
         for the conflicting segment right now.  This is because
         deferring it is complex; we can't (easily) put Kfailed and
         Cfailed into the XError and wait for later without
         getting tied up in difficulties with VtsID reference
         counting.  So just do it now. */
      Thr*  confThr;
      ULong confTym = 0;
      /* Which thread are we in conflict with?  There may be more than
         one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
         (in fact it's the one with the lowest Thr* value). */
      confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
      /* This must exist!  since if it was NULL then there's no
         conflict (semantics of return value of
         VtsID__findFirst_notLEQ), and msmc{read,write}, which has
         called us, just checked exactly this -- that there was in
         fact a race. */
      tl_assert(confThr);

      /* Get the scalar clock value that the conflicting thread
         introduced into the constraint.  A careful examination of the
         base machine rules shows that this must be the same as the
         conflicting thread's scalar clock when it created this
         constraint.  Hence we know the scalar clock of the
         conflicting thread when the conflicting access was made. */
      confTym = VtsID__indexAt( Cfailed, confThr );

      /* Using this scalar clock, index into the conflicting thread's
         collection of stack traces made each time its vector clock
         (hence its scalar clock) changed.  This gives the stack
         traces at the start and end of the conflicting segment (well,
         as per comment just above, of one of the conflicting
         segments, if there are more than one). */
      key.ull = confTym;
      key.ec  = NULL;
      /* tl_assert(confThr); -- asserted just above */
      tl_assert(confThr->local_Kws_n_stacks);
      firstIx = lastIx = 0;
      found = VG_(lookupXA_UNSAFE)(
                 confThr->local_Kws_n_stacks,
                 &key, &firstIx, &lastIx,
                 (Int(*)(void*,void*))cmp__ULong_n_EC__by_ULong
              );
      if (0) VG_(printf)("record_race_info %u %u %u  confThr %p "
                         "confTym %llu found %d (%lu,%lu)\n", 
                         Cfailed, Kfailed, Cw,
                         confThr, confTym, found, firstIx, lastIx);
      /* We can't indefinitely collect stack traces at VTS
         transitions, since we'd eventually run out of memory.  Hence
         note_local_Kw_n_stack_for will eventually throw away old
         ones, which in turn means we might fail to find index value
         confTym in the array. */
      if (found) {
         ULong_n_EC *pair_start, *pair_end;
         pair_start 
            = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
         hist1_seg_start = pair_start->ec;
         if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
            pair_end
               = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
                                            lastIx+1 );
            /* from properties of VG_(lookupXA) and the comparison fn used: */
            tl_assert(pair_start->ull < pair_end->ull);
            hist1_seg_end = pair_end->ec;
            /* Could do a bit better here.  It may be that pair_end
               doesn't have a stack, but the following entries in the
               array have the same scalar Kw and to have a stack.  So
               we should search a bit further along the array than
               lastIx+1 if hist1_seg_end is NULL. */
         } else {
            if (!confThr->llexit_done)
               hist1_seg_end = main_get_EC( confThr );
         }
         // seg_start could be NULL iff this is the first stack in the thread
         //if (seg_start) VG_(pp_ExeContext)(seg_start);
         //if (seg_end)   VG_(pp_ExeContext)(seg_end);
         hist1_conf_thr = confThr->hgthread;
      }
   }

   HG_(record_error_Race)( acc_thr->hgthread, acc_addr,
                           szB, isWrite,
                           hist1_conf_thr, hist1_seg_start, hist1_seg_end );
}

static Bool is_sane_SVal_C ( SVal sv ) {
   Bool leq;
   if (!SVal__isC(sv)) return True;
   leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
   return leq;
}


/* Compute new state following a read */
static inline SVal msmcread ( SVal svOld,
                              /* The following are only needed for 
                                 creating error reports. */
                              Thr* acc_thr,
                              Addr acc_addr, SizeT szB )
{
   SVal svNew = SVal_INVALID;
   stats__msmcread++;

   /* Redundant sanity check on the constraints */
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svOld));
   }

   if (LIKELY(SVal__isC(svOld))) {
      VtsID tviR  = acc_thr->viR;
      VtsID tviW  = acc_thr->viW;
      VtsID rmini = SVal__unC_Rmin(svOld);
      VtsID wmini = SVal__unC_Wmin(svOld);
      Bool  leq   = VtsID__cmpLEQ(rmini,tviR);
      if (LIKELY(leq)) {
         /* no race */
         /* Note: RWLOCK subtlety: use tviW, not tviR */
         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
         goto out;
      } else {
         /* assert on sanity of constraints. */
         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
         tl_assert(leqxx);
         // same as in non-race case
         svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
         record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
                           rmini, /* Cfailed */
                           tviR,  /* Kfailed */
                           wmini  /* Cw */ );
         goto out;
      }
   }
   if (SVal__isA(svOld)) {
      /* reading no-access memory (sigh); leave unchanged */
      /* check for no pollution */
      tl_assert(svOld == SVal_NOACCESS);
      svNew = SVal_NOACCESS;
      goto out;
   }
   if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
   tl_assert(0);

  out:
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svNew));
   }
   if (UNLIKELY(svNew != svOld)) {
      tl_assert(svNew != SVal_INVALID);
      if (HG_(clo_history_level) >= 2
          && SVal__isC(svOld) && SVal__isC(svNew)) {
         event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
         stats__msmcread_change++;
      }
   }
   return svNew;
}


/* Compute new state following a write */
static inline SVal msmcwrite ( SVal svOld,
                              /* The following are only needed for 
                                 creating error reports. */
                              Thr* acc_thr,
                              Addr acc_addr, SizeT szB )
{
   SVal svNew = SVal_INVALID;
   stats__msmcwrite++;

   /* Redundant sanity check on the constraints */
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svOld));
   }

   if (LIKELY(SVal__isC(svOld))) {
      VtsID tviW  = acc_thr->viW;
      VtsID wmini = SVal__unC_Wmin(svOld);
      Bool  leq   = VtsID__cmpLEQ(wmini,tviW);
      if (LIKELY(leq)) {
         /* no race */
         svNew = SVal__mkC( tviW, tviW );
         goto out;
      } else {
         VtsID rmini = SVal__unC_Rmin(svOld);
         /* assert on sanity of constraints. */
         Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
         tl_assert(leqxx);
         // same as in non-race case
         // proof: in the non-race case, we have
         //    rmini <= wmini (invar on constraints)
         //    tviW <= tviR (invar on thread clocks)
         //    wmini <= tviW (from run-time check)
         // hence from transitivity of <= we have
         //    rmini <= wmini <= tviW
         // and so join(rmini,tviW) == tviW
         // and    join(wmini,tviW) == tviW
         // qed.
         svNew = SVal__mkC( VtsID__join2(rmini, tviW),
                            VtsID__join2(wmini, tviW) );
         record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
                           wmini, /* Cfailed */
                           tviW,  /* Kfailed */
                           wmini  /* Cw */ );
         goto out;
      }
   }
   if (SVal__isA(svOld)) {
      /* writing no-access memory (sigh); leave unchanged */
      /* check for no pollution */
      tl_assert(svOld == SVal_NOACCESS);
      svNew = SVal_NOACCESS;
      goto out;
   }
   if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
   tl_assert(0);

  out:
   if (CHECK_MSM) {
      tl_assert(is_sane_SVal_C(svNew));
   }
   if (UNLIKELY(svNew != svOld)) {
      tl_assert(svNew != SVal_INVALID);
      if (HG_(clo_history_level) >= 2
          && SVal__isC(svOld) && SVal__isC(svNew)) {
         event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
         stats__msmcwrite_change++;
      }
   }
   return svNew;
}


/////////////////////////////////////////////////////////
//                                                     //
// Apply core MSM to specific memory locations         //
//                                                     //
/////////////////////////////////////////////////////////

/*------------- ZSM accesses: 8 bit sapply ------------- */

static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cread08s++;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 .. 7 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
      SVal* tree = &cl->svals[tno << 3];
      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcread( svOld, thr,a,1 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
}

static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cwrite08s++;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 .. 7 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
      SVal* tree = &cl->svals[tno << 3];
      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcwrite( svOld, thr,a,1 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
}

/*------------- ZSM accesses: 16 bit sapply ------------- */

static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cread16s++;
   if (UNLIKELY(!aligned16(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
      if (valid_value_is_below_me_16(descr, toff)) {
         goto slowcase;
      } else {
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
      }
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcread( svOld, thr,a,2 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_16to8splits++;
   zsm_sapply08__msmcread( thr, a + 0 );
   zsm_sapply08__msmcread( thr, a + 1 );
}

static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cwrite16s++;
   if (UNLIKELY(!aligned16(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
      if (valid_value_is_below_me_16(descr, toff)) {
         goto slowcase;
      } else {
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
      }
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcwrite( svOld, thr,a,2 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_16to8splits++;
   zsm_sapply08__msmcwrite( thr, a + 0 );
   zsm_sapply08__msmcwrite( thr, a + 1 );
}

/*------------- ZSM accesses: 32 bit sapply ------------- */

static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cread32s++;
   if (UNLIKELY(!aligned32(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 or 4 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
      if (valid_value_is_above_me_32(descr, toff)) {
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
      } else {
         goto slowcase;
      }
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcread( svOld, thr,a,4 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_32to16splits++;
   zsm_sapply16__msmcread( thr, a + 0 );
   zsm_sapply16__msmcread( thr, a + 2 );
}

static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cwrite32s++;
   if (UNLIKELY(!aligned32(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 or 4 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
      if (valid_value_is_above_me_32(descr, toff)) {
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
      } else {
         goto slowcase;
      }
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   svOld = cl->svals[cloff];
   svNew = msmcwrite( svOld, thr,a,4 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_32to16splits++;
   zsm_sapply16__msmcwrite( thr, a + 0 );
   zsm_sapply16__msmcwrite( thr, a + 2 );
}

/*------------- ZSM accesses: 64 bit sapply ------------- */

static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno;
   //UWord      toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cread64s++;
   if (UNLIKELY(!aligned64(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   //toff  = get_tree_offset(a); /* == 0, unused */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
      goto slowcase;
   }
   svOld = cl->svals[cloff];
   svNew = msmcread( svOld, thr,a,8 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_64to32splits++;
   zsm_sapply32__msmcread( thr, a + 0 );
   zsm_sapply32__msmcread( thr, a + 4 );
}

static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno;
   //UWord      toff;
   SVal       svOld, svNew;
   UShort     descr;
   stats__cline_cwrite64s++;
   if (UNLIKELY(!aligned64(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   //toff  = get_tree_offset(a); /* == 0, unused */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
      goto slowcase;
   }
   svOld = cl->svals[cloff];
   svNew = msmcwrite( svOld, thr,a,8 );
   if (CHECK_ZSM)
      tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
   return;
  slowcase: /* misaligned, or must go further down the tree */
   stats__cline_64to32splits++;
   zsm_sapply32__msmcwrite( thr, a + 0 );
   zsm_sapply32__msmcwrite( thr, a + 4 );
}

/*--------------- ZSM accesses: 8 bit swrite --------------- */

static
void zsm_swrite08 ( Addr a, SVal svNew ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   UShort     descr;
   stats__cline_swrite08s++;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 .. 7 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
      SVal* tree = &cl->svals[tno << 3];
      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   }
   tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff] = svNew;
}

/*--------------- ZSM accesses: 16 bit swrite --------------- */

static
void zsm_swrite16 ( Addr a, SVal svNew ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   UShort     descr;
   stats__cline_swrite16s++;
   if (UNLIKELY(!aligned16(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
      if (valid_value_is_below_me_16(descr, toff)) {
         /* Writing at this level.  Need to fix up 'descr'. */
         cl->descrs[tno] = pullup_descr_to_16(descr, toff);
         /* At this point, the tree does not match cl->descr[tno] any
            more.  The assignments below will fix it up. */
      } else {
         /* We can't indiscriminately write on the w16 node as in the
            w64 case, as that might make the node inconsistent with
            its parent.  So first, pull down to this level. */
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
      }
   }
   tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff + 0] = svNew;
   cl->svals[cloff + 1] = SVal_INVALID;
   return;
  slowcase: /* misaligned */
   stats__cline_16to8splits++;
   zsm_swrite08( a + 0, svNew );
   zsm_swrite08( a + 1, svNew );
}

/*--------------- ZSM accesses: 32 bit swrite --------------- */

static
void zsm_swrite32 ( Addr a, SVal svNew ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   UShort     descr;
   stats__cline_swrite32s++;
   if (UNLIKELY(!aligned32(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 or 4 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
      if (valid_value_is_above_me_32(descr, toff)) {
         /* We can't indiscriminately write on the w32 node as in the
            w64 case, as that might make the node inconsistent with
            its parent.  So first, pull down to this level. */
         SVal* tree = &cl->svals[tno << 3];
         cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
         if (CHECK_ZSM)
            tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
      } else {
         /* Writing at this level.  Need to fix up 'descr'. */
         cl->descrs[tno] = pullup_descr_to_32(descr, toff);
         /* At this point, the tree does not match cl->descr[tno] any
            more.  The assignments below will fix it up. */
      }
   }
   tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff + 0] = svNew;
   cl->svals[cloff + 1] = SVal_INVALID;
   cl->svals[cloff + 2] = SVal_INVALID;
   cl->svals[cloff + 3] = SVal_INVALID;
   return;
  slowcase: /* misaligned */
   stats__cline_32to16splits++;
   zsm_swrite16( a + 0, svNew );
   zsm_swrite16( a + 2, svNew );
}

/*--------------- ZSM accesses: 64 bit swrite --------------- */

static
void zsm_swrite64 ( Addr a, SVal svNew ) {
   CacheLine* cl; 
   UWord      cloff, tno;
   //UWord    toff;
   stats__cline_swrite64s++;
   if (UNLIKELY(!aligned64(a))) goto slowcase;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   //toff  = get_tree_offset(a); /* == 0, unused */
   cl->descrs[tno] = TREE_DESCR_64;
   tl_assert(svNew != SVal_INVALID);
   cl->svals[cloff + 0] = svNew;
   cl->svals[cloff + 1] = SVal_INVALID;
   cl->svals[cloff + 2] = SVal_INVALID;
   cl->svals[cloff + 3] = SVal_INVALID;
   cl->svals[cloff + 4] = SVal_INVALID;
   cl->svals[cloff + 5] = SVal_INVALID;
   cl->svals[cloff + 6] = SVal_INVALID;
   cl->svals[cloff + 7] = SVal_INVALID;
   return;
  slowcase: /* misaligned */
   stats__cline_64to32splits++;
   zsm_swrite32( a + 0, svNew );
   zsm_swrite32( a + 4, svNew );
}

/*------------- ZSM accesses: 8 bit sread/scopy ------------- */

static
SVal zsm_sread08 ( Addr a ) {
   CacheLine* cl; 
   UWord      cloff, tno, toff;
   UShort     descr;
   stats__cline_sread08s++;
   cl    = get_cacheline(a);
   cloff = get_cacheline_offset(a);
   tno   = get_treeno(a);
   toff  = get_tree_offset(a); /* == 0 .. 7 */
   descr = cl->descrs[tno];
   if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
      SVal* tree = &cl->svals[tno << 3];
      cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
   }
   return cl->svals[cloff];
}

static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
   SVal       sv;
   stats__cline_scopy08s++;
   sv = zsm_sread08( src );
   zsm_swrite08( dst, sv );
}


/* Block-copy states (needed for implementing realloc()).  Note this
   doesn't change the filtering arrangements.  The caller of
   zsm_scopy_range needs to attend to that. */

static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
{
   SizeT i;
   if (len == 0)
      return;

   /* assert for non-overlappingness */
   tl_assert(src+len <= dst || dst+len <= src);

   /* To be simple, just copy byte by byte.  But so as not to wreck
      performance for later accesses to dst[0 .. len-1], normalise
      destination lines as we finish with them, and also normalise the
      line containing the first and last address. */
   for (i = 0; i < len; i++) {
      Bool normalise
         = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
           || i == 0       /* first in range */
           || i == len-1;  /* last in range */
      zsm_scopy08( src+i, dst+i, normalise );
   }
}


/* For setting address ranges to a given value.  Has considerable
   sophistication so as to avoid generating large numbers of pointless
   cache loads/writebacks for large ranges. */

/* Do small ranges in-cache, in the obvious way. */
static
void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
{
   /* fast track a couple of common cases */
   if (len == 4 && aligned32(a)) {
      zsm_swrite32( a, svNew );
      return;
   }
   if (len == 8 && aligned64(a)) {
      zsm_swrite64( a, svNew );
      return;
   }

   /* be completely general (but as efficient as possible) */
   if (len == 0) return;

   if (!aligned16(a) && len >= 1) {
      zsm_swrite08( a, svNew );
      a += 1;
      len -= 1;
      tl_assert(aligned16(a));
   }
   if (len == 0) return;

   if (!aligned32(a) && len >= 2) {
      zsm_swrite16( a, svNew );
      a += 2;
      len -= 2;
      tl_assert(aligned32(a));
   }
   if (len == 0) return;

   if (!aligned64(a) && len >= 4) {
      zsm_swrite32( a, svNew );
      a += 4;
      len -= 4;
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

   if (len >= 8) {
      tl_assert(aligned64(a));
      while (len >= 8) {
         zsm_swrite64( a, svNew );
         a += 8;
         len -= 8;
      }
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

   if (len >= 4)
      tl_assert(aligned32(a));
   if (len >= 4) {
      zsm_swrite32( a, svNew );
      a += 4;
      len -= 4;
   }
   if (len == 0) return;

   if (len >= 2)
      tl_assert(aligned16(a));
   if (len >= 2) {
      zsm_swrite16( a, svNew );
      a += 2;
      len -= 2;
   }
   if (len == 0) return;

   if (len >= 1) {
      zsm_swrite08( a, svNew );
      //a += 1;
      len -= 1;
   }
   tl_assert(len == 0);
}


/* If we're doing a small range, hand off to zsm_sset_range_SMALL.  But
   for larger ranges, try to operate directly on the out-of-cache
   representation, rather than dragging lines into the cache,
   overwriting them, and forcing them out.  This turns out to be an
   important performance optimisation.

   Note that this doesn't change the filtering arrangements.  The
   caller of zsm_sset_range needs to attend to that. */

static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
{
   tl_assert(svNew != SVal_INVALID);
   stats__cache_make_New_arange += (ULong)len;

   if (0 && len > 500)
      VG_(printf)("make New      ( %#lx, %ld )\n", a, len );

   if (0) {
      static UWord n_New_in_cache = 0;
      static UWord n_New_not_in_cache = 0;
      /* tag is 'a' with the in-line offset masked out, 
         eg a[31]..a[4] 0000 */
      Addr       tag = a & ~(N_LINE_ARANGE - 1);
      UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
      if (LIKELY(tag == cache_shmem.tags0[wix])) {
         n_New_in_cache++;
      } else {
         n_New_not_in_cache++;
      }
      if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
         VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
                     n_New_in_cache, n_New_not_in_cache );
   }

   if (LIKELY(len < 2 * N_LINE_ARANGE)) {
      zsm_sset_range_SMALL( a, len, svNew );
   } else {
      Addr  before_start  = a;
      Addr  aligned_start = cacheline_ROUNDUP(a);
      Addr  after_start   = cacheline_ROUNDDN(a + len);
      UWord before_len    = aligned_start - before_start;
      UWord aligned_len   = after_start - aligned_start;
      UWord after_len     = a + len - after_start;
      tl_assert(before_start <= aligned_start);
      tl_assert(aligned_start <= after_start);
      tl_assert(before_len < N_LINE_ARANGE);
      tl_assert(after_len < N_LINE_ARANGE);
      tl_assert(get_cacheline_offset(aligned_start) == 0);
      if (get_cacheline_offset(a) == 0) {
         tl_assert(before_len == 0);
         tl_assert(a == aligned_start);
      }
      if (get_cacheline_offset(a+len) == 0) {
         tl_assert(after_len == 0);
         tl_assert(after_start == a+len);
      }
      if (before_len > 0) {
         zsm_sset_range_SMALL( before_start, before_len, svNew );
      }
      if (after_len > 0) {
         zsm_sset_range_SMALL( after_start, after_len, svNew );
      }
      stats__cache_make_New_inZrep += (ULong)aligned_len;

      while (1) {
         Addr tag;
         UWord wix;
         if (aligned_start >= after_start)
            break;
         tl_assert(get_cacheline_offset(aligned_start) == 0);
         tag = aligned_start & ~(N_LINE_ARANGE - 1);
         wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
         if (tag == cache_shmem.tags0[wix]) {
            UWord i;
            for (i = 0; i < N_LINE_ARANGE / 8; i++)
               zsm_swrite64( aligned_start + i * 8, svNew );
         } else {
            UWord i;
            Word zix;
            SecMap* sm;
            LineZ* lineZ;
            /* This line is not in the cache.  Do not force it in; instead
               modify it in-place. */
            /* find the Z line to write in and rcdec it or the
               associated F line. */
            find_Z_for_writing( &sm, &zix, tag );
            tl_assert(sm);
            tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
            lineZ = &sm->linesZ[zix];
            lineZ->dict[0] = svNew;
            lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
            for (i = 0; i < N_LINE_ARANGE/4; i++)
               lineZ->ix2s[i] = 0; /* all refer to dict[0] */
            rcinc_LineZ(lineZ);
         }
         aligned_start += N_LINE_ARANGE;
         aligned_len -= N_LINE_ARANGE;
      }
      tl_assert(aligned_start == after_start);
      tl_assert(aligned_len == 0);
   }
}


/////////////////////////////////////////////////////////
//                                                     //
// Front-filtering accesses                            //
//                                                     //
/////////////////////////////////////////////////////////

static UWord stats__f_ac = 0;
static UWord stats__f_sk = 0;

#if 0
#  define STATS__F_SHOW \
     do { \
        if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
           VG_(printf)("filters: ac %lu sk %lu\n",   \
           stats__f_ac, stats__f_sk); \
     } while (0)
#else
#  define STATS__F_SHOW /* */
#endif

void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply08__msmcwrite(thr, a);
}

void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply16__msmcwrite(thr, a);
}

void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply32__msmcwrite(thr, a);
}

void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply64__msmcwrite(thr, a);
}

void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
{
   /* fast track a couple of common cases */
   if (len == 4 && aligned32(a)) {
      zsm_sapply32_f__msmcwrite( thr, a );
      return;
   }
   if (len == 8 && aligned64(a)) {
      zsm_sapply64_f__msmcwrite( thr, a );
      return;
   }

   /* be completely general (but as efficient as possible) */
   if (len == 0) return;

   if (!aligned16(a) && len >= 1) {
      zsm_sapply08_f__msmcwrite( thr, a );
      a += 1;
      len -= 1;
      tl_assert(aligned16(a));
   }
   if (len == 0) return;

   if (!aligned32(a) && len >= 2) {
      zsm_sapply16_f__msmcwrite( thr, a );
      a += 2;
      len -= 2;
      tl_assert(aligned32(a));
   }
   if (len == 0) return;

   if (!aligned64(a) && len >= 4) {
      zsm_sapply32_f__msmcwrite( thr, a );
      a += 4;
      len -= 4;
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

   if (len >= 8) {
      tl_assert(aligned64(a));
      while (len >= 8) {
         zsm_sapply64_f__msmcwrite( thr, a );
         a += 8;
         len -= 8;
      }
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

   if (len >= 4)
      tl_assert(aligned32(a));
   if (len >= 4) {
      zsm_sapply32_f__msmcwrite( thr, a );
      a += 4;
      len -= 4;
   }
   if (len == 0) return;

   if (len >= 2)
      tl_assert(aligned16(a));
   if (len >= 2) {
      zsm_sapply16_f__msmcwrite( thr, a );
      a += 2;
      len -= 2;
   }
   if (len == 0) return;

   if (len >= 1) {
      zsm_sapply08_f__msmcwrite( thr, a );
      //a += 1;
      len -= 1;
   }
   tl_assert(len == 0);
}

void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply08__msmcread(thr, a);
}

void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply16__msmcread(thr, a);
}

void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply32__msmcread(thr, a);
}

void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
   stats__f_ac++;
   STATS__F_SHOW;
   if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
      stats__f_sk++;
      return;
   }
   zsm_sapply64__msmcread(thr, a);
}

void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
{
   /* fast track a couple of common cases */
   if (len == 4 && aligned32(a)) {
      zsm_sapply32_f__msmcread( thr, a );
      return;
   }
   if (len == 8 && aligned64(a)) {
      zsm_sapply64_f__msmcread( thr, a );
      return;
   }

   /* be completely general (but as efficient as possible) */
   if (len == 0) return;

   if (!aligned16(a) && len >= 1) {
      zsm_sapply08_f__msmcread( thr, a );
      a += 1;
      len -= 1;
      tl_assert(aligned16(a));
   }
   if (len == 0) return;

   if (!aligned32(a) && len >= 2) {
      zsm_sapply16_f__msmcread( thr, a );
      a += 2;
      len -= 2;
      tl_assert(aligned32(a));
   }
   if (len == 0) return;

   if (!aligned64(a) && len >= 4) {
      zsm_sapply32_f__msmcread( thr, a );
      a += 4;
      len -= 4;
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

   if (len >= 8) {
      tl_assert(aligned64(a));
      while (len >= 8) {
         zsm_sapply64_f__msmcread( thr, a );
         a += 8;
         len -= 8;
      }
      tl_assert(aligned64(a));
   }
   if (len == 0) return;

   if (len >= 4)
      tl_assert(aligned32(a));
   if (len >= 4) {
      zsm_sapply32_f__msmcread( thr, a );
      a += 4;
      len -= 4;
   }
   if (len == 0) return;

   if (len >= 2)
      tl_assert(aligned16(a));
   if (len >= 2) {
      zsm_sapply16_f__msmcread( thr, a );
      a += 2;
      len -= 2;
   }
   if (len == 0) return;

   if (len >= 1) {
      zsm_sapply08_f__msmcread( thr, a );
      //a += 1;
      len -= 1;
   }
   tl_assert(len == 0);
}

void libhb_Thr_resumes ( Thr* thr )
{
   if (0) VG_(printf)("resume %p\n", thr);
   tl_assert(thr);
   tl_assert(!thr->llexit_done);
   Filter__clear(thr->filter, "libhb_Thr_resumes");
   /* A kludge, but .. if this thread doesn't have any marker stacks
      at all, get one right now.  This is easier than figuring out
      exactly when at thread startup we can and can't take a stack
      snapshot. */
   if (HG_(clo_history_level) == 1) {
      tl_assert(thr->local_Kws_n_stacks);
      if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
         note_local_Kw_n_stack_for(thr);
   }
}


/////////////////////////////////////////////////////////
//                                                     //
// Synchronisation objects                             //
//                                                     //
/////////////////////////////////////////////////////////

/* A double linked list of all the SO's. */
SO* admin_SO = NULL;

static SO* SO__Alloc ( void )
{
   SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
   so->viR   = VtsID_INVALID;
   so->viW   = VtsID_INVALID;
   so->magic = SO_MAGIC;
   /* Add to double linked list */
   if (admin_SO) {
      tl_assert(admin_SO->admin_prev == NULL);
      admin_SO->admin_prev = so;
      so->admin_next = admin_SO;
   } else {
      so->admin_next = NULL;
   }
   so->admin_prev = NULL;
   admin_SO = so;
   /* */
   return so;
}

static void SO__Dealloc ( SO* so )
{
   tl_assert(so);
   tl_assert(so->magic == SO_MAGIC);
   if (so->viR == VtsID_INVALID) {
      tl_assert(so->viW == VtsID_INVALID);
   } else {
      tl_assert(so->viW != VtsID_INVALID);
      VtsID__rcdec(so->viR);
      VtsID__rcdec(so->viW);
   }
   so->magic = 0;
   /* Del from double linked list */
   if (so->admin_prev)
      so->admin_prev->admin_next = so->admin_next;
   if (so->admin_next)
      so->admin_next->admin_prev = so->admin_prev;
   if (so == admin_SO)
      admin_SO = so->admin_next;
   /* */
   HG_(free)( so );
}


/////////////////////////////////////////////////////////
//                                                     //
// Top Level API                                       //
//                                                     //
/////////////////////////////////////////////////////////

static void show_thread_state ( HChar* str, Thr* t ) 
{
   if (1) return;
   if (t->viR == t->viW) {
      VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
      VtsID__pp( t->viR );
      VG_(printf)("%s","\n");
   } else {
      VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
      VtsID__pp( t->viR );
      VG_(printf)(" viW %u==", t->viW);
      VtsID__pp( t->viW );
      VG_(printf)("%s","\n");
   }
}


Thr* libhb_init (
        void        (*get_stacktrace)( Thr*, Addr*, UWord ),
        ExeContext* (*get_EC)( Thr* )
     )
{
   Thr*  thr;
   VtsID vi;

   // We will have to have to store a large number of these,
   // so make sure they're the size we expect them to be.
   tl_assert(sizeof(ScalarTS) == 8);

   /* because first 1024 unusable */
   tl_assert(SCALARTS_N_THRBITS >= 11);
   /* so as to fit in a UInt w/ 3 bits to spare (see defn of
      Thr_n_RCEC). */
   tl_assert(SCALARTS_N_THRBITS <= 29);

   /* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
      (32-bit).  It's not correctness-critical, but there are a lot of
      them, so it's important from a space viewpoint.  Unfortunately
      we simply can't pack it into 2 words on a 32-bit target. */
   if (sizeof(UWord) == 8) {
      tl_assert(sizeof(Thr_n_RCEC) == 16);
   } else {
      tl_assert(sizeof(Thr_n_RCEC) == 12);
   }

   /* Word sets really are 32 bits.  Even on a 64 bit target. */
   tl_assert(sizeof(WordSetID) == 4);
   tl_assert(sizeof(WordSet) == sizeof(WordSetID));

   tl_assert(get_stacktrace);
   tl_assert(get_EC);
   main_get_stacktrace   = get_stacktrace;
   main_get_EC           = get_EC;

   // No need to initialise hg_wordfm.
   // No need to initialise hg_wordset.

   /* Allocated once and never deallocated.  Used as a temporary in
      VTS singleton, tick and join operations. */
   temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
   temp_max_sized_VTS->id = VtsID_INVALID;
   verydead_thread_table_init();
   vts_set_init();
   vts_tab_init();
   event_map_init();
   VtsID__invalidate_caches();

   // initialise shadow memory
   zsm_init( SVal__rcinc, SVal__rcdec );

   thr = Thr__new();
   vi  = VtsID__mk_Singleton( thr, 1 );
   thr->viR = vi;
   thr->viW = vi;
   VtsID__rcinc(thr->viR);
   VtsID__rcinc(thr->viW);

   show_thread_state("  root", thr);
   return thr;
}


Thr* libhb_create ( Thr* parent )
{
   /* The child's VTSs are copies of the parent's VTSs, but ticked at
      the child's index.  Since the child's index is guaranteed
      unique, it has never been seen before, so the implicit value
      before the tick is zero and after that is one. */
   Thr* child = Thr__new();

   child->viR = VtsID__tick( parent->viR, child );
   child->viW = VtsID__tick( parent->viW, child );
   Filter__clear(child->filter, "libhb_create(child)");
   VtsID__rcinc(child->viR);
   VtsID__rcinc(child->viW);
   /* We need to do note_local_Kw_n_stack_for( child ), but it's too
      early for that - it may not have a valid TId yet.  So, let
      libhb_Thr_resumes pick it up the first time the thread runs. */

   tl_assert(VtsID__indexAt( child->viR, child ) == 1);
   tl_assert(VtsID__indexAt( child->viW, child ) == 1);

   /* and the parent has to move along too */
   VtsID__rcdec(parent->viR);
   VtsID__rcdec(parent->viW);
   parent->viR = VtsID__tick( parent->viR, parent );
   parent->viW = VtsID__tick( parent->viW, parent );
   Filter__clear(parent->filter, "libhb_create(parent)");
   VtsID__rcinc(parent->viR);
   VtsID__rcinc(parent->viW);
   note_local_Kw_n_stack_for( parent );

   show_thread_state(" child", child);
   show_thread_state("parent", parent);

   return child;
}

/* Shut down the library, and print stats (in fact that's _all_
   this is for. */
void libhb_shutdown ( Bool show_stats )
{
   if (show_stats) {
      VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");
      VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
                  stats__secmaps_allocd,
                  stats__secmap_ga_space_covered);
      VG_(printf)("  linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
                  stats__secmap_linesZ_allocd,
                  stats__secmap_linesZ_bytes);
      VG_(printf)("  linesF: %'10lu allocd (%'12lu bytes occupied)\n",
                  stats__secmap_linesF_allocd,
                  stats__secmap_linesF_bytes);
      VG_(printf)(" secmaps: %'10lu iterator steppings\n",
                  stats__secmap_iterator_steppings);
      VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
                  stats__secmaps_search, stats__secmaps_search_slow);

      VG_(printf)("%s","\n");
      VG_(printf)("   cache: %'lu totrefs (%'lu misses)\n",
                  stats__cache_totrefs, stats__cache_totmisses );
      VG_(printf)("   cache: %'14lu Z-fetch,    %'14lu F-fetch\n",
                  stats__cache_Z_fetches, stats__cache_F_fetches );
      VG_(printf)("   cache: %'14lu Z-wback,    %'14lu F-wback\n",
                  stats__cache_Z_wbacks, stats__cache_F_wbacks );
      VG_(printf)("   cache: %'14lu invals,     %'14lu flushes\n",
                  stats__cache_invals, stats__cache_flushes );
      VG_(printf)("   cache: %'14llu arange_New  %'14llu direct-to-Zreps\n",
                  stats__cache_make_New_arange,
                  stats__cache_make_New_inZrep);

      VG_(printf)("%s","\n");
      VG_(printf)("   cline: %'10lu normalises\n",
                  stats__cline_normalises );
      VG_(printf)("   cline: c rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
                  stats__cline_cread64s,
                  stats__cline_cread32s,
                  stats__cline_cread16s,
                  stats__cline_cread08s );
      VG_(printf)("   cline: c wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
                  stats__cline_cwrite64s,
                  stats__cline_cwrite32s,
                  stats__cline_cwrite16s,
                  stats__cline_cwrite08s );
      VG_(printf)("   cline: s wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
                  stats__cline_swrite64s,
                  stats__cline_swrite32s,
                  stats__cline_swrite16s,
                  stats__cline_swrite08s );
      VG_(printf)("   cline: s rd1s %'lu, s copy1s %'lu\n",
                  stats__cline_sread08s, stats__cline_scopy08s );
      VG_(printf)("   cline:    splits: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
                 stats__cline_64to32splits,
                 stats__cline_32to16splits,
                 stats__cline_16to8splits );
      VG_(printf)("   cline: pulldowns: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
                 stats__cline_64to32pulldown,
                 stats__cline_32to16pulldown,
                 stats__cline_16to8pulldown );
      if (0)
      VG_(printf)("   cline: sizeof(CacheLineZ) %ld, covers %ld bytes of arange\n",
                  (Word)sizeof(LineZ), (Word)N_LINE_ARANGE);

      VG_(printf)("%s","\n");

      VG_(printf)("   libhb: %'13llu msmcread  (%'llu dragovers)\n",
                  stats__msmcread, stats__msmcread_change);
      VG_(printf)("   libhb: %'13llu msmcwrite (%'llu dragovers)\n",
                  stats__msmcwrite, stats__msmcwrite_change);
      VG_(printf)("   libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
                  stats__cmpLEQ_queries, stats__cmpLEQ_misses);
      VG_(printf)("   libhb: %'13llu join2  queries (%'llu misses)\n",
                  stats__join2_queries, stats__join2_misses);

      VG_(printf)("%s","\n");
      VG_(printf)( "   libhb: VTSops: tick %'lu,  join %'lu,  cmpLEQ %'lu\n",
                   stats__vts__tick, stats__vts__join,  stats__vts__cmpLEQ );
      VG_(printf)( "   libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
                   stats__vts__cmp_structural, stats__vts__cmp_structural_slow );
      VG_(printf)( "   libhb: VTSset: find__or__clone_and_add %'lu (%'lu allocd)\n",
                   stats__vts_set__focaa, stats__vts_set__focaa_a );
      VG_(printf)( "   libhb: VTSops: indexAt_SLOW %'lu\n",
                   stats__vts__indexat_slow );

      VG_(printf)("%s","\n");
      VG_(printf)(
         "   libhb: %ld entries in vts_table (approximately %lu bytes)\n",
         VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
      );
      VG_(printf)( "   libhb: %lu entries in vts_set\n",
                   VG_(sizeFM)( vts_set ) );

      VG_(printf)("%s","\n");
      VG_(printf)( "   libhb: ctxt__rcdec: 1=%lu(%lu eq), 2=%lu, 3=%lu\n",
                   stats__ctxt_rcdec1, stats__ctxt_rcdec1_eq,
                   stats__ctxt_rcdec2,
                   stats__ctxt_rcdec3 );
      VG_(printf)( "   libhb: ctxt__rcdec: calls %lu, discards %lu\n",
                   stats__ctxt_rcdec_calls, stats__ctxt_rcdec_discards);
      VG_(printf)( "   libhb: contextTab: %lu slots, %lu max ents\n",
                   (UWord)N_RCEC_TAB,
                   stats__ctxt_tab_curr );
      VG_(printf)( "   libhb: contextTab: %lu queries, %lu cmps\n",
                   stats__ctxt_tab_qs,
                   stats__ctxt_tab_cmps );
#if 0
      VG_(printf)("sizeof(AvlNode)     = %lu\n", sizeof(AvlNode));
      VG_(printf)("sizeof(WordBag)     = %lu\n", sizeof(WordBag));
      VG_(printf)("sizeof(MaybeWord)   = %lu\n", sizeof(MaybeWord));
      VG_(printf)("sizeof(CacheLine)   = %lu\n", sizeof(CacheLine));
      VG_(printf)("sizeof(LineZ)       = %lu\n", sizeof(LineZ));
      VG_(printf)("sizeof(LineF)       = %lu\n", sizeof(LineF));
      VG_(printf)("sizeof(SecMap)      = %lu\n", sizeof(SecMap));
      VG_(printf)("sizeof(Cache)       = %lu\n", sizeof(Cache));
      VG_(printf)("sizeof(SMCacheEnt)  = %lu\n", sizeof(SMCacheEnt));
      VG_(printf)("sizeof(CountedSVal) = %lu\n", sizeof(CountedSVal));
      VG_(printf)("sizeof(VTS)         = %lu\n", sizeof(VTS));
      VG_(printf)("sizeof(ScalarTS)    = %lu\n", sizeof(ScalarTS));
      VG_(printf)("sizeof(VtsTE)       = %lu\n", sizeof(VtsTE));
      VG_(printf)("sizeof(MSMInfo)     = %lu\n", sizeof(MSMInfo));

      VG_(printf)("sizeof(struct _XArray)     = %lu\n", sizeof(struct _XArray));
      VG_(printf)("sizeof(struct _WordFM)     = %lu\n", sizeof(struct _WordFM));
      VG_(printf)("sizeof(struct _Thr)     = %lu\n", sizeof(struct _Thr));
      VG_(printf)("sizeof(struct _SO)     = %lu\n", sizeof(struct _SO));
#endif

      VG_(printf)("%s","<<< END libhb stats >>>\n");
      VG_(printf)("%s","\n");

   }
}

/* Receive notification that a thread has low level exited.  The
   significance here is that we do not expect to see any more memory
   references from it. */
void libhb_async_exit ( Thr* thr )
{
   tl_assert(thr);
   tl_assert(!thr->llexit_done);
   thr->llexit_done = True;

   /* free up Filter and local_Kws_n_stacks (well, actually not the
      latter ..) */
   tl_assert(thr->filter);
   HG_(free)(thr->filter);
   thr->filter = NULL;

   /* Tell the VTS mechanism this thread has exited, so it can
      participate in VTS pruning.  Note this can only happen if the
      thread has both ll_exited and has been joined with. */
   if (thr->joinedwith_done)
      VTS__declare_thread_very_dead(thr);

   /* Another space-accuracy tradeoff.  Do we want to be able to show
      H1 history for conflicts in threads which have since exited?  If
      yes, then we better not free up thr->local_Kws_n_stacks.  The
      downside is a potential per-thread leak of up to
      N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) * whatever the
      XArray average overcommit factor is (1.5 I'd guess). */
   // hence:
   // VG_(deleteXA)(thr->local_Kws_n_stacks);
   // thr->local_Kws_n_stacks = NULL;
}

/* Receive notification that a thread has been joined with.  The
   significance here is that we do not expect to see any further
   references to its vector clocks (Thr::viR and Thr::viW). */
void libhb_joinedwith_done ( Thr* thr )
{
   tl_assert(thr);
   /* Caller must ensure that this is only ever called once per Thr. */
   tl_assert(!thr->joinedwith_done);
   thr->joinedwith_done = True;
   if (thr->llexit_done)
      VTS__declare_thread_very_dead(thr);
}


/* Both Segs and SOs point to VTSs.  However, there is no sharing, so
   a Seg that points at a VTS is its one-and-only owner, and ditto for
   a SO that points at a VTS. */

SO* libhb_so_alloc ( void )
{
   return SO__Alloc();
}

void libhb_so_dealloc ( SO* so )
{
   tl_assert(so);
   tl_assert(so->magic == SO_MAGIC);
   SO__Dealloc(so);
}

/* See comments in libhb.h for details on the meaning of 
   strong vs weak sends and strong vs weak receives. */
void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
{
   /* Copy the VTSs from 'thr' into the sync object, and then move
      the thread along one step. */

   tl_assert(so);
   tl_assert(so->magic == SO_MAGIC);

   /* stay sane .. a thread's read-clock must always lead or be the
      same as its write-clock */
   { Bool leq = VtsID__cmpLEQ(thr->viW, thr->viR);
     tl_assert(leq);
   }

   /* since we're overwriting the VtsIDs in the SO, we need to drop
      any references made by the previous contents thereof */
   if (so->viR == VtsID_INVALID) {
      tl_assert(so->viW == VtsID_INVALID);
      so->viR = thr->viR;
      so->viW = thr->viW;
      VtsID__rcinc(so->viR);
      VtsID__rcinc(so->viW);
   } else {
      /* In a strong send, we dump any previous VC in the SO and
         install the sending thread's VC instead.  For a weak send we
         must join2 with what's already there. */
      tl_assert(so->viW != VtsID_INVALID);
      VtsID__rcdec(so->viR);
      VtsID__rcdec(so->viW);
      so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
      so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
      VtsID__rcinc(so->viR);
      VtsID__rcinc(so->viW);
   }

   /* move both parent clocks along */
   VtsID__rcdec(thr->viR);
   VtsID__rcdec(thr->viW);
   thr->viR = VtsID__tick( thr->viR, thr );
   thr->viW = VtsID__tick( thr->viW, thr );
   if (!thr->llexit_done) {
      Filter__clear(thr->filter, "libhb_so_send");
      note_local_Kw_n_stack_for(thr);
   }
   VtsID__rcinc(thr->viR);
   VtsID__rcinc(thr->viW);

   if (strong_send)
      show_thread_state("s-send", thr);
   else
      show_thread_state("w-send", thr);
}

void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
{
   tl_assert(so);
   tl_assert(so->magic == SO_MAGIC);

   if (so->viR != VtsID_INVALID) {
      tl_assert(so->viW != VtsID_INVALID);

      /* Weak receive (basically, an R-acquisition of a R-W lock).
         This advances the read-clock of the receiver, but not the
         write-clock. */
      VtsID__rcdec(thr->viR);
      thr->viR = VtsID__join2( thr->viR, so->viR );
      VtsID__rcinc(thr->viR);

      /* At one point (r10589) it seemed safest to tick the clocks for
         the receiving thread after the join.  But on reflection, I
         wonder if that might cause it to 'overtake' constraints,
         which could lead to missing races.  So, back out that part of
         r10589. */
      //VtsID__rcdec(thr->viR);
      //thr->viR = VtsID__tick( thr->viR, thr );
      //VtsID__rcinc(thr->viR);

      /* For a strong receive, we also advance the receiver's write
         clock, which means the receive as a whole is essentially
         equivalent to a W-acquisition of a R-W lock. */
      if (strong_recv) {
         VtsID__rcdec(thr->viW);
         thr->viW = VtsID__join2( thr->viW, so->viW );
         VtsID__rcinc(thr->viW);

         /* See comment just above, re r10589. */
         //VtsID__rcdec(thr->viW);
         //thr->viW = VtsID__tick( thr->viW, thr );
         //VtsID__rcinc(thr->viW);
      }

      if (thr->filter)
         Filter__clear(thr->filter, "libhb_so_recv");
      note_local_Kw_n_stack_for(thr);

      if (strong_recv) 
         show_thread_state("s-recv", thr);
      else 
         show_thread_state("w-recv", thr);

   } else {
      tl_assert(so->viW == VtsID_INVALID);
      /* Deal with degenerate case: 'so' has no vts, so there has been
         no message posted to it.  Just ignore this case. */
      show_thread_state("d-recv", thr);
   }
}

Bool libhb_so_everSent ( SO* so )
{
   if (so->viR == VtsID_INVALID) {
      tl_assert(so->viW == VtsID_INVALID);
      return False;
   } else {
      tl_assert(so->viW != VtsID_INVALID);
      return True;
   }
}

#define XXX1 0 // 0x67a106c
#define XXX2 0

static inline Bool TRACEME(Addr a, SizeT szB) {
   if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
   if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
   return False;
}
static void trace ( Thr* thr, Addr a, SizeT szB, HChar* s ) {
  SVal sv = zsm_sread08(a);
  VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
  show_thread_state("", thr);
  VG_(printf)("%s","\n");
}

void libhb_srange_new ( Thr* thr, Addr a, SizeT szB )
{
   SVal sv = SVal__mkC(thr->viW, thr->viW);
   tl_assert(is_sane_SVal_C(sv));
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
   zsm_sset_range( a, szB, sv );
   Filter__clear_range( thr->filter, a, szB );
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
}

void libhb_srange_noaccess_NoFX ( Thr* thr, Addr a, SizeT szB )
{
   /* do nothing */
}

void libhb_srange_noaccess_AHAE ( Thr* thr, Addr a, SizeT szB )
{
   /* This really does put the requested range in NoAccess.  It's
      expensive though. */
   SVal sv = SVal_NOACCESS;
   tl_assert(is_sane_SVal_C(sv));
   zsm_sset_range( a, szB, sv );
   Filter__clear_range( thr->filter, a, szB );
}

void libhb_srange_untrack ( Thr* thr, Addr a, SizeT szB )
{
   SVal sv = SVal_NOACCESS;
   tl_assert(is_sane_SVal_C(sv));
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-before");
   zsm_sset_range( a, szB, sv );
   Filter__clear_range( thr->filter, a, szB );
   if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-after ");
}

Thread* libhb_get_Thr_hgthread ( Thr* thr ) {
   tl_assert(thr);
   return thr->hgthread;
}

void libhb_set_Thr_hgthread ( Thr* thr, Thread* hgthread ) {
   tl_assert(thr);
   thr->hgthread = hgthread;
}

void libhb_copy_shadow_state ( Thr* thr, Addr src, Addr dst, SizeT len )
{
   zsm_scopy_range(src, dst, len);
   Filter__clear_range( thr->filter, dst, len ); 
}

void libhb_maybe_GC ( void )
{
   event_map_maybe_GC();
   /* If there are still freelist entries available, no need for a
      GC. */
   if (vts_tab_freelist != VtsID_INVALID)
      return;
   /* So all the table entries are full, and we're having to expand
      the table.  But did we hit the threshhold point yet? */
   if (VG_(sizeXA)( vts_tab ) < vts_next_GC_at)
      return;
   vts_tab__do_GC( False/*don't show stats*/ );
}


/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION END main library                                    //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/*--------------------------------------------------------------------*/
/*--- end                                             libhb_main.c ---*/
/*--------------------------------------------------------------------*/

登录后可以享受更多权益