
/*--------------------------------------------------------------------*/
/*--- LibHB: a library for implementing and checking               ---*/
/*--- the happens-before relationship in concurrent programs.      ---*/
/*---                                                 libhb_main.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of LibHB, a library for implementing and checking
   the happens-before relationship in concurrent programs.

   Copyright (C) 2008-2012 OpenWorks Ltd
      info (at) open-works.co.uk

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "pub_tool_basics.h"
#include "pub_tool_poolalloc.h"
#include "pub_tool_libcassert.h"
#include "pub_tool_libcbase.h"
#include "pub_tool_libcprint.h"
#include "pub_tool_mallocfree.h"
#include "pub_tool_wordfm.h"
#include "pub_tool_sparsewa.h"
#include "pub_tool_xarray.h"
#include "pub_tool_oset.h"
#include "pub_tool_threadstate.h"
#include "pub_tool_aspacemgr.h"
#include "pub_tool_execontext.h"
#include "pub_tool_errormgr.h"
#include "pub_tool_options.h"        // VG_(clo_stats)
#include "hg_basics.h"
#include "hg_wordset.h"
#include "hg_lock_n_thread.h"
#include "hg_errors.h"

#include "libhb.h"


/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// Debugging #defines                                          //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* Check the sanity of shadow values in the core memory state
   machine.  Change #if 0 to #if 1 to enable this. */
#if 0
#  define CHECK_MSM 1
#else
#  define CHECK_MSM 0
#endif


/* Check sanity (reference counts, etc) in the conflicting access
   machinery.  Change #if 0 to #if 1 to enable this. */
#if 0
#  define CHECK_CEM 1
#else
#  define CHECK_CEM 0
#endif


/* Check sanity in the compressed shadow memory machinery,
   particularly in its caching innards.  Unfortunately there's no
   almost-zero-cost way to make these checks selectable at run time.
   Hence change the #if 0 to #if 1 and rebuild if you want them. */
#if 0
#  define CHECK_ZSM 1  /* do sanity-check CacheLine stuff */
#  define inline __attribute__((noinline))
   /* probably want to ditch -fomit-frame-pointer too */
#else
#  define CHECK_ZSM 0   /* don't sanity-check CacheLine stuff */
#endif


/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: VtsID                                           //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* VtsIDs: Unique small-integer IDs for VTSs.  VtsIDs can't exceed 30
   bits, since they have to be packed into the lowest 30 bits of an
   SVal. */
typedef  UInt  VtsID;
#define VtsID_INVALID 0xFFFFFFFF



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: SVal                                            //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

typedef  ULong  SVal;

/* This value has special significance to the implementation, and callers
   may not store it in the shadow memory. */
#define SVal_INVALID (3ULL << 62)

/* This is the default value for shadow memory.  Initially the shadow
   memory contains no accessible areas and so all reads produce this
   value.  TODO: make this caller-definable. */
#define SVal_NOACCESS (2ULL << 62)
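
/* Purely illustrative, not part of the library: a minimal sketch of
   how two 30-bit VtsIDs and a 2-bit tag could share one 64-bit SVal.
   The exact bit positions here are an assumption for exposition; the
   real packing is defined later by SVal__mkC and friends.  Change
   #if 0 to #if 1 to compile it. */
#if 0
static SVal example_SVal_packing_sketch ( VtsID rmini, VtsID wmini )
{
   /* assumed layout: tag in bits 63..62, rmini in 59..30, wmini in 29..0 */
   tl_assert(rmini <= 0x3FFFFFFFU && wmini <= 0x3FFFFFFFU);
   return (1ULL << 62) | (((ULong)rmini) << 30) | (ULong)wmini;
}
#endif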



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: ScalarTS                                        //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* Scalar Timestamp.  We have to store a lot of these, so there is
   some effort to make them as small as possible.  Logically they are
   a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
   We pack it into 64 bits by representing the Thr* using a ThrID, a
   small integer (18 bits), and a 46 bit integer for the timestamp
   number.  The 46/18 split is arbitrary, but has the effect that
   Helgrind can only handle programs that create 2^18 or fewer threads
   over their entire lifetime, and have no more than 2^46 timestamp
   ticks (synchronisation operations on the same thread).

   This doesn't seem like much of a limitation.  2^46 ticks is about
   7.04e+13, and if each tick (optimistically) takes the machine 1000
   cycles to process, then the minimum time to process that many ticks
   at a clock rate of 5 GHz is 162.9 days.  And that's doing nothing
   but VTS ticks, which isn't realistic.

   NB1: SCALARTS_N_THRBITS must be 29 or lower.  The obvious limit is
   32 since a ThrID is a UInt.  29 comes from the fact that
   'Thr_n_RCEC', which records information about old accesses, packs
   not only a ThrID but also 2+1 other bits (access size and
   writeness) in a UInt, hence limiting size to 32-(2+1) == 29.

   NB2: thrid values are issued upwards from 1024, and values less
   than that aren't valid.  This isn't per se necessary (any order
   will do, so long as they are unique), but it does help ensure they
   are less likely to get confused with the various other kinds of
   small-integer thread ids drifting around (eg, TId).  See also NB5.

   NB3: this probably also relies on the fact that Thr's are never
   deallocated -- they exist forever.  Hence the 1-1 mapping from
   Thr's to thrid values (set up in Thr__new) persists forever.

   NB4: temp_max_sized_VTS is allocated at startup and never freed.
   It is a maximum sized VTS, so has (1 << SCALARTS_N_THRBITS)
   ScalarTSs.  So we can't make SCALARTS_N_THRBITS too large without
   making the memory use for this go sky-high.  With
   SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
   like an OK tradeoff.  If more than 256k threads need to be
   supported, we could change SCALARTS_N_THRBITS to 20, which would
   facilitate supporting 1 million threads at the cost of 8MB storage
   for temp_max_sized_VTS.

   NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
   ThrID == 0 to denote an empty Thr_n_RCEC record.  So ThrID == 0
   must never be a valid ThrID.  Given NB2 that's OK.
*/
#define SCALARTS_N_THRBITS 18  /* valid range: 11 to 29 inclusive */

#define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
typedef
   struct {
      ThrID thrid : SCALARTS_N_THRBITS;
      ULong tym   : SCALARTS_N_TYMBITS;
   }
   ScalarTS;

#define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
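
/* Purely illustrative, not part of the library: shows that a ScalarTS
   occupies exactly 8 bytes on the supported targets and that a
   (thrid, tym) pair round-trips through the bitfields.  The thrid
   value 1024 is the first valid one per NB2 above.  Change #if 0 to
   #if 1 to compile it. */
#if 0
static void example_ScalarTS_packing ( void )
{
   ScalarTS ts;
   tl_assert(sizeof(ScalarTS) == 8);   /* 18 + 46 bits == 64 bits */
   ts.thrid = 1024;                    /* first valid ThrID (NB2) */
   ts.tym   = 1;                       /* first timestamp tick */
   tl_assert(ts.thrid == 1024 && ts.tym == 1);
   tl_assert(ThrID_MAX_VALID == (1 << SCALARTS_N_THRBITS) - 1);
}
#endif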



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: Filter                                          //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// baseline: 5, 9
#define FI_LINE_SZB_LOG2  5
#define FI_NUM_LINES_LOG2 10

#define FI_LINE_SZB       (1 << FI_LINE_SZB_LOG2)
#define FI_NUM_LINES      (1 << FI_NUM_LINES_LOG2)

#define FI_TAG_MASK        (~(Addr)(FI_LINE_SZB - 1))
#define FI_GET_TAG(_a)     ((_a) & FI_TAG_MASK)

#define FI_GET_LINENO(_a)  ( ((_a) >> FI_LINE_SZB_LOG2) \
                             & (Addr)(FI_NUM_LINES-1) )
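
/* Purely illustrative, not part of the library: how an address
   decomposes under the macros above with the settings here (32-byte
   lines, 1024 of them).  The address 0x1003 is an arbitrary example.
   Change #if 0 to #if 1 to compile it. */
#if 0
static void example_filter_indexing ( void )
{
   Addr a = 0x1003;
   /* 0x1003 lies in the 32-byte line starting at 0x1000 ... */
   tl_assert(FI_GET_TAG(a) == 0x1000);
   /* ... and that line maps to slot (0x1003 >> 5) & 1023 == 128. */
   tl_assert(FI_GET_LINENO(a) == 128);
}
#endif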


/* In the lines, each group of 8 bytes is treated individually, and is
   mapped to a UShort.  Regardless of the endianness of the underlying
   machine, bits 1 and 0 pertain to the lowest address and bits 15 and
   14 to the highest address.

   Of each bit pair, the higher-numbered bit is set if a read (R) has
   been seen, so the actual layout is:

   15 14             ...  01 00

   R  W  for addr+7  ...  R  W  for addr+0

   So a mask for the R-bits is 0xAAAA and for the W-bits is 0x5555.
*/
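
/* Purely illustrative, not part of the library: extracting the R and
   W bits for byte k (0..7) of the 8-byte range covered by one UShort,
   per the layout described above.  Change #if 0 to #if 1 to compile
   it. */
#if 0
static void example_filter_bit_pairs ( void )
{
   UShort u16 = 0x0002;   /* only the R bit for addr+0 is set */
   UWord  k   = 0;
   tl_assert( ((u16 >> (2*k + 1)) & 1) == 1 );  /* R bit for addr+k */
   tl_assert( ((u16 >> (2*k + 0)) & 1) == 0 );  /* W bit for addr+k */
   tl_assert( (u16 & 0xAAAA) == 0x0002 );       /* R-bits mask */
   tl_assert( (u16 & 0x5555) == 0 );            /* W-bits mask */
}
#endif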

/* tags are separated from lines.  tags are Addrs and are
   the base address of the line. */
typedef
   struct {
      UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
   }
   FiLine;

typedef
   struct {
      Addr   tags[FI_NUM_LINES];
      FiLine lines[FI_NUM_LINES];
   }
   Filter;



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: Thr, ULong_n_EC                                 //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// Records stacks for H1 history mechanism (DRD-style)
typedef
   struct { ULong ull; ExeContext* ec; }
   ULong_n_EC;


/* How many of the above records to collect for each thread?  Older
   ones are dumped when we run out of space.  62.5k requires 1MB per
   thread, since each ULong_n_EC record is 16 bytes long.  When more
   than N_KWs_N_STACKs_PER_THREAD are present, the older half are
   deleted to make space.  Hence in the worst case we will be able to
   produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
   Kw transitions (segments in this thread).  For the current setting
   that gives a guaranteed stack for at least the last 31.25k
   segments. */
#define N_KWs_N_STACKs_PER_THREAD 62500
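
/* Purely illustrative, not part of the library: the space arithmetic
   claimed in the comment above, assuming a 64-bit target where
   sizeof(ULong_n_EC) == 16.  Change #if 0 to #if 1 to compile it. */
#if 0
static void example_Kw_stack_space ( void )
{
   /* 62500 records * 16 bytes/record == 1,000,000 bytes per thread */
   tl_assert(sizeof(ULong_n_EC) == 16);
   tl_assert(N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) == 1000000);
}
#endif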


struct _Thr {
   /* Current VTSs for this thread.  They change as we go along.  viR
      is the VTS to be used for reads, viW for writes.  Usually they
      are the same, but can differ when we deal with reader-writer
      locks.  It is always the case that
         VtsID__cmpLEQ(viW,viR) == True
      that is, viW must be the same, or lagging behind, viR. */
   VtsID viR;
   VtsID viW;

   /* Is initially False, and is set to True after the thread really
      has done a low-level exit.  When True, we expect to never see
      any more memory references done by this thread. */
   Bool llexit_done;

   /* Is initially False, and is set to True after the thread has been
      joined with (reaped by some other thread).  After this point, we
      do not expect to see any uses of .viR or .viW, so it is safe to
      set them to VtsID_INVALID. */
   Bool joinedwith_done;

   /* A small integer giving a unique identity to this Thr.  See
      comments on the definition of ScalarTS for details. */
   ThrID thrid : SCALARTS_N_THRBITS;

   /* A filter that removes references for which we believe that
      msmcread/msmcwrite will not change the state, nor report a
      race. */
   Filter* filter;

   /* A pointer back to the top level Thread structure.  There is a
      1-1 mapping between Thread and Thr structures -- each Thr points
      at its corresponding Thread, and vice versa.  Really, Thr and
      Thread should be merged into a single structure. */
   Thread* hgthread;

   /* The ULongs (scalar Kws) in this array accumulate in strictly
      increasing order, without duplicates.  This is important because
      we need to be able to find a given scalar Kw in this array
      later, by binary search. */
   XArray* /* ULong_n_EC */ local_Kws_n_stacks;
};



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// data decls: SO                                              //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

// (UInt) `echo "Synchronisation object" | md5sum`
#define SO_MAGIC 0x56b3c5b0U

struct _SO {
   struct _SO* admin_prev;
   struct _SO* admin_next;
   VtsID viR; /* r-clock of sender */
   VtsID viW; /* w-clock of sender */
   UInt  magic;
};



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// Forward declarations                                        //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

/* fwds for
   Globals needed by other parts of the library.  These are set
   once at startup and then never changed. */
static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
static ExeContext* (*main_get_EC)( Thr* ) = NULL;

/* misc fn and data fwdses */
static void VtsID__rcinc ( VtsID ii );
static void VtsID__rcdec ( VtsID ii );

static inline Bool SVal__isC ( SVal s );
static inline VtsID SVal__unC_Rmin ( SVal s );
static inline VtsID SVal__unC_Wmin ( SVal s );
static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );

/* A doubly-linked list of all the SOs. */
SO* admin_SO;



/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////
//                                                             //
// SECTION BEGIN compressed shadow memory                      //
//                                                             //
/////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

#ifndef __HB_ZSM_H
#define __HB_ZSM_H

/* Initialise the library.  Once initialised, it will (or may) call
   rcinc and rcdec in response to all the calls below, in order to
   allow the user to do reference counting on the SVals stored herein.
   It is important to understand, however, that due to internal
   caching, the reference counts are in general inaccurate, and can be
   both above or below the true reference count for an item.  In
   particular, the library may indicate that the reference count for
   an item is zero, when in fact it is not.

   To make the reference counting exact and therefore non-pointless,
   call zsm_flush_cache.  Immediately after it returns, the reference
   counts for all items, as deduced by the caller by observing calls
   to rcinc and rcdec, will be correct, and so any items with a zero
   reference count may be freed (or at least considered to be
   unreferenced by this library).
*/
static void zsm_init ( void(*rcinc)(SVal), void(*rcdec)(SVal) );

static void zsm_sset_range  ( Addr, SizeT, SVal );
static void zsm_scopy_range ( Addr, Addr, SizeT );
static void zsm_flush_cache ( void );

#endif /* ! __HB_ZSM_H */


/* Round a up to the next multiple of N.  N must be a power of 2 */
#define ROUNDUP(a, N)   (((a) + (N) - 1) & ~((N)-1))
/* Round a down to the next multiple of N.  N must be a power of 2 */
#define ROUNDDN(a, N)   ((a) & ~((N)-1))
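
/* Purely illustrative, not part of the library: expected behaviour of
   the rounding macros above.  Change #if 0 to #if 1 to compile it. */
#if 0
static void example_rounding ( void )
{
   tl_assert(ROUNDUP(13, 16) == 16);
   tl_assert(ROUNDUP(16, 16) == 16);  /* already a multiple: unchanged */
   tl_assert(ROUNDDN(13, 16) == 0);
   tl_assert(ROUNDDN(31, 16) == 16);
}
#endif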



/* ------ User-supplied RC functions ------ */
static void(*rcinc)(SVal) = NULL;
static void(*rcdec)(SVal) = NULL;


/* ------ CacheLine ------ */

#define N_LINE_BITS      6 /* must be >= 3 */
#define N_LINE_ARANGE    (1 << N_LINE_BITS)
#define N_LINE_TREES     (N_LINE_ARANGE >> 3)

typedef
   struct {
      UShort descrs[N_LINE_TREES];
      SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
   }
   CacheLine;

#define TREE_DESCR_16_0 (1<<0)
#define TREE_DESCR_32_0 (1<<1)
#define TREE_DESCR_16_1 (1<<2)
#define TREE_DESCR_64   (1<<3)
#define TREE_DESCR_16_2 (1<<4)
#define TREE_DESCR_32_1 (1<<5)
#define TREE_DESCR_16_3 (1<<6)
#define TREE_DESCR_8_0  (1<<7)
#define TREE_DESCR_8_1  (1<<8)
#define TREE_DESCR_8_2  (1<<9)
#define TREE_DESCR_8_3  (1<<10)
#define TREE_DESCR_8_4  (1<<11)
#define TREE_DESCR_8_5  (1<<12)
#define TREE_DESCR_8_6  (1<<13)
#define TREE_DESCR_8_7  (1<<14)
#define TREE_DESCR_DTY  (1<<15)

typedef
   struct {
      SVal  dict[4]; /* can represent up to 4 diff values in the line */
      UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
                                      dict indexes */
      /* if dict[0] == SVal_INVALID then dict[1] is the index of the
         LineF to use, and dict[2..] are also SVal_INVALID. */
   }
   LineZ; /* compressed rep for a cache line */

typedef
   struct {
      Bool inUse;
      SVal w64s[N_LINE_ARANGE];
   }
   LineF; /* full rep for a cache line */
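
/* Purely illustrative, not part of the library: the common case where
   all 64 SVals in a line are identical compresses to a LineZ whose
   dict[0] holds the single value and whose two-bit indexes all select
   dict[0].  This mirrors how shmem__alloc_SecMap (below) initialises
   lines to SVal_NOACCESS.  Change #if 0 to #if 1 to compile it. */
#if 0
static void example_LineZ_all_one_value ( LineZ* lineZ, SVal sv )
{
   Word i;
   lineZ->dict[0] = sv;   /* the single distinct value in the line */
   lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
   for (i = 0; i < N_LINE_ARANGE/4; i++)
      lineZ->ix2s[i] = 0; /* all 64 two-bit indexes refer to dict[0] */
}
#endif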

/* Shadow memory.
   Primary map is a WordFM Addr SecMap*.
   SecMaps cover some page-size-ish section of address space and hold
     a compressed representation.
   CacheLine-sized chunks of SecMaps are copied into a Cache, being
   decompressed when moved into the cache and recompressed on the
   way out.  Because of this, the cache must operate as a writeback
   cache, not a writethrough one.

   Each SecMap must hold a power-of-2 number of CacheLines.  Hence
   N_SECMAP_BITS must be >= N_LINE_BITS.
*/
#define N_SECMAP_BITS   13
#define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)

// # CacheLines held by a SecMap
#define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)

/* The data in the SecMap is held in the array of LineZs.  Each LineZ
   either carries the required data directly, in a compressed
   representation, or it holds (in .dict[0]) an index to the LineF in
   .linesF that holds the full representation.

   Currently-unused LineF's have their .inUse bit set to zero.
   Since each in-use LineF is referred to by exactly one LineZ,
   the number of .linesZ[] that refer to .linesF should equal
   the number of .linesF[] that have .inUse == True.

   RC obligations: the RCs presented to the user include exactly
   the values in:
   * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
   * F reps that are in use (.inUse == True)

   Hence the following actions at the following transitions are required:

   F rep: .inUse==True  -> .inUse==False        -- rcdec_LineF
   F rep: .inUse==False -> .inUse==True         -- rcinc_LineF
   Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
   Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
*/
typedef
   struct {
      UInt   magic;
      LineZ  linesZ[N_SECMAP_ZLINES];
      LineF* linesF;
      UInt   linesF_size;
   }
   SecMap;

#define SecMap_MAGIC   0x571e58cbU

static inline Bool is_sane_SecMap ( SecMap* sm ) {
   return sm != NULL && sm->magic == SecMap_MAGIC;
}

/* ------ Cache ------ */

#define N_WAY_BITS 16
#define N_WAY_NENT (1 << N_WAY_BITS)

/* Each tag is the address of the associated CacheLine, rounded down
   to a CacheLine address boundary.  The CacheLine size must be a
   power of 2 and must be 8 or more.  Hence an easy way to initialise
   the cache so it is empty is to set all the tag values to any value
   % 8 != 0, eg 1.  This means all queries in the cache initially
   miss.  It does, however, require us to detect, and not write back,
   any line with a bogus tag. */
typedef
   struct {
      CacheLine lyns0[N_WAY_NENT];
      Addr      tags0[N_WAY_NENT];
   }
   Cache;

static inline Bool is_valid_scache_tag ( Addr tag ) {
   /* a valid tag should be naturally aligned to the start of
      a CacheLine. */
   return 0 == (tag & (N_LINE_ARANGE - 1));
}


/* --------- Primary data structures --------- */

/* Shadow memory primary map */
static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
static Cache   cache_shmem;


static UWord stats__secmaps_search       = 0; // # SM finds
static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
static UWord stats__secmap_linesF_allocd = 0; // # LineF's issued
static UWord stats__secmap_linesF_bytes  = 0; //  .. using this much storage
static UWord stats__secmap_iterator_steppings = 0; // # calls to stepSMIter
static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
static UWord stats__cache_F_fetches      = 0; // # F lines fetched
static UWord stats__cache_F_wbacks       = 0; // # F lines written back
static UWord stats__cache_invals         = 0; // # cache invals
static UWord stats__cache_flushes        = 0; // # cache flushes
static UWord stats__cache_totrefs        = 0; // # total accesses
static UWord stats__cache_totmisses      = 0; // # misses
static ULong stats__cache_make_New_arange = 0; // total arange made New
static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
static UWord stats__cline_cread64s       = 0; // # calls to s_m_read64
static UWord stats__cline_cread32s       = 0; // # calls to s_m_read32
static UWord stats__cline_cread16s       = 0; // # calls to s_m_read16
static UWord stats__cline_cread08s       = 0; // # calls to s_m_read8
static UWord stats__cline_cwrite64s      = 0; // # calls to s_m_write64
static UWord stats__cline_cwrite32s      = 0; // # calls to s_m_write32
static UWord stats__cline_cwrite16s      = 0; // # calls to s_m_write16
static UWord stats__cline_cwrite08s      = 0; // # calls to s_m_write8
static UWord stats__cline_sread08s       = 0; // # calls to s_m_get8
static UWord stats__cline_swrite08s      = 0; // # calls to s_m_set8
static UWord stats__cline_swrite16s      = 0; // # calls to s_m_set16
static UWord stats__cline_swrite32s      = 0; // # calls to s_m_set32
static UWord stats__cline_swrite64s      = 0; // # calls to s_m_set64
static UWord stats__cline_scopy08s       = 0; // # calls to s_m_copy8
static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
static UWord stats__vts__tick            = 0; // # calls to VTS__tick
static UWord stats__vts__join            = 0; // # calls to VTS__join
static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural

// # calls to VTS__cmp_structural w/ slow case
static UWord stats__vts__cmp_structural_slow = 0;

// # calls to VTS__indexAt_SLOW
static UWord stats__vts__indexat_slow = 0;

// # calls to vts_set__find__or__clone_and_add
static UWord stats__vts_set__focaa    = 0;

// # calls to vts_set__find__or__clone_and_add that lead to an
// allocation
static UWord stats__vts_set__focaa_a  = 0;


static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
   return a & ~(N_SECMAP_ARANGE - 1);
}
static inline UWord shmem__get_SecMap_offset ( Addr a ) {
   return a & (N_SECMAP_ARANGE - 1);
}
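
/* Purely illustrative, not part of the library: with N_SECMAP_BITS ==
   13, each SecMap covers 8192 bytes, and the helpers above split an
   address into a SecMap base and an offset within it.  The example
   address is arbitrary.  Change #if 0 to #if 1 to compile it. */
#if 0
static void example_secmap_indexing ( void )
{
   Addr a = 0x1234567;
   tl_assert(N_SECMAP_ARANGE == 8192);
   /* 0x1234567 & ~0x1FFF == 0x1234000; 0x1234567 & 0x1FFF == 0x567 */
   tl_assert(shmem__round_to_SecMap_base(a) == 0x1234000);
   tl_assert(shmem__get_SecMap_offset(a) == 0x567);
}
#endif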


/*----------------------------------------------------------------*/
/*--- map_shmem :: WordFM Addr SecMap                          ---*/
/*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
/*----------------------------------------------------------------*/

/*--------------- SecMap allocation --------------- */

static HChar* shmem__bigchunk_next = NULL;
static HChar* shmem__bigchunk_end1 = NULL;

static void* shmem__bigchunk_alloc ( SizeT n )
{
   const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
   tl_assert(n > 0);
   n = VG_ROUNDUP(n, 16);
   tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
   tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
             <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
   if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
      if (0)
      VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
                  (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
      shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
      if (shmem__bigchunk_next == NULL)
         VG_(out_of_memory_NORETURN)(
            "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
      shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
   }
   tl_assert(shmem__bigchunk_next);
   tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
   tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
   shmem__bigchunk_next += n;
   return shmem__bigchunk_next - n;
}

static SecMap* shmem__alloc_SecMap ( void )
{
   Word    i, j;
   SecMap* sm = shmem__bigchunk_alloc( sizeof(SecMap) );
   if (0) VG_(printf)("alloc_SecMap %p\n",sm);
   tl_assert(sm);
   sm->magic = SecMap_MAGIC;
   for (i = 0; i < N_SECMAP_ZLINES; i++) {
      sm->linesZ[i].dict[0] = SVal_NOACCESS;
      sm->linesZ[i].dict[1] = SVal_INVALID;
      sm->linesZ[i].dict[2] = SVal_INVALID;
      sm->linesZ[i].dict[3] = SVal_INVALID;
      for (j = 0; j < N_LINE_ARANGE/4; j++)
         sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
   }
   sm->linesF      = NULL;
   sm->linesF_size = 0;
   stats__secmaps_allocd++;
   stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
   stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
   stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
   return sm;
}

typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };

static SecMap* shmem__find_SecMap ( Addr ga )
{
   SecMap* sm    = NULL;
   Addr    gaKey = shmem__round_to_SecMap_base(ga);
   // Cache
   stats__secmaps_search++;
   if (LIKELY(gaKey == smCache[0].gaKey))
      return smCache[0].sm;
   if (LIKELY(gaKey == smCache[1].gaKey)) {
      SMCacheEnt tmp = smCache[0];
      smCache[0] = smCache[1];
      smCache[1] = tmp;
      return smCache[0].sm;
   }
   if (gaKey == smCache[2].gaKey) {
      SMCacheEnt tmp = smCache[1];
      smCache[1] = smCache[2];
      smCache[2] = tmp;
      return smCache[1].sm;
   }
   // end Cache
   stats__secmaps_search_slow++;
   if (VG_(lookupFM)( map_shmem,
                      NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
      tl_assert(sm != NULL);
      smCache[2] = smCache[1];
      smCache[1] = smCache[0];
      smCache[0].gaKey = gaKey;
      smCache[0].sm    = sm;
   } else {
      tl_assert(sm == NULL);
   }
   return sm;
}

static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
{
   SecMap* sm = shmem__find_SecMap ( ga );
   if (LIKELY(sm)) {
      return sm;
   } else {
      /* create a new one */
      Addr gaKey = shmem__round_to_SecMap_base(ga);
      sm = shmem__alloc_SecMap();
      tl_assert(sm);
      VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
      return sm;
   }
}


/* ------------ LineF and LineZ related ------------ */

static void rcinc_LineF ( LineF* lineF ) {
   UWord i;
   tl_assert(lineF->inUse);
   for (i = 0; i < N_LINE_ARANGE; i++)
      rcinc(lineF->w64s[i]);
}

static void rcdec_LineF ( LineF* lineF ) {
   UWord i;
   tl_assert(lineF->inUse);
   for (i = 0; i < N_LINE_ARANGE; i++)
      rcdec(lineF->w64s[i]);
}

static void rcinc_LineZ ( LineZ* lineZ ) {
   tl_assert(lineZ->dict[0] != SVal_INVALID);
   rcinc(lineZ->dict[0]);
   if (lineZ->dict[1] != SVal_INVALID) rcinc(lineZ->dict[1]);
   if (lineZ->dict[2] != SVal_INVALID) rcinc(lineZ->dict[2]);
   if (lineZ->dict[3] != SVal_INVALID) rcinc(lineZ->dict[3]);
}

static void rcdec_LineZ ( LineZ* lineZ ) {
   tl_assert(lineZ->dict[0] != SVal_INVALID);
   rcdec(lineZ->dict[0]);
   if (lineZ->dict[1] != SVal_INVALID) rcdec(lineZ->dict[1]);
   if (lineZ->dict[2] != SVal_INVALID) rcdec(lineZ->dict[2]);
   if (lineZ->dict[3] != SVal_INVALID) rcdec(lineZ->dict[3]);
}

inline
static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
   Word bix, shft, mask, prep;
   tl_assert(ix >= 0);
   bix  = ix >> 2;
   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
   mask = 3 << shft;
   prep = b2 << shft;
   arr[bix] = (arr[bix] & ~mask) | prep;
}

inline
static UWord read_twobit_array ( UChar* arr, UWord ix ) {
   Word bix, shft;
   tl_assert(ix >= 0);
   bix  = ix >> 2;
   shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
   return (arr[bix] >> shft) & 3;
}
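
/* Purely illustrative, not part of the library: the two helpers above
   round-trip, packing four two-bit entries per byte.  Change #if 0 to
   #if 1 to compile it. */
#if 0
static void example_twobit_array ( void )
{
   UChar arr[2] = { 0, 0 };           /* room for 8 two-bit entries */
   write_twobit_array( arr, 5, 3 );   /* entry 5 is arr[1], bits 3:2 */
   tl_assert(read_twobit_array( arr, 5 ) == 3);
   tl_assert(read_twobit_array( arr, 4 ) == 0); /* neighbour untouched */
}
#endif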

/* Given address 'tag', find either the Z or F line containing relevant
   data, so it can be read into the cache.
*/
static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
                                  /*OUT*/LineF** fp, Addr tag ) {
   LineZ* lineZ;
   LineF* lineF;
   UWord   zix;
   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
   UWord   smoff = shmem__get_SecMap_offset(tag);
   /* since smoff is derived from a valid tag, it should be
      cacheline-aligned. */
   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
   zix = smoff >> N_LINE_BITS;
   tl_assert(zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];
   lineF = NULL;
   if (lineZ->dict[0] == SVal_INVALID) {
      UInt fix = (UInt)lineZ->dict[1];
      tl_assert(sm->linesF);
      tl_assert(sm->linesF_size > 0);
      tl_assert(fix >= 0 && fix < sm->linesF_size);
      lineF = &sm->linesF[fix];
      tl_assert(lineF->inUse);
      lineZ = NULL;
   }
   *zp = lineZ;
   *fp = lineF;
}

/* Given address 'tag', return the relevant SecMap and the index of
   the LineZ within it, in the expectation that the line is to be
   overwritten.  Regardless of whether 'tag' is currently associated
   with a Z or F representation, rcdec the current representation, in
   recognition of the fact that the contents are just about to be
   overwritten. */
static __attribute__((noinline))
void find_Z_for_writing ( /*OUT*/SecMap** smp,
                          /*OUT*/Word* zixp,
                          Addr tag ) {
   LineZ* lineZ;
   LineF* lineF;
   UWord   zix;
   SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
   UWord   smoff = shmem__get_SecMap_offset(tag);
   /* since smoff is derived from a valid tag, it should be
      cacheline-aligned. */
   tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
   zix = smoff >> N_LINE_BITS;
   tl_assert(zix < N_SECMAP_ZLINES);
   lineZ = &sm->linesZ[zix];
   lineF = NULL;
   /* re RCs, we are freeing up this LineZ/LineF so that new data can
      be parked in it.  Hence have to rcdec it accordingly. */
   /* If lineZ has an associated lineF, free it up. */
   if (lineZ->dict[0] == SVal_INVALID) {
      UInt fix = (UInt)lineZ->dict[1];
      tl_assert(sm->linesF);
      tl_assert(sm->linesF_size > 0);
      tl_assert(fix >= 0 && fix < sm->linesF_size);
      lineF = &sm->linesF[fix];
      tl_assert(lineF->inUse);
      rcdec_LineF(lineF);
      lineF->inUse = False;
   } else {
      rcdec_LineZ(lineZ);
   }
   *smp  = sm;
   *zixp = zix;
}

static __attribute__((noinline))
void alloc_F_for_writing ( /*MOD*/SecMap* sm, /*OUT*/Word* fixp ) {
   UInt        i, new_size;
   LineF* nyu;

   if (sm->linesF) {
      tl_assert(sm->linesF_size > 0);
   } else {
      tl_assert(sm->linesF_size == 0);
   }

   if (sm->linesF) {
      for (i = 0; i < sm->linesF_size; i++) {
         if (!sm->linesF[i].inUse) {
            *fixp = (Word)i;
            return;
         }
      }
   }

   /* No free F line found.  Expand existing array and try again. */
   new_size = sm->linesF_size==0 ? 1 : 2 * sm->linesF_size;
   nyu      = HG_(zalloc)( "libhb.aFfw.1 (LineF storage)",
                           new_size * sizeof(LineF) );
   tl_assert(nyu);

   stats__secmap_linesF_allocd += (new_size - sm->linesF_size);
   stats__secmap_linesF_bytes  += (new_size - sm->linesF_size)
                                  * sizeof(LineF);

   if (0)
   VG_(printf)("SM %p: expand F array from %d to %d\n",
               sm, (Int)sm->linesF_size, new_size);

   for (i = 0; i < new_size; i++)
      nyu[i].inUse = False;

   if (sm->linesF) {
      for (i = 0; i < sm->linesF_size; i++) {
         tl_assert(sm->linesF[i].inUse);
         nyu[i] = sm->linesF[i];
      }
      VG_(memset)(sm->linesF, 0, sm->linesF_size * sizeof(LineF) );
      HG_(free)(sm->linesF);
   }

   sm->linesF      = nyu;
   sm->linesF_size = new_size;

   for (i = 0; i < sm->linesF_size; i++) {
      if (!sm->linesF[i].inUse) {
         *fixp = (Word)i;
         return;
      }
   }

   /*NOTREACHED*/
   tl_assert(0);
}


/* ------------ CacheLine and implicit-tree related ------------ */

__attribute__((unused))
static void pp_CacheLine ( CacheLine* cl ) {
   Word i;
   if (!cl) {
      VG_(printf)("%s","pp_CacheLine(NULL)\n");
      return;
   }
   for (i = 0; i < N_LINE_TREES; i++)
      VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
   for (i = 0; i < N_LINE_ARANGE; i++)
      VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
}

static UChar descr_to_validbits ( UShort descr )
{
   /* a.k.a Party Time for gcc's constant folder */
#  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
                b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
             ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
                          ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
                          ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
                          ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
                          ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
                          ( (b16_2) << 4)  | ( (b64)   << 3)  | \
                          ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
                          ( (b16_0) << 0) ) )

#  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
             ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
                         ( (bit5) << 5) | ( (bit4) << 4) | \
                         ( (bit3) << 3) | ( (bit2) << 2) | \
                         ( (bit1) << 1) | ( (bit0) << 0) ) )

   /* these should all get folded out at compile time */
   tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
   tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
   tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);

   switch (descr) {
   /*
              +--------------------------------- TREE_DESCR_8_7
              |             +------------------- TREE_DESCR_8_0
              |             |  +---------------- TREE_DESCR_16_3
              |             |  | +-------------- TREE_DESCR_32_1
              |             |  | | +------------ TREE_DESCR_16_2
              |             |  | | |  +--------- TREE_DESCR_64
              |             |  | | |  |  +------ TREE_DESCR_16_1
              |             |  | | |  |  | +---- TREE_DESCR_32_0
              |             |  | | |  |  | | +-- TREE_DESCR_16_0
              |             |  | | |  |  | | |
              |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
   case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
                                                 return BYTE(1,1,1,1,1,1,1,1);
   case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
                                                 return BYTE(1,1,0,1,1,1,1,1);
   case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */
                                                 return BYTE(0,1,1,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
                                                 return BYTE(0,1,0,1,1,1,1,1);

   case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */
                                                 return BYTE(1,1,1,1,1,1,0,1);
   case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
                                                 return BYTE(1,1,0,1,1,1,0,1);
   case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
                                                 return BYTE(0,1,1,1,1,1,0,1);
   case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
                                                 return BYTE(0,1,0,1,1,1,0,1);

   case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
                                                 return BYTE(1,1,1,1,0,1,1,1);
   case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
                                                 return BYTE(1,1,0,1,0,1,1,1);
   case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
                                                 return BYTE(0,1,1,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
                                                 return BYTE(0,1,0,1,0,1,1,1);

   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
                                                 return BYTE(1,1,1,1,0,1,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
                                                 return BYTE(1,1,0,1,0,1,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
                                                 return BYTE(0,1,1,1,0,1,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
                                                 return BYTE(0,1,0,1,0,1,0,1);

   case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
                                                 return BYTE(0,0,0,1,1,1,1,1);
   case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
                                                 return BYTE(0,0,0,1,1,1,0,1);
   case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
                                                 return BYTE(0,0,0,1,0,1,1,1);
   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
                                                 return BYTE(0,0,0,1,0,1,0,1);

   case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
                                                 return BYTE(1,1,1,1,0,0,0,1);
   case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
                                                 return BYTE(1,1,0,1,0,0,0,1);
   case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
                                                 return BYTE(0,1,1,1,0,0,0,1);
   case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
                                                 return BYTE(0,1,0,1,0,0,0,1);

   case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
                                                 return BYTE(0,0,0,1,0,0,0,1);

   case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
                                                 return BYTE(0,0,0,0,0,0,0,1);

   default: return BYTE(0,0,0,0,0,0,0,0);
                   /* INVALID - any valid descr produces at least one
                      valid bit in tree[0..7]*/
   }
   /* NOTREACHED*/
   tl_assert(0);

#  undef DESCR
#  undef BYTE
}

__attribute__((unused))
static Bool is_sane_Descr ( UShort descr ) {
   return descr_to_validbits(descr) != 0;
}

static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
   VG_(sprintf)(dst,
                "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
                (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
                (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
                (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
                (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
                (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
   );
}
static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
   VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
                     (Int)((byte & 128) ? 1 : 0),
                     (Int)((byte &  64) ? 1 : 0),
                     (Int)((byte &  32) ? 1 : 0),
                     (Int)((byte &  16) ? 1 : 0),
                     (Int)((byte &   8) ? 1 : 0),
                     (Int)((byte &   4) ? 1 : 0),
                     (Int)((byte &   2) ? 1 : 0),
                     (Int)((byte &   1) ? 1 : 0)
   );
}

static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
   Word  i;
   UChar validbits = descr_to_validbits(descr);
   HChar buf[128], buf2[128];
   if (validbits == 0)
      goto bad;
   for (i = 0; i < 8; i++) {
      if (validbits & (1<<i)) {
         if (tree[i] == SVal_INVALID)
            goto bad;
      } else {
         if (tree[i] != SVal_INVALID)
            goto bad;
      }
   }
   return True;
  bad:
   sprintf_Descr( buf, descr );
   sprintf_Byte( buf2, validbits );
   VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
   VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
   VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
   for (i = 0; i < 8; i++)
      VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
   VG_(printf)("%s","}\n");
   return False;
}

static Bool is_sane_CacheLine ( CacheLine* cl )
{
   Word tno, cloff;

   if (!cl) goto bad;

   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      UShort descr = cl->descrs[tno];
      SVal*  tree  = &cl->svals[cloff];
      if (!is_sane_Descr_and_Tree(descr, tree))
         goto bad;
   }
   tl_assert(cloff == N_LINE_ARANGE);
   return True;
  bad:
   pp_CacheLine(cl);
   return False;
}

static UShort normalise_tree ( /*MOD*/SVal* tree )
{
   UShort descr;
   /* pre: incoming tree[0..7] does not have any invalid shvals, in
      particular no zeroes. */
   if (UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
                || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
                || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
                || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
      tl_assert(0);

   descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
           | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
           | TREE_DESCR_8_1 | TREE_DESCR_8_0;
   /* build 16-bit layer */
   if (tree[1] == tree[0]) {
      tree[1] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
      descr |= TREE_DESCR_16_0;
   }
   if (tree[3] == tree[2]) {
      tree[3] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
      descr |= TREE_DESCR_16_1;
   }
   if (tree[5] == tree[4]) {
      tree[5] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
      descr |= TREE_DESCR_16_2;
   }
   if (tree[7] == tree[6]) {
      tree[7] = SVal_INVALID;
      descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
      descr |= TREE_DESCR_16_3;
   }
   /* build 32-bit layer */
   if (tree[2] == tree[0]
       && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
      tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
      descr |= TREE_DESCR_32_0;
   }
   if (tree[6] == tree[4]
       && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
      tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
      descr |= TREE_DESCR_32_1;
   }
   /* build 64-bit layer */
   if (tree[4] == tree[0]
       && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
      tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
      descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
      descr |= TREE_DESCR_64;
   }
   return descr;
}
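
/* Purely illustrative, not part of the library: a tree whose 8 leaves
   are all equal collapses all the way to a single 64-bit descriptor,
   with leaves 1..7 invalidated.  Change #if 0 to #if 1 to compile
   it. */
#if 0
static void example_normalise_tree ( void )
{
   Word i;
   SVal tree[8];
   for (i = 0; i < 8; i++) tree[i] = SVal_NOACCESS;
   tl_assert(normalise_tree(tree) == TREE_DESCR_64);
   tl_assert(tree[0] == SVal_NOACCESS);
   for (i = 1; i < 8; i++) tl_assert(tree[i] == SVal_INVALID);
}
#endif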

/* This takes a cacheline where all the data is at the leaves
   (w8[..]) and builds a correctly normalised tree. */
static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
{
   Word tno, cloff;
   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      SVal* tree = &cl->svals[cloff];
      cl->descrs[tno] = normalise_tree( tree );
   }
   tl_assert(cloff == N_LINE_ARANGE);
   if (CHECK_ZSM)
      tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   stats__cline_normalises++;
}


typedef struct { UChar count; SVal sval; } CountedSVal;

static
void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
                               /*OUT*/Word* dstUsedP,
                               Word nDst, CacheLine* src )
{
   Word  tno, cloff, dstUsed;

   tl_assert(nDst == N_LINE_ARANGE);
   dstUsed = 0;

   for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
      UShort descr = src->descrs[tno];
      SVal*  tree  = &src->svals[cloff];

      /* sequentialise the tree described by (descr,tree). */
#     define PUT(_n,_v)                                \
         do { dst[dstUsed  ].count = (_n);             \
              dst[dstUsed++].sval  = (_v);             \
         } while (0)

      /* byte 0 */
      if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
      if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
      if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
      if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
      /* byte 1 */
      if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
      /* byte 2 */
      if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
      if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
      /* byte 3 */
      if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
      /* byte 4 */
      if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
      if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
      if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
      /* byte 5 */
      if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
      /* byte 6 */
      if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
      if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
      /* byte 7 */
      if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);

#     undef PUT
      /* END sequentialise the tree described by (descr,tree). */

   }
   tl_assert(cloff == N_LINE_ARANGE);
   tl_assert(dstUsed <= nDst);

   *dstUsedP = dstUsed;
}
   1265 
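         /* Illustrative sketch (guarded out): round-tripping a line whose
            leaves all hold one and the same valid SVal.  Normalisation
            collapses each of the line's trees to a single 64-bit node,
            so sequentialisation yields exactly N_LINE_TREES entries,
            each covering 8 bytes. */
         #if 0
         static void example__sequentialise_uniform_line ( /*MOD*/CacheLine* cl )
         {
            Word        i, used = 0;
            CountedSVal seq[N_LINE_ARANGE];
            normalise_CacheLine( cl );
            sequentialise_CacheLine( seq, &used, N_LINE_ARANGE, cl );
            tl_assert(used == N_LINE_TREES);
            for (i = 0; i < used; i++)
               tl_assert(seq[i].count == 8 && seq[i].sval == cl->svals[0]);
         }
         #endif
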
   1266 /* Write the cacheline 'wix' to backing store.  Where it ends up
   1267    is determined by its tag field. */
   1268 static __attribute__((noinline)) void cacheline_wback ( UWord wix )
   1269 {
   1270    Word        i, j, k, m;
   1271    Addr        tag;
   1272    SecMap*     sm;
   1273    CacheLine*  cl;
   1274    LineZ* lineZ;
   1275    LineF* lineF;
   1276    Word        zix, fix, csvalsUsed;
   1277    CountedSVal csvals[N_LINE_ARANGE];
   1278    SVal        sv;
   1279 
   1280    if (0)
   1281    VG_(printf)("scache wback line %d\n", (Int)wix);
   1282 
   1283    tl_assert(wix >= 0 && wix < N_WAY_NENT);
   1284 
   1285    tag =  cache_shmem.tags0[wix];
   1286    cl  = &cache_shmem.lyns0[wix];
   1287 
   1288    /* The cache line may have been invalidated; if so, ignore it. */
   1289    if (!is_valid_scache_tag(tag))
   1290       return;
   1291 
   1292    /* Where are we going to put it? */
   1293    sm         = NULL;
   1294    lineZ      = NULL;
   1295    lineF      = NULL;
   1296    zix = fix = -1;
   1297 
   1298    /* find the Z line to write in and rcdec it or the associated F
   1299       line. */
   1300    find_Z_for_writing( &sm, &zix, tag );
   1301 
   1302    tl_assert(sm);
   1303    tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
   1304    lineZ = &sm->linesZ[zix];
   1305 
   1306    /* Generate the data to be stored */
   1307    if (CHECK_ZSM)
   1308       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   1309 
   1310    csvalsUsed = -1;
   1311    sequentialise_CacheLine( csvals, &csvalsUsed,
   1312                             N_LINE_ARANGE, cl );
   1313    tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
    1314    if (0) VG_(printf)("%ld ", csvalsUsed);
   1315 
   1316    lineZ->dict[0] = lineZ->dict[1]
   1317                   = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
   1318 
   1319    /* i indexes actual shadow values, k is cursor in csvals */
   1320    i = 0;
   1321    for (k = 0; k < csvalsUsed; k++) {
   1322 
   1323       sv = csvals[k].sval;
   1324       if (CHECK_ZSM)
   1325          tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
   1326       /* do we already have it? */
   1327       if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
   1328       if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
   1329       if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
   1330       if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
   1331       /* no.  look for a free slot. */
   1332       if (CHECK_ZSM)
   1333          tl_assert(sv != SVal_INVALID);
   1334       if (lineZ->dict[0]
   1335           == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
   1336       if (lineZ->dict[1]
   1337           == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
   1338       if (lineZ->dict[2]
   1339           == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
   1340       if (lineZ->dict[3]
   1341           == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
   1342       break; /* we'll have to use the f rep */
   1343      dict_ok:
   1344       m = csvals[k].count;
   1345       if (m == 8) {
   1346          write_twobit_array( lineZ->ix2s, i+0, j );
   1347          write_twobit_array( lineZ->ix2s, i+1, j );
   1348          write_twobit_array( lineZ->ix2s, i+2, j );
   1349          write_twobit_array( lineZ->ix2s, i+3, j );
   1350          write_twobit_array( lineZ->ix2s, i+4, j );
   1351          write_twobit_array( lineZ->ix2s, i+5, j );
   1352          write_twobit_array( lineZ->ix2s, i+6, j );
   1353          write_twobit_array( lineZ->ix2s, i+7, j );
   1354          i += 8;
   1355       }
   1356       else if (m == 4) {
   1357          write_twobit_array( lineZ->ix2s, i+0, j );
   1358          write_twobit_array( lineZ->ix2s, i+1, j );
   1359          write_twobit_array( lineZ->ix2s, i+2, j );
   1360          write_twobit_array( lineZ->ix2s, i+3, j );
   1361          i += 4;
   1362       }
   1363       else if (m == 1) {
   1364          write_twobit_array( lineZ->ix2s, i+0, j );
   1365          i += 1;
   1366       }
   1367       else if (m == 2) {
   1368          write_twobit_array( lineZ->ix2s, i+0, j );
   1369          write_twobit_array( lineZ->ix2s, i+1, j );
   1370          i += 2;
   1371       }
   1372       else {
    1373          tl_assert(0); /* 8, 4, 2 or 1 are the only legitimate values for m */
   1374       }
   1375 
   1376    }
   1377 
   1378    if (LIKELY(i == N_LINE_ARANGE)) {
   1379       /* Construction of the compressed representation was
   1380          successful. */
   1381       rcinc_LineZ(lineZ);
   1382       stats__cache_Z_wbacks++;
   1383    } else {
   1384       /* Cannot use the compressed(z) representation.  Use the full(f)
   1385          rep instead. */
   1386       tl_assert(i >= 0 && i < N_LINE_ARANGE);
   1387       alloc_F_for_writing( sm, &fix );
   1388       tl_assert(sm->linesF);
   1389       tl_assert(sm->linesF_size > 0);
   1390       tl_assert(fix >= 0 && fix < (Word)sm->linesF_size);
   1391       lineF = &sm->linesF[fix];
   1392       tl_assert(!lineF->inUse);
   1393       lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
   1394       lineZ->dict[1] = (SVal)fix;
   1395       lineF->inUse = True;
   1396       i = 0;
   1397       for (k = 0; k < csvalsUsed; k++) {
   1398          if (CHECK_ZSM)
   1399             tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
   1400          sv = csvals[k].sval;
   1401          if (CHECK_ZSM)
   1402             tl_assert(sv != SVal_INVALID);
   1403          for (m = csvals[k].count; m > 0; m--) {
   1404             lineF->w64s[i] = sv;
   1405             i++;
   1406          }
   1407       }
   1408       tl_assert(i == N_LINE_ARANGE);
   1409       rcinc_LineF(lineF);
   1410       stats__cache_F_wbacks++;
   1411    }
   1412 }
   1413 
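         /* Note on the Z/F choice above: the compressed (Z) form can
            represent a line only if its N_LINE_ARANGE shadow values
            contain at most 4 distinct SVals, since each value is encoded
            as a 2-bit index into the 4-entry dictionary.  The
            dictionary-filling loop bails out to the full (F) form as
            soon as a fifth distinct value turns up. */
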
   1414 /* Fetch the cacheline 'wix' from the backing store.  The tag
   1415    associated with 'wix' is assumed to have already been filled in;
   1416    hence that is used to determine where in the backing store to read
   1417    from. */
   1418 static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
   1419 {
   1420    Word       i;
   1421    Addr       tag;
   1422    CacheLine* cl;
   1423    LineZ*     lineZ;
   1424    LineF*     lineF;
   1425 
   1426    if (0)
   1427    VG_(printf)("scache fetch line %d\n", (Int)wix);
   1428 
   1429    tl_assert(wix >= 0 && wix < N_WAY_NENT);
   1430 
   1431    tag =  cache_shmem.tags0[wix];
   1432    cl  = &cache_shmem.lyns0[wix];
   1433 
   1434    /* reject nonsense requests */
   1435    tl_assert(is_valid_scache_tag(tag));
   1436 
   1437    lineZ = NULL;
   1438    lineF = NULL;
   1439    find_ZF_for_reading( &lineZ, &lineF, tag );
   1440    tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
   1441 
   1442    /* expand the data into the bottom layer of the tree, then get
    1443       normalise_CacheLine to build the descriptor array. */
   1444    if (lineF) {
   1445       tl_assert(lineF->inUse);
   1446       for (i = 0; i < N_LINE_ARANGE; i++) {
   1447          cl->svals[i] = lineF->w64s[i];
   1448       }
   1449       stats__cache_F_fetches++;
   1450    } else {
   1451       for (i = 0; i < N_LINE_ARANGE; i++) {
   1452          SVal sv;
   1453          UWord ix = read_twobit_array( lineZ->ix2s, i );
   1454          /* correct, but expensive: tl_assert(ix >= 0 && ix <= 3); */
   1455          sv = lineZ->dict[ix];
   1456          tl_assert(sv != SVal_INVALID);
   1457          cl->svals[i] = sv;
   1458       }
   1459       stats__cache_Z_fetches++;
   1460    }
   1461    normalise_CacheLine( cl );
   1462 }
   1463 
   1464 static void shmem__invalidate_scache ( void ) {
   1465    Word wix;
   1466    if (0) VG_(printf)("%s","scache inval\n");
   1467    tl_assert(!is_valid_scache_tag(1));
   1468    for (wix = 0; wix < N_WAY_NENT; wix++) {
   1469       cache_shmem.tags0[wix] = 1/*INVALID*/;
   1470    }
   1471    stats__cache_invals++;
   1472 }
   1473 
   1474 static void shmem__flush_and_invalidate_scache ( void ) {
   1475    Word wix;
   1476    Addr tag;
   1477    if (0) VG_(printf)("%s","scache flush and invalidate\n");
   1478    tl_assert(!is_valid_scache_tag(1));
   1479    for (wix = 0; wix < N_WAY_NENT; wix++) {
   1480       tag = cache_shmem.tags0[wix];
   1481       if (tag == 1/*INVALID*/) {
   1482          /* already invalid; nothing to do */
   1483       } else {
   1484          tl_assert(is_valid_scache_tag(tag));
   1485          cacheline_wback( wix );
   1486       }
   1487       cache_shmem.tags0[wix] = 1/*INVALID*/;
   1488    }
   1489    stats__cache_flushes++;
   1490    stats__cache_invals++;
   1491 }
   1492 
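         /* Note: shmem__invalidate_scache above discards the cache
            contents outright, and so is safe only when they are known to
            be valueless (it is used at initialisation), whereas
            shmem__flush_and_invalidate_scache first writes every valid
            line back to the backing store. */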
   1493 
   1494 static inline Bool aligned16 ( Addr a ) {
   1495    return 0 == (a & 1);
   1496 }
   1497 static inline Bool aligned32 ( Addr a ) {
   1498    return 0 == (a & 3);
   1499 }
   1500 static inline Bool aligned64 ( Addr a ) {
   1501    return 0 == (a & 7);
   1502 }
   1503 static inline UWord get_cacheline_offset ( Addr a ) {
   1504    return (UWord)(a & (N_LINE_ARANGE - 1));
   1505 }
   1506 static inline Addr cacheline_ROUNDUP ( Addr a ) {
   1507    return ROUNDUP(a, N_LINE_ARANGE);
   1508 }
   1509 static inline Addr cacheline_ROUNDDN ( Addr a ) {
   1510    return ROUNDDN(a, N_LINE_ARANGE);
   1511 }
   1512 static inline UWord get_treeno ( Addr a ) {
   1513    return get_cacheline_offset(a) >> 3;
   1514 }
   1515 static inline UWord get_tree_offset ( Addr a ) {
   1516    return a & 7;
   1517 }
   1518 
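         /* Worked example (illustrative; assumes N_LINE_BITS == 6 and
            hence N_LINE_ARANGE == 64): for a == 0x5001A7,
               get_cacheline_offset(a) == 0x27  (39)
               get_treeno(a)           == 4     (39 >> 3)
               get_tree_offset(a)      == 7     (0x5001A7 & 7)
            and the line's tag is a & ~63 == 0x500180. */
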
   1519 static __attribute__((noinline))
   1520        CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
   1521 static inline CacheLine* get_cacheline ( Addr a )
   1522 {
   1523    /* tag is 'a' with the in-line offset masked out,
    1524       ie the low N_LINE_BITS bits of 'a' cleared */
   1525    Addr       tag = a & ~(N_LINE_ARANGE - 1);
   1526    UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
   1527    stats__cache_totrefs++;
   1528    if (LIKELY(tag == cache_shmem.tags0[wix])) {
   1529       return &cache_shmem.lyns0[wix];
   1530    } else {
   1531       return get_cacheline_MISS( a );
   1532    }
   1533 }
   1534 
   1535 static __attribute__((noinline))
   1536        CacheLine* get_cacheline_MISS ( Addr a )
   1537 {
   1538    /* tag is 'a' with the in-line offset masked out,
    1539       ie the low N_LINE_BITS bits of 'a' cleared */
   1540 
   1541    CacheLine* cl;
   1542    Addr*      tag_old_p;
   1543    Addr       tag = a & ~(N_LINE_ARANGE - 1);
   1544    UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
   1545 
   1546    tl_assert(tag != cache_shmem.tags0[wix]);
   1547 
   1548    /* Dump the old line into the backing store. */
   1549    stats__cache_totmisses++;
   1550 
   1551    cl        = &cache_shmem.lyns0[wix];
   1552    tag_old_p = &cache_shmem.tags0[wix];
   1553 
   1554    if (is_valid_scache_tag( *tag_old_p )) {
   1555       /* EXPENSIVE and REDUNDANT: callee does it */
   1556       if (CHECK_ZSM)
   1557          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   1558       cacheline_wback( wix );
   1559    }
   1560    /* and reload the new one */
   1561    *tag_old_p = tag;
   1562    cacheline_fetch( wix );
   1563    if (CHECK_ZSM)
   1564       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   1565    return cl;
   1566 }
   1567 
   1568 static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   1569    stats__cline_64to32pulldown++;
   1570    switch (toff) {
   1571       case 0: case 4:
   1572          tl_assert(descr & TREE_DESCR_64);
   1573          tree[4] = tree[0];
   1574          descr &= ~TREE_DESCR_64;
   1575          descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
   1576          break;
   1577       default:
   1578          tl_assert(0);
   1579    }
   1580    return descr;
   1581 }
   1582 
   1583 static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   1584    stats__cline_32to16pulldown++;
   1585    switch (toff) {
   1586       case 0: case 2:
   1587          if (!(descr & TREE_DESCR_32_0)) {
   1588             descr = pulldown_to_32(tree, 0, descr);
   1589          }
   1590          tl_assert(descr & TREE_DESCR_32_0);
   1591          tree[2] = tree[0];
   1592          descr &= ~TREE_DESCR_32_0;
   1593          descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
   1594          break;
   1595       case 4: case 6:
   1596          if (!(descr & TREE_DESCR_32_1)) {
   1597             descr = pulldown_to_32(tree, 4, descr);
   1598          }
   1599          tl_assert(descr & TREE_DESCR_32_1);
   1600          tree[6] = tree[4];
   1601          descr &= ~TREE_DESCR_32_1;
   1602          descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
   1603          break;
   1604       default:
   1605          tl_assert(0);
   1606    }
   1607    return descr;
   1608 }
   1609 
   1610 static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   1611    stats__cline_16to8pulldown++;
   1612    switch (toff) {
   1613       case 0: case 1:
   1614          if (!(descr & TREE_DESCR_16_0)) {
   1615             descr = pulldown_to_16(tree, 0, descr);
   1616          }
   1617          tl_assert(descr & TREE_DESCR_16_0);
   1618          tree[1] = tree[0];
   1619          descr &= ~TREE_DESCR_16_0;
   1620          descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
   1621          break;
   1622       case 2: case 3:
   1623          if (!(descr & TREE_DESCR_16_1)) {
   1624             descr = pulldown_to_16(tree, 2, descr);
   1625          }
   1626          tl_assert(descr & TREE_DESCR_16_1);
   1627          tree[3] = tree[2];
   1628          descr &= ~TREE_DESCR_16_1;
   1629          descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
   1630          break;
   1631       case 4: case 5:
   1632          if (!(descr & TREE_DESCR_16_2)) {
   1633             descr = pulldown_to_16(tree, 4, descr);
   1634          }
   1635          tl_assert(descr & TREE_DESCR_16_2);
   1636          tree[5] = tree[4];
   1637          descr &= ~TREE_DESCR_16_2;
   1638          descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
   1639          break;
   1640       case 6: case 7:
   1641          if (!(descr & TREE_DESCR_16_3)) {
   1642             descr = pulldown_to_16(tree, 6, descr);
   1643          }
   1644          tl_assert(descr & TREE_DESCR_16_3);
   1645          tree[7] = tree[6];
   1646          descr &= ~TREE_DESCR_16_3;
   1647          descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
   1648          break;
   1649       default:
   1650          tl_assert(0);
   1651    }
   1652    return descr;
   1653 }
   1654 
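         /* Illustrative sketch (guarded out): to access a single byte at
            tree offset 0, a caller first ensures the corresponding 8-bit
            leaf exists.  One call to pulldown_to_8 suffices on a
            normalised tree, since it recurses through the 16- and 32-bit
            layers by itself. */
         #if 0
         static void example__make_byte0_addressable ( /*MOD*/SVal* tree,
                                                       /*MOD*/UShort* descrP )
         {
            UShort descr = *descrP;
            if (!(descr & TREE_DESCR_8_0))
               descr = pulldown_to_8(tree, 0, descr);
            tl_assert(descr & TREE_DESCR_8_0);
            /* tree[0] now holds the shadow value for byte 0 alone. */
            *descrP = descr;
         }
         #endif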
   1655 
   1656 static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
   1657    UShort mask;
   1658    switch (toff) {
   1659       case 0:
   1660          mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
   1661          tl_assert( (descr & mask) == mask );
   1662          descr &= ~mask;
   1663          descr |= TREE_DESCR_16_0;
   1664          break;
   1665       case 2:
   1666          mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
   1667          tl_assert( (descr & mask) == mask );
   1668          descr &= ~mask;
   1669          descr |= TREE_DESCR_16_1;
   1670          break;
   1671       case 4:
   1672          mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
   1673          tl_assert( (descr & mask) == mask );
   1674          descr &= ~mask;
   1675          descr |= TREE_DESCR_16_2;
   1676          break;
   1677       case 6:
   1678          mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
   1679          tl_assert( (descr & mask) == mask );
   1680          descr &= ~mask;
   1681          descr |= TREE_DESCR_16_3;
   1682          break;
   1683       default:
   1684          tl_assert(0);
   1685    }
   1686    return descr;
   1687 }
   1688 
   1689 static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
   1690    UShort mask;
   1691    switch (toff) {
   1692       case 0:
   1693          if (!(descr & TREE_DESCR_16_0))
   1694             descr = pullup_descr_to_16(descr, 0);
   1695          if (!(descr & TREE_DESCR_16_1))
   1696             descr = pullup_descr_to_16(descr, 2);
   1697          mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
   1698          tl_assert( (descr & mask) == mask );
   1699          descr &= ~mask;
   1700          descr |= TREE_DESCR_32_0;
   1701          break;
   1702       case 4:
   1703          if (!(descr & TREE_DESCR_16_2))
   1704             descr = pullup_descr_to_16(descr, 4);
   1705          if (!(descr & TREE_DESCR_16_3))
   1706             descr = pullup_descr_to_16(descr, 6);
   1707          mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
   1708          tl_assert( (descr & mask) == mask );
   1709          descr &= ~mask;
   1710          descr |= TREE_DESCR_32_1;
   1711          break;
   1712       default:
   1713          tl_assert(0);
   1714    }
   1715    return descr;
   1716 }
   1717 
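         /* Unlike the pulldown_* functions, the pullup_* functions take
            and return only the descriptor and never touch the tree's
            SVals -- presumably because a caller merging leaves into a
            wider node is about to overwrite that node's value anyway. */
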
   1718 static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
   1719    switch (toff) {
   1720       case 0: case 4:
   1721          return 0 != (descr & TREE_DESCR_64);
   1722       default:
   1723          tl_assert(0);
   1724    }
   1725 }
   1726 
   1727 static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
   1728    switch (toff) {
   1729       case 0:
   1730          return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
   1731       case 2:
   1732          return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
   1733       case 4:
   1734          return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
   1735       case 6:
   1736          return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
   1737       default:
   1738          tl_assert(0);
   1739    }
   1740 }
   1741 
   1742 /* ------------ Cache management ------------ */
   1743 
   1744 static void zsm_flush_cache ( void )
   1745 {
   1746    shmem__flush_and_invalidate_scache();
   1747 }
   1748 
   1749 
   1750 static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) )
   1751 {
   1752    tl_assert( sizeof(UWord) == sizeof(Addr) );
   1753 
   1754    rcinc = p_rcinc;
   1755    rcdec = p_rcdec;
   1756 
   1757    tl_assert(map_shmem == NULL);
   1758    map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
   1759                            HG_(free),
   1760                            NULL/*unboxed UWord cmp*/);
   1761    tl_assert(map_shmem != NULL);
   1762    shmem__invalidate_scache();
   1763 
   1764    /* a SecMap must contain an integral number of CacheLines */
   1765    tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
   1766    /* also ... a CacheLine holds an integral number of trees */
   1767    tl_assert(0 == (N_LINE_ARANGE % 8));
   1768 }
   1769 
   1770 /////////////////////////////////////////////////////////////////
   1771 /////////////////////////////////////////////////////////////////
   1772 //                                                             //
   1773 // SECTION END compressed shadow memory                        //
   1774 //                                                             //
   1775 /////////////////////////////////////////////////////////////////
   1776 /////////////////////////////////////////////////////////////////
   1777 
   1778 
   1779 
   1780 /////////////////////////////////////////////////////////////////
   1781 /////////////////////////////////////////////////////////////////
   1782 //                                                             //
   1783 // SECTION BEGIN vts primitives                                //
   1784 //                                                             //
   1785 /////////////////////////////////////////////////////////////////
   1786 /////////////////////////////////////////////////////////////////
   1787 
   1788 
   1789 /* There's a 1-1 mapping between Thr and ThrIDs -- the latter merely
   1790    being compact stand-ins for Thr*'s.  Use these functions to map
   1791    between them. */
   1792 static ThrID Thr__to_ThrID   ( Thr*  thr   ); /* fwds */
   1793 static Thr*  Thr__from_ThrID ( ThrID thrid ); /* fwds */
   1794 
   1795 __attribute__((noreturn))
   1796 static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
   1797 {
   1798    if (due_to_nThrs) {
   1799       HChar* s =
   1800          "\n"
   1801          "Helgrind: cannot continue, run aborted: too many threads.\n"
   1802          "Sorry.  Helgrind can only handle programs that create\n"
   1803          "%'llu or fewer threads over their entire lifetime.\n"
   1804          "\n";
   1805       VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
   1806    } else {
   1807       HChar* s =
   1808          "\n"
   1809          "Helgrind: cannot continue, run aborted: too many\n"
   1810          "synchronisation events.  Sorry. Helgrind can only handle\n"
   1811          "programs which perform %'llu or fewer\n"
   1812          "inter-thread synchronisation events (locks, unlocks, etc).\n"
   1813          "\n";
   1814       VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
   1815    }
   1816    VG_(exit)(1);
   1817    /*NOTREACHED*/
   1818    tl_assert(0); /*wtf?!*/
   1819 }
   1820 
   1821 
   1822 /* The dead thread (ThrID, actually) table.  A thread may only be
   1823    listed here if we have been notified thereof by libhb_async_exit.
   1824    New entries are added at the end.  The order isn't important, but
   1825    the ThrID values must be unique.  This table lists the identity of
   1826    all threads that have ever died -- none are ever removed.  We keep
   1827    this table so as to be able to prune entries from VTSs.  We don't
   1828    actually need to keep the set of threads that have ever died --
   1829    only the threads that have died since the previous round of
   1830    pruning.  But it's useful for sanity check purposes to keep the
   1831    entire set, so we do. */
   1832 static XArray* /* of ThrID */ verydead_thread_table = NULL;
   1833 
   1834 /* Arbitrary total ordering on ThrIDs. */
   1835 static Int cmp__ThrID ( void* v1, void* v2 ) {
   1836    ThrID id1 = *(ThrID*)v1;
   1837    ThrID id2 = *(ThrID*)v2;
   1838    if (id1 < id2) return -1;
   1839    if (id1 > id2) return 1;
   1840    return 0;
   1841 }
   1842 
   1843 static void verydead_thread_table_init ( void )
   1844 {
   1845    tl_assert(!verydead_thread_table);
   1846    verydead_thread_table
   1847      = VG_(newXA)( HG_(zalloc),
   1848                    "libhb.verydead_thread_table_init.1",
   1849                    HG_(free), sizeof(ThrID) );
   1850    tl_assert(verydead_thread_table);
   1851    VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
   1852 }
   1853 
   1854 
   1855 /* A VTS contains .ts, its vector clock, and also .id, a field to hold
   1856    a backlink for the caller's convenience.  Since we have no idea
   1857    what to set that to in the library, it always gets set to
   1858    VtsID_INVALID. */
   1859 typedef
   1860    struct {
   1861       VtsID    id;
   1862       UInt     usedTS;
   1863       UInt     sizeTS;
   1864       ScalarTS ts[0];
   1865    }
   1866    VTS;
   1867 
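         /* Illustrative example: a VTS with usedTS == 2 and
            ts == [ {thrid 1024, tym 5}, {thrid 1027, tym 1} ] says that
            thread 1024 has reached local time 5, thread 1027 has reached
            local time 1, and every other thread is implicitly at time 0.
            Entries are kept sorted by thrid, and zero timestamps are
            never stored. */
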
   1868 /* Allocate a VTS capable of storing 'sizeTS' entries. */
   1869 static VTS* VTS__new ( HChar* who, UInt sizeTS );
   1870 
   1871 /* Make a clone of 'vts', sizing the new array to exactly match the
   1872    number of ScalarTSs present. */
   1873 static VTS* VTS__clone ( HChar* who, VTS* vts );
   1874 
   1875 /* Make a clone of 'vts' with the thrids in 'thrids' removed.  The new
   1876    array is sized exactly to hold the number of required elements.
   1877    'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
   1878    must be in strictly increasing order. */
   1879 static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel );
   1880 
   1881 /* Delete this VTS in its entirety. */
   1882 static void VTS__delete ( VTS* vts );
   1883 
   1884 /* Create a new singleton VTS in 'out'.  Caller must have
   1885    pre-allocated 'out' sufficiently big to hold the result in all
   1886    possible cases. */
   1887 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );
   1888 
   1889 /* Create in 'out' a VTS which is the same as 'vts' except with
   1890    vts[me]++, so to speak.  Caller must have pre-allocated 'out'
   1891    sufficiently big to hold the result in all possible cases. */
   1892 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );
   1893 
   1894 /* Create in 'out' a VTS which is the join (max) of 'a' and
   1895    'b'. Caller must have pre-allocated 'out' sufficiently big to hold
   1896    the result in all possible cases. */
   1897 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );
   1898 
   1899 /* Compute the partial ordering relation of the two args.  Although we
   1900    could be completely general and return an enumeration value (EQ,
   1901    LT, GT, UN), in fact we only need LEQ, and so we may as well
   1902    hardwire that fact.
   1903 
    1904    Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
    1905    invalid ThrID).  In the latter case, the returned ThrID identifies
    1906    a point at which LEQ fails.  There may be more than one such
    1907    point, but we only care about seeing one of them, not all of
    1908    them.  This rather strange convention is used because sometimes
    1909    we want to know the actual index at which the two VTSs first
    1910    differ. */
   1911 static UInt VTS__cmpLEQ ( VTS* a, VTS* b );
   1912 
   1913 /* Compute an arbitrary structural (total) ordering on the two args,
   1914    based on their VCs, so they can be looked up in a table, tree, etc.
   1915    Returns -1, 0 or 1. */
   1916 static Word VTS__cmp_structural ( VTS* a, VTS* b );
   1917 
   1918 /* Debugging only.  Display the given VTS in the buffer. */
   1919 static void VTS__show ( HChar* buf, Int nBuf, VTS* vts );
   1920 
   1921 /* Debugging only.  Return vts[index], so to speak. */
   1922 static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );
   1923 
   1924 /* Notify the VTS machinery that a thread has been declared
   1925    comprehensively dead: that is, it has done an async exit AND it has
   1926    been joined with.  This should ensure that its local clocks (.viR
   1927    and .viW) will never again change, and so all mentions of this
   1928    thread from all VTSs in the system may be removed. */
   1929 static void VTS__declare_thread_very_dead ( Thr* idx );
   1930 
   1931 /*--------------- to do with Vector Timestamps ---------------*/
   1932 
   1933 static Bool is_sane_VTS ( VTS* vts )
   1934 {
   1935    UWord     i, n;
   1936    ScalarTS  *st1, *st2;
   1937    if (!vts) return False;
   1938    if (!vts->ts) return False;
   1939    if (vts->usedTS > vts->sizeTS) return False;
   1940    n = vts->usedTS;
   1941    if (n == 1) {
   1942       st1 = &vts->ts[0];
   1943       if (st1->tym == 0)
   1944          return False;
   1945    }
   1946    else
   1947    if (n >= 2) {
   1948       for (i = 0; i < n-1; i++) {
   1949          st1 = &vts->ts[i];
   1950          st2 = &vts->ts[i+1];
   1951          if (st1->thrid >= st2->thrid)
   1952             return False;
   1953          if (st1->tym == 0 || st2->tym == 0)
   1954             return False;
   1955       }
   1956    }
   1957    return True;
   1958 }
   1959 
   1960 
   1961 /* Create a new, empty VTS.
   1962 */
   1963 static VTS* VTS__new ( HChar* who, UInt sizeTS )
   1964 {
   1965    VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
   1966    tl_assert(vts->usedTS == 0);
   1967    vts->sizeTS = sizeTS;
   1968    *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
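            /* The extra, unused slot just written holds an overrun
               detection sentinel; VTS__clone, VTS__subtract and
               VTS__delete all assert that it is still intact. */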
   1969    return vts;
   1970 }
   1971 
   1972 /* Clone this VTS.
   1973 */
   1974 static VTS* VTS__clone ( HChar* who, VTS* vts )
   1975 {
   1976    tl_assert(vts);
   1977    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   1978    UInt nTS = vts->usedTS;
   1979    VTS* clone = VTS__new(who, nTS);
   1980    clone->id = vts->id;
   1981    clone->sizeTS = nTS;
   1982    clone->usedTS = nTS;
   1983    UInt i;
   1984    for (i = 0; i < nTS; i++) {
   1985       clone->ts[i] = vts->ts[i];
   1986    }
   1987    tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   1988    return clone;
   1989 }
   1990 
   1991 
   1992 /* Make a clone of a VTS with specified ThrIDs removed.  'thridsToDel'
   1993    must be in strictly increasing order.  We could obviously do this
   1994    much more efficiently (in linear time) if necessary.
   1995 */
   1996 static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel )
   1997 {
   1998    UInt i, j;
   1999    tl_assert(vts);
   2000    tl_assert(thridsToDel);
   2001    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   2002    UInt nTS = vts->usedTS;
   2003    /* Figure out how many ScalarTSs will remain in the output. */
   2004    UInt nReq = nTS;
   2005    for (i = 0; i < nTS; i++) {
   2006       ThrID thrid = vts->ts[i].thrid;
   2007       if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
   2008          nReq--;
   2009    }
   2010    tl_assert(nReq <= nTS);
   2011    /* Copy the ones that will remain. */
   2012    VTS* res = VTS__new(who, nReq);
   2013    j = 0;
   2014    for (i = 0; i < nTS; i++) {
   2015       ThrID thrid = vts->ts[i].thrid;
   2016       if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
   2017          continue;
   2018       res->ts[j++] = vts->ts[i];
   2019    }
   2020    tl_assert(j == nReq);
   2021    tl_assert(j == res->sizeTS);
   2022    res->usedTS = j;
   2023    tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
   2024    return res;
   2025 }
   2026 
   2027 
   2028 /* Delete this VTS in its entirety.
   2029 */
   2030 static void VTS__delete ( VTS* vts )
   2031 {
   2032    tl_assert(vts);
   2033    tl_assert(vts->usedTS <= vts->sizeTS);
   2034    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   2035    HG_(free)(vts);
   2036 }
   2037 
   2038 
   2039 /* Create a new singleton VTS.
   2040 */
   2041 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
   2042 {
   2043    tl_assert(thr);
   2044    tl_assert(tym >= 1);
   2045    tl_assert(out);
   2046    tl_assert(out->usedTS == 0);
   2047    tl_assert(out->sizeTS >= 1);
   2048    UInt hi = out->usedTS++;
   2049    out->ts[hi].thrid = Thr__to_ThrID(thr);
   2050    out->ts[hi].tym   = tym;
   2051 }
   2052 
   2053 
    2054 /* Build in 'out' a VTS which is 'vts' with vts[me]++, so to speak.
    2055    'vts' itself is not modified.
   2056 */
   2057 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
   2058 {
   2059    UInt      i, n;
   2060    ThrID     me_thrid;
   2061    Bool      found = False;
   2062 
   2063    stats__vts__tick++;
   2064 
   2065    tl_assert(out);
   2066    tl_assert(out->usedTS == 0);
   2067    if (vts->usedTS >= ThrID_MAX_VALID)
   2068       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   2069    tl_assert(out->sizeTS >= 1 + vts->usedTS);
   2070 
   2071    tl_assert(me);
   2072    me_thrid = Thr__to_ThrID(me);
   2073    tl_assert(is_sane_VTS(vts));
   2074    n = vts->usedTS;
   2075 
   2076    /* Copy all entries which precede 'me'. */
   2077    for (i = 0; i < n; i++) {
   2078       ScalarTS* here = &vts->ts[i];
   2079       if (UNLIKELY(here->thrid >= me_thrid))
   2080          break;
   2081       UInt hi = out->usedTS++;
   2082       out->ts[hi] = *here;
   2083    }
   2084 
   2085    /* 'i' now indicates the next entry to copy, if any.
   2086        There are 3 possibilities:
   2087        (a) there is no next entry (we used them all up already):
   2088            add (me_thrid,1) to the output, and quit
   2089        (b) there is a next entry, and its thrid > me_thrid:
   2090            add (me_thrid,1) to the output, then copy the remaining entries
   2091        (c) there is a next entry, and its thrid == me_thrid:
   2092            copy it to the output but increment its timestamp value.
   2093            Then copy the remaining entries.  (c) is the common case.
   2094    */
   2095    tl_assert(i >= 0 && i <= n);
   2096    if (i == n) { /* case (a) */
   2097       UInt hi = out->usedTS++;
   2098       out->ts[hi].thrid = me_thrid;
   2099       out->ts[hi].tym   = 1;
   2100    } else {
   2101       /* cases (b) and (c) */
   2102       ScalarTS* here = &vts->ts[i];
   2103       if (me_thrid == here->thrid) { /* case (c) */
   2104          if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
   2105             /* We're hosed.  We have to stop. */
   2106             scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
   2107          }
   2108          UInt hi = out->usedTS++;
   2109          out->ts[hi].thrid = here->thrid;
   2110          out->ts[hi].tym   = here->tym + 1;
   2111          i++;
   2112          found = True;
   2113       } else { /* case (b) */
   2114          UInt hi = out->usedTS++;
   2115          out->ts[hi].thrid = me_thrid;
   2116          out->ts[hi].tym   = 1;
   2117       }
   2118       /* And copy any remaining entries. */
   2119       for (/*keepgoing*/; i < n; i++) {
   2120          ScalarTS* here2 = &vts->ts[i];
   2121          UInt hi = out->usedTS++;
   2122          out->ts[hi] = *here2;
   2123       }
   2124    }
   2125 
   2126    tl_assert(is_sane_VTS(out));
   2127    tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
   2128    tl_assert(out->usedTS <= out->sizeTS);
   2129 }
   2130 
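         /* Worked example (illustrative thrids, written thrid:tym):
            ticking me == 1026 in [1024:5, 1026:2] gives [1024:5, 1026:3]
            (case (c)); ticking me == 1025 in the same VTS gives
            [1024:5, 1025:1, 1026:2] (case (b)), growing the vector by
            one entry. */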
   2131 
    2132 /* Build in 'out' a VTS which is the join (max) of the 2 args.
   2133    Neither arg is modified.
   2134 */
   2135 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
   2136 {
   2137    UInt     ia, ib, useda, usedb;
   2138    ULong    tyma, tymb, tymMax;
   2139    ThrID    thrid;
   2140    UInt     ncommon = 0;
   2141 
   2142    stats__vts__join++;
   2143 
   2144    tl_assert(a);
   2145    tl_assert(b);
   2146    useda = a->usedTS;
   2147    usedb = b->usedTS;
   2148 
   2149    tl_assert(out);
   2150    tl_assert(out->usedTS == 0);
   2151    /* overly conservative test, but doing better involves comparing
   2152       the two VTSs, which we don't want to do at this point. */
   2153    if (useda + usedb >= ThrID_MAX_VALID)
   2154       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   2155    tl_assert(out->sizeTS >= useda + usedb);
   2156 
   2157    ia = ib = 0;
   2158 
   2159    while (1) {
   2160 
   2161       /* This logic is to enumerate triples (thrid, tyma, tymb) drawn
   2162          from a and b in order, where thrid is the next ThrID
   2163          occurring in either a or b, and tyma/b are the relevant
   2164          scalar timestamps, taking into account implicit zeroes. */
   2165       tl_assert(ia >= 0 && ia <= useda);
   2166       tl_assert(ib >= 0 && ib <= usedb);
   2167 
   2168       if        (ia == useda && ib == usedb) {
   2169          /* both empty - done */
   2170          break;
   2171 
   2172       } else if (ia == useda && ib != usedb) {
   2173          /* a empty, use up b */
   2174          ScalarTS* tmpb = &b->ts[ib];
   2175          thrid = tmpb->thrid;
   2176          tyma  = 0;
   2177          tymb  = tmpb->tym;
   2178          ib++;
   2179 
   2180       } else if (ia != useda && ib == usedb) {
   2181          /* b empty, use up a */
   2182          ScalarTS* tmpa = &a->ts[ia];
   2183          thrid = tmpa->thrid;
   2184          tyma  = tmpa->tym;
   2185          tymb  = 0;
   2186          ia++;
   2187 
   2188       } else {
   2189          /* both not empty; extract lowest-ThrID'd triple */
   2190          ScalarTS* tmpa = &a->ts[ia];
   2191          ScalarTS* tmpb = &b->ts[ib];
   2192          if (tmpa->thrid < tmpb->thrid) {
   2193             /* a has the lowest unconsidered ThrID */
   2194             thrid = tmpa->thrid;
   2195             tyma  = tmpa->tym;
   2196             tymb  = 0;
   2197             ia++;
   2198          } else if (tmpa->thrid > tmpb->thrid) {
   2199             /* b has the lowest unconsidered ThrID */
   2200             thrid = tmpb->thrid;
   2201             tyma  = 0;
   2202             tymb  = tmpb->tym;
   2203             ib++;
   2204          } else {
   2205             /* they both next mention the same ThrID */
   2206             tl_assert(tmpa->thrid == tmpb->thrid);
   2207             thrid = tmpa->thrid; /* == tmpb->thrid */
   2208             tyma  = tmpa->tym;
   2209             tymb  = tmpb->tym;
   2210             ia++;
   2211             ib++;
   2212             ncommon++;
   2213          }
   2214       }
   2215 
   2216       /* having laboriously determined (thr, tyma, tymb), do something
   2217          useful with it. */
   2218       tymMax = tyma > tymb ? tyma : tymb;
   2219       if (tymMax > 0) {
   2220          UInt hi = out->usedTS++;
   2221          out->ts[hi].thrid = thrid;
   2222          out->ts[hi].tym   = tymMax;
   2223       }
   2224 
   2225    }
   2226 
   2227    tl_assert(is_sane_VTS(out));
   2228    tl_assert(out->usedTS <= out->sizeTS);
   2229    tl_assert(out->usedTS == useda + usedb - ncommon);
   2230 }
   2231 
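         /* Worked example (illustrative thrids, written thrid:tym):
            join([1024:5, 1026:2], [1024:3, 1025:7])
               == [1024:5, 1025:7, 1026:2]
            -- a per-thread maximum, with absent entries treated as
            implicit zeroes. */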
   2232 
   2233 /* Determine if 'a' <= 'b', in the partial ordering.  Returns zero if
   2234    they are, or the first ThrID for which they are not (no valid ThrID
   2235    has the value zero).  This rather strange convention is used
   2236    because sometimes we want to know the actual index at which they
   2237    first differ. */
   2238 static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
   2239 {
   2240    Word  ia, ib, useda, usedb;
   2241    ULong tyma, tymb;
   2242 
   2243    stats__vts__cmpLEQ++;
   2244 
   2245    tl_assert(a);
   2246    tl_assert(b);
   2247    useda = a->usedTS;
   2248    usedb = b->usedTS;
   2249 
   2250    ia = ib = 0;
   2251 
   2252    while (1) {
   2253 
    2254       /* This logic is to enumerate pairs (tyma, tymb) drawn
    2255          from a and b in order, where tyma/b are the relevant
   2256          scalar timestamps, taking into account implicit zeroes. */
   2257       ThrID thrid;
   2258 
   2259       tl_assert(ia >= 0 && ia <= useda);
   2260       tl_assert(ib >= 0 && ib <= usedb);
   2261 
   2262       if        (ia == useda && ib == usedb) {
   2263          /* both empty - done */
   2264          break;
   2265 
   2266       } else if (ia == useda && ib != usedb) {
   2267          /* a empty, use up b */
   2268          ScalarTS* tmpb = &b->ts[ib];
   2269          tyma  = 0;
   2270          tymb  = tmpb->tym;
   2271          thrid = tmpb->thrid;
   2272          ib++;
   2273 
   2274       } else if (ia != useda && ib == usedb) {
   2275          /* b empty, use up a */
   2276          ScalarTS* tmpa = &a->ts[ia];
   2277          tyma  = tmpa->tym;
   2278          thrid = tmpa->thrid;
   2279          tymb  = 0;
   2280          ia++;
   2281 
   2282       } else {
   2283          /* both not empty; extract lowest-ThrID'd triple */
   2284          ScalarTS* tmpa = &a->ts[ia];
   2285          ScalarTS* tmpb = &b->ts[ib];
   2286          if (tmpa->thrid < tmpb->thrid) {
   2287             /* a has the lowest unconsidered ThrID */
   2288             tyma  = tmpa->tym;
   2289             thrid = tmpa->thrid;
   2290             tymb  = 0;
   2291             ia++;
   2292          }
   2293          else
   2294          if (tmpa->thrid > tmpb->thrid) {
   2295             /* b has the lowest unconsidered ThrID */
   2296             tyma  = 0;
   2297             tymb  = tmpb->tym;
   2298             thrid = tmpb->thrid;
   2299             ib++;
   2300          } else {
   2301             /* they both next mention the same ThrID */
   2302             tl_assert(tmpa->thrid == tmpb->thrid);
   2303             tyma  = tmpa->tym;
   2304             thrid = tmpa->thrid;
   2305             tymb  = tmpb->tym;
   2306             ia++;
   2307             ib++;
   2308          }
   2309       }
   2310 
   2311       /* having laboriously determined (tyma, tymb), do something
   2312          useful with it. */
   2313       if (tyma > tymb) {
   2314          /* not LEQ at this index.  Quit, since the answer is
   2315             determined already. */
   2316          tl_assert(thrid >= 1024);
   2317          return thrid;
   2318       }
   2319    }
   2320 
   2321    return 0; /* all points are LEQ => return an invalid ThrID */
   2322 }
   2323 
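         /* Worked example (illustrative thrids, written thrid:tym): for
            a == [1024:2] and b == [1024:3, 1025:1], VTS__cmpLEQ(a,b)
            returns 0, since a's timestamp is <= b's at every index; but
            VTS__cmpLEQ(b,a) returns 1024, the first thrid at which b's
            timestamp exceeds a's. */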
   2324 
   2325 /* Compute an arbitrary structural (total) ordering on the two args,
   2326    based on their VCs, so they can be looked up in a table, tree, etc.
   2327    Returns -1, 0 or 1.  (really just 'deriving Ord' :-) This can be
    2328    performance critical so there is some effort expended to make it as
   2329    fast as possible.
   2330 */
   2331 Word VTS__cmp_structural ( VTS* a, VTS* b )
   2332 {
   2333    /* We just need to generate an arbitrary total ordering based on
    2334       a->ts and b->ts.  Preferably do it in a way which detects likely
   2335       differences relatively quickly. */
   2336    Word     i;
   2337    Word     useda = 0,    usedb = 0;
   2338    ScalarTS *ctsa = NULL, *ctsb = NULL;
   2339 
   2340    stats__vts__cmp_structural++;
   2341 
   2342    tl_assert(a);
   2343    tl_assert(b);
   2344 
   2345    ctsa = &a->ts[0]; useda = a->usedTS;
   2346    ctsb = &b->ts[0]; usedb = b->usedTS;
   2347 
   2348    if (LIKELY(useda == usedb)) {
   2349       ScalarTS *tmpa = NULL, *tmpb = NULL;
   2350       stats__vts__cmp_structural_slow++;
   2351       /* Same length vectors.  Find the first difference, if any, as
   2352          fast as possible. */
   2353       for (i = 0; i < useda; i++) {
   2354          tmpa = &ctsa[i];
   2355          tmpb = &ctsb[i];
   2356          if (LIKELY(tmpa->tym == tmpb->tym
   2357                     && tmpa->thrid == tmpb->thrid))
   2358             continue;
   2359          else
   2360             break;
   2361       }
   2362       if (UNLIKELY(i == useda)) {
   2363          /* They're identical. */
   2364          return 0;
   2365       } else {
   2366          tl_assert(i >= 0 && i < useda);
   2367          if (tmpa->tym < tmpb->tym) return -1;
   2368          if (tmpa->tym > tmpb->tym) return 1;
   2369          if (tmpa->thrid < tmpb->thrid) return -1;
   2370          if (tmpa->thrid > tmpb->thrid) return 1;
   2371          /* we just established them as non-identical, hence: */
   2372       }
   2373       /*NOTREACHED*/
   2374       tl_assert(0);
   2375    }
   2376 
   2377    if (useda < usedb) return -1;
   2378    if (useda > usedb) return 1;
   2379    /*NOTREACHED*/
   2380    tl_assert(0);
   2381 }
   2382 
   2383 
   2384 /* Debugging only.  Display the given VTS in the buffer.
   2385 */
   2386 void VTS__show ( HChar* buf, Int nBuf, VTS* vts )
   2387 {
   2388    ScalarTS* st;
   2389    HChar     unit[64];
   2390    Word      i, n;
   2391    Int       avail = nBuf;
   2392    tl_assert(vts && vts->ts);
   2393    tl_assert(nBuf > 16);
   2394    buf[0] = '[';
   2395    buf[1] = 0;
   2396    n =  vts->usedTS;
   2397    for (i = 0; i < n; i++) {
   2398       tl_assert(avail >= 40);
   2399       st = &vts->ts[i];
   2400       VG_(memset)(unit, 0, sizeof(unit));
   2401       VG_(sprintf)(unit, i < n-1 ? "%u:%llu " : "%u:%llu",
   2402                          st->thrid, (ULong)st->tym);
   2403       if (avail < VG_(strlen)(unit) + 40/*let's say*/) {
   2404          VG_(strcat)(buf, " ...]");
   2405          buf[nBuf-1] = 0;
   2406          return;
   2407       }
   2408       VG_(strcat)(buf, unit);
   2409       avail -= VG_(strlen)(unit);
   2410    }
   2411    VG_(strcat)(buf, "]");
   2412    buf[nBuf-1] = 0;
   2413 }
   2414 
   2415 
   2416 /* Debugging only.  Return vts[index], so to speak.
   2417 */
   2418 ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
   2419 {
   2420    UWord i, n;
   2421    ThrID idx_thrid = Thr__to_ThrID(idx);
   2422    stats__vts__indexat_slow++;
   2423    tl_assert(vts && vts->ts);
   2424    n = vts->usedTS;
   2425    for (i = 0; i < n; i++) {
   2426       ScalarTS* st = &vts->ts[i];
   2427       if (st->thrid == idx_thrid)
   2428          return st->tym;
   2429    }
   2430    return 0;
   2431 }
   2432 
   2433 
   2434 /* See comment on prototype above.
   2435 */
   2436 static void VTS__declare_thread_very_dead ( Thr* thr )
   2437 {
   2438    if (0) VG_(printf)("VTQ:  tae %p\n", thr);
   2439 
   2440    tl_assert(thr->llexit_done);
   2441    tl_assert(thr->joinedwith_done);
   2442 
   2443    ThrID nyu;
   2444    nyu = Thr__to_ThrID(thr);
   2445    VG_(addToXA)( verydead_thread_table, &nyu );
   2446 
   2447    /* We can only get here if we're assured that we'll never again
   2448       need to look at this thread's ::viR or ::viW.  Set them to
   2449       VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
   2450       mostly so that we don't wind up pruning them (as that would be
   2451       nonsensical: the only interesting ScalarTS entry for a dead
    2452       thread is its own index, and the pruning will remove that). */
   2453    VtsID__rcdec(thr->viR);
   2454    VtsID__rcdec(thr->viW);
   2455    thr->viR = VtsID_INVALID;
   2456    thr->viW = VtsID_INVALID;
   2457 }
   2458 
   2459 
   2460 /////////////////////////////////////////////////////////////////
   2461 /////////////////////////////////////////////////////////////////
   2462 //                                                             //
   2463 // SECTION END vts primitives                                  //
   2464 //                                                             //
   2465 /////////////////////////////////////////////////////////////////
   2466 /////////////////////////////////////////////////////////////////
   2467 
   2468 
   2469 
   2470 /////////////////////////////////////////////////////////////////
   2471 /////////////////////////////////////////////////////////////////
   2472 //                                                             //
   2473 // SECTION BEGIN main library                                  //
   2474 //                                                             //
   2475 /////////////////////////////////////////////////////////////////
   2476 /////////////////////////////////////////////////////////////////
   2477 
   2478 
   2479 /////////////////////////////////////////////////////////
   2480 //                                                     //
   2481 // VTS set                                             //
   2482 //                                                     //
   2483 /////////////////////////////////////////////////////////
   2484 
   2485 static WordFM* /* WordFM VTS* void */ vts_set = NULL;
   2486 
   2487 static void vts_set_init ( void )
   2488 {
   2489    tl_assert(!vts_set);
   2490    vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
   2491                          HG_(free),
   2492                          (Word(*)(UWord,UWord))VTS__cmp_structural );
   2493    tl_assert(vts_set);
   2494 }
   2495 
   2496 /* Given a VTS, look in vts_set to see if we already have a
   2497    structurally identical one.  If yes, return the pair (True, pointer
   2498    to the existing one).  If no, clone this one, add the clone to the
   2499    set, and return (False, pointer to the clone). */
   2500 static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
   2501 {
   2502    UWord keyW, valW;
   2503    stats__vts_set__focaa++;
   2504    tl_assert(cand->id == VtsID_INVALID);
   2505    /* lookup cand (by value) */
   2506    if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
   2507       /* found it */
   2508       tl_assert(valW == 0);
   2509       /* if this fails, cand (by ref) was already present (!) */
   2510       tl_assert(keyW != (UWord)cand);
   2511       *res = (VTS*)keyW;
   2512       return True;
   2513    } else {
   2514       /* not present.  Clone, add and return address of clone. */
   2515       stats__vts_set__focaa_a++;
   2516       VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
   2517       tl_assert(clone != cand);
   2518       VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
   2519       *res = clone;
   2520       return False;
   2521    }
   2522 }
   2523 
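         /* Design note: this is the classic interning (hash-consing)
            pattern.  Since vts_set keeps VTSs unique by value,
            structural equality of interned VTSs reduces to pointer
            equality, and each distinct VTS is stored only once, however
            many VtsIDs and SVals refer to it. */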
   2524 
   2525 /////////////////////////////////////////////////////////
   2526 //                                                     //
   2527 // VTS table                                           //
   2528 //                                                     //
   2529 /////////////////////////////////////////////////////////
   2530 
   2531 static void VtsID__invalidate_caches ( void ); /* fwds */
   2532 
   2533 /* A type to hold VTS table entries.  Invariants:
   2534    If .vts == NULL, then this entry is not in use, so:
   2535    - .rc == 0
   2536    - this entry is on the freelist (unfortunately, does not imply
   2537      any constraints on value for .freelink)
   2538    If .vts != NULL, then this entry is in use:
   2539    - .vts is findable in vts_set
   2540    - .vts->id == this entry number
   2541    - no specific value for .rc (even 0 is OK)
   2542    - this entry is not on freelist, so .freelink == VtsID_INVALID
   2543 */
   2544 typedef
   2545    struct {
   2546       VTS*  vts;      /* vts, in vts_set */
   2547       UWord rc;       /* reference count - enough for entire aspace */
   2548       VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
   2549       VtsID remap;    /* used only during pruning */
   2550    }
   2551    VtsTE;
   2552 
   2553 /* The VTS table. */
   2554 static XArray* /* of VtsTE */ vts_tab = NULL;
   2555 
   2556 /* An index into the VTS table, indicating the start of the list of
   2557    free (available for use) entries.  If the list is empty, this is
   2558    VtsID_INVALID. */
   2559 static VtsID vts_tab_freelist = VtsID_INVALID;
   2560 
   2561 /* Do a GC of vts_tab when the freelist becomes empty AND the size of
   2562    vts_tab equals or exceeds this size.  After GC, the value here is
   2563    set appropriately so as to check for the next GC point. */
   2564 static Word vts_next_GC_at = 1000;
   2565 
   2566 static void vts_tab_init ( void )
   2567 {
   2568    vts_tab
   2569       = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
   2570                     HG_(free), sizeof(VtsTE) );
   2571    vts_tab_freelist
   2572       = VtsID_INVALID;
   2573    tl_assert(vts_tab);
   2574 }
   2575 
   2576 /* Add ii to the free list, checking that it looks out-of-use. */
   2577 static void add_to_free_list ( VtsID ii )
   2578 {
   2579    VtsTE* ie = VG_(indexXA)( vts_tab, ii );
   2580    tl_assert(ie->vts == NULL);
   2581    tl_assert(ie->rc == 0);
   2582    tl_assert(ie->freelink == VtsID_INVALID);
   2583    ie->freelink = vts_tab_freelist;
   2584    vts_tab_freelist = ii;
   2585 }
   2586 
   2587 /* Get an entry from the free list.  This will return VtsID_INVALID if
   2588    the free list is empty. */
   2589 static VtsID get_from_free_list ( void )
   2590 {
   2591    VtsID  ii;
   2592    VtsTE* ie;
   2593    if (vts_tab_freelist == VtsID_INVALID)
   2594       return VtsID_INVALID;
   2595    ii = vts_tab_freelist;
   2596    ie = VG_(indexXA)( vts_tab, ii );
   2597    tl_assert(ie->vts == NULL);
   2598    tl_assert(ie->rc == 0);
   2599    vts_tab_freelist = ie->freelink;
   2600    return ii;
   2601 }
   2602 
   2603 /* Produce a new VtsID that can be used, either by getting it from
   2604    the freelist, or, if that is empty, by expanding vts_tab. */
   2605 static VtsID get_new_VtsID ( void )
   2606 {
   2607    VtsID ii;
   2608    VtsTE te;
   2609    ii = get_from_free_list();
   2610    if (ii != VtsID_INVALID)
   2611       return ii;
   2612    te.vts = NULL;
   2613    te.rc = 0;
   2614    te.freelink = VtsID_INVALID;
   2615    te.remap    = VtsID_INVALID;
   2616    ii = (VtsID)VG_(addToXA)( vts_tab, &te );
   2617    return ii;
   2618 }
   2619 
   2620 
   2621 /* Indirect callback from lib_zsm. */
   2622 static void VtsID__rcinc ( VtsID ii )
   2623 {
   2624    VtsTE* ie;
   2625    /* VG_(indexXA) does a range check for us */
   2626    ie = VG_(indexXA)( vts_tab, ii );
   2627    tl_assert(ie->vts); /* else it's not in use */
   2628    tl_assert(ie->rc < ~0UL); /* else we can't continue */
   2629    tl_assert(ie->vts->id == ii);
   2630    ie->rc++;
   2631 }
   2632 
   2633 /* Indirect callback from lib_zsm. */
   2634 static void VtsID__rcdec ( VtsID ii )
   2635 {
   2636    VtsTE* ie;
   2637    /* VG_(indexXA) does a range check for us */
   2638    ie = VG_(indexXA)( vts_tab, ii );
   2639    tl_assert(ie->vts); /* else it's not in use */
   2640    tl_assert(ie->rc > 0); /* else RC snafu */
   2641    tl_assert(ie->vts->id == ii);
   2642    ie->rc--;
   2643 }
   2644 
   2645 
   2646 /* Look up 'cand' in our collection of VTSs.  If present, return the
   2647    VtsID for the pre-existing version.  If not present, clone it, add
   2648    the clone to both vts_tab and vts_set, allocate a fresh VtsID for
   2649    it, and return that. */
   2650 static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
   2651 {
   2652    VTS* in_tab = NULL;
   2653    tl_assert(cand->id == VtsID_INVALID);
   2654    Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
   2655    tl_assert(in_tab);
   2656    if (already_have) {
   2657       /* We already have a copy of 'cand'.  Use that. */
   2658       VtsTE* ie;
   2659       tl_assert(in_tab->id != VtsID_INVALID);
   2660       ie = VG_(indexXA)( vts_tab, in_tab->id );
   2661       tl_assert(ie->vts == in_tab);
   2662       return in_tab->id;
   2663    } else {
   2664       VtsID  ii = get_new_VtsID();
   2665       VtsTE* ie = VG_(indexXA)( vts_tab, ii );
   2666       ie->vts = in_tab;
   2667       ie->rc = 0;
   2668       ie->freelink = VtsID_INVALID;
   2669       in_tab->id = ii;
   2670       return ii;
   2671    }
   2672 }
   2673 
   2674 
   2675 static void show_vts_stats ( HChar* caller )
   2676 {
   2677    UWord nSet, nTab, nLive;
   2678    ULong totrc;
   2679    UWord n, i;
   2680    nSet = VG_(sizeFM)( vts_set );
   2681    nTab = VG_(sizeXA)( vts_tab );
   2682    totrc = 0;
   2683    nLive = 0;
   2684    n = VG_(sizeXA)( vts_tab );
   2685    for (i = 0; i < n; i++) {
   2686       VtsTE* ie = VG_(indexXA)( vts_tab, i );
   2687       if (ie->vts) {
   2688          nLive++;
   2689          totrc += (ULong)ie->rc;
   2690       } else {
   2691          tl_assert(ie->rc == 0);
   2692       }
   2693    }
   2694    VG_(printf)("  show_vts_stats %s\n", caller);
   2695    VG_(printf)("    vts_tab size %4lu\n", nTab);
   2696    VG_(printf)("    vts_tab live %4lu\n", nLive);
   2697    VG_(printf)("    vts_set size %4lu\n", nSet);
   2698    VG_(printf)("        total rc %4llu\n", totrc);
   2699 }
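
/* (Added note) One would expect nSet == nLive on every call, since
   vts_set holds exactly those VTSs that still have a live
   (.vts != NULL) entry in vts_tab. */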
   2700 
   2701 
   2702 /* --- Helpers for VtsID pruning --- */
   2703 
   2704 static
   2705 void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
   2706                    /*MOD*/XArray* /* of VtsTE */ new_tab,
   2707                    VtsID* ii )
   2708 {
   2709    VtsTE *old_te, *new_te;
   2710    VtsID old_id, new_id;
   2711    /* We're relying here on VG_(indexXA)'s range checking to assert on
   2712       any stupid values, in particular *ii == VtsID_INVALID. */
   2713    old_id = *ii;
   2714    old_te = VG_(indexXA)( old_tab, old_id );
   2715    old_te->rc--;
   2716    new_id = old_te->remap;
   2717    new_te = VG_(indexXA)( new_tab, new_id );
   2718    new_te->rc++;
   2719    *ii = new_id;
   2720 }
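
/* (Added note) remap_VtsID both rewrites *ii and transfers one
   reference: the old entry's .rc is decremented and the new entry's
   .rc is incremented.  Hence, once every VtsID in the system has been
   remapped, all old-table refcounts should have dropped to zero. */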
   2721 
   2722 static
   2723 void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
   2724                             /*MOD*/XArray* /* of VtsTE */ new_tab,
   2725                             SVal* s )
   2726 {
   2727    SVal old_sv, new_sv;
   2728    old_sv = *s;
   2729    if (SVal__isC(old_sv)) {
   2730       VtsID rMin, wMin;
   2731       rMin = SVal__unC_Rmin(old_sv);
   2732       wMin = SVal__unC_Wmin(old_sv);
   2733       remap_VtsID( old_tab, new_tab, &rMin );
   2734       remap_VtsID( old_tab, new_tab, &wMin );
   2735       new_sv = SVal__mkC( rMin, wMin );
   2736       *s = new_sv;
   2737    }
   2738 }
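
/* (Added note) Only SVal_C values embed VtsIDs (the Rmin/Wmin pair);
   all other SVal kinds carry no VtsIDs and are deliberately left
   unchanged. */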
   2739 
   2740 
   2741 /* NOT TO BE CALLED FROM WITHIN libzsm. */
   2742 __attribute__((noinline))
   2743 static void vts_tab__do_GC ( Bool show_stats )
   2744 {
   2745    UWord i, nTab, nLive, nFreed;
   2746 
   2747    /* ---------- BEGIN VTS GC ---------- */
   2748    /* Check that a GC is actually needed, i.e. the free list is empty. */
   2749    tl_assert(vts_tab_freelist == VtsID_INVALID);
   2750 
   2751    /* empty the caches for partial order checks and binary joins.  We
   2752       could do better and prune out the entries to be deleted, but it
   2753       ain't worth the hassle. */
   2754    VtsID__invalidate_caches();
   2755 
   2756    /* First, make the reference counts up to date. */
   2757    zsm_flush_cache();
   2758 
   2759    nTab = VG_(sizeXA)( vts_tab );
   2760 
   2761    if (show_stats) {
   2762       VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
   2763       show_vts_stats("before GC");
   2764    }
   2765 
   2766    /* Now we can inspect the entire vts_tab.  Any entries with zero
   2767       .rc fields are now no longer in use and can be put back on the
   2768       free list, removed from vts_set, and deleted. */
   2769    nFreed = 0;
   2770    for (i = 0; i < nTab; i++) {
   2771       Bool present;
   2772       UWord oldK = 0, oldV = 12345;
   2773       VtsTE* te = VG_(indexXA)( vts_tab, i );
   2774       if (te->vts == NULL) {
   2775          tl_assert(te->rc == 0);
   2776          continue; /* already on the free list (presumably) */
   2777       }
   2778       if (te->rc > 0)
   2779          continue; /* in use */
   2780       /* Ok, we got one we can free. */
   2781       tl_assert(te->vts->id == i);
   2782       /* first, remove it from vts_set. */
   2783       present = VG_(delFromFM)( vts_set,
   2784                                 &oldK, &oldV, (UWord)te->vts );
   2785       tl_assert(present); /* else it isn't in vts_set ?! */
   2786       tl_assert(oldV == 0); /* no info stored in vts_set val fields */
   2787       tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
   2788       /* now free the VTS itself */
   2789       VTS__delete(te->vts);
   2790       te->vts = NULL;
   2791       /* and finally put this entry on the free list */
   2792       tl_assert(te->freelink == VtsID_INVALID); /* can't already be on it */
   2793       add_to_free_list( i );
   2794       nFreed++;
   2795    }
   2796 
   2797    /* Now figure out when the next GC should be.  We'll allow the
   2798       number of VTSs to double before GCing again.  Except of course
   2799       that since we can't (or, at least, don't) shrink vts_tab, we
   2800       can't set the threshold value smaller than it. */
   2801    tl_assert(nFreed <= nTab);
   2802    nLive = nTab - nFreed;
   2803    tl_assert(nLive <= nTab); /* nLive is unsigned, so >= 0 holds trivially */
   2804    vts_next_GC_at = 2 * nLive;
   2805    if (vts_next_GC_at < nTab)
   2806       vts_next_GC_at = nTab;
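   /* Worked example (added): if nTab == 1000 and nFreed == 900, then
      nLive == 100 and 2 * nLive == 200 < nTab, so the next GC fires
      only once the table again reaches 1000 entries.  If instead
      nFreed == 100, then nLive == 900 and the next GC fires at 1800
      entries. */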
   2807 
   2808    if (show_stats) {
   2809       show_vts_stats("after GC");
   2810       VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
   2811    }
   2812 
   2813    if (VG_(clo_stats)) {
   2814       static UInt ctr = 1;
   2815       tl_assert(nTab > 0);
   2816       VG_(message)(Vg_DebugMsg,
   2817                   "libhb: VTS GC: #%u  old size %lu  live %lu  (%2llu%%)\n",
   2818                   ctr++, nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
   2819    }
   2820    /* ---------- END VTS GC ---------- */
   2821 
   2822    /* Decide whether to do VTS pruning.  We have one of three
   2823       settings. */
   2824    static UInt pruning_auto_ctr = 0; /* do not make non-static */
   2825 
   2826    Bool do_pruning = False;
   2827    switch (HG_(clo_vts_pruning)) {
   2828       case 0: /* never */
   2829          break;
   2830       case 1: /* auto */
   2831          do_pruning = (++pruning_auto_ctr % 5) == 0;
   2832          break;
   2833       case 2: /* always */
   2834          do_pruning = True;
   2835          break;
   2836       default:
   2837          tl_assert(0);
   2838    }
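   /* (Added note) So with the "auto" setting, pruning happens on
      every 5th GC; "never" and "always" behave as named. */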
   2839 
   2840    /* The rest of this routine only handles pruning, so we can
   2841       quit at this point if it is not to be done. */
   2842    if (!do_pruning)
   2843       return;
   2844 
   2845    /* ---------- BEGIN VTS PRUNING ---------- */
   2846    /* We begin by sorting the backing table on its ThrID values, so as
   2847       to (1) check they are unique [else something has gone wrong,
   2848       since it means we must have seen some Thr* exiting more than
   2849       once, which can't happen], and (2) so that we can quickly look
   2850       up the dead-thread entries as we work through the VTSs. */
   2851    VG_(sortXA)( verydead_thread_table );
   2852    /* Sanity check: the ThrID entries must be unique and sorted. */
   2853    UWord nBT = VG_(sizeXA)( verydead_thread_table );
   2854    if (nBT > 0) {
   2855       ThrID thrid1, thrid2;
   2856       thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, 0 );
   2857       for (i = 1; i < nBT; i++) {
   2858          thrid1 = thrid2;
   2859          thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, i );
   2860          tl_assert(thrid1 < thrid2);
   2861       }
   2862    }
   2863    /* Ok, so the dead thread table has unique and in-order keys. */
   2864 
   2865    /* We will run through the old table, and create a new table and
   2866       set, at the same time setting the .remap entries in the old
   2867       table to point to the new entries.  Then, visit every VtsID in
   2868       the system, and replace all of them with new ones, using the
   2869       .remap entries in the old table.  Finally, we can delete the old
   2870       table and set. */
   2871 
   2872    XArray* /* of VtsTE */ new_tab
   2873       = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
   2874                     HG_(free), sizeof(VtsTE) );
   2875 
   2876    /* WordFM VTS* void */
   2877    WordFM* new_set
   2878       = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
   2879                     HG_(free),
   2880                     (Word(*)(UWord,UWord))VTS__cmp_structural );
   2881 
   2882    /* Visit each old VTS.  For each one:
   2883 
   2884       * make a pruned version
   2885 
   2886       * search new_set for the pruned version, yielding either
   2887         Nothing (not present) or the new VtsID for it.
   2888 
   2889       * if not present, allocate a new VtsID for it, insert (pruned
   2890         VTS, new VtsID) in the tree, and set
   2891         remap_table[old VtsID] = new VtsID.
   2892 
   2893       * if present, set remap_table[old VtsID] = new VtsID, where
   2894         new VtsID was determined by the tree lookup.  Then free up
   2895         the clone.
   2896    */
   2897 
   2898    UWord nBeforePruning = 0, nAfterPruning = 0;
   2899    UWord nSTSsBefore = 0, nSTSsAfter = 0;
   2900    VtsID new_VtsID_ctr = 0;
   2901 
   2902    for (i = 0; i < nTab; i++) {
   2903 
   2904       /* For each old VTS .. */
   2905       VtsTE* old_te  = VG_(indexXA)( vts_tab, i );
   2906       VTS*   old_vts = old_te->vts;
   2907       tl_assert(old_te->remap == VtsID_INVALID);
   2908 
   2909       /* Skip it if not in use */
   2910       if (old_te->rc == 0) {
   2911          tl_assert(old_vts == NULL);
   2912          continue;
   2913       }
   2914       tl_assert(old_vts != NULL);
   2915       tl_assert(old_vts->id == i);
   2916       tl_assert(old_vts->ts != NULL);
   2917 
   2918       /* It is in use. Make a pruned version. */
   2919       nBeforePruning++;
   2920       nSTSsBefore += old_vts->usedTS;
   2921       VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
   2922                                    old_vts, verydead_thread_table);
   2923       tl_assert(new_vts->sizeTS == new_vts->usedTS);
   2924       tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
   2925                 == 0x0ddC0ffeeBadF00dULL);
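      /* (Added note) The assertion above reads the 8 bytes just past
         the last used ScalarTS and compares them against a sentinel
         value, presumably planted by the VTS allocator, in order to
         catch any buffer overrun by VTS__subtract. */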
   2926 
   2927       /* Get rid of the old VTS and the tree entry.  It's a bit more
   2928          complex to incrementally delete the VTSs now than to nuke
   2929          them all after we're done, but the upside is that we don't
   2930          wind up temporarily storing potentially two complete copies
   2931          of each VTS and hence spiking memory use. */
   2932       UWord oldK = 0, oldV = 12345;
   2933       Bool  present = VG_(delFromFM)( vts_set,
   2934                                       &oldK, &oldV, (UWord)old_vts );
   2935       tl_assert(present); /* else it isn't in vts_set ?! */
   2936       tl_assert(oldV == 0); /* no info stored in vts_set val fields */
   2937       tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
   2938       /* now free the VTS itself */
   2939       VTS__delete(old_vts);
   2940       old_te->vts = NULL;
   2941       old_vts = NULL;
   2942 
   2943       /* NO MENTIONS of old_vts allowed beyond this point. */
   2944 
   2945       /* Ok, we have the pruned copy in new_vts.  See if a
   2946          structurally identical version is already present in new_set.
   2947          If so, delete the one we just made and move on; if not, add
   2948          it. */
   2949       VTS*  identical_version = NULL;
   2950       UWord valW = 12345;
   2951       if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
   2952                         (UWord)new_vts)) {
   2953          // already have it
   2954          tl_assert(valW == 0);
   2955          tl_assert(identical_version != NULL);
   2956          tl_assert(identical_version != new_vts);
   2957          VTS__delete(new_vts);
   2958          new_vts = identical_version;
   2959          tl_assert(new_vts->id != VtsID_INVALID);
   2960       } else {
   2961          tl_assert(valW == 12345);
   2962          tl_assert(identical_version == NULL);
   2963          new_vts->id = new_VtsID_ctr++;
   2964          Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
   2965          tl_assert(!b);
   2966          VtsTE new_te;
   2967          new_te.vts      = new_vts;
   2968          new_te.rc       = 0;
   2969          new_te.freelink = VtsID_INVALID;
   2970          new_te.remap    = VtsID_INVALID;
   2971          Word j = VG_(addToXA)( new_tab, &new_te );
   2972          tl_assert(j <= i);
   2973          tl_assert(j == new_VtsID_ctr - 1);
   2974          // stats
   2975          nAfterPruning++;
   2976          nSTSsAfter += new_vts->usedTS;
   2977       }
   2978       old_te->remap = new_vts->id;
   2979 
   2980    } /* for (i = 0; i < nTab; i++) */
   2981 
   2982    /* At this point, we have:
   2983       * the old VTS table, with its .remap entries set,
   2984         and with all .vts == NULL.
   2985       * the old VTS tree should be empty, since it and the old VTSs
   2986         it contained have been incrementally deleted as we worked
   2987         through the old table.
   2988       * the new VTS table, with all .rc == 0, all .freelink and .remap
   2989         == VtsID_INVALID.
   2990       * the new VTS tree.
   2991    */
   2992    tl_assert( VG_(sizeFM)(vts_set) == 0 );
   2993 
   2994    /* Now actually apply the mapping. */
   2995    /* Visit all the VtsIDs in the entire system.  Where do we expect
   2996       to find them?
   2997       (a) in shadow memory -- the LineZs and LineFs
   2998       (b) in our collection of struct _Thrs.
   2999       (c) in our collection of struct _SOs.
   3000       Nowhere else, AFAICS.  Not in the zsm cache, because that just
   3001       got invalidated.
   3002 
   3003       Using the .remap fields in vts_tab, map each old VtsID to a new
   3004       VtsID.  For each old VtsID, dec its rc; and for each new one,
   3005       inc it.  This sets up the new refcounts, and it also gives a
   3006       cheap sanity check of the old ones: all old refcounts should be
   3007       zero after this operation.
   3008    */
   3009 
   3010    /* Do the mappings for (a) above: iterate over the Primary shadow
   3011       mem map (WordFM Addr SecMap*). */
   3012    UWord secmapW = 0;
   3013    VG_(initIterFM)( map_shmem );
   3014    while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
   3015       UWord   j;
   3016       SecMap* sm = (SecMap*)secmapW;
   3017       tl_assert(sm->magic == SecMap_MAGIC);
   3018       /* Deal with the LineZs */
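      /* (Added note) A LineZ holds at most 4 distinct SVals, in
         dict[0..3]; a line needing more distinct values is held in
         the full (LineF) representation instead, which is flagged
         here by dict[0] == SVal_INVALID. */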
   3019       for (i = 0; i < N_SECMAP_ZLINES; i++) {
   3020          LineZ* lineZ = &sm->linesZ[i];
   3021          if (lineZ->dict[0] == SVal_INVALID)
   3022             continue; /* not in use -- data is in F rep instead */
   3023          for (j = 0; j < 4; j++)
   3024             remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
   3025       }
   3026       /* Deal with the LineFs */
   3027       for (i = 0; i < sm->linesF_size; i++) {
   3028          LineF* lineF = &sm->linesF[i];
   3029          if (!lineF->inUse)
   3030             continue;
   3031          for (j = 0; j < N_LINE_ARANGE; j++)
   3032             remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
   3033       }
   3034    }
   3035    VG_(doneIterFM)( map_shmem );
   3036 
   3037    /* Do the mappings for (b) above: visit our collection of struct
   3038       _Thrs. */
   3039    Thread* hgthread = get_admin_threads();
   3040    tl_assert(hgthread);
   3041    while (hgthread) {
   3042       Thr* hbthr = hgthread->hbthr;
   3043       tl_assert(hbthr);
   3044       /* Threads that are listed in the prunable set have their viR
   3045          and viW set to VtsID_INVALID, so we can't mess with them. */
   3046       if (hbthr->llexit_done && hbthr->joinedwith_done) {
   3047          tl_assert(hbthr->viR == VtsID_INVALID);
   3048          tl_assert(hbthr->viW == VtsID_INVALID);
   3049          hgthread = hgthread->admin;
   3050          continue;
   3051       }