      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- LibHB: a library for implementing and checking               ---*/
      4 /*--- the happens-before relationship in concurrent programs.      ---*/
      5 /*---                                                 libhb_main.c ---*/
      6 /*--------------------------------------------------------------------*/
      7 
      8 /*
      9    This file is part of LibHB, a library for implementing and checking
     10    the happens-before relationship in concurrent programs.
     11 
     12    Copyright (C) 2008-2011 OpenWorks Ltd
     13       info (at) open-works.co.uk
     14 
     15    This program is free software; you can redistribute it and/or
     16    modify it under the terms of the GNU General Public License as
     17    published by the Free Software Foundation; either version 2 of the
     18    License, or (at your option) any later version.
     19 
     20    This program is distributed in the hope that it will be useful, but
     21    WITHOUT ANY WARRANTY; without even the implied warranty of
     22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     23    General Public License for more details.
     24 
     25    You should have received a copy of the GNU General Public License
     26    along with this program; if not, write to the Free Software
     27    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     28    02111-1307, USA.
     29 
     30    The GNU General Public License is contained in the file COPYING.
     31 */
     32 
     33 #include "pub_tool_basics.h"
     34 #include "pub_tool_libcassert.h"
     35 #include "pub_tool_libcbase.h"
     36 #include "pub_tool_libcprint.h"
     37 #include "pub_tool_mallocfree.h"
     38 #include "pub_tool_wordfm.h"
     39 #include "pub_tool_sparsewa.h"
     40 #include "pub_tool_xarray.h"
     41 #include "pub_tool_oset.h"
     42 #include "pub_tool_threadstate.h"
     43 #include "pub_tool_aspacemgr.h"
     44 #include "pub_tool_execontext.h"
     45 #include "pub_tool_errormgr.h"
     46 #include "pub_tool_options.h"        // VG_(clo_stats)
     47 #include "hg_basics.h"
     48 #include "hg_wordset.h"
     49 #include "hg_lock_n_thread.h"
     50 #include "hg_errors.h"
     51 
     52 #include "libhb.h"
     53 
     54 
     55 /////////////////////////////////////////////////////////////////
     56 /////////////////////////////////////////////////////////////////
     57 //                                                             //
     58 // Debugging #defines                                          //
     59 //                                                             //
     60 /////////////////////////////////////////////////////////////////
     61 /////////////////////////////////////////////////////////////////
     62 
     63 /* Check the sanity of shadow values in the core memory state
     64    machine.  Change #if 0 to #if 1 to enable this. */
     65 #if 0
     66 #  define CHECK_MSM 1
     67 #else
     68 #  define CHECK_MSM 0
     69 #endif
     70 
     71 
     72 /* Check sanity (reference counts, etc) in the conflicting access
     73    machinery.  Change #if 0 to #if 1 to enable this. */
     74 #if 0
     75 #  define CHECK_CEM 1
     76 #else
     77 #  define CHECK_CEM 0
     78 #endif
     79 
     80 
     81 /* Check sanity in the compressed shadow memory machinery,
     82    particularly in its caching innards.  Unfortunately there's no
     83    almost-zero-cost way to make them selectable at run time.  Hence
     84    set the #if 0 to #if 1 and rebuild if you want them. */
     85 #if 0
     86 #  define CHECK_ZSM 1  /* do sanity-check CacheLine stuff */
     87 #  define inline __attribute__((noinline))
     88    /* probably want to ditch -fomit-frame-pointer too */
     89 #else
     90 #  define CHECK_ZSM 0   /* don't sanity-check CacheLine stuff */
     91 #endif
     92 
     93 
     94 /////////////////////////////////////////////////////////////////
     95 /////////////////////////////////////////////////////////////////
     96 //                                                             //
     97 // data decls: VtsID                                           //
     98 //                                                             //
     99 /////////////////////////////////////////////////////////////////
    100 /////////////////////////////////////////////////////////////////
    101 
    102 /* VtsIDs: Unique small-integer IDs for VTSs.  VtsIDs can't exceed 30
    103    bits, since they have to be packed into the lowest 30 bits of an
    104    SVal. */
    105 typedef  UInt  VtsID;
    106 #define VtsID_INVALID 0xFFFFFFFF
    107 
    108 
    109 
    110 /////////////////////////////////////////////////////////////////
    111 /////////////////////////////////////////////////////////////////
    112 //                                                             //
    113 // data decls: SVal                                            //
    114 //                                                             //
    115 /////////////////////////////////////////////////////////////////
    116 /////////////////////////////////////////////////////////////////
    117 
    118 typedef  ULong  SVal;
    119 
    120 /* This value has special significance to the implementation, and callers
    121    may not store it in the shadow memory. */
    122 #define SVal_INVALID (3ULL << 62)
    123 
    124 /* This is the default value for shadow memory.  Initially the shadow
    125    memory contains no accessible areas and so all reads produce this
     126    value.  TODO: make this caller-definable. */
    127 #define SVal_NOACCESS (2ULL << 62)
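/* Illustrative sketch only (the definitive encoding is given by
   SVal__mkC/SVal__isC etc, forward-declared below and defined later in
   this file): with the top two bits of an SVal acting as a tag, a
   "constrained" value can carry two 30-bit VtsIDs, for example the
   read-min clock and the write-min clock.  The two special values
   above simply use distinct tags with a zero payload.  The bit
   positions below are assumed, for illustration. */
#if 0
#define EXAMPLE_SVAL_TAG_C  (1ULL << 62)  /* assumed tag for "constrained" */
static inline SVal example_SVal_mkC ( VtsID rmini, VtsID wmini ) {
   /* both VtsIDs must fit in 30 bits, per the VtsID comment above */
   return EXAMPLE_SVAL_TAG_C
          | (((ULong)rmini & 0x3FFFFFFFULL) << 30)
          | ((ULong)wmini & 0x3FFFFFFFULL);
}
#endif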
    128 
    129 
    130 
    131 /////////////////////////////////////////////////////////////////
    132 /////////////////////////////////////////////////////////////////
    133 //                                                             //
    134 // data decls: ScalarTS                                        //
    135 //                                                             //
    136 /////////////////////////////////////////////////////////////////
    137 /////////////////////////////////////////////////////////////////
    138 
    139 /* Scalar Timestamp.  We have to store a lot of these, so there is
    140    some effort to make them as small as possible.  Logically they are
    141    a pair, (Thr*, ULong), but that takes 16 bytes on a 64-bit target.
    142    We pack it into 64 bits by representing the Thr* using a ThrID, a
    143    small integer (18 bits), and a 46 bit integer for the timestamp
     144    number.  The 46/18 split is arbitrary, but has the effect that
    145    Helgrind can only handle programs that create 2^18 or fewer threads
    146    over their entire lifetime, and have no more than 2^46 timestamp
    147    ticks (synchronisation operations on the same thread).
    148 
    149    This doesn't seem like much of a limitation.  2^46 ticks is
     150    7.04e+13, and if each tick (optimistically) takes the machine 1000
    151    cycles to process, then the minimum time to process that many ticks
    152    at a clock rate of 5 GHz is 162.9 days.  And that's doing nothing
    153    but VTS ticks, which isn't realistic.
    154 
    155    NB1: SCALARTS_N_THRBITS must be 29 or lower.  The obvious limit is
    156    32 since a ThrID is a UInt.  29 comes from the fact that
    157    'Thr_n_RCEC', which records information about old accesses, packs
    158    not only a ThrID but also 2+1 other bits (access size and
    159    writeness) in a UInt, hence limiting size to 32-(2+1) == 29.
    160 
    161    NB2: thrid values are issued upwards from 1024, and values less
    162    than that aren't valid.  This isn't per se necessary (any order
    163    will do, so long as they are unique), but it does help ensure they
    164    are less likely to get confused with the various other kinds of
    165    small-integer thread ids drifting around (eg, TId).  See also NB5.
    166 
    167    NB3: this probably also relies on the fact that Thr's are never
    168    deallocated -- they exist forever.  Hence the 1-1 mapping from
    169    Thr's to thrid values (set up in Thr__new) persists forever.
    170 
    171    NB4: temp_max_sized_VTS is allocated at startup and never freed.
     172    It is a maximum sized VTS, so has (1 << SCALARTS_N_THRBITS)
    173    ScalarTSs.  So we can't make SCALARTS_N_THRBITS too large without
    174    making the memory use for this go sky-high.  With
    175    SCALARTS_N_THRBITS at 18, it occupies 2MB of memory, which seems
    176    like an OK tradeoff.  If more than 256k threads need to be
    177    supported, we could change SCALARTS_N_THRBITS to 20, which would
    178    facilitate supporting 1 million threads at the cost of 8MB storage
    179    for temp_max_sized_VTS.
    180 
    181    NB5: the conflicting-map mechanism (Thr_n_RCEC, specifically) uses
    182    ThrID == 0 to denote an empty Thr_n_RCEC record.  So ThrID == 0
    183    must never be a valid ThrID.  Given NB2 that's OK.
    184 */
    185 #define SCALARTS_N_THRBITS 18  /* valid range: 11 to 29 inclusive */
    186 
    187 #define SCALARTS_N_TYMBITS (64 - SCALARTS_N_THRBITS)
    188 typedef
    189    struct {
    190       ThrID thrid : SCALARTS_N_THRBITS;
    191       ULong tym   : SCALARTS_N_TYMBITS;
    192    }
    193    ScalarTS;
    194 
    195 #define ThrID_MAX_VALID ((1 << SCALARTS_N_THRBITS) - 1)
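/* A minimal sanity sketch for the numbers quoted above (illustrative,
   not part of the library): the two bitfields pack into one 64-bit
   word, ThrID_MAX_VALID is 2^18 - 1, and a maximum-sized VTS (one
   ScalarTS per possible thread) costs 2^18 * 8 bytes = 2MB, as claimed
   for temp_max_sized_VTS. */
#if 0
static void example_check_ScalarTS_sizing ( void ) {
   tl_assert( sizeof(ScalarTS) == 8 );
   tl_assert( ThrID_MAX_VALID == (1 << 18) - 1 /* 262143 */ );
   tl_assert( ((ULong)1 << SCALARTS_N_THRBITS) * sizeof(ScalarTS)
              == 2ULL * 1024 * 1024 );
}
#endif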
    196 
    197 
    198 
    199 /////////////////////////////////////////////////////////////////
    200 /////////////////////////////////////////////////////////////////
    201 //                                                             //
    202 // data decls: Filter                                          //
    203 //                                                             //
    204 /////////////////////////////////////////////////////////////////
    205 /////////////////////////////////////////////////////////////////
    206 
    207 // baseline: 5, 9
    208 #define FI_LINE_SZB_LOG2  5
    209 #define FI_NUM_LINES_LOG2 10
    210 
    211 #define FI_LINE_SZB       (1 << FI_LINE_SZB_LOG2)
    212 #define FI_NUM_LINES      (1 << FI_NUM_LINES_LOG2)
    213 
    214 #define FI_TAG_MASK        (~(Addr)(FI_LINE_SZB - 1))
    215 #define FI_GET_TAG(_a)     ((_a) & FI_TAG_MASK)
    216 
    217 #define FI_GET_LINENO(_a)  ( ((_a) >> FI_LINE_SZB_LOG2) \
    218                              & (Addr)(FI_NUM_LINES-1) )
    219 
    220 
     221 /* Within each line, every naturally-aligned 8-byte chunk is mapped
     222    to a UShort, 2 bits per byte.  Regardless of the endianness of the
     223    underlying machine, bits 1 and 0 pertain to the lowest address and
     224    bits 15 and 14 to the highest address.
     225 
     226    In each bit pair, the higher-numbered bit is set if an R has been
     227    seen, so the actual layout is:
    228 
    229    15 14             ...  01 00
    230 
    231    R  W  for addr+7  ...  R  W  for addr+0
    232 
    233    So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
    234 */
    235 
     236 /* Tags are stored separately from the lines.  Each tag is an Addr:
     237    the base address of the corresponding line. */
    238 typedef
    239    struct {
    240       UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
    241    }
    242    FiLine;
    243 
    244 typedef
    245    struct {
    246       Addr   tags[FI_NUM_LINES];
    247       FiLine lines[FI_NUM_LINES];
    248    }
    249    Filter;
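/* Illustrative sketch only (the real filter update/query logic lives
   further down this file): how a one-byte access at address 'a' would
   locate its line, its UShort and its R/W bit pair, following the
   layout described above.  'example_filter_note_byte' is a
   hypothetical helper, not part of the library. */
#if 0
static inline void example_filter_note_byte ( Filter* fi, Addr a, Bool isW )
{
   Addr    tag    = FI_GET_TAG(a);
   UWord   lineno = FI_GET_LINENO(a);
   FiLine* line   = &fi->lines[lineno];
   UWord   loff   = (a - tag) >> 3;   /* which UShort within the line */
   UWord   byteno = a & 7;            /* which byte within that UShort */
   UShort  bit    = 1 << (2 * byteno + (isW ? 0 : 1)); /* W: even, R: odd */
   if (fi->tags[lineno] != tag) {
      /* slot currently holds a different line; a real implementation
         would have to evict/clear it first */
      fi->tags[lineno] = tag;
      VG_(memset)(line, 0, sizeof(*line));
   }
   line->u16s[loff] |= bit;
}
#endif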
    250 
    251 
    252 
    253 /////////////////////////////////////////////////////////////////
    254 /////////////////////////////////////////////////////////////////
    255 //                                                             //
    256 // data decls: Thr, ULong_n_EC                                 //
    257 //                                                             //
    258 /////////////////////////////////////////////////////////////////
    259 /////////////////////////////////////////////////////////////////
    260 
    261 // Records stacks for H1 history mechanism (DRD-style)
    262 typedef
    263    struct { ULong ull; ExeContext* ec; }
    264    ULong_n_EC;
    265 
    266 
    267 /* How many of the above records to collect for each thread?  Older
    268    ones are dumped when we run out of space.  62.5k requires 1MB per
    269    thread, since each ULong_n_EC record is 16 bytes long.  When more
    270    than N_KWs_N_STACKs_PER_THREAD are present, the older half are
    271    deleted to make space.  Hence in the worst case we will be able to
    272    produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
    273    Kw transitions (segments in this thread).  For the current setting
    274    that gives a guaranteed stack for at least the last 31.25k
    275    segments. */
    276 #define N_KWs_N_STACKs_PER_THREAD 62500
    277 
    278 
    279 struct _Thr {
    280    /* Current VTSs for this thread.  They change as we go along.  viR
    281       is the VTS to be used for reads, viW for writes.  Usually they
    282       are the same, but can differ when we deal with reader-writer
    283       locks.  It is always the case that
    284          VtsID__cmpLEQ(viW,viR) == True
     285       that is, viW must be the same as, or lagging behind, viR. */
    286    VtsID viR;
    287    VtsID viW;
    288 
    289    /* Is initially False, and is set to True after the thread really
    290       has done a low-level exit.  When True, we expect to never see
    291       any more memory references done by this thread. */
    292    Bool llexit_done;
    293 
    294    /* Is initially False, and is set to True after the thread has been
    295       joined with (reaped by some other thread).  After this point, we
    296       do not expect to see any uses of .viR or .viW, so it is safe to
    297       set them to VtsID_INVALID. */
    298    Bool joinedwith_done;
    299 
    300    /* A small integer giving a unique identity to this Thr.  See
    301       comments on the definition of ScalarTS for details. */
    302    ThrID thrid : SCALARTS_N_THRBITS;
    303 
    304    /* A filter that removes references for which we believe that
    305       msmcread/msmcwrite will not change the state, nor report a
    306       race. */
    307    Filter* filter;
    308 
    309    /* A pointer back to the top level Thread structure.  There is a
    310       1-1 mapping between Thread and Thr structures -- each Thr points
    311       at its corresponding Thread, and vice versa.  Really, Thr and
    312       Thread should be merged into a single structure. */
    313    Thread* hgthread;
    314 
     315    /* The ULongs (scalar Kws) in this array accumulate in strictly
    316       increasing order, without duplicates.  This is important because
    317       we need to be able to find a given scalar Kw in this array
    318       later, by binary search. */
    319    XArray* /* ULong_n_EC */ local_Kws_n_stacks;
    320 };
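/* Hedged sketch of the lookup promised above: binary search of
   local_Kws_n_stacks for a given scalar Kw, relying on the
   strictly-increasing property.  'example_find_EC_for_Kw' is a
   hypothetical helper for illustration, not the library's own code. */
#if 0
static ExeContext* example_find_EC_for_Kw ( Thr* thr, ULong kw )
{
   XArray* xa = thr->local_Kws_n_stacks;
   Word    lo = 0, hi = VG_(sizeXA)( xa ) - 1;
   while (lo <= hi) {
      Word        mid = lo + (hi - lo) / 2;
      ULong_n_EC* ue  = VG_(indexXA)( xa, mid );
      if      (ue->ull < kw) lo = mid + 1;
      else if (ue->ull > kw) hi = mid - 1;
      else    return ue->ec;
   }
   return NULL; /* not present -- it may have been discarded */
}
#endif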
    321 
    322 
    323 
    324 /////////////////////////////////////////////////////////////////
    325 /////////////////////////////////////////////////////////////////
    326 //                                                             //
    327 // data decls: SO                                              //
    328 //                                                             //
    329 /////////////////////////////////////////////////////////////////
    330 /////////////////////////////////////////////////////////////////
    331 
    332 // (UInt) `echo "Synchronisation object" | md5sum`
    333 #define SO_MAGIC 0x56b3c5b0U
    334 
    335 struct _SO {
    336    struct _SO* admin_prev;
    337    struct _SO* admin_next;
    338    VtsID viR; /* r-clock of sender */
    339    VtsID viW; /* w-clock of sender */
    340    UInt  magic;
    341 };
    342 
    343 
    344 
    345 /////////////////////////////////////////////////////////////////
    346 /////////////////////////////////////////////////////////////////
    347 //                                                             //
    348 // Forward declarations                                        //
    349 //                                                             //
    350 /////////////////////////////////////////////////////////////////
    351 /////////////////////////////////////////////////////////////////
    352 
    353 /* fwds for
    354    Globals needed by other parts of the library.  These are set
    355    once at startup and then never changed. */
    356 static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
    357 static ExeContext* (*main_get_EC)( Thr* ) = NULL;
    358 
    359 /* misc fn and data fwdses */
    360 static void VtsID__rcinc ( VtsID ii );
    361 static void VtsID__rcdec ( VtsID ii );
    362 
    363 static inline Bool SVal__isC ( SVal s );
    364 static inline VtsID SVal__unC_Rmin ( SVal s );
    365 static inline VtsID SVal__unC_Wmin ( SVal s );
    366 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini );
    367 
     368 /* A doubly-linked list of all the SOs. */
    369 SO* admin_SO;
    370 
    371 
    372 
    373 /////////////////////////////////////////////////////////////////
    374 /////////////////////////////////////////////////////////////////
    375 //                                                             //
    376 // SECTION BEGIN compressed shadow memory                      //
    377 //                                                             //
    378 /////////////////////////////////////////////////////////////////
    379 /////////////////////////////////////////////////////////////////
    380 
    381 #ifndef __HB_ZSM_H
    382 #define __HB_ZSM_H
    383 
    384 /* Initialise the library.  Once initialised, it will (or may) call
    385    rcinc and rcdec in response to all the calls below, in order to
    386    allow the user to do reference counting on the SVals stored herein.
    387    It is important to understand, however, that due to internal
    388    caching, the reference counts are in general inaccurate, and can be
    389    both above or below the true reference count for an item.  In
    390    particular, the library may indicate that the reference count for
    391    an item is zero, when in fact it is not.
    392 
    393    To make the reference counting exact and therefore non-pointless,
    394    call zsm_flush_cache.  Immediately after it returns, the reference
    395    counts for all items, as deduced by the caller by observing calls
    396    to rcinc and rcdec, will be correct, and so any items with a zero
    397    reference count may be freed (or at least considered to be
    398    unreferenced by this library).
    399 */
    400 static void zsm_init ( void(*rcinc)(SVal), void(*rcdec)(SVal) );
    401 
    402 static void zsm_sset_range  ( Addr, SizeT, SVal );
    403 static void zsm_scopy_range ( Addr, Addr, SizeT );
    404 static void zsm_flush_cache ( void );
    405 
    406 #endif /* ! __HB_ZSM_H */
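/* A usage sketch of the interface above (hypothetical caller code, for
   illustration only): the caller supplies rcinc/rcdec callbacks at
   init time, and must call zsm_flush_cache before treating the
   reference counts it has observed as exact. */
#if 0
static void example_rcinc ( SVal s ) { /* bump caller-side count for s */ }
static void example_rcdec ( SVal s ) { /* drop caller-side count for s */ }

static void example_zsm_usage ( Addr a )
{
   zsm_init( example_rcinc, example_rcdec );
   zsm_sset_range( a, 64, SVal_NOACCESS );  /* set 64 bytes to NOACCESS */
   zsm_scopy_range( a, a + 64, 64 );        /* copy 64 bytes of shadow values */
   zsm_flush_cache();  /* only now are the observed refcounts exact */
}
#endif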
    407 
    408 
     409 /* Round 'a' up to the next multiple of N.  N must be a power of 2. */
     410 #define ROUNDUP(a, N)   (((a) + (N) - 1) & ~((N)-1))
     411 /* Round 'a' down to the next multiple of N.  N must be a power of 2. */
     412 #define ROUNDDN(a, N)   ((a) & ~((N)-1))
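/* Worked examples, with N a power of 2:
      ROUNDUP(13, 8) == 16    ROUNDUP(16, 8) == 16
      ROUNDDN(13, 8) ==  8    ROUNDDN(16, 8) == 16 */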
    413 
    414 
    415 
    416 /* ------ User-supplied RC functions ------ */
    417 static void(*rcinc)(SVal) = NULL;
    418 static void(*rcdec)(SVal) = NULL;
    419 
    420 
    421 /* ------ CacheLine ------ */
    422 
    423 #define N_LINE_BITS      6 /* must be >= 3 */
    424 #define N_LINE_ARANGE    (1 << N_LINE_BITS)
    425 #define N_LINE_TREES     (N_LINE_ARANGE >> 3)
    426 
    427 typedef
    428    struct {
    429       UShort descrs[N_LINE_TREES];
    430       SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
    431    }
    432    CacheLine;
    433 
    434 #define TREE_DESCR_16_0 (1<<0)
    435 #define TREE_DESCR_32_0 (1<<1)
    436 #define TREE_DESCR_16_1 (1<<2)
    437 #define TREE_DESCR_64   (1<<3)
    438 #define TREE_DESCR_16_2 (1<<4)
    439 #define TREE_DESCR_32_1 (1<<5)
    440 #define TREE_DESCR_16_3 (1<<6)
    441 #define TREE_DESCR_8_0  (1<<7)
    442 #define TREE_DESCR_8_1  (1<<8)
    443 #define TREE_DESCR_8_2  (1<<9)
    444 #define TREE_DESCR_8_3  (1<<10)
    445 #define TREE_DESCR_8_4  (1<<11)
    446 #define TREE_DESCR_8_5  (1<<12)
    447 #define TREE_DESCR_8_6  (1<<13)
    448 #define TREE_DESCR_8_7  (1<<14)
    449 #define TREE_DESCR_DTY  (1<<15)
    450 
    451 typedef
    452    struct {
    453       SVal  dict[4]; /* can represent up to 4 diff values in the line */
    454       UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
    455                                       dict indexes */
    456       /* if dict[0] == SVal_INVALID then dict[1] is the index of the
    457          LineF to use, and dict[2..] are also SVal_INVALID. */
    458    }
    459    LineZ; /* compressed rep for a cache line */
    460 
    461 typedef
    462    struct {
    463       Bool inUse;
    464       SVal w64s[N_LINE_ARANGE];
    465    }
    466    LineF; /* full rep for a cache line */
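/* Worked sketch of the compressed (LineZ) representation described
   above, for illustration only: if the 64 SVals of a line take at most
   4 distinct values, they are held as those values in .dict[0..3] plus
   one 2-bit index per byte in .ix2s[].  The real conversion is done by
   the cacheline writeback/fetch code later in this file, which falls
   back to a LineF when more than 4 distinct values are present. */
#if 0
static void example_lineZ_compress ( LineZ* z,
                                     const SVal* svals /* [N_LINE_ARANGE] */ )
{
   Word i, j;
   z->dict[0] = z->dict[1] = z->dict[2] = z->dict[3] = SVal_INVALID;
   for (i = 0; i < N_LINE_ARANGE; i++) {
      for (j = 0; j < 4; j++) {
         if (z->dict[j] == svals[i]) break;  /* value already in dict */
         if (z->dict[j] == SVal_INVALID) { z->dict[j] = svals[i]; break; }
      }
      tl_assert(j < 4);  /* by assumption: at most 4 distinct values */
      write_twobit_array( z->ix2s, i, j );   /* defined further down */
   }
}
#endif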
    467 
    468 /* Shadow memory.
    469    Primary map is a WordFM Addr SecMap*.
    470    SecMaps cover some page-size-ish section of address space and hold
    471      a compressed representation.
    472    CacheLine-sized chunks of SecMaps are copied into a Cache, being
    473    decompressed when moved into the cache and recompressed on the
    474    way out.  Because of this, the cache must operate as a writeback
    475    cache, not a writethrough one.
    476 
    477    Each SecMap must hold a power-of-2 number of CacheLines.  Hence
     478    N_SECMAP_BITS must be >= N_LINE_BITS.
    479 */
    480 #define N_SECMAP_BITS   13
    481 #define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)
    482 
    483 // # CacheLines held by a SecMap
    484 #define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)
    485 
    486 /* The data in the SecMap is held in the array of LineZs.  Each LineZ
    487    either carries the required data directly, in a compressed
     488    representation, or (when .dict[0] == SVal_INVALID) it holds in
     489    .dict[1] the index of the LineF in .linesF holding the full rep.
    490 
    491    Currently-unused LineF's have their .inUse bit set to zero.
     492    Since each in-use LineF is referred to by exactly one LineZ,
    493    the number of .linesZ[] that refer to .linesF should equal
    494    the number of .linesF[] that have .inUse == True.
    495 
    496    RC obligations: the RCs presented to the user include exactly
    497    the values in:
    498    * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
    499    * F reps that are in use (.inUse == True)
    500 
    501    Hence the following actions at the following transitions are required:
    502 
    503    F rep: .inUse==True  -> .inUse==False        -- rcdec_LineF
    504    F rep: .inUse==False -> .inUse==True         -- rcinc_LineF
    505    Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
    506    Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
    507 */
    508 typedef
    509    struct {
    510       UInt   magic;
    511       LineZ  linesZ[N_SECMAP_ZLINES];
    512       LineF* linesF;
    513       UInt   linesF_size;
    514    }
    515    SecMap;
    516 
    517 #define SecMap_MAGIC   0x571e58cbU
    518 
    519 static inline Bool is_sane_SecMap ( SecMap* sm ) {
    520    return sm != NULL && sm->magic == SecMap_MAGIC;
    521 }
    522 
    523 /* ------ Cache ------ */
    524 
    525 #define N_WAY_BITS 16
    526 #define N_WAY_NENT (1 << N_WAY_BITS)
    527 
     528 /* Each tag is the address of the associated CacheLine, rounded down
     529    to a CacheLine address boundary.  The CacheLine size must be a
     530    power of 2 and must be 8 or more.  Hence an easy way to initialise
     531    the cache so it is empty is to set all the tag values to any value
     532    that is not a multiple of 8, eg 1.  This means all queries in the
     533    cache initially miss.  It does however require us to detect, and
     534    not write back, any line with a bogus tag. */
    535 typedef
    536    struct {
    537       CacheLine lyns0[N_WAY_NENT];
    538       Addr      tags0[N_WAY_NENT];
    539    }
    540    Cache;
    541 
    542 static inline Bool is_valid_scache_tag ( Addr tag ) {
    543    /* a valid tag should be naturally aligned to the start of
    544       a CacheLine. */
    545    return 0 == (tag & (N_LINE_ARANGE - 1));
    546 }
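/* Sketch of the "empty cache" trick described above, for illustration:
   a tag of 1 is not CacheLine-aligned and so can never match a real
   tag, hence setting every tag to 1 makes all lookups miss.  The real
   initialisation/invalidation code is elsewhere in this file. */
#if 0
static void example_invalidate_cache ( Cache* c )
{
   Word i;
   for (i = 0; i < N_WAY_NENT; i++) {
      c->tags0[i] = 1;  /* bogus: fails is_valid_scache_tag, never matches */
   }
}
#endif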
    547 
    548 
    549 /* --------- Primary data structures --------- */
    550 
    551 /* Shadow memory primary map */
    552 static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
    553 static Cache   cache_shmem;
    554 
    555 
    556 static UWord stats__secmaps_search       = 0; // # SM finds
    557 static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
    558 static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
    559 static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
    560 static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
    561 static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
    562 static UWord stats__secmap_linesF_allocd = 0; // # LineF's issued
    563 static UWord stats__secmap_linesF_bytes  = 0; //  .. using this much storage
    564 static UWord stats__secmap_iterator_steppings = 0; // # calls to stepSMIter
    565 static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
    566 static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
    567 static UWord stats__cache_F_fetches      = 0; // # F lines fetched
    568 static UWord stats__cache_F_wbacks       = 0; // # F lines written back
    569 static UWord stats__cache_invals         = 0; // # cache invals
    570 static UWord stats__cache_flushes        = 0; // # cache flushes
    571 static UWord stats__cache_totrefs        = 0; // # total accesses
    572 static UWord stats__cache_totmisses      = 0; // # misses
    573 static ULong stats__cache_make_New_arange = 0; // total arange made New
    574 static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
    575 static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
    576 static UWord stats__cline_cread64s       = 0; // # calls to s_m_read64
    577 static UWord stats__cline_cread32s       = 0; // # calls to s_m_read32
    578 static UWord stats__cline_cread16s       = 0; // # calls to s_m_read16
    579 static UWord stats__cline_cread08s       = 0; // # calls to s_m_read8
    580 static UWord stats__cline_cwrite64s      = 0; // # calls to s_m_write64
    581 static UWord stats__cline_cwrite32s      = 0; // # calls to s_m_write32
    582 static UWord stats__cline_cwrite16s      = 0; // # calls to s_m_write16
    583 static UWord stats__cline_cwrite08s      = 0; // # calls to s_m_write8
     584 static UWord stats__cline_sread08s       = 0; // # calls to s_m_get8
     585 static UWord stats__cline_swrite08s      = 0; // # calls to s_m_set8
     586 static UWord stats__cline_swrite16s      = 0; // # calls to s_m_set16
     587 static UWord stats__cline_swrite32s      = 0; // # calls to s_m_set32
     588 static UWord stats__cline_swrite64s      = 0; // # calls to s_m_set64
    589 static UWord stats__cline_scopy08s       = 0; // # calls to s_m_copy8
    590 static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
    591 static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
    592 static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
    593 static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
    594 static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
    595 static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
    596 static UWord stats__vts__tick            = 0; // # calls to VTS__tick
    597 static UWord stats__vts__join            = 0; // # calls to VTS__join
    598 static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
    599 static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural
    600 
    601 // # calls to VTS__cmp_structural w/ slow case
    602 static UWord stats__vts__cmp_structural_slow = 0;
    603 
    604 // # calls to VTS__indexAt_SLOW
    605 static UWord stats__vts__indexat_slow = 0;
    606 
    607 // # calls to vts_set__find__or__clone_and_add
    608 static UWord stats__vts_set__focaa    = 0;
    609 
    610 // # calls to vts_set__find__or__clone_and_add that lead to an
    611 // allocation
    612 static UWord stats__vts_set__focaa_a  = 0;
    613 
    614 
    615 static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
    616    return a & ~(N_SECMAP_ARANGE - 1);
    617 }
    618 static inline UWord shmem__get_SecMap_offset ( Addr a ) {
    619    return a & (N_SECMAP_ARANGE - 1);
    620 }
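/* Worked example, with N_SECMAP_BITS == 13 (so N_SECMAP_ARANGE ==
   0x2000): for a == 0x4001234, shmem__round_to_SecMap_base(a) ==
   0x4000000 and shmem__get_SecMap_offset(a) == 0x1234. */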
    621 
    622 
    623 /*----------------------------------------------------------------*/
    624 /*--- map_shmem :: WordFM Addr SecMap                          ---*/
    625 /*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
    626 /*----------------------------------------------------------------*/
    627 
    628 /*--------------- SecMap allocation --------------- */
    629 
    630 static HChar* shmem__bigchunk_next = NULL;
    631 static HChar* shmem__bigchunk_end1 = NULL;
    632 
    633 static void* shmem__bigchunk_alloc ( SizeT n )
    634 {
    635    const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
    636    tl_assert(n > 0);
    637    n = VG_ROUNDUP(n, 16);
    638    tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
    639    tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
    640              <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
    641    if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
    642       if (0)
    643       VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
    644                   (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
    645       shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
    646       if (shmem__bigchunk_next == NULL)
    647          VG_(out_of_memory_NORETURN)(
    648             "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
    649       shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
    650    }
    651    tl_assert(shmem__bigchunk_next);
    652    tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
    653    tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
    654    shmem__bigchunk_next += n;
    655    return shmem__bigchunk_next - n;
    656 }
    657 
    658 static SecMap* shmem__alloc_SecMap ( void )
    659 {
    660    Word    i, j;
    661    SecMap* sm = shmem__bigchunk_alloc( sizeof(SecMap) );
    662    if (0) VG_(printf)("alloc_SecMap %p\n",sm);
    663    tl_assert(sm);
    664    sm->magic = SecMap_MAGIC;
    665    for (i = 0; i < N_SECMAP_ZLINES; i++) {
    666       sm->linesZ[i].dict[0] = SVal_NOACCESS;
    667       sm->linesZ[i].dict[1] = SVal_INVALID;
    668       sm->linesZ[i].dict[2] = SVal_INVALID;
    669       sm->linesZ[i].dict[3] = SVal_INVALID;
    670       for (j = 0; j < N_LINE_ARANGE/4; j++)
    671          sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
    672    }
    673    sm->linesF      = NULL;
    674    sm->linesF_size = 0;
    675    stats__secmaps_allocd++;
    676    stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
    677    stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
    678    stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
    679    return sm;
    680 }
    681 
    682 typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
    683 static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };
    684 
    685 static SecMap* shmem__find_SecMap ( Addr ga )
    686 {
    687    SecMap* sm    = NULL;
    688    Addr    gaKey = shmem__round_to_SecMap_base(ga);
    689    // Cache
    690    stats__secmaps_search++;
    691    if (LIKELY(gaKey == smCache[0].gaKey))
    692       return smCache[0].sm;
    693    if (LIKELY(gaKey == smCache[1].gaKey)) {
    694       SMCacheEnt tmp = smCache[0];
    695       smCache[0] = smCache[1];
    696       smCache[1] = tmp;
    697       return smCache[0].sm;
    698    }
    699    if (gaKey == smCache[2].gaKey) {
    700       SMCacheEnt tmp = smCache[1];
    701       smCache[1] = smCache[2];
    702       smCache[2] = tmp;
    703       return smCache[1].sm;
    704    }
    705    // end Cache
    706    stats__secmaps_search_slow++;
    707    if (VG_(lookupFM)( map_shmem,
    708                       NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
    709       tl_assert(sm != NULL);
    710       smCache[2] = smCache[1];
    711       smCache[1] = smCache[0];
    712       smCache[0].gaKey = gaKey;
    713       smCache[0].sm    = sm;
    714    } else {
    715       tl_assert(sm == NULL);
    716    }
    717    return sm;
    718 }
    719 
    720 static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
    721 {
    722    SecMap* sm = shmem__find_SecMap ( ga );
    723    if (LIKELY(sm)) {
    724       return sm;
    725    } else {
    726       /* create a new one */
    727       Addr gaKey = shmem__round_to_SecMap_base(ga);
    728       sm = shmem__alloc_SecMap();
    729       tl_assert(sm);
    730       VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
    731       return sm;
    732    }
    733 }
    734 
    735 
    736 /* ------------ LineF and LineZ related ------------ */
    737 
    738 static void rcinc_LineF ( LineF* lineF ) {
    739    UWord i;
    740    tl_assert(lineF->inUse);
    741    for (i = 0; i < N_LINE_ARANGE; i++)
    742       rcinc(lineF->w64s[i]);
    743 }
    744 
    745 static void rcdec_LineF ( LineF* lineF ) {
    746    UWord i;
    747    tl_assert(lineF->inUse);
    748    for (i = 0; i < N_LINE_ARANGE; i++)
    749       rcdec(lineF->w64s[i]);
    750 }
    751 
    752 static void rcinc_LineZ ( LineZ* lineZ ) {
    753    tl_assert(lineZ->dict[0] != SVal_INVALID);
    754    rcinc(lineZ->dict[0]);
    755    if (lineZ->dict[1] != SVal_INVALID) rcinc(lineZ->dict[1]);
    756    if (lineZ->dict[2] != SVal_INVALID) rcinc(lineZ->dict[2]);
    757    if (lineZ->dict[3] != SVal_INVALID) rcinc(lineZ->dict[3]);
    758 }
    759 
    760 static void rcdec_LineZ ( LineZ* lineZ ) {
    761    tl_assert(lineZ->dict[0] != SVal_INVALID);
    762    rcdec(lineZ->dict[0]);
    763    if (lineZ->dict[1] != SVal_INVALID) rcdec(lineZ->dict[1]);
    764    if (lineZ->dict[2] != SVal_INVALID) rcdec(lineZ->dict[2]);
    765    if (lineZ->dict[3] != SVal_INVALID) rcdec(lineZ->dict[3]);
    766 }
    767 
    768 inline
    769 static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
    770    Word bix, shft, mask, prep;
    771    tl_assert(ix >= 0);
    772    bix  = ix >> 2;
    773    shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
    774    mask = 3 << shft;
    775    prep = b2 << shft;
    776    arr[bix] = (arr[bix] & ~mask) | prep;
    777 }
    778 
    779 inline
    780 static UWord read_twobit_array ( UChar* arr, UWord ix ) {
    781    Word bix, shft;
    782    tl_assert(ix >= 0);
    783    bix  = ix >> 2;
    784    shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
    785    return (arr[bix] >> shft) & 3;
    786 }
    787 
    788 /* Given address 'tag', find either the Z or F line containing relevant
    789    data, so it can be read into the cache.
    790 */
    791 static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
    792                                   /*OUT*/LineF** fp, Addr tag ) {
    793    LineZ* lineZ;
    794    LineF* lineF;
    795    UWord   zix;
    796    SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
    797    UWord   smoff = shmem__get_SecMap_offset(tag);
    798    /* since smoff is derived from a valid tag, it should be
    799       cacheline-aligned. */
    800    tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
    801    zix = smoff >> N_LINE_BITS;
    802    tl_assert(zix < N_SECMAP_ZLINES);
    803    lineZ = &sm->linesZ[zix];
    804    lineF = NULL;
    805    if (lineZ->dict[0] == SVal_INVALID) {
    806       UInt fix = (UInt)lineZ->dict[1];
    807       tl_assert(sm->linesF);
    808       tl_assert(sm->linesF_size > 0);
    809       tl_assert(fix >= 0 && fix < sm->linesF_size);
    810       lineF = &sm->linesF[fix];
    811       tl_assert(lineF->inUse);
    812       lineZ = NULL;
    813    }
    814    *zp = lineZ;
    815    *fp = lineF;
    816 }
    817 
    818 /* Given address 'tag', return the relevant SecMap and the index of
    819    the LineZ within it, in the expectation that the line is to be
     820    overwritten.  Regardless of whether 'tag' is currently associated
     821    with a Z or F representation, rcdec the current representation,
     822    in recognition of the fact that the contents are just about to be
     823    overwritten. */
    824 static __attribute__((noinline))
    825 void find_Z_for_writing ( /*OUT*/SecMap** smp,
    826                           /*OUT*/Word* zixp,
    827                           Addr tag ) {
    828    LineZ* lineZ;
    829    LineF* lineF;
    830    UWord   zix;
    831    SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
    832    UWord   smoff = shmem__get_SecMap_offset(tag);
    833    /* since smoff is derived from a valid tag, it should be
    834       cacheline-aligned. */
    835    tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
    836    zix = smoff >> N_LINE_BITS;
    837    tl_assert(zix < N_SECMAP_ZLINES);
    838    lineZ = &sm->linesZ[zix];
    839    lineF = NULL;
    840    /* re RCs, we are freeing up this LineZ/LineF so that new data can
    841       be parked in it.  Hence have to rcdec it accordingly. */
    842    /* If lineZ has an associated lineF, free it up. */
    843    if (lineZ->dict[0] == SVal_INVALID) {
    844       UInt fix = (UInt)lineZ->dict[1];
    845       tl_assert(sm->linesF);
    846       tl_assert(sm->linesF_size > 0);
    847       tl_assert(fix >= 0 && fix < sm->linesF_size);
    848       lineF = &sm->linesF[fix];
    849       tl_assert(lineF->inUse);
    850       rcdec_LineF(lineF);
    851       lineF->inUse = False;
    852    } else {
    853       rcdec_LineZ(lineZ);
    854    }
    855    *smp  = sm;
    856    *zixp = zix;
    857 }
    858 
    859 static __attribute__((noinline))
    860 void alloc_F_for_writing ( /*MOD*/SecMap* sm, /*OUT*/Word* fixp ) {
    861    UInt        i, new_size;
    862    LineF* nyu;
    863 
    864    if (sm->linesF) {
    865       tl_assert(sm->linesF_size > 0);
    866    } else {
    867       tl_assert(sm->linesF_size == 0);
    868    }
    869 
    870    if (sm->linesF) {
    871       for (i = 0; i < sm->linesF_size; i++) {
    872          if (!sm->linesF[i].inUse) {
    873             *fixp = (Word)i;
    874             return;
    875          }
    876       }
    877    }
    878 
    879    /* No free F line found.  Expand existing array and try again. */
    880    new_size = sm->linesF_size==0 ? 1 : 2 * sm->linesF_size;
    881    nyu      = HG_(zalloc)( "libhb.aFfw.1 (LineF storage)",
    882                            new_size * sizeof(LineF) );
    883    tl_assert(nyu);
    884 
    885    stats__secmap_linesF_allocd += (new_size - sm->linesF_size);
    886    stats__secmap_linesF_bytes  += (new_size - sm->linesF_size)
    887                                   * sizeof(LineF);
    888 
    889    if (0)
    890    VG_(printf)("SM %p: expand F array from %d to %d\n",
    891                sm, (Int)sm->linesF_size, new_size);
    892 
    893    for (i = 0; i < new_size; i++)
    894       nyu[i].inUse = False;
    895 
    896    if (sm->linesF) {
    897       for (i = 0; i < sm->linesF_size; i++) {
    898          tl_assert(sm->linesF[i].inUse);
    899          nyu[i] = sm->linesF[i];
    900       }
    901       VG_(memset)(sm->linesF, 0, sm->linesF_size * sizeof(LineF) );
    902       HG_(free)(sm->linesF);
    903    }
    904 
    905    sm->linesF      = nyu;
    906    sm->linesF_size = new_size;
    907 
    908    for (i = 0; i < sm->linesF_size; i++) {
    909       if (!sm->linesF[i].inUse) {
    910          *fixp = (Word)i;
    911          return;
    912       }
     913    }
     914 
     915    /*NOTREACHED*/
     916    tl_assert(0);
    917 }
    918 
    919 
    920 /* ------------ CacheLine and implicit-tree related ------------ */
    921 
    922 __attribute__((unused))
    923 static void pp_CacheLine ( CacheLine* cl ) {
    924    Word i;
    925    if (!cl) {
    926       VG_(printf)("%s","pp_CacheLine(NULL)\n");
    927       return;
    928    }
    929    for (i = 0; i < N_LINE_TREES; i++)
    930       VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
    931    for (i = 0; i < N_LINE_ARANGE; i++)
    932       VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
    933 }
    934 
    935 static UChar descr_to_validbits ( UShort descr )
    936 {
    937    /* a.k.a Party Time for gcc's constant folder */
    938 #  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
    939                 b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
    940              ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
    941                           ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
    942                           ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
    943                           ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
    944                           ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
    945                           ( (b16_2) << 4)  | ( (b64)   << 3)  | \
    946                           ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
    947                           ( (b16_0) << 0) ) )
    948 
    949 #  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
    950              ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
    951                          ( (bit5) << 5) | ( (bit4) << 4) | \
    952                          ( (bit3) << 3) | ( (bit2) << 2) | \
    953                          ( (bit1) << 1) | ( (bit0) << 0) ) )
    954 
    955    /* these should all get folded out at compile time */
    956    tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
    957    tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
    958    tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
    959    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
    960    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
    961    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
    962    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
    963    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
    964    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);
    965 
    966    switch (descr) {
    967    /*
    968               +--------------------------------- TREE_DESCR_8_7
    969               |             +------------------- TREE_DESCR_8_0
    970               |             |  +---------------- TREE_DESCR_16_3
    971               |             |  | +-------------- TREE_DESCR_32_1
    972               |             |  | | +------------ TREE_DESCR_16_2
    973               |             |  | | |  +--------- TREE_DESCR_64
    974               |             |  | | |  |  +------ TREE_DESCR_16_1
    975               |             |  | | |  |  | +---- TREE_DESCR_32_0
    976               |             |  | | |  |  | | +-- TREE_DESCR_16_0
    977               |             |  | | |  |  | | |
    978               |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
    979    case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
    980                                                  return BYTE(1,1,1,1,1,1,1,1);
    981    case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
    982                                                  return BYTE(1,1,0,1,1,1,1,1);
    983    case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */
    984                                                  return BYTE(0,1,1,1,1,1,1,1);
    985    case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
    986                                                  return BYTE(0,1,0,1,1,1,1,1);
    987 
    988    case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */
    989                                                  return BYTE(1,1,1,1,1,1,0,1);
    990    case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
    991                                                  return BYTE(1,1,0,1,1,1,0,1);
    992    case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
    993                                                  return BYTE(0,1,1,1,1,1,0,1);
    994    case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
    995                                                  return BYTE(0,1,0,1,1,1,0,1);
    996 
    997    case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
    998                                                  return BYTE(1,1,1,1,0,1,1,1);
    999    case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
   1000                                                  return BYTE(1,1,0,1,0,1,1,1);
   1001    case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
   1002                                                  return BYTE(0,1,1,1,0,1,1,1);
   1003    case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
   1004                                                  return BYTE(0,1,0,1,0,1,1,1);
   1005 
   1006    case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
   1007                                                  return BYTE(1,1,1,1,0,1,0,1);
   1008    case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
   1009                                                  return BYTE(1,1,0,1,0,1,0,1);
   1010    case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
   1011                                                  return BYTE(0,1,1,1,0,1,0,1);
   1012    case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
   1013                                                  return BYTE(0,1,0,1,0,1,0,1);
   1014 
   1015    case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
   1016                                                  return BYTE(0,0,0,1,1,1,1,1);
   1017    case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
   1018                                                  return BYTE(0,0,0,1,1,1,0,1);
   1019    case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
   1020                                                  return BYTE(0,0,0,1,0,1,1,1);
   1021    case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
   1022                                                  return BYTE(0,0,0,1,0,1,0,1);
   1023 
   1024    case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
   1025                                                  return BYTE(1,1,1,1,0,0,0,1);
   1026    case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
   1027                                                  return BYTE(1,1,0,1,0,0,0,1);
   1028    case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
   1029                                                  return BYTE(0,1,1,1,0,0,0,1);
   1030    case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
   1031                                                  return BYTE(0,1,0,1,0,0,0,1);
   1032 
   1033    case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
   1034                                                  return BYTE(0,0,0,1,0,0,0,1);
   1035 
   1036    case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
   1037                                                  return BYTE(0,0,0,0,0,0,0,1);
   1038 
   1039    default: return BYTE(0,0,0,0,0,0,0,0);
   1040                    /* INVALID - any valid descr produces at least one
   1041                       valid bit in tree[0..7]*/
   1042    }
   1043    /* NOTREACHED*/
   1044    tl_assert(0);
   1045 
   1046 #  undef DESCR
   1047 #  undef BYTE
   1048 }
   1049 
   1050 __attribute__((unused))
   1051 static Bool is_sane_Descr ( UShort descr ) {
   1052    return descr_to_validbits(descr) != 0;
   1053 }
   1054 
   1055 static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
   1056    VG_(sprintf)(dst,
   1057                 "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
   1058                 (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
   1059                 (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
   1060                 (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
   1061                 (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
   1062                 (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
   1063                 (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
   1064                 (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
   1065                 (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
   1066                 (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
   1067                 (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
   1068                 (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
   1069                 (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
   1070                 (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
   1071                 (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
   1072                 (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
   1073    );
   1074 }
   1075 static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
   1076    VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
   1077                      (Int)((byte & 128) ? 1 : 0),
   1078                      (Int)((byte &  64) ? 1 : 0),
   1079                      (Int)((byte &  32) ? 1 : 0),
   1080                      (Int)((byte &  16) ? 1 : 0),
   1081                      (Int)((byte &   8) ? 1 : 0),
   1082                      (Int)((byte &   4) ? 1 : 0),
   1083                      (Int)((byte &   2) ? 1 : 0),
   1084                      (Int)((byte &   1) ? 1 : 0)
   1085    );
   1086 }
   1087 
   1088 static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
   1089    Word  i;
   1090    UChar validbits = descr_to_validbits(descr);
   1091    HChar buf[128], buf2[128];
   1092    if (validbits == 0)
   1093       goto bad;
   1094    for (i = 0; i < 8; i++) {
   1095       if (validbits & (1<<i)) {
   1096          if (tree[i] == SVal_INVALID)
   1097             goto bad;
   1098       } else {
   1099          if (tree[i] != SVal_INVALID)
   1100             goto bad;
   1101       }
   1102    }
   1103    return True;
   1104   bad:
   1105    sprintf_Descr( buf, descr );
   1106    sprintf_Byte( buf2, validbits );
   1107    VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
   1108    VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
   1109    VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
   1110    for (i = 0; i < 8; i++)
   1111       VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
   1112    VG_(printf)("%s","}\n");
    1113    return False;
   1114 }
   1115 
   1116 static Bool is_sane_CacheLine ( CacheLine* cl )
   1117 {
   1118    Word tno, cloff;
   1119 
   1120    if (!cl) goto bad;
   1121 
   1122    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
   1123       UShort descr = cl->descrs[tno];
   1124       SVal*  tree  = &cl->svals[cloff];
   1125       if (!is_sane_Descr_and_Tree(descr, tree))
   1126          goto bad;
   1127    }
   1128    tl_assert(cloff == N_LINE_ARANGE);
   1129    return True;
   1130   bad:
   1131    pp_CacheLine(cl);
   1132    return False;
   1133 }
   1134 
   1135 static UShort normalise_tree ( /*MOD*/SVal* tree )
   1136 {
   1137    UShort descr;
   1138    /* pre: incoming tree[0..7] does not have any invalid shvals, in
   1139       particular no zeroes. */
   1140    if (UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
   1141                 || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
   1142                 || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
   1143                 || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
   1144       tl_assert(0);
   1145 
   1146    descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
   1147            | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
   1148            | TREE_DESCR_8_1 | TREE_DESCR_8_0;
   1149    /* build 16-bit layer */
   1150    if (tree[1] == tree[0]) {
   1151       tree[1] = SVal_INVALID;
   1152       descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
   1153       descr |= TREE_DESCR_16_0;
   1154    }
   1155    if (tree[3] == tree[2]) {
   1156       tree[3] = SVal_INVALID;
   1157       descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
   1158       descr |= TREE_DESCR_16_1;
   1159    }
   1160    if (tree[5] == tree[4]) {
   1161       tree[5] = SVal_INVALID;
   1162       descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
   1163       descr |= TREE_DESCR_16_2;
   1164    }
   1165    if (tree[7] == tree[6]) {
   1166       tree[7] = SVal_INVALID;
   1167       descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
   1168       descr |= TREE_DESCR_16_3;
   1169    }
   1170    /* build 32-bit layer */
   1171    if (tree[2] == tree[0]
   1172        && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
   1173       tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
   1174       descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
   1175       descr |= TREE_DESCR_32_0;
   1176    }
   1177    if (tree[6] == tree[4]
   1178        && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
   1179       tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
   1180       descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
   1181       descr |= TREE_DESCR_32_1;
   1182    }
   1183    /* build 64-bit layer */
   1184    if (tree[4] == tree[0]
   1185        && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
   1186       tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
   1187       descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
   1188       descr |= TREE_DESCR_64;
   1189    }
   1190    return descr;
   1191 }
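/* Worked example: if all eight leaves initially hold the same SVal v,
   the three merge layers above collapse the tree completely; on return
   tree[0] == v, tree[1..7] == SVal_INVALID, and the returned descr is
   exactly TREE_DESCR_64. */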
   1192 
   1193 /* This takes a cacheline where all the data is at the leaves
   1194    (w8[..]) and builds a correctly normalised tree. */
   1195 static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
   1196 {
   1197    Word tno, cloff;
   1198    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
   1199       SVal* tree = &cl->svals[cloff];
   1200       cl->descrs[tno] = normalise_tree( tree );
   1201    }
   1202    tl_assert(cloff == N_LINE_ARANGE);
   1203    if (CHECK_ZSM)
   1204       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   1205    stats__cline_normalises++;
   1206 }
   1207 
   1208 
   1209 typedef struct { UChar count; SVal sval; } CountedSVal;
   1210 
   1211 static
   1212 void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
   1213                                /*OUT*/Word* dstUsedP,
   1214                                Word nDst, CacheLine* src )
   1215 {
   1216    Word  tno, cloff, dstUsed;
   1217 
   1218    tl_assert(nDst == N_LINE_ARANGE);
   1219    dstUsed = 0;
   1220 
   1221    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
   1222       UShort descr = src->descrs[tno];
   1223       SVal*  tree  = &src->svals[cloff];
   1224 
   1225       /* sequentialise the tree described by (descr,tree). */
   1226 #     define PUT(_n,_v)                                \
   1227          do { dst[dstUsed  ].count = (_n);             \
   1228               dst[dstUsed++].sval  = (_v);             \
   1229          } while (0)
   1230 
   1231       /* byte 0 */
   1232       if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
   1233       if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
   1234       if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
   1235       if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
   1236       /* byte 1 */
   1237       if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
   1238       /* byte 2 */
   1239       if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
   1240       if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
   1241       /* byte 3 */
   1242       if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
   1243       /* byte 4 */
   1244       if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
   1245       if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
   1246       if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
   1247       /* byte 5 */
   1248       if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
   1249       /* byte 6 */
   1250       if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
   1251       if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
   1252       /* byte 7 */
   1253       if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);
   1254 
   1255 #     undef PUT
   1256       /* END sequentialise the tree described by (descr,tree). */
   1257 
   1258    }
   1259    tl_assert(cloff == N_LINE_ARANGE);
   1260    tl_assert(dstUsed <= nDst);
   1261 
   1262    *dstUsedP = dstUsed;
   1263 }
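
/* A minimal illustrative sketch (under #if 0, not compiled): for a
   normalised (sane) line, whatever mix of 8/16/32/64-bit descriptors
   it carries, the run lengths emitted by sequentialise_CacheLine add
   back up to the full N_LINE_ARANGE bytes.  The function is
   hypothetical, shown only to state the invariant the writeback code
   below relies on. */
#if 0
static void example_sequentialise_covers_line ( CacheLine* cl )
{
   CountedSVal csvals[N_LINE_ARANGE];
   Word        used = 0, k, covered = 0;
   sequentialise_CacheLine( csvals, &used, N_LINE_ARANGE, cl );
   for (k = 0; k < used; k++)
      covered += csvals[k].count;       /* counts are 8, 4, 2 or 1 */
   tl_assert(covered == N_LINE_ARANGE); /* every byte accounted for */
}
#endif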
   1264 
   1265 /* Write the cacheline 'wix' to backing store.  Where it ends up
   1266    is determined by its tag field. */
   1267 static __attribute__((noinline)) void cacheline_wback ( UWord wix )
   1268 {
   1269    Word        i, j, k, m;
   1270    Addr        tag;
   1271    SecMap*     sm;
   1272    CacheLine*  cl;
   1273    LineZ* lineZ;
   1274    LineF* lineF;
   1275    Word        zix, fix, csvalsUsed;
   1276    CountedSVal csvals[N_LINE_ARANGE];
   1277    SVal        sv;
   1278 
   1279    if (0)
   1280    VG_(printf)("scache wback line %d\n", (Int)wix);
   1281 
   1282    tl_assert(wix >= 0 && wix < N_WAY_NENT);
   1283 
   1284    tag =  cache_shmem.tags0[wix];
   1285    cl  = &cache_shmem.lyns0[wix];
   1286 
   1287    /* The cache line may have been invalidated; if so, ignore it. */
   1288    if (!is_valid_scache_tag(tag))
   1289       return;
   1290 
   1291    /* Where are we going to put it? */
   1292    sm         = NULL;
   1293    lineZ      = NULL;
   1294    lineF      = NULL;
   1295    zix = fix = -1;
   1296 
   1297    /* find the Z line to write in and rcdec it or the associated F
   1298       line. */
   1299    find_Z_for_writing( &sm, &zix, tag );
   1300 
   1301    tl_assert(sm);
   1302    tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
   1303    lineZ = &sm->linesZ[zix];
   1304 
   1305    /* Generate the data to be stored */
   1306    if (CHECK_ZSM)
   1307       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   1308 
   1309    csvalsUsed = -1;
   1310    sequentialise_CacheLine( csvals, &csvalsUsed,
   1311                             N_LINE_ARANGE, cl );
   1312    tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
   1313    if (0) VG_(printf)("%lu ", csvalsUsed);
   1314 
   1315    lineZ->dict[0] = lineZ->dict[1]
   1316                   = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
   1317 
   1318    /* i indexes actual shadow values, k is cursor in csvals */
   1319    i = 0;
   1320    for (k = 0; k < csvalsUsed; k++) {
   1321 
   1322       sv = csvals[k].sval;
   1323       if (CHECK_ZSM)
   1324          tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
   1325       /* do we already have it? */
   1326       if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
   1327       if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
   1328       if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
   1329       if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
   1330       /* no.  look for a free slot. */
   1331       if (CHECK_ZSM)
   1332          tl_assert(sv != SVal_INVALID);
   1333       if (lineZ->dict[0]
   1334           == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
   1335       if (lineZ->dict[1]
   1336           == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
   1337       if (lineZ->dict[2]
   1338           == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
   1339       if (lineZ->dict[3]
   1340           == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
   1341       break; /* we'll have to use the f rep */
   1342      dict_ok:
   1343       m = csvals[k].count;
   1344       if (m == 8) {
   1345          write_twobit_array( lineZ->ix2s, i+0, j );
   1346          write_twobit_array( lineZ->ix2s, i+1, j );
   1347          write_twobit_array( lineZ->ix2s, i+2, j );
   1348          write_twobit_array( lineZ->ix2s, i+3, j );
   1349          write_twobit_array( lineZ->ix2s, i+4, j );
   1350          write_twobit_array( lineZ->ix2s, i+5, j );
   1351          write_twobit_array( lineZ->ix2s, i+6, j );
   1352          write_twobit_array( lineZ->ix2s, i+7, j );
   1353          i += 8;
   1354       }
   1355       else if (m == 4) {
   1356          write_twobit_array( lineZ->ix2s, i+0, j );
   1357          write_twobit_array( lineZ->ix2s, i+1, j );
   1358          write_twobit_array( lineZ->ix2s, i+2, j );
   1359          write_twobit_array( lineZ->ix2s, i+3, j );
   1360          i += 4;
   1361       }
   1362       else if (m == 1) {
   1363          write_twobit_array( lineZ->ix2s, i+0, j );
   1364          i += 1;
   1365       }
   1366       else if (m == 2) {
   1367          write_twobit_array( lineZ->ix2s, i+0, j );
   1368          write_twobit_array( lineZ->ix2s, i+1, j );
   1369          i += 2;
   1370       }
   1371       else {
   1372          tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
   1373       }
   1374 
   1375    }
   1376 
   1377    if (LIKELY(i == N_LINE_ARANGE)) {
   1378       /* Construction of the compressed representation was
   1379          successful. */
   1380       rcinc_LineZ(lineZ);
   1381       stats__cache_Z_wbacks++;
   1382    } else {
   1383       /* Cannot use the compressed(z) representation.  Use the full(f)
   1384          rep instead. */
   1385       tl_assert(i >= 0 && i < N_LINE_ARANGE);
   1386       alloc_F_for_writing( sm, &fix );
   1387       tl_assert(sm->linesF);
   1388       tl_assert(sm->linesF_size > 0);
   1389       tl_assert(fix >= 0 && fix < (Word)sm->linesF_size);
   1390       lineF = &sm->linesF[fix];
   1391       tl_assert(!lineF->inUse);
   1392       lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
   1393       lineZ->dict[1] = (SVal)fix;
   1394       lineF->inUse = True;
   1395       i = 0;
   1396       for (k = 0; k < csvalsUsed; k++) {
   1397          if (CHECK_ZSM)
   1398             tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
   1399          sv = csvals[k].sval;
   1400          if (CHECK_ZSM)
   1401             tl_assert(sv != SVal_INVALID);
   1402          for (m = csvals[k].count; m > 0; m--) {
   1403             lineF->w64s[i] = sv;
   1404             i++;
   1405          }
   1406       }
   1407       tl_assert(i == N_LINE_ARANGE);
   1408       rcinc_LineF(lineF);
   1409       stats__cache_F_wbacks++;
   1410    }
   1411 }
   1412 
   1413 /* Fetch the cacheline 'wix' from the backing store.  The tag
   1414    associated with 'wix' is assumed to have already been filled in;
   1415    hence that is used to determine where in the backing store to read
   1416    from. */
   1417 static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
   1418 {
   1419    Word       i;
   1420    Addr       tag;
   1421    CacheLine* cl;
   1422    LineZ*     lineZ;
   1423    LineF*     lineF;
   1424 
   1425    if (0)
   1426    VG_(printf)("scache fetch line %d\n", (Int)wix);
   1427 
   1428    tl_assert(wix >= 0 && wix < N_WAY_NENT);
   1429 
   1430    tag =  cache_shmem.tags0[wix];
   1431    cl  = &cache_shmem.lyns0[wix];
   1432 
   1433    /* reject nonsense requests */
   1434    tl_assert(is_valid_scache_tag(tag));
   1435 
   1436    lineZ = NULL;
   1437    lineF = NULL;
   1438    find_ZF_for_reading( &lineZ, &lineF, tag );
   1439    tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
   1440 
   1441    /* expand the data into the bottom layer of the tree, then get
    1442       normalise_CacheLine to build the descriptor array. */
   1443    if (lineF) {
   1444       tl_assert(lineF->inUse);
   1445       for (i = 0; i < N_LINE_ARANGE; i++) {
   1446          cl->svals[i] = lineF->w64s[i];
   1447       }
   1448       stats__cache_F_fetches++;
   1449    } else {
   1450       for (i = 0; i < N_LINE_ARANGE; i++) {
   1451          SVal sv;
   1452          UWord ix = read_twobit_array( lineZ->ix2s, i );
   1453          /* correct, but expensive: tl_assert(ix >= 0 && ix <= 3); */
   1454          sv = lineZ->dict[ix];
   1455          tl_assert(sv != SVal_INVALID);
   1456          cl->svals[i] = sv;
   1457       }
   1458       stats__cache_Z_fetches++;
   1459    }
   1460    normalise_CacheLine( cl );
   1461 }
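
/* A minimal illustrative sketch (under #if 0, not compiled): the
   compressed (Z) representation used by cacheline_wback/_fetch keeps,
   per byte of the line, a 2-bit index into lineZ->dict[0..3].  The
   hypothetical function below just shows that an index stored with
   write_twobit_array is what read_twobit_array later returns. */
#if 0
static void example_twobit_roundtrip ( LineZ* lineZ )
{
   Word i;
   for (i = 0; i < N_LINE_ARANGE; i++)
      write_twobit_array( lineZ->ix2s, i, (UWord)(i & 3) );
   for (i = 0; i < N_LINE_ARANGE; i++)
      tl_assert( read_twobit_array( lineZ->ix2s, i ) == (UWord)(i & 3) );
}
#endif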
   1462 
   1463 static void shmem__invalidate_scache ( void ) {
   1464    Word wix;
   1465    if (0) VG_(printf)("%s","scache inval\n");
   1466    tl_assert(!is_valid_scache_tag(1));
   1467    for (wix = 0; wix < N_WAY_NENT; wix++) {
   1468       cache_shmem.tags0[wix] = 1/*INVALID*/;
   1469    }
   1470    stats__cache_invals++;
   1471 }
   1472 
   1473 static void shmem__flush_and_invalidate_scache ( void ) {
   1474    Word wix;
   1475    Addr tag;
   1476    if (0) VG_(printf)("%s","scache flush and invalidate\n");
   1477    tl_assert(!is_valid_scache_tag(1));
   1478    for (wix = 0; wix < N_WAY_NENT; wix++) {
   1479       tag = cache_shmem.tags0[wix];
   1480       if (tag == 1/*INVALID*/) {
   1481          /* already invalid; nothing to do */
   1482       } else {
   1483          tl_assert(is_valid_scache_tag(tag));
   1484          cacheline_wback( wix );
   1485       }
   1486       cache_shmem.tags0[wix] = 1/*INVALID*/;
   1487    }
   1488    stats__cache_flushes++;
   1489    stats__cache_invals++;
   1490 }
   1491 
   1492 
   1493 static inline Bool aligned16 ( Addr a ) {
   1494    return 0 == (a & 1);
   1495 }
   1496 static inline Bool aligned32 ( Addr a ) {
   1497    return 0 == (a & 3);
   1498 }
   1499 static inline Bool aligned64 ( Addr a ) {
   1500    return 0 == (a & 7);
   1501 }
   1502 static inline UWord get_cacheline_offset ( Addr a ) {
   1503    return (UWord)(a & (N_LINE_ARANGE - 1));
   1504 }
   1505 static inline Addr cacheline_ROUNDUP ( Addr a ) {
   1506    return ROUNDUP(a, N_LINE_ARANGE);
   1507 }
   1508 static inline Addr cacheline_ROUNDDN ( Addr a ) {
   1509    return ROUNDDN(a, N_LINE_ARANGE);
   1510 }
   1511 static inline UWord get_treeno ( Addr a ) {
   1512    return get_cacheline_offset(a) >> 3;
   1513 }
   1514 static inline UWord get_tree_offset ( Addr a ) {
   1515    return a & 7;
   1516 }
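
/* A minimal illustrative sketch (under #if 0, not compiled) of how an
   address splits up under the helpers above, assuming (as the masking
   in get_cacheline below implies) that N_LINE_ARANGE is a power of two
   and a multiple of 8.  The function is hypothetical. */
#if 0
static void example_address_decomposition ( Addr a )
{
   Addr  tag   = a & ~(N_LINE_ARANGE - 1);  /* which cache line        */
   UWord cloff = get_cacheline_offset(a);   /* byte offset within line */
   UWord tno   = get_treeno(a);             /* which 8-byte tree       */
   UWord toff  = get_tree_offset(a);        /* byte offset within tree */
   tl_assert(a == tag + cloff);
   tl_assert(cloff == 8 * tno + toff);
}
#endif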
   1517 
   1518 static __attribute__((noinline))
   1519        CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
   1520 static inline CacheLine* get_cacheline ( Addr a )
   1521 {
   1522    /* tag is 'a' with the in-line offset masked out,
   1523       eg a[31]..a[4] 0000 */
   1524    Addr       tag = a & ~(N_LINE_ARANGE - 1);
   1525    UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
   1526    stats__cache_totrefs++;
   1527    if (LIKELY(tag == cache_shmem.tags0[wix])) {
   1528       return &cache_shmem.lyns0[wix];
   1529    } else {
   1530       return get_cacheline_MISS( a );
   1531    }
   1532 }
   1533 
   1534 static __attribute__((noinline))
   1535        CacheLine* get_cacheline_MISS ( Addr a )
   1536 {
   1537    /* tag is 'a' with the in-line offset masked out,
   1538       eg a[31]..a[4] 0000 */
   1539 
   1540    CacheLine* cl;
   1541    Addr*      tag_old_p;
   1542    Addr       tag = a & ~(N_LINE_ARANGE - 1);
   1543    UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
   1544 
   1545    tl_assert(tag != cache_shmem.tags0[wix]);
   1546 
   1547    /* Dump the old line into the backing store. */
   1548    stats__cache_totmisses++;
   1549 
   1550    cl        = &cache_shmem.lyns0[wix];
   1551    tag_old_p = &cache_shmem.tags0[wix];
   1552 
   1553    if (is_valid_scache_tag( *tag_old_p )) {
   1554       /* EXPENSIVE and REDUNDANT: callee does it */
   1555       if (CHECK_ZSM)
   1556          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   1557       cacheline_wback( wix );
   1558    }
   1559    /* and reload the new one */
   1560    *tag_old_p = tag;
   1561    cacheline_fetch( wix );
   1562    if (CHECK_ZSM)
   1563       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   1564    return cl;
   1565 }
   1566 
   1567 static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   1568    stats__cline_64to32pulldown++;
   1569    switch (toff) {
   1570       case 0: case 4:
   1571          tl_assert(descr & TREE_DESCR_64);
   1572          tree[4] = tree[0];
   1573          descr &= ~TREE_DESCR_64;
   1574          descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
   1575          break;
   1576       default:
   1577          tl_assert(0);
   1578    }
   1579    return descr;
   1580 }
   1581 
   1582 static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   1583    stats__cline_32to16pulldown++;
   1584    switch (toff) {
   1585       case 0: case 2:
   1586          if (!(descr & TREE_DESCR_32_0)) {
   1587             descr = pulldown_to_32(tree, 0, descr);
   1588          }
   1589          tl_assert(descr & TREE_DESCR_32_0);
   1590          tree[2] = tree[0];
   1591          descr &= ~TREE_DESCR_32_0;
   1592          descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
   1593          break;
   1594       case 4: case 6:
   1595          if (!(descr & TREE_DESCR_32_1)) {
   1596             descr = pulldown_to_32(tree, 4, descr);
   1597          }
   1598          tl_assert(descr & TREE_DESCR_32_1);
   1599          tree[6] = tree[4];
   1600          descr &= ~TREE_DESCR_32_1;
   1601          descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
   1602          break;
   1603       default:
   1604          tl_assert(0);
   1605    }
   1606    return descr;
   1607 }
   1608 
   1609 static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   1610    stats__cline_16to8pulldown++;
   1611    switch (toff) {
   1612       case 0: case 1:
   1613          if (!(descr & TREE_DESCR_16_0)) {
   1614             descr = pulldown_to_16(tree, 0, descr);
   1615          }
   1616          tl_assert(descr & TREE_DESCR_16_0);
   1617          tree[1] = tree[0];
   1618          descr &= ~TREE_DESCR_16_0;
   1619          descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
   1620          break;
   1621       case 2: case 3:
   1622          if (!(descr & TREE_DESCR_16_1)) {
   1623             descr = pulldown_to_16(tree, 2, descr);
   1624          }
   1625          tl_assert(descr & TREE_DESCR_16_1);
   1626          tree[3] = tree[2];
   1627          descr &= ~TREE_DESCR_16_1;
   1628          descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
   1629          break;
   1630       case 4: case 5:
   1631          if (!(descr & TREE_DESCR_16_2)) {
   1632             descr = pulldown_to_16(tree, 4, descr);
   1633          }
   1634          tl_assert(descr & TREE_DESCR_16_2);
   1635          tree[5] = tree[4];
   1636          descr &= ~TREE_DESCR_16_2;
   1637          descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
   1638          break;
   1639       case 6: case 7:
   1640          if (!(descr & TREE_DESCR_16_3)) {
   1641             descr = pulldown_to_16(tree, 6, descr);
   1642          }
   1643          tl_assert(descr & TREE_DESCR_16_3);
   1644          tree[7] = tree[6];
   1645          descr &= ~TREE_DESCR_16_3;
   1646          descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
   1647          break;
   1648       default:
   1649          tl_assert(0);
   1650    }
   1651    return descr;
   1652 }
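
/* A minimal illustrative sketch (under #if 0, not compiled): the
   typical way the pulldown functions get used.  Before a 1-byte access
   can be serviced, the leaf for that byte must be directly described;
   if it isn't, the containing tree is pulled down to 8-bit granularity
   first.  The hypothetical helper below mirrors that pattern using
   only names declared in this file. */
#if 0
static SVal example_read_8bit_sval ( Addr a )
{
   CacheLine* cl    = get_cacheline( a );
   UWord      cloff = get_cacheline_offset( a );
   UWord      tno   = get_treeno( a );
   UWord      toff  = get_tree_offset( a );   /* 0 .. 7 */
   UShort     descr = cl->descrs[tno];
   UShort     leafbit = 0;
   switch (toff) {
      case 0: leafbit = TREE_DESCR_8_0; break;
      case 1: leafbit = TREE_DESCR_8_1; break;
      case 2: leafbit = TREE_DESCR_8_2; break;
      case 3: leafbit = TREE_DESCR_8_3; break;
      case 4: leafbit = TREE_DESCR_8_4; break;
      case 5: leafbit = TREE_DESCR_8_5; break;
      case 6: leafbit = TREE_DESCR_8_6; break;
      case 7: leafbit = TREE_DESCR_8_7; break;
      default: tl_assert(0);
   }
   if (!(descr & leafbit)) {
      /* leaf not directly valid; split the tree down to 8-bit level */
      SVal* tree = &cl->svals[tno << 3];
      cl->descrs[tno] = pulldown_to_8( tree, toff, descr );
   }
   return cl->svals[cloff];
}
#endif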
   1653 
   1654 
   1655 static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
   1656    UShort mask;
   1657    switch (toff) {
   1658       case 0:
   1659          mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
   1660          tl_assert( (descr & mask) == mask );
   1661          descr &= ~mask;
   1662          descr |= TREE_DESCR_16_0;
   1663          break;
   1664       case 2:
   1665          mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
   1666          tl_assert( (descr & mask) == mask );
   1667          descr &= ~mask;
   1668          descr |= TREE_DESCR_16_1;
   1669          break;
   1670       case 4:
   1671          mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
   1672          tl_assert( (descr & mask) == mask );
   1673          descr &= ~mask;
   1674          descr |= TREE_DESCR_16_2;
   1675          break;
   1676       case 6:
   1677          mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
   1678          tl_assert( (descr & mask) == mask );
   1679          descr &= ~mask;
   1680          descr |= TREE_DESCR_16_3;
   1681          break;
   1682       default:
   1683          tl_assert(0);
   1684    }
   1685    return descr;
   1686 }
   1687 
   1688 static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
   1689    UShort mask;
   1690    switch (toff) {
   1691       case 0:
   1692          if (!(descr & TREE_DESCR_16_0))
   1693             descr = pullup_descr_to_16(descr, 0);
   1694          if (!(descr & TREE_DESCR_16_1))
   1695             descr = pullup_descr_to_16(descr, 2);
   1696          mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
   1697          tl_assert( (descr & mask) == mask );
   1698          descr &= ~mask;
   1699          descr |= TREE_DESCR_32_0;
   1700          break;
   1701       case 4:
   1702          if (!(descr & TREE_DESCR_16_2))
   1703             descr = pullup_descr_to_16(descr, 4);
   1704          if (!(descr & TREE_DESCR_16_3))
   1705             descr = pullup_descr_to_16(descr, 6);
   1706          mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
   1707          tl_assert( (descr & mask) == mask );
   1708          descr &= ~mask;
   1709          descr |= TREE_DESCR_32_1;
   1710          break;
   1711       default:
   1712          tl_assert(0);
   1713    }
   1714    return descr;
   1715 }
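
/* A minimal illustrative sketch (under #if 0, not compiled): pulling a
   descriptor up in the other direction.  Starting from a tree described
   entirely at 8-bit granularity, pullup_descr_to_32 summarises bytes
   0..3 with a single 32-bit bit and leaves bytes 4..7 untouched.  The
   function is hypothetical. */
#if 0
static void example_pullup_descr ( void )
{
   UShort descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
                | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
                | TREE_DESCR_8_1 | TREE_DESCR_8_0;
   descr = pullup_descr_to_32( descr, 0 );
   tl_assert(   descr & TREE_DESCR_32_0 );
   tl_assert( !(descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2
                         | TREE_DESCR_8_1 | TREE_DESCR_8_0)) );
   /* bytes 4..7 still have their 8-bit descriptors */
   tl_assert(   descr & TREE_DESCR_8_4 );
}
#endif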
   1716 
   1717 static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
   1718    switch (toff) {
   1719       case 0: case 4:
   1720          return 0 != (descr & TREE_DESCR_64);
   1721       default:
   1722          tl_assert(0);
   1723    }
   1724 }
   1725 
   1726 static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
   1727    switch (toff) {
   1728       case 0:
   1729          return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
   1730       case 2:
   1731          return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
   1732       case 4:
   1733          return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
   1734       case 6:
   1735          return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
   1736       default:
   1737          tl_assert(0);
   1738    }
   1739 }
   1740 
   1741 /* ------------ Cache management ------------ */
   1742 
   1743 static void zsm_flush_cache ( void )
   1744 {
   1745    shmem__flush_and_invalidate_scache();
   1746 }
   1747 
   1748 
   1749 static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) )
   1750 {
   1751    tl_assert( sizeof(UWord) == sizeof(Addr) );
   1752 
   1753    rcinc = p_rcinc;
   1754    rcdec = p_rcdec;
   1755 
   1756    tl_assert(map_shmem == NULL);
   1757    map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
   1758                            HG_(free),
   1759                            NULL/*unboxed UWord cmp*/);
   1760    tl_assert(map_shmem != NULL);
   1761    shmem__invalidate_scache();
   1762 
   1763    /* a SecMap must contain an integral number of CacheLines */
   1764    tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
   1765    /* also ... a CacheLine holds an integral number of trees */
   1766    tl_assert(0 == (N_LINE_ARANGE % 8));
   1767 }
   1768 
   1769 /////////////////////////////////////////////////////////////////
   1770 /////////////////////////////////////////////////////////////////
   1771 //                                                             //
   1772 // SECTION END compressed shadow memory                        //
   1773 //                                                             //
   1774 /////////////////////////////////////////////////////////////////
   1775 /////////////////////////////////////////////////////////////////
   1776 
   1777 
   1778 
   1779 /////////////////////////////////////////////////////////////////
   1780 /////////////////////////////////////////////////////////////////
   1781 //                                                             //
   1782 // SECTION BEGIN vts primitives                                //
   1783 //                                                             //
   1784 /////////////////////////////////////////////////////////////////
   1785 /////////////////////////////////////////////////////////////////
   1786 
   1787 
   1788 /* There's a 1-1 mapping between Thr and ThrIDs -- the latter merely
   1789    being compact stand-ins for Thr*'s.  Use these functions to map
   1790    between them. */
   1791 static ThrID Thr__to_ThrID   ( Thr*  thr   ); /* fwds */
   1792 static Thr*  Thr__from_ThrID ( ThrID thrid ); /* fwds */
   1793 
   1794 __attribute__((noreturn))
   1795 static void scalarts_limitations_fail_NORETURN ( Bool due_to_nThrs )
   1796 {
   1797    if (due_to_nThrs) {
   1798       HChar* s =
   1799          "\n"
   1800          "Helgrind: cannot continue, run aborted: too many threads.\n"
   1801          "Sorry.  Helgrind can only handle programs that create\n"
   1802          "%'llu or fewer threads over their entire lifetime.\n"
   1803          "\n";
   1804       VG_(umsg)(s, (ULong)(ThrID_MAX_VALID - 1024));
   1805    } else {
   1806       HChar* s =
   1807          "\n"
   1808          "Helgrind: cannot continue, run aborted: too many\n"
   1809          "synchronisation events.  Sorry. Helgrind can only handle\n"
   1810          "programs which perform %'llu or fewer\n"
   1811          "inter-thread synchronisation events (locks, unlocks, etc).\n"
   1812          "\n";
   1813       VG_(umsg)(s, (1ULL << SCALARTS_N_TYMBITS) - 1);
   1814    }
   1815    VG_(exit)(1);
   1816    /*NOTREACHED*/
   1817    tl_assert(0); /*wtf?!*/
   1818 }
   1819 
   1820 
   1821 /* The dead thread (ThrID, actually) table.  A thread may only be
   1822    listed here if we have been notified thereof by libhb_async_exit.
   1823    New entries are added at the end.  The order isn't important, but
   1824    the ThrID values must be unique.  This table lists the identity of
   1825    all threads that have ever died -- none are ever removed.  We keep
   1826    this table so as to be able to prune entries from VTSs.  We don't
   1827    actually need to keep the set of threads that have ever died --
   1828    only the threads that have died since the previous round of
   1829    pruning.  But it's useful for sanity check purposes to keep the
   1830    entire set, so we do. */
   1831 static XArray* /* of ThrID */ verydead_thread_table = NULL;
   1832 
   1833 /* Arbitrary total ordering on ThrIDs. */
   1834 static Int cmp__ThrID ( void* v1, void* v2 ) {
   1835    ThrID id1 = *(ThrID*)v1;
   1836    ThrID id2 = *(ThrID*)v2;
   1837    if (id1 < id2) return -1;
   1838    if (id1 > id2) return 1;
   1839    return 0;
   1840 }
   1841 
   1842 static void verydead_thread_table_init ( void )
   1843 {
   1844    tl_assert(!verydead_thread_table);
   1845    verydead_thread_table
   1846      = VG_(newXA)( HG_(zalloc),
   1847                    "libhb.verydead_thread_table_init.1",
   1848                    HG_(free), sizeof(ThrID) );
   1849    tl_assert(verydead_thread_table);
   1850    VG_(setCmpFnXA)(verydead_thread_table, cmp__ThrID);
   1851 }
   1852 
   1853 
   1854 /* A VTS contains .ts, its vector clock, and also .id, a field to hold
   1855    a backlink for the caller's convenience.  Since we have no idea
   1856    what to set that to in the library, it always gets set to
   1857    VtsID_INVALID. */
   1858 typedef
   1859    struct {
   1860       VtsID    id;
   1861       UInt     usedTS;
   1862       UInt     sizeTS;
   1863       ScalarTS ts[0];
   1864    }
   1865    VTS;
   1866 
   1867 /* Allocate a VTS capable of storing 'sizeTS' entries. */
   1868 static VTS* VTS__new ( HChar* who, UInt sizeTS );
   1869 
   1870 /* Make a clone of 'vts', sizing the new array to exactly match the
   1871    number of ScalarTSs present. */
   1872 static VTS* VTS__clone ( HChar* who, VTS* vts );
   1873 
   1874 /* Make a clone of 'vts' with the thrids in 'thrids' removed.  The new
   1875    array is sized exactly to hold the number of required elements.
   1876    'thridsToDel' is an array of ThrIDs to be omitted in the clone, and
   1877    must be in strictly increasing order. */
   1878 static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel );
   1879 
   1880 /* Delete this VTS in its entirety. */
   1881 static void VTS__delete ( VTS* vts );
   1882 
   1883 /* Create a new singleton VTS in 'out'.  Caller must have
   1884    pre-allocated 'out' sufficiently big to hold the result in all
   1885    possible cases. */
   1886 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym );
   1887 
   1888 /* Create in 'out' a VTS which is the same as 'vts' except with
   1889    vts[me]++, so to speak.  Caller must have pre-allocated 'out'
   1890    sufficiently big to hold the result in all possible cases. */
   1891 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts );
   1892 
   1893 /* Create in 'out' a VTS which is the join (max) of 'a' and
   1894    'b'. Caller must have pre-allocated 'out' sufficiently big to hold
   1895    the result in all possible cases. */
   1896 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b );
   1897 
   1898 /* Compute the partial ordering relation of the two args.  Although we
   1899    could be completely general and return an enumeration value (EQ,
   1900    LT, GT, UN), in fact we only need LEQ, and so we may as well
   1901    hardwire that fact.
   1902 
   1903    Returns zero iff LEQ(A,B), or a valid ThrID if not (zero is an
    1904    invalid ThrID).  In the latter case, the returned ThrID indicates
   1905    the discovered point for which they are not.  There may be more
   1906    than one such point, but we only care about seeing one of them, not
   1907    all of them.  This rather strange convention is used because
   1908    sometimes we want to know the actual index at which they first
   1909    differ. */
   1910 static UInt VTS__cmpLEQ ( VTS* a, VTS* b );
   1911 
   1912 /* Compute an arbitrary structural (total) ordering on the two args,
   1913    based on their VCs, so they can be looked up in a table, tree, etc.
   1914    Returns -1, 0 or 1. */
   1915 static Word VTS__cmp_structural ( VTS* a, VTS* b );
   1916 
   1917 /* Debugging only.  Display the given VTS in the buffer. */
   1918 static void VTS__show ( HChar* buf, Int nBuf, VTS* vts );
   1919 
   1920 /* Debugging only.  Return vts[index], so to speak. */
   1921 static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );
   1922 
   1923 /* Notify the VTS machinery that a thread has been declared
   1924    comprehensively dead: that is, it has done an async exit AND it has
   1925    been joined with.  This should ensure that its local clocks (.viR
   1926    and .viW) will never again change, and so all mentions of this
   1927    thread from all VTSs in the system may be removed. */
   1928 static void VTS__declare_thread_very_dead ( Thr* idx );
   1929 
   1930 /*--------------- to do with Vector Timestamps ---------------*/
   1931 
   1932 static Bool is_sane_VTS ( VTS* vts )
   1933 {
   1934    UWord     i, n;
   1935    ScalarTS  *st1, *st2;
   1936    if (!vts) return False;
   1937    if (!vts->ts) return False;
   1938    if (vts->usedTS > vts->sizeTS) return False;
   1939    n = vts->usedTS;
   1940    if (n == 1) {
   1941       st1 = &vts->ts[0];
   1942       if (st1->tym == 0)
   1943          return False;
   1944    }
   1945    else
   1946    if (n >= 2) {
   1947       for (i = 0; i < n-1; i++) {
   1948          st1 = &vts->ts[i];
   1949          st2 = &vts->ts[i+1];
   1950          if (st1->thrid >= st2->thrid)
   1951             return False;
   1952          if (st1->tym == 0 || st2->tym == 0)
   1953             return False;
   1954       }
   1955    }
   1956    return True;
   1957 }
   1958 
   1959 
   1960 /* Create a new, empty VTS.
   1961 */
   1962 static VTS* VTS__new ( HChar* who, UInt sizeTS )
   1963 {
   1964    VTS* vts = HG_(zalloc)(who, sizeof(VTS) + (sizeTS+1) * sizeof(ScalarTS));
   1965    tl_assert(vts->usedTS == 0);
   1966    vts->sizeTS = sizeTS;
   1967    *(ULong*)(&vts->ts[sizeTS]) = 0x0ddC0ffeeBadF00dULL;
   1968    return vts;
   1969 }
   1970 
   1971 /* Clone this VTS.
   1972 */
   1973 static VTS* VTS__clone ( HChar* who, VTS* vts )
   1974 {
   1975    tl_assert(vts);
   1976    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   1977    UInt nTS = vts->usedTS;
   1978    VTS* clone = VTS__new(who, nTS);
   1979    clone->id = vts->id;
   1980    clone->sizeTS = nTS;
   1981    clone->usedTS = nTS;
   1982    UInt i;
   1983    for (i = 0; i < nTS; i++) {
   1984       clone->ts[i] = vts->ts[i];
   1985    }
   1986    tl_assert( *(ULong*)(&clone->ts[clone->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   1987    return clone;
   1988 }
   1989 
   1990 
   1991 /* Make a clone of a VTS with specified ThrIDs removed.  'thridsToDel'
   1992    must be in strictly increasing order.  We could obviously do this
   1993    much more efficiently (in linear time) if necessary.
   1994 */
   1995 static VTS* VTS__subtract ( HChar* who, VTS* vts, XArray* thridsToDel )
   1996 {
   1997    UInt i, j;
   1998    tl_assert(vts);
   1999    tl_assert(thridsToDel);
   2000    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   2001    UInt nTS = vts->usedTS;
   2002    /* Figure out how many ScalarTSs will remain in the output. */
   2003    UInt nReq = nTS;
   2004    for (i = 0; i < nTS; i++) {
   2005       ThrID thrid = vts->ts[i].thrid;
   2006       if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
   2007          nReq--;
   2008    }
   2009    tl_assert(nReq <= nTS);
   2010    /* Copy the ones that will remain. */
   2011    VTS* res = VTS__new(who, nReq);
   2012    j = 0;
   2013    for (i = 0; i < nTS; i++) {
   2014       ThrID thrid = vts->ts[i].thrid;
   2015       if (VG_(lookupXA)(thridsToDel, &thrid, NULL, NULL))
   2016          continue;
   2017       res->ts[j++] = vts->ts[i];
   2018    }
   2019    tl_assert(j == nReq);
   2020    tl_assert(j == res->sizeTS);
   2021    res->usedTS = j;
   2022    tl_assert( *(ULong*)(&res->ts[j]) == 0x0ddC0ffeeBadF00dULL);
   2023    return res;
   2024 }
   2025 
   2026 
   2027 /* Delete this VTS in its entirety.
   2028 */
   2029 static void VTS__delete ( VTS* vts )
   2030 {
   2031    tl_assert(vts);
   2032    tl_assert(vts->usedTS <= vts->sizeTS);
   2033    tl_assert( *(ULong*)(&vts->ts[vts->sizeTS]) == 0x0ddC0ffeeBadF00dULL);
   2034    HG_(free)(vts);
   2035 }
   2036 
   2037 
   2038 /* Create a new singleton VTS.
   2039 */
   2040 static void VTS__singleton ( /*OUT*/VTS* out, Thr* thr, ULong tym )
   2041 {
   2042    tl_assert(thr);
   2043    tl_assert(tym >= 1);
   2044    tl_assert(out);
   2045    tl_assert(out->usedTS == 0);
   2046    tl_assert(out->sizeTS >= 1);
   2047    UInt hi = out->usedTS++;
   2048    out->ts[hi].thrid = Thr__to_ThrID(thr);
   2049    out->ts[hi].tym   = tym;
   2050 }
   2051 
   2052 
   2053 /* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
   2054    not modified.
   2055 */
   2056 static void VTS__tick ( /*OUT*/VTS* out, Thr* me, VTS* vts )
   2057 {
   2058    UInt      i, n;
   2059    ThrID     me_thrid;
   2060    Bool      found = False;
   2061 
   2062    stats__vts__tick++;
   2063 
   2064    tl_assert(out);
   2065    tl_assert(out->usedTS == 0);
   2066    if (vts->usedTS >= ThrID_MAX_VALID)
   2067       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   2068    tl_assert(out->sizeTS >= 1 + vts->usedTS);
   2069 
   2070    tl_assert(me);
   2071    me_thrid = Thr__to_ThrID(me);
   2072    tl_assert(is_sane_VTS(vts));
   2073    n = vts->usedTS;
   2074 
   2075    /* Copy all entries which precede 'me'. */
   2076    for (i = 0; i < n; i++) {
   2077       ScalarTS* here = &vts->ts[i];
   2078       if (UNLIKELY(here->thrid >= me_thrid))
   2079          break;
   2080       UInt hi = out->usedTS++;
   2081       out->ts[hi] = *here;
   2082    }
   2083 
   2084    /* 'i' now indicates the next entry to copy, if any.
   2085        There are 3 possibilities:
   2086        (a) there is no next entry (we used them all up already):
   2087            add (me_thrid,1) to the output, and quit
   2088        (b) there is a next entry, and its thrid > me_thrid:
   2089            add (me_thrid,1) to the output, then copy the remaining entries
   2090        (c) there is a next entry, and its thrid == me_thrid:
   2091            copy it to the output but increment its timestamp value.
   2092            Then copy the remaining entries.  (c) is the common case.
   2093    */
   2094    tl_assert(i >= 0 && i <= n);
   2095    if (i == n) { /* case (a) */
   2096       UInt hi = out->usedTS++;
   2097       out->ts[hi].thrid = me_thrid;
   2098       out->ts[hi].tym   = 1;
   2099    } else {
   2100       /* cases (b) and (c) */
   2101       ScalarTS* here = &vts->ts[i];
   2102       if (me_thrid == here->thrid) { /* case (c) */
   2103          if (UNLIKELY(here->tym >= (1ULL << SCALARTS_N_TYMBITS) - 2ULL)) {
   2104             /* We're hosed.  We have to stop. */
   2105             scalarts_limitations_fail_NORETURN( False/*!due_to_nThrs*/ );
   2106          }
   2107          UInt hi = out->usedTS++;
   2108          out->ts[hi].thrid = here->thrid;
   2109          out->ts[hi].tym   = here->tym + 1;
   2110          i++;
   2111          found = True;
   2112       } else { /* case (b) */
   2113          UInt hi = out->usedTS++;
   2114          out->ts[hi].thrid = me_thrid;
   2115          out->ts[hi].tym   = 1;
   2116       }
   2117       /* And copy any remaining entries. */
   2118       for (/*keepgoing*/; i < n; i++) {
   2119          ScalarTS* here2 = &vts->ts[i];
   2120          UInt hi = out->usedTS++;
   2121          out->ts[hi] = *here2;
   2122       }
   2123    }
   2124 
   2125    tl_assert(is_sane_VTS(out));
   2126    tl_assert(out->usedTS == vts->usedTS + (found ? 0 : 1));
   2127    tl_assert(out->usedTS <= out->sizeTS);
   2128 }
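
/* A minimal illustrative sketch (under #if 0, not compiled): what a
   tick looks like in terms of the ScalarTS pairs.  Ticking t2 in the
   clock { t1:3, t2:5 } yields { t1:3, t2:6 }.  The helper is
   hypothetical and assumes Thr__to_ThrID(t1) < Thr__to_ThrID(t2), so
   that the hand-built input is a sane (sorted) VTS. */
#if 0
static void example_vts_tick ( Thr* t1, Thr* t2 )
{
   VTS* in  = VTS__new( "example.vts_tick.in",  2 );
   VTS* out = VTS__new( "example.vts_tick.out", 3 );
   in->usedTS = 2;
   in->ts[0].thrid = Thr__to_ThrID(t1);  in->ts[0].tym = 3;
   in->ts[1].thrid = Thr__to_ThrID(t2);  in->ts[1].tym = 5;
   VTS__tick( out, t2, in );
   tl_assert(out->usedTS == 2);
   tl_assert(out->ts[0].tym == 3);   /* t1's entry is copied verbatim */
   tl_assert(out->ts[1].tym == 6);   /* t2's entry advances by one    */
   VTS__delete(in);  VTS__delete(out);
}
#endif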
   2129 
   2130 
   2131 /* Return a new VTS constructed as the join (max) of the 2 args.
   2132    Neither arg is modified.
   2133 */
   2134 static void VTS__join ( /*OUT*/VTS* out, VTS* a, VTS* b )
   2135 {
   2136    UInt     ia, ib, useda, usedb;
   2137    ULong    tyma, tymb, tymMax;
   2138    ThrID    thrid;
   2139    UInt     ncommon = 0;
   2140 
   2141    stats__vts__join++;
   2142 
   2143    tl_assert(a);
   2144    tl_assert(b);
   2145    useda = a->usedTS;
   2146    usedb = b->usedTS;
   2147 
   2148    tl_assert(out);
   2149    tl_assert(out->usedTS == 0);
   2150    /* overly conservative test, but doing better involves comparing
   2151       the two VTSs, which we don't want to do at this point. */
   2152    if (useda + usedb >= ThrID_MAX_VALID)
   2153       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   2154    tl_assert(out->sizeTS >= useda + usedb);
   2155 
   2156    ia = ib = 0;
   2157 
   2158    while (1) {
   2159 
   2160       /* This logic is to enumerate triples (thrid, tyma, tymb) drawn
   2161          from a and b in order, where thrid is the next ThrID
   2162          occurring in either a or b, and tyma/b are the relevant
   2163          scalar timestamps, taking into account implicit zeroes. */
   2164       tl_assert(ia >= 0 && ia <= useda);
   2165       tl_assert(ib >= 0 && ib <= usedb);
   2166 
   2167       if        (ia == useda && ib == usedb) {
   2168          /* both empty - done */
   2169          break;
   2170 
   2171       } else if (ia == useda && ib != usedb) {
   2172          /* a empty, use up b */
   2173          ScalarTS* tmpb = &b->ts[ib];
   2174          thrid = tmpb->thrid;
   2175          tyma  = 0;
   2176          tymb  = tmpb->tym;
   2177          ib++;
   2178 
   2179       } else if (ia != useda && ib == usedb) {
   2180          /* b empty, use up a */
   2181          ScalarTS* tmpa = &a->ts[ia];
   2182          thrid = tmpa->thrid;
   2183          tyma  = tmpa->tym;
   2184          tymb  = 0;
   2185          ia++;
   2186 
   2187       } else {
   2188          /* both not empty; extract lowest-ThrID'd triple */
   2189          ScalarTS* tmpa = &a->ts[ia];
   2190          ScalarTS* tmpb = &b->ts[ib];
   2191          if (tmpa->thrid < tmpb->thrid) {
   2192             /* a has the lowest unconsidered ThrID */
   2193             thrid = tmpa->thrid;
   2194             tyma  = tmpa->tym;
   2195             tymb  = 0;
   2196             ia++;
   2197          } else if (tmpa->thrid > tmpb->thrid) {
   2198             /* b has the lowest unconsidered ThrID */
   2199             thrid = tmpb->thrid;
   2200             tyma  = 0;
   2201             tymb  = tmpb->tym;
   2202             ib++;
   2203          } else {
   2204             /* they both next mention the same ThrID */
   2205             tl_assert(tmpa->thrid == tmpb->thrid);
   2206             thrid = tmpa->thrid; /* == tmpb->thrid */
   2207             tyma  = tmpa->tym;
   2208             tymb  = tmpb->tym;
   2209             ia++;
   2210             ib++;
   2211             ncommon++;
   2212          }
   2213       }
   2214 
    2215       /* having laboriously determined (thrid, tyma, tymb), do something
   2216          useful with it. */
   2217       tymMax = tyma > tymb ? tyma : tymb;
   2218       if (tymMax > 0) {
   2219          UInt hi = out->usedTS++;
   2220          out->ts[hi].thrid = thrid;
   2221          out->ts[hi].tym   = tymMax;
   2222       }
   2223 
   2224    }
   2225 
   2226    tl_assert(is_sane_VTS(out));
   2227    tl_assert(out->usedTS <= out->sizeTS);
   2228    tl_assert(out->usedTS == useda + usedb - ncommon);
   2229 }
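
/* A minimal illustrative sketch (under #if 0, not compiled): the join
   is simply the per-thread maximum.  Joining { t1:3, t2:5 } with
   { t2:7, t3:1 } gives { t1:3, t2:7, t3:1 }.  The helper is
   hypothetical and assumes the three ThrIDs are strictly increasing. */
#if 0
static void example_vts_join ( Thr* t1, Thr* t2, Thr* t3 )
{
   VTS* a   = VTS__new( "example.vts_join.a",   2 );
   VTS* b   = VTS__new( "example.vts_join.b",   2 );
   VTS* out = VTS__new( "example.vts_join.out", 4 );
   a->usedTS = 2;
   a->ts[0].thrid = Thr__to_ThrID(t1);  a->ts[0].tym = 3;
   a->ts[1].thrid = Thr__to_ThrID(t2);  a->ts[1].tym = 5;
   b->usedTS = 2;
   b->ts[0].thrid = Thr__to_ThrID(t2);  b->ts[0].tym = 7;
   b->ts[1].thrid = Thr__to_ThrID(t3);  b->ts[1].tym = 1;
   VTS__join( out, a, b );
   tl_assert(out->usedTS == 3);      /* t2 appears in both inputs */
   tl_assert(out->ts[1].tym == 7);   /* max(5,7) for t2           */
   VTS__delete(a);  VTS__delete(b);  VTS__delete(out);
}
#endif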
   2230 
   2231 
   2232 /* Determine if 'a' <= 'b', in the partial ordering.  Returns zero if
   2233    they are, or the first ThrID for which they are not (no valid ThrID
   2234    has the value zero).  This rather strange convention is used
   2235    because sometimes we want to know the actual index at which they
   2236    first differ. */
   2237 static UInt/*ThrID*/ VTS__cmpLEQ ( VTS* a, VTS* b )
   2238 {
   2239    Word  ia, ib, useda, usedb;
   2240    ULong tyma, tymb;
   2241 
   2242    stats__vts__cmpLEQ++;
   2243 
   2244    tl_assert(a);
   2245    tl_assert(b);
   2246    useda = a->usedTS;
   2247    usedb = b->usedTS;
   2248 
   2249    ia = ib = 0;
   2250 
   2251    while (1) {
   2252 
   2253       /* This logic is to enumerate doubles (tyma, tymb) drawn
    2254          from a and b in order, where tyma/b are the relevant
   2255          scalar timestamps, taking into account implicit zeroes. */
   2256       ThrID thrid;
   2257 
   2258       tl_assert(ia >= 0 && ia <= useda);
   2259       tl_assert(ib >= 0 && ib <= usedb);
   2260 
   2261       if        (ia == useda && ib == usedb) {
   2262          /* both empty - done */
   2263          break;
   2264 
   2265       } else if (ia == useda && ib != usedb) {
   2266          /* a empty, use up b */
   2267          ScalarTS* tmpb = &b->ts[ib];
   2268          tyma  = 0;
   2269          tymb  = tmpb->tym;
   2270          thrid = tmpb->thrid;
   2271          ib++;
   2272 
   2273       } else if (ia != useda && ib == usedb) {
   2274          /* b empty, use up a */
   2275          ScalarTS* tmpa = &a->ts[ia];
   2276          tyma  = tmpa->tym;
   2277          thrid = tmpa->thrid;
   2278          tymb  = 0;
   2279          ia++;
   2280 
   2281       } else {
   2282          /* both not empty; extract lowest-ThrID'd triple */
   2283          ScalarTS* tmpa = &a->ts[ia];
   2284          ScalarTS* tmpb = &b->ts[ib];
   2285          if (tmpa->thrid < tmpb->thrid) {
   2286             /* a has the lowest unconsidered ThrID */
   2287             tyma  = tmpa->tym;
   2288             thrid = tmpa->thrid;
   2289             tymb  = 0;
   2290             ia++;
   2291          }
   2292          else
   2293          if (tmpa->thrid > tmpb->thrid) {
   2294             /* b has the lowest unconsidered ThrID */
   2295             tyma  = 0;
   2296             tymb  = tmpb->tym;
   2297             thrid = tmpb->thrid;
   2298             ib++;
   2299          } else {
   2300             /* they both next mention the same ThrID */
   2301             tl_assert(tmpa->thrid == tmpb->thrid);
   2302             tyma  = tmpa->tym;
   2303             thrid = tmpa->thrid;
   2304             tymb  = tmpb->tym;
   2305             ia++;
   2306             ib++;
   2307          }
   2308       }
   2309 
   2310       /* having laboriously determined (tyma, tymb), do something
   2311          useful with it. */
   2312       if (tyma > tymb) {
   2313          /* not LEQ at this index.  Quit, since the answer is
   2314             determined already. */
   2315          tl_assert(thrid >= 1024);
   2316          return thrid;
   2317       }
   2318    }
   2319 
   2320    return 0; /* all points are LEQ => return an invalid ThrID */
   2321 }
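
/* A minimal illustrative sketch (under #if 0, not compiled) of the
   return convention just described: zero means LEQ holds; otherwise
   the result is the ThrID at which it first fails.  Hypothetical
   helper, assuming Thr__to_ThrID(t1) < Thr__to_ThrID(t2). */
#if 0
static void example_vts_cmpLEQ ( Thr* t1, Thr* t2 )
{
   VTS* small = VTS__new( "example.cmpLEQ.small", 1 );
   VTS* big   = VTS__new( "example.cmpLEQ.big",   2 );
   small->usedTS = 1;
   small->ts[0].thrid = Thr__to_ThrID(t1);  small->ts[0].tym = 3;
   big->usedTS = 2;
   big->ts[0].thrid = Thr__to_ThrID(t1);    big->ts[0].tym = 3;
   big->ts[1].thrid = Thr__to_ThrID(t2);    big->ts[1].tym = 1;
   tl_assert( VTS__cmpLEQ(small, big) == 0 );   /* small <= big        */
   tl_assert( VTS__cmpLEQ(big, small)           /* fails at t2's entry */
              == Thr__to_ThrID(t2) );
   VTS__delete(small);  VTS__delete(big);
}
#endif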
   2322 
   2323 
   2324 /* Compute an arbitrary structural (total) ordering on the two args,
   2325    based on their VCs, so they can be looked up in a table, tree, etc.
   2326    Returns -1, 0 or 1.  (really just 'deriving Ord' :-) This can be
    2327    performance critical so there is some effort expended to make it as
   2328    fast as possible.
   2329 */
   2330 Word VTS__cmp_structural ( VTS* a, VTS* b )
   2331 {
   2332    /* We just need to generate an arbitrary total ordering based on
    2333       a->ts and b->ts.  Preferably do it in a way which detects likely
   2334       differences relatively quickly. */
   2335    Word     i;
   2336    Word     useda = 0,    usedb = 0;
   2337    ScalarTS *ctsa = NULL, *ctsb = NULL;
   2338 
   2339    stats__vts__cmp_structural++;
   2340 
   2341    tl_assert(a);
   2342    tl_assert(b);
   2343 
   2344    ctsa = &a->ts[0]; useda = a->usedTS;
   2345    ctsb = &b->ts[0]; usedb = b->usedTS;
   2346 
   2347    if (LIKELY(useda == usedb)) {
   2348       ScalarTS *tmpa = NULL, *tmpb = NULL;
   2349       stats__vts__cmp_structural_slow++;
   2350       /* Same length vectors.  Find the first difference, if any, as
   2351          fast as possible. */
   2352       for (i = 0; i < useda; i++) {
   2353          tmpa = &ctsa[i];
   2354          tmpb = &ctsb[i];
   2355          if (LIKELY(tmpa->tym == tmpb->tym
   2356                     && tmpa->thrid == tmpb->thrid))
   2357             continue;
   2358          else
   2359             break;
   2360       }
   2361       if (UNLIKELY(i == useda)) {
   2362          /* They're identical. */
   2363          return 0;
   2364       } else {
   2365          tl_assert(i >= 0 && i < useda);
   2366          if (tmpa->tym < tmpb->tym) return -1;
   2367          if (tmpa->tym > tmpb->tym) return 1;
   2368          if (tmpa->thrid < tmpb->thrid) return -1;
   2369          if (tmpa->thrid > tmpb->thrid) return 1;
   2370          /* we just established them as non-identical, hence: */
   2371       }
   2372       /*NOTREACHED*/
   2373       tl_assert(0);
   2374    }
   2375 
   2376    if (useda < usedb) return -1;
   2377    if (useda > usedb) return 1;
   2378    /*NOTREACHED*/
   2379    tl_assert(0);
   2380 }
   2381 
   2382 
   2383 /* Debugging only.  Display the given VTS in the buffer.
   2384 */
   2385 void VTS__show ( HChar* buf, Int nBuf, VTS* vts )
   2386 {
   2387    ScalarTS* st;
   2388    HChar     unit[64];
   2389    Word      i, n;
   2390    Int       avail = nBuf;
   2391    tl_assert(vts && vts->ts);
   2392    tl_assert(nBuf > 16);
   2393    buf[0] = '[';
   2394    buf[1] = 0;
   2395    n =  vts->usedTS;
   2396    for (i = 0; i < n; i++) {
   2397       tl_assert(avail >= 40);
   2398       st = &vts->ts[i];
   2399       VG_(memset)(unit, 0, sizeof(unit));
   2400       VG_(sprintf)(unit, i < n-1 ? "%u:%llu " : "%u:%llu",
   2401                          st->thrid, (ULong)st->tym);
   2402       if (avail < VG_(strlen)(unit) + 40/*let's say*/) {
   2403          VG_(strcat)(buf, " ...]");
   2404          buf[nBuf-1] = 0;
   2405          return;
   2406       }
   2407       VG_(strcat)(buf, unit);
   2408       avail -= VG_(strlen)(unit);
   2409    }
   2410    VG_(strcat)(buf, "]");
   2411    buf[nBuf-1] = 0;
   2412 }
   2413 
   2414 
   2415 /* Debugging only.  Return vts[index], so to speak.
   2416 */
   2417 ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx )
   2418 {
   2419    UWord i, n;
   2420    ThrID idx_thrid = Thr__to_ThrID(idx);
   2421    stats__vts__indexat_slow++;
   2422    tl_assert(vts && vts->ts);
   2423    n = vts->usedTS;
   2424    for (i = 0; i < n; i++) {
   2425       ScalarTS* st = &vts->ts[i];
   2426       if (st->thrid == idx_thrid)
   2427          return st->tym;
   2428    }
   2429    return 0;
   2430 }
   2431 
   2432 
   2433 /* See comment on prototype above.
   2434 */
   2435 static void VTS__declare_thread_very_dead ( Thr* thr )
   2436 {
   2437    if (0) VG_(printf)("VTQ:  tae %p\n", thr);
   2438 
   2439    tl_assert(thr->llexit_done);
   2440    tl_assert(thr->joinedwith_done);
   2441 
   2442    ThrID nyu;
   2443    nyu = Thr__to_ThrID(thr);
   2444    VG_(addToXA)( verydead_thread_table, &nyu );
   2445 
   2446    /* We can only get here if we're assured that we'll never again
   2447       need to look at this thread's ::viR or ::viW.  Set them to
   2448       VtsID_INVALID, partly so as to avoid holding on to the VTSs, but
   2449       mostly so that we don't wind up pruning them (as that would be
   2450       nonsensical: the only interesting ScalarTS entry for a dead
    2451       thread is its own index, and the pruning will remove that). */
   2452    VtsID__rcdec(thr->viR);
   2453    VtsID__rcdec(thr->viW);
   2454    thr->viR = VtsID_INVALID;
   2455    thr->viW = VtsID_INVALID;
   2456 }
   2457 
   2458 
   2459 /////////////////////////////////////////////////////////////////
   2460 /////////////////////////////////////////////////////////////////
   2461 //                                                             //
   2462 // SECTION END vts primitives                                  //
   2463 //                                                             //
   2464 /////////////////////////////////////////////////////////////////
   2465 /////////////////////////////////////////////////////////////////
   2466 
   2467 
   2468 
   2469 /////////////////////////////////////////////////////////////////
   2470 /////////////////////////////////////////////////////////////////
   2471 //                                                             //
   2472 // SECTION BEGIN main library                                  //
   2473 //                                                             //
   2474 /////////////////////////////////////////////////////////////////
   2475 /////////////////////////////////////////////////////////////////
   2476 
   2477 
   2478 /////////////////////////////////////////////////////////
   2479 //                                                     //
   2480 // VTS set                                             //
   2481 //                                                     //
   2482 /////////////////////////////////////////////////////////
   2483 
   2484 static WordFM* /* WordFM VTS* void */ vts_set = NULL;
   2485 
   2486 static void vts_set_init ( void )
   2487 {
   2488    tl_assert(!vts_set);
   2489    vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
   2490                          HG_(free),
   2491                          (Word(*)(UWord,UWord))VTS__cmp_structural );
   2492    tl_assert(vts_set);
   2493 }
   2494 
   2495 /* Given a VTS, look in vts_set to see if we already have a
   2496    structurally identical one.  If yes, return the pair (True, pointer
   2497    to the existing one).  If no, clone this one, add the clone to the
   2498    set, and return (False, pointer to the clone). */
   2499 static Bool vts_set__find__or__clone_and_add ( /*OUT*/VTS** res, VTS* cand )
   2500 {
   2501    UWord keyW, valW;
   2502    stats__vts_set__focaa++;
   2503    tl_assert(cand->id == VtsID_INVALID);
   2504    /* lookup cand (by value) */
   2505    if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
   2506       /* found it */
   2507       tl_assert(valW == 0);
   2508       /* if this fails, cand (by ref) was already present (!) */
   2509       tl_assert(keyW != (UWord)cand);
   2510       *res = (VTS*)keyW;
   2511       return True;
   2512    } else {
   2513       /* not present.  Clone, add and return address of clone. */
   2514       stats__vts_set__focaa_a++;
   2515       VTS* clone = VTS__clone( "libhb.vts_set_focaa.1", cand );
   2516       tl_assert(clone != cand);
   2517       VG_(addToFM)( vts_set, (UWord)clone, 0/*val is unused*/ );
   2518       *res = clone;
   2519       return False;
   2520    }
   2521 }
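
/* A minimal illustrative sketch (under #if 0, not compiled): the point
   of the set is hash-consing.  Presenting two structurally identical
   but separately allocated candidates yields the same stored VTS, and
   the second call reports that it was already present.  Hypothetical
   helper; both candidates must still have id == VtsID_INVALID. */
#if 0
static void example_vts_interning ( VTS* cand1, VTS* cand2 )
{
   VTS *p1, *p2;
   Bool had1 = vts_set__find__or__clone_and_add( &p1, cand1 );
   Bool had2 = vts_set__find__or__clone_and_add( &p2, cand2 );
   (void)had1;
   tl_assert(had2);        /* second lookup hits the interned copy */
   tl_assert(p1 == p2);    /* both callers now share one VTS       */
}
#endif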
   2522 
   2523 
   2524 /////////////////////////////////////////////////////////
   2525 //                                                     //
   2526 // VTS table                                           //
   2527 //                                                     //
   2528 /////////////////////////////////////////////////////////
   2529 
   2530 static void VtsID__invalidate_caches ( void ); /* fwds */
   2531 
   2532 /* A type to hold VTS table entries.  Invariants:
   2533    If .vts == NULL, then this entry is not in use, so:
   2534    - .rc == 0
   2535    - this entry is on the freelist (unfortunately, does not imply
   2536      any constraints on value for .freelink)
   2537    If .vts != NULL, then this entry is in use:
   2538    - .vts is findable in vts_set
   2539    - .vts->id == this entry number
   2540    - no specific value for .rc (even 0 is OK)
   2541    - this entry is not on freelist, so .freelink == VtsID_INVALID
   2542 */
   2543 typedef
   2544    struct {
   2545       VTS*  vts;      /* vts, in vts_set */
   2546       UWord rc;       /* reference count - enough for entire aspace */
   2547       VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
   2548       VtsID remap;    /* used only during pruning */
   2549    }
   2550    VtsTE;
   2551 
   2552 /* The VTS table. */
   2553 static XArray* /* of VtsTE */ vts_tab = NULL;
   2554 
   2555 /* An index into the VTS table, indicating the start of the list of
   2556    free (available for use) entries.  If the list is empty, this is
   2557    VtsID_INVALID. */
   2558 static VtsID vts_tab_freelist = VtsID_INVALID;
   2559 
   2560 /* Do a GC of vts_tab when the freelist becomes empty AND the size of
   2561    vts_tab equals or exceeds this size.  After GC, the value here is
   2562    set appropriately so as to check for the next GC point. */
   2563 static Word vts_next_GC_at = 1000;
   2564 
   2565 static void vts_tab_init ( void )
   2566 {
   2567    vts_tab
   2568       = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
   2569                     HG_(free), sizeof(VtsTE) );
   2570    vts_tab_freelist
   2571       = VtsID_INVALID;
   2572    tl_assert(vts_tab);
   2573 }
   2574 
   2575 /* Add ii to the free list, checking that it looks out-of-use. */
   2576 static void add_to_free_list ( VtsID ii )
   2577 {
   2578    VtsTE* ie = VG_(indexXA)( vts_tab, ii );
   2579    tl_assert(ie->vts == NULL);
   2580    tl_assert(ie->rc == 0);
   2581    tl_assert(ie->freelink == VtsID_INVALID);
   2582    ie->freelink = vts_tab_freelist;
   2583    vts_tab_freelist = ii;
   2584 }
   2585 
   2586 /* Get an entry from the free list.  This will return VtsID_INVALID if
   2587    the free list is empty. */
   2588 static VtsID get_from_free_list ( void )
   2589 {
   2590    VtsID  ii;
   2591    VtsTE* ie;
   2592    if (vts_tab_freelist == VtsID_INVALID)
   2593       return VtsID_INVALID;
   2594    ii = vts_tab_freelist;
   2595    ie = VG_(indexXA)( vts_tab, ii );
   2596    tl_assert(ie->vts == NULL);
   2597    tl_assert(ie->rc == 0);
   2598    vts_tab_freelist = ie->freelink;
   2599    return ii;
   2600 }
   2601 
   2602 /* Produce a new VtsID that can be used, either by getting it from
   2603    the freelist, or, if that is empty, by expanding vts_tab. */
   2604 static VtsID get_new_VtsID ( void )
   2605 {
   2606    VtsID ii;
   2607    VtsTE te;
   2608    ii = get_from_free_list();
   2609    if (ii != VtsID_INVALID)
   2610       return ii;
   2611    te.vts = NULL;
   2612    te.rc = 0;
   2613    te.freelink = VtsID_INVALID;
   2614    te.remap    = VtsID_INVALID;
   2615    ii = (VtsID)VG_(addToXA)( vts_tab, &te );
   2616    return ii;
   2617 }
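
/* A minimal illustrative sketch (under #if 0, not compiled): a fresh
   VtsID, whether recycled from the freelist or newly appended, comes
   back in the 'not in use' state (no VTS attached, zero refcount).
   The caller is expected to fill in .vts and reset .freelink, as
   vts_tab__find__or__clone_and_add does below.  Hypothetical helper. */
#if 0
static void example_get_new_VtsID ( void )
{
   VtsID  ii = get_new_VtsID();
   VtsTE* te = VG_(indexXA)( vts_tab, ii );
   tl_assert(te->vts == NULL);
   tl_assert(te->rc == 0);
}
#endif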
   2618 
   2619 
   2620 /* Indirect callback from lib_zsm. */
   2621 static void VtsID__rcinc ( VtsID ii )
   2622 {
   2623    VtsTE* ie;
   2624    /* VG_(indexXA) does a range check for us */
   2625    ie = VG_(indexXA)( vts_tab, ii );
   2626    tl_assert(ie->vts); /* else it's not in use */
   2627    tl_assert(ie->rc < ~0UL); /* else we can't continue */
   2628    tl_assert(ie->vts->id == ii);
   2629    ie->rc++;
   2630 }
   2631 
   2632 /* Indirect callback from lib_zsm. */
   2633 static void VtsID__rcdec ( VtsID ii )
   2634 {
   2635    VtsTE* ie;
   2636    /* VG_(indexXA) does a range check for us */
   2637    ie = VG_(indexXA)( vts_tab, ii );
   2638    tl_assert(ie->vts); /* else it's not in use */
   2639    tl_assert(ie->rc > 0); /* else RC snafu */
   2640    tl_assert(ie->vts->id == ii);
   2641    ie->rc--;
   2642 }
   2643 
   2644 
   2645 /* Look up 'cand' in our collection of VTSs.  If present, return the
   2646    VtsID for the pre-existing version.  If not present, clone it, add
   2647    the clone to both vts_tab and vts_set, allocate a fresh VtsID for
   2648    it, and return that. */
   2649 static VtsID vts_tab__find__or__clone_and_add ( VTS* cand )
   2650 {
   2651    VTS* in_tab = NULL;
   2652    tl_assert(cand->id == VtsID_INVALID);
   2653    Bool already_have = vts_set__find__or__clone_and_add( &in_tab, cand );
   2654    tl_assert(in_tab);
   2655    if (already_have) {
   2656       /* We already have a copy of 'cand'.  Use that. */
   2657       VtsTE* ie;
   2658       tl_assert(in_tab->id != VtsID_INVALID);
   2659       ie = VG_(indexXA)( vts_tab, in_tab->id );
   2660       tl_assert(ie->vts == in_tab);
   2661       return in_tab->id;
   2662    } else {
   2663       VtsID  ii = get_new_VtsID();
   2664       VtsTE* ie = VG_(indexXA)( vts_tab, ii );
   2665       ie->vts = in_tab;
   2666       ie->rc = 0;
   2667       ie->freelink = VtsID_INVALID;
   2668       in_tab->id = ii;
   2669       return ii;
   2670    }
   2671 }
   2672 
   2673 
   2674 static void show_vts_stats ( HChar* caller )
   2675 {
   2676    UWord nSet, nTab, nLive;
   2677    ULong totrc;
   2678    UWord n, i;
   2679    nSet = VG_(sizeFM)( vts_set );
   2680    nTab = VG_(sizeXA)( vts_tab );
   2681    totrc = 0;
   2682    nLive = 0;
   2683    n = VG_(sizeXA)( vts_tab );
   2684    for (i = 0; i < n; i++) {
   2685       VtsTE* ie = VG_(indexXA)( vts_tab, i );
   2686       if (ie->vts) {
   2687          nLive++;
   2688          totrc += (ULong)ie->rc;
   2689       } else {
   2690          tl_assert(ie->rc == 0);
   2691       }
   2692    }
   2693    VG_(printf)("  show_vts_stats %s\n", caller);
   2694    VG_(printf)("    vts_tab size %4lu\n", nTab);
   2695    VG_(printf)("    vts_tab live %4lu\n", nLive);
   2696    VG_(printf)("    vts_set size %4lu\n", nSet);
   2697    VG_(printf)("        total rc %4llu\n", totrc);
   2698 }
   2699 
   2700 
   2701 /* --- Helpers for VtsID pruning --- */
   2702 
   2703 static
   2704 void remap_VtsID ( /*MOD*/XArray* /* of VtsTE */ old_tab,
   2705                    /*MOD*/XArray* /* of VtsTE */ new_tab,
   2706                    VtsID* ii )
   2707 {
   2708    VtsTE *old_te, *new_te;
   2709    VtsID old_id, new_id;
   2710    /* We're relying here on VG_(indexXA)'s range checking to assert on
   2711       any stupid values, in particular *ii == VtsID_INVALID. */
   2712    old_id = *ii;
   2713    old_te = VG_(indexXA)( old_tab, old_id );
   2714    old_te->rc--;
   2715    new_id = old_te->remap;
   2716    new_te = VG_(indexXA)( new_tab, new_id );
   2717    new_te->rc++;
   2718    *ii = new_id;
   2719 }
   2720 
   2721 static
   2722 void remap_VtsIDs_in_SVal ( /*MOD*/XArray* /* of VtsTE */ old_tab,
   2723                             /*MOD*/XArray* /* of VtsTE */ new_tab,
   2724                             SVal* s )
   2725 {
   2726    SVal old_sv, new_sv;
   2727    old_sv = *s;
   2728    if (SVal__isC(old_sv)) {
   2729       VtsID rMin, wMin;
   2730       rMin = SVal__unC_Rmin(old_sv);
   2731       wMin = SVal__unC_Wmin(old_sv);
   2732       remap_VtsID( old_tab, new_tab, &rMin );
   2733       remap_VtsID( old_tab, new_tab, &wMin );
   2734       new_sv = SVal__mkC( rMin, wMin );
   2735       *s = new_sv;
   2736    }
   2737 }
   2738 
   2739 
   2740 /* NOT TO BE CALLED FROM WITHIN libzsm. */
   2741 __attribute__((noinline))
   2742 static void vts_tab__do_GC ( Bool show_stats )
   2743 {
   2744    UWord i, nTab, nLive, nFreed;
   2745 
   2746    /* ---------- BEGIN VTS GC ---------- */
   2747    /* check this is actually necessary. */
   2748    tl_assert(vts_tab_freelist == VtsID_INVALID);
   2749 
   2750    /* empty the caches for partial order checks and binary joins.  We
   2751       could do better and prune out the entries to be deleted, but it
   2752       ain't worth the hassle. */
   2753    VtsID__invalidate_caches();
   2754 
   2755    /* First, make the reference counts up to date. */
   2756    zsm_flush_cache();
   2757 
   2758    nTab = VG_(sizeXA)( vts_tab );
   2759 
   2760    if (show_stats) {
   2761       VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
   2762       show_vts_stats("before GC");
   2763    }
   2764 
   2765    /* Now we can inspect the entire vts_tab.  Any entries with zero
   2766       .rc fields are now no longer in use and can be put back on the
   2767       free list, removed from vts_set, and deleted. */
   2768    nFreed = 0;
   2769    for (i = 0; i < nTab; i++) {
   2770       Bool present;
   2771       UWord oldK = 0, oldV = 12345;
   2772       VtsTE* te = VG_(indexXA)( vts_tab, i );
   2773       if (te->vts == NULL) {
   2774          tl_assert(te->rc == 0);
   2775          continue; /* already on the free list (presumably) */
   2776       }
   2777       if (te->rc > 0)
   2778          continue; /* in use */
   2779       /* Ok, we got one we can free. */
   2780       tl_assert(te->vts->id == i);
   2781       /* first, remove it from vts_set. */
   2782       present = VG_(delFromFM)( vts_set,
   2783                                 &oldK, &oldV, (UWord)te->vts );
   2784       tl_assert(present); /* else it isn't in vts_set ?! */
   2785       tl_assert(oldV == 0); /* no info stored in vts_set val fields */
   2786       tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
   2787       /* now free the VTS itself */
   2788       VTS__delete(te->vts);
   2789       te->vts = NULL;
   2790       /* and finally put this entry on the free list */
   2791       tl_assert(te->freelink == VtsID_INVALID); /* can't already be on it */
   2792       add_to_free_list( i );
   2793       nFreed++;
   2794    }
   2795 
   2796    /* Now figure out when the next GC should be.  We'll allow the
   2797       number of VTSs to double before GCing again.  Except of course
   2798       that since we can't (or, at least, don't) shrink vts_tab, we
   2799       can't set the threshold value smaller than it. */
   2800    tl_assert(nFreed <= nTab);
   2801    nLive = nTab - nFreed;
   2802    tl_assert(nLive >= 0 && nLive <= nTab);
   2803    vts_next_GC_at = 2 * nLive;
   2804    if (vts_next_GC_at < nTab)
   2805       vts_next_GC_at = nTab;
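   /* Worked example: if nTab is 1000 and 400 entries were freed, then
      nLive is 600 and the next GC happens when vts_tab reaches 1200.
      If instead 800 entries were freed, 2 * nLive would be only 400,
      which is clamped up to the current table size of 1000. */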
   2806 
   2807    if (show_stats) {
   2808       show_vts_stats("after GC");
   2809       VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
   2810    }
   2811 
   2812    if (VG_(clo_stats)) {
   2813       static UInt ctr = 1;
   2814       tl_assert(nTab > 0);
   2815       VG_(message)(Vg_DebugMsg,
   2816                   "libhb: VTS GC: #%u  old size %lu  live %lu  (%2llu%%)\n",
   2817                   ctr++, nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
   2818    }
   2819    /* ---------- END VTS GC ---------- */
   2820 
   2821    /* Decide whether to do VTS pruning.  We have one of three
   2822       settings. */
   2823    static UInt pruning_auto_ctr = 0; /* do not make non-static */
   2824 
   2825    Bool do_pruning = False;
   2826    switch (HG_(clo_vts_pruning)) {
   2827       case 0: /* never */
   2828          break;
   2829       case 1: /* auto */
   2830          do_pruning = (++pruning_auto_ctr % 5) == 0;
   2831          break;
   2832       case 2: /* always */
   2833          do_pruning = True;
   2834          break;
   2835       default:
   2836          tl_assert(0);
   2837    }
   2838 
   2839    /* The rest of this routine only handles pruning, so we can
   2840       quit at this point if it is not to be done. */
   2841    if (!do_pruning)
   2842       return;
   2843 
   2844    /* ---------- BEGIN VTS PRUNING ---------- */
   2845    /* We begin by sorting the backing table on its ThrID values, so as
   2846       to (1) check they are unique [else something has gone wrong,
   2847       since it means we must have seen some Thr* exiting more than
   2848       once, which can't happen], and (2) so that we can quickly look
   2849       up the dead-thread entries as we work through the VTSs. */
   2850    VG_(sortXA)( verydead_thread_table );
   2851    /* Sanity check: the ThrID entries must be unique. */
   2852    UWord nBT = VG_(sizeXA)( verydead_thread_table );
   2853    if (nBT > 0) {
   2854       ThrID thrid1, thrid2;
   2855       thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, 0 );
   2856       for (i = 1; i < nBT; i++) {
   2857          thrid1 = thrid2;
   2858          thrid2 = *(ThrID*)VG_(indexXA)( verydead_thread_table, i );
   2859          tl_assert(thrid1 < thrid2);
   2860       }
   2861    }
   2862    /* Ok, so the dead thread table has unique and in-order keys. */
   2863 
   2864    /* We will run through the old table, and create a new table and
   2865       set, at the same time setting the .remap entries in the old
   2866       table to point to the new entries.  Then, visit every VtsID in
   2867       the system, and replace all of them with new ones, using the
   2868       .remap entries in the old table.  Finally, we can delete the old
   2869       table and set. */
   2870 
   2871    XArray* /* of VtsTE */ new_tab
   2872       = VG_(newXA)( HG_(zalloc), "libhb.vts_tab__do_GC.new_tab",
   2873                     HG_(free), sizeof(VtsTE) );
   2874 
   2875    /* WordFM VTS* void */
   2876    WordFM* new_set
   2877       = VG_(newFM)( HG_(zalloc), "libhb.vts_tab__do_GC.new_set",
   2878                     HG_(free),
   2879                     (Word(*)(UWord,UWord))VTS__cmp_structural );
   2880 
   2881    /* Visit each old VTS.  For each one:
   2882 
   2883       * make a pruned version
   2884 
   2885       * search new_set for the pruned version, yielding either
   2886         Nothing (not present) or the new VtsID for it.
   2887 
   2888       * if not present, allocate a new VtsID for it, insert (pruned
   2889         VTS, new VtsID) in the tree, and set
   2890         remap_table[old VtsID] = new VtsID.
   2891 
   2892       * if present, set remap_table[old VtsID] = new VtsID, where
   2893         new VtsID was determined by the tree lookup.  Then free up
   2894         the clone.
   2895    */
   2896 
   2897    UWord nBeforePruning = 0, nAfterPruning = 0;
   2898    UWord nSTSsBefore = 0, nSTSsAfter = 0;
   2899    VtsID new_VtsID_ctr = 0;
   2900 
   2901    for (i = 0; i < nTab; i++) {
   2902 
   2903       /* For each old VTS .. */
   2904       VtsTE* old_te  = VG_(indexXA)( vts_tab, i );
   2905       VTS*   old_vts = old_te->vts;
   2906       tl_assert(old_te->remap == VtsID_INVALID);
   2907 
   2908       /* Skip it if not in use */
   2909       if (old_te->rc == 0) {
   2910          tl_assert(old_vts == NULL);
   2911          continue;
   2912       }
   2913       tl_assert(old_vts != NULL);
   2914       tl_assert(old_vts->id == i);
   2915       tl_assert(old_vts->ts != NULL);
   2916 
   2917       /* It is in use. Make a pruned version. */
   2918       nBeforePruning++;
   2919       nSTSsBefore += old_vts->usedTS;
   2920       VTS* new_vts = VTS__subtract("libhb.vts_tab__do_GC.new_vts",
   2921                                    old_vts, verydead_thread_table);
   2922       tl_assert(new_vts->sizeTS == new_vts->usedTS);
   2923       tl_assert(*(ULong*)(&new_vts->ts[new_vts->usedTS])
   2924                 == 0x0ddC0ffeeBadF00dULL);
   2925 
   2926       /* Get rid of the old VTS and the tree entry.  It's a bit more
   2927          complex to incrementally delete the VTSs now than to nuke
   2928          them all after we're done, but the upside is that we don't
   2929          wind up temporarily storing potentially two complete copies
   2930          of each VTS and hence spiking memory use. */
   2931       UWord oldK = 0, oldV = 12345;
   2932       Bool  present = VG_(delFromFM)( vts_set,
   2933                                       &oldK, &oldV, (UWord)old_vts );
   2934       tl_assert(present); /* else it isn't in vts_set ?! */
   2935       tl_assert(oldV == 0); /* no info stored in vts_set val fields */
   2936       tl_assert(oldK == (UWord)old_vts); /* else what did delFromFM find?! */
   2937       /* now free the VTS itself */
   2938       VTS__delete(old_vts);
   2939       old_te->vts = NULL;
   2940       old_vts = NULL;
   2941 
   2942       /* NO MENTIONS of old_vts allowed beyond this point. */
   2943 
   2944       /* Ok, we have the pruned copy in new_vts.  See if a
   2945          structurally identical version is already present in new_set.
   2946          If so, delete the one we just made and move on; if not, add
   2947          it. */
   2948       VTS*  identical_version = NULL;
   2949       UWord valW = 12345;
   2950       if (VG_(lookupFM)(new_set, (UWord*)&identical_version, &valW,
   2951                         (UWord)new_vts)) {
   2952          // already have it
   2953          tl_assert(valW == 0);
   2954          tl_assert(identical_version != NULL);
   2955          tl_assert(identical_version != new_vts);
   2956          VTS__delete(new_vts);
   2957          new_vts = identical_version;
   2958          tl_assert(new_vts->id != VtsID_INVALID);
   2959       } else {
   2960          tl_assert(valW == 12345);
   2961          tl_assert(identical_version == NULL);
   2962          new_vts->id = new_VtsID_ctr++;
   2963          Bool b = VG_(addToFM)(new_set, (UWord)new_vts, 0);
   2964          tl_assert(!b);
   2965          VtsTE new_te;
   2966          new_te.vts      = new_vts;
   2967          new_te.rc       = 0;
   2968          new_te.freelink = VtsID_INVALID;
   2969          new_te.remap    = VtsID_INVALID;
   2970          Word j = VG_(addToXA)( new_tab, &new_te );
   2971          tl_assert(j <= i);
   2972          tl_assert(j == new_VtsID_ctr - 1);
   2973          // stats
   2974          nAfterPruning++;
   2975          nSTSsAfter += new_vts->usedTS;
   2976       }
   2977       old_te->remap = new_vts->id;
   2978 
   2979    } /* for (i = 0; i < nTab; i++) */
   2980 
   2981    /* At this point, we have:
   2982       * the old VTS table, with its .remap entries set,
   2983         and with all .vts == NULL.
   2984       * the old VTS tree should be empty, since it and the old VTSs
   2985         it contained have been incrementally deleted as we worked
   2986         through the old table.
   2987       * the new VTS table, with all .rc == 0, all .freelink and .remap
   2988         == VtsID_INVALID.
   2989       * the new VTS tree.
   2990    */
   2991    tl_assert( VG_(sizeFM)(vts_set) == 0 );
   2992 
   2993    /* Now actually apply the mapping. */
   2994    /* Visit all the VtsIDs in the entire system.  Where do we expect
   2995       to find them?
   2996       (a) in shadow memory -- the LineZs and LineFs
   2997       (b) in our collection of struct _Thrs.
   2998       (c) in our collection of struct _SOs.
   2999       Nowhere else, AFAICS.  Not in the zsm cache, because that just
   3000       got invalidated.
   3001 
   3002       Using the .remap fields in vts_tab, map each old VtsID to a new
   3003       VtsID.  For each old VtsID, dec its rc; and for each new one,
   3004       inc it.  This sets up the new refcounts, and it also gives a
   3005       cheap sanity check of the old ones: all old refcounts should be
   3006       zero after this operation.
   3007    */
   3008 
   3009    /* Do the mappings for (a) above: iterate over the Primary shadow
   3010       mem map (WordFM Addr SecMap*). */
   3011    UWord secmapW = 0;
   3012    VG_(initIterFM)( map_shmem );
   3013    while (VG_(nextIterFM)( map_shmem, NULL, &secmapW )) {
   3014       UWord   j;
   3015       SecMap* sm = (SecMap*)secmapW;
   3016       tl_assert(sm->magic == SecMap_MAGIC);
   3017       /* Deal with the LineZs */
   3018       for (i = 0; i < N_SECMAP_ZLINES; i++) {
   3019          LineZ* lineZ = &sm->linesZ[i];
   3020          if (lineZ->dict[0] == SVal_INVALID)
   3021             continue; /* not in use -- data is in F rep instead */
   3022          for (j = 0; j < 4; j++)
   3023             remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineZ->dict[j]);
   3024       }
   3025       /* Deal with the LineFs */
   3026       for (i = 0; i < sm->linesF_size; i++) {
   3027          LineF* lineF = &sm->linesF[i];
   3028          if (!lineF->inUse)
   3029             continue;
   3030          for (j = 0; j < N_LINE_ARANGE; j++)
   3031             remap_VtsIDs_in_SVal(vts_tab, new_tab, &lineF->w64s[j]);
   3032       }
   3033    }
   3034    VG_(doneIterFM)( map_shmem );
   3035 
   3036    /* Do the mappings for (b) above: visit our collection of struct
   3037       _Thrs. */
   3038    Thread* hgthread = get_admin_threads();
   3039    tl_assert(hgthread);
   3040    while (hgthread) {
   3041       Thr* hbthr = hgthread->hbthr;
   3042       tl_assert(hbthr);
   3043       /* Threads that are listed in the prunable set have their viR
   3044          and viW set to VtsID_INVALID, so we can't mess with them. */
   3045       if (hbthr->llexit_done && hbthr->joinedwith_done) {
   3046          tl_assert(hbthr->viR == VtsID_INVALID);
   3047          tl_assert(hbthr->viW == VtsID_INVALID);
   3048          hgthread = hgthread->admin;
   3049          continue;
   3050       }
   3051       remap_VtsID( vts_tab, new_tab, &hbthr->viR );
   3052       remap_VtsID( vts_tab, new_tab, &hbthr->viW );
   3053       hgthread = hgthread->admin;
   3054    }
   3055 
   3056    /* Do the mappings for (c) above: visit the struct _SOs. */
   3057    SO* so = admin_SO;
   3058    while (so) {
   3059       if (so->viR != VtsID_INVALID)
   3060          remap_VtsID( vts_tab, new_tab, &so->viR );
   3061       if (so->viW != VtsID_INVALID)
   3062          remap_VtsID( vts_tab, new_tab, &so->viW );
   3063       so = so->admin_next;
   3064    }
   3065 
   3066    /* So, we're nearly done (with this incredibly complex operation).
   3067       Check that the refcounts for the old VtsIDs have all fallen to
   3068       zero, as expected.  Any failure is serious. */
   3069    for (i = 0; i < nTab; i++) {
   3070       VtsTE* te = VG_(indexXA)( vts_tab, i );
   3071       tl_assert(te->vts == NULL);
   3072       /* This is the assert proper.  Note we're also asserting
   3073          zeroness for old entries which are unmapped (hence have
   3074          .remap == VtsID_INVALID).  That's OK. */
   3075       tl_assert(te->rc == 0);
   3076    }
   3077 
   3078    /* Install the new table and set. */
   3079    VG_(deleteFM)(vts_set, NULL/*kFin*/, NULL/*vFin*/);
   3080    vts_set = new_set;
   3081    VG_(deleteXA)( vts_tab );
   3082    vts_tab = new_tab;
   3083 
   3084    /* The freelist of vts_tab entries is empty now, because we've
   3085       compacted all of the live entries at the low end of the
   3086       table. */
   3087    vts_tab_freelist = VtsID_INVALID;
   3088 
   3089    /* Sanity check vts_set and vts_tab. */
   3090 
   3091    /* Because all the live entries got slid down to the bottom of vts_tab: */
   3092    tl_assert( VG_(sizeXA)( vts_tab ) == VG_(sizeFM)( vts_set ));
   3093 
   3094    /* Assert that the vts_tab and vts_set entries point at each other
   3095       in the required way */
   3096    UWord wordK = 0, wordV = 0;
   3097    VG_(initIterFM)( vts_set );
   3098    while (VG_(nextIterFM)( vts_set, &wordK, &wordV )) {
   3099       tl_assert(wordK != 0);
   3100       tl_assert(wordV == 0);
   3101       VTS* vts = (VTS*)wordK;
   3102       tl_assert(vts->id != VtsID_INVALID);
   3103       VtsTE* te = VG_(indexXA)( vts_tab, vts->id );
   3104       tl_assert(te->vts == vts);
   3105    }
   3106    VG_(doneIterFM)( vts_set );
   3107 
   3108    /* Also iterate over the table, and check each entry is
   3109       plausible. */
   3110    nTab = VG_(sizeXA)( vts_tab );
   3111    for (i = 0; i < nTab; i++) {
   3112       VtsTE* te = VG_(indexXA)( vts_tab, i );
   3113       tl_assert(te->vts);
   3114       tl_assert(te->vts->id == i);
   3115       tl_assert(te->rc > 0); /* 'cos we just GC'd */
   3116       tl_assert(te->freelink == VtsID_INVALID); /* in use */
   3117       tl_assert(te->remap == VtsID_INVALID); /* not relevant */
   3118    }
   3119 
   3120    /* And we're done.  Bwahahaha. Ha. Ha. Ha. */
   3121    if (VG_(clo_stats)) {
   3122       static UInt ctr = 1;
   3123       tl_assert(nTab > 0);
   3124       VG_(message)(
   3125          Vg_DebugMsg,
   3126          "libhb: VTS PR: #%u  before %lu (avg sz %lu)  "
   3127             "after %lu (avg sz %lu)\n",
   3128          ctr++,
   3129          nBeforePruning, nSTSsBefore / (nBeforePruning ? nBeforePruning : 1),
   3130          nAfterPruning, nSTSsAfter / (nAfterPruning ? nAfterPruning : 1)
   3131       );
   3132    }
   3133    if (0)
   3134    VG_(printf)("VTQ: before pruning %lu (avg sz %lu), "
   3135                "after pruning %lu (avg sz %lu)\n",
   3136                nBeforePruning, nSTSsBefore / nBeforePruning,
   3137                nAfterPruning, nSTSsAfter / nAfterPruning);
   3138    /* ---------- END VTS PRUNING ---------- */
   3139 }
   3140 
   3141 
   3142 /////////////////////////////////////////////////////////
   3143 //                                                     //
   3144 // Vts IDs                                             //
   3145 //                                                     //
   3146 /////////////////////////////////////////////////////////
   3147 
   3148 //////////////////////////
   3149 /* A temporary, max-sized VTS which is used as scratch space (the first
   3150    argument) in VTS__singleton, VTS__tick and VTS__join operations. */
   3151 static VTS* temp_max_sized_VTS = NULL;
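/* An informal sketch of the usual pattern (it is what VtsID__mk_Singleton,
   VtsID__tick and VtsID__join2_WRK below do):

      temp_max_sized_VTS->usedTS = 0;              // reset the scratch VTS
      VTS__join( temp_max_sized_VTS, vts1, vts2 ); // build the result into it
      return vts_tab__find__or__clone_and_add( temp_max_sized_VTS );
*/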
   3152 
   3153 //////////////////////////
   3154 static ULong stats__cmpLEQ_queries = 0;
   3155 static ULong stats__cmpLEQ_misses  = 0;
   3156 static ULong stats__join2_queries  = 0;
   3157 static ULong stats__join2_misses   = 0;
   3158 
   3159 static inline UInt ROL32 ( UInt w, Int n ) {
   3160    w = (w << n) | (w >> (32-n));
   3161    return w;
   3162 }
   3163 static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
   3164    UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
   3165    return hash % nTab;
   3166 }
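/* For example, with nTab = 1023 (the size of the caches below),
   hash_VtsIDs(3, 5, 1023) is ROL32(3,19) ^ ROL32(5,13)
   = 0x180000 ^ 0xA000 = 0x18A000, and 0x18A000 % 1023 == 553. */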
   3167 
   3168 #define N_CMPLEQ_CACHE 1023
   3169 static
   3170    struct { VtsID vi1; VtsID vi2; Bool leq; }
   3171    cmpLEQ_cache[N_CMPLEQ_CACHE];
   3172 
   3173 #define N_JOIN2_CACHE 1023
   3174 static
   3175    struct { VtsID vi1; VtsID vi2; VtsID res; }
   3176    join2_cache[N_JOIN2_CACHE];
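/* Both caches are direct-mapped and keyed on a hash of the (vi1,vi2)
   pair.  They have to be flushed (VtsID__invalidate_caches) whenever
   vts_tab is GC'd or pruned, since VtsIDs may afterwards be reused with
   different meanings. */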
   3177 
   3178 static void VtsID__invalidate_caches ( void ) {
   3179    Int i;
   3180    for (i = 0; i < N_CMPLEQ_CACHE; i++) {
   3181       cmpLEQ_cache[i].vi1 = VtsID_INVALID;
   3182       cmpLEQ_cache[i].vi2 = VtsID_INVALID;
   3183       cmpLEQ_cache[i].leq = False;
   3184    }
   3185    for (i = 0; i < N_JOIN2_CACHE; i++) {
   3186      join2_cache[i].vi1 = VtsID_INVALID;
   3187      join2_cache[i].vi2 = VtsID_INVALID;
   3188      join2_cache[i].res = VtsID_INVALID;
   3189    }
   3190 }
   3191 //////////////////////////
   3192 
   3193 //static Bool VtsID__is_valid ( VtsID vi ) {
   3194 //   VtsTE* ve;
   3195 //   if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
   3196 //      return False;
   3197 //   ve = VG_(indexXA)( vts_tab, vi );
   3198 //   if (!ve->vts)
   3199 //      return False;
   3200 //   tl_assert(ve->vts->id == vi);
   3201 //   return True;
   3202 //}
   3203 
   3204 static VTS* VtsID__to_VTS ( VtsID vi ) {
   3205    VtsTE* te = VG_(indexXA)( vts_tab, vi );
   3206    tl_assert(te->vts);
   3207    return te->vts;
   3208 }
   3209 
   3210 static void VtsID__pp ( VtsID vi ) {
   3211    HChar buf[100];
   3212    VTS* vts = VtsID__to_VTS(vi);
   3213    VTS__show( buf, sizeof(buf)-1, vts );
   3214    buf[sizeof(buf)-1] = 0;
   3215    VG_(printf)("%s", buf);
   3216 }
   3217 
   3218 /* compute partial ordering relation of vi1 and vi2. */
   3219 __attribute__((noinline))
   3220 static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
   3221    UInt hash;
   3222    Bool leq;
   3223    VTS  *v1, *v2;
   3224    //if (vi1 == vi2) return True;
   3225    tl_assert(vi1 != vi2);
   3226    ////++
   3227    stats__cmpLEQ_queries++;
   3228    hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
   3229    if (cmpLEQ_cache[hash].vi1 == vi1
   3230        && cmpLEQ_cache[hash].vi2 == vi2)
   3231       return cmpLEQ_cache[hash].leq;
   3232    stats__cmpLEQ_misses++;
   3233    ////--
   3234    v1  = VtsID__to_VTS(vi1);
   3235    v2  = VtsID__to_VTS(vi2);
   3236    leq = VTS__cmpLEQ( v1, v2 ) == 0;
   3237    ////++
   3238    cmpLEQ_cache[hash].vi1 = vi1;
   3239    cmpLEQ_cache[hash].vi2 = vi2;
   3240    cmpLEQ_cache[hash].leq = leq;
   3241    ////--
   3242    return leq;
   3243 }
   3244 static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
   3245    return LIKELY(vi1 == vi2)  ? True  : VtsID__cmpLEQ_WRK(vi1, vi2);
   3246 }
   3247 
   3248 /* compute binary join */
   3249 __attribute__((noinline))
   3250 static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
   3251    UInt  hash;
   3252    VtsID res;
   3253    VTS   *vts1, *vts2;
   3254    //if (vi1 == vi2) return vi1;
   3255    tl_assert(vi1 != vi2);
   3256    ////++
   3257    stats__join2_queries++;
   3258    hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
   3259    if (join2_cache[hash].vi1 == vi1
   3260        && join2_cache[hash].vi2 == vi2)
   3261       return join2_cache[hash].res;
   3262    stats__join2_misses++;
   3263    ////--
   3264    vts1 = VtsID__to_VTS(vi1);
   3265    vts2 = VtsID__to_VTS(vi2);
   3266    temp_max_sized_VTS->usedTS = 0;
   3267    VTS__join(temp_max_sized_VTS, vts1,vts2);
   3268    res = vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
   3269    ////++
   3270    join2_cache[hash].vi1 = vi1;
   3271    join2_cache[hash].vi2 = vi2;
   3272    join2_cache[hash].res = res;
   3273    ////--
   3274    return res;
   3275 }
   3276 static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
   3277    return LIKELY(vi1 == vi2)  ? vi1  : VtsID__join2_WRK(vi1, vi2);
   3278 }
   3279 
   3280 /* create a singleton VTS, namely [thr:1] */
   3281 static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
   3282    temp_max_sized_VTS->usedTS = 0;
   3283    VTS__singleton(temp_max_sized_VTS, thr,tym);
   3284    return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
   3285 }
   3286 
   3287 /* tick operation, creates value 1 if specified index is absent */
   3288 static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
   3289    VTS* vts = VtsID__to_VTS(vi);
   3290    temp_max_sized_VTS->usedTS = 0;
   3291    VTS__tick(temp_max_sized_VTS, idx,vts);
   3292    return vts_tab__find__or__clone_and_add(temp_max_sized_VTS);
   3293 }
   3294 
   3295 /* index into a VTS (only for assertions) */
   3296 static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
   3297    VTS* vts = VtsID__to_VTS(vi);
   3298    return VTS__indexAt_SLOW( vts, idx );
   3299 }
   3300 
   3301 /* Assuming that !cmpLEQ(vi1, vi2), find the index of the first (or
   3302    any, really) element in vi1 which is pointwise greater-than the
   3303    corresponding element in vi2.  If no such element exists, return
   3304    NULL.  This needs to be fairly quick since it is called every time
   3305    a race is detected. */
   3306 static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
   3307 {
   3308    VTS  *vts1, *vts2;
   3309    Thr*  diffthr;
   3310    ThrID diffthrid;
   3311    tl_assert(vi1 != vi2);
   3312    vts1 = VtsID__to_VTS(vi1);
   3313    vts2 = VtsID__to_VTS(vi2);
   3314    tl_assert(vts1 != vts2);
   3315    diffthrid = VTS__cmpLEQ(vts1, vts2);
   3316    diffthr = Thr__from_ThrID(diffthrid);
   3317    tl_assert(diffthr); /* else they are LEQ ! */
   3318    return diffthr;
   3319 }
   3320 
   3321 
   3322 /////////////////////////////////////////////////////////
   3323 //                                                     //
   3324 // Filters                                             //
   3325 //                                                     //
   3326 /////////////////////////////////////////////////////////
   3327 
   3328 /* Forget everything we know -- clear the filter and let everything
   3329    through.  This needs to be as fast as possible, since it is called
   3330    every time the running thread changes, and every time a thread's
   3331    vector clocks change, which can be quite frequent.  The obvious
   3332    fast way to do this is simply to stuff in tags which we know are
   3333    not going to match anything, since they're not aligned to the start
   3334    of a line. */
   3335 static void Filter__clear ( Filter* fi, HChar* who )
   3336 {
   3337    UWord i;
   3338    if (0) VG_(printf)("  Filter__clear(%p, %s)\n", fi, who);
   3339    for (i = 0; i < FI_NUM_LINES; i += 8) {
   3340       fi->tags[i+0] = 1; /* impossible value -- cannot match */
   3341       fi->tags[i+1] = 1;
   3342       fi->tags[i+2] = 1;
   3343       fi->tags[i+3] = 1;
   3344       fi->tags[i+4] = 1;
   3345       fi->tags[i+5] = 1;
   3346       fi->tags[i+6] = 1;
   3347       fi->tags[i+7] = 1;
   3348    }
   3349    tl_assert(i == FI_NUM_LINES);
   3350 }
   3351 
   3352 /* Clearing an arbitrary range in the filter.  Unfortunately
   3353    we have to do this due to core-supplied new/die-mem events. */
   3354 
   3355 static void Filter__clear_1byte ( Filter* fi, Addr a )
   3356 {
   3357    Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   3358    UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   3359    FiLine* line   = &fi->lines[lineno];
   3360    UWord   loff   = (a - atag) / 8;
   3361    UShort  mask   = 0x3 << (2 * (a & 7));
   3362    /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
   3363    if (LIKELY( fi->tags[lineno] == atag )) {
   3364       /* hit.  clear the bits. */
   3365       UShort  u16  = line->u16s[loff];
   3366       line->u16s[loff] = u16 & ~mask; /* clear them */
   3367    } else {
   3368       /* miss.  The filter doesn't hold this address, so ignore. */
   3369    }
   3370 }
   3371 
   3372 static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
   3373 {
   3374    Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   3375    UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   3376    FiLine* line   = &fi->lines[lineno];
   3377    UWord   loff   = (a - atag) / 8;
   3378    if (LIKELY( fi->tags[lineno] == atag )) {
   3379       line->u16s[loff] = 0;
   3380    } else {
   3381       /* miss.  The filter doesn't hold this address, so ignore. */
   3382    }
   3383 }
   3384 
   3385 static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
   3386 {
   3387   //VG_(printf)("%lu ", len);
   3388    /* slowly do part preceding 8-alignment */
   3389    while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
   3390       Filter__clear_1byte( fi, a );
   3391       a++;
   3392       len--;
   3393    }
   3394    /* vector loop */
   3395    while (len >= 8) {
   3396       Filter__clear_8bytes_aligned( fi, a );
   3397       a += 8;
   3398       len -= 8;
   3399    }
   3400    /* slowly do tail */
   3401    while (UNLIKELY(len > 0)) {
   3402       Filter__clear_1byte( fi, a );
   3403       a++;
   3404       len--;
   3405    }
   3406 }
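/* For instance, clearing 21 bytes starting at address 0x1003 does five
   single-byte clears (0x1003..0x1007), then two aligned 8-byte clears
   (at 0x1008 and 0x1010), leaving no tail bytes. */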
   3407 
   3408 
   3409 /* ------ Read handlers for the filter. ------ */
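/* A note on the encoding these handlers rely on (as implied by the masks
   they use): each aligned 8-byte chunk of a filter line is summarised by
   one 16-bit entry in line->u16s[], two bits per byte.  For byte k of
   the chunk, a read that has been passed through sets bit 2k+1, and a
   write sets both bits 2k and 2k+1.  Hence a later read of that byte may
   be skipped once bit 2k+1 is set, while a later write may be skipped
   only once both bits are set. */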
   3410 
   3411 static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
   3412 {
   3413    if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
   3414       return False;
   3415    {
   3416      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   3417      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   3418      FiLine* line   = &fi->lines[lineno];
   3419      UWord   loff   = (a - atag) / 8;
   3420      UShort  mask   = 0xAAAA;
   3421      if (LIKELY( fi->tags[lineno] == atag )) {
   3422         /* hit.  check line and update. */
   3423         UShort u16  = line->u16s[loff];
   3424         Bool   ok   = (u16 & mask) == mask; /* all R bits set? */
   3425         line->u16s[loff] = u16 | mask; /* set them */
   3426         return ok;
   3427      } else {
   3428         /* miss.  nuke existing line and re-use it. */
   3429         UWord i;
   3430         fi->tags[lineno] = atag;
   3431         for (i = 0; i < FI_LINE_SZB / 8; i++)
   3432            line->u16s[i] = 0;
   3433         line->u16s[loff] = mask;
   3434         return False;
   3435      }
   3436    }
   3437 }
   3438 
   3439 static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
   3440 {
   3441    if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
   3442       return False;
   3443    {
   3444      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   3445      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   3446      FiLine* line   = &fi->lines[lineno];
   3447      UWord   loff   = (a - atag) / 8;
   3448      UShort  mask   = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
   3449      if (LIKELY( fi->tags[lineno] == atag )) {
   3450         /* hit.  check line and update. */
   3451         UShort  u16  = line->u16s[loff];
   3452         Bool    ok   = (u16 & mask) == mask; /* 4 x R bits set? */
   3453         line->u16s[loff] = u16 | mask; /* set them */
   3454         return ok;
   3455      } else {
   3456         /* miss.  nuke existing line and re-use it. */
   3457         UWord   i;
   3458         fi->tags[lineno] = atag;
   3459         for (i = 0; i < FI_LINE_SZB / 8; i++)
   3460            line->u16s[i] = 0;
   3461         line->u16s[loff] = mask;
   3462         return False;
   3463      }
   3464    }
   3465 }
   3466 
   3467 static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
   3468 {
   3469    if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
   3470       return False;
   3471    {
   3472      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   3473      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   3474      FiLine* line   = &fi->lines[lineno];
   3475      UWord   loff   = (a - atag) / 8;
   3476      UShort  mask   = 0xA << (2 * (a & 6));
   3477      /* mask is A000, 0A00, 00A0 or 000A */
   3478      if (LIKELY( fi->tags[lineno] == atag )) {
   3479         /* hit.  check line and update. */
   3480         UShort  u16  = line->u16s[loff];
   3481         Bool    ok   = (u16 & mask) == mask; /* 2 x R bits set? */
   3482         line->u16s[loff] = u16 | mask; /* set them */
   3483         return ok;
   3484      } else {
   3485         /* miss.  nuke existing line and re-use it. */
   3486         UWord   i;
   3487         fi->tags[lineno] = atag;
   3488         for (i = 0; i < FI_LINE_SZB / 8; i++)
   3489            line->u16s[i] = 0;
   3490         line->u16s[loff] = mask;
   3491         return False;
   3492      }
   3493    }
   3494 }
   3495 
   3496 static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
   3497 {
   3498    {
   3499      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   3500      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   3501      FiLine* line   = &fi->lines[lineno];
   3502      UWord   loff   = (a - atag) / 8;
   3503      UShort  mask   = 0x2 << (2 * (a & 7));
   3504      /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
   3505      if (LIKELY( fi->tags[lineno] == atag )) {
   3506         /* hit.  check line and update. */
   3507         UShort  u16  = line->u16s[loff];
   3508         Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
   3509         line->u16s[loff] = u16 | mask; /* set them */
   3510         return ok;
   3511      } else {
   3512         /* miss.  nuke existing line and re-use it. */
   3513         UWord   i;
   3514         fi->tags[lineno] = atag;
   3515         for (i = 0; i < FI_LINE_SZB / 8; i++)
   3516            line->u16s[i] = 0;
   3517         line->u16s[loff] = mask;
   3518         return False;
   3519      }
   3520    }
   3521 }
   3522 
   3523 
   3524 /* ------ Write handlers for the filter. ------ */
   3525 
   3526 static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
   3527 {
   3528    if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
   3529       return False;
   3530    {
   3531      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   3532      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   3533      FiLine* line   = &fi->lines[lineno];
   3534      UWord   loff   = (a - atag) / 8;
   3535      UShort  mask   = 0xFFFF;
   3536      if (LIKELY( fi->tags[lineno] == atag )) {
   3537         /* hit.  check line and update. */
   3538         UShort u16  = line->u16s[loff];
   3539         Bool   ok   = (u16 & mask) == mask; /* all R & W bits set? */
   3540         line->u16s[loff] = u16 | mask; /* set them */
   3541         return ok;
   3542      } else {
   3543         /* miss.  nuke existing line and re-use it. */
   3544         UWord i;
   3545         fi->tags[lineno] = atag;
   3546         for (i = 0; i < FI_LINE_SZB / 8; i++)
   3547            line->u16s[i] = 0;
   3548         line->u16s[loff] = mask;
   3549         return False;
   3550      }
   3551    }
   3552 }
   3553 
   3554 static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
   3555 {
   3556    if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
   3557       return False;
   3558    {
   3559      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   3560      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   3561      FiLine* line   = &fi->lines[lineno];
   3562      UWord   loff   = (a - atag) / 8;
   3563      UShort  mask   = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
   3564      if (LIKELY( fi->tags[lineno] == atag )) {
   3565         /* hit.  check line and update. */
   3566         UShort  u16  = line->u16s[loff];
   3567         Bool    ok   = (u16 & mask) == mask; /* 4 x R & W bits set? */
   3568         line->u16s[loff] = u16 | mask; /* set them */
   3569         return ok;
   3570      } else {
   3571         /* miss.  nuke existing line and re-use it. */
   3572         UWord   i;
   3573         fi->tags[lineno] = atag;
   3574         for (i = 0; i < FI_LINE_SZB / 8; i++)
   3575            line->u16s[i] = 0;
   3576         line->u16s[loff] = mask;
   3577         return False;
   3578      }
   3579    }
   3580 }
   3581 
   3582 static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
   3583 {
   3584    if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
   3585       return False;
   3586    {
   3587      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   3588      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   3589      FiLine* line   = &fi->lines[lineno];
   3590      UWord   loff   = (a - atag) / 8;
   3591      UShort  mask   = 0xF << (2 * (a & 6));
   3592      /* mask is F000, 0F00, 00F0 or 000F */
   3593      if (LIKELY( fi->tags[lineno] == atag )) {
   3594         /* hit.  check line and update. */
   3595         UShort  u16  = line->u16s[loff];
   3596         Bool    ok   = (u16 & mask) == mask; /* 2 x R & W bits set? */
   3597         line->u16s[loff] = u16 | mask; /* set them */
   3598         return ok;
   3599      } else {
   3600         /* miss.  nuke existing line and re-use it. */
   3601         UWord   i;
   3602         fi->tags[lineno] = atag;
   3603         for (i = 0; i < FI_LINE_SZB / 8; i++)
   3604            line->u16s[i] = 0;
   3605         line->u16s[loff] = mask;
   3606         return False;
   3607      }
   3608    }
   3609 }
   3610 
   3611 static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
   3612 {
   3613    {
   3614      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   3615      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   3616      FiLine* line   = &fi->lines[lineno];
   3617      UWord   loff   = (a - atag) / 8;
   3618      UShort  mask   = 0x3 << (2 * (a & 7));
   3619      /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
   3620      if (LIKELY( fi->tags[lineno] == atag )) {
   3621         /* hit.  check line and update. */
   3622         UShort  u16  = line->u16s[loff];
   3623         Bool    ok   = (u16 & mask) == mask; /* 1 x R & W bits set? */
   3624         line->u16s[loff] = u16 | mask; /* set them */
   3625         return ok;
   3626      } else {
   3627         /* miss.  nuke existing line and re-use it. */
   3628         UWord   i;
   3629         fi->tags[lineno] = atag;
   3630         for (i = 0; i < FI_LINE_SZB / 8; i++)
   3631            line->u16s[i] = 0;
   3632         line->u16s[loff] = mask;
   3633         return False;
   3634      }
   3635    }
   3636 }
   3637 
   3638 
   3639 /////////////////////////////////////////////////////////
   3640 //                                                     //
   3641 // Threads                                             //
   3642 //                                                     //
   3643 /////////////////////////////////////////////////////////
   3644 
   3645 /* Maps ThrID values to their Thr*s (which contain ThrID values that
   3646    should point back to the relevant slot in the array).  Lowest
   3647    numbered slot (0) is for thrid = 1024, (1) is for 1025, etc. */
   3648 static XArray* /* of Thr* */ thrid_to_thr_map = NULL;
   3649 
   3650 /* And a counter to dole out ThrID values.  For rationale/background,
   3651    see comments on definition of ScalarTS (far) above. */
   3652 static ThrID thrid_counter = 1024; /* runs up to ThrID_MAX_VALID */
   3653 
   3654 static ThrID Thr__to_ThrID ( Thr* thr ) {
   3655    return thr->thrid;
   3656 }
   3657 static Thr* Thr__from_ThrID ( UInt thrid ) {
   3658    Thr* thr = *(Thr**)VG_(indexXA)( thrid_to_thr_map, thrid - 1024 );
   3659    tl_assert(thr->thrid == thrid);
   3660    return thr;
   3661 }
   3662 
   3663 static Thr* Thr__new ( void )
   3664 {
   3665    Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
   3666    thr->viR = VtsID_INVALID;
   3667    thr->viW = VtsID_INVALID;
   3668    thr->llexit_done = False;
   3669    thr->joinedwith_done = False;
   3670    thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
   3671    /* We only really need this at history level 1, but unfortunately
   3672       this routine is called before the command line processing is
   3673       done (sigh), so we can't rely on HG_(clo_history_level) at this
   3674       point.  Hence always allocate it.  Bah. */
   3675    thr->local_Kws_n_stacks
   3676       = VG_(newXA)( HG_(zalloc),
   3677                     "libhb.Thr__new.3 (local_Kws_and_stacks)",
   3678                     HG_(free), sizeof(ULong_n_EC) );
   3679 
   3680    /* Add this Thr* <-> ThrID binding to the mapping, and
   3681       cross-check */
   3682    if (!thrid_to_thr_map) {
   3683       thrid_to_thr_map = VG_(newXA)( HG_(zalloc), "libhb.Thr__new.4",
   3684                                      HG_(free), sizeof(Thr*) );
   3685       tl_assert(thrid_to_thr_map);
   3686    }
   3687 
   3688    if (thrid_counter >= ThrID_MAX_VALID) {
   3689       /* We're hosed.  We have to stop. */
   3690       scalarts_limitations_fail_NORETURN( True/*due_to_nThrs*/ );
   3691    }
   3692 
   3693    thr->thrid = thrid_counter++;
   3694    Word ix = VG_(addToXA)( thrid_to_thr_map, &thr );
   3695    tl_assert(ix + 1024 == thr->thrid);
   3696 
   3697    return thr;
   3698 }
   3699 
   3700 static void note_local_Kw_n_stack_for ( Thr* thr )
   3701 {
   3702    Word       nPresent;
   3703    ULong_n_EC pair;
   3704    tl_assert(thr);
   3705 
   3706    // We only collect this info at history level 1 (approx)
   3707    if (HG_(clo_history_level) != 1)
   3708       return;
   3709 
   3710    /* This is the scalar Kw for thr. */
   3711    pair.ull = VtsID__indexAt( thr->viW, thr );
   3712    pair.ec  = main_get_EC( thr );
   3713    tl_assert(pair.ec);
   3714    tl_assert(thr->local_Kws_n_stacks);
   3715 
   3716    /* check that we're not adding duplicates */
   3717    nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
   3718 
   3719    /* Throw away old stacks, if necessary.  We can't accumulate stuff
   3720       indefinitely. */
   3721    if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
   3722       VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
   3723       nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
   3724       if (0)
   3725          VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p (!!! gc !!!)\n",
   3726                      thr, pair.ull, pair.ec );
   3727    }
   3728 
   3729    if (nPresent > 0) {
   3730       ULong_n_EC* prevPair
   3731          = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
   3732       tl_assert( prevPair->ull <= pair.ull );
   3733    }
   3734 
   3735    if (nPresent == 0)
   3736       pair.ec = NULL;
   3737 
   3738    VG_(addToXA)( thr->local_Kws_n_stacks, &pair );
   3739 
   3740    if (0)
   3741       VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p\n",
   3742                   thr, pair.ull, pair.ec );
   3743    if (0)
   3744       VG_(pp_ExeContext)(pair.ec);
   3745 }
   3746 
   3747 static Int cmp__ULong_n_EC__by_ULong ( ULong_n_EC* pair1, ULong_n_EC* pair2 )
   3748 {
   3749    if (pair1->ull < pair2->ull) return -1;
   3750    if (pair1->ull > pair2->ull) return 1;
   3751    return 0;
   3752 }
   3753 
   3754 
   3755 /////////////////////////////////////////////////////////
   3756 //                                                     //
   3757 // Shadow Values                                       //
   3758 //                                                     //
   3759 /////////////////////////////////////////////////////////
   3760 
   3761 // type SVal, SVal_INVALID and SVal_NOACCESS are defined by
   3762 // hb_zsm.h.  We have to do everything else here.
   3763 
   3764 /* SVal is a 64 bit unsigned int.
   3765 
   3766       <---------30--------->    <---------30--------->
   3767    00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X   C(Rmin,Wmin)
   3768    10 X--------------------X XX X--------------------X   A: SVal_NOACCESS
   3769    11 0--------------------0 00 0--------------------0   A: SVal_INVALID
   3770 
   3771 */
   3772 #define SVAL_TAGMASK (3ULL << 62)
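/* For example, SVal__mkC(5, 7) (see below) produces the value
   0x0000000500000007: the Rmin VtsID sits in bits 32..61, the Wmin
   VtsID in bits 0..29, and the top two bits stay 00, which is the tag
   for "C" values.  SVal_NOACCESS instead carries the 10 tag in bits
   62..63 (SVal__mkA returns 2ULL << 62). */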
   3773 
   3774 static inline Bool SVal__isC ( SVal s ) {
   3775    return (0ULL << 62) == (s & SVAL_TAGMASK);
   3776 }
   3777 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
   3778    //tl_assert(VtsID__is_valid(rmini));
   3779    //tl_assert(VtsID__is_valid(wmini));
   3780    return (((ULong)rmini) << 32) | ((ULong)wmini);
   3781 }
   3782 static inline VtsID SVal__unC_Rmin ( SVal s ) {
   3783    tl_assert(SVal__isC(s));
   3784    return (VtsID)(s >> 32);
   3785 }
   3786 static inline VtsID SVal__unC_Wmin ( SVal s ) {
   3787    tl_assert(SVal__isC(s));
   3788    return (VtsID)(s & 0xFFFFFFFFULL);
   3789 }
   3790 
   3791 static inline Bool SVal__isA ( SVal s ) {
   3792    return (2ULL << 62) == (s & SVAL_TAGMASK);
   3793 }
   3794 static inline SVal SVal__mkA ( void ) {
   3795    return 2ULL << 62;
   3796 }
   3797 
   3798 /* Direct callback from lib_zsm. */
   3799 static void SVal__rcinc ( SVal s ) {
   3800    if (SVal__isC(s)) {
   3801       VtsID__rcinc( SVal__unC_Rmin(s) );
   3802       VtsID__rcinc( SVal__unC_Wmin(s) );
   3803    }
   3804 }
   3805 
   3806 /* Direct callback from lib_zsm. */
   3807 static void SVal__rcdec ( SVal s ) {
   3808    if (SVal__isC(s)) {
   3809       VtsID__rcdec( SVal__unC_Rmin(s) );
   3810       VtsID__rcdec( SVal__unC_Wmin(s) );
   3811    }
   3812 }
   3813 
   3814 
   3815 /////////////////////////////////////////////////////////
   3816 //                                                     //
   3817 // A simple group (memory) allocator                   //
   3818 //                                                     //
   3819 /////////////////////////////////////////////////////////
   3820 
   3821 //////////////// BEGIN general group allocator
   3822 typedef
   3823    struct {
   3824       UWord   elemSzB;        /* element size */
   3825       UWord   nPerGroup;      /* # elems per group */
   3826       void*   (*alloc)(HChar*, SizeT); /* group allocator */
   3827       HChar*  cc; /* group allocator's cc */
   3828       void    (*free)(void*); /* group allocator's free-er (unused) */
   3829       /* XArray of void* (pointers to groups).  The groups themselves.
   3830          Each element is a pointer to a block of size (elemSzB *
   3831          nPerGroup) bytes. */
   3832       XArray* groups;
   3833       /* next free element.  Is a pointer to an element in one of the
   3834          groups pointed to by .groups. */
   3835       void* nextFree;
   3836    }
   3837    GroupAlloc;
   3838 
   3839 static void init_GroupAlloc ( /*MOD*/GroupAlloc* ga,
   3840                               UWord  elemSzB,
   3841                               UWord  nPerGroup,
   3842                               void*  (*alloc)(HChar*, SizeT),
   3843                               HChar* cc,
   3844                               void   (*free)(void*) )
   3845 {
   3846    tl_assert(0 == (elemSzB % sizeof(UWord)));
   3847    tl_assert(elemSzB >= sizeof(UWord));
   3848    tl_assert(nPerGroup >= 100); /* let's say */
   3849    tl_assert(alloc);
   3850    tl_assert(cc);
   3851    tl_assert(free);
   3852    tl_assert(ga);
   3853    VG_(memset)(ga, 0, sizeof(*ga));
   3854    ga->elemSzB   = elemSzB;
   3855    ga->nPerGroup = nPerGroup;
   3856    ga->groups    = NULL;
   3857    ga->alloc     = alloc;
   3858    ga->cc        = cc;
   3859    ga->free      = free;
   3860    ga->groups    = VG_(newXA)( alloc, cc, free, sizeof(void*) );
   3861    ga->nextFree  = NULL;
   3862    tl_assert(ga->groups);
   3863 }
   3864 
   3865 /* The freelist is empty.  Allocate a new group and put all the new
   3866    elements in it onto the freelist. */
   3867 __attribute__((noinline))
   3868 static void gal_add_new_group ( GroupAlloc* ga )
   3869 {
   3870    Word   i;
   3871    UWord* group;
   3872    tl_assert(ga);
   3873    tl_assert(ga->nextFree == NULL);
   3874    group = ga->alloc( ga->cc, ga->elemSzB * ga->nPerGroup );
   3875    tl_assert(group);
   3876    /* extend the freelist through the new group.  Place the freelist
   3877       pointer in the first word of each element.  That's why the
   3878       element size must be at least one word. */
   3879    for (i = ga->nPerGroup-1; i >= 0; i--) {
   3880       UChar* elemC = ((UChar*)group) + i * ga->elemSzB;
   3881       UWord* elem  = (UWord*)elemC;
   3882       tl_assert(0 == (((UWord)elem) % sizeof(UWord)));
   3883       *elem = (UWord)ga->nextFree;
   3884       ga->nextFree = elem;
   3885    }
   3886    /* and add to our collection of groups */
   3887    VG_(addToXA)( ga->groups, &group );
   3888 }
   3889 
   3890 inline static void* gal_Alloc ( GroupAlloc* ga )
   3891 {
   3892    UWord* elem;
   3893    if (UNLIKELY(ga->nextFree == NULL)) {
   3894       gal_add_new_group(ga);
   3895    }
   3896    elem = ga->nextFree;
   3897    ga->nextFree = (void*)*elem;
   3898    *elem = 0; /* unnecessary, but just to be on the safe side */
   3899    return elem;
   3900 }
   3901 
   3902 inline static void* gal_Alloc_w_size_check ( GroupAlloc* ga, SizeT n )
   3903 {
   3904    tl_assert(n == ga->elemSzB);
   3905    return gal_Alloc( ga );
   3906 }
   3907 
   3908 inline static void gal_Free ( GroupAlloc* ga, void* p )
   3909 {
   3910    UWord* elem = (UWord*)p;
   3911    *elem = (UWord)ga->nextFree;
   3912    ga->nextFree = elem;
   3913 }
   3914 //////////////// END general group allocator
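/* An illustrative sketch of how a client uses the group allocator (the
   RCEC allocator below follows this shape; 'Foo' is a made-up element
   type):

      static GroupAlloc foo_group_allocator;

      init_GroupAlloc( &foo_group_allocator, sizeof(Foo), 1000,
                       HG_(zalloc), "libhb.foo_group_allocator",
                       HG_(free) );
      Foo* p = gal_Alloc( &foo_group_allocator );
      ...
      gal_Free( &foo_group_allocator, p );

   Elements must be word-aligned and at least one word in size, because a
   free element holds the freelist link in its first word. */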
   3915 
   3916 
   3917 /////////////////////////////////////////////////////////
   3918 //                                                     //
   3919 // Change-event map2                                   //
   3920 //                                                     //
   3921 /////////////////////////////////////////////////////////
   3922 
   3923 #define EVENT_MAP_GC_DISCARD_FRACTION  0.5
   3924 
   3925 /* This is in two parts:
   3926 
   3927    1. A hash table of RCECs.  This is a set of reference-counted stack
   3928       traces.  When the reference count of a stack trace becomes zero,
   3929       it is removed from the set and freed up.  The intent is to have
   3930       a set of stack traces which can be referred to from (2), but to
   3931       only represent each one once.  The set is indexed/searched by
   3932       ordering on the stack trace vectors.
   3933 
   3934    2. A SparseWA of OldRefs.  These store information about each old
   3935       ref that we need to record.  It is indexed by address of the
   3936       location for which the information is recorded.  For LRU
   3937       purposes, each OldRef also contains a generation number,
   3938       indicating when it was most recently accessed.
   3939 
   3940       The important part of an OldRef is, however, its accs[] array.
   3941       This is an array of N_OLDREF_ACCS which binds (thread, R/W,
   3942       size) triples to RCECs.  This allows us to collect the last
   3943       access-traceback by up to N_OLDREF_ACCS different triples for
   3944       this location.  The accs[] array is an MTF-array.  If a binding
   3945       falls off the end, that's too bad -- we will lose info about
   3946       that triple's access to this location.
   3947 
   3948       When the SparseWA becomes too big, we can throw away the OldRefs
   3949       whose generation numbers are below some threshold; hence doing
   3950       approximate LRU discarding.  For each discarded OldRef we must
   3951       of course decrement the reference count on all the RCECs it
   3952       refers to, in order that entries from (1) eventually get
   3953       discarded too.
   3954 
   3955    A major improvement in reliability of this mechanism would be to
   3956    have a dynamically sized OldRef.accs[] array, so no entries ever
   3957    fall off the end.  In investigations (Dec 08) it appears that a
   3958    major cause of the non-availability of conflicting-access traces
   3959    in race reports is the fixed size of this array.  I
   3960    suspect for most OldRefs, only a few entries are used, but for a
   3961    minority of cases there is an overflow, leading to info lossage.
   3962    Investigations also suggest this is very workload and scheduling
   3963    sensitive.  Therefore a dynamic sizing would be better.
   3964 
   3965    However, dynamic sizing would defeat the use of a GroupAllocator
   3966    for OldRef structures.  And that's important for performance.  So
   3967    it's not straightforward to do.
   3968 */
   3969 
   3970 
   3971 static UWord stats__ctxt_rcdec1 = 0;
   3972 static UWord stats__ctxt_rcdec2 = 0;
   3973 static UWord stats__ctxt_rcdec3 = 0;
   3974 static UWord stats__ctxt_rcdec_calls = 0;
   3975 static UWord stats__ctxt_rcdec_discards = 0;
   3976 static UWord stats__ctxt_rcdec1_eq = 0;
   3977 
   3978 static UWord stats__ctxt_tab_curr = 0;
   3979 static UWord stats__ctxt_tab_max  = 0;
   3980 
   3981 static UWord stats__ctxt_tab_qs   = 0;
   3982 static UWord stats__ctxt_tab_cmps = 0;
   3983 
   3984 
   3985 ///////////////////////////////////////////////////////
   3986 //// Part (1): A hash table of RCECs
   3987 ///
   3988 
   3989 #define N_FRAMES 8
   3990 
   3991 // (UInt) `echo "Reference Counted Execution Context" | md5sum`
   3992 #define RCEC_MAGIC 0xab88abb2UL
   3993 
   3994 //#define N_RCEC_TAB 98317 /* prime */
   3995 #define N_RCEC_TAB 196613 /* prime */
   3996 
   3997 typedef
   3998    struct _RCEC {
   3999       UWord magic;  /* sanity check only */
   4000       struct _RCEC* next;
   4001       UWord rc;
   4002       UWord rcX; /* used for crosschecking */
   4003       UWord frames_hash;          /* hash of all the frames */
   4004       UWord frames[N_FRAMES];
   4005    }
   4006    RCEC;
   4007 
   4008 static RCEC** contextTab = NULL; /* hash table of RCEC*s */
   4009 
   4010 
   4011 /* Gives an arbitrary total order on RCEC .frames fields */
   4012 static Word RCEC__cmp_by_frames ( RCEC* ec1, RCEC* ec2 ) {
   4013    Word i;
   4014    tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
   4015    tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
   4016    if (ec1->frames_hash < ec2->frames_hash) return -1;
   4017    if (ec1->frames_hash > ec2->frames_hash) return  1;
   4018    for (i = 0; i < N_FRAMES; i++) {
   4019       if (ec1->frames[i] < ec2->frames[i]) return -1;
   4020       if (ec1->frames[i] > ec2->frames[i]) return  1;
   4021    }
   4022    return 0;
   4023 }
   4024 
   4025 
   4026 /* Dec the ref of this RCEC. */
   4027 static void ctxt__rcdec ( RCEC* ec )
   4028 {
   4029    stats__ctxt_rcdec_calls++;
   4030    tl_assert(ec && ec->magic == RCEC_MAGIC);
   4031    tl_assert(ec->rc > 0);
   4032    ec->rc--;
   4033 }
   4034 
   4035 static void ctxt__rcinc ( RCEC* ec )
   4036 {
   4037    tl_assert(ec && ec->magic == RCEC_MAGIC);
   4038    ec->rc++;
   4039 }
   4040 
   4041 
   4042 //////////// BEGIN RCEC group allocator
   4043 static GroupAlloc rcec_group_allocator;
   4044 
   4045 static RCEC* alloc_RCEC ( void ) {
   4046    return gal_Alloc ( &rcec_group_allocator );
   4047 }
   4048 
   4049 static void free_RCEC ( RCEC* rcec ) {
   4050    tl_assert(rcec->magic == RCEC_MAGIC);
   4051    gal_Free( &rcec_group_allocator, rcec );
   4052 }
   4053 //////////// END RCEC group allocator
   4054 
   4055 
   4056 /* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
   4057    move it one step closer to the front of the list, so as to make
   4058    subsequent searches for it cheaper. */
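/* For example, if the chain at *headp is A -> B -> C -> D, moving C one
   step forward yields A -> C -> B -> D; when the element is second in
   the list, it becomes the new head. */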
   4059 static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
   4060 {
   4061    RCEC *ec0, *ec1, *ec2;
   4062    if (ec == *headp)
   4063       tl_assert(0); /* already at head of list */
   4064    tl_assert(ec != NULL);
   4065    ec0 = *headp;
   4066    ec1 = NULL;
   4067    ec2 = NULL;
   4068    while (True) {
   4069       if (ec0 == NULL || ec0 == ec) break;
   4070       ec2 = ec1;
   4071       ec1 = ec0;
   4072       ec0 = ec0->next;
   4073    }
   4074    tl_assert(ec0 == ec);
   4075    if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
   4076       RCEC* tmp;
   4077       /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
   4078          predecessor.  Swap ec0 and ec1, that is, move ec0 one step
   4079          closer to the start of the list. */
   4080       tl_assert(ec2->next == ec1);
   4081       tl_assert(ec1->next == ec0);
   4082       tmp = ec0->next;
   4083       ec2->next = ec0;
   4084       ec0->next = ec1;
   4085       ec1->next = tmp;
   4086    }
   4087    else
   4088    if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
   4089       /* it's second in the list. */
   4090       tl_assert(*headp == ec1);
   4091       tl_assert(ec1->next == ec0);
   4092       ec1->next = ec0->next;
   4093       ec0->next = ec1;
   4094       *headp = ec0;
   4095    }
   4096 }
   4097 
   4098 
   4099 /* Find the given RCEC in the tree, and return a pointer to it.  Or,
   4100    if not present, add the given one to the tree (by making a copy of
   4101    it, so the caller can immediately deallocate the original) and
   4102    return a pointer to the copy.  The caller can safely have 'example'
   4103    on its stack, since we will always return a pointer to a copy of
   4104    it, not to the original.  Note that the inserted node will have .rc
   4105    of zero and so the caller must immediately increment it. */
   4106 __attribute__((noinline))
   4107 static RCEC* ctxt__find_or_add ( RCEC* example )
   4108 {
   4109    UWord hent;
   4110    RCEC* copy;
   4111    tl_assert(example && example->magic == RCEC_MAGIC);
   4112    tl_assert(example->rc == 0);
   4113 
   4114    /* Search the hash table to see if we already have it. */
   4115    stats__ctxt_tab_qs++;
   4116    hent = example->frames_hash % N_RCEC_TAB;
   4117    copy = contextTab[hent];
   4118    while (1) {
   4119       if (!copy) break;
   4120       tl_assert(copy->magic == RCEC_MAGIC);
   4121       stats__ctxt_tab_cmps++;
   4122       if (0 == RCEC__cmp_by_frames(copy, example)) break;
   4123       copy = copy->next;
   4124    }
   4125 
   4126    if (copy) {
   4127       tl_assert(copy != example);
   4128       /* optimisation: if it's not at the head of its list, move 1
   4129          step fwds, to make future searches cheaper */
   4130       if (copy != contextTab[hent]) {
   4131          move_RCEC_one_step_forward( &contextTab[hent], copy );
   4132       }
   4133    } else {
   4134       copy = alloc_RCEC();
   4135       tl_assert(copy != example);
   4136       *copy = *example;
   4137       copy->next = contextTab[hent];
   4138       contextTab[hent] = copy;
   4139       stats__ctxt_tab_curr++;
   4140       if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
   4141          stats__ctxt_tab_max = stats__ctxt_tab_curr;
   4142    }
   4143    return copy;
   4144 }
   4145 
   4146 static inline UWord ROLW ( UWord w, Int n )
   4147 {
   4148    Int bpw = 8 * sizeof(UWord);
   4149    w = (w << n) | (w >> (bpw-n));
   4150    return w;
   4151 }
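
        /* Purely illustrative, not used by the code: with a 64-bit UWord,
           ROLW(0x1, 19) == 0x80000, i.e. bit 0 rotates up to bit 19.
           get_RCEC below folds the N_FRAMES captured stack-frame addresses
           into .frames_hash by alternating XOR with this rotation, in
           effect:

              hash = 0;
              for (i = 0; i < N_FRAMES; i++)
                 hash = ROLW(hash ^ frames[i], 19);
        */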
   4152 
   4153 __attribute__((noinline))
   4154 static RCEC* get_RCEC ( Thr* thr )
   4155 {
   4156    UWord hash, i;
   4157    RCEC  example;
   4158    example.magic = RCEC_MAGIC;
   4159    example.rc = 0;
   4160    example.rcX = 0;
   4161    main_get_stacktrace( thr, &example.frames[0], N_FRAMES );
   4162    hash = 0;
   4163    for (i = 0; i < N_FRAMES; i++) {
   4164       hash ^= example.frames[i];
   4165       hash = ROLW(hash, 19);
   4166    }
   4167    example.frames_hash = hash;
   4168    return ctxt__find_or_add( &example );
   4169 }
   4170 
   4171 ///////////////////////////////////////////////////////
   4172 //// Part (2):
   4173 ///  A SparseWA guest-addr -> OldRef, that refers to (1)
   4174 ///
   4175 
   4176 // (UInt) `echo "Old Reference Information" | md5sum`
   4177 #define OldRef_MAGIC 0x30b1f075UL
   4178 
   4179 /* Records an access: a thread, a context (size & writeness) and the
   4180    number of held locks. The size (1,2,4,8) is encoded as 00 = 1, 01 =
   4181    2, 10 = 4, 11 = 8.
   4182 */
   4183 typedef
   4184    struct {
   4185       RCEC*     rcec;
   4186       WordSetID locksHeldW;
   4187       UInt      thrid  : SCALARTS_N_THRBITS;
   4188       UInt      szLg2B : 2;
   4189       UInt      isW    : 1;
   4190    }
   4191    Thr_n_RCEC;
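
        /* Purely illustrative sketch of the .szLg2B encoding described in
           the comment above.  The real code open-codes both directions:
           the encode in event_map_bind, the decode in
           libhb_event_map_lookup.

              // encode: szB 1,2,4,8  ->  szLg2B 0,1,2,3
              switch (szB) {
                 case 1: szLg2B = 0; break;
                 case 2: szLg2B = 1; break;
                 case 4: szLg2B = 2; break;
                 case 8: szLg2B = 3; break;
                 default: tl_assert(0);
              }
              // decode: szLg2B 0,1,2,3  ->  szB 1,2,4,8
              szB = (SizeT)1 << szLg2B;
        */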
   4192 
   4193 #define N_OLDREF_ACCS 5
   4194 
   4195 typedef
   4196    struct {
   4197       UWord magic;  /* sanity check only */
   4198       UWord gen;    /* when most recently accessed */
   4199                     /* or free list when not in use */
   4200       /* unused slots in this array have .thrid == 0, which is invalid */
   4201       Thr_n_RCEC accs[N_OLDREF_ACCS];
   4202    }
   4203    OldRef;
   4204 
   4205 
   4206 //////////// BEGIN OldRef group allocator
   4207 static GroupAlloc oldref_group_allocator;
   4208 
   4209 static OldRef* alloc_OldRef ( void ) {
   4210    return gal_Alloc ( &oldref_group_allocator );
   4211 }
   4212 
   4213 static void free_OldRef ( OldRef* r ) {
   4214    tl_assert(r->magic == OldRef_MAGIC);
   4215    gal_Free( &oldref_group_allocator, r );
   4216 }
   4217 //////////// END OldRef group allocator
   4218 
   4219 
   4220 static SparseWA* oldrefTree     = NULL; /* SparseWA* OldRef* */
   4221 static UWord     oldrefGen      = 0;    /* current LRU generation # */
   4222 static UWord     oldrefTreeN    = 0;    /* # elems in oldrefTree */
   4223 static UWord     oldrefGenIncAt = 0;    /* inc gen # when size hits this */
   4224 
   4225 inline static UInt min_UInt ( UInt a, UInt b ) {
   4226    return a < b ? a : b;
   4227 }
   4228 
   4229 /* Compare the intervals [a1,a1+n1) and [a2,a2+n2).  Return -1 if the
   4230    first interval is lower, 1 if the first interval is higher, and 0
   4231    if there is any overlap.  The redundant casting is paranoia, added
   4232    after what looked distinctly like a bug in gcc-4.1.2, in which
   4233    some of the comparisons were done signedly instead of
   4234    unsignedly. */
   4235 /* Copied from exp-ptrcheck/sg_main.c */
   4236 static Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
   4237                                      Addr a2, SizeT n2 ) {
   4238    UWord a1w = (UWord)a1;
   4239    UWord n1w = (UWord)n1;
   4240    UWord a2w = (UWord)a2;
   4241    UWord n2w = (UWord)n2;
   4242    tl_assert(n1w > 0 && n2w > 0);
   4243    if (a1w + n1w <= a2w) return -1L;
   4244    if (a2w + n2w <= a1w) return 1L;
   4245    return 0;
   4246 }
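
        /* Examples, with addresses invented purely for illustration:
              cmp_nonempty_intervals(0x1000,4, 0x1004,4) == -1  (disjoint, first is lower)
              cmp_nonempty_intervals(0x1000,8, 0x1004,4) ==  0  (they overlap)
           libhb_event_map_lookup below relies on the "0 means overlap" case
           to decide whether a recorded old access conflicts with the access
           being queried. */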
   4247 
   4248 static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
   4249 {
   4250    OldRef* ref;
   4251    RCEC*   rcec;
   4252    Word    i, j;
   4253    UWord   keyW, valW;
   4254    Bool    b;
   4255 
   4256    tl_assert(thr);
   4257    ThrID thrid = thr->thrid;
   4258    tl_assert(thrid != 0); /* zero is used to denote an empty slot. */
   4259 
   4260    WordSetID locksHeldW = thr->hgthread->locksetW;
   4261 
   4262    rcec = get_RCEC( thr );
   4263    ctxt__rcinc(rcec);
   4264 
   4265    UInt szLg2B = 0;
   4266    switch (szB) {
   4267       /* This doesn't look particularly branch-predictor friendly. */
   4268       case 1:  szLg2B = 0; break;
   4269       case 2:  szLg2B = 1; break;
   4270       case 4:  szLg2B = 2; break;
   4271       case 8:  szLg2B = 3; break;
   4272       default: tl_assert(0);
   4273    }
   4274 
   4275    /* Look in the map to see if we already have a record for this
   4276       address. */
   4277    b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, a );
   4278 
   4279    if (b) {
   4280 
   4281       /* We already have a record for this address.  We now need to
   4282          see if we have a stack trace pertaining to this (thrid, R/W,
   4283          size) triple. */
   4284       tl_assert(keyW == a);
   4285       ref = (OldRef*)valW;
   4286       tl_assert(ref->magic == OldRef_MAGIC);
   4287 
   4288       for (i = 0; i < N_OLDREF_ACCS; i++) {
   4289          if (ref->accs[i].thrid != thrid)
   4290             continue;
   4291          if (ref->accs[i].szLg2B != szLg2B)
   4292             continue;
   4293          if (ref->accs[i].isW != (UInt)(isW & 1))
   4294             continue;
   4295          /* else we have a match, so stop looking. */
   4296          break;
   4297       }
   4298 
   4299       if (i < N_OLDREF_ACCS) {
   4300          /* thread 'thr' has an entry at index 'i'.  Update its RCEC. */
   4301          if (i > 0) {
   4302             Thr_n_RCEC tmp = ref->accs[i-1];
   4303             ref->accs[i-1] = ref->accs[i];
   4304             ref->accs[i] = tmp;
   4305             i--;
   4306          }
   4307          if (rcec == ref->accs[i].rcec) stats__ctxt_rcdec1_eq++;
   4308          stats__ctxt_rcdec1++;
   4309          ctxt__rcdec( ref->accs[i].rcec );
   4310          tl_assert(ref->accs[i].thrid == thrid);
   4311          /* Update the RCEC and the W-held lockset. */
   4312          ref->accs[i].rcec       = rcec;
   4313          ref->accs[i].locksHeldW = locksHeldW;
   4314       } else {
   4315          /* No entry for this (thread, R/W, size, nWHeld) quad.
   4316             Shuffle all of them down one slot, and put the new entry
   4317             at the start of the array. */
   4318          if (ref->accs[N_OLDREF_ACCS-1].thrid != 0) {
   4319             /* the last slot is in use.  We must dec the rc on the
   4320                associated rcec. */
   4321             tl_assert(ref->accs[N_OLDREF_ACCS-1].rcec);
   4322             stats__ctxt_rcdec2++;
   4323             if (0 && 0 == (stats__ctxt_rcdec2 & 0xFFF))
   4324                VG_(printf)("QQQQ %lu overflows\n",stats__ctxt_rcdec2);
   4325             ctxt__rcdec( ref->accs[N_OLDREF_ACCS-1].rcec );
   4326          } else {
   4327             tl_assert(!ref->accs[N_OLDREF_ACCS-1].rcec);
   4328          }
   4329          for (j = N_OLDREF_ACCS-1; j >= 1; j--)
   4330             ref->accs[j] = ref->accs[j-1];
   4331          ref->accs[0].thrid      = thrid;
   4332          ref->accs[0].szLg2B     = szLg2B;
   4333          ref->accs[0].isW        = (UInt)(isW & 1);
   4334          ref->accs[0].locksHeldW = locksHeldW;
   4335          ref->accs[0].rcec       = rcec;
   4336          /* thrid==0 is used to signify an empty slot, so we can't
   4337             add zero thrid (such a ThrID is invalid anyway). */
   4338          /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
   4339       }
   4340 
   4341       ref->gen = oldrefGen;
   4342 
   4343    } else {
   4344 
   4345       /* We don't have a record for this address.  Create a new one. */
   4346       if (oldrefTreeN >= oldrefGenIncAt) {
   4347          oldrefGen++;
   4348          oldrefGenIncAt = oldrefTreeN + 50000;
   4349          if (0) VG_(printf)("oldrefTree: new gen %lu at size %lu\n",
   4350                             oldrefGen, oldrefTreeN );
   4351       }
   4352 
   4353       ref = alloc_OldRef();
   4354       ref->magic = OldRef_MAGIC;
   4355       ref->gen   = oldrefGen;
   4356       ref->accs[0].thrid      = thrid;
   4357       ref->accs[0].szLg2B     = szLg2B;
   4358       ref->accs[0].isW        = (UInt)(isW & 1);
   4359       ref->accs[0].locksHeldW = locksHeldW;
   4360       ref->accs[0].rcec       = rcec;
   4361 
   4362       /* thrid==0 is used to signify an empty slot, so we can't
   4363          add zero thrid (such a ThrID is invalid anyway). */
   4364       /* tl_assert(thrid != 0); */ /* There's a dominating assert above. */
   4365 
   4366       /* Clear out the rest of the entries */
   4367       for (j = 1; j < N_OLDREF_ACCS; j++) {
   4368          ref->accs[j].rcec       = NULL;
   4369          ref->accs[j].thrid      = 0;
   4370          ref->accs[j].szLg2B     = 0;
   4371          ref->accs[j].isW        = 0;
   4372          ref->accs[j].locksHeldW = 0;
   4373       }
   4374       VG_(addToSWA)( oldrefTree, a, (UWord)ref );
   4375       oldrefTreeN++;
   4376 
   4377    }
   4378 }
   4379 
   4380 
   4381 /* Extract info from the conflicting-access machinery. */
   4382 Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
   4383                               /*OUT*/Thr**        resThr,
   4384                               /*OUT*/SizeT*       resSzB,
   4385                               /*OUT*/Bool*        resIsW,
   4386                               /*OUT*/WordSetID*   locksHeldW,
   4387                               Thr* thr, Addr a, SizeT szB, Bool isW )
   4388 {
   4389    Word    i, j;
   4390    OldRef* ref;
   4391    UWord   keyW, valW;
   4392    Bool    b;
   4393 
   4394    ThrID     cand_thrid;
   4395    RCEC*     cand_rcec;
   4396    Bool      cand_isW;
   4397    SizeT     cand_szB;
   4398    WordSetID cand_locksHeldW;
   4399    Addr      cand_a;
   4400 
   4401    Addr toCheck[15];
   4402    Int  nToCheck = 0;
   4403 
   4404    tl_assert(thr);
   4405    tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
   4406 
   4407    ThrID thrid = thr->thrid;
   4408 
   4409    toCheck[nToCheck++] = a;
   4410    for (i = -7; i < (Word)szB; i++) {
   4411       if (i != 0)
   4412          toCheck[nToCheck++] = a + i;
   4413    }
   4414    tl_assert(nToCheck <= 15);
   4415 
   4416    /* Now see if we can find a suitable matching event for
   4417       any of the addresses in toCheck[0 .. nToCheck-1]. */
   4418    for (j = 0; j < nToCheck; j++) {
   4419 
   4420       cand_a = toCheck[j];
   4421       //      VG_(printf)("test %ld %p\n", j, cand_a);
   4422 
   4423       b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, cand_a );
   4424       if (!b)
   4425          continue;
   4426 
   4427       ref = (OldRef*)valW;
   4428       tl_assert(keyW == cand_a);
   4429       tl_assert(ref->magic == OldRef_MAGIC);
   4430       tl_assert(ref->accs[0].thrid != 0); /* first slot must always be used */
   4431 
   4432       cand_thrid      = 0; /* invalid; see comments in event_map_bind */
   4433       cand_rcec       = NULL;
   4434       cand_isW        = False;
   4435       cand_szB        = 0;
   4436       cand_locksHeldW = 0; /* always valid; see initialise_data_structures() */
   4437 
   4438       for (i = 0; i < N_OLDREF_ACCS; i++) {
   4439          Thr_n_RCEC* cand = &ref->accs[i];
   4440          cand_rcec       = cand->rcec;
   4441          cand_thrid      = cand->thrid;
   4442          cand_isW        = (Bool)cand->isW;
   4443          cand_szB        = 1 << cand->szLg2B;
   4444          cand_locksHeldW = cand->locksHeldW;
   4445 
   4446          if (cand_thrid == 0)
   4447             /* This slot isn't in use.  Ignore it. */
   4448             continue;
   4449 
   4450          if (cand_thrid == thrid)
   4451             /* This is an access by the same thread, but we're only
   4452                interested in accesses from other threads.  Ignore. */
   4453             continue;
   4454 
   4455          if ((!cand_isW) && (!isW))
   4456             /* We don't want to report a read racing against another
   4457                read; that's stupid.  So in this case move on. */
   4458             continue;
   4459 
   4460          if (cmp_nonempty_intervals(a, szB, cand_a, cand_szB) != 0)
   4461             /* No overlap with the access we're asking about.  Ignore. */
   4462             continue;
   4463 
   4464          /* We have a match.  Stop searching. */
   4465          break;
   4466       }
   4467 
   4468       tl_assert(i >= 0 && i <= N_OLDREF_ACCS);
   4469 
   4470       if (i < N_OLDREF_ACCS) {
   4471          Int n, maxNFrames;
   4472          /* return with success */
   4473          tl_assert(cand_thrid);
   4474          tl_assert(cand_rcec);
   4475          tl_assert(cand_rcec->magic == RCEC_MAGIC);
   4476          tl_assert(cand_szB >= 1);
   4477          /* Count how many non-zero frames we have. */
   4478          maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size));
   4479          for (n = 0; n < maxNFrames; n++) {
   4480             if (0 == cand_rcec->frames[n]) break;
   4481          }
   4482          *resEC      = VG_(make_ExeContext_from_StackTrace)
   4483                           (cand_rcec->frames, n);
   4484          *resThr     = Thr__from_ThrID(cand_thrid);
   4485          *resSzB     = cand_szB;
   4486          *resIsW     = cand_isW;
   4487          *locksHeldW = cand_locksHeldW;
   4488          return True;
   4489       }
   4490 
   4491       /* consider next address in toCheck[] */
   4492    } /* for (j = 0; j < nToCheck; j++) */
   4493 
   4494    /* really didn't find anything. */
   4495    return False;
   4496 }
   4497 
   4498 static void event_map_init ( void )
   4499 {
   4500    Word i;
   4501 
   4502    /* Context (RCEC) group allocator */
   4503    init_GroupAlloc ( &rcec_group_allocator,
   4504                      sizeof(RCEC),
   4505                      1000 /* RCECs per group */,
   4506                      HG_(zalloc),
   4507                      "libhb.event_map_init.1 (RCEC groups)",
   4508                      HG_(free) );
   4509 
   4510    /* Context table */
   4511    tl_assert(!contextTab);
   4512    contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
   4513                              N_RCEC_TAB * sizeof(RCEC*) );
   4514    tl_assert(contextTab);
   4515    for (i = 0; i < N_RCEC_TAB; i++)
   4516       contextTab[i] = NULL;
   4517 
   4518    /* Oldref group allocator */
   4519    init_GroupAlloc ( &oldref_group_allocator,
   4520                      sizeof(OldRef),
   4521                      1000 /* OldRefs per group */,
   4522                      HG_(zalloc),
   4523                      "libhb.event_map_init.3 (OldRef groups)",
   4524                      HG_(free) );
   4525 
   4526    /* Oldref tree */
   4527    tl_assert(!oldrefTree);
   4528    oldrefTree = VG_(newSWA)(
   4529                    HG_(zalloc),
   4530                    "libhb.event_map_init.4 (oldref tree)",
   4531                    HG_(free)
   4532                 );
   4533    tl_assert(oldrefTree);
   4534 
   4535    oldrefGen = 0;
   4536    oldrefGenIncAt = 0;
   4537    oldrefTreeN = 0;
   4538 }
   4539 
   4540 static void event_map__check_reference_counts ( Bool before )
   4541 {
   4542    RCEC*   rcec;
   4543    OldRef* oldref;
   4544    Word    i;
   4545    UWord   nEnts = 0;
   4546    UWord   keyW, valW;
   4547 
   4548    /* Set the 'check' reference counts to zero.  Also, optionally
   4549       check that the real reference counts are non-zero.  We allow
   4550       these to fall to zero before a GC, but the GC must get rid of
   4551       all those that are zero, hence none should be zero after a
   4552       GC. */
   4553    for (i = 0; i < N_RCEC_TAB; i++) {
   4554       for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
   4555          nEnts++;
   4556          tl_assert(rcec);
   4557          tl_assert(rcec->magic == RCEC_MAGIC);
   4558          if (!before)
   4559             tl_assert(rcec->rc > 0);
   4560          rcec->rcX = 0;
   4561       }
   4562    }
   4563 
   4564    /* check that the stats are sane */
   4565    tl_assert(nEnts == stats__ctxt_tab_curr);
   4566    tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);
   4567 
   4568    /* visit all the referencing points, inc check ref counts */
   4569    VG_(initIterSWA)( oldrefTree );
   4570    while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
   4571       oldref = (OldRef*)valW;
   4572       tl_assert(oldref->magic == OldRef_MAGIC);
   4573       for (i = 0; i < N_OLDREF_ACCS; i++) {
   4574          ThrID aThrID = oldref->accs[i].thrid;
   4575          RCEC* aRef   = oldref->accs[i].rcec;
   4576          if (aThrID != 0) {
   4577             tl_assert(aRef);
   4578             tl_assert(aRef->magic == RCEC_MAGIC);
   4579             aRef->rcX++;
   4580          } else {
   4581             tl_assert(!aRef);
   4582          }
   4583       }
   4584    }
   4585 
   4586    /* compare check ref counts with actual */
   4587    for (i = 0; i < N_RCEC_TAB; i++) {
   4588       for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
   4589          tl_assert(rcec->rc == rcec->rcX);
   4590       }
   4591    }
   4592 }
   4593 
   4594 __attribute__((noinline))
   4595 static void event_map_maybe_GC ( void )
   4596 {
   4597    OldRef* oldref;
   4598    UWord   keyW, valW, retained, maxGen;
   4599    XArray* refs2del;
   4600    Word    i, j, n2del;
   4601 
   4602    UWord* genMap      = NULL;
   4603    UWord  genMap_min  = 0;
   4604    UWord  genMap_size = 0;
   4605 
   4606    if (LIKELY(oldrefTreeN < HG_(clo_conflict_cache_size)))
   4607       return;
   4608 
   4609    if (0)
   4610       VG_(printf)("libhb: event_map GC at size %lu\n", oldrefTreeN);
   4611 
   4612    /* Check for sane command line params.  Limit values must match
   4613       those in hg_process_cmd_line_option. */
   4614    tl_assert( HG_(clo_conflict_cache_size) >= 10*1000 );
   4615    tl_assert( HG_(clo_conflict_cache_size) <= 30*1000*1000 );
   4616 
   4617    /* Check our counting is sane (expensive) */
   4618    if (CHECK_CEM)
   4619       tl_assert(oldrefTreeN == VG_(sizeSWA)( oldrefTree ));
   4620 
   4621    /* Check the reference counts (expensive) */
   4622    if (CHECK_CEM)
   4623       event_map__check_reference_counts( True/*before*/ );
   4624 
   4625    /* Compute the distribution of generation values in the ref tree.
   4626       There are likely only to be a few different generation numbers
   4627       in the whole tree, but we don't know what they are.  Hence use a
   4628       dynamically resized array of counters.  The array is genMap[0
   4629       .. genMap_size-1], where genMap[0] is the count for the
   4630       generation number genMap_min, genMap[1] is the count for
   4631       genMap_min+1, etc.  If a new number is seen outside the range
   4632       [genMap_min .. genMap_min + genMap_size - 1] then the array is
   4633       copied into a larger array, and genMap_min and genMap_size are
   4634       adjusted accordingly. */
   4635 
   4636    /* genMap :: generation-number -> count-of-nodes-with-that-number */
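
           /* Worked example, with generation numbers invented purely for
              illustration: if the first key seen is 7, genMap covers just
              [7..7] (case 1 below); a later key of 5 grows it to [5..7],
              copying the old counts to the upper end (case 2); a later key
              of 9 grows it to [5..9], copying them to the lower end
              (case 3). */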
   4637 
   4638    VG_(initIterSWA)( oldrefTree );
   4639    while ( VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
   4640 
   4641       UWord ea, key;
   4642       oldref = (OldRef*)valW;
   4643       key = oldref->gen;
   4644 
   4645       /* BEGIN find 'ea', which is the index in genMap holding the
   4646          count for generation number 'key'. */
   4647       if (UNLIKELY(genMap == NULL)) {
   4648          /* deal with the first key to be seen, so that the following
   4649             cases don't need to handle the complexity of a NULL count
   4650             array. */
   4651          genMap_min  = key;
   4652          genMap_size = 1;
   4653          genMap = HG_(zalloc)( "libhb.emmG.1a",
   4654                                 genMap_size * sizeof(UWord) );
   4655          ea = 0;
   4656          if (0) VG_(printf)("(%lu) case 1 [%lu .. %lu]\n",
   4657                             key, genMap_min, genMap_min+genMap_size-1 );
   4658       }
   4659       else
   4660       if (LIKELY(key >= genMap_min && key < genMap_min + genMap_size)) {
   4661          /* this is the expected (almost-always-happens) case: 'key'
   4662             is already mapped in the array. */
   4663          ea = key - genMap_min;
   4664       }
   4665       else
   4666       if (key < genMap_min) {
   4667          /* 'key' appears before the start of the current array.
   4668             Extend the current array by allocating a larger one and
   4669             copying the current one to the upper end of it. */
   4670          Word   more;
   4671          UWord* map2;
   4672          more = genMap_min - key;
   4673          tl_assert(more > 0);
   4674          map2 = HG_(zalloc)( "libhb.emmG.1b",
   4675                              (genMap_size + more) * sizeof(UWord) );
   4676          VG_(memcpy)( &map2[more], genMap, genMap_size * sizeof(UWord) );
   4677          HG_(free)( genMap );
   4678          genMap = map2;
   4679          genMap_size += more;
   4680          genMap_min -= more;
   4681          ea = 0;
   4682          tl_assert(genMap_min == key);
   4683          if (0) VG_(printf)("(%lu) case 2 [%lu .. %lu]\n",
   4684                             key, genMap_min, genMap_min+genMap_size-1 );
   4685       }
   4686       else {
   4687          /* 'key' appears after the end of the current array.  Extend
   4688             the current array by allocating a larger one and copying
   4689             the current one to the lower end of it. */
   4690          Word   more;
   4691          UWord* map2;
   4692          tl_assert(key >= genMap_min + genMap_size);
   4693          more = key - (genMap_min + genMap_size) + 1;
   4694          tl_assert(more > 0);
   4695          map2 = HG_(zalloc)( "libhb.emmG.1c",
   4696                              (genMap_size + more) * sizeof(UWord) );
   4697          VG_(memcpy)( &map2[0], genMap, genMap_size * sizeof(UWord) );
   4698          HG_(free)( genMap );
   4699          genMap = map2;
   4700          genMap_size += more;
   4701          ea = genMap_size - 1;
   4702          tl_assert(genMap_min + genMap_size - 1 == key);
   4703          if (0) VG_(printf)("(%lu) case 3 [%lu .. %lu]\n",
   4704                             key, genMap_min, genMap_min+genMap_size-1 );
   4705       }
   4706       /* END find 'ea' from 'key' */
   4707 
   4708       tl_assert(ea >= 0 && ea < genMap_size);
   4709       /* and the whole point of this elaborate computation of 'ea' is .. */
   4710       genMap[ea]++;
   4711    }
   4712 
   4713    tl_assert(genMap);
   4714    tl_assert(genMap_size > 0);
   4715 
   4716    /* Sanity check what we just computed */
   4717    { UWord sum = 0;
   4718      for (i = 0; i < genMap_size; i++) {
   4719         if (0) VG_(printf)("  xxx: gen %ld has %lu\n",
   4720                            i + genMap_min, genMap[i] );
   4721         sum += genMap[i];
   4722      }
   4723      tl_assert(sum == oldrefTreeN);
   4724    }
   4725 
   4726    /* Figure out how many generations to throw away */
   4727    retained = oldrefTreeN;
   4728    maxGen = 0;
   4729 
   4730    for (i = 0; i < genMap_size; i++) {
   4731       keyW = i + genMap_min;
   4732       valW = genMap[i];
   4733       tl_assert(keyW > 0); /* can't allow a generation # 0 */
   4734       if (0) VG_(printf)("  XXX: gen %lu has %lu\n", keyW, valW );
   4735       tl_assert(keyW >= maxGen);
   4736       tl_assert(retained >= valW);
   4737       if (retained - valW
   4738           > (UWord)(HG_(clo_conflict_cache_size)
   4739                     * EVENT_MAP_GC_DISCARD_FRACTION)) {
   4740          retained -= valW;
   4741          maxGen = keyW;
   4742       } else {
   4743          break;
   4744       }
   4745    }
   4746 
   4747    HG_(free)(genMap);
   4748 
   4749    tl_assert(retained >= 0 && retained <= oldrefTreeN);
   4750 
   4751    /* Now make up a big list of the oldrefTree entries we want to
   4752       delete.  We can't simultaneously traverse the tree and delete
   4753       stuff from it, so first we need to copy them off somewhere
   4754       else. (sigh) */
   4755    refs2del = VG_(newXA)( HG_(zalloc), "libhb.emmG.2",
   4756                           HG_(free), sizeof(Addr) );
   4757 
   4758    if (retained < oldrefTreeN) {
   4759 
   4760       /* This is the normal (expected) case.  We discard any ref whose
   4761          generation number <= maxGen. */
   4762       VG_(initIterSWA)( oldrefTree );
   4763       while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
   4764          oldref = (OldRef*)valW;
   4765          tl_assert(oldref->magic == OldRef_MAGIC);
   4766          if (oldref->gen <= maxGen) {
   4767             VG_(addToXA)( refs2del, &keyW );
   4768          }
   4769       }
   4770       if (VG_(clo_stats)) {
   4771          VG_(message)(Vg_DebugMsg,
   4772             "libhb: EvM GC: delete generations %lu and below, "
   4773             "retaining %lu entries\n",
   4774             maxGen, retained );
   4775       }
   4776 
   4777    } else {
   4778 
   4779       static UInt rand_seed = 0; /* leave as static */
   4780 
   4781       /* Degenerate case: there's only one generation in the entire
   4782          tree, so we need to have some other way of deciding which
   4783          refs to throw away.  Just throw out half of them randomly. */
   4784       tl_assert(retained == oldrefTreeN);
   4785       VG_(initIterSWA)( oldrefTree );
   4786       while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
   4787          UInt n;
   4788          oldref = (OldRef*)valW;
   4789          tl_assert(oldref->magic == OldRef_MAGIC);
   4790          n = VG_(random)( &rand_seed );
   4791          if ((n & 0xFFF) < 0x800) {
   4792             VG_(addToXA)( refs2del, &keyW );
   4793             retained--;
   4794          }
   4795       }
   4796       if (VG_(clo_stats)) {
   4797          VG_(message)(Vg_DebugMsg,
   4798             "libhb: EvM GC: randomly delete half the entries, "
   4799             "retaining %lu entries\n",
   4800             retained );
   4801       }
   4802 
   4803    }
   4804 
   4805    n2del = VG_(sizeXA)( refs2del );
   4806    tl_assert(n2del == (Word)(oldrefTreeN - retained));
   4807 
   4808    if (0) VG_(printf)("%s","deleting entries\n");
   4809    for (i = 0; i < n2del; i++) {
   4810       Bool  b;
   4811       Addr  ga2del = *(Addr*)VG_(indexXA)( refs2del, i );
   4812       b = VG_(delFromSWA)( oldrefTree, &keyW, &valW, ga2del );
   4813       tl_assert(b);
   4814       tl_assert(keyW == ga2del);
   4815       oldref = (OldRef*)valW;
   4816       for (j = 0; j < N_OLDREF_ACCS; j++) {
   4817          ThrID aThrID = oldref->accs[j].thrid;
   4818          RCEC* aRef   = oldref->accs[j].rcec;
   4819          if (aRef) {
   4820             tl_assert(aThrID != 0);
   4821             stats__ctxt_rcdec3++;
   4822             ctxt__rcdec( aRef );
   4823          } else {
   4824             tl_assert(aThrID == 0);
   4825          }
   4826       }
   4827 
   4828       free_OldRef( oldref );
   4829    }
   4830 
   4831    VG_(deleteXA)( refs2del );
   4832 
   4833    tl_assert( VG_(sizeSWA)( oldrefTree ) == retained );
   4834 
   4835    oldrefTreeN = retained;
   4836    oldrefGenIncAt = oldrefTreeN; /* start new gen right away */
   4837 
   4838    /* Throw away all RCECs with zero reference counts */
   4839    for (i = 0; i < N_RCEC_TAB; i++) {
   4840       RCEC** pp = &contextTab[i];
   4841       RCEC*  p  = *pp;
   4842       while (p) {
   4843          if (p->rc == 0) {
   4844             *pp = p->next;
   4845             free_RCEC(p);
   4846             p = *pp;
   4847             tl_assert(stats__ctxt_tab_curr > 0);
   4848             stats__ctxt_tab_curr--;
   4849          } else {
   4850             pp = &p->next;
   4851             p = p->next;
   4852          }
   4853       }
   4854    }
   4855 
   4856    /* Check the reference counts (expensive) */
   4857    if (CHECK_CEM)
   4858       event_map__check_reference_counts( False/*after*/ );
   4859 
   4860    //if (0)
   4861    //VG_(printf)("XXXX final sizes: oldrefTree %ld, contextTree %ld\n\n",
   4862    //            VG_(OSetGen_Size)(oldrefTree), VG_(OSetGen_Size)(contextTree));
   4863 
   4864 }
   4865 
   4866 
   4867 /////////////////////////////////////////////////////////
   4868 //                                                     //
   4869 // Core MSM                                            //
   4870 //                                                     //
   4871 /////////////////////////////////////////////////////////
   4872 
   4873 /* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
   4874    Nov 08, and again after [...],
   4875    June 09. */
   4876 
   4877 static ULong stats__msmcread         = 0;
   4878 static ULong stats__msmcread_change  = 0;
   4879 static ULong stats__msmcwrite        = 0;
   4880 static ULong stats__msmcwrite_change = 0;
   4881 
   4882 /* Some notes on the H1 history mechanism:
   4883 
   4884    Transition rules are:
   4885 
   4886    read_{Kr,Kw}(Cr,Cw)  = (Cr,           Cr `join` Kw)
   4887    write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)
   4888 
   4889    After any access by a thread T to a location L, L's constraint pair
   4890    (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.
   4891 
   4892    After a race by thread T conflicting with some previous access by
   4893    some other thread U, for a location with constraint (before
   4894    processing the later access) (Cr,Cw), then Cw[U] is the segment in
   4895    which the previous access lies.
   4896 
   4897    Hence in record_race_info, we pass in Cfailed and Kfailed, which
   4898    are compared so as to find out which thread(s) this access
   4899    conflicts with.  Once that is established, we also require the
   4900    pre-update Cw for the location, so we can index into it for those
   4901    threads, to get the scalar clock values for the point at which the
   4902    former accesses were made.  (In fact we only bother to do any of
   4903    this for an arbitrarily chosen one of the conflicting threads, as
   4904    that's simpler, it avoids flooding the user with vast amounts of
   4905    mostly useless information, and because the program is wrong if it
   4906    contains any races at all -- so we don't really need to show all
   4907    conflicting access pairs initially, so long as we show at least one
   4908    whenever any exist).
   4909 
   4910    ---
   4911 
   4912    That requires the auxiliary proof that
   4913 
   4914       (Cr `join` Kw)[T] == Kw[T]
   4915 
   4916    Why should that be true?  Because for any thread T, Kw[T] >= the
   4917    scalar clock value for T known by any other thread.  In other
   4918    words, because T's value for its own scalar clock is at least as up
   4919    to date as the value for it known by any other thread (that is true
   4920    for both the R- and W- scalar clocks).  Hence no other thread will
   4921    be able to feed in a value for that element (indirectly via a
   4922    constraint) which will exceed Kw[T], and hence the join cannot
   4923    cause that particular element to advance.
   4924 */
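
        /* A small worked example of the rules above, with clock values
           invented purely for illustration.  Suppose threads T and U, and a
           location whose constraint pair is

              (Cr,Cw) = ([T:3, U:5], [T:3, U:5])

           If T, whose scalar clocks are Kr = Kw = [T:7, U:2], now writes the
           location, the write rule gives

              (Cr',Cw') = (Cr `join` Kw, Cr `join` Kw)
                        = ([T:7, U:5], [T:7, U:5])

           and in particular Cw'[T] == Kw[T] == 7, as the first claim above
           requires. */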
   4925 
   4926 __attribute__((noinline))
   4927 static void record_race_info ( Thr* acc_thr,
   4928                                Addr acc_addr, SizeT szB, Bool isWrite,
   4929                                VtsID Cfailed,
   4930                                VtsID Kfailed,
   4931                                VtsID Cw )
   4932 {
   4933    /* Call here to report a race.  We just hand it onwards to
   4934       HG_(record_error_Race).  If that in turn discovers that the
   4935       error is going to be collected, then, at history_level 2, that
   4936       queries the conflicting-event map.  The alternative would be to
   4937       query it right here.  But that causes a lot of pointless queries
   4938       for errors which will shortly be discarded as duplicates, and
   4939       can become a performance overhead; so we defer the query until
   4940       we know the error is not a duplicate. */
   4941 
   4942    /* Stacks for the bounds of the (or one of the) conflicting
   4943       segment(s).  These are only set at history_level 1. */
   4944    ExeContext* hist1_seg_start = NULL;
   4945    ExeContext* hist1_seg_end   = NULL;
   4946    Thread*     hist1_conf_thr  = NULL;
   4947 
   4948    tl_assert(acc_thr);
   4949    tl_assert(acc_thr->hgthread);
   4950    tl_assert(acc_thr->hgthread->hbthr == acc_thr);
   4951    tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);
   4952 
   4953    if (HG_(clo_history_level) == 1) {
   4954       Bool found;
   4955       Word firstIx, lastIx;
   4956       ULong_n_EC key;
   4957 
   4958       /* At history_level 1, we must round up the relevant stack-pair
   4959          for the conflicting segment right now.  This is because
   4960          deferring it is complex; we can't (easily) put Kfailed and
   4961          Cfailed into the XError and wait for later without
   4962          getting tied up in difficulties with VtsID reference
   4963          counting.  So just do it now. */
   4964       Thr*  confThr;
   4965       ULong confTym = 0;
   4966       /* Which thread are we in conflict with?  There may be more than
   4967          one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
   4968          (in fact it's the one with the lowest Thr* value). */
   4969       confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
   4970       /* This must exist, since if it were NULL then there would be no
   4971          conflict (semantics of return value of
   4972          VtsID__findFirst_notLEQ), and msmc{read,write}, which has
   4973          called us, just checked exactly this -- that there was in
   4974          fact a race. */
   4975       tl_assert(confThr);
   4976 
   4977       /* Get the scalar clock value that the conflicting thread
   4978          introduced into the constraint.  A careful examination of the
   4979          base machine rules shows that this must be the same as the
   4980          conflicting thread's scalar clock when it created this
   4981          constraint.  Hence we know the scalar clock of the
   4982          conflicting thread when the conflicting access was made. */
   4983       confTym = VtsID__indexAt( Cfailed, confThr );
   4984 
   4985       /* Using this scalar clock, index into the conflicting thread's
   4986          collection of stack traces made each time its vector clock
   4987          (hence its scalar clock) changed.  This gives the stack
   4988          traces at the start and end of the conflicting segment (well,
   4989          as per comment just above, of one of the conflicting
   4990          segments, if there are more than one). */
   4991       key.ull = confTym;
   4992       key.ec  = NULL;
   4993       /* tl_assert(confThr); -- asserted just above */
   4994       tl_assert(confThr->local_Kws_n_stacks);
   4995       firstIx = lastIx = 0;
   4996       found = VG_(lookupXA_UNSAFE)(
   4997                  confThr->local_Kws_n_stacks,
   4998                  &key, &firstIx, &lastIx,
   4999                  (Int(*)(void*,void*))cmp__ULong_n_EC__by_ULong
   5000               );
   5001       if (0) VG_(printf)("record_race_info %u %u %u  confThr %p "
   5002                          "confTym %llu found %d (%lu,%lu)\n",
   5003                          Cfailed, Kfailed, Cw,
   5004                          confThr, confTym, found, firstIx, lastIx);
   5005       /* We can't indefinitely collect stack traces at VTS
   5006          transitions, since we'd eventually run out of memory.  Hence
   5007          note_local_Kw_n_stack_for will eventually throw away old
   5008          ones, which in turn means we might fail to find index value
   5009          confTym in the array. */
   5010       if (found) {
   5011          ULong_n_EC *pair_start, *pair_end;
   5012          pair_start
   5013             = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
   5014          hist1_seg_start = pair_start->ec;
   5015          if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
   5016             pair_end
   5017                = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
   5018                                             lastIx+1 );
   5019             /* from properties of VG_(lookupXA) and the comparison fn used: */
   5020             tl_assert(pair_start->ull < pair_end->ull);
   5021             hist1_seg_end = pair_end->ec;
   5022             /* Could do a bit better here.  It may be that pair_end
   5023                doesn't have a stack, but the following entries in the
   5024                array have the same scalar Kw and do have a stack.  So
   5025                we should search a bit further along the array than
   5026                lastIx+1 if hist1_seg_end is NULL. */
   5027          } else {
   5028             if (!confThr->llexit_done)
   5029                hist1_seg_end = main_get_EC( confThr );
   5030          }
   5031          // seg_start could be NULL iff this is the first stack in the thread
   5032          //if (seg_start) VG_(pp_ExeContext)(seg_start);
   5033          //if (seg_end)   VG_(pp_ExeContext)(seg_end);
   5034          hist1_conf_thr = confThr->hgthread;
   5035       }
   5036    }
   5037 
   5038    HG_(record_error_Race)( acc_thr->hgthread, acc_addr,
   5039                            szB, isWrite,
   5040                            hist1_conf_thr, hist1_seg_start, hist1_seg_end );
   5041 }
   5042 
   5043 static Bool is_sane_SVal_C ( SVal sv ) {
   5044    Bool leq;
   5045    if (!SVal__isC(sv)) return True;
   5046    leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
   5047    return leq;
   5048 }
   5049 
   5050 
   5051 /* Compute new state following a read */
   5052 static inline SVal msmcread ( SVal svOld,
   5053                               /* The following are only needed for
   5054                                  creating error reports. */
   5055                               Thr* acc_thr,
   5056                               Addr acc_addr, SizeT szB )
   5057 {
   5058    SVal svNew = SVal_INVALID;
   5059    stats__msmcread++;
   5060 
   5061    /* Redundant sanity check on the constraints */
   5062    if (CHECK_MSM) {
   5063       tl_assert(is_sane_SVal_C(svOld));
   5064    }
   5065 
   5066    if (LIKELY(SVal__isC(svOld))) {
   5067       VtsID tviR  = acc_thr->viR;
   5068       VtsID tviW  = acc_thr->viW;
   5069       VtsID rmini = SVal__unC_Rmin(svOld);
   5070       VtsID wmini = SVal__unC_Wmin(svOld);
   5071       Bool  leq   = VtsID__cmpLEQ(rmini,tviR);
   5072       if (LIKELY(leq)) {
   5073          /* no race */
   5074          /* Note: RWLOCK subtlety: use tviW, not tviR */
   5075          svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
   5076          goto out;
   5077       } else {
   5078          /* assert on sanity of constraints. */
   5079          Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
   5080          tl_assert(leqxx);
   5081          // same as in non-race case
   5082          svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
   5083          record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
   5084                            rmini, /* Cfailed */
   5085                            tviR,  /* Kfailed */
   5086                            wmini  /* Cw */ );
   5087          goto out;
   5088       }
   5089    }
   5090    if (SVal__isA(svOld)) {
   5091       /* reading no-access memory (sigh); leave unchanged */
   5092       /* check for no pollution */
   5093       tl_assert(svOld == SVal_NOACCESS);
   5094       svNew = SVal_NOACCESS;
   5095       goto out;
   5096    }
   5097    if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
   5098    tl_assert(0);
   5099 
   5100   out:
   5101    if (CHECK_MSM) {
   5102       tl_assert(is_sane_SVal_C(svNew));
   5103    }
   5104    if (UNLIKELY(svNew != svOld)) {
   5105       tl_assert(svNew != SVal_INVALID);
   5106       if (HG_(clo_history_level) >= 2
   5107           && SVal__isC(svOld) && SVal__isC(svNew)) {
   5108          event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
   5109          stats__msmcread_change++;
   5110       }
   5111    }
   5112    return svNew;
   5113 }
   5114 
   5115 
   5116 /* Compute new state following a write */
   5117 static inline SVal msmcwrite ( SVal svOld,
   5118                               /* The following are only needed for
   5119                                  creating error reports. */
   5120                               Thr* acc_thr,
   5121                               Addr acc_addr, SizeT szB )
   5122 {
   5123    SVal svNew = SVal_INVALID;
   5124    stats__msmcwrite++;
   5125 
   5126    /* Redundant sanity check on the constraints */
   5127    if (CHECK_MSM) {
   5128       tl_assert(is_sane_SVal_C(svOld));
   5129    }
   5130 
   5131    if (LIKELY(SVal__isC(svOld))) {
   5132       VtsID tviW  = acc_thr->viW;
   5133       VtsID wmini = SVal__unC_Wmin(svOld);
   5134       Bool  leq   = VtsID__cmpLEQ(wmini,tviW);
   5135       if (LIKELY(leq)) {
   5136          /* no race */
   5137          svNew = SVal__mkC( tviW, tviW );
   5138          goto out;
   5139       } else {
   5140          VtsID rmini = SVal__unC_Rmin(svOld);
   5141          /* assert on sanity of constraints. */
   5142          Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
   5143          tl_assert(leqxx);
   5144          // same as in non-race case
   5145          // proof: in the non-race case, we have
   5146          //    rmini <= wmini (invar on constraints)
   5147          //    tviW <= tviR (invar on thread clocks)
   5148          //    wmini <= tviW (from run-time check)
   5149          // hence from transitivity of <= we have
   5150          //    rmini <= wmini <= tviW
   5151          // and so join(rmini,tviW) == tviW
   5152          // and    join(wmini,tviW) == tviW
   5153          // qed.
   5154          svNew = SVal__mkC( VtsID__join2(rmini, tviW),
   5155                             VtsID__join2(wmini, tviW) );
   5156          record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
   5157                            wmini, /* Cfailed */
   5158                            tviW,  /* Kfailed */
   5159                            wmini  /* Cw */ );
   5160          goto out;
   5161       }
   5162    }
   5163    if (SVal__isA(svOld)) {
   5164       /* writing no-access memory (sigh); leave unchanged */
   5165       /* check for no pollution */
   5166       tl_assert(svOld == SVal_NOACCESS);
   5167       svNew = SVal_NOACCESS;
   5168       goto out;
   5169    }
   5170    if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
   5171    tl_assert(0);
   5172 
   5173   out:
   5174    if (CHECK_MSM) {
   5175       tl_assert(is_sane_SVal_C(svNew));
   5176    }
   5177    if (UNLIKELY(svNew != svOld)) {
   5178       tl_assert(svNew != SVal_INVALID);
   5179       if (HG_(clo_history_level) >= 2
   5180           && SVal__isC(svOld) && SVal__isC(svNew)) {
   5181          event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
   5182          stats__msmcwrite_change++;
   5183       }
   5184    }
   5185    return svNew;
   5186 }
   5187 
   5188 
   5189 /////////////////////////////////////////////////////////
   5190 //                                                     //
   5191 // Apply core MSM to specific memory locations         //
   5192 //                                                     //
   5193 /////////////////////////////////////////////////////////
   5194 
   5195 /*------------- ZSM accesses: 8 bit sapply ------------- */
   5196 
   5197 static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
   5198    CacheLine* cl;
   5199    UWord      cloff, tno, toff;
   5200    SVal       svOld, svNew;
   5201    UShort     descr;
   5202    stats__cline_cread08s++;
   5203    cl    = get_cacheline(a);
   5204    cloff = get_cacheline_offset(a);
   5205    tno   = get_treeno(a);
   5206    toff  = get_tree_offset(a); /* == 0 .. 7 */
   5207    descr = cl->descrs[tno];
   5208    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
   5209       SVal* tree = &cl->svals[tno << 3];
   5210       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
   5211       if (CHECK_ZSM)
   5212          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   5213    }
   5214    svOld = cl->svals[cloff];
   5215    svNew = msmcread( svOld, thr,a,1 );
   5216    if (CHECK_ZSM)
   5217       tl_assert(svNew != SVal_INVALID);
   5218    cl->svals[cloff] = svNew;
   5219 }
   5220 
   5221 static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
   5222    CacheLine* cl;
   5223    UWord      cloff, tno, toff;
   5224    SVal       svOld, svNew;
   5225    UShort     descr;
   5226    stats__cline_cwrite08s++;
   5227    cl    = get_cacheline(a);
   5228    cloff = get_cacheline_offset(a);
   5229    tno   = get_treeno(a);
   5230    toff  = get_tree_offset(a); /* == 0 .. 7 */
   5231    descr = cl->descrs[tno];
   5232    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
   5233       SVal* tree = &cl->svals[tno << 3];
   5234       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
   5235       if (CHECK_ZSM)
   5236          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   5237    }
   5238    svOld = cl->svals[cloff];
   5239    svNew = msmcwrite( svOld, thr,a,1 );
   5240    if (CHECK_ZSM)
   5241       tl_assert(svNew != SVal_INVALID);
   5242    cl->svals[cloff] = svNew;
   5243 }
   5244 
   5245 /*------------- ZSM accesses: 16 bit sapply ------------- */
   5246 
   5247 static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
   5248    CacheLine* cl;
   5249    UWord      cloff, tno, toff;
   5250    SVal       svOld, svNew;
   5251    UShort     descr;
   5252    stats__cline_cread16s++;
   5253    if (UNLIKELY(!aligned16(a))) goto slowcase;
   5254    cl    = get_cacheline(a);
   5255    cloff = get_cacheline_offset(a);
   5256    tno   = get_treeno(a);
   5257    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   5258    descr = cl->descrs[tno];
   5259    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
   5260       if (valid_value_is_below_me_16(descr, toff)) {
   5261          goto slowcase;
   5262       } else {
   5263          SVal* tree = &cl->svals[tno << 3];
   5264          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
   5265       }
   5266       if (CHECK_ZSM)
   5267          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   5268    }
   5269    svOld = cl->svals[cloff];
   5270    svNew = msmcread( svOld, thr,a,2 );
   5271    if (CHECK_ZSM)
   5272       tl_assert(svNew != SVal_INVALID);
   5273    cl->svals[cloff] = svNew;
   5274    return;
   5275   slowcase: /* misaligned, or must go further down the tree */
   5276    stats__cline_16to8splits++;
   5277    zsm_sapply08__msmcread( thr, a + 0 );
   5278    zsm_sapply08__msmcread( thr, a + 1 );
   5279 }
   5280 
   5281 static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
   5282    CacheLine* cl;
   5283    UWord      cloff, tno, toff;
   5284    SVal       svOld, svNew;
   5285    UShort     descr;
   5286    stats__cline_cwrite16s++;
   5287    if (UNLIKELY(!aligned16(a))) goto slowcase;
   5288    cl    = get_cacheline(a);
   5289    cloff = get_cacheline_offset(a);
   5290    tno   = get_treeno(a);
   5291    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   5292    descr = cl->descrs[tno];
   5293    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
   5294       if (valid_value_is_below_me_16(descr, toff)) {
   5295          goto slowcase;
   5296       } else {
   5297          SVal* tree = &cl->svals[tno << 3];
   5298          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
   5299       }
   5300       if (CHECK_ZSM)
   5301          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   5302    }
   5303    svOld = cl->svals[cloff];
   5304    svNew = msmcwrite( svOld, thr,a,2 );
   5305    if (CHECK_ZSM)
   5306       tl_assert(svNew != SVal_INVALID);
   5307    cl->svals[cloff] = svNew;
   5308    return;
   5309   slowcase: /* misaligned, or must go further down the tree */
   5310    stats__cline_16to8splits++;
   5311    zsm_sapply08__msmcwrite( thr, a + 0 );
   5312    zsm_sapply08__msmcwrite( thr, a + 1 );
   5313 }
   5314 
   5315 /*------------- ZSM accesses: 32 bit sapply ------------- */
   5316 
   5317 static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
   5318    CacheLine* cl;
   5319    UWord      cloff, tno, toff;
   5320    SVal       svOld, svNew;
   5321    UShort     descr;
   5322    stats__cline_cread32s++;
   5323    if (UNLIKELY(!aligned32(a))) goto slowcase;
   5324    cl    = get_cacheline(a);
   5325    cloff = get_cacheline_offset(a);
   5326    tno   = get_treeno(a);
   5327    toff  = get_tree_offset(a); /* == 0 or 4 */
   5328    descr = cl->descrs[tno];
   5329    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
   5330       if (valid_value_is_above_me_32(descr, toff)) {
   5331          SVal* tree = &cl->svals[tno << 3];
   5332          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
   5333       } else {
   5334          goto slowcase;
   5335       }
   5336       if (CHECK_ZSM)
   5337          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   5338    }
   5339    svOld = cl->svals[cloff];
   5340    svNew = msmcread( svOld, thr,a,4 );
   5341    if (CHECK_ZSM)
   5342       tl_assert(svNew != SVal_INVALID);
   5343    cl->svals[cloff] = svNew;
   5344    return;
   5345   slowcase: /* misaligned, or must go further down the tree */
   5346    stats__cline_32to16splits++;
   5347    zsm_sapply16__msmcread( thr, a + 0 );
   5348    zsm_sapply16__msmcread( thr, a + 2 );
   5349 }
   5350 
   5351 static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
   5352    CacheLine* cl;
   5353    UWord      cloff, tno, toff;
   5354    SVal       svOld, svNew;
   5355    UShort     descr;
   5356    stats__cline_cwrite32s++;
   5357    if (UNLIKELY(!aligned32(a))) goto slowcase;
   5358    cl    = get_cacheline(a);
   5359    cloff = get_cacheline_offset(a);
   5360    tno   = get_treeno(a);
   5361    toff  = get_tree_offset(a); /* == 0 or 4 */
   5362    descr = cl->descrs[tno];
   5363    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
   5364       if (valid_value_is_above_me_32(descr, toff)) {
   5365          SVal* tree = &cl->svals[tno << 3];
   5366          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
   5367       } else {
   5368          goto slowcase;
   5369       }
   5370       if (CHECK_ZSM)
   5371          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   5372    }
   5373    svOld = cl->svals[cloff];
   5374    svNew = msmcwrite( svOld, thr,a,4 );
   5375    if (CHECK_ZSM)
   5376       tl_assert(svNew != SVal_INVALID);
   5377    cl->svals[cloff] = svNew;
   5378    return;
   5379   slowcase: /* misaligned, or must go further down the tree */
   5380    stats__cline_32to16splits++;
   5381    zsm_sapply16__msmcwrite( thr, a + 0 );
   5382    zsm_sapply16__msmcwrite( thr, a + 2 );
   5383 }
   5384 
   5385 /*------------- ZSM accesses: 64 bit sapply ------------- */
   5386 
   5387 static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
   5388    CacheLine* cl;
   5389    UWord      cloff, tno;
   5390    //UWord      toff;
   5391    SVal       svOld, svNew;
   5392    UShort     descr;
   5393    stats__cline_cread64s++;
   5394    if (UNLIKELY(!aligned64(a))) goto slowcase;
   5395    cl    = get_cacheline(a);
   5396    cloff = get_cacheline_offset(a);
   5397    tno   = get_treeno(a);
   5398    //toff  = get_tree_offset(a); /* == 0, unused */
   5399    descr = cl->descrs[tno];
   5400    if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
   5401       goto slowcase;
   5402    }
   5403    svOld = cl->svals[cloff];
   5404    svNew = msmcread( svOld, thr,a,8 );
   5405    if (CHECK_ZSM)
   5406       tl_assert(svNew != SVal_INVALID);
   5407    cl->svals[cloff] = svNew;
   5408    return;
   5409   slowcase: /* misaligned, or must go further down the tree */
   5410    stats__cline_64to32splits++;
   5411    zsm_sapply32__msmcread( thr, a + 0 );
   5412    zsm_sapply32__msmcread( thr, a + 4 );
   5413 }
   5414 
   5415 static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
   5416    CacheLine* cl;
   5417    UWord      cloff, tno;
   5418    //UWord      toff;
   5419    SVal       svOld, svNew;
   5420    UShort     descr;
   5421    stats__cline_cwrite64s++;
   5422    if (UNLIKELY(!aligned64(a))) goto slowcase;
   5423    cl    = get_cacheline(a);
   5424    cloff = get_cacheline_offset(a);
   5425    tno   = get_treeno(a);
   5426    //toff  = get_tree_offset(a); /* == 0, unused */
   5427    descr = cl->descrs[tno];
   5428    if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
   5429       goto slowcase;
   5430    }
   5431    svOld = cl->svals[cloff];
   5432    svNew = msmcwrite( svOld, thr,a,8 );
   5433    if (CHECK_ZSM)
   5434       tl_assert(svNew != SVal_INVALID);
   5435    cl->svals[cloff] = svNew;
   5436    return;
   5437   slowcase: /* misaligned, or must go further down the tree */
   5438    stats__cline_64to32splits++;
   5439    zsm_sapply32__msmcwrite( thr, a + 0 );
   5440    zsm_sapply32__msmcwrite( thr, a + 4 );
   5441 }
   5442 
   5443 /*--------------- ZSM accesses: 8 bit swrite --------------- */
   5444 
   5445 static
   5446 void zsm_swrite08 ( Addr a, SVal svNew ) {
   5447    CacheLine* cl;
   5448    UWord      cloff, tno, toff;
   5449    UShort     descr;
   5450    stats__cline_swrite08s++;
   5451    cl    = get_cacheline(a);
   5452    cloff = get_cacheline_offset(a);
   5453    tno   = get_treeno(a);
   5454    toff  = get_tree_offset(a); /* == 0 .. 7 */
   5455    descr = cl->descrs[tno];
   5456    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
   5457       SVal* tree = &cl->svals[tno << 3];
   5458       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
   5459       if (CHECK_ZSM)
   5460          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   5461    }
   5462    tl_assert(svNew != SVal_INVALID);
   5463    cl->svals[cloff] = svNew;
   5464 }
   5465 
   5466 /*--------------- ZSM accesses: 16 bit swrite --------------- */
   5467 
   5468 static
   5469 void zsm_swrite16 ( Addr a, SVal svNew ) {
   5470    CacheLine* cl;
   5471    UWord      cloff, tno, toff;
   5472    UShort     descr;
   5473    stats__cline_swrite16s++;
   5474    if (UNLIKELY(!aligned16(a))) goto slowcase;
   5475    cl    = get_cacheline(a);
   5476    cloff = get_cacheline_offset(a);
   5477    tno   = get_treeno(a);
   5478    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   5479    descr = cl->descrs[tno];
   5480    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
   5481       if (valid_value_is_below_me_16(descr, toff)) {
   5482          /* Writing at this level.  Need to fix up 'descr'. */
   5483          cl->descrs[tno] = pullup_descr_to_16(descr, toff);
   5484          /* At this point, the tree does not match cl->descrs[tno] any
   5485             more.  The assignments below will fix it up. */
   5486       } else {
   5487          /* We can't indiscriminately write on the w16 node as in the
   5488             w64 case, as that might make the node inconsistent with
   5489             its parent.  So first, pull down to this level. */
   5490          SVal* tree = &cl->svals[tno << 3];
   5491          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
   5492          if (CHECK_ZSM)
   5493             tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   5494       }
   5495    }
   5496    tl_assert(svNew != SVal_INVALID);
   5497    cl->svals[cloff + 0] = svNew;
   5498    cl->svals[cloff + 1] = SVal_INVALID;
   5499    return;
   5500   slowcase: /* misaligned */
   5501    stats__cline_16to8splits++;
   5502    zsm_swrite08( a + 0, svNew );
   5503    zsm_swrite08( a + 1, svNew );
   5504 }
   5505 
   5506 /*--------------- ZSM accesses: 32 bit swrite --------------- */
   5507 
   5508 static
   5509 void zsm_swrite32 ( Addr a, SVal svNew ) {
   5510    CacheLine* cl;
   5511    UWord      cloff, tno, toff;
   5512    UShort     descr;
   5513    stats__cline_swrite32s++;
   5514    if (UNLIKELY(!aligned32(a))) goto slowcase;
   5515    cl    = get_cacheline(a);
   5516    cloff = get_cacheline_offset(a);
   5517    tno   = get_treeno(a);
   5518    toff  = get_tree_offset(a); /* == 0 or 4 */
   5519    descr = cl->descrs[tno];
   5520    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
   5521       if (valid_value_is_above_me_32(descr, toff)) {
   5522          /* We can't indiscriminately write on the w32 node as in the
   5523             w64 case, as that might make the node inconsistent with
   5524             its parent.  So first, pull down to this level. */
   5525          SVal* tree = &cl->svals[tno << 3];
   5526          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
   5527          if (CHECK_ZSM)
   5528             tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   5529       } else {
   5530          /* Writing at this level.  Need to fix up 'descr'. */
   5531          cl->descrs[tno] = pullup_descr_to_32(descr, toff);
   5532          /* At this point, the tree does not match cl->descrs[tno] any
   5533             more.  The assignments below will fix it up. */
   5534       }
   5535    }
   5536    tl_assert(svNew != SVal_INVALID);
   5537    cl->svals[cloff + 0] = svNew;
   5538    cl->svals[cloff + 1] = SVal_INVALID;
   5539    cl->svals[cloff + 2] = SVal_INVALID;
   5540    cl->svals[cloff + 3] = SVal_INVALID;
   5541    return;
   5542   slowcase: /* misaligned */
   5543    stats__cline_32to16splits++;
   5544    zsm_swrite16( a + 0, svNew );
   5545    zsm_swrite16( a + 2, svNew );
   5546 }
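        /* Illustrative note, not part of the original source: the branches in
           zsm_swrite32 above cover the two ways a 32-bit swrite can find its
           tree at the wrong granularity.  If the only valid value sits above
           us (say the whole 8-byte tree currently holds one 64-bit SVal), we
           must first pulldown_to_32, so the 32-bit half we are not writing
           still ends up holding a consistent value.  If the valid values sit
           below us (the four bytes we are about to overwrite are held as 16-
           or 8-bit SVals), nothing needs preserving, so pullup_descr_to_32 is
           enough and the assignments simply overwrite the leaves. */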
   5547 
   5548 /*--------------- ZSM accesses: 64 bit swrite --------------- */
   5549 
   5550 static
   5551 void zsm_swrite64 ( Addr a, SVal svNew ) {
   5552    CacheLine* cl;
   5553    UWord      cloff, tno;
   5554    //UWord    toff;
   5555    stats__cline_swrite64s++;
   5556    if (UNLIKELY(!aligned64(a))) goto slowcase;
   5557    cl    = get_cacheline(a);
   5558    cloff = get_cacheline_offset(a);
   5559    tno   = get_treeno(a);
   5560    //toff  = get_tree_offset(a); /* == 0, unused */
   5561    cl->descrs[tno] = TREE_DESCR_64;
   5562    tl_assert(svNew != SVal_INVALID);
   5563    cl->svals[cloff + 0] = svNew;
   5564    cl->svals[cloff + 1] = SVal_INVALID;
   5565    cl->svals[cloff + 2] = SVal_INVALID;
   5566    cl->svals[cloff + 3] = SVal_INVALID;
   5567    cl->svals[cloff + 4] = SVal_INVALID;
   5568    cl->svals[cloff + 5] = SVal_INVALID;
   5569    cl->svals[cloff + 6] = SVal_INVALID;
   5570    cl->svals[cloff + 7] = SVal_INVALID;
   5571    return;
   5572   slowcase: /* misaligned */
   5573    stats__cline_64to32splits++;
   5574    zsm_swrite32( a + 0, svNew );
   5575    zsm_swrite32( a + 4, svNew );
   5576 }
   5577 
   5578 /*------------- ZSM accesses: 8 bit sread/scopy ------------- */
   5579 
   5580 static
   5581 SVal zsm_sread08 ( Addr a ) {
   5582    CacheLine* cl;
   5583    UWord      cloff, tno, toff;
   5584    UShort     descr;
   5585    stats__cline_sread08s++;
   5586    cl    = get_cacheline(a);
   5587    cloff = get_cacheline_offset(a);
   5588    tno   = get_treeno(a);
   5589    toff  = get_tree_offset(a); /* == 0 .. 7 */
   5590    descr = cl->descrs[tno];
   5591    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
   5592       SVal* tree = &cl->svals[tno << 3];
   5593       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
   5594    }
   5595    return cl->svals[cloff];
   5596 }
   5597 
   5598 static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
   5599    SVal       sv;
   5600    stats__cline_scopy08s++;
   5601    sv = zsm_sread08( src );
   5602    zsm_swrite08( dst, sv );
   5603 }
   5604 
   5605 
   5606 /* Block-copy states (needed for implementing realloc()).  Note this
   5607    doesn't change the filtering arrangements.  The caller of
   5608    zsm_scopy_range needs to attend to that. */
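        /* (For reference: libhb_copy_shadow_state, near the end of this file,
           is such a caller: it follows the copy with a Filter__clear_range
           on the destination range.) */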
   5609 
   5610 static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
   5611 {
   5612    SizeT i;
   5613    if (len == 0)
   5614       return;
   5615 
   5616    /* assert for non-overlappingness */
   5617    tl_assert(src+len <= dst || dst+len <= src);
   5618 
   5619    /* To be simple, just copy byte by byte.  But so as not to wreck
   5620       performance for later accesses to dst[0 .. len-1], normalise
   5621       destination lines as we finish with them, and also normalise the
   5622       line containing the first and last address. */
   5623    for (i = 0; i < len; i++) {
   5624       Bool normalise
   5625          = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
   5626            || i == 0       /* first in range */
   5627            || i == len-1;  /* last in range */
   5628       zsm_scopy08( src+i, dst+i, normalise );
   5629    }
   5630 }
   5631 
   5632 
   5633 /* For setting address ranges to a given value.  Has considerable
   5634    sophistication so as to avoid generating large numbers of pointless
   5635    cache loads/writebacks for large ranges. */
   5636 
   5637 /* Do small ranges in-cache, in the obvious way. */
   5638 static
   5639 void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
   5640 {
   5641    /* fast track a couple of common cases */
   5642    if (len == 4 && aligned32(a)) {
   5643       zsm_swrite32( a, svNew );
   5644       return;
   5645    }
   5646    if (len == 8 && aligned64(a)) {
   5647       zsm_swrite64( a, svNew );
   5648       return;
   5649    }
   5650 
   5651    /* be completely general (but as efficient as possible) */
   5652    if (len == 0) return;
   5653 
   5654    if (!aligned16(a) && len >= 1) {
   5655       zsm_swrite08( a, svNew );
   5656       a += 1;
   5657       len -= 1;
   5658       tl_assert(aligned16(a));
   5659    }
   5660    if (len == 0) return;
   5661 
   5662    if (!aligned32(a) && len >= 2) {
   5663       zsm_swrite16( a, svNew );
   5664       a += 2;
   5665       len -= 2;
   5666       tl_assert(aligned32(a));
   5667    }
   5668    if (len == 0) return;
   5669 
   5670    if (!aligned64(a) && len >= 4) {
   5671       zsm_swrite32( a, svNew );
   5672       a += 4;
   5673       len -= 4;
   5674       tl_assert(aligned64(a));
   5675    }
   5676    if (len == 0) return;
   5677 
   5678    if (len >= 8) {
   5679       tl_assert(aligned64(a));
   5680       while (len >= 8) {
   5681          zsm_swrite64( a, svNew );
   5682          a += 8;
   5683          len -= 8;
   5684       }
   5685       tl_assert(aligned64(a));
   5686    }
   5687    if (len == 0) return;
   5688 
   5689    if (len >= 4)
   5690       tl_assert(aligned32(a));
   5691    if (len >= 4) {
   5692       zsm_swrite32( a, svNew );
   5693       a += 4;
   5694       len -= 4;
   5695    }
   5696    if (len == 0) return;
   5697 
   5698    if (len >= 2)
   5699       tl_assert(aligned16(a));
   5700    if (len >= 2) {
   5701       zsm_swrite16( a, svNew );
   5702       a += 2;
   5703       len -= 2;
   5704    }
   5705    if (len == 0) return;
   5706 
   5707    if (len >= 1) {
   5708       zsm_swrite08( a, svNew );
   5709       //a += 1;
   5710       len -= 1;
   5711    }
   5712    tl_assert(len == 0);
   5713 }
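        /* Worked example, illustrative and not part of the original source:
           zsm_sset_range_SMALL(0x1003, 13, sv) proceeds as
              0x1003 not 2-aligned  -> zsm_swrite08 at 0x1003 (12 bytes left)
              0x1004 is 4-aligned   -> the 16-bit step is skipped
              0x1004 not 8-aligned  -> zsm_swrite32 at 0x1004 (8 bytes left)
              0x1008 is 8-aligned   -> zsm_swrite64 at 0x1008 (0 bytes left)
           ie three stores instead of thirteen byte-sized ones. */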
   5714 
   5715 
   5716 /* If we're doing a small range, hand off to zsm_sset_range_SMALL.  But
   5717    for larger ranges, try to operate directly on the out-of-cache
   5718    representation, rather than dragging lines into the cache,
   5719    overwriting them, and forcing them out.  This turns out to be an
   5720    important performance optimisation.
   5721 
   5722    Note that this doesn't change the filtering arrangements.  The
   5723    caller of zsm_sset_range needs to attend to that. */
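        /* Illustrative split, not part of the original source, and assuming
           the usual N_LINE_ARANGE of 64: zsm_sset_range(0x1030, 0x155, sv)
           computes aligned_start = 0x1040 and after_start = 0x1180.  The 16
           bytes [0x1030, 0x1040) and the 5 bytes [0x1180, 0x1185) go through
           zsm_sset_range_SMALL, whilst the 5 whole lines in [0x1040, 0x1180)
           are written straight into their Z representation (or in-cache, if
           a line happens to be resident). */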
   5724 
   5725 static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
   5726 {
   5727    tl_assert(svNew != SVal_INVALID);
   5728    stats__cache_make_New_arange += (ULong)len;
   5729 
   5730    if (0 && len > 500)
   5731       VG_(printf)("make New      ( %#lx, %ld )\n", a, len );
   5732 
   5733    if (0) {
   5734       static UWord n_New_in_cache = 0;
   5735       static UWord n_New_not_in_cache = 0;
   5736       /* tag is 'a' with the in-line offset masked out,
   5737          ie the low N_LINE_BITS bits of 'a' zeroed */
   5738       Addr       tag = a & ~(N_LINE_ARANGE - 1);
   5739       UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
   5740       if (LIKELY(tag == cache_shmem.tags0[wix])) {
   5741          n_New_in_cache++;
   5742       } else {
   5743          n_New_not_in_cache++;
   5744       }
   5745       if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
   5746          VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
   5747                      n_New_in_cache, n_New_not_in_cache );
   5748    }
   5749 
   5750    if (LIKELY(len < 2 * N_LINE_ARANGE)) {
   5751       zsm_sset_range_SMALL( a, len, svNew );
   5752    } else {
   5753       Addr  before_start  = a;
   5754       Addr  aligned_start = cacheline_ROUNDUP(a);
   5755       Addr  after_start   = cacheline_ROUNDDN(a + len);
   5756       UWord before_len    = aligned_start - before_start;
   5757       UWord aligned_len   = after_start - aligned_start;
   5758       UWord after_len     = a + len - after_start;
   5759       tl_assert(before_start <= aligned_start);
   5760       tl_assert(aligned_start <= after_start);
   5761       tl_assert(before_len < N_LINE_ARANGE);
   5762       tl_assert(after_len < N_LINE_ARANGE);
   5763       tl_assert(get_cacheline_offset(aligned_start) == 0);
   5764       if (get_cacheline_offset(a) == 0) {
   5765          tl_assert(before_len == 0);
   5766          tl_assert(a == aligned_start);
   5767       }
   5768       if (get_cacheline_offset(a+len) == 0) {
   5769          tl_assert(after_len == 0);
   5770          tl_assert(after_start == a+len);
   5771       }
   5772       if (before_len > 0) {
   5773          zsm_sset_range_SMALL( before_start, before_len, svNew );
   5774       }
   5775       if (after_len > 0) {
   5776          zsm_sset_range_SMALL( after_start, after_len, svNew );
   5777       }
   5778       stats__cache_make_New_inZrep += (ULong)aligned_len;
   5779 
   5780       while (1) {
   5781          Addr tag;
   5782          UWord wix;
   5783          if (aligned_start >= after_start)
   5784             break;
   5785          tl_assert(get_cacheline_offset(aligned_start) == 0);
   5786          tag = aligned_start & ~(N_LINE_ARANGE - 1);
   5787          wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
   5788          if (tag == cache_shmem.tags0[wix]) {
   5789             UWord i;
   5790             for (i = 0; i < N_LINE_ARANGE / 8; i++)
   5791                zsm_swrite64( aligned_start + i * 8, svNew );
   5792          } else {
   5793             UWord i;
   5794             Word zix;
   5795             SecMap* sm;
   5796             LineZ* lineZ;
   5797             /* This line is not in the cache.  Do not force it in; instead
   5798                modify it in-place. */
   5799             /* find the Z line to write in and rcdec it or the
   5800                associated F line. */
   5801             find_Z_for_writing( &sm, &zix, tag );
   5802             tl_assert(sm);
   5803             tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
   5804             lineZ = &sm->linesZ[zix];
   5805             lineZ->dict[0] = svNew;
   5806             lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
   5807             for (i = 0; i < N_LINE_ARANGE/4; i++)
   5808                lineZ->ix2s[i] = 0; /* all refer to dict[0] */
   5809             rcinc_LineZ(lineZ);
   5810          }
   5811          aligned_start += N_LINE_ARANGE;
   5812          aligned_len -= N_LINE_ARANGE;
   5813       }
   5814       tl_assert(aligned_start == after_start);
   5815       tl_assert(aligned_len == 0);
   5816    }
   5817 }
   5818 
   5819 
   5820 /////////////////////////////////////////////////////////
   5821 //                                                     //
   5822 // Front-filtering accesses                            //
   5823 //                                                     //
   5824 /////////////////////////////////////////////////////////
   5825 
   5826 static UWord stats__f_ac = 0;
   5827 static UWord stats__f_sk = 0;
   5828 
   5829 #if 0
   5830 #  define STATS__F_SHOW \
   5831      do { \
   5832         if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
   5833            VG_(printf)("filters: ac %lu sk %lu\n",   \
   5834            stats__f_ac, stats__f_sk); \
   5835      } while (0)
   5836 #else
   5837 #  define STATS__F_SHOW /* */
   5838 #endif
   5839 
   5840 void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
   5841    stats__f_ac++;
   5842    STATS__F_SHOW;
   5843    if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
   5844       stats__f_sk++;
   5845       return;
   5846    }
   5847    zsm_sapply08__msmcwrite(thr, a);
   5848 }
   5849 
   5850 void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
   5851    stats__f_ac++;
   5852    STATS__F_SHOW;
   5853    if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
   5854       stats__f_sk++;
   5855       return;
   5856    }
   5857    zsm_sapply16__msmcwrite(thr, a);
   5858 }
   5859 
   5860 void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
   5861    stats__f_ac++;
   5862    STATS__F_SHOW;
   5863    if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
   5864       stats__f_sk++;
   5865       return;
   5866    }
   5867    zsm_sapply32__msmcwrite(thr, a);
   5868 }
   5869 
   5870 void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
   5871    stats__f_ac++;
   5872    STATS__F_SHOW;
   5873    if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
   5874       stats__f_sk++;
   5875       return;
   5876    }
   5877    zsm_sapply64__msmcwrite(thr, a);
   5878 }
   5879 
   5880 void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
   5881 {
   5882    /* fast track a couple of common cases */
   5883    if (len == 4 && aligned32(a)) {
   5884       zsm_sapply32_f__msmcwrite( thr, a );
   5885       return;
   5886    }
   5887    if (len == 8 && aligned64(a)) {
   5888       zsm_sapply64_f__msmcwrite( thr, a );
   5889       return;
   5890    }
   5891 
   5892    /* be completely general (but as efficient as possible) */
   5893    if (len == 0) return;
   5894 
   5895    if (!aligned16(a) && len >= 1) {
   5896       zsm_sapply08_f__msmcwrite( thr, a );
   5897       a += 1;
   5898       len -= 1;
   5899       tl_assert(aligned16(a));
   5900    }
   5901    if (len == 0) return;
   5902 
   5903    if (!aligned32(a) && len >= 2) {
   5904       zsm_sapply16_f__msmcwrite( thr, a );
   5905       a += 2;
   5906       len -= 2;
   5907       tl_assert(aligned32(a));
   5908    }
   5909    if (len == 0) return;
   5910 
   5911    if (!aligned64(a) && len >= 4) {
   5912       zsm_sapply32_f__msmcwrite( thr, a );
   5913       a += 4;
   5914       len -= 4;
   5915       tl_assert(aligned64(a));
   5916    }
   5917    if (len == 0) return;
   5918 
   5919    if (len >= 8) {
   5920       tl_assert(aligned64(a));
   5921       while (len >= 8) {
   5922          zsm_sapply64_f__msmcwrite( thr, a );
   5923          a += 8;
   5924          len -= 8;
   5925       }
   5926       tl_assert(aligned64(a));
   5927    }
   5928    if (len == 0) return;
   5929 
   5930    if (len >= 4)
   5931       tl_assert(aligned32(a));
   5932    if (len >= 4) {
   5933       zsm_sapply32_f__msmcwrite( thr, a );
   5934       a += 4;
   5935       len -= 4;
   5936    }
   5937    if (len == 0) return;
   5938 
   5939    if (len >= 2)
   5940       tl_assert(aligned16(a));
   5941    if (len >= 2) {
   5942       zsm_sapply16_f__msmcwrite( thr, a );
   5943       a += 2;
   5944       len -= 2;
   5945    }
   5946    if (len == 0) return;
   5947 
   5948    if (len >= 1) {
   5949       zsm_sapply08_f__msmcwrite( thr, a );
   5950       //a += 1;
   5951       len -= 1;
   5952    }
   5953    tl_assert(len == 0);
   5954 }
   5955 
   5956 void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
   5957    stats__f_ac++;
   5958    STATS__F_SHOW;
   5959    if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
   5960       stats__f_sk++;
   5961       return;
   5962    }
   5963    zsm_sapply08__msmcread(thr, a);
   5964 }
   5965 
   5966 void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
   5967    stats__f_ac++;
   5968    STATS__F_SHOW;
   5969    if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
   5970       stats__f_sk++;
   5971       return;
   5972    }
   5973    zsm_sapply16__msmcread(thr, a);
   5974 }
   5975 
   5976 void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
   5977    stats__f_ac++;
   5978    STATS__F_SHOW;
   5979    if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
   5980       stats__f_sk++;
   5981       return;
   5982    }
   5983    zsm_sapply32__msmcread(thr, a);
   5984 }
   5985 
   5986 void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
   5987    stats__f_ac++;
   5988    STATS__F_SHOW;
   5989    if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
   5990       stats__f_sk++;
   5991       return;
   5992    }
   5993    zsm_sapply64__msmcread(thr, a);
   5994 }
   5995 
   5996 void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
   5997 {
   5998    /* fast track a couple of common cases */
   5999    if (len == 4 && aligned32(a)) {
   6000       zsm_sapply32_f__msmcread( thr, a );
   6001       return;
   6002    }
   6003    if (len == 8 && aligned64(a)) {
   6004       zsm_sapply64_f__msmcread( thr, a );
   6005       return;
   6006    }
   6007 
   6008    /* be completely general (but as efficient as possible) */
   6009    if (len == 0) return;
   6010 
   6011    if (!aligned16(a) && len >= 1) {
   6012       zsm_sapply08_f__msmcread( thr, a );
   6013       a += 1;
   6014       len -= 1;
   6015       tl_assert(aligned16(a));
   6016    }
   6017    if (len == 0) return;
   6018 
   6019    if (!aligned32(a) && len >= 2) {
   6020       zsm_sapply16_f__msmcread( thr, a );
   6021       a += 2;
   6022       len -= 2;
   6023       tl_assert(aligned32(a));
   6024    }
   6025    if (len == 0) return;
   6026 
   6027    if (!aligned64(a) && len >= 4) {
   6028       zsm_sapply32_f__msmcread( thr, a );
   6029       a += 4;
   6030       len -= 4;
   6031       tl_assert(aligned64(a));
   6032    }
   6033    if (len == 0) return;
   6034 
   6035    if (len >= 8) {
   6036       tl_assert(aligned64(a));
   6037       while (len >= 8) {
   6038          zsm_sapply64_f__msmcread( thr, a );
   6039          a += 8;
   6040          len -= 8;
   6041       }
   6042       tl_assert(aligned64(a));
   6043    }
   6044    if (len == 0) return;
   6045 
   6046    if (len >= 4)
   6047       tl_assert(aligned32(a));
   6048    if (len >= 4) {
   6049       zsm_sapply32_f__msmcread( thr, a );
   6050       a += 4;
   6051       len -= 4;
   6052    }
   6053    if (len == 0) return;
   6054 
   6055    if (len >= 2)
   6056       tl_assert(aligned16(a));
   6057    if (len >= 2) {
   6058       zsm_sapply16_f__msmcread( thr, a );
   6059       a += 2;
   6060       len -= 2;
   6061    }
   6062    if (len == 0) return;
   6063 
   6064    if (len >= 1) {
   6065       zsm_sapply08_f__msmcread( thr, a );
   6066       //a += 1;
   6067       len -= 1;
   6068    }
   6069    tl_assert(len == 0);
   6070 }
   6071 
   6072 void libhb_Thr_resumes ( Thr* thr )
   6073 {
   6074    if (0) VG_(printf)("resume %p\n", thr);
   6075    tl_assert(thr);
   6076    tl_assert(!thr->llexit_done);
   6077    Filter__clear(thr->filter, "libhb_Thr_resumes");
   6078    /* A kludge, but .. if this thread doesn't have any marker stacks
   6079       at all, get one right now.  This is easier than figuring out
   6080       exactly when at thread startup we can and can't take a stack
   6081       snapshot. */
   6082    if (HG_(clo_history_level) == 1) {
   6083       tl_assert(thr->local_Kws_n_stacks);
   6084       if (VG_(sizeXA)( thr->local_Kws_n_stacks ) == 0)
   6085          note_local_Kw_n_stack_for(thr);
   6086    }
   6087 }
   6088 
   6089 
   6090 /////////////////////////////////////////////////////////
   6091 //                                                     //
   6092 // Synchronisation objects                             //
   6093 //                                                     //
   6094 /////////////////////////////////////////////////////////
   6095 
   6096 /* A double linked list of all the SO's. */
   6097 SO* admin_SO = NULL;
   6098 
   6099 static SO* SO__Alloc ( void )
   6100 {
   6101    SO* so = HG_(zalloc)( "libhb.SO__Alloc.1", sizeof(SO) );
   6102    so->viR   = VtsID_INVALID;
   6103    so->viW   = VtsID_INVALID;
   6104    so->magic = SO_MAGIC;
   6105    /* Add to double linked list */
   6106    if (admin_SO) {
   6107       tl_assert(admin_SO->admin_prev == NULL);
   6108       admin_SO->admin_prev = so;
   6109       so->admin_next = admin_SO;
   6110    } else {
   6111       so->admin_next = NULL;
   6112    }
   6113    so->admin_prev = NULL;
   6114    admin_SO = so;
   6115    /* */
   6116    return so;
   6117 }
   6118 
   6119 static void SO__Dealloc ( SO* so )
   6120 {
   6121    tl_assert(so);
   6122    tl_assert(so->magic == SO_MAGIC);
   6123    if (so->viR == VtsID_INVALID) {
   6124       tl_assert(so->viW == VtsID_INVALID);
   6125    } else {
   6126       tl_assert(so->viW != VtsID_INVALID);
   6127       VtsID__rcdec(so->viR);
   6128       VtsID__rcdec(so->viW);
   6129    }
   6130    so->magic = 0;
   6131    /* Del from double linked list */
   6132    if (so->admin_prev)
   6133       so->admin_prev->admin_next = so->admin_next;
   6134    if (so->admin_next)
   6135       so->admin_next->admin_prev = so->admin_prev;
   6136    if (so == admin_SO)
   6137       admin_SO = so->admin_next;
   6138    /* */
   6139    HG_(free)( so );
   6140 }
   6141 
   6142 
   6143 /////////////////////////////////////////////////////////
   6144 //                                                     //
   6145 // Top Level API                                       //
   6146 //                                                     //
   6147 /////////////////////////////////////////////////////////
   6148 
   6149 static void show_thread_state ( HChar* str, Thr* t )
   6150 {
   6151    if (1) return;
   6152    if (t->viR == t->viW) {
   6153       VG_(printf)("thr \"%s\" %p has vi* %u==", str, t, t->viR );
   6154       VtsID__pp( t->viR );
   6155       VG_(printf)("%s","\n");
   6156    } else {
   6157       VG_(printf)("thr \"%s\" %p has viR %u==", str, t, t->viR );
   6158       VtsID__pp( t->viR );
   6159       VG_(printf)(" viW %u==", t->viW);
   6160       VtsID__pp( t->viW );
   6161       VG_(printf)("%s","\n");
   6162    }
   6163 }
   6164 
   6165 
   6166 Thr* libhb_init (
   6167         void        (*get_stacktrace)( Thr*, Addr*, UWord ),
   6168         ExeContext* (*get_EC)( Thr* )
   6169      )
   6170 {
   6171    Thr*  thr;
   6172    VtsID vi;
   6173 
   6174    // We will have to store a large number of these,
   6175    // so make sure they're the size we expect them to be.
   6176    tl_assert(sizeof(ScalarTS) == 8);
   6177 
   6178    /* because the first 1024 thread IDs are unusable */
   6179    tl_assert(SCALARTS_N_THRBITS >= 11);
   6180    /* so as to fit in a UInt w/ 3 bits to spare (see defn of
   6181       Thr_n_RCEC). */
   6182    tl_assert(SCALARTS_N_THRBITS <= 29);
   6183 
   6184    /* Need to be sure that Thr_n_RCEC is 2 words (64-bit) or 3 words
   6185       (32-bit).  It's not correctness-critical, but there are a lot of
   6186       them, so it's important from a space viewpoint.  Unfortunately
   6187       we simply can't pack it into 2 words on a 32-bit target. */
   6188    if (sizeof(UWord) == 8) {
   6189       tl_assert(sizeof(Thr_n_RCEC) == 16);
   6190    } else {
   6191       tl_assert(sizeof(Thr_n_RCEC) == 12);
   6192    }
   6193 
   6194    /* Word sets really are 32 bits.  Even on a 64 bit target. */
   6195    tl_assert(sizeof(WordSetID) == 4);
   6196    tl_assert(sizeof(WordSet) == sizeof(WordSetID));
   6197 
   6198    tl_assert(get_stacktrace);
   6199    tl_assert(get_EC);
   6200    main_get_stacktrace   = get_stacktrace;
   6201    main_get_EC           = get_EC;
   6202 
   6203    // No need to initialise hg_wordfm.
   6204    // No need to initialise hg_wordset.
   6205 
   6206    /* Allocated once and never deallocated.  Used as a temporary in
   6207       VTS singleton, tick and join operations. */
   6208    temp_max_sized_VTS = VTS__new( "libhb.libhb_init.1", ThrID_MAX_VALID );
   6209    temp_max_sized_VTS->id = VtsID_INVALID;
   6210    verydead_thread_table_init();
   6211    vts_set_init();
   6212    vts_tab_init();
   6213    event_map_init();
   6214    VtsID__invalidate_caches();
   6215 
   6216    // initialise shadow memory
   6217    zsm_init( SVal__rcinc, SVal__rcdec );
   6218 
   6219    thr = Thr__new();
   6220    vi  = VtsID__mk_Singleton( thr, 1 );
   6221    thr->viR = vi;
   6222    thr->viW = vi;
   6223    VtsID__rcinc(thr->viR);
   6224    VtsID__rcinc(thr->viW);
   6225 
   6226    show_thread_state("  root", thr);
   6227    return thr;
   6228 }
   6229 
   6230 
   6231 Thr* libhb_create ( Thr* parent )
   6232 {
   6233    /* The child's VTSs are copies of the parent's VTSs, but ticked at
   6234       the child's index.  Since the child's index is guaranteed
   6235       unique, it has never been seen before, so the implicit value
   6236       before the tick is zero and after that is one. */
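           /* For example (illustrative, not part of the original source): if
              the parent's clocks are { P:5, Q:2 }, the child starts life with
              { P:5, Q:2, C:1 }, that is, the parent's VTS ticked at the
              child's own index C; further down, the parent's own clocks
              advance to { P:6, Q:2 }. */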
   6237    Thr* child = Thr__new();
   6238 
   6239    child->viR = VtsID__tick( parent->viR, child );
   6240    child->viW = VtsID__tick( parent->viW, child );
   6241    Filter__clear(child->filter, "libhb_create(child)");
   6242    VtsID__rcinc(child->viR);
   6243    VtsID__rcinc(child->viW);
   6244    /* We need to do note_local_Kw_n_stack_for( child ), but it's too
   6245       early for that - it may not have a valid TId yet.  So, let
   6246       libhb_Thr_resumes pick it up the first time the thread runs. */
   6247 
   6248    tl_assert(VtsID__indexAt( child->viR, child ) == 1);
   6249    tl_assert(VtsID__indexAt( child->viW, child ) == 1);
   6250 
   6251    /* and the parent has to move along too */
   6252    VtsID__rcdec(parent->viR);
   6253    VtsID__rcdec(parent->viW);
   6254    parent->viR = VtsID__tick( parent->viR, parent );
   6255    parent->viW = VtsID__tick( parent->viW, parent );
   6256    Filter__clear(parent->filter, "libhb_create(parent)");
   6257    VtsID__rcinc(parent->viR);
   6258    VtsID__rcinc(parent->viW);
   6259    note_local_Kw_n_stack_for( parent );
   6260 
   6261    show_thread_state(" child", child);
   6262    show_thread_state("parent", parent);
   6263 
   6264    return child;
   6265 }
   6266 
   6267 /* Shut down the library, and print stats (in fact that's _all_
   6268    this is for). */
   6269 void libhb_shutdown ( Bool show_stats )
   6270 {
   6271    if (show_stats) {
   6272       VG_(printf)("%s","<<< BEGIN libhb stats >>>\n");
   6273       VG_(printf)(" secmaps: %'10lu allocd (%'12lu g-a-range)\n",
   6274                   stats__secmaps_allocd,
   6275                   stats__secmap_ga_space_covered);
   6276       VG_(printf)("  linesZ: %'10lu allocd (%'12lu bytes occupied)\n",
   6277                   stats__secmap_linesZ_allocd,
   6278                   stats__secmap_linesZ_bytes);
   6279       VG_(printf)("  linesF: %'10lu allocd (%'12lu bytes occupied)\n",
   6280                   stats__secmap_linesF_allocd,
   6281                   stats__secmap_linesF_bytes);
   6282       VG_(printf)(" secmaps: %'10lu iterator steppings\n",
   6283                   stats__secmap_iterator_steppings);
   6284       VG_(printf)(" secmaps: %'10lu searches (%'12lu slow)\n",
   6285                   stats__secmaps_search, stats__secmaps_search_slow);
   6286 
   6287       VG_(printf)("%s","\n");
   6288       VG_(printf)("   cache: %'lu totrefs (%'lu misses)\n",
   6289                   stats__cache_totrefs, stats__cache_totmisses );
   6290       VG_(printf)("   cache: %'14lu Z-fetch,    %'14lu F-fetch\n",
   6291                   stats__cache_Z_fetches, stats__cache_F_fetches );
   6292       VG_(printf)("   cache: %'14lu Z-wback,    %'14lu F-wback\n",
   6293                   stats__cache_Z_wbacks, stats__cache_F_wbacks );
   6294       VG_(printf)("   cache: %'14lu invals,     %'14lu flushes\n",
   6295                   stats__cache_invals, stats__cache_flushes );
   6296       VG_(printf)("   cache: %'14llu arange_New  %'14llu direct-to-Zreps\n",
   6297                   stats__cache_make_New_arange,
   6298                   stats__cache_make_New_inZrep);
   6299 
   6300       VG_(printf)("%s","\n");
   6301       VG_(printf)("   cline: %'10lu normalises\n",
   6302                   stats__cline_normalises );
   6303       VG_(printf)("   cline: c rds 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
   6304                   stats__cline_cread64s,
   6305                   stats__cline_cread32s,
   6306                   stats__cline_cread16s,
   6307                   stats__cline_cread08s );
   6308       VG_(printf)("   cline: c wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
   6309                   stats__cline_cwrite64s,
   6310                   stats__cline_cwrite32s,
   6311                   stats__cline_cwrite16s,
   6312                   stats__cline_cwrite08s );
   6313       VG_(printf)("   cline: s wrs 8/4/2/1: %'13lu %'13lu %'13lu %'13lu\n",
   6314                   stats__cline_swrite64s,
   6315                   stats__cline_swrite32s,
   6316                   stats__cline_swrite16s,
   6317                   stats__cline_swrite08s );
   6318       VG_(printf)("   cline: s rd1s %'lu, s copy1s %'lu\n",
   6319                   stats__cline_sread08s, stats__cline_scopy08s );
   6320       VG_(printf)("   cline:    splits: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
   6321                  stats__cline_64to32splits,
   6322                  stats__cline_32to16splits,
   6323                  stats__cline_16to8splits );
   6324       VG_(printf)("   cline: pulldowns: 8to4 %'12lu    4to2 %'12lu    2to1 %'12lu\n",
   6325                  stats__cline_64to32pulldown,
   6326                  stats__cline_32to16pulldown,
   6327                  stats__cline_16to8pulldown );
   6328       if (0)
   6329       VG_(printf)("   cline: sizeof(CacheLineZ) %ld, covers %ld bytes of arange\n",
   6330                   (Word)sizeof(LineZ), (Word)N_LINE_ARANGE);
   6331 
   6332       VG_(printf)("%s","\n");
   6333 
   6334       VG_(printf)("   libhb: %'13llu msmcread  (%'llu dragovers)\n",
   6335                   stats__msmcread, stats__msmcread_change);
   6336       VG_(printf)("   libhb: %'13llu msmcwrite (%'llu dragovers)\n",
   6337                   stats__msmcwrite, stats__msmcwrite_change);
   6338       VG_(printf)("   libhb: %'13llu cmpLEQ queries (%'llu misses)\n",
   6339                   stats__cmpLEQ_queries, stats__cmpLEQ_misses);
   6340       VG_(printf)("   libhb: %'13llu join2  queries (%'llu misses)\n",
   6341                   stats__join2_queries, stats__join2_misses);
   6342 
   6343       VG_(printf)("%s","\n");
   6344       VG_(printf)( "   libhb: VTSops: tick %'lu,  join %'lu,  cmpLEQ %'lu\n",
   6345                    stats__vts__tick, stats__vts__join,  stats__vts__cmpLEQ );
   6346       VG_(printf)( "   libhb: VTSops: cmp_structural %'lu (%'lu slow)\n",
   6347                    stats__vts__cmp_structural, stats__vts__cmp_structural_slow );
   6348       VG_(printf)( "   libhb: VTSset: find__or__clone_and_add %'lu (%'lu allocd)\n",
   6349                    stats__vts_set__focaa, stats__vts_set__focaa_a );
   6350       VG_(printf)( "   libhb: VTSops: indexAt_SLOW %'lu\n",
   6351                    stats__vts__indexat_slow );
   6352 
   6353       VG_(printf)("%s","\n");
   6354       VG_(printf)(
   6355          "   libhb: %ld entries in vts_table (approximately %lu bytes)\n",
   6356          VG_(sizeXA)( vts_tab ), VG_(sizeXA)( vts_tab ) * sizeof(VtsTE)
   6357       );
   6358       VG_(printf)( "   libhb: %lu entries in vts_set\n",
   6359                    VG_(sizeFM)( vts_set ) );
   6360 
   6361       VG_(printf)("%s","\n");
   6362       VG_(printf)( "   libhb: ctxt__rcdec: 1=%lu(%lu eq), 2=%lu, 3=%lu\n",
   6363                    stats__ctxt_rcdec1, stats__ctxt_rcdec1_eq,
   6364                    stats__ctxt_rcdec2,
   6365                    stats__ctxt_rcdec3 );
   6366       VG_(printf)( "   libhb: ctxt__rcdec: calls %lu, discards %lu\n",
   6367                    stats__ctxt_rcdec_calls, stats__ctxt_rcdec_discards);
   6368       VG_(printf)( "   libhb: contextTab: %lu slots, %lu max ents\n",
   6369                    (UWord)N_RCEC_TAB,
   6370                    stats__ctxt_tab_curr );
   6371       VG_(printf)( "   libhb: contextTab: %lu queries, %lu cmps\n",
   6372                    stats__ctxt_tab_qs,
   6373                    stats__ctxt_tab_cmps );
   6374 #if 0
   6375       VG_(printf)("sizeof(AvlNode)     = %lu\n", sizeof(AvlNode));
   6376       VG_(printf)("sizeof(WordBag)     = %lu\n", sizeof(WordBag));
   6377       VG_(printf)("sizeof(MaybeWord)   = %lu\n", sizeof(MaybeWord));
   6378       VG_(printf)("sizeof(CacheLine)   = %lu\n", sizeof(CacheLine));
   6379       VG_(printf)("sizeof(LineZ)       = %lu\n", sizeof(LineZ));
   6380       VG_(printf)("sizeof(LineF)       = %lu\n", sizeof(LineF));
   6381       VG_(printf)("sizeof(SecMap)      = %lu\n", sizeof(SecMap));
   6382       VG_(printf)("sizeof(Cache)       = %lu\n", sizeof(Cache));
   6383       VG_(printf)("sizeof(SMCacheEnt)  = %lu\n", sizeof(SMCacheEnt));
   6384       VG_(printf)("sizeof(CountedSVal) = %lu\n", sizeof(CountedSVal));
   6385       VG_(printf)("sizeof(VTS)         = %lu\n", sizeof(VTS));
   6386       VG_(printf)("sizeof(ScalarTS)    = %lu\n", sizeof(ScalarTS));
   6387       VG_(printf)("sizeof(VtsTE)       = %lu\n", sizeof(VtsTE));
   6388       VG_(printf)("sizeof(MSMInfo)     = %lu\n", sizeof(MSMInfo));
   6389 
   6390       VG_(printf)("sizeof(struct _XArray)     = %lu\n", sizeof(struct _XArray));
   6391       VG_(printf)("sizeof(struct _WordFM)     = %lu\n", sizeof(struct _WordFM));
   6392       VG_(printf)("sizeof(struct _Thr)     = %lu\n", sizeof(struct _Thr));
   6393       VG_(printf)("sizeof(struct _SO)     = %lu\n", sizeof(struct _SO));
   6394 #endif
   6395 
   6396       VG_(printf)("%s","<<< END libhb stats >>>\n");
   6397       VG_(printf)("%s","\n");
   6398 
   6399    }
   6400 }
   6401 
   6402 /* Receive notification that a thread has low-level exited.  The
   6403    significance here is that we do not expect to see any more memory
   6404    references from it. */
   6405 void libhb_async_exit ( Thr* thr )
   6406 {
   6407    tl_assert(thr);
   6408    tl_assert(!thr->llexit_done);
   6409    thr->llexit_done = True;
   6410 
   6411    /* free up Filter and local_Kws_n_stacks (well, actually not the
   6412       latter ..) */
   6413    tl_assert(thr->filter);
   6414    HG_(free)(thr->filter);
   6415    thr->filter = NULL;
   6416 
   6417    /* Tell the VTS mechanism this thread has exited, so it can
   6418       participate in VTS pruning.  Note this can only happen if the
   6419       thread has both ll_exited and has been joined with. */
   6420       thread has both ll_exited and been joined with. */
   6421       VTS__declare_thread_very_dead(thr);
   6422 
   6423    /* Another space-accuracy tradeoff.  Do we want to be able to show
   6424       H1 history for conflicts in threads which have since exited?  If
   6425       yes, then we better not free up thr->local_Kws_n_stacks.  The
   6426       downside is a potential per-thread leak of up to
   6427       N_KWs_N_STACKs_PER_THREAD * sizeof(ULong_n_EC) * whatever the
   6428       XArray average overcommit factor is (1.5 I'd guess). */
   6429    // hence:
   6430    // VG_(deleteXA)(thr->local_Kws_n_stacks);
   6431    // thr->local_Kws_n_stacks = NULL;
   6432 }
   6433 
   6434 /* Receive notification that a thread has been joined with.  The
   6435    significance here is that we do not expect to see any further
   6436    references to its vector clocks (Thr::viR and Thr::viW). */
   6437 void libhb_joinedwith_done ( Thr* thr )
   6438 {
   6439    tl_assert(thr);
   6440    /* Caller must ensure that this is only ever called once per Thr. */
   6441    tl_assert(!thr->joinedwith_done);
   6442    thr->joinedwith_done = True;
   6443    if (thr->llexit_done)
   6444       VTS__declare_thread_very_dead(thr);
   6445 }
   6446 
   6447 
   6448 /* Both Segs and SOs point to VTSs.  However, there is no sharing, so
   6449    a Seg that points at a VTS is its one-and-only owner, and ditto for
   6450    a SO that points at a VTS. */
   6451 
   6452 SO* libhb_so_alloc ( void )
   6453 {
   6454    return SO__Alloc();
   6455 }
   6456 
   6457 void libhb_so_dealloc ( SO* so )
   6458 {
   6459    tl_assert(so);
   6460    tl_assert(so->magic == SO_MAGIC);
   6461    SO__Dealloc(so);
   6462 }
   6463 
   6464 /* See comments in libhb.h for details on the meaning of
   6465    strong vs weak sends and strong vs weak receives. */
   6466 void libhb_so_send ( Thr* thr, SO* so, Bool strong_send )
   6467 {
   6468    /* Copy the VTSs from 'thr' into the sync object, and then move
   6469       the thread along one step. */
   6470 
   6471    tl_assert(so);
   6472    tl_assert(so->magic == SO_MAGIC);
   6473 
   6474    /* stay sane .. a thread's read-clock must always lead or be the
   6475       same as its write-clock */
   6476    { Bool leq = VtsID__cmpLEQ(thr->viW, thr->viR);
   6477      tl_assert(leq);
   6478    }
   6479 
   6480    /* since we're overwriting the VtsIDs in the SO, we need to drop
   6481       any references made by the previous contents thereof */
   6482    if (so->viR == VtsID_INVALID) {
   6483       tl_assert(so->viW == VtsID_INVALID);
   6484       so->viR = thr->viR;
   6485       so->viW = thr->viW;
   6486       VtsID__rcinc(so->viR);
   6487       VtsID__rcinc(so->viW);
   6488    } else {
   6489       /* In a strong send, we dump any previous VC in the SO and
   6490          install the sending thread's VC instead.  For a weak send we
   6491          must join2 with what's already there. */
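              /* Illustrative join, not part of the original source: if the SO
                 already holds { P:3, Q:7 } and the sender's clock is
                 { P:5, Q:2 }, a weak send leaves the SO with the element-wise
                 maximum { P:5, Q:7 }, whereas a strong send simply replaces
                 it with { P:5, Q:2 }. */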
   6492       tl_assert(so->viW != VtsID_INVALID);
   6493       VtsID__rcdec(so->viR);
   6494       VtsID__rcdec(so->viW);
   6495       so->viR = strong_send ? thr->viR : VtsID__join2( so->viR, thr->viR );
   6496       so->viW = strong_send ? thr->viW : VtsID__join2( so->viW, thr->viW );
   6497       VtsID__rcinc(so->viR);
   6498       VtsID__rcinc(so->viW);
   6499    }
   6500 
   6501    /* move both parent clocks along */
   6502    VtsID__rcdec(thr->viR);
   6503    VtsID__rcdec(thr->viW);
   6504    thr->viR = VtsID__tick( thr->viR, thr );
   6505    thr->viW = VtsID__tick( thr->viW, thr );
   6506    if (!thr->llexit_done) {
   6507       Filter__clear(thr->filter, "libhb_so_send");
   6508       note_local_Kw_n_stack_for(thr);
   6509    }
   6510    VtsID__rcinc(thr->viR);
   6511    VtsID__rcinc(thr->viW);
   6512 
   6513    if (strong_send)
   6514       show_thread_state("s-send", thr);
   6515    else
   6516       show_thread_state("w-send", thr);
   6517 }
   6518 
   6519 void libhb_so_recv ( Thr* thr, SO* so, Bool strong_recv )
   6520 {
   6521    tl_assert(so);
   6522    tl_assert(so->magic == SO_MAGIC);
   6523 
   6524    if (so->viR != VtsID_INVALID) {
   6525       tl_assert(so->viW != VtsID_INVALID);
   6526 
   6527       /* Weak receive (basically, an R-acquisition of a R-W lock).
   6528          This advances the read-clock of the receiver, but not the
   6529          write-clock. */
   6530       VtsID__rcdec(thr->viR);
   6531       thr->viR = VtsID__join2( thr->viR, so->viR );
   6532       VtsID__rcinc(thr->viR);
   6533 
   6534       /* At one point (r10589) it seemed safest to tick the clocks for
   6535          the receiving thread after the join.  But on reflection, I
   6536          wonder if that might cause it to 'overtake' constraints,
   6537          which could lead to missing races.  So, back out that part of
   6538          r10589. */
   6539       //VtsID__rcdec(thr->viR);
   6540       //thr->viR = VtsID__tick( thr->viR, thr );
   6541       //VtsID__rcinc(thr->viR);
   6542 
   6543       /* For a strong receive, we also advance the receiver's write
   6544          clock, which means the receive as a whole is essentially
   6545          equivalent to a W-acquisition of a R-W lock. */
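              /* Illustrative example, not part of the original source: a
                 receiver with viR = viW = { P:3 } receiving from an SO
                 holding { Q:7 } in both clocks always ends up with
                 viR = { P:3, Q:7 }; only a strong receive also moves viW on
                 to { P:3, Q:7 }. */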
   6546       if (strong_recv) {
   6547          VtsID__rcdec(thr->viW);
   6548          thr->viW = VtsID__join2( thr->viW, so->viW );
   6549          VtsID__rcinc(thr->viW);
   6550 
   6551          /* See comment just above, re r10589. */
   6552          //VtsID__rcdec(thr->viW);
   6553          //thr->viW = VtsID__tick( thr->viW, thr );
   6554          //VtsID__rcinc(thr->viW);
   6555       }
   6556 
   6557       if (thr->filter)
   6558          Filter__clear(thr->filter, "libhb_so_recv");
   6559       note_local_Kw_n_stack_for(thr);
   6560 
   6561       if (strong_recv)
   6562          show_thread_state("s-recv", thr);
   6563       else
   6564          show_thread_state("w-recv", thr);
   6565 
   6566    } else {
   6567       tl_assert(so->viW == VtsID_INVALID);
   6568       /* Deal with degenerate case: 'so' has no vts, so there has been
   6569          no message posted to it.  Just ignore this case. */
   6570       show_thread_state("d-recv", thr);
   6571    }
   6572 }
   6573 
   6574 Bool libhb_so_everSent ( SO* so )
   6575 {
   6576    if (so->viR == VtsID_INVALID) {
   6577       tl_assert(so->viW == VtsID_INVALID);
   6578       return False;
   6579    } else {
   6580       tl_assert(so->viW != VtsID_INVALID);
   6581       return True;
   6582    }
   6583 }
   6584 
   6585 #define XXX1 0 // 0x67a106c
   6586 #define XXX2 0
   6587 
   6588 static inline Bool TRACEME(Addr a, SizeT szB) {
   6589    if (XXX1 && a <= XXX1 && XXX1 <= a+szB) return True;
   6590    if (XXX2 && a <= XXX2 && XXX2 <= a+szB) return True;
   6591    return False;
   6592 }
   6593 static void trace ( Thr* thr, Addr a, SizeT szB, HChar* s ) {
   6594    SVal sv = zsm_sread08(a);
   6595    VG_(printf)("thr %p (%#lx,%lu) %s: 0x%016llx ", thr,a,szB,s,sv);
   6596    show_thread_state("", thr);
   6597    VG_(printf)("%s","\n");
   6598 }
   6599 
   6600 void libhb_srange_new ( Thr* thr, Addr a, SizeT szB )
   6601 {
   6602    SVal sv = SVal__mkC(thr->viW, thr->viW);
   6603    tl_assert(is_sane_SVal_C(sv));
   6604    if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-before");
   6605    zsm_sset_range( a, szB, sv );
   6606    Filter__clear_range( thr->filter, a, szB );
   6607    if (0 && TRACEME(a,szB)) trace(thr,a,szB,"nw-after ");
   6608 }
   6609 
   6610 void libhb_srange_noaccess_NoFX ( Thr* thr, Addr a, SizeT szB )
   6611 {
   6612    /* do nothing */
   6613 }
   6614 
   6615 void libhb_srange_noaccess_AHAE ( Thr* thr, Addr a, SizeT szB )
   6616 {
   6617    /* This really does put the requested range in NoAccess.  It's
   6618       expensive though. */
   6619    SVal sv = SVal_NOACCESS;
   6620    tl_assert(is_sane_SVal_C(sv));
   6621    zsm_sset_range( a, szB, sv );
   6622    Filter__clear_range( thr->filter, a, szB );
   6623 }
   6624 
   6625 void libhb_srange_untrack ( Thr* thr, Addr a, SizeT szB )
   6626 {
   6627    SVal sv = SVal_NOACCESS;
   6628    tl_assert(is_sane_SVal_C(sv));
   6629    if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-before");
   6630    zsm_sset_range( a, szB, sv );
   6631    Filter__clear_range( thr->filter, a, szB );
   6632    if (0 && TRACEME(a,szB)) trace(thr,a,szB,"untrack-after ");
   6633 }
   6634 
   6635 Thread* libhb_get_Thr_hgthread ( Thr* thr ) {
   6636    tl_assert(thr);
   6637    return thr->hgthread;
   6638 }
   6639 
   6640 void libhb_set_Thr_hgthread ( Thr* thr, Thread* hgthread ) {
   6641    tl_assert(thr);
   6642    thr->hgthread = hgthread;
   6643 }
   6644 
   6645 void libhb_copy_shadow_state ( Thr* thr, Addr src, Addr dst, SizeT len )
   6646 {
   6647    zsm_scopy_range(src, dst, len);
   6648    Filter__clear_range( thr->filter, dst, len );
   6649 }
   6650 
   6651 void libhb_maybe_GC ( void )
   6652 {
   6653    event_map_maybe_GC();
   6654    /* If there are still freelist entries available, no need for a
   6655       GC. */
   6656    if (vts_tab_freelist != VtsID_INVALID)
   6657       return;
   6658    /* So all the table entries are full, and we're having to expand
   6659       the table.  But did we hit the threshold point yet? */
   6660    if (VG_(sizeXA)( vts_tab ) < vts_next_GC_at)
   6661       return;
   6662    vts_tab__do_GC( False/*don't show stats*/ );
   6663 }
   6664 
   6665 
   6666 /////////////////////////////////////////////////////////////////
   6667 /////////////////////////////////////////////////////////////////
   6668 //                                                             //
   6669 // SECTION END main library                                    //
   6670 //                                                             //
   6671 /////////////////////////////////////////////////////////////////
   6672 /////////////////////////////////////////////////////////////////
   6673 
   6674 /*--------------------------------------------------------------------*/
   6675 /*--- end                                             libhb_main.c ---*/
   6676 /*--------------------------------------------------------------------*/
   6677