      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- LibHB: a library for implementing and checking               ---*/
      4 /*--- the happens-before relationship in concurrent programs.      ---*/
      5 /*---                                                 libhb_main.c ---*/
      6 /*--------------------------------------------------------------------*/
      7 
      8 /*
      9    This file is part of LibHB, a library for implementing and checking
     10    the happens-before relationship in concurrent programs.
     11 
     12    Copyright (C) 2008-2010 OpenWorks Ltd
     13       info (at) open-works.co.uk
     14 
     15    This program is free software; you can redistribute it and/or
     16    modify it under the terms of the GNU General Public License as
     17    published by the Free Software Foundation; either version 2 of the
     18    License, or (at your option) any later version.
     19 
     20    This program is distributed in the hope that it will be useful, but
     21    WITHOUT ANY WARRANTY; without even the implied warranty of
     22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     23    General Public License for more details.
     24 
     25    You should have received a copy of the GNU General Public License
     26    along with this program; if not, write to the Free Software
     27    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     28    02111-1307, USA.
     29 
     30    The GNU General Public License is contained in the file COPYING.
     31 */
     32 
     33 #include "pub_tool_basics.h"
     34 #include "pub_tool_libcassert.h"
     35 #include "pub_tool_libcbase.h"
     36 #include "pub_tool_libcprint.h"
     37 #include "pub_tool_mallocfree.h"
     38 #include "pub_tool_wordfm.h"
     39 #include "pub_tool_sparsewa.h"
     40 #include "pub_tool_xarray.h"
     41 #include "pub_tool_oset.h"
     42 #include "pub_tool_threadstate.h"
     43 #include "pub_tool_aspacemgr.h"
     44 #include "pub_tool_execontext.h"
     45 #include "pub_tool_errormgr.h"
     46 #include "pub_tool_options.h"        // VG_(clo_stats)
     47 #include "hg_basics.h"
     48 #include "hg_wordset.h"
     49 #include "hg_lock_n_thread.h"
     50 #include "hg_errors.h"
     51 
     52 #include "libhb.h"
     53 
     54 
     55 /////////////////////////////////////////////////////////////////
     56 /////////////////////////////////////////////////////////////////
     57 //                                                             //
     58 // Debugging #defines                                          //
     59 //                                                             //
     60 /////////////////////////////////////////////////////////////////
     61 /////////////////////////////////////////////////////////////////
     62 
     63 /* Check the sanity of shadow values in the core memory state
     64    machine.  Change #if 0 to #if 1 to enable this. */
     65 #if 0
     66 #  define CHECK_MSM 1
     67 #else
     68 #  define CHECK_MSM 0
     69 #endif
     70 
     71 
     72 /* Check sanity (reference counts, etc) in the conflicting access
     73    machinery.  Change #if 0 to #if 1 to enable this. */
     74 #if 0
     75 #  define CHECK_CEM 1
     76 #else
     77 #  define CHECK_CEM 0
     78 #endif
     79 
     80 
     81 /* Check sanity in the compressed shadow memory machinery,
     82    particularly in its caching innards.  Unfortunately there's no
     83    almost-zero-cost way to make them selectable at run time.  Hence
     84    set the #if 0 to #if 1 and rebuild if you want them. */
     85 #if 0
     86 #  define CHECK_ZSM 1  /* do sanity-check CacheLine stuff */
     87 #  define inline __attribute__((noinline))
     88    /* probably want to ditch -fomit-frame-pointer too */
     89 #else
     90 #  define CHECK_ZSM 0   /* don't sanity-check CacheLine stuff */
     91 #endif
     92 
     93 
     94 /////////////////////////////////////////////////////////////////
     95 /////////////////////////////////////////////////////////////////
     96 //                                                             //
     97 // Forward declarations                                        //
     98 //                                                             //
     99 /////////////////////////////////////////////////////////////////
    100 /////////////////////////////////////////////////////////////////
    101 
     102 /* Globals needed by other parts of the library.
     103    These are set once at startup and then never
     104    changed. */
    105 static void        (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL;
    106 static ExeContext* (*main_get_EC)( Thr* ) = NULL;
    107 
    108 
    109 
    110 /////////////////////////////////////////////////////////////////
    111 /////////////////////////////////////////////////////////////////
    112 //                                                             //
    113 // SECTION BEGIN compressed shadow memory                      //
    114 //                                                             //
    115 /////////////////////////////////////////////////////////////////
    116 /////////////////////////////////////////////////////////////////
    117 
    118 #ifndef __HB_ZSM_H
    119 #define __HB_ZSM_H
    120 
    121 typedef  ULong  SVal;
    122 
    123 /* This value has special significance to the implementation, and callers
    124    may not store it in the shadow memory. */
    125 #define SVal_INVALID (3ULL << 62)
    126 
    127 /* This is the default value for shadow memory.  Initially the shadow
    128    memory contains no accessible areas and so all reads produce this
     129    value.  TODO: make this caller-definable. */
    130 #define SVal_NOACCESS (2ULL << 62)
    131 
    132 /* Initialise the library.  Once initialised, it will (or may) call
    133    rcinc and rcdec in response to all the calls below, in order to
    134    allow the user to do reference counting on the SVals stored herein.
    135    It is important to understand, however, that due to internal
    136    caching, the reference counts are in general inaccurate, and can be
     137    either above or below the true reference count for an item.  In
    138    particular, the library may indicate that the reference count for
    139    an item is zero, when in fact it is not.
    140 
    141    To make the reference counting exact and therefore non-pointless,
    142    call zsm_flush_cache.  Immediately after it returns, the reference
    143    counts for all items, as deduced by the caller by observing calls
    144    to rcinc and rcdec, will be correct, and so any items with a zero
    145    reference count may be freed (or at least considered to be
    146    unreferenced by this library).
    147 */
    148 static void zsm_init ( void(*rcinc)(SVal), void(*rcdec)(SVal) );
    149 
    150 static void zsm_sset_range  ( Addr, SizeT, SVal );
    151 static void zsm_scopy_range ( Addr, Addr, SizeT );
    152 static void zsm_flush_cache ( void );
    153 
    154 #endif /* ! __HB_ZSM_H */
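
         /* Illustrative sketch (not part of the library): how a client of the
            zsm_* API above might register reference-counting callbacks and
            then force the counts to become exact before trusting them.  The
            example_client_* names and the callback bodies are hypothetical,
            chosen only for illustration. */
         #if 0
         static UWord example_client_live_svals = 0;

         static void example_client_rcinc ( SVal s ) { example_client_live_svals++; }
         static void example_client_rcdec ( SVal s ) { example_client_live_svals--; }

         static void example_client_setup ( void )
         {
            /* From here on the counts are maintained, but only approximately,
               because of the internal cache. */
            zsm_init( example_client_rcinc, example_client_rcdec );
            /* ... calls to zsm_sset_range / zsm_scopy_range ... */
            zsm_flush_cache();
            /* Now the count deduced from the rcinc/rcdec calls is exact. */
         }
         #endif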
    155 
    156 
    157 /* Round a up to the next multiple of N.  N must be a power of 2 */
     158 #define ROUNDUP(a, N)   (((a) + (N) - 1) & ~((N)-1))
     159 /* Round a down to the next multiple of N.  N must be a power of 2 */
     160 #define ROUNDDN(a, N)   ((a) & ~((N)-1))
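
         /* A quick worked example of the two macros, with N = 16 (a power
            of 2):
               ROUNDUP(0x1003, 16) == 0x1010
               ROUNDDN(0x1003, 16) == 0x1000
               ROUNDUP(0x1010, 16) == 0x1010   (already aligned, unchanged)
         */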
    161 
    162 
    163 
    164 /* ------ User-supplied RC functions ------ */
    165 static void(*rcinc)(SVal) = NULL;
    166 static void(*rcdec)(SVal) = NULL;
    167 
    168 
    169 /* ------ CacheLine ------ */
    170 
    171 #define N_LINE_BITS      6 /* must be >= 3 */
    172 #define N_LINE_ARANGE    (1 << N_LINE_BITS)
    173 #define N_LINE_TREES     (N_LINE_ARANGE >> 3)
    174 
    175 typedef
    176    struct {
    177       UShort descrs[N_LINE_TREES];
    178       SVal   svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8
    179    }
    180    CacheLine;
    181 
    182 #define TREE_DESCR_16_0 (1<<0)
    183 #define TREE_DESCR_32_0 (1<<1)
    184 #define TREE_DESCR_16_1 (1<<2)
    185 #define TREE_DESCR_64   (1<<3)
    186 #define TREE_DESCR_16_2 (1<<4)
    187 #define TREE_DESCR_32_1 (1<<5)
    188 #define TREE_DESCR_16_3 (1<<6)
    189 #define TREE_DESCR_8_0  (1<<7)
    190 #define TREE_DESCR_8_1  (1<<8)
    191 #define TREE_DESCR_8_2  (1<<9)
    192 #define TREE_DESCR_8_3  (1<<10)
    193 #define TREE_DESCR_8_4  (1<<11)
    194 #define TREE_DESCR_8_5  (1<<12)
    195 #define TREE_DESCR_8_6  (1<<13)
    196 #define TREE_DESCR_8_7  (1<<14)
    197 #define TREE_DESCR_DTY  (1<<15)
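
         /* Two example descriptor values, following directly from the bit
            assignments above: a tree holding a single 64-bit value has
            descr == TREE_DESCR_64 == 0x0008, whereas a tree split into eight
            individual bytes has descr == (TREE_DESCR_8_0 | TREE_DESCR_8_1
            | ... | TREE_DESCR_8_7) == 0x7F80. */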
    198 
    199 typedef
    200    struct {
    201       SVal  dict[4]; /* can represent up to 4 diff values in the line */
    202       UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit
    203                                       dict indexes */
    204       /* if dict[0] == SVal_INVALID then dict[1] is the index of the
    205          LineF to use, and dict[2..] are also SVal_INVALID. */
    206    }
    207    LineZ; /* compressed rep for a cache line */
    208 
    209 typedef
    210    struct {
    211       Bool inUse;
    212       SVal w64s[N_LINE_ARANGE];
    213    }
    214    LineF; /* full rep for a cache line */
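
         /* Rough size comparison (ignoring struct padding): with
            N_LINE_ARANGE == 64, a LineZ needs 4 * 8 bytes of dict plus
            64/4 == 16 bytes of 2-bit indexes, about 48 bytes in all, whereas
            a LineF needs 64 * 8 == 512 bytes of SVals.  So lines with at most
            4 distinct values are stored roughly 10 times more compactly. */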
    215 
    216 /* Shadow memory.
    217    Primary map is a WordFM Addr SecMap*.
    218    SecMaps cover some page-size-ish section of address space and hold
    219      a compressed representation.
    220    CacheLine-sized chunks of SecMaps are copied into a Cache, being
    221    decompressed when moved into the cache and recompressed on the
    222    way out.  Because of this, the cache must operate as a writeback
    223    cache, not a writethrough one.
    224 
    225    Each SecMap must hold a power-of-2 number of CacheLines.  Hence
     226    N_SECMAP_BITS must be >= N_LINE_BITS.
    227 */
    228 #define N_SECMAP_BITS   13
    229 #define N_SECMAP_ARANGE (1 << N_SECMAP_BITS)
    230 
    231 // # CacheLines held by a SecMap
    232 #define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE)
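
         /* In concrete terms: N_SECMAP_ARANGE == 1 << 13 == 8192, so each
            SecMap covers 8192 bytes of guest address space and holds
            N_SECMAP_ZLINES == 8192 / 64 == 128 LineZs. */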
    233 
    234 /* The data in the SecMap is held in the array of LineZs.  Each LineZ
    235    either carries the required data directly, in a compressed
    236    representation, or it holds (in .dict[0]) an index to the LineF in
    237    .linesF that holds the full representation.
    238 
    239    Currently-unused LineF's have their .inUse bit set to zero.
     240    Since each in-use LineF is referred to by exactly one LineZ,
    241    the number of .linesZ[] that refer to .linesF should equal
    242    the number of .linesF[] that have .inUse == True.
    243 
    244    RC obligations: the RCs presented to the user include exactly
    245    the values in:
    246    * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID
    247    * F reps that are in use (.inUse == True)
    248 
    249    Hence the following actions at the following transitions are required:
    250 
    251    F rep: .inUse==True  -> .inUse==False        -- rcdec_LineF
    252    F rep: .inUse==False -> .inUse==True         -- rcinc_LineF
    253    Z rep: .dict[0] from other to SVal_INVALID   -- rcdec_LineZ
    254    Z rep: .dict[0] from SVal_INVALID to other   -- rcinc_LineZ
    255 */
    256 typedef
    257    struct {
    258       UInt   magic;
    259       LineZ  linesZ[N_SECMAP_ZLINES];
    260       LineF* linesF;
    261       UInt   linesF_size;
    262    }
    263    SecMap;
    264 
    265 #define SecMap_MAGIC   0x571e58cbU
    266 
    267 static inline Bool is_sane_SecMap ( SecMap* sm ) {
    268    return sm != NULL && sm->magic == SecMap_MAGIC;
    269 }
    270 
    271 /* ------ Cache ------ */
    272 
    273 #define N_WAY_BITS 16
    274 #define N_WAY_NENT (1 << N_WAY_BITS)
    275 
    276 /* Each tag is the address of the associated CacheLine, rounded down
     277    to a CacheLine address boundary.  The CacheLine size must be a
     278    power of 2 and must be 8 or more.  Hence an easy way to initialise
     279    the cache so it is empty is to set all the tag values to any value
     280    that is not a multiple of 8, e.g. 1.  This means all queries in the
     281    cache initially miss.  It does however require us to detect, and not
     282    write back, any line with a bogus tag. */
    283 typedef
    284    struct {
    285       CacheLine lyns0[N_WAY_NENT];
    286       Addr      tags0[N_WAY_NENT];
    287    }
    288    Cache;
    289 
    290 static inline Bool is_valid_scache_tag ( Addr tag ) {
    291    /* a valid tag should be naturally aligned to the start of
    292       a CacheLine. */
    293    return 0 == (tag & (N_LINE_ARANGE - 1));
    294 }
    295 
    296 
    297 /* --------- Primary data structures --------- */
    298 
    299 /* Shadow memory primary map */
    300 static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */
    301 static Cache   cache_shmem;
    302 
    303 
    304 static UWord stats__secmaps_search       = 0; // # SM finds
    305 static UWord stats__secmaps_search_slow  = 0; // # SM lookupFMs
    306 static UWord stats__secmaps_allocd       = 0; // # SecMaps issued
    307 static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered
    308 static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued
    309 static UWord stats__secmap_linesZ_bytes  = 0; // .. using this much storage
    310 static UWord stats__secmap_linesF_allocd = 0; // # LineF's issued
    311 static UWord stats__secmap_linesF_bytes  = 0; //  .. using this much storage
    312 static UWord stats__secmap_iterator_steppings = 0; // # calls to stepSMIter
    313 static UWord stats__cache_Z_fetches      = 0; // # Z lines fetched
    314 static UWord stats__cache_Z_wbacks       = 0; // # Z lines written back
    315 static UWord stats__cache_F_fetches      = 0; // # F lines fetched
    316 static UWord stats__cache_F_wbacks       = 0; // # F lines written back
    317 static UWord stats__cache_invals         = 0; // # cache invals
    318 static UWord stats__cache_flushes        = 0; // # cache flushes
    319 static UWord stats__cache_totrefs        = 0; // # total accesses
    320 static UWord stats__cache_totmisses      = 0; // # misses
    321 static ULong stats__cache_make_New_arange = 0; // total arange made New
    322 static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps
    323 static UWord stats__cline_normalises     = 0; // # calls to cacheline_normalise
    324 static UWord stats__cline_cread64s       = 0; // # calls to s_m_read64
    325 static UWord stats__cline_cread32s       = 0; // # calls to s_m_read32
    326 static UWord stats__cline_cread16s       = 0; // # calls to s_m_read16
    327 static UWord stats__cline_cread08s       = 0; // # calls to s_m_read8
    328 static UWord stats__cline_cwrite64s      = 0; // # calls to s_m_write64
    329 static UWord stats__cline_cwrite32s      = 0; // # calls to s_m_write32
    330 static UWord stats__cline_cwrite16s      = 0; // # calls to s_m_write16
    331 static UWord stats__cline_cwrite08s      = 0; // # calls to s_m_write8
     332 static UWord stats__cline_sread08s       = 0; // # 8-bit shadow value reads
     333 static UWord stats__cline_swrite08s      = 0; // # 8-bit shadow value writes
     334 static UWord stats__cline_swrite16s      = 0; // # 16-bit shadow value writes
     335 static UWord stats__cline_swrite32s      = 0; // # 32-bit shadow value writes
     336 static UWord stats__cline_swrite64s      = 0; // # 64-bit shadow value writes
    337 static UWord stats__cline_scopy08s       = 0; // # calls to s_m_copy8
    338 static UWord stats__cline_64to32splits   = 0; // # 64-bit accesses split
    339 static UWord stats__cline_32to16splits   = 0; // # 32-bit accesses split
    340 static UWord stats__cline_16to8splits    = 0; // # 16-bit accesses split
    341 static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32
    342 static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16
    343 static UWord stats__cline_16to8pulldown  = 0; // # calls to pulldown_to_8
    344 static UWord stats__vts__tick            = 0; // # calls to VTS__tick
    345 static UWord stats__vts__join            = 0; // # calls to VTS__join
    346 static UWord stats__vts__cmpLEQ          = 0; // # calls to VTS__cmpLEQ
    347 static UWord stats__vts__cmp_structural  = 0; // # calls to VTS__cmp_structural
    348 static UWord stats__vts__cmp_structural_slow = 0; // # calls to VTS__cmp_structural w/ slow case
    349 static UWord stats__vts__indexat_slow    = 0; // # calls to VTS__indexAt_SLOW
    350 static UWord stats__vts_set__fadoa       = 0; // # calls to vts_set__find_and_dealloc__or_add
    351 static UWord stats__vts_set__fadoa_d     = 0; // # calls to vts_set__find_and_dealloc__or_add
    352                                               // that lead to a deallocation
    353 
    354 
    355 static inline Addr shmem__round_to_SecMap_base ( Addr a ) {
    356    return a & ~(N_SECMAP_ARANGE - 1);
    357 }
    358 static inline UWord shmem__get_SecMap_offset ( Addr a ) {
    359    return a & (N_SECMAP_ARANGE - 1);
    360 }
    361 
    362 
    363 /*----------------------------------------------------------------*/
    364 /*--- map_shmem :: WordFM Addr SecMap                          ---*/
    365 /*--- shadow memory (low level handlers) (shmem__* fns)        ---*/
    366 /*----------------------------------------------------------------*/
    367 
    368 /*--------------- SecMap allocation --------------- */
    369 
    370 static HChar* shmem__bigchunk_next = NULL;
    371 static HChar* shmem__bigchunk_end1 = NULL;
    372 
    373 static void* shmem__bigchunk_alloc ( SizeT n )
    374 {
    375    const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4;
    376    tl_assert(n > 0);
    377    n = VG_ROUNDUP(n, 16);
    378    tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1);
    379    tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next
    380              <= (SSizeT)sHMEM__BIGCHUNK_SIZE);
    381    if (shmem__bigchunk_next + n > shmem__bigchunk_end1) {
    382       if (0)
    383       VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n",
    384                   (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next));
    385       shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE );
    386       if (shmem__bigchunk_next == NULL)
    387          VG_(out_of_memory_NORETURN)(
    388             "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE );
    389       shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE;
    390    }
    391    tl_assert(shmem__bigchunk_next);
    392    tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) );
    393    tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1);
    394    shmem__bigchunk_next += n;
    395    return shmem__bigchunk_next - n;
    396 }
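
         /* Note: sHMEM__BIGCHUNK_SIZE == 4096 * 256 * 4 == 4 MiB.  SecMaps are
            therefore carved out of 4 MiB slabs obtained from the address space
            manager; each allocation is rounded up to a multiple of 16 and is
            16-aligned, and at most one partially-used slab tail is abandoned
            per new slab. */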
    397 
    398 static SecMap* shmem__alloc_SecMap ( void )
    399 {
    400    Word    i, j;
    401    SecMap* sm = shmem__bigchunk_alloc( sizeof(SecMap) );
    402    if (0) VG_(printf)("alloc_SecMap %p\n",sm);
    403    tl_assert(sm);
    404    sm->magic = SecMap_MAGIC;
    405    for (i = 0; i < N_SECMAP_ZLINES; i++) {
    406       sm->linesZ[i].dict[0] = SVal_NOACCESS;
    407       sm->linesZ[i].dict[1] = SVal_INVALID;
    408       sm->linesZ[i].dict[2] = SVal_INVALID;
    409       sm->linesZ[i].dict[3] = SVal_INVALID;
    410       for (j = 0; j < N_LINE_ARANGE/4; j++)
    411          sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */
    412    }
    413    sm->linesF      = NULL;
    414    sm->linesF_size = 0;
    415    stats__secmaps_allocd++;
    416    stats__secmap_ga_space_covered += N_SECMAP_ARANGE;
    417    stats__secmap_linesZ_allocd += N_SECMAP_ZLINES;
    418    stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ);
    419    return sm;
    420 }
    421 
    422 typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt;
    423 static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} };
    424 
    425 static SecMap* shmem__find_SecMap ( Addr ga )
    426 {
    427    SecMap* sm    = NULL;
    428    Addr    gaKey = shmem__round_to_SecMap_base(ga);
    429    // Cache
    430    stats__secmaps_search++;
    431    if (LIKELY(gaKey == smCache[0].gaKey))
    432       return smCache[0].sm;
    433    if (LIKELY(gaKey == smCache[1].gaKey)) {
    434       SMCacheEnt tmp = smCache[0];
    435       smCache[0] = smCache[1];
    436       smCache[1] = tmp;
    437       return smCache[0].sm;
    438    }
    439    if (gaKey == smCache[2].gaKey) {
    440       SMCacheEnt tmp = smCache[1];
    441       smCache[1] = smCache[2];
    442       smCache[2] = tmp;
    443       return smCache[1].sm;
    444    }
    445    // end Cache
    446    stats__secmaps_search_slow++;
    447    if (VG_(lookupFM)( map_shmem,
    448                       NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) {
    449       tl_assert(sm != NULL);
    450       smCache[2] = smCache[1];
    451       smCache[1] = smCache[0];
    452       smCache[0].gaKey = gaKey;
    453       smCache[0].sm    = sm;
    454    } else {
    455       tl_assert(sm == NULL);
    456    }
    457    return sm;
    458 }
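
         /* The 3-entry smCache above is a small move-towards-front scheme: a
            hit on entry 1 swaps entries 0 and 1, a hit on entry 2 swaps
            entries 1 and 2, and a successful slow lookup shifts all entries
            down one place and installs the new (gaKey, sm) pair at entry 0.
            The initial gaKey value of 1 can never match a real key, since
            real keys are SecMap-base aligned. */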
    459 
    460 static SecMap* shmem__find_or_alloc_SecMap ( Addr ga )
    461 {
    462    SecMap* sm = shmem__find_SecMap ( ga );
    463    if (LIKELY(sm)) {
    464       return sm;
    465    } else {
    466       /* create a new one */
    467       Addr gaKey = shmem__round_to_SecMap_base(ga);
    468       sm = shmem__alloc_SecMap();
    469       tl_assert(sm);
    470       VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm );
    471       return sm;
    472    }
    473 }
    474 
    475 
    476 /* ------------ LineF and LineZ related ------------ */
    477 
    478 static void rcinc_LineF ( LineF* lineF ) {
    479    UWord i;
    480    tl_assert(lineF->inUse);
    481    for (i = 0; i < N_LINE_ARANGE; i++)
    482       rcinc(lineF->w64s[i]);
    483 }
    484 
    485 static void rcdec_LineF ( LineF* lineF ) {
    486    UWord i;
    487    tl_assert(lineF->inUse);
    488    for (i = 0; i < N_LINE_ARANGE; i++)
    489       rcdec(lineF->w64s[i]);
    490 }
    491 
    492 static void rcinc_LineZ ( LineZ* lineZ ) {
    493    tl_assert(lineZ->dict[0] != SVal_INVALID);
    494    rcinc(lineZ->dict[0]);
    495    if (lineZ->dict[1] != SVal_INVALID) rcinc(lineZ->dict[1]);
    496    if (lineZ->dict[2] != SVal_INVALID) rcinc(lineZ->dict[2]);
    497    if (lineZ->dict[3] != SVal_INVALID) rcinc(lineZ->dict[3]);
    498 }
    499 
    500 static void rcdec_LineZ ( LineZ* lineZ ) {
    501    tl_assert(lineZ->dict[0] != SVal_INVALID);
    502    rcdec(lineZ->dict[0]);
    503    if (lineZ->dict[1] != SVal_INVALID) rcdec(lineZ->dict[1]);
    504    if (lineZ->dict[2] != SVal_INVALID) rcdec(lineZ->dict[2]);
    505    if (lineZ->dict[3] != SVal_INVALID) rcdec(lineZ->dict[3]);
    506 }
    507 
    508 inline
    509 static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) {
    510    Word bix, shft, mask, prep;
    511    tl_assert(ix >= 0);
    512    bix  = ix >> 2;
    513    shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
    514    mask = 3 << shft;
    515    prep = b2 << shft;
    516    arr[bix] = (arr[bix] & ~mask) | prep;
    517 }
    518 
    519 inline
    520 static UWord read_twobit_array ( UChar* arr, UWord ix ) {
    521    Word bix, shft;
    522    tl_assert(ix >= 0);
    523    bix  = ix >> 2;
    524    shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */
    525    return (arr[bix] >> shft) & 3;
    526 }
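
         /* Worked example: for ix == 5, bix == 5 >> 2 == 1 and
            shft == 2 * (5 & 3) == 2.  So write_twobit_array(arr, 5, 3) sets
            bits 3:2 of arr[1] to 3 (binary 11), and read_twobit_array(arr, 5)
            reads them back. */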
    527 
    528 /* Given address 'tag', find either the Z or F line containing relevant
    529    data, so it can be read into the cache.
    530 */
    531 static void find_ZF_for_reading ( /*OUT*/LineZ** zp,
    532                                   /*OUT*/LineF** fp, Addr tag ) {
    533    LineZ* lineZ;
    534    LineF* lineF;
    535    UWord   zix;
    536    SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
    537    UWord   smoff = shmem__get_SecMap_offset(tag);
    538    /* since smoff is derived from a valid tag, it should be
    539       cacheline-aligned. */
    540    tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
    541    zix = smoff >> N_LINE_BITS;
    542    tl_assert(zix < N_SECMAP_ZLINES);
    543    lineZ = &sm->linesZ[zix];
    544    lineF = NULL;
    545    if (lineZ->dict[0] == SVal_INVALID) {
    546       UInt fix = (UInt)lineZ->dict[1];
    547       tl_assert(sm->linesF);
    548       tl_assert(sm->linesF_size > 0);
    549       tl_assert(fix >= 0 && fix < sm->linesF_size);
    550       lineF = &sm->linesF[fix];
    551       tl_assert(lineF->inUse);
    552       lineZ = NULL;
    553    }
    554    *zp = lineZ;
    555    *fp = lineF;
    556 }
    557 
    558 /* Given address 'tag', return the relevant SecMap and the index of
    559    the LineZ within it, in the expectation that the line is to be
     560    overwritten.  Regardless of whether 'tag' is currently associated
     561    with a Z or F representation, rcdec the current representation,
     562    in recognition of the fact that the contents are just about to be
     563    overwritten. */
    564 static __attribute__((noinline))
    565 void find_Z_for_writing ( /*OUT*/SecMap** smp,
    566                           /*OUT*/Word* zixp,
    567                           Addr tag ) {
    568    LineZ* lineZ;
    569    LineF* lineF;
    570    UWord   zix;
    571    SecMap* sm    = shmem__find_or_alloc_SecMap(tag);
    572    UWord   smoff = shmem__get_SecMap_offset(tag);
    573    /* since smoff is derived from a valid tag, it should be
    574       cacheline-aligned. */
    575    tl_assert(0 == (smoff & (N_LINE_ARANGE - 1)));
    576    zix = smoff >> N_LINE_BITS;
    577    tl_assert(zix < N_SECMAP_ZLINES);
    578    lineZ = &sm->linesZ[zix];
    579    lineF = NULL;
    580    /* re RCs, we are freeing up this LineZ/LineF so that new data can
    581       be parked in it.  Hence have to rcdec it accordingly. */
    582    /* If lineZ has an associated lineF, free it up. */
    583    if (lineZ->dict[0] == SVal_INVALID) {
    584       UInt fix = (UInt)lineZ->dict[1];
    585       tl_assert(sm->linesF);
    586       tl_assert(sm->linesF_size > 0);
    587       tl_assert(fix >= 0 && fix < sm->linesF_size);
    588       lineF = &sm->linesF[fix];
    589       tl_assert(lineF->inUse);
    590       rcdec_LineF(lineF);
    591       lineF->inUse = False;
    592    } else {
    593       rcdec_LineZ(lineZ);
    594    }
    595    *smp  = sm;
    596    *zixp = zix;
    597 }
    598 
    599 static __attribute__((noinline))
    600 void alloc_F_for_writing ( /*MOD*/SecMap* sm, /*OUT*/Word* fixp ) {
    601    UInt        i, new_size;
    602    LineF* nyu;
    603 
    604    if (sm->linesF) {
    605       tl_assert(sm->linesF_size > 0);
    606    } else {
    607       tl_assert(sm->linesF_size == 0);
    608    }
    609 
    610    if (sm->linesF) {
    611       for (i = 0; i < sm->linesF_size; i++) {
    612          if (!sm->linesF[i].inUse) {
    613             *fixp = (Word)i;
    614             return;
    615          }
    616       }
    617    }
    618 
    619    /* No free F line found.  Expand existing array and try again. */
    620    new_size = sm->linesF_size==0 ? 1 : 2 * sm->linesF_size;
    621    nyu      = HG_(zalloc)( "libhb.aFfw.1 (LineF storage)",
    622                            new_size * sizeof(LineF) );
    623    tl_assert(nyu);
    624 
    625    stats__secmap_linesF_allocd += (new_size - sm->linesF_size);
    626    stats__secmap_linesF_bytes  += (new_size - sm->linesF_size)
    627                                   * sizeof(LineF);
    628 
    629    if (0)
    630    VG_(printf)("SM %p: expand F array from %d to %d\n",
    631                sm, (Int)sm->linesF_size, new_size);
    632 
    633    for (i = 0; i < new_size; i++)
    634       nyu[i].inUse = False;
    635 
    636    if (sm->linesF) {
    637       for (i = 0; i < sm->linesF_size; i++) {
    638          tl_assert(sm->linesF[i].inUse);
    639          nyu[i] = sm->linesF[i];
    640       }
    641       VG_(memset)(sm->linesF, 0, sm->linesF_size * sizeof(LineF) );
    642       HG_(free)(sm->linesF);
    643    }
    644 
    645    sm->linesF      = nyu;
    646    sm->linesF_size = new_size;
    647 
    648    for (i = 0; i < sm->linesF_size; i++) {
    649       if (!sm->linesF[i].inUse) {
    650          *fixp = (Word)i;
    651          return;
    652       }
     653    }
     654 
     655    /*NOTREACHED*/
     656    tl_assert(0);
    657 }
    658 
    659 
    660 /* ------------ CacheLine and implicit-tree related ------------ */
    661 
    662 __attribute__((unused))
    663 static void pp_CacheLine ( CacheLine* cl ) {
    664    Word i;
    665    if (!cl) {
    666       VG_(printf)("%s","pp_CacheLine(NULL)\n");
    667       return;
    668    }
    669    for (i = 0; i < N_LINE_TREES; i++)
    670       VG_(printf)("   descr: %04lx\n", (UWord)cl->descrs[i]);
    671    for (i = 0; i < N_LINE_ARANGE; i++)
    672       VG_(printf)("    sval: %08lx\n", (UWord)cl->svals[i]);
    673 }
    674 
    675 static UChar descr_to_validbits ( UShort descr )
    676 {
    677    /* a.k.a Party Time for gcc's constant folder */
    678 #  define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \
    679                 b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0)  \
    680              ( (UShort) ( ( (b8_7)  << 14) | ( (b8_6)  << 13) | \
    681                           ( (b8_5)  << 12) | ( (b8_4)  << 11) | \
    682                           ( (b8_3)  << 10) | ( (b8_2)  << 9)  | \
    683                           ( (b8_1)  << 8)  | ( (b8_0)  << 7)  | \
    684                           ( (b16_3) << 6)  | ( (b32_1) << 5)  | \
    685                           ( (b16_2) << 4)  | ( (b64)   << 3)  | \
    686                           ( (b16_1) << 2)  | ( (b32_0) << 1)  | \
    687                           ( (b16_0) << 0) ) )
    688 
    689 #  define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \
    690              ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \
    691                          ( (bit5) << 5) | ( (bit4) << 4) | \
    692                          ( (bit3) << 3) | ( (bit2) << 2) | \
    693                          ( (bit1) << 1) | ( (bit0) << 0) ) )
    694 
    695    /* these should all get folded out at compile time */
    696    tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7);
    697    tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0);
    698    tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3);
    699    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1);
    700    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2);
    701    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64);
    702    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1);
    703    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0);
    704    tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0);
    705 
    706    switch (descr) {
    707    /*
    708               +--------------------------------- TREE_DESCR_8_7
    709               |             +------------------- TREE_DESCR_8_0
    710               |             |  +---------------- TREE_DESCR_16_3
    711               |             |  | +-------------- TREE_DESCR_32_1
    712               |             |  | | +------------ TREE_DESCR_16_2
    713               |             |  | | |  +--------- TREE_DESCR_64
    714               |             |  | | |  |  +------ TREE_DESCR_16_1
    715               |             |  | | |  |  | +---- TREE_DESCR_32_0
    716               |             |  | | |  |  | | +-- TREE_DESCR_16_0
    717               |             |  | | |  |  | | |
    718               |             |  | | |  |  | | |   GRANULARITY, 7 -> 0 */
    719    case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8  8 8 8 8 */
    720                                                  return BYTE(1,1,1,1,1,1,1,1);
    721    case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16   8 8 8 8 */
    722                                                  return BYTE(1,1,0,1,1,1,1,1);
    723    case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16  8 8  8 8 8 8 */
    724                                                  return BYTE(0,1,1,1,1,1,1,1);
    725    case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16  16   8 8 8 8 */
    726                                                  return BYTE(0,1,0,1,1,1,1,1);
    727 
    728    case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8  8 8 16 */
    729                                                  return BYTE(1,1,1,1,1,1,0,1);
    730    case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16   8 8 16 */
    731                                                  return BYTE(1,1,0,1,1,1,0,1);
    732    case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16  8 8  8 8 16 */
    733                                                  return BYTE(0,1,1,1,1,1,0,1);
    734    case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16  16   8 8 16 */
    735                                                  return BYTE(0,1,0,1,1,1,0,1);
    736 
    737    case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8  16 8 8 */
    738                                                  return BYTE(1,1,1,1,0,1,1,1);
    739    case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16   16 8 8 */
    740                                                  return BYTE(1,1,0,1,0,1,1,1);
    741    case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16  8 8  16 8 8 */
    742                                                  return BYTE(0,1,1,1,0,1,1,1);
    743    case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16  16   16 8 8 */
    744                                                  return BYTE(0,1,0,1,0,1,1,1);
    745 
    746    case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8  16 16 */
    747                                                  return BYTE(1,1,1,1,0,1,0,1);
    748    case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16   16 16 */
    749                                                  return BYTE(1,1,0,1,0,1,0,1);
    750    case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16  8 8  16 16 */
    751                                                  return BYTE(0,1,1,1,0,1,0,1);
    752    case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16  16   16 16 */
    753                                                  return BYTE(0,1,0,1,0,1,0,1);
    754 
    755    case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32  8 8 8 8 */
    756                                                  return BYTE(0,0,0,1,1,1,1,1);
    757    case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32  8 8 16  */
    758                                                  return BYTE(0,0,0,1,1,1,0,1);
    759    case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32  16  8 8 */
    760                                                  return BYTE(0,0,0,1,0,1,1,1);
    761    case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32  16  16  */
    762                                                  return BYTE(0,0,0,1,0,1,0,1);
    763 
    764    case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8  32 */
    765                                                  return BYTE(1,1,1,1,0,0,0,1);
    766    case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16   32 */
    767                                                  return BYTE(1,1,0,1,0,0,0,1);
    768    case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16  8 8  32 */
    769                                                  return BYTE(0,1,1,1,0,0,0,1);
    770    case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16  16   32 */
    771                                                  return BYTE(0,1,0,1,0,0,0,1);
    772 
    773    case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */
    774                                                  return BYTE(0,0,0,1,0,0,0,1);
    775 
    776    case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */
    777                                                  return BYTE(0,0,0,0,0,0,0,1);
    778 
    779    default: return BYTE(0,0,0,0,0,0,0,0);
    780                    /* INVALID - any valid descr produces at least one
     781                       valid bit in tree[0..7] */
    782    }
    783    /* NOTREACHED*/
    784    tl_assert(0);
    785 
    786 #  undef DESCR
    787 #  undef BYTE
    788 }
    789 
    790 __attribute__((unused))
    791 static Bool is_sane_Descr ( UShort descr ) {
    792    return descr_to_validbits(descr) != 0;
    793 }
    794 
    795 static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) {
    796    VG_(sprintf)(dst,
    797                 "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d",
    798                 (Int)((descr & TREE_DESCR_8_7) ? 1 : 0),
    799                 (Int)((descr & TREE_DESCR_8_6) ? 1 : 0),
    800                 (Int)((descr & TREE_DESCR_8_5) ? 1 : 0),
    801                 (Int)((descr & TREE_DESCR_8_4) ? 1 : 0),
    802                 (Int)((descr & TREE_DESCR_8_3) ? 1 : 0),
    803                 (Int)((descr & TREE_DESCR_8_2) ? 1 : 0),
    804                 (Int)((descr & TREE_DESCR_8_1) ? 1 : 0),
    805                 (Int)((descr & TREE_DESCR_8_0) ? 1 : 0),
    806                 (Int)((descr & TREE_DESCR_16_3) ? 1 : 0),
    807                 (Int)((descr & TREE_DESCR_32_1) ? 1 : 0),
    808                 (Int)((descr & TREE_DESCR_16_2) ? 1 : 0),
    809                 (Int)((descr & TREE_DESCR_64)   ? 1 : 0),
    810                 (Int)((descr & TREE_DESCR_16_1) ? 1 : 0),
    811                 (Int)((descr & TREE_DESCR_32_0) ? 1 : 0),
    812                 (Int)((descr & TREE_DESCR_16_0) ? 1 : 0)
    813    );
    814 }
    815 static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) {
    816    VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d",
    817                      (Int)((byte & 128) ? 1 : 0),
    818                      (Int)((byte &  64) ? 1 : 0),
    819                      (Int)((byte &  32) ? 1 : 0),
    820                      (Int)((byte &  16) ? 1 : 0),
    821                      (Int)((byte &   8) ? 1 : 0),
    822                      (Int)((byte &   4) ? 1 : 0),
    823                      (Int)((byte &   2) ? 1 : 0),
    824                      (Int)((byte &   1) ? 1 : 0)
    825    );
    826 }
    827 
    828 static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) {
    829    Word  i;
    830    UChar validbits = descr_to_validbits(descr);
    831    HChar buf[128], buf2[128];
    832    if (validbits == 0)
    833       goto bad;
    834    for (i = 0; i < 8; i++) {
    835       if (validbits & (1<<i)) {
    836          if (tree[i] == SVal_INVALID)
    837             goto bad;
    838       } else {
    839          if (tree[i] != SVal_INVALID)
    840             goto bad;
    841       }
    842    }
    843    return True;
    844   bad:
    845    sprintf_Descr( buf, descr );
    846    sprintf_Byte( buf2, validbits );
    847    VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n");
    848    VG_(printf)("   validbits 0x%02lx    %s\n", (UWord)validbits, buf2);
    849    VG_(printf)("       descr 0x%04lx  %s\n", (UWord)descr, buf);
    850    for (i = 0; i < 8; i++)
    851       VG_(printf)("   [%ld] 0x%016llx\n", i, tree[i]);
    852    VG_(printf)("%s","}\n");
     853    return False;
    854 }
    855 
    856 static Bool is_sane_CacheLine ( CacheLine* cl )
    857 {
    858    Word tno, cloff;
    859 
    860    if (!cl) goto bad;
    861 
    862    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
    863       UShort descr = cl->descrs[tno];
    864       SVal*  tree  = &cl->svals[cloff];
    865       if (!is_sane_Descr_and_Tree(descr, tree))
    866          goto bad;
    867    }
    868    tl_assert(cloff == N_LINE_ARANGE);
    869    return True;
    870   bad:
    871    pp_CacheLine(cl);
    872    return False;
    873 }
    874 
    875 static UShort normalise_tree ( /*MOD*/SVal* tree )
    876 {
    877    UShort descr;
    878    /* pre: incoming tree[0..7] does not have any invalid shvals, in
    879       particular no zeroes. */
    880    if (UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID
    881                 || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID
    882                 || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID
    883                 || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID))
    884       tl_assert(0);
    885 
    886    descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5
    887            | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2
    888            | TREE_DESCR_8_1 | TREE_DESCR_8_0;
    889    /* build 16-bit layer */
    890    if (tree[1] == tree[0]) {
    891       tree[1] = SVal_INVALID;
    892       descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0);
    893       descr |= TREE_DESCR_16_0;
    894    }
    895    if (tree[3] == tree[2]) {
    896       tree[3] = SVal_INVALID;
    897       descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2);
    898       descr |= TREE_DESCR_16_1;
    899    }
    900    if (tree[5] == tree[4]) {
    901       tree[5] = SVal_INVALID;
    902       descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4);
    903       descr |= TREE_DESCR_16_2;
    904    }
    905    if (tree[7] == tree[6]) {
    906       tree[7] = SVal_INVALID;
    907       descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6);
    908       descr |= TREE_DESCR_16_3;
    909    }
    910    /* build 32-bit layer */
    911    if (tree[2] == tree[0]
    912        && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) {
    913       tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */
    914       descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0);
    915       descr |= TREE_DESCR_32_0;
    916    }
    917    if (tree[6] == tree[4]
    918        && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) {
    919       tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */
    920       descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2);
    921       descr |= TREE_DESCR_32_1;
    922    }
    923    /* build 64-bit layer */
    924    if (tree[4] == tree[0]
    925        && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) {
    926       tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */
    927       descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0);
    928       descr |= TREE_DESCR_64;
    929    }
    930    return descr;
    931 }
    932 
    933 /* This takes a cacheline where all the data is at the leaves
     934    (svals[..]) and builds a correctly normalised tree. */
    935 static void normalise_CacheLine ( /*MOD*/CacheLine* cl )
    936 {
    937    Word tno, cloff;
    938    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
    939       SVal* tree = &cl->svals[cloff];
    940       cl->descrs[tno] = normalise_tree( tree );
    941    }
    942    tl_assert(cloff == N_LINE_ARANGE);
    943    if (CHECK_ZSM)
    944       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
    945    stats__cline_normalises++;
    946 }
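
         /* Example of the effect of normalisation: if all eight leaves of a
            tree hold the same SVal, the 16-, 32- and 64-bit merging steps all
            fire, so the resulting descr is just TREE_DESCR_64, tree[1..7] are
            set to SVal_INVALID, and tree[0] is left as the single
            representative value. */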
    947 
    948 
    949 typedef struct { UChar count; SVal sval; } CountedSVal;
    950 
    951 static
    952 void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst,
    953                                /*OUT*/Word* dstUsedP,
    954                                Word nDst, CacheLine* src )
    955 {
    956    Word  tno, cloff, dstUsed;
    957 
    958    tl_assert(nDst == N_LINE_ARANGE);
    959    dstUsed = 0;
    960 
    961    for (tno = 0, cloff = 0;  tno < N_LINE_TREES;  tno++, cloff += 8) {
    962       UShort descr = src->descrs[tno];
    963       SVal*  tree  = &src->svals[cloff];
    964 
    965       /* sequentialise the tree described by (descr,tree). */
    966 #     define PUT(_n,_v)                                \
    967          do { dst[dstUsed  ].count = (_n);             \
    968               dst[dstUsed++].sval  = (_v);             \
    969          } while (0)
    970 
    971       /* byte 0 */
    972       if (descr & TREE_DESCR_64)   PUT(8, tree[0]); else
    973       if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else
    974       if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else
    975       if (descr & TREE_DESCR_8_0)  PUT(1, tree[0]);
    976       /* byte 1 */
    977       if (descr & TREE_DESCR_8_1)  PUT(1, tree[1]);
    978       /* byte 2 */
    979       if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else
    980       if (descr & TREE_DESCR_8_2)  PUT(1, tree[2]);
    981       /* byte 3 */
    982       if (descr & TREE_DESCR_8_3)  PUT(1, tree[3]);
    983       /* byte 4 */
    984       if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else
    985       if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else
    986       if (descr & TREE_DESCR_8_4)  PUT(1, tree[4]);
    987       /* byte 5 */
    988       if (descr & TREE_DESCR_8_5)  PUT(1, tree[5]);
    989       /* byte 6 */
    990       if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else
    991       if (descr & TREE_DESCR_8_6)  PUT(1, tree[6]);
    992       /* byte 7 */
    993       if (descr & TREE_DESCR_8_7)  PUT(1, tree[7]);
    994 
    995 #     undef PUT
    996       /* END sequentialise the tree described by (descr,tree). */
    997 
    998    }
    999    tl_assert(cloff == N_LINE_ARANGE);
   1000    tl_assert(dstUsed <= nDst);
   1001 
   1002    *dstUsedP = dstUsed;
   1003 }
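
         /* For instance, a tree whose descr is just TREE_DESCR_64 contributes
            a single (count == 8, sval == tree[0]) entry to dst, while a fully
            split tree contributes eight count == 1 entries; for a sane line
            the counts summed over all trees equal N_LINE_ARANGE. */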
   1004 
   1005 /* Write the cacheline 'wix' to backing store.  Where it ends up
   1006    is determined by its tag field. */
   1007 static __attribute__((noinline)) void cacheline_wback ( UWord wix )
   1008 {
   1009    Word        i, j, k, m;
   1010    Addr        tag;
   1011    SecMap*     sm;
   1012    CacheLine*  cl;
   1013    LineZ* lineZ;
   1014    LineF* lineF;
   1015    Word        zix, fix, csvalsUsed;
   1016    CountedSVal csvals[N_LINE_ARANGE];
   1017    SVal        sv;
   1018 
   1019    if (0)
   1020    VG_(printf)("scache wback line %d\n", (Int)wix);
   1021 
   1022    tl_assert(wix >= 0 && wix < N_WAY_NENT);
   1023 
   1024    tag =  cache_shmem.tags0[wix];
   1025    cl  = &cache_shmem.lyns0[wix];
   1026 
   1027    /* The cache line may have been invalidated; if so, ignore it. */
   1028    if (!is_valid_scache_tag(tag))
   1029       return;
   1030 
   1031    /* Where are we going to put it? */
   1032    sm         = NULL;
   1033    lineZ      = NULL;
   1034    lineF      = NULL;
   1035    zix = fix = -1;
   1036 
   1037    /* find the Z line to write in and rcdec it or the associated F
   1038       line. */
   1039    find_Z_for_writing( &sm, &zix, tag );
   1040 
   1041    tl_assert(sm);
   1042    tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
   1043    lineZ = &sm->linesZ[zix];
   1044 
   1045    /* Generate the data to be stored */
   1046    if (CHECK_ZSM)
   1047       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   1048 
   1049    csvalsUsed = -1;
   1050    sequentialise_CacheLine( csvals, &csvalsUsed,
   1051                             N_LINE_ARANGE, cl );
   1052    tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE);
   1053    if (0) VG_(printf)("%lu ", csvalsUsed);
   1054 
   1055    lineZ->dict[0] = lineZ->dict[1]
   1056                   = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
   1057 
   1058    /* i indexes actual shadow values, k is cursor in csvals */
   1059    i = 0;
   1060    for (k = 0; k < csvalsUsed; k++) {
   1061 
   1062       sv = csvals[k].sval;
   1063       if (CHECK_ZSM)
   1064          tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
   1065       /* do we already have it? */
   1066       if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; }
   1067       if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; }
   1068       if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; }
   1069       if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; }
   1070       /* no.  look for a free slot. */
   1071       if (CHECK_ZSM)
   1072          tl_assert(sv != SVal_INVALID);
   1073       if (lineZ->dict[0]
   1074           == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; }
   1075       if (lineZ->dict[1]
   1076           == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; }
   1077       if (lineZ->dict[2]
   1078           == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; }
   1079       if (lineZ->dict[3]
   1080           == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; }
   1081       break; /* we'll have to use the f rep */
   1082      dict_ok:
   1083       m = csvals[k].count;
   1084       if (m == 8) {
   1085          write_twobit_array( lineZ->ix2s, i+0, j );
   1086          write_twobit_array( lineZ->ix2s, i+1, j );
   1087          write_twobit_array( lineZ->ix2s, i+2, j );
   1088          write_twobit_array( lineZ->ix2s, i+3, j );
   1089          write_twobit_array( lineZ->ix2s, i+4, j );
   1090          write_twobit_array( lineZ->ix2s, i+5, j );
   1091          write_twobit_array( lineZ->ix2s, i+6, j );
   1092          write_twobit_array( lineZ->ix2s, i+7, j );
   1093          i += 8;
   1094       }
   1095       else if (m == 4) {
   1096          write_twobit_array( lineZ->ix2s, i+0, j );
   1097          write_twobit_array( lineZ->ix2s, i+1, j );
   1098          write_twobit_array( lineZ->ix2s, i+2, j );
   1099          write_twobit_array( lineZ->ix2s, i+3, j );
   1100          i += 4;
   1101       }
   1102       else if (m == 1) {
   1103          write_twobit_array( lineZ->ix2s, i+0, j );
   1104          i += 1;
   1105       }
   1106       else if (m == 2) {
   1107          write_twobit_array( lineZ->ix2s, i+0, j );
   1108          write_twobit_array( lineZ->ix2s, i+1, j );
   1109          i += 2;
   1110       }
   1111       else {
   1112          tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */
   1113       }
   1114 
   1115    }
   1116 
   1117    if (LIKELY(i == N_LINE_ARANGE)) {
   1118       /* Construction of the compressed representation was
   1119          successful. */
   1120       rcinc_LineZ(lineZ);
   1121       stats__cache_Z_wbacks++;
   1122    } else {
   1123       /* Cannot use the compressed(z) representation.  Use the full(f)
   1124          rep instead. */
   1125       tl_assert(i >= 0 && i < N_LINE_ARANGE);
   1126       alloc_F_for_writing( sm, &fix );
   1127       tl_assert(sm->linesF);
   1128       tl_assert(sm->linesF_size > 0);
   1129       tl_assert(fix >= 0 && fix < (Word)sm->linesF_size);
   1130       lineF = &sm->linesF[fix];
   1131       tl_assert(!lineF->inUse);
   1132       lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
   1133       lineZ->dict[1] = (SVal)fix;
   1134       lineF->inUse = True;
   1135       i = 0;
   1136       for (k = 0; k < csvalsUsed; k++) {
   1137          if (CHECK_ZSM)
   1138             tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8);
   1139          sv = csvals[k].sval;
   1140          if (CHECK_ZSM)
   1141             tl_assert(sv != SVal_INVALID);
   1142          for (m = csvals[k].count; m > 0; m--) {
   1143             lineF->w64s[i] = sv;
   1144             i++;
   1145          }
   1146       }
   1147       tl_assert(i == N_LINE_ARANGE);
   1148       rcinc_LineF(lineF);
   1149       stats__cache_F_wbacks++;
   1150    }
   1151 }
   1152 
   1153 /* Fetch the cacheline 'wix' from the backing store.  The tag
   1154    associated with 'wix' is assumed to have already been filled in;
   1155    hence that is used to determine where in the backing store to read
   1156    from. */
   1157 static __attribute__((noinline)) void cacheline_fetch ( UWord wix )
   1158 {
   1159    Word       i;
   1160    Addr       tag;
   1161    CacheLine* cl;
   1162    LineZ*     lineZ;
   1163    LineF*     lineF;
   1164 
   1165    if (0)
   1166    VG_(printf)("scache fetch line %d\n", (Int)wix);
   1167 
   1168    tl_assert(wix >= 0 && wix < N_WAY_NENT);
   1169 
   1170    tag =  cache_shmem.tags0[wix];
   1171    cl  = &cache_shmem.lyns0[wix];
   1172 
   1173    /* reject nonsense requests */
   1174    tl_assert(is_valid_scache_tag(tag));
   1175 
   1176    lineZ = NULL;
   1177    lineF = NULL;
   1178    find_ZF_for_reading( &lineZ, &lineF, tag );
   1179    tl_assert( (lineZ && !lineF) || (!lineZ && lineF) );
   1180 
   1181    /* expand the data into the bottom layer of the tree, then get
   1182       cacheline_normalise to build the descriptor array. */
   1183    if (lineF) {
   1184       tl_assert(lineF->inUse);
   1185       for (i = 0; i < N_LINE_ARANGE; i++) {
   1186          cl->svals[i] = lineF->w64s[i];
   1187       }
   1188       stats__cache_F_fetches++;
   1189    } else {
   1190       for (i = 0; i < N_LINE_ARANGE; i++) {
   1191          SVal sv;
   1192          UWord ix = read_twobit_array( lineZ->ix2s, i );
   1193          /* correct, but expensive: tl_assert(ix >= 0 && ix <= 3); */
   1194          sv = lineZ->dict[ix];
   1195          tl_assert(sv != SVal_INVALID);
   1196          cl->svals[i] = sv;
   1197       }
   1198       stats__cache_Z_fetches++;
   1199    }
   1200    normalise_CacheLine( cl );
   1201 }
   1202 
   1203 static void shmem__invalidate_scache ( void ) {
   1204    Word wix;
   1205    if (0) VG_(printf)("%s","scache inval\n");
   1206    tl_assert(!is_valid_scache_tag(1));
   1207    for (wix = 0; wix < N_WAY_NENT; wix++) {
   1208       cache_shmem.tags0[wix] = 1/*INVALID*/;
   1209    }
   1210    stats__cache_invals++;
   1211 }
   1212 
   1213 static void shmem__flush_and_invalidate_scache ( void ) {
   1214    Word wix;
   1215    Addr tag;
   1216    if (0) VG_(printf)("%s","scache flush and invalidate\n");
   1217    tl_assert(!is_valid_scache_tag(1));
   1218    for (wix = 0; wix < N_WAY_NENT; wix++) {
   1219       tag = cache_shmem.tags0[wix];
   1220       if (tag == 1/*INVALID*/) {
   1221          /* already invalid; nothing to do */
   1222       } else {
   1223          tl_assert(is_valid_scache_tag(tag));
   1224          cacheline_wback( wix );
   1225       }
   1226       cache_shmem.tags0[wix] = 1/*INVALID*/;
   1227    }
   1228    stats__cache_flushes++;
   1229    stats__cache_invals++;
   1230 }
   1231 
   1232 
   1233 static inline Bool aligned16 ( Addr a ) {
   1234    return 0 == (a & 1);
   1235 }
   1236 static inline Bool aligned32 ( Addr a ) {
   1237    return 0 == (a & 3);
   1238 }
   1239 static inline Bool aligned64 ( Addr a ) {
   1240    return 0 == (a & 7);
   1241 }
   1242 static inline UWord get_cacheline_offset ( Addr a ) {
   1243    return (UWord)(a & (N_LINE_ARANGE - 1));
   1244 }
   1245 static inline Addr cacheline_ROUNDUP ( Addr a ) {
   1246    return ROUNDUP(a, N_LINE_ARANGE);
   1247 }
   1248 static inline Addr cacheline_ROUNDDN ( Addr a ) {
   1249    return ROUNDDN(a, N_LINE_ARANGE);
   1250 }
   1251 static inline UWord get_treeno ( Addr a ) {
   1252    return get_cacheline_offset(a) >> 3;
   1253 }
   1254 static inline UWord get_tree_offset ( Addr a ) {
   1255    return a & 7;
   1256 }
   1257 
   1258 static __attribute__((noinline))
   1259        CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */
   1260 static inline CacheLine* get_cacheline ( Addr a )
   1261 {
   1262    /* tag is 'a' with the in-line offset masked out,
    1263       ie with the low N_LINE_BITS (6) bits cleared */
   1264    Addr       tag = a & ~(N_LINE_ARANGE - 1);
   1265    UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
   1266    stats__cache_totrefs++;
   1267    if (LIKELY(tag == cache_shmem.tags0[wix])) {
   1268       return &cache_shmem.lyns0[wix];
   1269    } else {
   1270       return get_cacheline_MISS( a );
   1271    }
   1272 }
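
         /* Worked example: for a == 0x1000, with N_LINE_BITS == 6 and
            N_WAY_BITS == 16, tag == 0x1000 (already 64-aligned) and
            wix == (0x1000 >> 6) & 0xFFFF == 0x40, so the address maps to
            entry 64 of the direct-mapped cache. */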
   1273 
   1274 static __attribute__((noinline))
   1275        CacheLine* get_cacheline_MISS ( Addr a )
   1276 {
   1277    /* tag is 'a' with the in-line offset masked out,
    1278       ie with the low N_LINE_BITS (6) bits cleared */
   1279 
   1280    CacheLine* cl;
   1281    Addr*      tag_old_p;
   1282    Addr       tag = a & ~(N_LINE_ARANGE - 1);
   1283    UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
   1284 
   1285    tl_assert(tag != cache_shmem.tags0[wix]);
   1286 
   1287    /* Dump the old line into the backing store. */
   1288    stats__cache_totmisses++;
   1289 
   1290    cl        = &cache_shmem.lyns0[wix];
   1291    tag_old_p = &cache_shmem.tags0[wix];
   1292 
   1293    if (is_valid_scache_tag( *tag_old_p )) {
   1294       /* EXPENSIVE and REDUNDANT: callee does it */
   1295       if (CHECK_ZSM)
   1296          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   1297       cacheline_wback( wix );
   1298    }
   1299    /* and reload the new one */
   1300    *tag_old_p = tag;
   1301    cacheline_fetch( wix );
   1302    if (CHECK_ZSM)
   1303       tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   1304    return cl;
   1305 }
   1306 
   1307 static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   1308    stats__cline_64to32pulldown++;
   1309    switch (toff) {
   1310       case 0: case 4:
   1311          tl_assert(descr & TREE_DESCR_64);
   1312          tree[4] = tree[0];
   1313          descr &= ~TREE_DESCR_64;
   1314          descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0);
   1315          break;
   1316       default:
   1317          tl_assert(0);
   1318    }
   1319    return descr;
   1320 }
   1321 
   1322 static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   1323    stats__cline_32to16pulldown++;
   1324    switch (toff) {
   1325       case 0: case 2:
   1326          if (!(descr & TREE_DESCR_32_0)) {
   1327             descr = pulldown_to_32(tree, 0, descr);
   1328          }
   1329          tl_assert(descr & TREE_DESCR_32_0);
   1330          tree[2] = tree[0];
   1331          descr &= ~TREE_DESCR_32_0;
   1332          descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0);
   1333          break;
   1334       case 4: case 6:
   1335          if (!(descr & TREE_DESCR_32_1)) {
   1336             descr = pulldown_to_32(tree, 4, descr);
   1337          }
   1338          tl_assert(descr & TREE_DESCR_32_1);
   1339          tree[6] = tree[4];
   1340          descr &= ~TREE_DESCR_32_1;
   1341          descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2);
   1342          break;
   1343       default:
   1344          tl_assert(0);
   1345    }
   1346    return descr;
   1347 }
   1348 
   1349 static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) {
   1350    stats__cline_16to8pulldown++;
   1351    switch (toff) {
   1352       case 0: case 1:
   1353          if (!(descr & TREE_DESCR_16_0)) {
   1354             descr = pulldown_to_16(tree, 0, descr);
   1355          }
   1356          tl_assert(descr & TREE_DESCR_16_0);
   1357          tree[1] = tree[0];
   1358          descr &= ~TREE_DESCR_16_0;
   1359          descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0);
   1360          break;
   1361       case 2: case 3:
   1362          if (!(descr & TREE_DESCR_16_1)) {
   1363             descr = pulldown_to_16(tree, 2, descr);
   1364          }
   1365          tl_assert(descr & TREE_DESCR_16_1);
   1366          tree[3] = tree[2];
   1367          descr &= ~TREE_DESCR_16_1;
   1368          descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2);
   1369          break;
   1370       case 4: case 5:
   1371          if (!(descr & TREE_DESCR_16_2)) {
   1372             descr = pulldown_to_16(tree, 4, descr);
   1373          }
   1374          tl_assert(descr & TREE_DESCR_16_2);
   1375          tree[5] = tree[4];
   1376          descr &= ~TREE_DESCR_16_2;
   1377          descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4);
   1378          break;
   1379       case 6: case 7:
   1380          if (!(descr & TREE_DESCR_16_3)) {
   1381             descr = pulldown_to_16(tree, 6, descr);
   1382          }
   1383          tl_assert(descr & TREE_DESCR_16_3);
   1384          tree[7] = tree[6];
   1385          descr &= ~TREE_DESCR_16_3;
   1386          descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6);
   1387          break;
   1388       default:
   1389          tl_assert(0);
   1390    }
   1391    return descr;
   1392 }
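
/* Worked example (illustration only): suppose a tree's descriptor is
   just TREE_DESCR_64 (one valid 64-bit SVal in tree[0]) and byte-level
   access is needed at tree offset 5.  pulldown_to_8(tree, 5, descr)
   proceeds as follows:
     - 16_2 is absent, so call pulldown_to_16(tree, 4, descr):
         - 32_1 is absent, so call pulldown_to_32(tree, 4, descr):
             tree[4] = tree[0];  descr: 64 -> 32_1|32_0
         - tree[6] = tree[4];    descr: 32_1 -> 16_3|16_2  (32_0 kept)
     - tree[5] = tree[4];        descr: 16_2 -> 8_5|8_4    (rest kept)
   leaving descr == 32_0|16_3|8_5|8_4, with tree[4], tree[5] and
   tree[6] all holding copies of the original 64-bit value's SVal. */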
   1393 
   1394 
   1395 static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) {
   1396    UShort mask;
   1397    switch (toff) {
   1398       case 0:
   1399          mask = TREE_DESCR_8_1 | TREE_DESCR_8_0;
   1400          tl_assert( (descr & mask) == mask );
   1401          descr &= ~mask;
   1402          descr |= TREE_DESCR_16_0;
   1403          break;
   1404       case 2:
   1405          mask = TREE_DESCR_8_3 | TREE_DESCR_8_2;
   1406          tl_assert( (descr & mask) == mask );
   1407          descr &= ~mask;
   1408          descr |= TREE_DESCR_16_1;
   1409          break;
   1410       case 4:
   1411          mask = TREE_DESCR_8_5 | TREE_DESCR_8_4;
   1412          tl_assert( (descr & mask) == mask );
   1413          descr &= ~mask;
   1414          descr |= TREE_DESCR_16_2;
   1415          break;
   1416       case 6:
   1417          mask = TREE_DESCR_8_7 | TREE_DESCR_8_6;
   1418          tl_assert( (descr & mask) == mask );
   1419          descr &= ~mask;
   1420          descr |= TREE_DESCR_16_3;
   1421          break;
   1422       default:
   1423          tl_assert(0);
   1424    }
   1425    return descr;
   1426 }
   1427 
   1428 static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) {
   1429    UShort mask;
   1430    switch (toff) {
   1431       case 0:
   1432          if (!(descr & TREE_DESCR_16_0))
   1433             descr = pullup_descr_to_16(descr, 0);
   1434          if (!(descr & TREE_DESCR_16_1))
   1435             descr = pullup_descr_to_16(descr, 2);
   1436          mask = TREE_DESCR_16_1 | TREE_DESCR_16_0;
   1437          tl_assert( (descr & mask) == mask );
   1438          descr &= ~mask;
   1439          descr |= TREE_DESCR_32_0;
   1440          break;
   1441       case 4:
   1442          if (!(descr & TREE_DESCR_16_2))
   1443             descr = pullup_descr_to_16(descr, 4);
   1444          if (!(descr & TREE_DESCR_16_3))
   1445             descr = pullup_descr_to_16(descr, 6);
   1446          mask = TREE_DESCR_16_3 | TREE_DESCR_16_2;
   1447          tl_assert( (descr & mask) == mask );
   1448          descr &= ~mask;
   1449          descr |= TREE_DESCR_32_1;
   1450          break;
   1451       default:
   1452          tl_assert(0);
   1453    }
   1454    return descr;
   1455 }
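
/* Worked example (illustration only): if all eight 8-bit leaves are
   valid, i.e. descr == 8_7|8_6|8_5|8_4|8_3|8_2|8_1|8_0, then
   pullup_descr_to_32(descr, 0) first folds 8_1|8_0 into 16_0 and
   8_3|8_2 into 16_1 (via pullup_descr_to_16), then folds 16_1|16_0
   into 32_0, giving descr == 32_0|8_7|8_6|8_5|8_4.  Note that, unlike
   the pulldown functions, these operate on the descriptor only; the
   SVals in the tree itself are not moved. */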
   1456 
   1457 static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) {
   1458    switch (toff) {
   1459       case 0: case 4:
   1460          return 0 != (descr & TREE_DESCR_64);
   1461       default:
   1462          tl_assert(0);
   1463    }
   1464 }
   1465 
   1466 static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) {
   1467    switch (toff) {
   1468       case 0:
   1469          return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0));
   1470       case 2:
   1471          return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2));
   1472       case 4:
   1473          return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4));
   1474       case 6:
   1475          return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6));
   1476       default:
   1477          tl_assert(0);
   1478    }
   1479 }
   1480 
   1481 /* ------------ Cache management ------------ */
   1482 
   1483 static void zsm_flush_cache ( void )
   1484 {
   1485    shmem__flush_and_invalidate_scache();
   1486 }
   1487 
   1488 
   1489 static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) )
   1490 {
   1491    tl_assert( sizeof(UWord) == sizeof(Addr) );
   1492 
   1493    rcinc = p_rcinc;
   1494    rcdec = p_rcdec;
   1495 
   1496    tl_assert(map_shmem == NULL);
   1497    map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)",
   1498                            HG_(free),
   1499                            NULL/*unboxed UWord cmp*/);
   1500    tl_assert(map_shmem != NULL);
   1501    shmem__invalidate_scache();
   1502 
   1503    /* a SecMap must contain an integral number of CacheLines */
   1504    tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE));
   1505    /* also ... a CacheLine holds an integral number of trees */
   1506    tl_assert(0 == (N_LINE_ARANGE % 8));
   1507 }
   1508 
   1509 /////////////////////////////////////////////////////////////////
   1510 /////////////////////////////////////////////////////////////////
   1511 //                                                             //
   1512 // SECTION END compressed shadow memory                        //
   1513 //                                                             //
   1514 /////////////////////////////////////////////////////////////////
   1515 /////////////////////////////////////////////////////////////////
   1516 
   1517 
   1518 
   1519 /////////////////////////////////////////////////////////////////
   1520 /////////////////////////////////////////////////////////////////
   1521 //                                                             //
   1522 // SECTION BEGIN vts primitives                                //
   1523 //                                                             //
   1524 /////////////////////////////////////////////////////////////////
   1525 /////////////////////////////////////////////////////////////////
   1526 
   1527 #ifndef __HB_VTS_H
   1528 #define __HB_VTS_H
   1529 
   1530 /* VtsIDs can't exceed 30 bits, since they have to be packed into the
   1531    lowest 30 bits of an SVal. */
   1532 typedef  UInt  VtsID;
   1533 #define VtsID_INVALID 0xFFFFFFFF
   1534 
   1535 /* A VTS contains .ts, its vector clock, and also .id, a field to hold
   1536    a backlink for the caller's convenience.  Since we have no idea
   1537    what to set that to in the library, it always gets set to
   1538    VtsID_INVALID. */
   1539 typedef
   1540    struct {
   1541       VtsID   id;
   1542       XArray* ts; /* XArray* ScalarTS(abstract) */
   1543    }
   1544    VTS;
   1545 
   1546 
   1547 /* Create a new, empty VTS. */
   1548 static VTS* VTS__new ( void );
   1549 
   1550 /* Delete this VTS in its entirety. */
   1551 static void VTS__delete ( VTS* vts );
   1552 
   1553 /* Create a new singleton VTS. */
   1554 static VTS* VTS__singleton ( Thr* thr, ULong tym );
   1555 
   1556 /* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
   1557    not modified. */
   1558 static VTS* VTS__tick ( Thr* me, VTS* vts );
   1559 
   1560 /* Return a new VTS constructed as the join (max) of the 2 args.
   1561    Neither arg is modified. */
   1562 static VTS* VTS__join ( VTS* a, VTS* b );
   1563 
   1564 /* Compute the partial ordering relation of the two args.  Although we
   1565    could be completely general and return an enumeration value (EQ,
   1566    LT, GT, UN), in fact we only need LEQ, and so we may as well
   1567    hardwire that fact.
   1568 
    1569    Returns NULL iff LEQ(A,B), or non-NULL if not.  In the latter case,
    1570    the returned Thr* identifies a point (thread) at which LEQ fails.
    1571    There may be more than one such point, but we only care about
   1572    seeing one of them, not all of them.  This rather strange
   1573    convention is used because sometimes we want to know the actual
   1574    index at which they first differ. */
   1575 static Thr* VTS__cmpLEQ ( VTS* a, VTS* b );
   1576 
   1577 /* Compute an arbitrary structural (total) ordering on the two args,
   1578    based on their VCs, so they can be looked up in a table, tree, etc.
   1579    Returns -1, 0 or 1. */
   1580 static Word VTS__cmp_structural ( VTS* a, VTS* b );
   1581 
   1582 /* Debugging only.  Display the given VTS in the buffer. */
   1583 static void VTS__show ( HChar* buf, Int nBuf, VTS* vts );
   1584 
   1585 /* Debugging only.  Return vts[index], so to speak. */
   1586 static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx );
   1587 
   1588 #endif /* ! __HB_VTS_H */
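
/* Illustration only (not called anywhere): a minimal sketch of how
   the declarations above compose, assuming 't1' and 't2' are two
   distinct Thr* values obtained elsewhere.  It shows the intended
   algebra: two singleton VTSs are incomparable, but both are LEQ
   their join. */
#if 0
static void example_vts_algebra ( Thr* t1, Thr* t2 )
{
   VTS* a = VTS__singleton(t1, 1);           /* [t1:1] */
   VTS* b = VTS__singleton(t2, 1);           /* [t2:1] */
   VTS* j = VTS__join(a, b);                 /* contains t1 and t2, both at 1 */
   tl_assert(VTS__cmpLEQ(a, j) == NULL);     /* a <= join(a,b) */
   tl_assert(VTS__cmpLEQ(b, j) == NULL);     /* b <= join(a,b) */
   tl_assert(VTS__cmpLEQ(a, b) != NULL);     /* a, b incomparable */
   VTS__delete(j);
   VTS__delete(b);
   VTS__delete(a);
}
#endif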
   1589 
   1590 
   1591 /*--------------- to do with Vector Timestamps ---------------*/
   1592 
   1593 /* Scalar Timestamp */
   1594 typedef
   1595    struct {
   1596       Thr*    thr;
   1597       ULong   tym;
   1598    }
   1599    ScalarTS;
   1600 
   1601 
   1602 static Bool is_sane_VTS ( VTS* vts )
   1603 {
   1604    UWord     i, n;
   1605    ScalarTS  *st1, *st2;
   1606    if (!vts) return False;
   1607    if (!vts->ts) return False;
   1608    n = VG_(sizeXA)( vts->ts );
   1609    if (n >= 2) {
   1610       for (i = 0; i < n-1; i++) {
   1611          st1 = VG_(indexXA)( vts->ts, i );
   1612          st2 = VG_(indexXA)( vts->ts, i+1 );
   1613          if (st1->thr >= st2->thr)
   1614             return False;
   1615          if (st1->tym == 0 || st2->tym == 0)
   1616             return False;
   1617       }
   1618    }
   1619    return True;
   1620 }
   1621 
   1622 
   1623 /* Create a new, empty VTS.
   1624 */
   1625 VTS* VTS__new ( void )
   1626 {
   1627    VTS* vts;
   1628    vts = HG_(zalloc)( "libhb.VTS__new.1", sizeof(VTS) );
   1629    tl_assert(vts);
   1630    vts->id = VtsID_INVALID;
   1631    vts->ts = VG_(newXA)( HG_(zalloc), "libhb.VTS__new.2",
   1632                          HG_(free), sizeof(ScalarTS) );
   1633    tl_assert(vts->ts);
   1634    return vts;
   1635 }
   1636 
   1637 
   1638 /* Delete this VTS in its entirety.
   1639 */
   1640 void VTS__delete ( VTS* vts )
   1641 {
   1642    tl_assert(vts);
   1643    tl_assert(vts->ts);
   1644    VG_(deleteXA)( vts->ts );
   1645    HG_(free)(vts);
   1646 }
   1647 
   1648 
   1649 /* Create a new singleton VTS.
   1650 */
   1651 VTS* VTS__singleton ( Thr* thr, ULong tym ) {
   1652    ScalarTS st;
   1653    VTS*     vts;
   1654    tl_assert(thr);
   1655    tl_assert(tym >= 1);
   1656    vts = VTS__new();
   1657    st.thr = thr;
   1658    st.tym = tym;
   1659    VG_(addToXA)( vts->ts, &st );
   1660    return vts;
   1661 }
   1662 
   1663 
   1664 /* Return a new VTS in which vts[me]++, so to speak.  'vts' itself is
   1665    not modified.
   1666 */
   1667 VTS* VTS__tick ( Thr* me, VTS* vts )
   1668 {
   1669    ScalarTS* here = NULL;
   1670    ScalarTS  tmp;
   1671    VTS*      res;
   1672    Word      i, n;
   1673 
   1674    stats__vts__tick++;
   1675 
   1676    tl_assert(me);
   1677    tl_assert(is_sane_VTS(vts));
   1678    //if (0) VG_(printf)("tick vts thrno %ld szin %d\n",
   1679    //                   (Word)me->errmsg_index, (Int)VG_(sizeXA)(vts) );
   1680    res = VTS__new();
   1681    n = VG_(sizeXA)( vts->ts );
   1682 
   1683    /* main loop doesn't handle zero-entry case correctly, so
   1684       special-case it. */
   1685    if (n == 0) {
   1686       tmp.thr = me;
   1687       tmp.tym = 1;
   1688       VG_(addToXA)( res->ts, &tmp );
   1689       tl_assert(is_sane_VTS(res));
   1690       return res;
   1691    }
   1692 
   1693    for (i = 0; i < n; i++) {
   1694       here = VG_(indexXA)( vts->ts, i );
   1695       if (me < here->thr) {
   1696          /* We just went past 'me', without seeing it. */
   1697          tmp.thr = me;
   1698          tmp.tym = 1;
   1699          VG_(addToXA)( res->ts, &tmp );
   1700          tmp = *here;
   1701          VG_(addToXA)( res->ts, &tmp );
   1702          i++;
   1703          break;
   1704       }
   1705       else if (me == here->thr) {
   1706          tmp = *here;
   1707          tmp.tym++;
   1708          VG_(addToXA)( res->ts, &tmp );
   1709          i++;
   1710          break;
   1711       }
   1712       else /* me > here->thr */ {
   1713          tmp = *here;
   1714          VG_(addToXA)( res->ts, &tmp );
   1715       }
   1716    }
   1717    tl_assert(i >= 0 && i <= n);
   1718    if (i == n && here && here->thr < me) {
   1719       tmp.thr = me;
   1720       tmp.tym = 1;
   1721       VG_(addToXA)( res->ts, &tmp );
   1722    } else {
   1723       for (/*keepgoing*/; i < n; i++) {
   1724          here = VG_(indexXA)( vts->ts, i );
   1725          tmp = *here;
   1726          VG_(addToXA)( res->ts, &tmp );
   1727       }
   1728    }
   1729    tl_assert(is_sane_VTS(res));
   1730    //if (0) VG_(printf)("tick vts thrno %ld szou %d\n",
   1731    //                   (Word)me->errmsg_index, (Int)VG_(sizeXA)(res) );
   1732    return res;
   1733 }
   1734 
   1735 
   1736 /* Return a new VTS constructed as the join (max) of the 2 args.
   1737    Neither arg is modified.
   1738 */
   1739 VTS* VTS__join ( VTS* a, VTS* b )
   1740 {
   1741    Word     ia, ib, useda, usedb;
   1742    ULong    tyma, tymb, tymMax;
   1743    Thr*     thr;
   1744    VTS*     res;
   1745 
   1746    stats__vts__join++;
   1747 
   1748    tl_assert(a && a->ts);
   1749    tl_assert(b && b->ts);
   1750    useda = VG_(sizeXA)( a->ts );
   1751    usedb = VG_(sizeXA)( b->ts );
   1752 
   1753    res = VTS__new();
   1754    ia = ib = 0;
   1755 
   1756    while (1) {
   1757 
   1758       /* This logic is to enumerate triples (thr, tyma, tymb) drawn
   1759          from a and b in order, where thr is the next Thr*
   1760          occurring in either a or b, and tyma/b are the relevant
   1761          scalar timestamps, taking into account implicit zeroes. */
   1762       tl_assert(ia >= 0 && ia <= useda);
   1763       tl_assert(ib >= 0 && ib <= usedb);
   1764 
   1765       if        (ia == useda && ib == usedb) {
   1766          /* both empty - done */
   1767          break;
   1768 
   1769       } else if (ia == useda && ib != usedb) {
   1770          /* a empty, use up b */
   1771          ScalarTS* tmpb = VG_(indexXA)( b->ts, ib );
   1772          thr  = tmpb->thr;
   1773          tyma = 0;
   1774          tymb = tmpb->tym;
   1775          ib++;
   1776 
   1777       } else if (ia != useda && ib == usedb) {
   1778          /* b empty, use up a */
   1779          ScalarTS* tmpa = VG_(indexXA)( a->ts, ia );
   1780          thr  = tmpa->thr;
   1781          tyma = tmpa->tym;
   1782          tymb = 0;
   1783          ia++;
   1784 
   1785       } else {
   1786          /* both not empty; extract lowest-Thr*'d triple */
   1787          ScalarTS* tmpa = VG_(indexXA)( a->ts, ia );
   1788          ScalarTS* tmpb = VG_(indexXA)( b->ts, ib );
   1789          if (tmpa->thr < tmpb->thr) {
   1790             /* a has the lowest unconsidered Thr* */
   1791             thr  = tmpa->thr;
   1792             tyma = tmpa->tym;
   1793             tymb = 0;
   1794             ia++;
   1795          } else if (tmpa->thr > tmpb->thr) {
   1796             /* b has the lowest unconsidered Thr* */
   1797             thr  = tmpb->thr;
   1798             tyma = 0;
   1799             tymb = tmpb->tym;
   1800             ib++;
   1801          } else {
   1802             /* they both next mention the same Thr* */
   1803             tl_assert(tmpa->thr == tmpb->thr);
   1804             thr  = tmpa->thr; /* == tmpb->thr */
   1805             tyma = tmpa->tym;
   1806             tymb = tmpb->tym;
   1807             ia++;
   1808             ib++;
   1809          }
   1810       }
   1811 
   1812       /* having laboriously determined (thr, tyma, tymb), do something
   1813          useful with it. */
   1814       tymMax = tyma > tymb ? tyma : tymb;
   1815       if (tymMax > 0) {
   1816          ScalarTS st;
   1817          st.thr = thr;
   1818          st.tym = tymMax;
   1819          VG_(addToXA)( res->ts, &st );
   1820       }
   1821 
   1822    }
   1823 
   1824    tl_assert(is_sane_VTS( res ));
   1825 
   1826    return res;
   1827 }
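
/* Worked example (illustration only; T1 < T2 < T3 refers to the Thr*
   pointer ordering used to keep the arrays sorted): joining
   a == [T1:3, T3:1] with b == [T1:2, T2:5] enumerates the threads in
   order and takes the pointwise maximum, treating absent entries as
   zero, giving [T1:3, T2:5, T3:1]. */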
   1828 
   1829 
   1830 /* Determine if 'a' <= 'b', in the partial ordering.  Returns NULL if
   1831    they are, or the first Thr* for which they are not.  This rather
   1832    strange convention is used because sometimes we want to know the
   1833    actual index at which they first differ. */
   1834 static Thr* VTS__cmpLEQ ( VTS* a, VTS* b )
   1835 {
   1836    Word  ia, ib, useda, usedb;
   1837    ULong tyma, tymb;
   1838 
   1839    stats__vts__cmpLEQ++;
   1840 
   1841    tl_assert(a && a->ts);
   1842    tl_assert(b && b->ts);
   1843    useda = VG_(sizeXA)( a->ts );
   1844    usedb = VG_(sizeXA)( b->ts );
   1845 
   1846    ia = ib = 0;
   1847 
   1848    while (1) {
   1849 
   1850       /* This logic is to enumerate doubles (tyma, tymb) drawn
    1851          from a and b in order, where tyma/b are the relevant
   1852          scalar timestamps, taking into account implicit zeroes. */
   1853       Thr* thr;
   1854 
   1855       tl_assert(ia >= 0 && ia <= useda);
   1856       tl_assert(ib >= 0 && ib <= usedb);
   1857 
   1858       if        (ia == useda && ib == usedb) {
   1859          /* both empty - done */
   1860          break;
   1861 
   1862       } else if (ia == useda && ib != usedb) {
   1863          /* a empty, use up b */
   1864          ScalarTS* tmpb = VG_(indexXA)( b->ts, ib );
   1865          tyma = 0;
   1866          tymb = tmpb->tym;
   1867          thr  = tmpb->thr;
   1868          ib++;
   1869 
   1870       } else if (ia != useda && ib == usedb) {
   1871          /* b empty, use up a */
   1872          ScalarTS* tmpa = VG_(indexXA)( a->ts, ia );
   1873          tyma = tmpa->tym;
   1874          thr  = tmpa->thr;
   1875          tymb = 0;
   1876          ia++;
   1877 
   1878       } else {
   1879          /* both not empty; extract lowest-Thr*'d triple */
   1880          ScalarTS* tmpa = VG_(indexXA)( a->ts, ia );
   1881          ScalarTS* tmpb = VG_(indexXA)( b->ts, ib );
   1882          if (tmpa->thr < tmpb->thr) {
   1883             /* a has the lowest unconsidered Thr* */
   1884             tyma = tmpa->tym;
   1885             thr  = tmpa->thr;
   1886             tymb = 0;
   1887             ia++;
   1888          }
   1889          else
   1890          if (tmpa->thr > tmpb->thr) {
   1891             /* b has the lowest unconsidered Thr* */
   1892             tyma = 0;
   1893             tymb = tmpb->tym;
   1894             thr  = tmpb->thr;
   1895             ib++;
   1896          } else {
   1897             /* they both next mention the same Thr* */
   1898             tl_assert(tmpa->thr == tmpb->thr);
   1899             tyma = tmpa->tym;
   1900             thr  = tmpa->thr;
   1901             tymb = tmpb->tym;
   1902             ia++;
   1903             ib++;
   1904          }
   1905       }
   1906 
   1907       /* having laboriously determined (tyma, tymb), do something
   1908          useful with it. */
   1909       if (tyma > tymb) {
   1910          /* not LEQ at this index.  Quit, since the answer is
   1911             determined already. */
   1912          tl_assert(thr);
   1913          return thr;
   1914       }
   1915    }
   1916 
   1917    return NULL; /* all points are LEQ */
   1918 }
   1919 
   1920 
   1921 /* Compute an arbitrary structural (total) ordering on the two args,
   1922    based on their VCs, so they can be looked up in a table, tree, etc.
   1923    Returns -1, 0 or 1.  (really just 'deriving Ord' :-) This can be
    1924    performance critical so there is some effort expended to make it as
   1925    fast as possible.
   1926 */
   1927 Word VTS__cmp_structural ( VTS* a, VTS* b )
   1928 {
   1929    /* We just need to generate an arbitrary total ordering based on
   1930       a->ts and b->ts.  Preferably do it in a way which comes across likely
   1931       differences relatively quickly. */
   1932    Word     i;
   1933    Word     useda = 0,    usedb = 0;
   1934    ScalarTS *ctsa = NULL, *ctsb = NULL;
   1935 
   1936    stats__vts__cmp_structural++;
   1937 
   1938    tl_assert(a);
   1939    tl_assert(b);
   1940 
   1941    VG_(getContentsXA_UNSAFE)( a->ts, (void**)&ctsa, &useda );
   1942    VG_(getContentsXA_UNSAFE)( b->ts, (void**)&ctsb, &usedb );
   1943 
   1944    if (LIKELY(useda == usedb)) {
   1945       ScalarTS *tmpa = NULL, *tmpb = NULL;
   1946       stats__vts__cmp_structural_slow++;
   1947       /* Same length vectors.  Find the first difference, if any, as
   1948          fast as possible. */
   1949       for (i = 0; i < useda; i++) {
   1950          tmpa = &ctsa[i];
   1951          tmpb = &ctsb[i];
   1952          if (LIKELY(tmpa->tym == tmpb->tym && tmpa->thr == tmpb->thr))
   1953             continue;
   1954          else
   1955             break;
   1956       }
   1957       if (UNLIKELY(i == useda)) {
   1958          /* They're identical. */
   1959          return 0;
   1960       } else {
   1961          tl_assert(i >= 0 && i < useda);
   1962          if (tmpa->tym < tmpb->tym) return -1;
   1963          if (tmpa->tym > tmpb->tym) return 1;
   1964          if (tmpa->thr < tmpb->thr) return -1;
   1965          if (tmpa->thr > tmpb->thr) return 1;
   1966          /* we just established them as non-identical, hence: */
   1967       }
   1968       /*NOTREACHED*/
   1969       tl_assert(0);
   1970    }
   1971 
   1972    if (useda < usedb) return -1;
   1973    if (useda > usedb) return 1;
   1974    /*NOTREACHED*/
   1975    tl_assert(0);
   1976 }
   1977 
   1978 
   1979 /* Debugging only.  Display the given VTS in the buffer.
   1980 */
   1981 void VTS__show ( HChar* buf, Int nBuf, VTS* vts ) {
   1982    ScalarTS* st;
   1983    HChar     unit[64];
   1984    Word      i, n;
   1985    Int       avail = nBuf;
   1986    tl_assert(vts && vts->ts);
   1987    tl_assert(nBuf > 16);
   1988    buf[0] = '[';
   1989    buf[1] = 0;
   1990    n = VG_(sizeXA)( vts->ts );
   1991    for (i = 0; i < n; i++) {
   1992       tl_assert(avail >= 40);
   1993       st = VG_(indexXA)( vts->ts, i );
   1994       VG_(memset)(unit, 0, sizeof(unit));
   1995       VG_(sprintf)(unit, i < n-1 ? "%p:%lld " : "%p:%lld",
   1996                          st->thr, st->tym);
   1997       if (avail < VG_(strlen)(unit) + 40/*let's say*/) {
   1998          VG_(strcat)(buf, " ...]");
   1999          buf[nBuf-1] = 0;
   2000          return;
   2001       }
   2002       VG_(strcat)(buf, unit);
   2003       avail -= VG_(strlen)(unit);
   2004    }
   2005    VG_(strcat)(buf, "]");
   2006    buf[nBuf-1] = 0;
   2007 }
   2008 
   2009 
   2010 /* Debugging only.  Return vts[index], so to speak.
   2011 */
   2012 ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx ) {
   2013    UWord i, n;
   2014    stats__vts__indexat_slow++;
   2015    tl_assert(vts && vts->ts);
   2016    n = VG_(sizeXA)( vts->ts );
   2017    for (i = 0; i < n; i++) {
   2018       ScalarTS* st = VG_(indexXA)( vts->ts, i );
   2019       if (st->thr == idx)
   2020          return st->tym;
   2021    }
   2022    return 0;
   2023 }
   2024 
   2025 
   2026 /////////////////////////////////////////////////////////////////
   2027 /////////////////////////////////////////////////////////////////
   2028 //                                                             //
   2029 // SECTION END vts primitives                                  //
   2030 //                                                             //
   2031 /////////////////////////////////////////////////////////////////
   2032 /////////////////////////////////////////////////////////////////
   2033 
   2034 
   2035 
   2036 /////////////////////////////////////////////////////////////////
   2037 /////////////////////////////////////////////////////////////////
   2038 //                                                             //
   2039 // SECTION BEGIN main library                                  //
   2040 //                                                             //
   2041 /////////////////////////////////////////////////////////////////
   2042 /////////////////////////////////////////////////////////////////
   2043 
   2044 
   2045 /////////////////////////////////////////////////////////
   2046 //                                                     //
   2047 // VTS set                                             //
   2048 //                                                     //
   2049 /////////////////////////////////////////////////////////
   2050 
   2051 static WordFM* /* VTS* void void */ vts_set = NULL;
   2052 
   2053 static void vts_set_init ( void )
   2054 {
   2055    tl_assert(!vts_set);
   2056    vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1",
   2057                          HG_(free),
   2058                          (Word(*)(UWord,UWord))VTS__cmp_structural );
   2059    tl_assert(vts_set);
   2060 }
   2061 
   2062 /* Given a newly made VTS, look in vts_set to see if we already have
   2063    an identical one.  If yes, free up this one and return instead a
   2064    pointer to the existing one.  If no, add this one to the set and
   2065    return the same pointer.  Caller differentiates the two cases by
   2066    comparing returned pointer with the supplied one (although that
   2067    does require that the supplied VTS is not already in the set).
   2068 */
   2069 static VTS* vts_set__find_and_dealloc__or_add ( VTS* cand )
   2070 {
   2071    UWord keyW, valW;
   2072    stats__vts_set__fadoa++;
   2073    /* lookup cand (by value) */
   2074    if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) {
   2075       /* found it */
   2076       tl_assert(valW == 0);
   2077       /* if this fails, cand (by ref) was already present (!) */
   2078       tl_assert(keyW != (UWord)cand);
   2079       stats__vts_set__fadoa_d++;
   2080       VTS__delete(cand);
   2081       return (VTS*)keyW;
   2082    } else {
   2083       /* not present.  Add and return pointer to same. */
   2084       VG_(addToFM)( vts_set, (UWord)cand, 0/*val is unused*/ );
   2085       return cand;
   2086    }
   2087 }
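
/* Usage sketch (illustration only, not called anywhere): build a VTS,
   intern it, and carry on with whatever pointer comes back.  'thr' and
   'base' are hypothetical values obtained elsewhere. */
#if 0
static VTS* example_intern ( Thr* thr, VTS* base )
{
   VTS* cand = VTS__tick(thr, base);
   VTS* kept = vts_set__find_and_dealloc__or_add(cand);
   /* If an equal VTS was already in the set, 'cand' has been freed
      and 'kept' is the pre-existing copy; otherwise kept == cand. */
   return kept;
}
#endif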
   2088 
   2089 
   2090 /////////////////////////////////////////////////////////
   2091 //                                                     //
   2092 // VTS table                                           //
   2093 //                                                     //
   2094 /////////////////////////////////////////////////////////
   2095 
   2096 static void VtsID__invalidate_caches ( void ); /* fwds */
   2097 
   2098 /* A type to hold VTS table entries.  Invariants:
   2099    If .vts == NULL, then this entry is not in use, so:
   2100    - .rc == 0
   2101    - this entry is on the freelist (unfortunately, does not imply
    2102      any constraints on value for .freelink)
   2103    If .vts != NULL, then this entry is in use:
   2104    - .vts is findable in vts_set
   2105    - .vts->id == this entry number
   2106    - no specific value for .rc (even 0 is OK)
    2107    - this entry is not on freelist, so .freelink == VtsID_INVALID
   2108 */
   2109 typedef
   2110    struct {
   2111       VTS*  vts;      /* vts, in vts_set */
   2112       UWord rc;       /* reference count - enough for entire aspace */
   2113       VtsID freelink; /* chain for free entries, VtsID_INVALID at end */
   2114    }
   2115    VtsTE;
   2116 
   2117 /* The VTS table. */
   2118 static XArray* /* of VtsTE */ vts_tab = NULL;
   2119 
   2120 /* An index into the VTS table, indicating the start of the list of
   2121    free (available for use) entries.  If the list is empty, this is
   2122    VtsID_INVALID. */
   2123 static VtsID vts_tab_freelist = VtsID_INVALID;
   2124 
   2125 /* Do a GC of vts_tab when the freelist becomes empty AND the size of
   2126    vts_tab equals or exceeds this size.  After GC, the value here is
   2127    set appropriately so as to check for the next GC point. */
   2128 static Word vts_next_GC_at = 1000;
   2129 
   2130 static void vts_tab_init ( void )
   2131 {
   2132    vts_tab
   2133       = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1",
   2134                     HG_(free), sizeof(VtsTE) );
   2135    vts_tab_freelist
   2136       = VtsID_INVALID;
   2137    tl_assert(vts_tab);
   2138 }
   2139 
   2140 /* Add ii to the free list, checking that it looks out-of-use. */
   2141 static void add_to_free_list ( VtsID ii )
   2142 {
   2143    VtsTE* ie = VG_(indexXA)( vts_tab, ii );
   2144    tl_assert(ie->vts == NULL);
   2145    tl_assert(ie->rc == 0);
   2146    tl_assert(ie->freelink == VtsID_INVALID);
   2147    ie->freelink = vts_tab_freelist;
   2148    vts_tab_freelist = ii;
   2149 }
   2150 
   2151 /* Get an entry from the free list.  This will return VtsID_INVALID if
   2152    the free list is empty. */
   2153 static VtsID get_from_free_list ( void )
   2154 {
   2155    VtsID  ii;
   2156    VtsTE* ie;
   2157    if (vts_tab_freelist == VtsID_INVALID)
   2158       return VtsID_INVALID;
   2159    ii = vts_tab_freelist;
   2160    ie = VG_(indexXA)( vts_tab, ii );
   2161    tl_assert(ie->vts == NULL);
   2162    tl_assert(ie->rc == 0);
   2163    vts_tab_freelist = ie->freelink;
   2164    return ii;
   2165 }
   2166 
   2167 /* Produce a new VtsID that can be used, either by getting it from
   2168    the freelist, or, if that is empty, by expanding vts_tab. */
   2169 static VtsID get_new_VtsID ( void )
   2170 {
   2171    VtsID ii;
   2172    VtsTE te;
   2173    ii = get_from_free_list();
   2174    if (ii != VtsID_INVALID)
   2175       return ii;
   2176    te.vts = NULL;
   2177    te.rc = 0;
   2178    te.freelink = VtsID_INVALID;
   2179    ii = (VtsID)VG_(addToXA)( vts_tab, &te );
   2180    return ii;
   2181 }
   2182 
   2183 
   2184 /* Indirect callback from lib_zsm. */
   2185 static void VtsID__rcinc ( VtsID ii )
   2186 {
   2187    VtsTE* ie;
   2188    /* VG_(indexXA) does a range check for us */
   2189    ie = VG_(indexXA)( vts_tab, ii );
   2190    tl_assert(ie->vts); /* else it's not in use */
   2191    tl_assert(ie->rc < ~0UL); /* else we can't continue */
   2192    tl_assert(ie->vts->id == ii);
   2193    ie->rc++;
   2194 }
   2195 
   2196 /* Indirect callback from lib_zsm. */
   2197 static void VtsID__rcdec ( VtsID ii )
   2198 {
   2199    VtsTE* ie;
   2200    /* VG_(indexXA) does a range check for us */
   2201    ie = VG_(indexXA)( vts_tab, ii );
   2202    tl_assert(ie->vts); /* else it's not in use */
   2203    tl_assert(ie->rc > 0); /* else RC snafu */
   2204    tl_assert(ie->vts->id == ii);
   2205    ie->rc--;
   2206 }
   2207 
   2208 
   2209 /* Look up 'cand' in our collection of VTSs.  If present, deallocate
   2210    it and return the VtsID for the pre-existing version.  If not
   2211    present, add it to both vts_tab and vts_set, allocate a fresh VtsID
   2212    for it, and return that. */
   2213 static VtsID vts_tab__find_and_dealloc__or_add ( VTS* cand )
   2214 {
   2215    VTS* auld;
   2216    tl_assert(cand->id == VtsID_INVALID);
   2217    auld = vts_set__find_and_dealloc__or_add(cand);
   2218    if (auld != cand) {
   2219       /* We already have an Aulde one.  Use that. */
   2220       VtsTE* ie;
   2221       tl_assert(auld->id != VtsID_INVALID);
   2222       ie = VG_(indexXA)( vts_tab, auld->id );
   2223       tl_assert(ie->vts == auld);
   2224       return auld->id;
   2225    } else {
   2226       VtsID  ii = get_new_VtsID();
   2227       VtsTE* ie = VG_(indexXA)( vts_tab, ii );
   2228       ie->vts = cand;
   2229       ie->rc = 0;
   2230       ie->freelink = VtsID_INVALID;
   2231       cand->id = ii;
   2232       return ii;
   2233    }
   2234 }
   2235 
   2236 
   2237 static void show_vts_stats ( HChar* caller )
   2238 {
   2239    UWord nSet, nTab, nLive;
   2240    ULong totrc;
   2241    UWord n, i;
   2242    nSet = VG_(sizeFM)( vts_set );
   2243    nTab = VG_(sizeXA)( vts_tab );
   2244    totrc = 0;
   2245    nLive = 0;
   2246    n = VG_(sizeXA)( vts_tab );
   2247    for (i = 0; i < n; i++) {
   2248       VtsTE* ie = VG_(indexXA)( vts_tab, i );
   2249       if (ie->vts) {
   2250          nLive++;
   2251          totrc += (ULong)ie->rc;
   2252       } else {
   2253          tl_assert(ie->rc == 0);
   2254       }
   2255    }
   2256    VG_(printf)("  show_vts_stats %s\n", caller);
   2257    VG_(printf)("    vts_tab size %4lu\n", nTab);
   2258    VG_(printf)("    vts_tab live %4lu\n", nLive);
   2259    VG_(printf)("    vts_set size %4lu\n", nSet);
   2260    VG_(printf)("        total rc %4llu\n", totrc);
   2261 }
   2262 
   2263 /* NOT TO BE CALLED FROM WITHIN libzsm. */
   2264 __attribute__((noinline))
   2265 static void vts_tab__do_GC ( Bool show_stats )
   2266 {
   2267    UWord i, nTab, nLive, nFreed;
   2268 
   2269    /* check this is actually necessary. */
   2270    tl_assert(vts_tab_freelist == VtsID_INVALID);
   2271 
   2272    /* empty the caches for partial order checks and binary joins.  We
   2273       could do better and prune out the entries to be deleted, but it
   2274       ain't worth the hassle. */
   2275    VtsID__invalidate_caches();
   2276 
   2277    /* First, make the reference counts up to date. */
   2278    zsm_flush_cache();
   2279 
   2280    nTab = VG_(sizeXA)( vts_tab );
   2281 
   2282    if (show_stats) {
   2283       VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab);
   2284       show_vts_stats("before GC");
   2285    }
   2286 
   2287    /* Now we can inspect the entire vts_tab.  Any entries
   2288       with zero .rc fields are now no longer in use and can be
    2289       put back on the free list, removed from vts_set, and deleted. */
   2290    nFreed = 0;
   2291    for (i = 0; i < nTab; i++) {
   2292       Bool present;
   2293       UWord oldK = 0, oldV = 0;
   2294       VtsTE* te = VG_(indexXA)( vts_tab, i );
   2295       if (te->vts == NULL) {
   2296          tl_assert(te->rc == 0);
   2297          continue; /* already on the free list (presumably) */
   2298       }
   2299       if (te->rc > 0)
   2300          continue; /* in use */
   2301       /* Ok, we got one we can free. */
   2302       tl_assert(te->vts->id == i);
   2303       /* first, remove it from vts_set. */
   2304       present = VG_(delFromFM)( vts_set,
   2305                                 &oldK, &oldV, (UWord)te->vts );
   2306       tl_assert(present); /* else it isn't in vts_set ?! */
   2307       tl_assert(oldV == 0); /* no info stored in vts_set val fields */
   2308       tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */
   2309       /* now free the VTS itself */
   2310       VTS__delete(te->vts);
   2311       te->vts = NULL;
   2312       /* and finally put this entry on the free list */
   2313       tl_assert(te->freelink == VtsID_INVALID); /* can't already be on it */
   2314       add_to_free_list( i );
   2315       nFreed++;
   2316    }
   2317 
   2318    /* Now figure out when the next GC should be.  We'll allow the
   2319       number of VTSs to double before GCing again.  Except of course
   2320       that since we can't (or, at least, don't) shrink vts_tab, we
    2321       can't set the threshold value smaller than it.  (Worked example below.) */
   2322    tl_assert(nFreed <= nTab);
   2323    nLive = nTab - nFreed;
   2324    tl_assert(nLive >= 0 && nLive <= nTab);
   2325    vts_next_GC_at = 2 * nLive;
   2326    if (vts_next_GC_at < nTab)
   2327       vts_next_GC_at = nTab;
   2328 
   2329    if (show_stats) {
   2330       show_vts_stats("after GC");
   2331       VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at);
   2332    }
   2333 
   2334    if (VG_(clo_stats)) {
   2335       static UInt ctr = 0;
   2336       tl_assert(nTab > 0);
   2337       VG_(message)(Vg_DebugMsg,
   2338                   "libhb: VTS GC: #%u  old size %lu  live %lu  (%2llu%%)\n",
   2339                   ctr++, nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab);
   2340    }
   2341 }
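
/* Worked example of the next-GC threshold computed above (illustration
   only): with nTab == 1000 and nFreed == 800, nLive == 200, so the
   initial proposal is 2 * 200 == 400; since that is smaller than nTab
   it is raised to 1000, i.e. the table must actually grow before the
   next GC is attempted. */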
   2342 
   2343 
   2344 /////////////////////////////////////////////////////////
   2345 //                                                     //
   2346 // Vts IDs                                             //
   2347 //                                                     //
   2348 /////////////////////////////////////////////////////////
   2349 
   2350 //////////////////////////
   2351 static ULong stats__cmpLEQ_queries = 0;
   2352 static ULong stats__cmpLEQ_misses  = 0;
   2353 static ULong stats__join2_queries  = 0;
   2354 static ULong stats__join2_misses   = 0;
   2355 
   2356 static inline UInt ROL32 ( UInt w, Int n ) {
   2357    w = (w << n) | (w >> (32-n));
   2358    return w;
   2359 }
   2360 static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) {
   2361    UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13);
   2362    return hash % nTab;
   2363 }
   2364 
   2365 #define N_CMPLEQ_CACHE 1023
   2366 static
   2367    struct { VtsID vi1; VtsID vi2; Bool leq; }
   2368    cmpLEQ_cache[N_CMPLEQ_CACHE];
   2369 
   2370 #define N_JOIN2_CACHE 1023
   2371 static
   2372    struct { VtsID vi1; VtsID vi2; VtsID res; }
   2373    join2_cache[N_JOIN2_CACHE];
   2374 
   2375 static void VtsID__invalidate_caches ( void ) {
   2376    Int i;
   2377    for (i = 0; i < N_CMPLEQ_CACHE; i++) {
   2378       cmpLEQ_cache[i].vi1 = VtsID_INVALID;
   2379       cmpLEQ_cache[i].vi2 = VtsID_INVALID;
   2380       cmpLEQ_cache[i].leq = False;
   2381    }
   2382    for (i = 0; i < N_JOIN2_CACHE; i++) {
   2383      join2_cache[i].vi1 = VtsID_INVALID;
   2384      join2_cache[i].vi2 = VtsID_INVALID;
   2385      join2_cache[i].res = VtsID_INVALID;
   2386    }
   2387 }
   2388 //////////////////////////
   2389 
   2390 //static Bool VtsID__is_valid ( VtsID vi ) {
   2391 //   VtsTE* ve;
   2392 //   if (vi >= (VtsID)VG_(sizeXA)( vts_tab ))
   2393 //      return False;
   2394 //   ve = VG_(indexXA)( vts_tab, vi );
   2395 //   if (!ve->vts)
   2396 //      return False;
   2397 //   tl_assert(ve->vts->id == vi);
   2398 //   return True;
   2399 //}
   2400 
   2401 static VTS* VtsID__to_VTS ( VtsID vi ) {
   2402    VtsTE* te = VG_(indexXA)( vts_tab, vi );
   2403    tl_assert(te->vts);
   2404    return te->vts;
   2405 }
   2406 
   2407 static void VtsID__pp ( VtsID vi ) {
   2408    HChar buf[100];
   2409    VTS* vts = VtsID__to_VTS(vi);
   2410    VTS__show( buf, sizeof(buf)-1, vts );
   2411    buf[sizeof(buf)-1] = 0;
   2412    VG_(printf)("%s", buf);
   2413 }
   2414 
   2415 /* compute partial ordering relation of vi1 and vi2. */
   2416 __attribute__((noinline))
   2417 static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) {
   2418    UInt hash;
   2419    Bool leq;
   2420    VTS  *v1, *v2;
   2421    //if (vi1 == vi2) return True;
   2422    tl_assert(vi1 != vi2);
   2423    ////++
   2424    stats__cmpLEQ_queries++;
   2425    hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE);
   2426    if (cmpLEQ_cache[hash].vi1 == vi1
   2427        && cmpLEQ_cache[hash].vi2 == vi2)
   2428       return cmpLEQ_cache[hash].leq;
   2429    stats__cmpLEQ_misses++;
   2430    ////--
   2431    v1  = VtsID__to_VTS(vi1);
   2432    v2  = VtsID__to_VTS(vi2);
   2433    leq = VTS__cmpLEQ( v1, v2 ) == NULL;
   2434    ////++
   2435    cmpLEQ_cache[hash].vi1 = vi1;
   2436    cmpLEQ_cache[hash].vi2 = vi2;
   2437    cmpLEQ_cache[hash].leq = leq;
   2438    ////--
   2439    return leq;
   2440 }
   2441 static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) {
   2442    return LIKELY(vi1 == vi2)  ? True  : VtsID__cmpLEQ_WRK(vi1, vi2);
   2443 }
   2444 
   2445 /* compute binary join */
   2446 __attribute__((noinline))
   2447 static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) {
   2448    UInt  hash;
   2449    VtsID res;
   2450    VTS   *vts1, *vts2, *nyu;
   2451    //if (vi1 == vi2) return vi1;
   2452    tl_assert(vi1 != vi2);
   2453    ////++
   2454    stats__join2_queries++;
   2455    hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE);
   2456    if (join2_cache[hash].vi1 == vi1
   2457        && join2_cache[hash].vi2 == vi2)
   2458       return join2_cache[hash].res;
   2459    stats__join2_misses++;
   2460    ////--
   2461    vts1 = VtsID__to_VTS(vi1);
   2462    vts2 = VtsID__to_VTS(vi2);
   2463    nyu  = VTS__join(vts1,vts2);
   2464    res  = vts_tab__find_and_dealloc__or_add(nyu);
   2465    ////++
   2466    join2_cache[hash].vi1 = vi1;
   2467    join2_cache[hash].vi2 = vi2;
   2468    join2_cache[hash].res = res;
   2469    ////--
   2470    return res;
   2471 }
   2472 static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) {
   2473    return LIKELY(vi1 == vi2)  ? vi1  : VtsID__join2_WRK(vi1, vi2);
   2474 }
   2475 
   2476 /* create a singleton VTS, namely [thr:1] */
   2477 static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) {
   2478    VTS* nyu = VTS__singleton(thr,tym);
   2479    return vts_tab__find_and_dealloc__or_add(nyu);
   2480 }
   2481 
   2482 /* tick operation, creates value 1 if specified index is absent */
   2483 static VtsID VtsID__tick ( VtsID vi, Thr* idx ) {
   2484    VTS* vts = VtsID__to_VTS(vi);
   2485    VTS* nyu = VTS__tick(idx,vts);
   2486    return vts_tab__find_and_dealloc__or_add(nyu);
   2487 }
   2488 
   2489 /* index into a VTS (only for assertions) */
   2490 static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) {
   2491    VTS* vts = VtsID__to_VTS(vi);
   2492    return VTS__indexAt_SLOW( vts, idx );
   2493 }
   2494 
   2495 /* Assuming that !cmpLEQ(vi1, vi2), find the index of the first (or
   2496    any, really) element in vi1 which is pointwise greater-than the
   2497    corresponding element in vi2.  If no such element exists, return
   2498    NULL.  This needs to be fairly quick since it is called every time
   2499    a race is detected. */
   2500 static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 )
   2501 {
   2502    VTS  *vts1, *vts2;
   2503    Thr* diffthr;
   2504    tl_assert(vi1 != vi2);
   2505    vts1 = VtsID__to_VTS(vi1);
   2506    vts2 = VtsID__to_VTS(vi2);
   2507    tl_assert(vts1 != vts2);
   2508    diffthr = VTS__cmpLEQ(vts1, vts2);
   2509    tl_assert(diffthr); /* else they are LEQ ! */
   2510    return diffthr;
   2511 }
   2512 
   2513 
   2514 /////////////////////////////////////////////////////////
   2515 //                                                     //
   2516 // Filters                                             //
   2517 //                                                     //
   2518 /////////////////////////////////////////////////////////
   2519 
   2520 // baseline: 5, 9
   2521 #define FI_LINE_SZB_LOG2  5
   2522 #define FI_NUM_LINES_LOG2 10
   2523 
   2524 #define FI_LINE_SZB       (1 << FI_LINE_SZB_LOG2)
   2525 #define FI_NUM_LINES      (1 << FI_NUM_LINES_LOG2)
   2526 
   2527 #define FI_TAG_MASK        (~(Addr)(FI_LINE_SZB - 1))
   2528 #define FI_GET_TAG(_a)     ((_a) & FI_TAG_MASK)
   2529 
   2530 #define FI_GET_LINENO(_a)  ( ((_a) >> FI_LINE_SZB_LOG2) \
   2531                              & (Addr)(FI_NUM_LINES-1) )
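
/* Worked example (using the values defined above, FI_LINE_SZB_LOG2 == 5
   and FI_NUM_LINES_LOG2 == 10): for a == 0x100427,
      FI_GET_TAG(a)    == 0x100420                     (low 5 bits cleared)
      FI_GET_LINENO(a) == (0x100427 >> 5) & 0x3FF == 0x21 */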
   2532 
   2533 
   2534 /* In the lines, each 8 bytes are treated individually, and are mapped
   2535    to a UShort.  Regardless of endianness of the underlying machine,
   2536    bits 1 and 0 pertain to the lowest address and bits 15 and 14 to
   2537    the highest address.
   2538 
    2539    Of each bit pair, the higher numbered bit is set if an R has been
   2540    seen, so the actual layout is:
   2541 
   2542    15 14             ...  01 00
   2543 
   2544    R  W  for addr+7  ...  R  W  for addr+0
   2545 
   2546    So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555.
   2547 */
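
/* Worked example (illustration only): for a one-byte read at an
   address with (a & 7) == 3, the read handlers below use
      mask = 0x2 << (2 * 3) == 0x0080    -- the R bit for addr+3
   and a one-byte write uses
      mask = 0x3 << (2 * 3) == 0x00C0    -- both R and W bits for addr+3 */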
   2548 
   2549 /* tags are separated from lines.  tags are Addrs and are
   2550    the base address of the line. */
   2551 typedef
   2552    struct {
   2553       UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */
   2554    }
   2555    FiLine;
   2556 
   2557 typedef
   2558    struct {
   2559       Addr   tags[FI_NUM_LINES];
   2560       FiLine lines[FI_NUM_LINES];
   2561    }
   2562    Filter;
   2563 
   2564 /* Forget everything we know -- clear the filter and let everything
   2565    through.  This needs to be as fast as possible, since it is called
   2566    every time the running thread changes, and every time a thread's
   2567    vector clocks change, which can be quite frequent.  The obvious
   2568    fast way to do this is simply to stuff in tags which we know are
   2569    not going to match anything, since they're not aligned to the start
   2570    of a line. */
   2571 static void Filter__clear ( Filter* fi, HChar* who )
   2572 {
   2573    UWord i;
   2574    if (0) VG_(printf)("  Filter__clear(%p, %s)\n", fi, who);
   2575    for (i = 0; i < FI_NUM_LINES; i += 8) {
   2576       fi->tags[i+0] = 1; /* impossible value -- cannot match */
   2577       fi->tags[i+1] = 1;
   2578       fi->tags[i+2] = 1;
   2579       fi->tags[i+3] = 1;
   2580       fi->tags[i+4] = 1;
   2581       fi->tags[i+5] = 1;
   2582       fi->tags[i+6] = 1;
   2583       fi->tags[i+7] = 1;
   2584    }
   2585    tl_assert(i == FI_NUM_LINES);
   2586 }
   2587 
   2588 /* Clearing an arbitrary range in the filter.  Unfortunately
   2589    we have to do this due to core-supplied new/die-mem events. */
   2590 
   2591 static void Filter__clear_1byte ( Filter* fi, Addr a )
   2592 {
   2593    Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   2594    UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   2595    FiLine* line   = &fi->lines[lineno];
   2596    UWord   loff   = (a - atag) / 8;
   2597    UShort  mask   = 0x3 << (2 * (a & 7));
   2598    /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
   2599    if (LIKELY( fi->tags[lineno] == atag )) {
   2600       /* hit.  clear the bits. */
   2601       UShort  u16  = line->u16s[loff];
   2602       line->u16s[loff] = u16 & ~mask; /* clear them */
   2603    } else {
   2604       /* miss.  The filter doesn't hold this address, so ignore. */
   2605    }
   2606 }
   2607 
   2608 static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a )
   2609 {
   2610    Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   2611    UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   2612    FiLine* line   = &fi->lines[lineno];
   2613    UWord   loff   = (a - atag) / 8;
   2614    if (LIKELY( fi->tags[lineno] == atag )) {
   2615       line->u16s[loff] = 0;
   2616    } else {
   2617     /* miss.  The filter doesn't hold this address, so ignore. */
   2618    }
   2619 }
   2620 
   2621 static void Filter__clear_range ( Filter* fi, Addr a, UWord len )
   2622 {
   2623   //VG_(printf)("%lu ", len);
   2624    /* slowly do part preceding 8-alignment */
   2625    while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) {
   2626       Filter__clear_1byte( fi, a );
   2627       a++;
   2628       len--;
   2629    }
   2630    /* vector loop */
   2631    while (len >= 8) {
   2632       Filter__clear_8bytes_aligned( fi, a );
   2633       a += 8;
   2634       len -= 8;
   2635    }
   2636    /* slowly do tail */
   2637    while (UNLIKELY(len > 0)) {
   2638       Filter__clear_1byte( fi, a );
   2639       a++;
   2640       len--;
   2641    }
   2642 }
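
/* Worked example (illustration only): Filter__clear_range(fi, 0x1003, 21)
   clears bytes 0x1003..0x1007 individually (5 iterations of the head
   loop), then 0x1008 and 0x1010 as whole 8-byte groups (2 iterations
   of the vector loop), and has nothing left for the tail loop. */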
   2643 
   2644 
   2645 /* ------ Read handlers for the filter. ------ */
   2646 
   2647 static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a )
   2648 {
   2649    if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
   2650       return False;
   2651    {
   2652      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   2653      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   2654      FiLine* line   = &fi->lines[lineno];
   2655      UWord   loff   = (a - atag) / 8;
   2656      UShort  mask   = 0xAAAA;
   2657      if (LIKELY( fi->tags[lineno] == atag )) {
   2658         /* hit.  check line and update. */
   2659         UShort u16  = line->u16s[loff];
   2660         Bool   ok   = (u16 & mask) == mask; /* all R bits set? */
   2661         line->u16s[loff] = u16 | mask; /* set them */
   2662         return ok;
   2663      } else {
   2664         /* miss.  nuke existing line and re-use it. */
   2665         UWord i;
   2666         fi->tags[lineno] = atag;
   2667         for (i = 0; i < FI_LINE_SZB / 8; i++)
   2668            line->u16s[i] = 0;
   2669         line->u16s[loff] = mask;
   2670         return False;
   2671      }
   2672    }
   2673 }
   2674 
   2675 static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a )
   2676 {
   2677    if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
   2678       return False;
   2679    {
   2680      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   2681      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   2682      FiLine* line   = &fi->lines[lineno];
   2683      UWord   loff   = (a - atag) / 8;
   2684      UShort  mask   = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */
   2685      if (LIKELY( fi->tags[lineno] == atag )) {
   2686         /* hit.  check line and update. */
   2687         UShort  u16  = line->u16s[loff];
   2688         Bool    ok   = (u16 & mask) == mask; /* 4 x R bits set? */
   2689         line->u16s[loff] = u16 | mask; /* set them */
   2690         return ok;
   2691      } else {
   2692         /* miss.  nuke existing line and re-use it. */
   2693         UWord   i;
   2694         fi->tags[lineno] = atag;
   2695         for (i = 0; i < FI_LINE_SZB / 8; i++)
   2696            line->u16s[i] = 0;
   2697         line->u16s[loff] = mask;
   2698         return False;
   2699      }
   2700    }
   2701 }
   2702 
   2703 static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a )
   2704 {
   2705    if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
   2706       return False;
   2707    {
   2708      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   2709      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   2710      FiLine* line   = &fi->lines[lineno];
   2711      UWord   loff   = (a - atag) / 8;
   2712      UShort  mask   = 0xA << (2 * (a & 6));
   2713      /* mask is A000, 0A00, 00A0 or 000A */
   2714      if (LIKELY( fi->tags[lineno] == atag )) {
   2715         /* hit.  check line and update. */
   2716         UShort  u16  = line->u16s[loff];
   2717         Bool    ok   = (u16 & mask) == mask; /* 2 x R bits set? */
   2718         line->u16s[loff] = u16 | mask; /* set them */
   2719         return ok;
   2720      } else {
   2721         /* miss.  nuke existing line and re-use it. */
   2722         UWord   i;
   2723         fi->tags[lineno] = atag;
   2724         for (i = 0; i < FI_LINE_SZB / 8; i++)
   2725            line->u16s[i] = 0;
   2726         line->u16s[loff] = mask;
   2727         return False;
   2728      }
   2729    }
   2730 }
   2731 
   2732 static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a )
   2733 {
   2734    {
   2735      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   2736      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   2737      FiLine* line   = &fi->lines[lineno];
   2738      UWord   loff   = (a - atag) / 8;
   2739      UShort  mask   = 0x2 << (2 * (a & 7));
   2740      /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */
   2741      if (LIKELY( fi->tags[lineno] == atag )) {
   2742         /* hit.  check line and update. */
   2743         UShort  u16  = line->u16s[loff];
   2744         Bool    ok   = (u16 & mask) == mask; /* 1 x R bits set? */
   2745         line->u16s[loff] = u16 | mask; /* set them */
   2746         return ok;
   2747      } else {
   2748         /* miss.  nuke existing line and re-use it. */
   2749         UWord   i;
   2750         fi->tags[lineno] = atag;
   2751         for (i = 0; i < FI_LINE_SZB / 8; i++)
   2752            line->u16s[i] = 0;
   2753         line->u16s[loff] = mask;
   2754         return False;
   2755      }
   2756    }
   2757 }
   2758 
   2759 
   2760 /* ------ Write handlers for the filter. ------ */
   2761 
   2762 static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a )
   2763 {
   2764    if (UNLIKELY( !VG_IS_8_ALIGNED(a) ))
   2765       return False;
   2766    {
   2767      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   2768      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   2769      FiLine* line   = &fi->lines[lineno];
   2770      UWord   loff   = (a - atag) / 8;
   2771      UShort  mask   = 0xFFFF;
   2772      if (LIKELY( fi->tags[lineno] == atag )) {
   2773         /* hit.  check line and update. */
   2774         UShort u16  = line->u16s[loff];
   2775         Bool   ok   = (u16 & mask) == mask; /* all R & W bits set? */
   2776         line->u16s[loff] = u16 | mask; /* set them */
   2777         return ok;
   2778      } else {
   2779         /* miss.  nuke existing line and re-use it. */
   2780         UWord i;
   2781         fi->tags[lineno] = atag;
   2782         for (i = 0; i < FI_LINE_SZB / 8; i++)
   2783            line->u16s[i] = 0;
   2784         line->u16s[loff] = mask;
   2785         return False;
   2786      }
   2787    }
   2788 }
   2789 
   2790 static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a )
   2791 {
   2792    if (UNLIKELY( !VG_IS_4_ALIGNED(a) ))
   2793       return False;
   2794    {
   2795      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   2796      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   2797      FiLine* line   = &fi->lines[lineno];
   2798      UWord   loff   = (a - atag) / 8;
   2799      UShort  mask   = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */
   2800      if (LIKELY( fi->tags[lineno] == atag )) {
   2801         /* hit.  check line and update. */
   2802         UShort  u16  = line->u16s[loff];
   2803         Bool    ok   = (u16 & mask) == mask; /* 4 x R & W bits set? */
   2804         line->u16s[loff] = u16 | mask; /* set them */
   2805         return ok;
   2806      } else {
   2807         /* miss.  nuke existing line and re-use it. */
   2808         UWord   i;
   2809         fi->tags[lineno] = atag;
   2810         for (i = 0; i < FI_LINE_SZB / 8; i++)
   2811            line->u16s[i] = 0;
   2812         line->u16s[loff] = mask;
   2813         return False;
   2814      }
   2815    }
   2816 }
   2817 
   2818 static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a )
   2819 {
   2820    if (UNLIKELY( !VG_IS_2_ALIGNED(a) ))
   2821       return False;
   2822    {
   2823      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   2824      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   2825      FiLine* line   = &fi->lines[lineno];
   2826      UWord   loff   = (a - atag) / 8;
   2827      UShort  mask   = 0xF << (2 * (a & 6));
   2828      /* mask is F000, 0F00, 00F0 or 000F */
   2829      if (LIKELY( fi->tags[lineno] == atag )) {
   2830         /* hit.  check line and update. */
   2831         UShort  u16  = line->u16s[loff];
   2832         Bool    ok   = (u16 & mask) == mask; /* 2 x R & W bits set? */
   2833         line->u16s[loff] = u16 | mask; /* set them */
   2834         return ok;
   2835      } else {
   2836         /* miss.  nuke existing line and re-use it. */
   2837         UWord   i;
   2838         fi->tags[lineno] = atag;
   2839         for (i = 0; i < FI_LINE_SZB / 8; i++)
   2840            line->u16s[i] = 0;
   2841         line->u16s[loff] = mask;
   2842         return False;
   2843      }
   2844    }
   2845 }
   2846 
   2847 static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a )
   2848 {
   2849    {
   2850      Addr    atag   = FI_GET_TAG(a);     /* tag of 'a' */
   2851      UWord   lineno = FI_GET_LINENO(a);  /* lineno for 'a' */
   2852      FiLine* line   = &fi->lines[lineno];
   2853      UWord   loff   = (a - atag) / 8;
   2854      UShort  mask   = 0x3 << (2 * (a & 7));
   2855      /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */
   2856      if (LIKELY( fi->tags[lineno] == atag )) {
   2857         /* hit.  check line and update. */
   2858         UShort  u16  = line->u16s[loff];
   2859         Bool    ok   = (u16 & mask) == mask; /* 1 x R & W bits set? */
   2860         line->u16s[loff] = u16 | mask; /* set them */
   2861         return ok;
   2862      } else {
   2863         /* miss.  nuke existing line and re-use it. */
   2864         UWord   i;
   2865         fi->tags[lineno] = atag;
   2866         for (i = 0; i < FI_LINE_SZB / 8; i++)
   2867            line->u16s[i] = 0;
   2868         line->u16s[loff] = mask;
   2869         return False;
   2870      }
   2871    }
   2872 }
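
        /* A worked illustration of the mask scheme used by the handlers
           above (a sketch only; this helper is hypothetical and not
           referenced elsewhere).  Each byte of an 8-aligned doubleword
           owns two adjacent bits in the corresponding 16-bit word of a
           FiLine: reads set and test only the upper bit of the pair,
           whereas writes set and test both bits.  So for the byte at
           offset 5 within its doubleword, a 1-byte read uses mask
           0x2 << 10 == 0x0800 and a 1-byte write uses 0x3 << 10 == 0x0C00. */
        #if 0
        static UShort Filter__example_mask ( Addr a, Bool isW )
        {
           UWord  byteNo = a & 7;            /* offset within the dword */
           UShort base   = isW ? 0x3 : 0x2;  /* both bits for W, upper bit only for R */
           return base << (2 * byteNo);
        }
        #endif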
   2873 
   2874 
   2875 /////////////////////////////////////////////////////////
   2876 //                                                     //
   2877 // Threads                                             //
   2878 //                                                     //
   2879 /////////////////////////////////////////////////////////
   2880 
   2881 // QQQ move this somewhere else
   2882 typedef  struct { ULong ull; ExeContext* ec; }  ULong_n_EC;
   2883 
   2884 /* How many of the above records to collect for each thread?  Older
   2885    ones are dumped when we run out of space.  62.5k requires 1MB per
   2886    thread, since each ULong_n_EC record is 16 bytes long.  When more
   2887    than N_KWs_N_STACKs_PER_THREAD are present, the older half are
   2888    deleted to make space.  Hence in the worst case we will be able to
   2889    produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2
   2890    Kw transitions (segments in this thread).  For the current setting
   2891    that gives a guaranteed stack for at least the last 31.25k
   2892    segments. */
   2893 #define N_KWs_N_STACKs_PER_THREAD 62500
   2894 
   2895 
   2896 struct _Thr {
   2897    /* Current VTSs for this thread.  They change as we go along.  viR
   2898       is the VTS to be used for reads, viW for writes.  Usually they
   2899       are the same, but can differ when we deal with reader-writer
   2900       locks.  It is always the case that
   2901          VtsID__cmpLEQ(viW,viR) == True
   2902       that is, viW must be the same as, or lagging behind, viR. */
   2903    VtsID viR;
   2904    VtsID viW;
   2905 
   2906    /* Is initially True, and is set to False after the thread really
   2907       has done a low-level exit. */
   2908    Bool still_alive;
   2909 
   2910    /* A filter that removes references for which we believe that
   2911       msmcread/msmcwrite will not change the state, nor report a
   2912       race. */
   2913    Filter* filter;
   2914 
   2915    /* opaque (to us) data we hold on behalf of the library's user. */
   2916    void* opaque;
   2917 
   2918    /* The ULongs (scalar Kws) in this accumulate in strictly
   2919       increasing order, without duplicates.  This is important because
   2920       we need to be able to find a given scalar Kw in this array
   2921       later, by binary search. */
   2922    XArray* /* ULong_n_EC */ local_Kws_n_stacks;
   2923 };
   2924 
   2925 static Thr* Thr__new ( void ) {
   2926    Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) );
   2927    thr->viR = VtsID_INVALID;
   2928    thr->viW = VtsID_INVALID;
   2929    thr->still_alive = True;
   2930    thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) );
   2931    /* We only really need this at history level 1, but unfortunately
   2932       this routine is called before the command line processing is
   2933       done (sigh), so we can't rely on HG_(clo_history_level) at this
   2934       point.  Hence always allocate it.  Bah. */
   2935    thr->local_Kws_n_stacks
   2936       = VG_(newXA)( HG_(zalloc),
   2937                     "libhb.Thr__new.3 (local_Kws_and_stacks)",
   2938                     HG_(free), sizeof(ULong_n_EC) );
   2939    return thr;
   2940 }
   2941 
   2942 static void note_local_Kw_n_stack_for ( Thr* thr )
   2943 {
   2944    Word       nPresent;
   2945    ULong_n_EC pair;
   2946    tl_assert(thr);
   2947 
   2948    // We only collect this info at history level 1 (approx)
   2949    if (HG_(clo_history_level) != 1)
   2950       return;
   2951 
   2952    /* This is the scalar Kw for thr. */
   2953    pair.ull = VtsID__indexAt( thr->viW, thr );
   2954    pair.ec  = main_get_EC( thr );
   2955    tl_assert(pair.ec);
   2956    tl_assert(thr->local_Kws_n_stacks);
   2957 
   2958    /* check that we're not adding duplicates */
   2959    nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
   2960 
   2961    /* Throw away old stacks, if necessary.  We can't accumulate stuff
   2962       indefinitely. */
   2963    if (nPresent >= N_KWs_N_STACKs_PER_THREAD) {
   2964       VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 );
   2965       nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks );
   2966       if (0)
   2967          VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p (!!! gc !!!)\n",
   2968                      thr, pair.ull, pair.ec );
   2969    }
   2970 
   2971    if (nPresent > 0) {
   2972       ULong_n_EC* prevPair
   2973          = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 );
   2974       tl_assert( prevPair->ull <= pair.ull );
   2975    }
   2976 
   2977    if (nPresent == 0)
   2978       pair.ec = NULL;
   2979 
   2980    VG_(addToXA)( thr->local_Kws_n_stacks, &pair );
   2981 
   2982    if (0)
   2983       VG_(printf)("LOCAL Kw: thr %p,  Kw %llu,  ec %p\n",
   2984                   thr, pair.ull, pair.ec );
   2985    if (0)
   2986       VG_(pp_ExeContext)(pair.ec);
   2987 }
   2988 
   2989 static Int cmp__ULong_n_EC__by_ULong ( ULong_n_EC* pair1, ULong_n_EC* pair2 )
   2990 {
   2991    if (pair1->ull < pair2->ull) return -1;
   2992    if (pair1->ull > pair2->ull) return 1;
   2993    return 0;
   2994 }
   2995 
   2996 
   2997 /////////////////////////////////////////////////////////
   2998 //                                                     //
   2999 // Shadow Values                                       //
   3000 //                                                     //
   3001 /////////////////////////////////////////////////////////
   3002 
   3003 // type SVal, SVal_INVALID and SVal_NOACCESS are defined by
   3004 // hb_zsm.h.  We have to do everything else here.
   3005 
   3006 /* SVal is 64 bit unsigned int.
   3007 
   3008       <---------30--------->    <---------30--------->
   3009    00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X   C(Rmin,Wmin)
   3010    10 X--------------------X XX X--------------------X   A: SVal_NOACCESS
   3011    11 0--------------------0 00 0--------------------0   A: SVal_INVALID
   3012 
   3013 */
   3014 #define SVAL_TAGMASK (3ULL << 62)
   3015 
   3016 static inline Bool SVal__isC ( SVal s ) {
   3017    return (0ULL << 62) == (s & SVAL_TAGMASK);
   3018 }
   3019 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) {
   3020    //tl_assert(VtsID__is_valid(rmini));
   3021    //tl_assert(VtsID__is_valid(wmini));
   3022    return (((ULong)rmini) << 32) | ((ULong)wmini);
   3023 }
   3024 static inline VtsID SVal__unC_Rmin ( SVal s ) {
   3025    tl_assert(SVal__isC(s));
   3026    return (VtsID)(s >> 32);
   3027 }
   3028 static inline VtsID SVal__unC_Wmin ( SVal s ) {
   3029    tl_assert(SVal__isC(s));
   3030    return (VtsID)(s & 0xFFFFFFFFULL);
   3031 }
   3032 
   3033 static inline Bool SVal__isA ( SVal s ) {
   3034    return (2ULL << 62) == (s & SVAL_TAGMASK);
   3035 }
   3036 static inline SVal SVal__mkA ( void ) {
   3037    return 2ULL << 62;
   3038 }
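
        /* A quick sanity sketch of the packing above (illustrative only,
           not called anywhere): packing rmin == 5 and wmin == 7 yields
           0x0000000500000007, whose top two bits are 00, so it is a C
           value, and the two VtsIDs unpack again. */
        #if 0
        static void SVal__example ( void )
        {
           SVal s = SVal__mkC( 5, 7 );
           tl_assert( SVal__isC(s) );
           tl_assert( SVal__unC_Rmin(s) == 5 );
           tl_assert( SVal__unC_Wmin(s) == 7 );
        }
        #endif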
   3039 
   3040 /* Direct callback from lib_zsm. */
   3041 static void SVal__rcinc ( SVal s ) {
   3042    if (SVal__isC(s)) {
   3043       VtsID__rcinc( SVal__unC_Rmin(s) );
   3044       VtsID__rcinc( SVal__unC_Wmin(s) );
   3045    }
   3046 }
   3047 
   3048 /* Direct callback from lib_zsm. */
   3049 static void SVal__rcdec ( SVal s ) {
   3050    if (SVal__isC(s)) {
   3051       VtsID__rcdec( SVal__unC_Rmin(s) );
   3052       VtsID__rcdec( SVal__unC_Wmin(s) );
   3053    }
   3054 }
   3055 
   3056 
   3057 /////////////////////////////////////////////////////////
   3058 //                                                     //
   3059 // A simple group (memory) allocator                   //
   3060 //                                                     //
   3061 /////////////////////////////////////////////////////////
   3062 
   3063 //////////////// BEGIN general group allocator
   3064 typedef
   3065    struct {
   3066       UWord   elemSzB;        /* element size */
   3067       UWord   nPerGroup;      /* # elems per group */
   3068       void*   (*alloc)(HChar*, SizeT); /* group allocator */
   3069       HChar*  cc; /* group allocator's cc */
   3070       void    (*free)(void*); /* group allocator's free-er (unused) */
   3071       /* XArray of void* (pointers to groups).  The groups themselves.
   3072          Each element is a pointer to a block of size (elemSzB *
   3073          nPerGroup) bytes. */
   3074       XArray* groups;
   3075       /* next free element.  Is a pointer to an element in one of the
   3076          groups pointed to by .groups. */
   3077       void* nextFree;
   3078    }
   3079    GroupAlloc;
   3080 
   3081 static void init_GroupAlloc ( /*MOD*/GroupAlloc* ga,
   3082                               UWord  elemSzB,
   3083                               UWord  nPerGroup,
   3084                               void*  (*alloc)(HChar*, SizeT),
   3085                               HChar* cc,
   3086                               void   (*free)(void*) )
   3087 {
   3088    tl_assert(0 == (elemSzB % sizeof(UWord)));
   3089    tl_assert(elemSzB >= sizeof(UWord));
   3090    tl_assert(nPerGroup >= 100); /* let's say */
   3091    tl_assert(alloc);
   3092    tl_assert(cc);
   3093    tl_assert(free);
   3094    tl_assert(ga);
   3095    VG_(memset)(ga, 0, sizeof(*ga));
   3096    ga->elemSzB   = elemSzB;
   3097    ga->nPerGroup = nPerGroup;
   3098    ga->groups    = NULL;
   3099    ga->alloc     = alloc;
   3100    ga->cc        = cc;
   3101    ga->free      = free;
   3102    ga->groups    = VG_(newXA)( alloc, cc, free, sizeof(void*) );
   3103    ga->nextFree  = NULL;
   3104    tl_assert(ga->groups);
   3105 }
   3106 
   3107 /* The freelist is empty.  Allocate a new group and put all the new
   3108    elements in it onto the freelist. */
   3109 __attribute__((noinline))
   3110 static void gal_add_new_group ( GroupAlloc* ga )
   3111 {
   3112    Word   i;
   3113    UWord* group;
   3114    tl_assert(ga);
   3115    tl_assert(ga->nextFree == NULL);
   3116    group = ga->alloc( ga->cc, ga->elemSzB * ga->nPerGroup );
   3117    tl_assert(group);
   3118    /* extend the freelist through the new group.  Place the freelist
   3119       pointer in the first word of each element.  That's why the
   3120       element size must be at least one word. */
   3121    for (i = ga->nPerGroup-1; i >= 0; i--) {
   3122       UChar* elemC = ((UChar*)group) + i * ga->elemSzB;
   3123       UWord* elem  = (UWord*)elemC;
   3124       tl_assert(0 == (((UWord)elem) % sizeof(UWord)));
   3125       *elem = (UWord)ga->nextFree;
   3126       ga->nextFree = elem;
   3127    }
   3128    /* and add to our collection of groups */
   3129    VG_(addToXA)( ga->groups, &group );
   3130 }
   3131 
   3132 inline static void* gal_Alloc ( GroupAlloc* ga )
   3133 {
   3134    UWord* elem;
   3135    if (UNLIKELY(ga->nextFree == NULL)) {
   3136       gal_add_new_group(ga);
   3137    }
   3138    elem = ga->nextFree;
   3139    ga->nextFree = (void*)*elem;
   3140    *elem = 0; /* unnecessary, but just to be on the safe side */
   3141    return elem;
   3142 }
   3143 
   3144 inline static void* gal_Alloc_w_size_check ( GroupAlloc* ga, SizeT n )
   3145 {
   3146    tl_assert(n == ga->elemSzB);
   3147    return gal_Alloc( ga );
   3148 }
   3149 
   3150 inline static void gal_Free ( GroupAlloc* ga, void* p )
   3151 {
   3152    UWord* elem = (UWord*)p;
   3153    *elem = (UWord)ga->nextFree;
   3154    ga->nextFree = elem;
   3155 }
   3156 //////////////// END general group allocator
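
        /* Typical usage of the group allocator (a sketch with a made-up
           element type; the real clients are the RCEC and OldRef group
           allocators further below): */
        #if 0
        typedef struct { UWord w1; UWord w2; } Elem;   /* hypothetical */
        static GroupAlloc elem_group_allocator;
        static void example_GroupAlloc_usage ( void )
        {
           Elem* e;
           init_GroupAlloc( &elem_group_allocator, sizeof(Elem),
                            1000/*elems per group*/, HG_(zalloc),
                            "libhb.example (Elem groups)", HG_(free) );
           e = gal_Alloc_w_size_check( &elem_group_allocator, sizeof(Elem) );
           /* ... use *e ... */
           gal_Free( &elem_group_allocator, e );
        }
        #endif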
   3157 
   3158 
   3159 /////////////////////////////////////////////////////////
   3160 //                                                     //
   3161 // Change-event map2                                   //
   3162 //                                                     //
   3163 /////////////////////////////////////////////////////////
   3164 
   3165 #define EVENT_MAP_GC_DISCARD_FRACTION  0.5
   3166 
   3167 /* This is in two parts:
   3168 
   3169    1. A hash table of RCECs.  This is a set of reference-counted stack
   3170       traces.  When the reference count of a stack trace becomes zero,
   3171       it is removed from the set and freed up.  The intent is to have
   3172       a set of stack traces which can be referred to from (2), but to
   3173       only represent each one once.  The set is indexed/searched by
   3174       ordering on the stack trace vectors.
   3175 
   3176    2. A SparseWA of OldRefs.  These store information about each old
   3177       ref that we need to record.  It is indexed by address of the
   3178       location for which the information is recorded.  For LRU
   3179       purposes, each OldRef also contains a generation number,
   3180       indicating when it was most recently accessed.
   3181 
   3182       The important part of an OldRef is, however, its accs[] array.
   3183       This is an array of N_OLDREF_ACCS which binds (thread, R/W,
   3184       size) triples to RCECs.  This allows us to collect the last
   3185       access-traceback by up to N_OLDREF_ACCS different triples for
   3186       this location.  The accs[] array is a MTF-array.  If a binding
   3187       falls off the end, that's too bad -- we will lose info about
   3188       that triple's access to this location.
   3189 
   3190       When the SparseWA becomes too big, we can throw away the OldRefs
   3191       whose generation numbers are below some threshold; hence doing
   3192       approximate LRU discarding.  For each discarded OldRef we must
   3193    of course decrement the reference count on all the RCECs it
   3194       refers to, in order that entries from (1) eventually get
   3195       discarded too.
   3196 
   3197    A major improvement in reliability of this mechanism would be to
   3198    have a dynamically sized OldRef.accs[] array, so no entries ever
   3199    fall off the end.  In investigations (Dec 08) it appears that a
   3200    major cause of the non-availability of conflicting-access traces
   3201    in race reports is the fixed size of this array.  I
   3202    suspect for most OldRefs, only a few entries are used, but for a
   3203    minority of cases there is an overflow, leading to info lossage.
   3204    Investigations also suggest this is very workload and scheduling
   3205    sensitive.  Therefore a dynamic sizing would be better.
   3206 
   3207    However, dynamic sizing would defeat the use of a GroupAllocator
   3208    for OldRef structures.  And that's important for performance.  So
   3209    it's not straightforward to do.
   3210 */
   3211 
   3212 
   3213 static UWord stats__ctxt_rcdec1 = 0;
   3214 static UWord stats__ctxt_rcdec2 = 0;
   3215 static UWord stats__ctxt_rcdec3 = 0;
   3216 static UWord stats__ctxt_rcdec_calls = 0;
   3217 static UWord stats__ctxt_rcdec_discards = 0;
   3218 static UWord stats__ctxt_rcdec1_eq = 0;
   3219 
   3220 static UWord stats__ctxt_tab_curr = 0;
   3221 static UWord stats__ctxt_tab_max  = 0;
   3222 
   3223 static UWord stats__ctxt_tab_qs   = 0;
   3224 static UWord stats__ctxt_tab_cmps = 0;
   3225 
   3226 
   3227 ///////////////////////////////////////////////////////
   3228 //// Part (1): A hash table of RCECs
   3229 ///
   3230 
   3231 #define N_FRAMES 8
   3232 
   3233 // (UInt) `echo "Reference Counted Execution Context" | md5sum`
   3234 #define RCEC_MAGIC 0xab88abb2UL
   3235 
   3236 //#define N_RCEC_TAB 98317 /* prime */
   3237 #define N_RCEC_TAB 196613 /* prime */
   3238 
   3239 typedef
   3240    struct _RCEC {
   3241       UWord magic;  /* sanity check only */
   3242       struct _RCEC* next;
   3243       UWord rc;
   3244       UWord rcX; /* used for crosschecking */
   3245       UWord frames_hash;          /* hash of all the frames */
   3246       UWord frames[N_FRAMES];
   3247    }
   3248    RCEC;
   3249 
   3250 static RCEC** contextTab = NULL; /* hash table of RCEC*s */
   3251 
   3252 
   3253 /* Gives an arbitrary total order on RCEC .frames fields */
   3254 static Word RCEC__cmp_by_frames ( RCEC* ec1, RCEC* ec2 ) {
   3255    Word i;
   3256    tl_assert(ec1 && ec1->magic == RCEC_MAGIC);
   3257    tl_assert(ec2 && ec2->magic == RCEC_MAGIC);
   3258    if (ec1->frames_hash < ec2->frames_hash) return -1;
   3259    if (ec1->frames_hash > ec2->frames_hash) return  1;
   3260    for (i = 0; i < N_FRAMES; i++) {
   3261       if (ec1->frames[i] < ec2->frames[i]) return -1;
   3262       if (ec1->frames[i] > ec2->frames[i]) return  1;
   3263    }
   3264    return 0;
   3265 }
   3266 
   3267 
   3268 /* Dec the ref of this RCEC. */
   3269 static void ctxt__rcdec ( RCEC* ec )
   3270 {
   3271    stats__ctxt_rcdec_calls++;
   3272    tl_assert(ec && ec->magic == RCEC_MAGIC);
   3273    tl_assert(ec->rc > 0);
   3274    ec->rc--;
   3275 }
   3276 
   3277 static void ctxt__rcinc ( RCEC* ec )
   3278 {
   3279    tl_assert(ec && ec->magic == RCEC_MAGIC);
   3280    ec->rc++;
   3281 }
   3282 
   3283 
   3284 //////////// BEGIN RCEC group allocator
   3285 static GroupAlloc rcec_group_allocator;
   3286 
   3287 static RCEC* alloc_RCEC ( void ) {
   3288    return gal_Alloc ( &rcec_group_allocator );
   3289 }
   3290 
   3291 static void free_RCEC ( RCEC* rcec ) {
   3292    tl_assert(rcec->magic == RCEC_MAGIC);
   3293    gal_Free( &rcec_group_allocator, rcec );
   3294 }
   3295 //////////// END RCEC group allocator
   3296 
   3297 
   3298 /* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and
   3299    move it one step closer to the front of the list, so as to make
   3300    subsequent searches for it cheaper. */
   3301 static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec )
   3302 {
   3303    RCEC *ec0, *ec1, *ec2;
   3304    if (ec == *headp)
   3305       tl_assert(0); /* already at head of list */
   3306    tl_assert(ec != NULL);
   3307    ec0 = *headp;
   3308    ec1 = NULL;
   3309    ec2 = NULL;
   3310    while (True) {
   3311       if (ec0 == NULL || ec0 == ec) break;
   3312       ec2 = ec1;
   3313       ec1 = ec0;
   3314       ec0 = ec0->next;
   3315    }
   3316    tl_assert(ec0 == ec);
   3317    if (ec0 != NULL && ec1 != NULL && ec2 != NULL) {
   3318       RCEC* tmp;
   3319       /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's
   3320          predecessor.  Swap ec0 and ec1, that is, move ec0 one step
   3321          closer to the start of the list. */
   3322       tl_assert(ec2->next == ec1);
   3323       tl_assert(ec1->next == ec0);
   3324       tmp = ec0->next;
   3325       ec2->next = ec0;
   3326       ec0->next = ec1;
   3327       ec1->next = tmp;
   3328    }
   3329    else
   3330    if (ec0 != NULL && ec1 != NULL && ec2 == NULL) {
   3331       /* it's second in the list. */
   3332       tl_assert(*headp == ec1);
   3333       tl_assert(ec1->next == ec0);
   3334       ec1->next = ec0->next;
   3335       ec0->next = ec1;
   3336       *headp = ec0;
   3337    }
   3338 }
   3339 
   3340 
   3341 /* Find the given RCEC in the tree, and return a pointer to it.  Or,
   3342    if not present, add the given one to the tree (by making a copy of
   3343    it, so the caller can immediately deallocate the original) and
   3344    return a pointer to the copy.  The caller can safely have 'example'
   3345    on its stack, since we will always return a pointer to a copy of
   3346    it, not to the original.  Note that the inserted node will have .rc
   3347    of zero and so the caller must immediately increment it. */
   3348 __attribute__((noinline))
   3349 static RCEC* ctxt__find_or_add ( RCEC* example )
   3350 {
   3351    UWord hent;
   3352    RCEC* copy;
   3353    tl_assert(example && example->magic == RCEC_MAGIC);
   3354    tl_assert(example->rc == 0);
   3355 
   3356    /* Search the hash table to see if we already have it. */
   3357    stats__ctxt_tab_qs++;
   3358    hent = example->frames_hash % N_RCEC_TAB;
   3359    copy = contextTab[hent];
   3360    while (1) {
   3361       if (!copy) break;
   3362       tl_assert(copy->magic == RCEC_MAGIC);
   3363       stats__ctxt_tab_cmps++;
   3364       if (0 == RCEC__cmp_by_frames(copy, example)) break;
   3365       copy = copy->next;
   3366    }
   3367 
   3368    if (copy) {
   3369       tl_assert(copy != example);
   3370       /* optimisation: if it's not at the head of its list, move 1
   3371          step fwds, to make future searches cheaper */
   3372       if (copy != contextTab[hent]) {
   3373          move_RCEC_one_step_forward( &contextTab[hent], copy );
   3374       }
   3375    } else {
   3376       copy = alloc_RCEC();
   3377       tl_assert(copy != example);
   3378       *copy = *example;
   3379       copy->next = contextTab[hent];
   3380       contextTab[hent] = copy;
   3381       stats__ctxt_tab_curr++;
   3382       if (stats__ctxt_tab_curr > stats__ctxt_tab_max)
   3383          stats__ctxt_tab_max = stats__ctxt_tab_curr;
   3384    }
   3385    return copy;
   3386 }
   3387 
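        /* Note: ROLW below assumes 1 <= n <= bits-per-word - 1 (its single
           caller uses n == 19); n == 0 would make the right shift by bpw
           undefined. */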
   3388 static inline UWord ROLW ( UWord w, Int n )
   3389 {
   3390    Int bpw = 8 * sizeof(UWord);
   3391    w = (w << n) | (w >> (bpw-n));
   3392    return w;
   3393 }
   3394 
   3395 __attribute__((noinline))
   3396 static RCEC* get_RCEC ( Thr* thr )
   3397 {
   3398    UWord hash, i;
   3399    RCEC  example;
   3400    example.magic = RCEC_MAGIC;
   3401    example.rc = 0;
   3402    example.rcX = 0;
   3403    main_get_stacktrace( thr, &example.frames[0], N_FRAMES );
   3404    hash = 0;
   3405    for (i = 0; i < N_FRAMES; i++) {
   3406       hash ^= example.frames[i];
   3407       hash = ROLW(hash, 19);
   3408    }
   3409    example.frames_hash = hash;
   3410    return ctxt__find_or_add( &example );
   3411 }
   3412 
   3413 ///////////////////////////////////////////////////////
   3414 //// Part (2):
   3415 ///  A SparseWA guest-addr -> OldRef, that refers to (1)
   3416 ///
   3417 
   3418 // (UInt) `echo "Old Reference Information" | md5sum`
   3419 #define OldRef_MAGIC 0x30b1f075UL
   3420 
   3421 /* Records an access: a thread and a context.  The size
   3422    (1,2,4,8) and read-or-writeness are also encoded as
   3423    follows: bottom bit of .thr is 1 if write, 0 if read
   3424             bottom 2 bits of .rcec encode the size:
   3425             00 = 1, 01 = 2, 10 = 4, 11 = 8
   3426 */
   3427 typedef  struct { Thr* thr; RCEC* rcec; }  Thr_n_RCEC;
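
        /* For example (a sketch of the encoding above): a 4-byte write by
           thread T with stack context R would be stored with
           .thr == (Thr*)((UWord)T | 1) and .rcec == (RCEC*)((UWord)R | 2). */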
   3428 
   3429 #define N_OLDREF_ACCS 5
   3430 
   3431 typedef
   3432    struct {
   3433       UWord magic;  /* sanity check only */
   3434       UWord gen;    /* when most recently accessed */
   3435                     /* or free list when not in use */
   3436       /* unused slots in this array have .thr == NULL */
   3437       Thr_n_RCEC accs[N_OLDREF_ACCS];
   3438    }
   3439    OldRef;
   3440 
   3441 
   3442 //////////// BEGIN OldRef group allocator
   3443 static GroupAlloc oldref_group_allocator;
   3444 
   3445 static OldRef* alloc_OldRef ( void ) {
   3446    return gal_Alloc ( &oldref_group_allocator );
   3447 }
   3448 
   3449 static void free_OldRef ( OldRef* r ) {
   3450    tl_assert(r->magic == OldRef_MAGIC);
   3451    gal_Free( &oldref_group_allocator, r );
   3452 }
   3453 //////////// END OldRef group allocator
   3454 
   3455 
   3456 static SparseWA* oldrefTree     = NULL; /* SparseWA* OldRef* */
   3457 static UWord     oldrefGen      = 0;    /* current LRU generation # */
   3458 static UWord     oldrefTreeN    = 0;    /* # elems in oldrefTree */
   3459 static UWord     oldrefGenIncAt = 0;    /* inc gen # when size hits this */
   3460 
   3461 inline static void* ptr_or_UWord ( void* p, UWord w ) {
   3462    return (void*)( ((UWord)p) | ((UWord)w) );
   3463 }
   3464 inline static void* ptr_and_UWord ( void* p, UWord w ) {
   3465    return (void*)( ((UWord)p) & ((UWord)w) );
   3466 }
   3467 
   3468 inline static UInt min_UInt ( UInt a, UInt b ) {
   3469    return a < b ? a : b;
   3470 }
   3471 
   3472 /* Compare the intervals [a1,a1+n1) and [a2,a2+n2).  Return -1 if the
   3473    first interval is lower, 1 if the first interval is higher, and 0
   3474    if there is any overlap.  The redundant casts are paranoia, added
   3475    after what looked distinctly like a bug in gcc-4.1.2, in which
   3476    some of the comparisons were done signedly instead of
   3477    unsignedly. */
   3478 /* Copied from exp-ptrcheck/sg_main.c */
   3479 static Word cmp_nonempty_intervals ( Addr a1, SizeT n1,
   3480                                      Addr a2, SizeT n2 ) {
   3481    UWord a1w = (UWord)a1;
   3482    UWord n1w = (UWord)n1;
   3483    UWord a2w = (UWord)a2;
   3484    UWord n2w = (UWord)n2;
   3485    tl_assert(n1w > 0 && n2w > 0);
   3486    if (a1w + n1w <= a2w) return -1L;
   3487    if (a2w + n2w <= a1w) return 1L;
   3488    return 0;
   3489 }
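
        /* For example (a sketch): cmp_nonempty_intervals(0x1000,4, 0x1004,2)
           returns -1 (disjoint, first interval below the second), whereas
           cmp_nonempty_intervals(0x1000,4, 0x1002,8) returns 0 (overlap). */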
   3490 
   3491 static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr )
   3492 {
   3493    OldRef* ref;
   3494    RCEC*   rcec;
   3495    Word    i, j;
   3496    UWord   keyW, valW;
   3497    Bool    b;
   3498 
   3499    rcec = get_RCEC( thr );
   3500    ctxt__rcinc(rcec);
   3501 
   3502    /* encode the size and writeness of the transaction in the bottom
   3503       two bits of thr and rcec. */
   3504    thr = ptr_or_UWord(thr, isW ? 1 : 0);
   3505    switch (szB) {
   3506       /* This doesn't look particularly branch-predictor friendly. */
   3507       case 1:  rcec = ptr_or_UWord(rcec, 0); break;
   3508       case 2:  rcec = ptr_or_UWord(rcec, 1); break;
   3509       case 4:  rcec = ptr_or_UWord(rcec, 2); break;
   3510       case 8:  rcec = ptr_or_UWord(rcec, 3); break;
   3511       default: tl_assert(0);
   3512    }
   3513 
   3514    /* Look in the map to see if we already have this. */
   3515    b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, a );
   3516 
   3517    if (b) {
   3518 
   3519       /* We already have a record for this address.  We now need to
   3520          see if we have a stack trace pertaining to this (thread, R/W,
   3521          size) triple. */
   3522       tl_assert(keyW == a);
   3523       ref = (OldRef*)valW;
   3524       tl_assert(ref->magic == OldRef_MAGIC);
   3525 
   3526       tl_assert(thr);
   3527       for (i = 0; i < N_OLDREF_ACCS; i++) {
   3528          if (ref->accs[i].thr != thr)
   3529             continue;
   3530          /* since .thr encodes both the accessing thread and the
   3531             read/writeness, we know now that at least those features
   3532             of the access match this entry.  So we just need to check
   3533             the size indication.  Do this by inspecting the lowest 2 bits of
   3534             .rcec, which contain the encoded size info. */
   3535          if (ptr_and_UWord(ref->accs[i].rcec,3) != ptr_and_UWord(rcec,3))
   3536             continue;
   3537          /* else we have a match, so stop looking. */
   3538          break;
   3539       }
   3540 
   3541       if (i < N_OLDREF_ACCS) {
   3542          /* thread 'thr' has an entry at index 'i'.  Update it. */
   3543          if (i > 0) {
   3544             Thr_n_RCEC tmp = ref->accs[i-1];
   3545             ref->accs[i-1] = ref->accs[i];
   3546             ref->accs[i] = tmp;
   3547             i--;
   3548          }
   3549          if (rcec == ref->accs[i].rcec) stats__ctxt_rcdec1_eq++;
   3550          stats__ctxt_rcdec1++;
   3551          ctxt__rcdec( ptr_and_UWord(ref->accs[i].rcec, ~3) );
   3552          ref->accs[i].rcec = rcec;
   3553          tl_assert(ref->accs[i].thr == thr);
   3554       } else {
   3555          /* No entry for this (thread, R/W, size) triple.  Shuffle all
   3556             of them down one slot, and put the new entry at the start
   3557             of the array. */
   3558          if (ref->accs[N_OLDREF_ACCS-1].thr) {
   3559             /* the last slot is in use.  We must dec the rc on the
   3560                associated rcec. */
   3561             tl_assert(ref->accs[N_OLDREF_ACCS-1].rcec);
   3562             stats__ctxt_rcdec2++;
   3563             if (0 && 0 == (stats__ctxt_rcdec2 & 0xFFF))
   3564                VG_(printf)("QQQQ %lu overflows\n",stats__ctxt_rcdec2);
   3565             ctxt__rcdec( ptr_and_UWord(ref->accs[N_OLDREF_ACCS-1].rcec, ~3) );
   3566          } else {
   3567             tl_assert(!ref->accs[N_OLDREF_ACCS-1].rcec);
   3568          }
   3569          for (j = N_OLDREF_ACCS-1; j >= 1; j--)
   3570             ref->accs[j] = ref->accs[j-1];
   3571          ref->accs[0].thr = thr;
   3572          ref->accs[0].rcec = rcec;
   3573          /* thr==NULL is used to signify an empty slot, so we can't
   3574             add a NULL thr. */
   3575          tl_assert(ptr_and_UWord(thr, ~3) != 0);
   3576       }
   3577 
   3578       ref->gen = oldrefGen;
   3579 
   3580    } else {
   3581 
   3582       /* We don't have a record for this address.  Create a new one. */
   3583       if (oldrefTreeN >= oldrefGenIncAt) {
   3584          oldrefGen++;
   3585          oldrefGenIncAt = oldrefTreeN + 50000;
   3586          if (0) VG_(printf)("oldrefTree: new gen %lu at size %lu\n",
   3587                             oldrefGen, oldrefTreeN );
   3588       }
   3589 
   3590       ref = alloc_OldRef();
   3591       ref->magic = OldRef_MAGIC;
   3592       ref->gen = oldrefGen;
   3593       ref->accs[0].rcec = rcec;
   3594       ref->accs[0].thr = thr;
   3595       /* thr==NULL is used to signify an empty slot, so we can't add a
   3596          NULL thr. */
   3597       tl_assert(ptr_and_UWord(thr, ~3) != 0);
   3598       for (j = 1; j < N_OLDREF_ACCS; j++) {
   3599          ref->accs[j].thr = NULL;
   3600          ref->accs[j].rcec = NULL;
   3601       }
   3602       VG_(addToSWA)( oldrefTree, a, (UWord)ref );
   3603       oldrefTreeN++;
   3604 
   3605    }
   3606 }
   3607 
   3608 
   3609 Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC,
   3610                               /*OUT*/Thr**  resThr,
   3611                               /*OUT*/SizeT* resSzB,
   3612                               /*OUT*/Bool*  resIsW,
   3613                               Thr* thr, Addr a, SizeT szB, Bool isW )
   3614 {
   3615    Word    i, j;
   3616    OldRef* ref;
   3617    UWord   keyW, valW;
   3618    Bool    b;
   3619 
   3620    Thr*    cand_thr;
   3621    RCEC*   cand_rcec;
   3622    Bool    cand_isW;
   3623    SizeT   cand_szB;
   3624    Addr    cand_a;
   3625 
   3626    Addr toCheck[15];
   3627    Int  nToCheck = 0;
   3628 
   3629    tl_assert(thr);
   3630    tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1);
   3631 
   3632    toCheck[nToCheck++] = a;
   3633    for (i = -7; i < (Word)szB; i++) {
   3634       if (i != 0)
   3635          toCheck[nToCheck++] = a + i;
   3636    }
   3637    tl_assert(nToCheck <= 15);
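
           /* Rationale for the candidate set: any recorded access is of
              size at most 8, so an old access overlapping [a, a+szB) must
              start somewhere in [a-7 .. a+szB-1]; with szB <= 8 that gives
              at most 15 distinct addresses. */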
   3638 
   3639    /* Now see if we can find a suitable matching event for
   3640       any of the addresses in toCheck[0 .. nToCheck-1]. */
   3641    for (j = 0; j < nToCheck; j++) {
   3642 
   3643       cand_a = toCheck[j];
   3644       //      VG_(printf)("test %ld %p\n", j, cand_a);
   3645 
   3646       b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, cand_a );
   3647       if (!b)
   3648          continue;
   3649 
   3650       ref = (OldRef*)valW;
   3651       tl_assert(keyW == cand_a);
   3652       tl_assert(ref->magic == OldRef_MAGIC);
   3653       tl_assert(ref->accs[0].thr); /* first slot must always be used */
   3654 
   3655       cand_thr  = NULL;
   3656       cand_rcec = NULL;
   3657       cand_isW  = False;
   3658       cand_szB  = 0;
   3659 
   3660       for (i = 0; i < N_OLDREF_ACCS; i++) {
   3661          Thr_n_RCEC* cand = &ref->accs[i];
   3662          cand_thr  = ptr_and_UWord(cand->thr, ~3);
   3663          cand_rcec = ptr_and_UWord(cand->rcec, ~3);
   3664          /* Decode the writeness from the bottom bit of .thr. */
   3665          cand_isW = 1 == (UWord)ptr_and_UWord(cand->thr, 1);
   3666          /* Decode the size from the bottom two bits of .rcec. */
   3667          switch ((UWord)ptr_and_UWord(cand->rcec, 3)) {
   3668             case 0:  cand_szB = 1; break;
   3669             case 1:  cand_szB = 2; break;
   3670             case 2:  cand_szB = 4; break;
   3671             case 3:  cand_szB = 8; break;
   3672             default: tl_assert(0);
   3673          }
   3674 
   3675          if (cand_thr == NULL)
   3676             /* This slot isn't in use.  Ignore it. */
   3677             continue;
   3678 
   3679          if (cand_thr == thr)
   3680             /* This is an access by the same thread, but we're only
   3681                interested in accesses from other threads.  Ignore. */
   3682             continue;
   3683 
   3684          if ((!cand_isW) && (!isW))
   3685             /* We don't want to report a read racing against another
   3686                read; that's stupid.  So in this case move on. */
   3687             continue;
   3688 
   3689          if (cmp_nonempty_intervals(a, szB, cand_a, cand_szB) != 0)
   3690             /* No overlap with the access we're asking about.  Ignore. */
   3691             continue;
   3692 
   3693          /* We have a match.  Stop searching. */
   3694          break;
   3695       }
   3696 
   3697       tl_assert(i >= 0 && i <= N_OLDREF_ACCS);
   3698 
   3699       if (i < N_OLDREF_ACCS) {
   3700          Int n, maxNFrames;
   3701          /* return with success */
   3702          tl_assert(cand_thr);
   3703          tl_assert(cand_rcec);
   3704          tl_assert(cand_rcec->magic == RCEC_MAGIC);
   3705          tl_assert(cand_szB >= 1);
   3706          /* Count how many non-zero frames we have. */
   3707          maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size));
   3708          for (n = 0; n < maxNFrames; n++) {
   3709             if (0 == cand_rcec->frames[n]) break;
   3710          }
   3711          *resEC  = VG_(make_ExeContext_from_StackTrace)(cand_rcec->frames, n);
   3712          *resThr = cand_thr;
   3713          *resSzB = cand_szB;
   3714          *resIsW = cand_isW;
   3715          return True;
   3716       }
   3717 
   3718       /* consider next address in toCheck[] */
   3719    } /* for (j = 0; j < nToCheck; j++) */
   3720 
   3721    /* really didn't find anything. */
   3722    return False;
   3723 }
   3724 
   3725 static void event_map_init ( void )
   3726 {
   3727    Word i;
   3728 
   3729    /* Context (RCEC) group allocator */
   3730    init_GroupAlloc ( &rcec_group_allocator,
   3731                      sizeof(RCEC),
   3732                      1000 /* RCECs per group */,
   3733                      HG_(zalloc),
   3734                      "libhb.event_map_init.1 (RCEC groups)",
   3735                      HG_(free) );
   3736 
   3737    /* Context table */
   3738    tl_assert(!contextTab);
   3739    contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)",
   3740                              N_RCEC_TAB * sizeof(RCEC*) );
   3741    tl_assert(contextTab);
   3742    for (i = 0; i < N_RCEC_TAB; i++)
   3743       contextTab[i] = NULL;
   3744 
   3745    /* Oldref group allocator */
   3746    init_GroupAlloc ( &oldref_group_allocator,
   3747                      sizeof(OldRef),
   3748                      1000 /* OldRefs per group */,
   3749                      HG_(zalloc),
   3750                      "libhb.event_map_init.3 (OldRef groups)",
   3751                      HG_(free) );
   3752 
   3753    /* Oldref tree */
   3754    tl_assert(!oldrefTree);
   3755    oldrefTree = VG_(newSWA)(
   3756                    HG_(zalloc),
   3757                    "libhb.event_map_init.4 (oldref tree)",
   3758                    HG_(free)
   3759                 );
   3760    tl_assert(oldrefTree);
   3761 
   3762    oldrefGen = 0;
   3763    oldrefGenIncAt = 0;
   3764    oldrefTreeN = 0;
   3765 }
   3766 
   3767 static void event_map__check_reference_counts ( Bool before )
   3768 {
   3769    RCEC*   rcec;
   3770    OldRef* oldref;
   3771    Word    i;
   3772    UWord   nEnts = 0;
   3773    UWord   keyW, valW;
   3774 
   3775    /* Set the 'check' reference counts to zero.  Also, optionally
   3776       check that the real reference counts are non-zero.  We allow
   3777       these to fall to zero before a GC, but the GC must get rid of
   3778       all those that are zero, hence none should be zero after a
   3779       GC. */
   3780    for (i = 0; i < N_RCEC_TAB; i++) {
   3781       for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
   3782          nEnts++;
   3783          tl_assert(rcec);
   3784          tl_assert(rcec->magic == RCEC_MAGIC);
   3785          if (!before)
   3786             tl_assert(rcec->rc > 0);
   3787          rcec->rcX = 0;
   3788       }
   3789    }
   3790 
   3791    /* check that the stats are sane */
   3792    tl_assert(nEnts == stats__ctxt_tab_curr);
   3793    tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max);
   3794 
   3795    /* visit all the referencing points, inc check ref counts */
   3796    VG_(initIterSWA)( oldrefTree );
   3797    while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
   3798       oldref = (OldRef*)valW;
   3799       tl_assert(oldref->magic == OldRef_MAGIC);
   3800       for (i = 0; i < N_OLDREF_ACCS; i++) {
   3801          Thr*  aThr = ptr_and_UWord(oldref->accs[i].thr, ~3);
   3802          RCEC* aRef = ptr_and_UWord(oldref->accs[i].rcec, ~3);
   3803          if (aThr) {
   3804             tl_assert(aRef);
   3805             tl_assert(aRef->magic == RCEC_MAGIC);
   3806             aRef->rcX++;
   3807          } else {
   3808             tl_assert(!aRef);
   3809          }
   3810       }
   3811    }
   3812 
   3813    /* compare check ref counts with actual */
   3814    for (i = 0; i < N_RCEC_TAB; i++) {
   3815       for (rcec = contextTab[i]; rcec; rcec = rcec->next) {
   3816          tl_assert(rcec->rc == rcec->rcX);
   3817       }
   3818    }
   3819 }
   3820 
   3821 __attribute__((noinline))
   3822 static void event_map_maybe_GC ( void )
   3823 {
   3824    OldRef* oldref;
   3825    UWord   keyW, valW, retained, maxGen;
   3826    XArray* refs2del;
   3827    Word    i, j, n2del;
   3828 
   3829    UWord* genMap      = NULL;
   3830    UWord  genMap_min  = 0;
   3831    UWord  genMap_size = 0;
   3832 
   3833    if (LIKELY(oldrefTreeN < HG_(clo_conflict_cache_size)))
   3834       return;
   3835 
   3836    if (0)
   3837       VG_(printf)("libhb: event_map GC at size %lu\n", oldrefTreeN);
   3838 
   3839    /* Check for sane command line params.  Limit values must match
   3840       those in hg_process_cmd_line_option. */
   3841    tl_assert( HG_(clo_conflict_cache_size) >= 10*1000 );
   3842    tl_assert( HG_(clo_conflict_cache_size) <= 30*1000*1000 );
   3843 
   3844    /* Check our counting is sane (expensive) */
   3845    if (CHECK_CEM)
   3846       tl_assert(oldrefTreeN == VG_(sizeSWA)( oldrefTree ));
   3847 
   3848    /* Check the reference counts (expensive) */
   3849    if (CHECK_CEM)
   3850       event_map__check_reference_counts( True/*before*/ );
   3851 
   3852    /* Compute the distribution of generation values in the ref tree.
   3853       There are likely only to be a few different generation numbers
   3854       in the whole tree, but we don't know what they are.  Hence use a
   3855       dynamically resized array of counters.  The array is genMap[0
   3856       .. genMap_size-1], where genMap[0] is the count for the
   3857       generation number genMap_min, genMap[1] is the count for
   3858       genMap_min+1, etc.  If a new number is seen outside the range
   3859       [genMap_min .. genMap_min + genMap_size - 1] then the array is
   3860       copied into a larger array, and genMap_min and genMap_size are
   3861       adjusted accordingly. */
   3862 
   3863    /* genMap :: generation-number -> count-of-nodes-with-that-number */
   3864 
   3865    VG_(initIterSWA)( oldrefTree );
   3866    while ( VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
   3867 
   3868        UWord ea, key;
   3869        oldref = (OldRef*)valW;
   3870        key = oldref->gen;
   3871 
   3872       /* BEGIN find 'ea', which is the index in genMap holding the
   3873          count for generation number 'key'. */
   3874       if (UNLIKELY(genMap == NULL)) {
   3875          /* deal with the first key to be seen, so that the following
   3876             cases don't need to handle the complexity of a NULL count
   3877             array. */
   3878          genMap_min  = key;
   3879          genMap_size = 1;
   3880          genMap = HG_(zalloc)( "libhb.emmG.1a",
   3881                                 genMap_size * sizeof(UWord) );
   3882          ea = 0;
   3883          if (0) VG_(printf)("(%lu) case 1 [%lu .. %lu]\n",
   3884                             key, genMap_min, genMap_min+genMap_size- 1 );
   3885       }
   3886       else
   3887       if (LIKELY(key >= genMap_min && key < genMap_min + genMap_size)) {
   3888          /* this is the expected (almost-always-happens) case: 'key'
   3889             is already mapped in the array. */
   3890          ea = key - genMap_min;
   3891       }
   3892       else
   3893       if (key < genMap_min) {
   3894          /* 'key' appears before the start of the current array.
   3895             Extend the current array by allocating a larger one and
   3896             copying the current one to the upper end of it. */
   3897          Word   more;
   3898          UWord* map2;
   3899          more = genMap_min - key;
   3900          tl_assert(more > 0);
   3901          map2 = HG_(zalloc)( "libhb.emmG.1b",
   3902                              (genMap_size + more) * sizeof(UWord) );
   3903          VG_(memcpy)( &map2[more], genMap, genMap_size * sizeof(UWord) );
   3904          HG_(free)( genMap );
   3905          genMap = map2;
   3906          genMap_size += more;
   3907          genMap_min -= more;
   3908          ea = 0;
   3909          tl_assert(genMap_min == key);
   3910          if (0) VG_(printf)("(%lu) case 2 [%lu .. %lu]\n",
   3911                             key, genMap_min,  genMap_min+genMap_size- 1 );
   3912       }
   3913       else {
   3914          /* 'key' appears after the end of the current array.  Extend
   3915             the current array by allocating a larger one and copying
   3916             the current one to the lower end of it. */
   3917          Word   more;
   3918          UWord* map2;
   3919          tl_assert(key >= genMap_min + genMap_size);
   3920          more = key - (genMap_min + genMap_size) + 1;
   3921          tl_assert(more > 0);
   3922          map2 = HG_(zalloc)( "libhb.emmG.1c",
   3923                              (genMap_size + more) * sizeof(UWord) );
   3924          VG_(memcpy)( &map2[0], genMap, genMap_size * sizeof(UWord) );
   3925          HG_(free)( genMap );
   3926          genMap = map2;
   3927          genMap_size += more;
   3928          ea = genMap_size - 1;
   3929          tl_assert(genMap_min + genMap_size - 1 == key);
   3930          if (0) VG_(printf)("(%lu) case 3 [%lu .. %lu]\n",
   3931                             key, genMap_min, genMap_min+genMap_size- 1 );
   3932       }
   3933       /* END find 'ea' from 'key' */
   3934 
   3935       tl_assert(ea >= 0 && ea < genMap_size);
   3936       /* and the whole point of this elaborate computation of 'ea' is .. */
   3937       genMap[ea]++;
   3938    }
   3939 
   3940    tl_assert(genMap);
   3941    tl_assert(genMap_size > 0);
   3942 
   3943    /* Sanity check what we just computed */
   3944    { UWord sum = 0;
   3945      for (i = 0; i < genMap_size; i++) {
   3946         if (0) VG_(printf)("  xxx: gen %ld has %lu\n",
   3947                            i + genMap_min, genMap[i] );
   3948         sum += genMap[i];
   3949      }
   3950      tl_assert(sum == oldrefTreeN);
   3951    }
   3952 
   3953    /* Figure out how many generations to throw away */
   3954    retained = oldrefTreeN;
   3955    maxGen = 0;
   3956 
   3957    for (i = 0; i < genMap_size; i++) {
   3958       keyW = i + genMap_min;
   3959       valW = genMap[i];
   3960       tl_assert(keyW > 0); /* can't allow a generation # 0 */
   3961       if (0) VG_(printf)("  XXX: gen %lu has %lu\n", keyW, valW );
   3962       tl_assert(keyW >= maxGen);
   3963       tl_assert(retained >= valW);
   3964       if (retained - valW
   3965           > (UWord)(HG_(clo_conflict_cache_size)
   3966                     * EVENT_MAP_GC_DISCARD_FRACTION)) {
   3967          retained -= valW;
   3968          maxGen = keyW;
   3969       } else {
   3970          break;
   3971       }
   3972    }
   3973 
   3974    HG_(free)(genMap);
   3975 
   3976    tl_assert(retained >= 0 && retained <= oldrefTreeN);
   3977 
   3978    /* Now make up a big list of the oldrefTree entries we want to
   3979       delete.  We can't simultaneously traverse the tree and delete
   3980       stuff from it, so first we need to copy them off somewhere
   3981       else. (sigh) */
   3982    refs2del = VG_(newXA)( HG_(zalloc), "libhb.emmG.2",
   3983                           HG_(free), sizeof(Addr) );
   3984 
   3985    if (retained < oldrefTreeN) {
   3986 
   3987       /* This is the normal (expected) case.  We discard any ref whose
   3988          generation number <= maxGen. */
   3989       VG_(initIterSWA)( oldrefTree );
   3990       while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
   3991          oldref = (OldRef*)valW;
   3992          tl_assert(oldref->magic == OldRef_MAGIC);
   3993          if (oldref->gen <= maxGen) {
   3994             VG_(addToXA)( refs2del, &keyW );
   3995          }
   3996       }
   3997       if (VG_(clo_stats)) {
   3998          VG_(message)(Vg_DebugMsg,
   3999             "libhb: EvM GC: delete generations %lu and below, "
   4000             "retaining %lu entries\n",
   4001             maxGen, retained );
   4002       }
   4003 
   4004    } else {
   4005 
   4006       static UInt rand_seed = 0; /* leave as static */
   4007 
   4008       /* Degenerate case: there's only one generation in the entire
   4009          tree, so we need to have some other way of deciding which
   4010          refs to throw away.  Just throw out half of them randomly. */
   4011       tl_assert(retained == oldrefTreeN);
   4012       VG_(initIterSWA)( oldrefTree );
   4013       while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) {
   4014          UInt n;
   4015          oldref = (OldRef*)valW;
   4016          tl_assert(oldref->magic == OldRef_MAGIC);
   4017          n = VG_(random)( &rand_seed );
   4018          if ((n & 0xFFF) < 0x800) {
   4019             VG_(addToXA)( refs2del, &keyW );
   4020             retained--;
   4021          }
   4022       }
   4023       if (VG_(clo_stats)) {
   4024          VG_(message)(Vg_DebugMsg,
   4025             "libhb: EvM GC: randomly delete half the entries, "
   4026             "retaining %lu entries\n",
   4027             retained );
   4028       }
   4029 
   4030    }
   4031 
   4032    n2del = VG_(sizeXA)( refs2del );
   4033    tl_assert(n2del == (Word)(oldrefTreeN - retained));
   4034 
   4035    if (0) VG_(printf)("%s","deleting entries\n");
   4036    for (i = 0; i < n2del; i++) {
   4037       Bool  b;
   4038       Addr  ga2del = *(Addr*)VG_(indexXA)( refs2del, i );
   4039       b = VG_(delFromSWA)( oldrefTree, &keyW, &valW, ga2del );
   4040       tl_assert(b);
   4041       tl_assert(keyW == ga2del);
   4042       oldref = (OldRef*)valW;
   4043       for (j = 0; j < N_OLDREF_ACCS; j++) {
   4044          Thr*  aThr = ptr_and_UWord(oldref->accs[j].thr, ~3);
   4045          RCEC* aRef = ptr_and_UWord(oldref->accs[j].rcec, ~3);
   4046          if (aRef) {
   4047             tl_assert(aThr);
   4048             stats__ctxt_rcdec3++;
   4049             ctxt__rcdec( aRef );
   4050          } else {
   4051             tl_assert(!aThr);
   4052          }
   4053       }
   4054 
   4055       free_OldRef( oldref );
   4056    }
   4057 
   4058    VG_(deleteXA)( refs2del );
   4059 
   4060    tl_assert( VG_(sizeSWA)( oldrefTree ) == retained );
   4061 
   4062    oldrefTreeN = retained;
   4063    oldrefGenIncAt = oldrefTreeN; /* start new gen right away */
   4064 
   4065    /* Throw away all RCECs with zero reference counts */
   4066    for (i = 0; i < N_RCEC_TAB; i++) {
   4067       RCEC** pp = &contextTab[i];
   4068       RCEC*  p  = *pp;
   4069       while (p) {
   4070          if (p->rc == 0) {
   4071             *pp = p->next;
   4072             free_RCEC(p);
   4073             p = *pp;
   4074             tl_assert(stats__ctxt_tab_curr > 0);
   4075             stats__ctxt_tab_curr--;
   4076          } else {
   4077             pp = &p->next;
   4078             p = p->next;
   4079          }
   4080       }
   4081    }
   4082 
   4083    /* Check the reference counts (expensive) */
   4084    if (CHECK_CEM)
   4085       event_map__check_reference_counts( False/*after*/ );
   4086 
   4087    //if (0)
   4088    //VG_(printf)("XXXX final sizes: oldrefTree %ld, contextTree %ld\n\n",
   4089    //            VG_(OSetGen_Size)(oldrefTree), VG_(OSetGen_Size)(contextTree));
   4090 
   4091 }
   4092 
   4093 
   4094 /////////////////////////////////////////////////////////
   4095 //                                                     //
   4096 // Core MSM                                            //
   4097 //                                                     //
   4098 /////////////////////////////////////////////////////////
   4099 
   4100 /* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19
   4101    Nov 08, and again after [...],
   4102    June 09. */
   4103 
   4104 static ULong stats__msmcread         = 0;
   4105 static ULong stats__msmcread_change  = 0;
   4106 static ULong stats__msmcwrite        = 0;
   4107 static ULong stats__msmcwrite_change = 0;
   4108 
   4109 /* Some notes on the H1 history mechanism:
   4110 
   4111    Transition rules are:
   4112 
   4113    read_{Kr,Kw}(Cr,Cw)  = (Cr,           Cr `join` Kw)
   4114    write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw)
   4115 
   4116    After any access by a thread T to a location L, L's constraint pair
   4117    (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock.
   4118 
   4119    After a race by thread T conflicting with some previous access by
   4120    some other thread U, for a location with constraint (before
   4121    processing the later access) (Cr,Cw), then Cw[U] is the segment in
   4122    which that previous access lies.
   4123 
   4124    Hence in record_race_info, we pass in Cfailed and Kfailed, which
   4125    are compared so as to find out which thread(s) this access
   4126    conflicts with.  Once that is established, we also require the
   4127    pre-update Cw for the location, so we can index into it for those
   4128    threads, to get the scalar clock values for the point at which the
   4129    former accesses were made.  (In fact we only bother to do any of
   4130    this for an arbitrarily chosen one of the conflicting threads, as
   4131    that's simpler, it avoids flooding the user with vast amounts of
   4132    mostly useless information, and because the program is wrong if it
   4133    contains any races at all -- so we don't really need to show all
   4134    conflicting access pairs initially, so long as we show at least one
   4135    whenever any exist).
   4136 
   4137    ---
   4138 
   4139    That requires the auxiliary proof that
   4140 
   4141       (Cr `join` Kw)[T] == Kw[T]
   4142 
   4143    Why should that be true?  Because for any thread T, Kw[T] >= the
   4144    scalar clock value for T known by any other thread.  In other
   4145    words, because T's value for its own scalar clock is at least as up
   4146    to date as the value for it known by any other thread (that is true
   4147    for both the R- and W- scalar clocks).  Hence no other thread will
   4148    be able to feed in a value for that element (indirectly via a
   4149    constraint) which will exceed Kw[T], and hence the join cannot
   4150    cause that particular element to advance.
   4151 */
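
        /* A small worked example of the rules above (a sketch): suppose a
           location's constraint pair has Cr == [T1:3, T2:4], and thread T1,
           whose Kw == [T1:7, T2:2], writes the location.  Then the new pair
           is (Cr `join` Kw, Cr `join` Kw) == ([T1:7, T2:4], [T1:7, T2:4]).
           Note that the T1 component of the result equals Kw[T1] == 7,
           exactly as the auxiliary claim above requires. */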
   4152 
   4153 __attribute__((noinline))
   4154 static void record_race_info ( Thr* acc_thr,
   4155                                Addr acc_addr, SizeT szB, Bool isWrite,
   4156                                VtsID Cfailed,
   4157                                VtsID Kfailed,
   4158                                VtsID Cw )
   4159 {
   4160    /* Call here to report a race.  We just hand it onwards to
   4161       HG_(record_error_Race).  If that in turn discovers that the
   4162       error is going to be collected, then, at history_level 2, that
   4163       queries the conflicting-event map.  The alternative would be to
   4164       query it right here.  But that causes a lot of pointless queries
   4165       for errors which will shortly be discarded as duplicates, and
   4166       can become a performance overhead; so we defer the query until
   4167       we know the error is not a duplicate. */
   4168 
   4169    /* Stacks for the bounds of the (or one of the) conflicting
   4170       segment(s).  These are only set at history_level 1. */
   4171    ExeContext* hist1_seg_start = NULL;
   4172    ExeContext* hist1_seg_end   = NULL;
   4173    Thread*     hist1_conf_thr  = NULL;
   4174 
   4175    tl_assert(acc_thr);
   4176    tl_assert(acc_thr->opaque);
   4177    tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2);
   4178 
   4179    if (HG_(clo_history_level) == 1) {
   4180       Bool found;
   4181       Word firstIx, lastIx;
   4182       ULong_n_EC key;
   4183 
   4184       /* At history_level 1, we must round up the relevant stack-pair
   4185          for the conflicting segment right now.  This is because
   4186          deferring it is complex; we can't (easily) put Kfailed and
   4187          Cfailed into the XError and wait for later without
   4188          getting tied up in difficulties with VtsID reference
   4189          counting.  So just do it now. */
   4190       Thr*  confThr;
   4191       ULong confTym = 0;
   4192       /* Which thread are we in conflict with?  There may be more than
   4193          one, in which case VtsID__findFirst_notLEQ selects one arbitrarily
   4194          (in fact it's the one with the lowest Thr* value). */
   4195       confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed );
   4196       /* This must exist, since if it were NULL there would be no
   4197          conflict (by the semantics of the return value of
   4198          VtsID__findFirst_notLEQ), and msmc{read,write}, which called
   4199          us, just checked exactly this -- that there was in
   4200          fact a race. */
   4201       tl_assert(confThr);
   4202 
   4203       /* Get the scalar clock value that the conflicting thread
   4204          introduced into the constraint.  A careful examination of the
   4205          base machine rules shows that this must be the same as the
   4206          conflicting thread's scalar clock when it created this
   4207          constraint.  Hence we know the scalar clock of the
   4208          conflicting thread when the conflicting access was made. */
   4209       confTym = VtsID__indexAt( Cfailed, confThr );
   4210 
   4211       /* Using this scalar clock, index into the conflicting thread's
   4212          collection of stack traces made each time its vector clock
   4213          (hence its scalar clock) changed.  This gives the stack
   4214          traces at the start and end of the conflicting segment (well,
   4215          as per comment just above, of one of the conflicting
   4216          segments, if there is more than one). */
   4217       key.ull = confTym;
   4218       key.ec  = NULL;
   4219       /* tl_assert(confThr); -- asserted just above */
   4220       tl_assert(confThr->local_Kws_n_stacks);
   4221       firstIx = lastIx = 0;
   4222       found = VG_(lookupXA_UNSAFE)(
   4223                  confThr->local_Kws_n_stacks,
   4224                  &key, &firstIx, &lastIx,
   4225                  (Int(*)(void*,void*))cmp__ULong_n_EC__by_ULong
   4226               );
   4227       if (0) VG_(printf)("record_race_info %u %u %u  confThr %p "
   4228                          "confTym %llu found %d (%lu,%lu)\n",
   4229                          Cfailed, Kfailed, Cw,
   4230                          confThr, confTym, found, firstIx, lastIx);
   4231       /* We can't indefinitely collect stack traces at VTS
   4232          transitions, since we'd eventually run out of memory.  Hence
   4233          note_local_Kw_n_stack_for will eventually throw away old
   4234          ones, which in turn means we might fail to find index value
   4235          confTym in the array. */
   4236       if (found) {
   4237          ULong_n_EC *pair_start, *pair_end;
   4238          pair_start
   4239             = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx );
   4240          hist1_seg_start = pair_start->ec;
   4241          if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) {
   4242             pair_end
   4243                = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks,
   4244                                             lastIx+1 );
   4245             /* from properties of VG_(lookupXA) and the comparison fn used: */
   4246             tl_assert(pair_start->ull < pair_end->ull);
   4247             hist1_seg_end = pair_end->ec;
   4248             /* Could do a bit better here.  It may be that pair_end
   4249                doesn't have a stack, but the following entries in the
   4250                array have the same scalar Kw and do have a stack.  So
   4251                we should search a bit further along the array than
   4252                lastIx+1 if hist1_seg_end is NULL. */
   4253          } else {
   4254             if (confThr->still_alive)
   4255                hist1_seg_end = main_get_EC( confThr );
   4256          }
   4257          // seg_start could be NULL iff this is the first stack in the thread
   4258          //if (seg_start) VG_(pp_ExeContext)(seg_start);
   4259          //if (seg_end)   VG_(pp_ExeContext)(seg_end);
   4260          hist1_conf_thr = confThr->opaque;
   4261       }
   4262    }
   4263 
   4264    HG_(record_error_Race)( acc_thr->opaque, acc_addr,
   4265                            szB, isWrite,
   4266                            hist1_conf_thr, hist1_seg_start, hist1_seg_end );
   4267 }
   4268 
   4269 static Bool is_sane_SVal_C ( SVal sv ) {
   4270    Bool leq;
   4271    if (!SVal__isC(sv)) return True;
   4272    leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) );
   4273    return leq;
   4274 }
   4275 
   4276 
   4277 /* Compute new state following a read */
   4278 static inline SVal msmcread ( SVal svOld,
   4279                               /* The following are only needed for
   4280                                  creating error reports. */
   4281                               Thr* acc_thr,
   4282                               Addr acc_addr, SizeT szB )
   4283 {
   4284    SVal svNew = SVal_INVALID;
   4285    stats__msmcread++;
   4286 
   4287    /* Redundant sanity check on the constraints */
   4288    if (CHECK_MSM) {
   4289       tl_assert(is_sane_SVal_C(svOld));
   4290    }
   4291 
   4292    if (LIKELY(SVal__isC(svOld))) {
   4293       VtsID tviR  = acc_thr->viR;
   4294       VtsID tviW  = acc_thr->viW;
   4295       VtsID rmini = SVal__unC_Rmin(svOld);
   4296       VtsID wmini = SVal__unC_Wmin(svOld);
   4297       Bool  leq   = VtsID__cmpLEQ(rmini,tviR);
   4298       if (LIKELY(leq)) {
   4299          /* no race */
   4300          /* Note: RWLOCK subtlety: use tviW, not tviR */
   4301          svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
   4302          goto out;
   4303       } else {
   4304          /* assert on sanity of constraints. */
   4305          Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
   4306          tl_assert(leqxx);
   4307          // same as in non-race case
   4308          svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) );
   4309          record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/,
   4310                            rmini, /* Cfailed */
   4311                            tviR,  /* Kfailed */
   4312                            wmini  /* Cw */ );
   4313          goto out;
   4314       }
   4315    }
   4316    if (SVal__isA(svOld)) {
   4317       /* reading no-access memory (sigh); leave unchanged */
   4318       /* check for no pollution */
   4319       tl_assert(svOld == SVal_NOACCESS);
   4320       svNew = SVal_NOACCESS;
   4321       goto out;
   4322    }
   4323    if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld);
   4324    tl_assert(0);
   4325 
   4326   out:
   4327    if (CHECK_MSM) {
   4328       tl_assert(is_sane_SVal_C(svNew));
   4329    }
   4330    if (UNLIKELY(svNew != svOld)) {
   4331       tl_assert(svNew != SVal_INVALID);
   4332       if (HG_(clo_history_level) >= 2
   4333           && SVal__isC(svOld) && SVal__isC(svNew)) {
   4334          event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr );
   4335          stats__msmcread_change++;
   4336       }
   4337    }
   4338    return svNew;
   4339 }
   4340 
   4341 
   4342 /* Compute new state following a write */
   4343 static inline SVal msmcwrite ( SVal svOld,
   4344                               /* The following are only needed for
   4345                                  creating error reports. */
   4346                               Thr* acc_thr,
   4347                               Addr acc_addr, SizeT szB )
   4348 {
   4349    SVal svNew = SVal_INVALID;
   4350    stats__msmcwrite++;
   4351 
   4352    /* Redundant sanity check on the constraints */
   4353    if (CHECK_MSM) {
   4354       tl_assert(is_sane_SVal_C(svOld));
   4355    }
   4356 
   4357    if (LIKELY(SVal__isC(svOld))) {
   4358       VtsID tviW  = acc_thr->viW;
   4359       VtsID wmini = SVal__unC_Wmin(svOld);
   4360       Bool  leq   = VtsID__cmpLEQ(wmini,tviW);
   4361       if (LIKELY(leq)) {
   4362          /* no race */
   4363          svNew = SVal__mkC( tviW, tviW );
   4364          goto out;
   4365       } else {
   4366          VtsID rmini = SVal__unC_Rmin(svOld);
   4367          /* assert on sanity of constraints. */
   4368          Bool leqxx = VtsID__cmpLEQ(rmini,wmini);
   4369          tl_assert(leqxx);
   4370          // same as in non-race case
   4371          // proof: in the non-race case, we have
   4372          //    rmini <= wmini (invar on constraints)
   4373          //    tviW <= tviR (invar on thread clocks)
   4374          //    wmini <= tviW (from run-time check)
   4375          // hence from transitivity of <= we have
   4376          //    rmini <= wmini <= tviW
   4377          // and so join(rmini,tviW) == tviW
   4378          // and    join(wmini,tviW) == tviW
   4379          // qed.
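         // Hypothetical numeric illustration (values invented purely
         // for exposition), with two threads: if rmini = [1,2],
         // wmini = [1,3] and tviW = [2,5], then rmini <= wmini <= tviW
         // element-wise, so join(rmini,tviW) = join(wmini,tviW) = [2,5]
         // = tviW, i.e. the same as the non-race result (tviW, tviW).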
   4380          svNew = SVal__mkC( VtsID__join2(rmini, tviW),
   4381                             VtsID__join2(wmini, tviW) );
   4382          record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/,
   4383                            wmini, /* Cfailed */
   4384                            tviW,  /* Kfailed */
   4385                            wmini  /* Cw */ );
   4386          goto out;
   4387       }
   4388    }
   4389    if (SVal__isA(svOld)) {
   4390       /* writing no-access memory (sigh); leave unchanged */
   4391       /* check for no pollution */
   4392       tl_assert(svOld == SVal_NOACCESS);
   4393       svNew = SVal_NOACCESS;
   4394       goto out;
   4395    }
   4396    if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld);
   4397    tl_assert(0);
   4398 
   4399   out:
   4400    if (CHECK_MSM) {
   4401       tl_assert(is_sane_SVal_C(svNew));
   4402    }
   4403    if (UNLIKELY(svNew != svOld)) {
   4404       tl_assert(svNew != SVal_INVALID);
   4405       if (HG_(clo_history_level) >= 2
   4406           && SVal__isC(svOld) && SVal__isC(svNew)) {
   4407          event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr );
   4408          stats__msmcwrite_change++;
   4409       }
   4410    }
   4411    return svNew;
   4412 }
   4413 
   4414 
   4415 /////////////////////////////////////////////////////////
   4416 //                                                     //
   4417 // Apply core MSM to specific memory locations         //
   4418 //                                                     //
   4419 /////////////////////////////////////////////////////////
   4420 
   4421 /*------------- ZSM accesses: 8 bit sapply ------------- */
   4422 
   4423 static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) {
   4424    CacheLine* cl;
   4425    UWord      cloff, tno, toff;
   4426    SVal       svOld, svNew;
   4427    UShort     descr;
   4428    stats__cline_cread08s++;
   4429    cl    = get_cacheline(a);
   4430    cloff = get_cacheline_offset(a);
   4431    tno   = get_treeno(a);
   4432    toff  = get_tree_offset(a); /* == 0 .. 7 */
   4433    descr = cl->descrs[tno];
   4434    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
   4435       SVal* tree = &cl->svals[tno << 3];
   4436       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
   4437       if (CHECK_ZSM)
   4438          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   4439    }
   4440    svOld = cl->svals[cloff];
   4441    svNew = msmcread( svOld, thr,a,1 );
   4442    if (CHECK_ZSM)
   4443       tl_assert(svNew != SVal_INVALID);
   4444    cl->svals[cloff] = svNew;
   4445 }
   4446 
   4447 static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) {
   4448    CacheLine* cl;
   4449    UWord      cloff, tno, toff;
   4450    SVal       svOld, svNew;
   4451    UShort     descr;
   4452    stats__cline_cwrite08s++;
   4453    cl    = get_cacheline(a);
   4454    cloff = get_cacheline_offset(a);
   4455    tno   = get_treeno(a);
   4456    toff  = get_tree_offset(a); /* == 0 .. 7 */
   4457    descr = cl->descrs[tno];
   4458    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
   4459       SVal* tree = &cl->svals[tno << 3];
   4460       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
   4461       if (CHECK_ZSM)
   4462          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   4463    }
   4464    svOld = cl->svals[cloff];
   4465    svNew = msmcwrite( svOld, thr,a,1 );
   4466    if (CHECK_ZSM)
   4467       tl_assert(svNew != SVal_INVALID);
   4468    cl->svals[cloff] = svNew;
   4469 }
   4470 
   4471 /*------------- ZSM accesses: 16 bit sapply ------------- */
   4472 
   4473 static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) {
   4474    CacheLine* cl;
   4475    UWord      cloff, tno, toff;
   4476    SVal       svOld, svNew;
   4477    UShort     descr;
   4478    stats__cline_cread16s++;
   4479    if (UNLIKELY(!aligned16(a))) goto slowcase;
   4480    cl    = get_cacheline(a);
   4481    cloff = get_cacheline_offset(a);
   4482    tno   = get_treeno(a);
   4483    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   4484    descr = cl->descrs[tno];
   4485    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
   4486       if (valid_value_is_below_me_16(descr, toff)) {
   4487          goto slowcase;
   4488       } else {
   4489          SVal* tree = &cl->svals[tno << 3];
   4490          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
   4491       }
   4492       if (CHECK_ZSM)
   4493          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   4494    }
   4495    svOld = cl->svals[cloff];
   4496    svNew = msmcread( svOld, thr,a,2 );
   4497    if (CHECK_ZSM)
   4498       tl_assert(svNew != SVal_INVALID);
   4499    cl->svals[cloff] = svNew;
   4500    return;
   4501   slowcase: /* misaligned, or must go further down the tree */
   4502    stats__cline_16to8splits++;
   4503    zsm_sapply08__msmcread( thr, a + 0 );
   4504    zsm_sapply08__msmcread( thr, a + 1 );
   4505 }
   4506 
   4507 static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) {
   4508    CacheLine* cl;
   4509    UWord      cloff, tno, toff;
   4510    SVal       svOld, svNew;
   4511    UShort     descr;
   4512    stats__cline_cwrite16s++;
   4513    if (UNLIKELY(!aligned16(a))) goto slowcase;
   4514    cl    = get_cacheline(a);
   4515    cloff = get_cacheline_offset(a);
   4516    tno   = get_treeno(a);
   4517    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   4518    descr = cl->descrs[tno];
   4519    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
   4520       if (valid_value_is_below_me_16(descr, toff)) {
   4521          goto slowcase;
   4522       } else {
   4523          SVal* tree = &cl->svals[tno << 3];
   4524          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
   4525       }
   4526       if (CHECK_ZSM)
   4527          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   4528    }
   4529    svOld = cl->svals[cloff];
   4530    svNew = msmcwrite( svOld, thr,a,2 );
   4531    if (CHECK_ZSM)
   4532       tl_assert(svNew != SVal_INVALID);
   4533    cl->svals[cloff] = svNew;
   4534    return;
   4535   slowcase: /* misaligned, or must go further down the tree */
   4536    stats__cline_16to8splits++;
   4537    zsm_sapply08__msmcwrite( thr, a + 0 );
   4538    zsm_sapply08__msmcwrite( thr, a + 1 );
   4539 }
   4540 
   4541 /*------------- ZSM accesses: 32 bit sapply ------------- */
   4542 
   4543 static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) {
   4544    CacheLine* cl;
   4545    UWord      cloff, tno, toff;
   4546    SVal       svOld, svNew;
   4547    UShort     descr;
   4548    stats__cline_cread32s++;
   4549    if (UNLIKELY(!aligned32(a))) goto slowcase;
   4550    cl    = get_cacheline(a);
   4551    cloff = get_cacheline_offset(a);
   4552    tno   = get_treeno(a);
   4553    toff  = get_tree_offset(a); /* == 0 or 4 */
   4554    descr = cl->descrs[tno];
   4555    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
   4556       if (valid_value_is_above_me_32(descr, toff)) {
   4557          SVal* tree = &cl->svals[tno << 3];
   4558          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
   4559       } else {
   4560          goto slowcase;
   4561       }
   4562       if (CHECK_ZSM)
   4563          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   4564    }
   4565    svOld = cl->svals[cloff];
   4566    svNew = msmcread( svOld, thr,a,4 );
   4567    if (CHECK_ZSM)
   4568       tl_assert(svNew != SVal_INVALID);
   4569    cl->svals[cloff] = svNew;
   4570    return;
   4571   slowcase: /* misaligned, or must go further down the tree */
   4572    stats__cline_32to16splits++;
   4573    zsm_sapply16__msmcread( thr, a + 0 );
   4574    zsm_sapply16__msmcread( thr, a + 2 );
   4575 }
   4576 
   4577 static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) {
   4578    CacheLine* cl;
   4579    UWord      cloff, tno, toff;
   4580    SVal       svOld, svNew;
   4581    UShort     descr;
   4582    stats__cline_cwrite32s++;
   4583    if (UNLIKELY(!aligned32(a))) goto slowcase;
   4584    cl    = get_cacheline(a);
   4585    cloff = get_cacheline_offset(a);
   4586    tno   = get_treeno(a);
   4587    toff  = get_tree_offset(a); /* == 0 or 4 */
   4588    descr = cl->descrs[tno];
   4589    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
   4590       if (valid_value_is_above_me_32(descr, toff)) {
   4591          SVal* tree = &cl->svals[tno << 3];
   4592          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
   4593       } else {
   4594          goto slowcase;
   4595       }
   4596       if (CHECK_ZSM)
   4597          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   4598    }
   4599    svOld = cl->svals[cloff];
   4600    svNew = msmcwrite( svOld, thr,a,4 );
   4601    if (CHECK_ZSM)
   4602       tl_assert(svNew != SVal_INVALID);
   4603    cl->svals[cloff] = svNew;
   4604    return;
   4605   slowcase: /* misaligned, or must go further down the tree */
   4606    stats__cline_32to16splits++;
   4607    zsm_sapply16__msmcwrite( thr, a + 0 );
   4608    zsm_sapply16__msmcwrite( thr, a + 2 );
   4609 }
   4610 
   4611 /*------------- ZSM accesses: 64 bit sapply ------------- */
   4612 
   4613 static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) {
   4614    CacheLine* cl;
   4615    UWord      cloff, tno;
   4616    //UWord      toff;
   4617    SVal       svOld, svNew;
   4618    UShort     descr;
   4619    stats__cline_cread64s++;
   4620    if (UNLIKELY(!aligned64(a))) goto slowcase;
   4621    cl    = get_cacheline(a);
   4622    cloff = get_cacheline_offset(a);
   4623    tno   = get_treeno(a);
   4624    //toff  = get_tree_offset(a); /* == 0, unused */
   4625    descr = cl->descrs[tno];
   4626    if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
   4627       goto slowcase;
   4628    }
   4629    svOld = cl->svals[cloff];
   4630    svNew = msmcread( svOld, thr,a,8 );
   4631    if (CHECK_ZSM)
   4632       tl_assert(svNew != SVal_INVALID);
   4633    cl->svals[cloff] = svNew;
   4634    return;
   4635   slowcase: /* misaligned, or must go further down the tree */
   4636    stats__cline_64to32splits++;
   4637    zsm_sapply32__msmcread( thr, a + 0 );
   4638    zsm_sapply32__msmcread( thr, a + 4 );
   4639 }
   4640 
   4641 static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) {
   4642    CacheLine* cl;
   4643    UWord      cloff, tno;
   4644    //UWord      toff;
   4645    SVal       svOld, svNew;
   4646    UShort     descr;
   4647    stats__cline_cwrite64s++;
   4648    if (UNLIKELY(!aligned64(a))) goto slowcase;
   4649    cl    = get_cacheline(a);
   4650    cloff = get_cacheline_offset(a);
   4651    tno   = get_treeno(a);
   4652    //toff  = get_tree_offset(a); /* == 0, unused */
   4653    descr = cl->descrs[tno];
   4654    if (UNLIKELY( !(descr & TREE_DESCR_64) )) {
   4655       goto slowcase;
   4656    }
   4657    svOld = cl->svals[cloff];
   4658    svNew = msmcwrite( svOld, thr,a,8 );
   4659    if (CHECK_ZSM)
   4660       tl_assert(svNew != SVal_INVALID);
   4661    cl->svals[cloff] = svNew;
   4662    return;
   4663   slowcase: /* misaligned, or must go further down the tree */
   4664    stats__cline_64to32splits++;
   4665    zsm_sapply32__msmcwrite( thr, a + 0 );
   4666    zsm_sapply32__msmcwrite( thr, a + 4 );
   4667 }
   4668 
   4669 /*--------------- ZSM accesses: 8 bit swrite --------------- */
   4670 
   4671 static
   4672 void zsm_swrite08 ( Addr a, SVal svNew ) {
   4673    CacheLine* cl;
   4674    UWord      cloff, tno, toff;
   4675    UShort     descr;
   4676    stats__cline_swrite08s++;
   4677    cl    = get_cacheline(a);
   4678    cloff = get_cacheline_offset(a);
   4679    tno   = get_treeno(a);
   4680    toff  = get_tree_offset(a); /* == 0 .. 7 */
   4681    descr = cl->descrs[tno];
   4682    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
   4683       SVal* tree = &cl->svals[tno << 3];
   4684       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
   4685       if (CHECK_ZSM)
   4686          tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   4687    }
   4688    tl_assert(svNew != SVal_INVALID);
   4689    cl->svals[cloff] = svNew;
   4690 }
   4691 
   4692 /*--------------- ZSM accesses: 16 bit swrite --------------- */
   4693 
   4694 static
   4695 void zsm_swrite16 ( Addr a, SVal svNew ) {
   4696    CacheLine* cl;
   4697    UWord      cloff, tno, toff;
   4698    UShort     descr;
   4699    stats__cline_swrite16s++;
   4700    if (UNLIKELY(!aligned16(a))) goto slowcase;
   4701    cl    = get_cacheline(a);
   4702    cloff = get_cacheline_offset(a);
   4703    tno   = get_treeno(a);
   4704    toff  = get_tree_offset(a); /* == 0, 2, 4 or 6 */
   4705    descr = cl->descrs[tno];
   4706    if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) {
   4707       if (valid_value_is_below_me_16(descr, toff)) {
   4708          /* Writing at this level.  Need to fix up 'descr'. */
   4709          cl->descrs[tno] = pullup_descr_to_16(descr, toff);
   4710          /* At this point, the tree does not match cl->descr[tno] any
   4711             more.  The assignments below will fix it up. */
   4712       } else {
   4713          /* We can't indiscriminately write on the w16 node as in the
   4714             w64 case, as that might make the node inconsistent with
   4715             its parent.  So first, pull down to this level. */
   4716          SVal* tree = &cl->svals[tno << 3];
   4717          cl->descrs[tno] = pulldown_to_16(tree, toff, descr);
   4718          if (CHECK_ZSM)
   4719             tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   4720       }
   4721    }
   4722    tl_assert(svNew != SVal_INVALID);
   4723    cl->svals[cloff + 0] = svNew;
   4724    cl->svals[cloff + 1] = SVal_INVALID;
   4725    return;
   4726   slowcase: /* misaligned */
   4727    stats__cline_16to8splits++;
   4728    zsm_swrite08( a + 0, svNew );
   4729    zsm_swrite08( a + 1, svNew );
   4730 }
   4731 
   4732 /*--------------- ZSM accesses: 32 bit swrite --------------- */
   4733 
   4734 static
   4735 void zsm_swrite32 ( Addr a, SVal svNew ) {
   4736    CacheLine* cl;
   4737    UWord      cloff, tno, toff;
   4738    UShort     descr;
   4739    stats__cline_swrite32s++;
   4740    if (UNLIKELY(!aligned32(a))) goto slowcase;
   4741    cl    = get_cacheline(a);
   4742    cloff = get_cacheline_offset(a);
   4743    tno   = get_treeno(a);
   4744    toff  = get_tree_offset(a); /* == 0 or 4 */
   4745    descr = cl->descrs[tno];
   4746    if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) {
   4747       if (valid_value_is_above_me_32(descr, toff)) {
   4748          /* We can't indiscriminately write on the w32 node as in the
   4749             w64 case, as that might make the node inconsistent with
   4750             its parent.  So first, pull down to this level. */
   4751          SVal* tree = &cl->svals[tno << 3];
   4752          cl->descrs[tno] = pulldown_to_32(tree, toff, descr);
   4753          if (CHECK_ZSM)
   4754             tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */
   4755       } else {
   4756          /* Writing at this level.  Need to fix up 'descr'. */
   4757          cl->descrs[tno] = pullup_descr_to_32(descr, toff);
   4758          /* At this point, the tree does not match cl->descr[tno] any
   4759             more.  The assignments below will fix it up. */
   4760       }
   4761    }
   4762    tl_assert(svNew != SVal_INVALID);
   4763    cl->svals[cloff + 0] = svNew;
   4764    cl->svals[cloff + 1] = SVal_INVALID;
   4765    cl->svals[cloff + 2] = SVal_INVALID;
   4766    cl->svals[cloff + 3] = SVal_INVALID;
   4767    return;
   4768   slowcase: /* misaligned */
   4769    stats__cline_32to16splits++;
   4770    zsm_swrite16( a + 0, svNew );
   4771    zsm_swrite16( a + 2, svNew );
   4772 }
   4773 
   4774 /*--------------- ZSM accesses: 64 bit swrite --------------- */
   4775 
   4776 static
   4777 void zsm_swrite64 ( Addr a, SVal svNew ) {
   4778    CacheLine* cl;
   4779    UWord      cloff, tno;
   4780    //UWord    toff;
   4781    stats__cline_swrite64s++;
   4782    if (UNLIKELY(!aligned64(a))) goto slowcase;
   4783    cl    = get_cacheline(a);
   4784    cloff = get_cacheline_offset(a);
   4785    tno   = get_treeno(a);
   4786    //toff  = get_tree_offset(a); /* == 0, unused */
   4787    cl->descrs[tno] = TREE_DESCR_64;
   4788    tl_assert(svNew != SVal_INVALID);
   4789    cl->svals[cloff + 0] = svNew;
   4790    cl->svals[cloff + 1] = SVal_INVALID;
   4791    cl->svals[cloff + 2] = SVal_INVALID;
   4792    cl->svals[cloff + 3] = SVal_INVALID;
   4793    cl->svals[cloff + 4] = SVal_INVALID;
   4794    cl->svals[cloff + 5] = SVal_INVALID;
   4795    cl->svals[cloff + 6] = SVal_INVALID;
   4796    cl->svals[cloff + 7] = SVal_INVALID;
   4797    return;
   4798   slowcase: /* misaligned */
   4799    stats__cline_64to32splits++;
   4800    zsm_swrite32( a + 0, svNew );
   4801    zsm_swrite32( a + 4, svNew );
   4802 }
   4803 
   4804 /*------------- ZSM accesses: 8 bit sread/scopy ------------- */
   4805 
   4806 static
   4807 SVal zsm_sread08 ( Addr a ) {
   4808    CacheLine* cl;
   4809    UWord      cloff, tno, toff;
   4810    UShort     descr;
   4811    stats__cline_sread08s++;
   4812    cl    = get_cacheline(a);
   4813    cloff = get_cacheline_offset(a);
   4814    tno   = get_treeno(a);
   4815    toff  = get_tree_offset(a); /* == 0 .. 7 */
   4816    descr = cl->descrs[tno];
   4817    if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) {
   4818       SVal* tree = &cl->svals[tno << 3];
   4819       cl->descrs[tno] = pulldown_to_8(tree, toff, descr);
   4820    }
   4821    return cl->svals[cloff];
   4822 }
   4823 
   4824 static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) {
   4825    SVal       sv;
   4826    stats__cline_scopy08s++;
   4827    sv = zsm_sread08( src );
   4828    zsm_swrite08( dst, sv );
   4829 }
   4830 
   4831 
   4832 /* Block-copy states (needed for implementing realloc()).  Note this
   4833    doesn't change the filtering arrangements.  The caller of
   4834    zsm_scopy_range needs to attend to that. */
   4835 
   4836 static void zsm_scopy_range ( Addr src, Addr dst, SizeT len )
   4837 {
   4838    SizeT i;
   4839    if (len == 0)
   4840       return;
   4841 
   4842    /* assert for non-overlappingness */
   4843    tl_assert(src+len <= dst || dst+len <= src);
   4844 
   4845    /* To be simple, just copy byte by byte.  But so as not to wreck
   4846       performance for later accesses to dst[0 .. len-1], normalise
   4847       destination lines as we finish with them, and also normalise the
   4848       lines containing the first and last addresses. */
   4849    for (i = 0; i < len; i++) {
   4850       Bool normalise
   4851          = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */
   4852            || i == 0       /* first in range */
   4853            || i == len-1;  /* last in range */
   4854       zsm_scopy08( src+i, dst+i, normalise );
   4855    }
   4856 }
   4857 
   4858 
   4859 /* For setting address ranges to a given value.  Has considerable
   4860    sophistication so as to avoid generating large numbers of pointless
   4861    cache loads/writebacks for large ranges. */
   4862 
   4863 /* Do small ranges in-cache, in the obvious way. */
   4864 static
   4865 void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew )
   4866 {
   4867    /* fast track a couple of common cases */
   4868    if (len == 4 && aligned32(a)) {
   4869       zsm_swrite32( a, svNew );
   4870       return;
   4871    }
   4872    if (len == 8 && aligned64(a)) {
   4873       zsm_swrite64( a, svNew );
   4874       return;
   4875    }
   4876 
   4877    /* be completely general (but as efficient as possible) */
   4878    if (len == 0) return;
   4879 
   4880    if (!aligned16(a) && len >= 1) {
   4881       zsm_swrite08( a, svNew );
   4882       a += 1;
   4883       len -= 1;
   4884       tl_assert(aligned16(a));
   4885    }
   4886    if (len == 0) return;
   4887 
   4888    if (!aligned32(a) && len >= 2) {
   4889       zsm_swrite16( a, svNew );
   4890       a += 2;
   4891       len -= 2;
   4892       tl_assert(aligned32(a));
   4893    }
   4894    if (len == 0) return;
   4895 
   4896    if (!aligned64(a) && len >= 4) {
   4897       zsm_swrite32( a, svNew );
   4898       a += 4;
   4899       len -= 4;
   4900       tl_assert(aligned64(a));
   4901    }
   4902    if (len == 0) return;
   4903 
   4904    if (len >= 8) {
   4905       tl_assert(aligned64(a));
   4906       while (len >= 8) {
   4907          zsm_swrite64( a, svNew );
   4908          a += 8;
   4909          len -= 8;
   4910       }
   4911       tl_assert(aligned64(a));
   4912    }
   4913    if (len == 0) return;
   4914 
   4915    if (len >= 4)
   4916       tl_assert(aligned32(a));
   4917    if (len >= 4) {
   4918       zsm_swrite32( a, svNew );
   4919       a += 4;
   4920       len -= 4;
   4921    }
   4922    if (len == 0) return;
   4923 
   4924    if (len >= 2)
   4925       tl_assert(aligned16(a));
   4926    if (len >= 2) {
   4927       zsm_swrite16( a, svNew );
   4928       a += 2;
   4929       len -= 2;
   4930    }
   4931    if (len == 0) return;
   4932 
   4933    if (len >= 1) {
   4934       zsm_swrite08( a, svNew );
   4935       //a += 1;
   4936       len -= 1;
   4937    }
   4938    tl_assert(len == 0);
   4939 }
   4940 
   4941 
   4942 /* If we're doing a small range, hand off to zsm_sset_range_SMALL.  But
   4943    for larger ranges, try to operate directly on the out-of-cache
   4944    representation, rather than dragging lines into the cache,
   4945    overwriting them, and forcing them out.  This turns out to be an
   4946    important performance optimisation.
   4947 
   4948    Note that this doesn't change the filtering arrangements.  The
   4949    caller of zsm_sset_range needs to attend to that. */
   4950 
   4951 static void zsm_sset_range ( Addr a, SizeT len, SVal svNew )
   4952 {
   4953    tl_assert(svNew != SVal_INVALID);
   4954    stats__cache_make_New_arange += (ULong)len;
   4955 
   4956    if (0 && len > 500)
   4957       VG_(printf)("make New      ( %#lx, %ld )\n", a, len );
   4958 
   4959    if (0) {
   4960       static UWord n_New_in_cache = 0;
   4961       static UWord n_New_not_in_cache = 0;
   4962       /* tag is 'a' with the in-line offset masked out,
   4963          eg a[31]..a[4] 0000 */
   4964       Addr       tag = a & ~(N_LINE_ARANGE - 1);
   4965       UWord      wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1);
   4966       if (LIKELY(tag == cache_shmem.tags0[wix])) {
   4967          n_New_in_cache++;
   4968       } else {
   4969          n_New_not_in_cache++;
   4970       }
   4971       if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000))
   4972          VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n",
   4973                      n_New_in_cache, n_New_not_in_cache );
   4974    }
   4975 
   4976    if (LIKELY(len < 2 * N_LINE_ARANGE)) {
   4977       zsm_sset_range_SMALL( a, len, svNew );
   4978    } else {
   4979       Addr  before_start  = a;
   4980       Addr  aligned_start = cacheline_ROUNDUP(a);
   4981       Addr  after_start   = cacheline_ROUNDDN(a + len);
   4982       UWord before_len    = aligned_start - before_start;
   4983       UWord aligned_len   = after_start - aligned_start;
   4984       UWord after_len     = a + len - after_start;
   4985       tl_assert(before_start <= aligned_start);
   4986       tl_assert(aligned_start <= after_start);
   4987       tl_assert(before_len < N_LINE_ARANGE);
   4988       tl_assert(after_len < N_LINE_ARANGE);
   4989       tl_assert(get_cacheline_offset(aligned_start) == 0);
   4990       if (get_cacheline_offset(a) == 0) {
   4991          tl_assert(before_len == 0);
   4992          tl_assert(a == aligned_start);
   4993       }
   4994       if (get_cacheline_offset(a+len) == 0) {
   4995          tl_assert(after_len == 0);
   4996          tl_assert(after_start == a+len);
   4997       }
   4998       if (before_len > 0) {
   4999          zsm_sset_range_SMALL( before_start, before_len, svNew );
   5000       }
   5001       if (after_len > 0) {
   5002          zsm_sset_range_SMALL( after_start, after_len, svNew );
   5003       }
   5004       stats__cache_make_New_inZrep += (ULong)aligned_len;
   5005 
   5006       while (1) {
   5007          Addr tag;
   5008          UWord wix;
   5009          if (aligned_start >= after_start)
   5010             break;
   5011          tl_assert(get_cacheline_offset(aligned_start) == 0);
   5012          tag = aligned_start & ~(N_LINE_ARANGE - 1);
   5013          wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1);
   5014          if (tag == cache_shmem.tags0[wix]) {
   5015             UWord i;
   5016             for (i = 0; i < N_LINE_ARANGE / 8; i++)
   5017                zsm_swrite64( aligned_start + i * 8, svNew );
   5018          } else {
   5019             UWord i;
   5020             Word zix;
   5021             SecMap* sm;
   5022             LineZ* lineZ;
   5023             /* This line is not in the cache.  Do not force it in; instead
   5024                modify it in-place. */
   5025             /* find the Z line to write in and rcdec it or the
   5026                associated F line. */
   5027             find_Z_for_writing( &sm, &zix, tag );
   5028             tl_assert(sm);
   5029             tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES);
   5030             lineZ = &sm->linesZ[zix];
   5031             lineZ->dict[0] = svNew;
   5032             lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID;
   5033             for (i = 0; i < N_LINE_ARANGE/4; i++)
   5034                lineZ->ix2s[i] = 0; /* all refer to dict[0] */
   5035             rcinc_LineZ(lineZ);
   5036          }
   5037          aligned_start += N_LINE_ARANGE;
   5038          aligned_len -= N_LINE_ARANGE;
   5039       }
   5040       tl_assert(aligned_start == after_start);
   5041       tl_assert(aligned_len == 0);
   5042    }
   5043 }
   5044 
   5045 
   5046 /////////////////////////////////////////////////////////
   5047 //                                                     //
   5048 // Front-filtering accesses                            //
   5049 //                                                     //
   5050 /////////////////////////////////////////////////////////
   5051 
   5052 static UWord stats__f_ac = 0;
   5053 static UWord stats__f_sk = 0;
   5054 
   5055 #if 0
   5056 #  define STATS__F_SHOW \
   5057      do { \
   5058         if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \
   5059            VG_(printf)("filters: ac %lu sk %lu\n",   \
   5060            stats__f_ac, stats__f_sk); \
   5061      } while (0)
   5062 #else
   5063 #  define STATS__F_SHOW /* */
   5064 #endif
   5065 
   5066 void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) {
   5067    stats__f_ac++;
   5068    STATS__F_SHOW;
   5069    if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) {
   5070       stats__f_sk++;
   5071       return;
   5072    }
   5073    zsm_sapply08__msmcwrite(thr, a);
   5074 }
   5075 
   5076 void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) {
   5077    stats__f_ac++;
   5078    STATS__F_SHOW;
   5079    if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) {
   5080       stats__f_sk++;
   5081       return;
   5082    }
   5083    zsm_sapply16__msmcwrite(thr, a);
   5084 }
   5085 
   5086 void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) {
   5087    stats__f_ac++;
   5088    STATS__F_SHOW;
   5089    if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) {
   5090       stats__f_sk++;
   5091       return;
   5092    }
   5093    zsm_sapply32__msmcwrite(thr, a);
   5094 }
   5095 
   5096 void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) {
   5097    stats__f_ac++;
   5098    STATS__F_SHOW;
   5099    if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) {
   5100       stats__f_sk++;
   5101       return;
   5102    }
   5103    zsm_sapply64__msmcwrite(thr, a);
   5104 }
   5105 
   5106 void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len )
   5107 {
   5108    /* fast track a couple of common cases */
   5109    if (len == 4 && aligned32(a)) {
   5110       zsm_sapply32_f__msmcwrite( thr, a );
   5111       return;
   5112    }
   5113    if (len == 8 && aligned64(a)) {
   5114       zsm_sapply64_f__msmcwrite( thr, a );
   5115       return;
   5116    }
   5117 
   5118    /* be completely general (but as efficient as possible) */
   5119    if (len == 0) return;
   5120 
   5121    if (!aligned16(a) && len >= 1) {
   5122       zsm_sapply08_f__msmcwrite( thr, a );
   5123       a += 1;
   5124       len -= 1;
   5125       tl_assert(aligned16(a));
   5126    }
   5127    if (len == 0) return;
   5128 
   5129    if (!aligned32(a) && len >= 2) {
   5130       zsm_sapply16_f__msmcwrite( thr, a );
   5131       a += 2;
   5132       len -= 2;
   5133       tl_assert(aligned32(a));
   5134    }
   5135    if (len == 0) return;
   5136 
   5137    if (!aligned64(a) && len >= 4) {
   5138       zsm_sapply32_f__msmcwrite( thr, a );
   5139       a += 4;
   5140       len -= 4;
   5141       tl_assert(aligned64(a));
   5142    }
   5143    if (len == 0) return;
   5144 
   5145    if (len >= 8) {
   5146       tl_assert(aligned64(a));
   5147       while (len >= 8) {
   5148          zsm_sapply64_f__msmcwrite( thr, a );
   5149          a += 8;
   5150          len -= 8;
   5151       }
   5152       tl_assert(aligned64(a));
   5153    }
   5154    if (len == 0) return;
   5155 
   5156    if (len >= 4)
   5157       tl_assert(aligned32(a));
   5158    if (len >= 4) {
   5159       zsm_sapply32_f__msmcwrite( thr, a );
   5160       a += 4;
   5161       len -= 4;
   5162    }
   5163    if (len == 0) return;
   5164 
   5165    if (len >= 2)
   5166       tl_assert(aligned16(a));
   5167    if (len >= 2) {
   5168       zsm_sapply16_f__msmcwrite( thr, a );
   5169       a += 2;
   5170       len -= 2;
   5171    }
   5172    if (len == 0) return;
   5173 
   5174    if (len >= 1) {
   5175       zsm_sapply08_f__msmcwrite( thr, a );
   5176       //a += 1;
   5177       len -= 1;
   5178    }
   5179    tl_assert(len == 0);
   5180 }
   5181 
   5182 void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) {
   5183    stats__f_ac++;
   5184    STATS__F_SHOW;
   5185    if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) {
   5186       stats__f_sk++;
   5187       return;
   5188    }
   5189    zsm_sapply08__msmcread(thr, a);
   5190 }
   5191 
   5192 void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) {
   5193    stats__f_ac++;
   5194    STATS__F_SHOW;
   5195    if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) {
   5196       stats__f_sk++;
   5197       return;
   5198    }
   5199    zsm_sapply16__msmcread(thr, a);
   5200 }
   5201 
   5202 void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) {
   5203    stats__f_ac++;
   5204    STATS__F_SHOW;
   5205    if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) {
   5206       stats__f_sk++;
   5207       return;
   5208    }
   5209    zsm_sapply32__msmcread(thr, a);
   5210 }
   5211 
   5212 void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) {
   5213    stats__f_ac++;
   5214    STATS__F_SHOW;
   5215    if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) {
   5216       stats__f_sk++;
   5217       return;
   5218    }
   5219    zsm_sapply64__msmcread(thr, a);
   5220 }
   5221 
   5222 void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len )
   5223 {
   5224    /* fast track a couple of common cases */
   5225    if (len == 4 && aligned32(a)) {
   5226       zsm_sapply32_f__msmcread( thr, a );
   5227       return;
   5228    }
   5229    if (len == 8 && aligned64(a)) {
   5230       zsm_sapply64_f__msmcread( thr, a );
   5231       return;
   5232    }
   5233 
   5234    /* be completely general (but as efficient as possible) */
   5235    if (len == 0) return;
   5236 
   5237    if (!aligned16(a) && len >= 1) {
   5238       zsm_sapply08_f__msmcread( thr, a );
   5239       a += 1;
   5240       len -= 1;
   5241       tl_assert(aligned16(a));
   5242    }
   5243    if (len == 0) return;
   5244 
   5245    if (!aligned32(a) && len >= 2) {
   5246       zsm_sapply16_f__msmcread( thr, a );
   5247       a += 2;
   5248       len -= 2;
   5249       tl_assert(aligned32(a));
   5250    }
   5251    if (len == 0) return;
   5252 
   5253    if (!aligned64(a) && len >= 4) {
   5254       zsm_sapply32_f__msmcread( thr, a );
   5255       a += 4;
   5256       len -= 4;
   5257       tl_assert(aligned64(a));
   5258    }
   5259    if (len == 0) return;
   5260 
   5261    if (len >= 8) {
   5262       tl_assert(aligned64(a));
   5263       while (len >= 8) {
   5264          zsm_sapply64_f__msmcread( thr