1 2 /*--------------------------------------------------------------------*/ 3 /*--- LibHB: a library for implementing and checking ---*/ 4 /*--- the happens-before relationship in concurrent programs. ---*/ 5 /*--- libhb_main.c ---*/ 6 /*--------------------------------------------------------------------*/ 7 8 /* 9 This file is part of LibHB, a library for implementing and checking 10 the happens-before relationship in concurrent programs. 11 12 Copyright (C) 2008-2010 OpenWorks Ltd 13 info (at) open-works.co.uk 14 15 This program is free software; you can redistribute it and/or 16 modify it under the terms of the GNU General Public License as 17 published by the Free Software Foundation; either version 2 of the 18 License, or (at your option) any later version. 19 20 This program is distributed in the hope that it will be useful, but 21 WITHOUT ANY WARRANTY; without even the implied warranty of 22 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 23 General Public License for more details. 24 25 You should have received a copy of the GNU General Public License 26 along with this program; if not, write to the Free Software 27 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 28 02111-1307, USA. 29 30 The GNU General Public License is contained in the file COPYING. 31 */ 32 33 #include "pub_tool_basics.h" 34 #include "pub_tool_libcassert.h" 35 #include "pub_tool_libcbase.h" 36 #include "pub_tool_libcprint.h" 37 #include "pub_tool_mallocfree.h" 38 #include "pub_tool_wordfm.h" 39 #include "pub_tool_sparsewa.h" 40 #include "pub_tool_xarray.h" 41 #include "pub_tool_oset.h" 42 #include "pub_tool_threadstate.h" 43 #include "pub_tool_aspacemgr.h" 44 #include "pub_tool_execontext.h" 45 #include "pub_tool_errormgr.h" 46 #include "pub_tool_options.h" // VG_(clo_stats) 47 #include "hg_basics.h" 48 #include "hg_wordset.h" 49 #include "hg_lock_n_thread.h" 50 #include "hg_errors.h" 51 52 #include "libhb.h" 53 54 55 ///////////////////////////////////////////////////////////////// 56 ///////////////////////////////////////////////////////////////// 57 // // 58 // Debugging #defines // 59 // // 60 ///////////////////////////////////////////////////////////////// 61 ///////////////////////////////////////////////////////////////// 62 63 /* Check the sanity of shadow values in the core memory state 64 machine. Change #if 0 to #if 1 to enable this. */ 65 #if 0 66 # define CHECK_MSM 1 67 #else 68 # define CHECK_MSM 0 69 #endif 70 71 72 /* Check sanity (reference counts, etc) in the conflicting access 73 machinery. Change #if 0 to #if 1 to enable this. */ 74 #if 0 75 # define CHECK_CEM 1 76 #else 77 # define CHECK_CEM 0 78 #endif 79 80 81 /* Check sanity in the compressed shadow memory machinery, 82 particularly in its caching innards. Unfortunately there's no 83 almost-zero-cost way to make them selectable at run time. Hence 84 set the #if 0 to #if 1 and rebuild if you want them. 
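
   The flags are used as ordinary C constants, so when one is 0 the
   guarded checks can be folded away entirely by the compiler; for
   instance the cacheline code later in this file writes

      if (CHECK_ZSM)
         tl_assert(is_sane_CacheLine(cl));

   and the whole statement vanishes in a CHECK_ZSM == 0 build.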
*/ 85 #if 0 86 # define CHECK_ZSM 1 /* do sanity-check CacheLine stuff */ 87 # define inline __attribute__((noinline)) 88 /* probably want to ditch -fomit-frame-pointer too */ 89 #else 90 # define CHECK_ZSM 0 /* don't sanity-check CacheLine stuff */ 91 #endif 92 93 94 ///////////////////////////////////////////////////////////////// 95 ///////////////////////////////////////////////////////////////// 96 // // 97 // Forward declarations // 98 // // 99 ///////////////////////////////////////////////////////////////// 100 ///////////////////////////////////////////////////////////////// 101 102 /* fwds for 103 Globals needed by other parts of the library. These are set 104 once at startup and then never changed. */ 105 static void (*main_get_stacktrace)( Thr*, Addr*, UWord ) = NULL; 106 static ExeContext* (*main_get_EC)( Thr* ) = NULL; 107 108 109 110 ///////////////////////////////////////////////////////////////// 111 ///////////////////////////////////////////////////////////////// 112 // // 113 // SECTION BEGIN compressed shadow memory // 114 // // 115 ///////////////////////////////////////////////////////////////// 116 ///////////////////////////////////////////////////////////////// 117 118 #ifndef __HB_ZSM_H 119 #define __HB_ZSM_H 120 121 typedef ULong SVal; 122 123 /* This value has special significance to the implementation, and callers 124 may not store it in the shadow memory. */ 125 #define SVal_INVALID (3ULL << 62) 126 127 /* This is the default value for shadow memory. Initially the shadow 128 memory contains no accessible areas and so all reads produce this 129 value. TODO: make this caller-defineable. */ 130 #define SVal_NOACCESS (2ULL << 62) 131 132 /* Initialise the library. Once initialised, it will (or may) call 133 rcinc and rcdec in response to all the calls below, in order to 134 allow the user to do reference counting on the SVals stored herein. 135 It is important to understand, however, that due to internal 136 caching, the reference counts are in general inaccurate, and can be 137 both above or below the true reference count for an item. In 138 particular, the library may indicate that the reference count for 139 an item is zero, when in fact it is not. 140 141 To make the reference counting exact and therefore non-pointless, 142 call zsm_flush_cache. Immediately after it returns, the reference 143 counts for all items, as deduced by the caller by observing calls 144 to rcinc and rcdec, will be correct, and so any items with a zero 145 reference count may be freed (or at least considered to be 146 unreferenced by this library). 147 */ 148 static void zsm_init ( void(*rcinc)(SVal), void(*rcdec)(SVal) ); 149 150 static void zsm_sset_range ( Addr, SizeT, SVal ); 151 static void zsm_scopy_range ( Addr, Addr, SizeT ); 152 static void zsm_flush_cache ( void ); 153 154 #endif /* ! __HB_ZSM_H */ 155 156 157 /* Round a up to the next multiple of N. N must be a power of 2 */ 158 #define ROUNDUP(a, N) ((a + N - 1) & ~(N-1)) 159 /* Round a down to the next multiple of N. 
N must be a power of 2 */ 160 #define ROUNDDN(a, N) ((a) & ~(N-1)) 161 162 163 164 /* ------ User-supplied RC functions ------ */ 165 static void(*rcinc)(SVal) = NULL; 166 static void(*rcdec)(SVal) = NULL; 167 168 169 /* ------ CacheLine ------ */ 170 171 #define N_LINE_BITS 6 /* must be >= 3 */ 172 #define N_LINE_ARANGE (1 << N_LINE_BITS) 173 #define N_LINE_TREES (N_LINE_ARANGE >> 3) 174 175 typedef 176 struct { 177 UShort descrs[N_LINE_TREES]; 178 SVal svals[N_LINE_ARANGE]; // == N_LINE_TREES * 8 179 } 180 CacheLine; 181 182 #define TREE_DESCR_16_0 (1<<0) 183 #define TREE_DESCR_32_0 (1<<1) 184 #define TREE_DESCR_16_1 (1<<2) 185 #define TREE_DESCR_64 (1<<3) 186 #define TREE_DESCR_16_2 (1<<4) 187 #define TREE_DESCR_32_1 (1<<5) 188 #define TREE_DESCR_16_3 (1<<6) 189 #define TREE_DESCR_8_0 (1<<7) 190 #define TREE_DESCR_8_1 (1<<8) 191 #define TREE_DESCR_8_2 (1<<9) 192 #define TREE_DESCR_8_3 (1<<10) 193 #define TREE_DESCR_8_4 (1<<11) 194 #define TREE_DESCR_8_5 (1<<12) 195 #define TREE_DESCR_8_6 (1<<13) 196 #define TREE_DESCR_8_7 (1<<14) 197 #define TREE_DESCR_DTY (1<<15) 198 199 typedef 200 struct { 201 SVal dict[4]; /* can represent up to 4 diff values in the line */ 202 UChar ix2s[N_LINE_ARANGE/4]; /* array of N_LINE_ARANGE 2-bit 203 dict indexes */ 204 /* if dict[0] == SVal_INVALID then dict[1] is the index of the 205 LineF to use, and dict[2..] are also SVal_INVALID. */ 206 } 207 LineZ; /* compressed rep for a cache line */ 208 209 typedef 210 struct { 211 Bool inUse; 212 SVal w64s[N_LINE_ARANGE]; 213 } 214 LineF; /* full rep for a cache line */ 215 216 /* Shadow memory. 217 Primary map is a WordFM Addr SecMap*. 218 SecMaps cover some page-size-ish section of address space and hold 219 a compressed representation. 220 CacheLine-sized chunks of SecMaps are copied into a Cache, being 221 decompressed when moved into the cache and recompressed on the 222 way out. Because of this, the cache must operate as a writeback 223 cache, not a writethrough one. 224 225 Each SecMap must hold a power-of-2 number of CacheLines. Hence 226 N_SECMAP_BITS must >= N_LINE_BITS. 227 */ 228 #define N_SECMAP_BITS 13 229 #define N_SECMAP_ARANGE (1 << N_SECMAP_BITS) 230 231 // # CacheLines held by a SecMap 232 #define N_SECMAP_ZLINES (N_SECMAP_ARANGE / N_LINE_ARANGE) 233 234 /* The data in the SecMap is held in the array of LineZs. Each LineZ 235 either carries the required data directly, in a compressed 236 representation, or it holds (in .dict[0]) an index to the LineF in 237 .linesF that holds the full representation. 238 239 Currently-unused LineF's have their .inUse bit set to zero. 240 Since each in-use LineF is referred to be exactly one LineZ, 241 the number of .linesZ[] that refer to .linesF should equal 242 the number of .linesF[] that have .inUse == True. 
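
   A concrete example of the two representations: suppose the 64 SVals
   covered by one LineZ take just two distinct values, A and B.  Then
   the compressed form is

      dict[0] = A, dict[1] = B, dict[2] = dict[3] = SVal_INVALID
      ix2s[]  = 64 two-bit indexes, each 0 (meaning A) or 1 (meaning B)

   that is, 2 bits per SVal plus a small dictionary, instead of 64 bits
   per SVal.  If a fifth distinct value turns up, the line no longer
   fits this scheme; it is then parked in a LineF (all 64 SVals stored
   verbatim in .w64s[]), and the LineZ records that fact by setting
   dict[0] = SVal_INVALID and dict[1] = the index of that LineF within
   .linesF[].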
243 244 RC obligations: the RCs presented to the user include exactly 245 the values in: 246 * direct Z reps, that is, ones for which .dict[0] != SVal_INVALID 247 * F reps that are in use (.inUse == True) 248 249 Hence the following actions at the following transitions are required: 250 251 F rep: .inUse==True -> .inUse==False -- rcdec_LineF 252 F rep: .inUse==False -> .inUse==True -- rcinc_LineF 253 Z rep: .dict[0] from other to SVal_INVALID -- rcdec_LineZ 254 Z rep: .dict[0] from SVal_INVALID to other -- rcinc_LineZ 255 */ 256 typedef 257 struct { 258 UInt magic; 259 LineZ linesZ[N_SECMAP_ZLINES]; 260 LineF* linesF; 261 UInt linesF_size; 262 } 263 SecMap; 264 265 #define SecMap_MAGIC 0x571e58cbU 266 267 static inline Bool is_sane_SecMap ( SecMap* sm ) { 268 return sm != NULL && sm->magic == SecMap_MAGIC; 269 } 270 271 /* ------ Cache ------ */ 272 273 #define N_WAY_BITS 16 274 #define N_WAY_NENT (1 << N_WAY_BITS) 275 276 /* Each tag is the address of the associated CacheLine, rounded down 277 to a CacheLine address boundary. A CacheLine size must be a power 278 of 2 and must be 8 or more. Hence an easy way to initialise the 279 cache so it is empty is to set all the tag values to any value % 8 280 != 0, eg 1. This means all queries in the cache initially miss. 281 It does however require us to detect and not writeback, any line 282 with a bogus tag. */ 283 typedef 284 struct { 285 CacheLine lyns0[N_WAY_NENT]; 286 Addr tags0[N_WAY_NENT]; 287 } 288 Cache; 289 290 static inline Bool is_valid_scache_tag ( Addr tag ) { 291 /* a valid tag should be naturally aligned to the start of 292 a CacheLine. */ 293 return 0 == (tag & (N_LINE_ARANGE - 1)); 294 } 295 296 297 /* --------- Primary data structures --------- */ 298 299 /* Shadow memory primary map */ 300 static WordFM* map_shmem = NULL; /* WordFM Addr SecMap* */ 301 static Cache cache_shmem; 302 303 304 static UWord stats__secmaps_search = 0; // # SM finds 305 static UWord stats__secmaps_search_slow = 0; // # SM lookupFMs 306 static UWord stats__secmaps_allocd = 0; // # SecMaps issued 307 static UWord stats__secmap_ga_space_covered = 0; // # ga bytes covered 308 static UWord stats__secmap_linesZ_allocd = 0; // # LineZ's issued 309 static UWord stats__secmap_linesZ_bytes = 0; // .. using this much storage 310 static UWord stats__secmap_linesF_allocd = 0; // # LineF's issued 311 static UWord stats__secmap_linesF_bytes = 0; // .. 
using this much storage 312 static UWord stats__secmap_iterator_steppings = 0; // # calls to stepSMIter 313 static UWord stats__cache_Z_fetches = 0; // # Z lines fetched 314 static UWord stats__cache_Z_wbacks = 0; // # Z lines written back 315 static UWord stats__cache_F_fetches = 0; // # F lines fetched 316 static UWord stats__cache_F_wbacks = 0; // # F lines written back 317 static UWord stats__cache_invals = 0; // # cache invals 318 static UWord stats__cache_flushes = 0; // # cache flushes 319 static UWord stats__cache_totrefs = 0; // # total accesses 320 static UWord stats__cache_totmisses = 0; // # misses 321 static ULong stats__cache_make_New_arange = 0; // total arange made New 322 static ULong stats__cache_make_New_inZrep = 0; // arange New'd on Z reps 323 static UWord stats__cline_normalises = 0; // # calls to cacheline_normalise 324 static UWord stats__cline_cread64s = 0; // # calls to s_m_read64 325 static UWord stats__cline_cread32s = 0; // # calls to s_m_read32 326 static UWord stats__cline_cread16s = 0; // # calls to s_m_read16 327 static UWord stats__cline_cread08s = 0; // # calls to s_m_read8 328 static UWord stats__cline_cwrite64s = 0; // # calls to s_m_write64 329 static UWord stats__cline_cwrite32s = 0; // # calls to s_m_write32 330 static UWord stats__cline_cwrite16s = 0; // # calls to s_m_write16 331 static UWord stats__cline_cwrite08s = 0; // # calls to s_m_write8 332 static UWord stats__cline_sread08s = 0; // # calls to s_m_get8 333 static UWord stats__cline_swrite08s = 0; // # calls to s_m_set8 334 static UWord stats__cline_swrite16s = 0; // # calls to s_m_set16 335 static UWord stats__cline_swrite32s = 0; // # calls to s_m_set32 336 static UWord stats__cline_swrite64s = 0; // # calls to s_m_set64 337 static UWord stats__cline_scopy08s = 0; // # calls to s_m_copy8 338 static UWord stats__cline_64to32splits = 0; // # 64-bit accesses split 339 static UWord stats__cline_32to16splits = 0; // # 32-bit accesses split 340 static UWord stats__cline_16to8splits = 0; // # 16-bit accesses split 341 static UWord stats__cline_64to32pulldown = 0; // # calls to pulldown_to_32 342 static UWord stats__cline_32to16pulldown = 0; // # calls to pulldown_to_16 343 static UWord stats__cline_16to8pulldown = 0; // # calls to pulldown_to_8 344 static UWord stats__vts__tick = 0; // # calls to VTS__tick 345 static UWord stats__vts__join = 0; // # calls to VTS__join 346 static UWord stats__vts__cmpLEQ = 0; // # calls to VTS__cmpLEQ 347 static UWord stats__vts__cmp_structural = 0; // # calls to VTS__cmp_structural 348 static UWord stats__vts__cmp_structural_slow = 0; // # calls to VTS__cmp_structural w/ slow case 349 static UWord stats__vts__indexat_slow = 0; // # calls to VTS__indexAt_SLOW 350 static UWord stats__vts_set__fadoa = 0; // # calls to vts_set__find_and_dealloc__or_add 351 static UWord stats__vts_set__fadoa_d = 0; // # calls to vts_set__find_and_dealloc__or_add 352 // that lead to a deallocation 353 354 355 static inline Addr shmem__round_to_SecMap_base ( Addr a ) { 356 return a & ~(N_SECMAP_ARANGE - 1); 357 } 358 static inline UWord shmem__get_SecMap_offset ( Addr a ) { 359 return a & (N_SECMAP_ARANGE - 1); 360 } 361 362 363 /*----------------------------------------------------------------*/ 364 /*--- map_shmem :: WordFM Addr SecMap ---*/ 365 /*--- shadow memory (low level handlers) (shmem__* fns) ---*/ 366 /*----------------------------------------------------------------*/ 367 368 /*--------------- SecMap allocation --------------- */ 369 370 static HChar* shmem__bigchunk_next =
NULL; 371 static HChar* shmem__bigchunk_end1 = NULL; 372 373 static void* shmem__bigchunk_alloc ( SizeT n ) 374 { 375 const SizeT sHMEM__BIGCHUNK_SIZE = 4096 * 256 * 4; 376 tl_assert(n > 0); 377 n = VG_ROUNDUP(n, 16); 378 tl_assert(shmem__bigchunk_next <= shmem__bigchunk_end1); 379 tl_assert(shmem__bigchunk_end1 - shmem__bigchunk_next 380 <= (SSizeT)sHMEM__BIGCHUNK_SIZE); 381 if (shmem__bigchunk_next + n > shmem__bigchunk_end1) { 382 if (0) 383 VG_(printf)("XXXXX bigchunk: abandoning %d bytes\n", 384 (Int)(shmem__bigchunk_end1 - shmem__bigchunk_next)); 385 shmem__bigchunk_next = VG_(am_shadow_alloc)( sHMEM__BIGCHUNK_SIZE ); 386 if (shmem__bigchunk_next == NULL) 387 VG_(out_of_memory_NORETURN)( 388 "helgrind:shmem__bigchunk_alloc", sHMEM__BIGCHUNK_SIZE ); 389 shmem__bigchunk_end1 = shmem__bigchunk_next + sHMEM__BIGCHUNK_SIZE; 390 } 391 tl_assert(shmem__bigchunk_next); 392 tl_assert( 0 == (((Addr)shmem__bigchunk_next) & (16-1)) ); 393 tl_assert(shmem__bigchunk_next + n <= shmem__bigchunk_end1); 394 shmem__bigchunk_next += n; 395 return shmem__bigchunk_next - n; 396 } 397 398 static SecMap* shmem__alloc_SecMap ( void ) 399 { 400 Word i, j; 401 SecMap* sm = shmem__bigchunk_alloc( sizeof(SecMap) ); 402 if (0) VG_(printf)("alloc_SecMap %p\n",sm); 403 tl_assert(sm); 404 sm->magic = SecMap_MAGIC; 405 for (i = 0; i < N_SECMAP_ZLINES; i++) { 406 sm->linesZ[i].dict[0] = SVal_NOACCESS; 407 sm->linesZ[i].dict[1] = SVal_INVALID; 408 sm->linesZ[i].dict[2] = SVal_INVALID; 409 sm->linesZ[i].dict[3] = SVal_INVALID; 410 for (j = 0; j < N_LINE_ARANGE/4; j++) 411 sm->linesZ[i].ix2s[j] = 0; /* all reference dict[0] */ 412 } 413 sm->linesF = NULL; 414 sm->linesF_size = 0; 415 stats__secmaps_allocd++; 416 stats__secmap_ga_space_covered += N_SECMAP_ARANGE; 417 stats__secmap_linesZ_allocd += N_SECMAP_ZLINES; 418 stats__secmap_linesZ_bytes += N_SECMAP_ZLINES * sizeof(LineZ); 419 return sm; 420 } 421 422 typedef struct { Addr gaKey; SecMap* sm; } SMCacheEnt; 423 static SMCacheEnt smCache[3] = { {1,NULL}, {1,NULL}, {1,NULL} }; 424 425 static SecMap* shmem__find_SecMap ( Addr ga ) 426 { 427 SecMap* sm = NULL; 428 Addr gaKey = shmem__round_to_SecMap_base(ga); 429 // Cache 430 stats__secmaps_search++; 431 if (LIKELY(gaKey == smCache[0].gaKey)) 432 return smCache[0].sm; 433 if (LIKELY(gaKey == smCache[1].gaKey)) { 434 SMCacheEnt tmp = smCache[0]; 435 smCache[0] = smCache[1]; 436 smCache[1] = tmp; 437 return smCache[0].sm; 438 } 439 if (gaKey == smCache[2].gaKey) { 440 SMCacheEnt tmp = smCache[1]; 441 smCache[1] = smCache[2]; 442 smCache[2] = tmp; 443 return smCache[1].sm; 444 } 445 // end Cache 446 stats__secmaps_search_slow++; 447 if (VG_(lookupFM)( map_shmem, 448 NULL/*keyP*/, (UWord*)&sm, (UWord)gaKey )) { 449 tl_assert(sm != NULL); 450 smCache[2] = smCache[1]; 451 smCache[1] = smCache[0]; 452 smCache[0].gaKey = gaKey; 453 smCache[0].sm = sm; 454 } else { 455 tl_assert(sm == NULL); 456 } 457 return sm; 458 } 459 460 static SecMap* shmem__find_or_alloc_SecMap ( Addr ga ) 461 { 462 SecMap* sm = shmem__find_SecMap ( ga ); 463 if (LIKELY(sm)) { 464 return sm; 465 } else { 466 /* create a new one */ 467 Addr gaKey = shmem__round_to_SecMap_base(ga); 468 sm = shmem__alloc_SecMap(); 469 tl_assert(sm); 470 VG_(addToFM)( map_shmem, (UWord)gaKey, (UWord)sm ); 471 return sm; 472 } 473 } 474 475 476 /* ------------ LineF and LineZ related ------------ */ 477 478 static void rcinc_LineF ( LineF* lineF ) { 479 UWord i; 480 tl_assert(lineF->inUse); 481 for (i = 0; i < N_LINE_ARANGE; i++) 482 rcinc(lineF->w64s[i]); 483 } 484 485 static 
void rcdec_LineF ( LineF* lineF ) { 486 UWord i; 487 tl_assert(lineF->inUse); 488 for (i = 0; i < N_LINE_ARANGE; i++) 489 rcdec(lineF->w64s[i]); 490 } 491 492 static void rcinc_LineZ ( LineZ* lineZ ) { 493 tl_assert(lineZ->dict[0] != SVal_INVALID); 494 rcinc(lineZ->dict[0]); 495 if (lineZ->dict[1] != SVal_INVALID) rcinc(lineZ->dict[1]); 496 if (lineZ->dict[2] != SVal_INVALID) rcinc(lineZ->dict[2]); 497 if (lineZ->dict[3] != SVal_INVALID) rcinc(lineZ->dict[3]); 498 } 499 500 static void rcdec_LineZ ( LineZ* lineZ ) { 501 tl_assert(lineZ->dict[0] != SVal_INVALID); 502 rcdec(lineZ->dict[0]); 503 if (lineZ->dict[1] != SVal_INVALID) rcdec(lineZ->dict[1]); 504 if (lineZ->dict[2] != SVal_INVALID) rcdec(lineZ->dict[2]); 505 if (lineZ->dict[3] != SVal_INVALID) rcdec(lineZ->dict[3]); 506 } 507 508 inline 509 static void write_twobit_array ( UChar* arr, UWord ix, UWord b2 ) { 510 Word bix, shft, mask, prep; 511 tl_assert(ix >= 0); 512 bix = ix >> 2; 513 shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */ 514 mask = 3 << shft; 515 prep = b2 << shft; 516 arr[bix] = (arr[bix] & ~mask) | prep; 517 } 518 519 inline 520 static UWord read_twobit_array ( UChar* arr, UWord ix ) { 521 Word bix, shft; 522 tl_assert(ix >= 0); 523 bix = ix >> 2; 524 shft = 2 * (ix & 3); /* 0, 2, 4 or 6 */ 525 return (arr[bix] >> shft) & 3; 526 } 527 528 /* Given address 'tag', find either the Z or F line containing relevant 529 data, so it can be read into the cache. 530 */ 531 static void find_ZF_for_reading ( /*OUT*/LineZ** zp, 532 /*OUT*/LineF** fp, Addr tag ) { 533 LineZ* lineZ; 534 LineF* lineF; 535 UWord zix; 536 SecMap* sm = shmem__find_or_alloc_SecMap(tag); 537 UWord smoff = shmem__get_SecMap_offset(tag); 538 /* since smoff is derived from a valid tag, it should be 539 cacheline-aligned. */ 540 tl_assert(0 == (smoff & (N_LINE_ARANGE - 1))); 541 zix = smoff >> N_LINE_BITS; 542 tl_assert(zix < N_SECMAP_ZLINES); 543 lineZ = &sm->linesZ[zix]; 544 lineF = NULL; 545 if (lineZ->dict[0] == SVal_INVALID) { 546 UInt fix = (UInt)lineZ->dict[1]; 547 tl_assert(sm->linesF); 548 tl_assert(sm->linesF_size > 0); 549 tl_assert(fix >= 0 && fix < sm->linesF_size); 550 lineF = &sm->linesF[fix]; 551 tl_assert(lineF->inUse); 552 lineZ = NULL; 553 } 554 *zp = lineZ; 555 *fp = lineF; 556 } 557 558 /* Given address 'tag', return the relevant SecMap and the index of 559 the LineZ within it, in the expectation that the line is to be 560 overwritten. Regardless of whether 'tag' is currently associated 561 with a Z or F representation, to rcdec on the current 562 representation, in recognition of the fact that the contents are 563 just about to be overwritten. */ 564 static __attribute__((noinline)) 565 void find_Z_for_writing ( /*OUT*/SecMap** smp, 566 /*OUT*/Word* zixp, 567 Addr tag ) { 568 LineZ* lineZ; 569 LineF* lineF; 570 UWord zix; 571 SecMap* sm = shmem__find_or_alloc_SecMap(tag); 572 UWord smoff = shmem__get_SecMap_offset(tag); 573 /* since smoff is derived from a valid tag, it should be 574 cacheline-aligned. */ 575 tl_assert(0 == (smoff & (N_LINE_ARANGE - 1))); 576 zix = smoff >> N_LINE_BITS; 577 tl_assert(zix < N_SECMAP_ZLINES); 578 lineZ = &sm->linesZ[zix]; 579 lineF = NULL; 580 /* re RCs, we are freeing up this LineZ/LineF so that new data can 581 be parked in it. Hence have to rcdec it accordingly. */ 582 /* If lineZ has an associated lineF, free it up. 
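      In RC terms (see the transition table in the SecMap comment above):
      dropping an in-use LineF is the ".inUse==True -> .inUse==False"
      transition, hence the rcdec_LineF below, while overwriting a direct
      Z rep corresponds to ".dict[0] from other to SVal_INVALID", hence
      the rcdec_LineZ in the else branch.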
*/ 583 if (lineZ->dict[0] == SVal_INVALID) { 584 UInt fix = (UInt)lineZ->dict[1]; 585 tl_assert(sm->linesF); 586 tl_assert(sm->linesF_size > 0); 587 tl_assert(fix >= 0 && fix < sm->linesF_size); 588 lineF = &sm->linesF[fix]; 589 tl_assert(lineF->inUse); 590 rcdec_LineF(lineF); 591 lineF->inUse = False; 592 } else { 593 rcdec_LineZ(lineZ); 594 } 595 *smp = sm; 596 *zixp = zix; 597 } 598 599 static __attribute__((noinline)) 600 void alloc_F_for_writing ( /*MOD*/SecMap* sm, /*OUT*/Word* fixp ) { 601 UInt i, new_size; 602 LineF* nyu; 603 604 if (sm->linesF) { 605 tl_assert(sm->linesF_size > 0); 606 } else { 607 tl_assert(sm->linesF_size == 0); 608 } 609 610 if (sm->linesF) { 611 for (i = 0; i < sm->linesF_size; i++) { 612 if (!sm->linesF[i].inUse) { 613 *fixp = (Word)i; 614 return; 615 } 616 } 617 } 618 619 /* No free F line found. Expand existing array and try again. */ 620 new_size = sm->linesF_size==0 ? 1 : 2 * sm->linesF_size; 621 nyu = HG_(zalloc)( "libhb.aFfw.1 (LineF storage)", 622 new_size * sizeof(LineF) ); 623 tl_assert(nyu); 624 625 stats__secmap_linesF_allocd += (new_size - sm->linesF_size); 626 stats__secmap_linesF_bytes += (new_size - sm->linesF_size) 627 * sizeof(LineF); 628 629 if (0) 630 VG_(printf)("SM %p: expand F array from %d to %d\n", 631 sm, (Int)sm->linesF_size, new_size); 632 633 for (i = 0; i < new_size; i++) 634 nyu[i].inUse = False; 635 636 if (sm->linesF) { 637 for (i = 0; i < sm->linesF_size; i++) { 638 tl_assert(sm->linesF[i].inUse); 639 nyu[i] = sm->linesF[i]; 640 } 641 VG_(memset)(sm->linesF, 0, sm->linesF_size * sizeof(LineF) ); 642 HG_(free)(sm->linesF); 643 } 644 645 sm->linesF = nyu; 646 sm->linesF_size = new_size; 647 648 for (i = 0; i < sm->linesF_size; i++) { 649 if (!sm->linesF[i].inUse) { 650 *fixp = (Word)i; 651 return; 652 } 653 } 654 655 /*NOTREACHED*/ 656 tl_assert(0); 657 } 658 659 660 /* ------------ CacheLine and implicit-tree related ------------ */ 661 662 __attribute__((unused)) 663 static void pp_CacheLine ( CacheLine* cl ) { 664 Word i; 665 if (!cl) { 666 VG_(printf)("%s","pp_CacheLine(NULL)\n"); 667 return; 668 } 669 for (i = 0; i < N_LINE_TREES; i++) 670 VG_(printf)(" descr: %04lx\n", (UWord)cl->descrs[i]); 671 for (i = 0; i < N_LINE_ARANGE; i++) 672 VG_(printf)(" sval: %08lx\n", (UWord)cl->svals[i]); 673 } 674 675 static UChar descr_to_validbits ( UShort descr ) 676 { 677 /* a.k.a Party Time for gcc's constant folder */ 678 # define DESCR(b8_7, b8_6, b8_5, b8_4, b8_3, b8_2, b8_1, b8_0, \ 679 b16_3, b32_1, b16_2, b64, b16_1, b32_0, b16_0) \ 680 ( (UShort) ( ( (b8_7) << 14) | ( (b8_6) << 13) | \ 681 ( (b8_5) << 12) | ( (b8_4) << 11) | \ 682 ( (b8_3) << 10) | ( (b8_2) << 9) | \ 683 ( (b8_1) << 8) | ( (b8_0) << 7) | \ 684 ( (b16_3) << 6) | ( (b32_1) << 5) | \ 685 ( (b16_2) << 4) | ( (b64) << 3) | \ 686 ( (b16_1) << 2) | ( (b32_0) << 1) | \ 687 ( (b16_0) << 0) ) ) 688 689 # define BYTE(bit7, bit6, bit5, bit4, bit3, bit2, bit1, bit0) \ 690 ( (UChar) ( ( (bit7) << 7) | ( (bit6) << 6) | \ 691 ( (bit5) << 5) | ( (bit4) << 4) | \ 692 ( (bit3) << 3) | ( (bit2) << 2) | \ 693 ( (bit1) << 1) | ( (bit0) << 0) ) ) 694 695 /* these should all get folded out at compile time */ 696 tl_assert(DESCR(1,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_7); 697 tl_assert(DESCR(0,0,0,0,0,0,0,1, 0,0,0, 0, 0,0,0) == TREE_DESCR_8_0); 698 tl_assert(DESCR(0,0,0,0,0,0,0,0, 1,0,0, 0, 0,0,0) == TREE_DESCR_16_3); 699 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,0,0) == TREE_DESCR_32_1); 700 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,1, 0, 0,0,0) == TREE_DESCR_16_2); 701 
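   /* (For orientation: TREE_DESCR_64, checked next, describes a tree
      holding a single 64-bit value in tree[0]; the switch below maps it
      to BYTE(0,0,0,0,0,0,0,1), i.e. only tree[0] is valid.  See the
      "64" row at the end of the switch.) */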
tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0) == TREE_DESCR_64); 702 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 1,0,0) == TREE_DESCR_16_1); 703 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,1,0) == TREE_DESCR_32_0); 704 tl_assert(DESCR(0,0,0,0,0,0,0,0, 0,0,0, 0, 0,0,1) == TREE_DESCR_16_0); 705 706 switch (descr) { 707 /* 708 +--------------------------------- TREE_DESCR_8_7 709 | +------------------- TREE_DESCR_8_0 710 | | +---------------- TREE_DESCR_16_3 711 | | | +-------------- TREE_DESCR_32_1 712 | | | | +------------ TREE_DESCR_16_2 713 | | | | | +--------- TREE_DESCR_64 714 | | | | | | +------ TREE_DESCR_16_1 715 | | | | | | | +---- TREE_DESCR_32_0 716 | | | | | | | | +-- TREE_DESCR_16_0 717 | | | | | | | | | 718 | | | | | | | | | GRANULARITY, 7 -> 0 */ 719 case DESCR(1,1,1,1,1,1,1,1, 0,0,0, 0, 0,0,0): /* 8 8 8 8 8 8 8 8 */ 720 return BYTE(1,1,1,1,1,1,1,1); 721 case DESCR(1,1,0,0,1,1,1,1, 0,0,1, 0, 0,0,0): /* 8 8 16 8 8 8 8 */ 722 return BYTE(1,1,0,1,1,1,1,1); 723 case DESCR(0,0,1,1,1,1,1,1, 1,0,0, 0, 0,0,0): /* 16 8 8 8 8 8 8 */ 724 return BYTE(0,1,1,1,1,1,1,1); 725 case DESCR(0,0,0,0,1,1,1,1, 1,0,1, 0, 0,0,0): /* 16 16 8 8 8 8 */ 726 return BYTE(0,1,0,1,1,1,1,1); 727 728 case DESCR(1,1,1,1,1,1,0,0, 0,0,0, 0, 0,0,1): /* 8 8 8 8 8 8 16 */ 729 return BYTE(1,1,1,1,1,1,0,1); 730 case DESCR(1,1,0,0,1,1,0,0, 0,0,1, 0, 0,0,1): /* 8 8 16 8 8 16 */ 731 return BYTE(1,1,0,1,1,1,0,1); 732 case DESCR(0,0,1,1,1,1,0,0, 1,0,0, 0, 0,0,1): /* 16 8 8 8 8 16 */ 733 return BYTE(0,1,1,1,1,1,0,1); 734 case DESCR(0,0,0,0,1,1,0,0, 1,0,1, 0, 0,0,1): /* 16 16 8 8 16 */ 735 return BYTE(0,1,0,1,1,1,0,1); 736 737 case DESCR(1,1,1,1,0,0,1,1, 0,0,0, 0, 1,0,0): /* 8 8 8 8 16 8 8 */ 738 return BYTE(1,1,1,1,0,1,1,1); 739 case DESCR(1,1,0,0,0,0,1,1, 0,0,1, 0, 1,0,0): /* 8 8 16 16 8 8 */ 740 return BYTE(1,1,0,1,0,1,1,1); 741 case DESCR(0,0,1,1,0,0,1,1, 1,0,0, 0, 1,0,0): /* 16 8 8 16 8 8 */ 742 return BYTE(0,1,1,1,0,1,1,1); 743 case DESCR(0,0,0,0,0,0,1,1, 1,0,1, 0, 1,0,0): /* 16 16 16 8 8 */ 744 return BYTE(0,1,0,1,0,1,1,1); 745 746 case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 1,0,1): /* 8 8 8 8 16 16 */ 747 return BYTE(1,1,1,1,0,1,0,1); 748 case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 1,0,1): /* 8 8 16 16 16 */ 749 return BYTE(1,1,0,1,0,1,0,1); 750 case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 1,0,1): /* 16 8 8 16 16 */ 751 return BYTE(0,1,1,1,0,1,0,1); 752 case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 1,0,1): /* 16 16 16 16 */ 753 return BYTE(0,1,0,1,0,1,0,1); 754 755 case DESCR(0,0,0,0,1,1,1,1, 0,1,0, 0, 0,0,0): /* 32 8 8 8 8 */ 756 return BYTE(0,0,0,1,1,1,1,1); 757 case DESCR(0,0,0,0,1,1,0,0, 0,1,0, 0, 0,0,1): /* 32 8 8 16 */ 758 return BYTE(0,0,0,1,1,1,0,1); 759 case DESCR(0,0,0,0,0,0,1,1, 0,1,0, 0, 1,0,0): /* 32 16 8 8 */ 760 return BYTE(0,0,0,1,0,1,1,1); 761 case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 1,0,1): /* 32 16 16 */ 762 return BYTE(0,0,0,1,0,1,0,1); 763 764 case DESCR(1,1,1,1,0,0,0,0, 0,0,0, 0, 0,1,0): /* 8 8 8 8 32 */ 765 return BYTE(1,1,1,1,0,0,0,1); 766 case DESCR(1,1,0,0,0,0,0,0, 0,0,1, 0, 0,1,0): /* 8 8 16 32 */ 767 return BYTE(1,1,0,1,0,0,0,1); 768 case DESCR(0,0,1,1,0,0,0,0, 1,0,0, 0, 0,1,0): /* 16 8 8 32 */ 769 return BYTE(0,1,1,1,0,0,0,1); 770 case DESCR(0,0,0,0,0,0,0,0, 1,0,1, 0, 0,1,0): /* 16 16 32 */ 771 return BYTE(0,1,0,1,0,0,0,1); 772 773 case DESCR(0,0,0,0,0,0,0,0, 0,1,0, 0, 0,1,0): /* 32 32 */ 774 return BYTE(0,0,0,1,0,0,0,1); 775 776 case DESCR(0,0,0,0,0,0,0,0, 0,0,0, 1, 0,0,0): /* 64 */ 777 return BYTE(0,0,0,0,0,0,0,1); 778 779 default: return BYTE(0,0,0,0,0,0,0,0); 780 /* INVALID - any valid descr produces at least 
one 781 valid bit in tree[0..7]*/ 782 } 783 /* NOTREACHED*/ 784 tl_assert(0); 785 786 # undef DESCR 787 # undef BYTE 788 } 789 790 __attribute__((unused)) 791 static Bool is_sane_Descr ( UShort descr ) { 792 return descr_to_validbits(descr) != 0; 793 } 794 795 static void sprintf_Descr ( /*OUT*/HChar* dst, UShort descr ) { 796 VG_(sprintf)(dst, 797 "%d%d%d%d%d%d%d%d %d%d%d %d %d%d%d", 798 (Int)((descr & TREE_DESCR_8_7) ? 1 : 0), 799 (Int)((descr & TREE_DESCR_8_6) ? 1 : 0), 800 (Int)((descr & TREE_DESCR_8_5) ? 1 : 0), 801 (Int)((descr & TREE_DESCR_8_4) ? 1 : 0), 802 (Int)((descr & TREE_DESCR_8_3) ? 1 : 0), 803 (Int)((descr & TREE_DESCR_8_2) ? 1 : 0), 804 (Int)((descr & TREE_DESCR_8_1) ? 1 : 0), 805 (Int)((descr & TREE_DESCR_8_0) ? 1 : 0), 806 (Int)((descr & TREE_DESCR_16_3) ? 1 : 0), 807 (Int)((descr & TREE_DESCR_32_1) ? 1 : 0), 808 (Int)((descr & TREE_DESCR_16_2) ? 1 : 0), 809 (Int)((descr & TREE_DESCR_64) ? 1 : 0), 810 (Int)((descr & TREE_DESCR_16_1) ? 1 : 0), 811 (Int)((descr & TREE_DESCR_32_0) ? 1 : 0), 812 (Int)((descr & TREE_DESCR_16_0) ? 1 : 0) 813 ); 814 } 815 static void sprintf_Byte ( /*OUT*/HChar* dst, UChar byte ) { 816 VG_(sprintf)(dst, "%d%d%d%d%d%d%d%d", 817 (Int)((byte & 128) ? 1 : 0), 818 (Int)((byte & 64) ? 1 : 0), 819 (Int)((byte & 32) ? 1 : 0), 820 (Int)((byte & 16) ? 1 : 0), 821 (Int)((byte & 8) ? 1 : 0), 822 (Int)((byte & 4) ? 1 : 0), 823 (Int)((byte & 2) ? 1 : 0), 824 (Int)((byte & 1) ? 1 : 0) 825 ); 826 } 827 828 static Bool is_sane_Descr_and_Tree ( UShort descr, SVal* tree ) { 829 Word i; 830 UChar validbits = descr_to_validbits(descr); 831 HChar buf[128], buf2[128]; 832 if (validbits == 0) 833 goto bad; 834 for (i = 0; i < 8; i++) { 835 if (validbits & (1<<i)) { 836 if (tree[i] == SVal_INVALID) 837 goto bad; 838 } else { 839 if (tree[i] != SVal_INVALID) 840 goto bad; 841 } 842 } 843 return True; 844 bad: 845 sprintf_Descr( buf, descr ); 846 sprintf_Byte( buf2, validbits ); 847 VG_(printf)("%s","is_sane_Descr_and_Tree: bad tree {\n"); 848 VG_(printf)(" validbits 0x%02lx %s\n", (UWord)validbits, buf2); 849 VG_(printf)(" descr 0x%04lx %s\n", (UWord)descr, buf); 850 for (i = 0; i < 8; i++) 851 VG_(printf)(" [%ld] 0x%016llx\n", i, tree[i]); 852 VG_(printf)("%s","}\n"); 853 return 0; 854 } 855 856 static Bool is_sane_CacheLine ( CacheLine* cl ) 857 { 858 Word tno, cloff; 859 860 if (!cl) goto bad; 861 862 for (tno = 0, cloff = 0; tno < N_LINE_TREES; tno++, cloff += 8) { 863 UShort descr = cl->descrs[tno]; 864 SVal* tree = &cl->svals[cloff]; 865 if (!is_sane_Descr_and_Tree(descr, tree)) 866 goto bad; 867 } 868 tl_assert(cloff == N_LINE_ARANGE); 869 return True; 870 bad: 871 pp_CacheLine(cl); 872 return False; 873 } 874 875 static UShort normalise_tree ( /*MOD*/SVal* tree ) 876 { 877 UShort descr; 878 /* pre: incoming tree[0..7] does not have any invalid shvals, in 879 particular no zeroes. 
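      (For example: if all eight leaves hold the same SVal, the merge
      passes below fold 8+8 into 16s, 16+16 into 32s and 32+32 into a
      single 64, leaving tree[1..7] set to SVal_INVALID and returning a
      descriptor of just TREE_DESCR_64.)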
*/ 880 if (UNLIKELY(tree[7] == SVal_INVALID || tree[6] == SVal_INVALID 881 || tree[5] == SVal_INVALID || tree[4] == SVal_INVALID 882 || tree[3] == SVal_INVALID || tree[2] == SVal_INVALID 883 || tree[1] == SVal_INVALID || tree[0] == SVal_INVALID)) 884 tl_assert(0); 885 886 descr = TREE_DESCR_8_7 | TREE_DESCR_8_6 | TREE_DESCR_8_5 887 | TREE_DESCR_8_4 | TREE_DESCR_8_3 | TREE_DESCR_8_2 888 | TREE_DESCR_8_1 | TREE_DESCR_8_0; 889 /* build 16-bit layer */ 890 if (tree[1] == tree[0]) { 891 tree[1] = SVal_INVALID; 892 descr &= ~(TREE_DESCR_8_1 | TREE_DESCR_8_0); 893 descr |= TREE_DESCR_16_0; 894 } 895 if (tree[3] == tree[2]) { 896 tree[3] = SVal_INVALID; 897 descr &= ~(TREE_DESCR_8_3 | TREE_DESCR_8_2); 898 descr |= TREE_DESCR_16_1; 899 } 900 if (tree[5] == tree[4]) { 901 tree[5] = SVal_INVALID; 902 descr &= ~(TREE_DESCR_8_5 | TREE_DESCR_8_4); 903 descr |= TREE_DESCR_16_2; 904 } 905 if (tree[7] == tree[6]) { 906 tree[7] = SVal_INVALID; 907 descr &= ~(TREE_DESCR_8_7 | TREE_DESCR_8_6); 908 descr |= TREE_DESCR_16_3; 909 } 910 /* build 32-bit layer */ 911 if (tree[2] == tree[0] 912 && (descr & TREE_DESCR_16_1) && (descr & TREE_DESCR_16_0)) { 913 tree[2] = SVal_INVALID; /* [3,1] must already be SVal_INVALID */ 914 descr &= ~(TREE_DESCR_16_1 | TREE_DESCR_16_0); 915 descr |= TREE_DESCR_32_0; 916 } 917 if (tree[6] == tree[4] 918 && (descr & TREE_DESCR_16_3) && (descr & TREE_DESCR_16_2)) { 919 tree[6] = SVal_INVALID; /* [7,5] must already be SVal_INVALID */ 920 descr &= ~(TREE_DESCR_16_3 | TREE_DESCR_16_2); 921 descr |= TREE_DESCR_32_1; 922 } 923 /* build 64-bit layer */ 924 if (tree[4] == tree[0] 925 && (descr & TREE_DESCR_32_1) && (descr & TREE_DESCR_32_0)) { 926 tree[4] = SVal_INVALID; /* [7,6,5,3,2,1] must already be SVal_INVALID */ 927 descr &= ~(TREE_DESCR_32_1 | TREE_DESCR_32_0); 928 descr |= TREE_DESCR_64; 929 } 930 return descr; 931 } 932 933 /* This takes a cacheline where all the data is at the leaves 934 (w8[..]) and builds a correctly normalised tree. */ 935 static void normalise_CacheLine ( /*MOD*/CacheLine* cl ) 936 { 937 Word tno, cloff; 938 for (tno = 0, cloff = 0; tno < N_LINE_TREES; tno++, cloff += 8) { 939 SVal* tree = &cl->svals[cloff]; 940 cl->descrs[tno] = normalise_tree( tree ); 941 } 942 tl_assert(cloff == N_LINE_ARANGE); 943 if (CHECK_ZSM) 944 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 945 stats__cline_normalises++; 946 } 947 948 949 typedef struct { UChar count; SVal sval; } CountedSVal; 950 951 static 952 void sequentialise_CacheLine ( /*OUT*/CountedSVal* dst, 953 /*OUT*/Word* dstUsedP, 954 Word nDst, CacheLine* src ) 955 { 956 Word tno, cloff, dstUsed; 957 958 tl_assert(nDst == N_LINE_ARANGE); 959 dstUsed = 0; 960 961 for (tno = 0, cloff = 0; tno < N_LINE_TREES; tno++, cloff += 8) { 962 UShort descr = src->descrs[tno]; 963 SVal* tree = &src->svals[cloff]; 964 965 /* sequentialise the tree described by (descr,tree). 
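      (The result is a run-length encoding: each PUT emits one
      (count, sval) pair, the counts within one tree sum to 8, and over
      the whole line they sum to N_LINE_ARANGE.  A tree whose descriptor
      is just TREE_DESCR_64, for instance, contributes the single pair
      (8, tree[0]).)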
*/ 966 # define PUT(_n,_v) \ 967 do { dst[dstUsed ].count = (_n); \ 968 dst[dstUsed++].sval = (_v); \ 969 } while (0) 970 971 /* byte 0 */ 972 if (descr & TREE_DESCR_64) PUT(8, tree[0]); else 973 if (descr & TREE_DESCR_32_0) PUT(4, tree[0]); else 974 if (descr & TREE_DESCR_16_0) PUT(2, tree[0]); else 975 if (descr & TREE_DESCR_8_0) PUT(1, tree[0]); 976 /* byte 1 */ 977 if (descr & TREE_DESCR_8_1) PUT(1, tree[1]); 978 /* byte 2 */ 979 if (descr & TREE_DESCR_16_1) PUT(2, tree[2]); else 980 if (descr & TREE_DESCR_8_2) PUT(1, tree[2]); 981 /* byte 3 */ 982 if (descr & TREE_DESCR_8_3) PUT(1, tree[3]); 983 /* byte 4 */ 984 if (descr & TREE_DESCR_32_1) PUT(4, tree[4]); else 985 if (descr & TREE_DESCR_16_2) PUT(2, tree[4]); else 986 if (descr & TREE_DESCR_8_4) PUT(1, tree[4]); 987 /* byte 5 */ 988 if (descr & TREE_DESCR_8_5) PUT(1, tree[5]); 989 /* byte 6 */ 990 if (descr & TREE_DESCR_16_3) PUT(2, tree[6]); else 991 if (descr & TREE_DESCR_8_6) PUT(1, tree[6]); 992 /* byte 7 */ 993 if (descr & TREE_DESCR_8_7) PUT(1, tree[7]); 994 995 # undef PUT 996 /* END sequentialise the tree described by (descr,tree). */ 997 998 } 999 tl_assert(cloff == N_LINE_ARANGE); 1000 tl_assert(dstUsed <= nDst); 1001 1002 *dstUsedP = dstUsed; 1003 } 1004 1005 /* Write the cacheline 'wix' to backing store. Where it ends up 1006 is determined by its tag field. */ 1007 static __attribute__((noinline)) void cacheline_wback ( UWord wix ) 1008 { 1009 Word i, j, k, m; 1010 Addr tag; 1011 SecMap* sm; 1012 CacheLine* cl; 1013 LineZ* lineZ; 1014 LineF* lineF; 1015 Word zix, fix, csvalsUsed; 1016 CountedSVal csvals[N_LINE_ARANGE]; 1017 SVal sv; 1018 1019 if (0) 1020 VG_(printf)("scache wback line %d\n", (Int)wix); 1021 1022 tl_assert(wix >= 0 && wix < N_WAY_NENT); 1023 1024 tag = cache_shmem.tags0[wix]; 1025 cl = &cache_shmem.lyns0[wix]; 1026 1027 /* The cache line may have been invalidated; if so, ignore it. */ 1028 if (!is_valid_scache_tag(tag)) 1029 return; 1030 1031 /* Where are we going to put it? */ 1032 sm = NULL; 1033 lineZ = NULL; 1034 lineF = NULL; 1035 zix = fix = -1; 1036 1037 /* find the Z line to write in and rcdec it or the associated F 1038 line. */ 1039 find_Z_for_writing( &sm, &zix, tag ); 1040 1041 tl_assert(sm); 1042 tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES); 1043 lineZ = &sm->linesZ[zix]; 1044 1045 /* Generate the data to be stored */ 1046 if (CHECK_ZSM) 1047 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 1048 1049 csvalsUsed = -1; 1050 sequentialise_CacheLine( csvals, &csvalsUsed, 1051 N_LINE_ARANGE, cl ); 1052 tl_assert(csvalsUsed >= 1 && csvalsUsed <= N_LINE_ARANGE); 1053 if (0) VG_(printf)("%lu ", csvalsUsed); 1054 1055 lineZ->dict[0] = lineZ->dict[1] 1056 = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID; 1057 1058 /* i indexes actual shadow values, k is cursor in csvals */ 1059 i = 0; 1060 for (k = 0; k < csvalsUsed; k++) { 1061 1062 sv = csvals[k].sval; 1063 if (CHECK_ZSM) 1064 tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8); 1065 /* do we already have it? */ 1066 if (sv == lineZ->dict[0]) { j = 0; goto dict_ok; } 1067 if (sv == lineZ->dict[1]) { j = 1; goto dict_ok; } 1068 if (sv == lineZ->dict[2]) { j = 2; goto dict_ok; } 1069 if (sv == lineZ->dict[3]) { j = 3; goto dict_ok; } 1070 /* no. look for a free slot. 
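         (If all four dict slots are already occupied by other values the
         loop breaks out early, leaving i < N_LINE_ARANGE; the code after
         the loop spots that and falls back to writing the line out as a
         full LineF instead.)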
*/ 1071 if (CHECK_ZSM) 1072 tl_assert(sv != SVal_INVALID); 1073 if (lineZ->dict[0] 1074 == SVal_INVALID) { lineZ->dict[0] = sv; j = 0; goto dict_ok; } 1075 if (lineZ->dict[1] 1076 == SVal_INVALID) { lineZ->dict[1] = sv; j = 1; goto dict_ok; } 1077 if (lineZ->dict[2] 1078 == SVal_INVALID) { lineZ->dict[2] = sv; j = 2; goto dict_ok; } 1079 if (lineZ->dict[3] 1080 == SVal_INVALID) { lineZ->dict[3] = sv; j = 3; goto dict_ok; } 1081 break; /* we'll have to use the f rep */ 1082 dict_ok: 1083 m = csvals[k].count; 1084 if (m == 8) { 1085 write_twobit_array( lineZ->ix2s, i+0, j ); 1086 write_twobit_array( lineZ->ix2s, i+1, j ); 1087 write_twobit_array( lineZ->ix2s, i+2, j ); 1088 write_twobit_array( lineZ->ix2s, i+3, j ); 1089 write_twobit_array( lineZ->ix2s, i+4, j ); 1090 write_twobit_array( lineZ->ix2s, i+5, j ); 1091 write_twobit_array( lineZ->ix2s, i+6, j ); 1092 write_twobit_array( lineZ->ix2s, i+7, j ); 1093 i += 8; 1094 } 1095 else if (m == 4) { 1096 write_twobit_array( lineZ->ix2s, i+0, j ); 1097 write_twobit_array( lineZ->ix2s, i+1, j ); 1098 write_twobit_array( lineZ->ix2s, i+2, j ); 1099 write_twobit_array( lineZ->ix2s, i+3, j ); 1100 i += 4; 1101 } 1102 else if (m == 1) { 1103 write_twobit_array( lineZ->ix2s, i+0, j ); 1104 i += 1; 1105 } 1106 else if (m == 2) { 1107 write_twobit_array( lineZ->ix2s, i+0, j ); 1108 write_twobit_array( lineZ->ix2s, i+1, j ); 1109 i += 2; 1110 } 1111 else { 1112 tl_assert(0); /* 8 4 2 or 1 are the only legitimate values for m */ 1113 } 1114 1115 } 1116 1117 if (LIKELY(i == N_LINE_ARANGE)) { 1118 /* Construction of the compressed representation was 1119 successful. */ 1120 rcinc_LineZ(lineZ); 1121 stats__cache_Z_wbacks++; 1122 } else { 1123 /* Cannot use the compressed(z) representation. Use the full(f) 1124 rep instead. */ 1125 tl_assert(i >= 0 && i < N_LINE_ARANGE); 1126 alloc_F_for_writing( sm, &fix ); 1127 tl_assert(sm->linesF); 1128 tl_assert(sm->linesF_size > 0); 1129 tl_assert(fix >= 0 && fix < (Word)sm->linesF_size); 1130 lineF = &sm->linesF[fix]; 1131 tl_assert(!lineF->inUse); 1132 lineZ->dict[0] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID; 1133 lineZ->dict[1] = (SVal)fix; 1134 lineF->inUse = True; 1135 i = 0; 1136 for (k = 0; k < csvalsUsed; k++) { 1137 if (CHECK_ZSM) 1138 tl_assert(csvals[k].count >= 1 && csvals[k].count <= 8); 1139 sv = csvals[k].sval; 1140 if (CHECK_ZSM) 1141 tl_assert(sv != SVal_INVALID); 1142 for (m = csvals[k].count; m > 0; m--) { 1143 lineF->w64s[i] = sv; 1144 i++; 1145 } 1146 } 1147 tl_assert(i == N_LINE_ARANGE); 1148 rcinc_LineF(lineF); 1149 stats__cache_F_wbacks++; 1150 } 1151 } 1152 1153 /* Fetch the cacheline 'wix' from the backing store. The tag 1154 associated with 'wix' is assumed to have already been filled in; 1155 hence that is used to determine where in the backing store to read 1156 from. */ 1157 static __attribute__((noinline)) void cacheline_fetch ( UWord wix ) 1158 { 1159 Word i; 1160 Addr tag; 1161 CacheLine* cl; 1162 LineZ* lineZ; 1163 LineF* lineF; 1164 1165 if (0) 1166 VG_(printf)("scache fetch line %d\n", (Int)wix); 1167 1168 tl_assert(wix >= 0 && wix < N_WAY_NENT); 1169 1170 tag = cache_shmem.tags0[wix]; 1171 cl = &cache_shmem.lyns0[wix]; 1172 1173 /* reject nonsense requests */ 1174 tl_assert(is_valid_scache_tag(tag)); 1175 1176 lineZ = NULL; 1177 lineF = NULL; 1178 find_ZF_for_reading( &lineZ, &lineF, tag ); 1179 tl_assert( (lineZ && !lineF) || (!lineZ && lineF) ); 1180 1181 /* expand the data into the bottom layer of the tree, then get 1182 cacheline_normalise to build the descriptor array. 
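      (This is the inverse of cacheline_wback: exactly one of lineZ/lineF
      is non-NULL here, as asserted above.  The F path copies the 64
      SVals verbatim; the Z path re-expands each one by looking its
      two-bit index up in lineZ->dict[].  Either way, normalise_CacheLine
      then rebuilds the per-tree descriptors.)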
*/ 1183 if (lineF) { 1184 tl_assert(lineF->inUse); 1185 for (i = 0; i < N_LINE_ARANGE; i++) { 1186 cl->svals[i] = lineF->w64s[i]; 1187 } 1188 stats__cache_F_fetches++; 1189 } else { 1190 for (i = 0; i < N_LINE_ARANGE; i++) { 1191 SVal sv; 1192 UWord ix = read_twobit_array( lineZ->ix2s, i ); 1193 /* correct, but expensive: tl_assert(ix >= 0 && ix <= 3); */ 1194 sv = lineZ->dict[ix]; 1195 tl_assert(sv != SVal_INVALID); 1196 cl->svals[i] = sv; 1197 } 1198 stats__cache_Z_fetches++; 1199 } 1200 normalise_CacheLine( cl ); 1201 } 1202 1203 static void shmem__invalidate_scache ( void ) { 1204 Word wix; 1205 if (0) VG_(printf)("%s","scache inval\n"); 1206 tl_assert(!is_valid_scache_tag(1)); 1207 for (wix = 0; wix < N_WAY_NENT; wix++) { 1208 cache_shmem.tags0[wix] = 1/*INVALID*/; 1209 } 1210 stats__cache_invals++; 1211 } 1212 1213 static void shmem__flush_and_invalidate_scache ( void ) { 1214 Word wix; 1215 Addr tag; 1216 if (0) VG_(printf)("%s","scache flush and invalidate\n"); 1217 tl_assert(!is_valid_scache_tag(1)); 1218 for (wix = 0; wix < N_WAY_NENT; wix++) { 1219 tag = cache_shmem.tags0[wix]; 1220 if (tag == 1/*INVALID*/) { 1221 /* already invalid; nothing to do */ 1222 } else { 1223 tl_assert(is_valid_scache_tag(tag)); 1224 cacheline_wback( wix ); 1225 } 1226 cache_shmem.tags0[wix] = 1/*INVALID*/; 1227 } 1228 stats__cache_flushes++; 1229 stats__cache_invals++; 1230 } 1231 1232 1233 static inline Bool aligned16 ( Addr a ) { 1234 return 0 == (a & 1); 1235 } 1236 static inline Bool aligned32 ( Addr a ) { 1237 return 0 == (a & 3); 1238 } 1239 static inline Bool aligned64 ( Addr a ) { 1240 return 0 == (a & 7); 1241 } 1242 static inline UWord get_cacheline_offset ( Addr a ) { 1243 return (UWord)(a & (N_LINE_ARANGE - 1)); 1244 } 1245 static inline Addr cacheline_ROUNDUP ( Addr a ) { 1246 return ROUNDUP(a, N_LINE_ARANGE); 1247 } 1248 static inline Addr cacheline_ROUNDDN ( Addr a ) { 1249 return ROUNDDN(a, N_LINE_ARANGE); 1250 } 1251 static inline UWord get_treeno ( Addr a ) { 1252 return get_cacheline_offset(a) >> 3; 1253 } 1254 static inline UWord get_tree_offset ( Addr a ) { 1255 return a & 7; 1256 } 1257 1258 static __attribute__((noinline)) 1259 CacheLine* get_cacheline_MISS ( Addr a ); /* fwds */ 1260 static inline CacheLine* get_cacheline ( Addr a ) 1261 { 1262 /* tag is 'a' with the in-line offset masked out, 1263 eg a[31]..a[4] 0000 */ 1264 Addr tag = a & ~(N_LINE_ARANGE - 1); 1265 UWord wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1); 1266 stats__cache_totrefs++; 1267 if (LIKELY(tag == cache_shmem.tags0[wix])) { 1268 return &cache_shmem.lyns0[wix]; 1269 } else { 1270 return get_cacheline_MISS( a ); 1271 } 1272 } 1273 1274 static __attribute__((noinline)) 1275 CacheLine* get_cacheline_MISS ( Addr a ) 1276 { 1277 /* tag is 'a' with the in-line offset masked out, 1278 eg a[31]..a[4] 0000 */ 1279 1280 CacheLine* cl; 1281 Addr* tag_old_p; 1282 Addr tag = a & ~(N_LINE_ARANGE - 1); 1283 UWord wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1); 1284 1285 tl_assert(tag != cache_shmem.tags0[wix]); 1286 1287 /* Dump the old line into the backing store. 
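      (This writeback-on-eviction is what makes the shadow cache a
      writeback cache, as required by the note at the head of this
      section; slots still carrying the bogus tag value 1, as set by
      shmem__invalidate_scache, are skipped via the is_valid_scache_tag
      test below.)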
*/ 1288 stats__cache_totmisses++; 1289 1290 cl = &cache_shmem.lyns0[wix]; 1291 tag_old_p = &cache_shmem.tags0[wix]; 1292 1293 if (is_valid_scache_tag( *tag_old_p )) { 1294 /* EXPENSIVE and REDUNDANT: callee does it */ 1295 if (CHECK_ZSM) 1296 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 1297 cacheline_wback( wix ); 1298 } 1299 /* and reload the new one */ 1300 *tag_old_p = tag; 1301 cacheline_fetch( wix ); 1302 if (CHECK_ZSM) 1303 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 1304 return cl; 1305 } 1306 1307 static UShort pulldown_to_32 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) { 1308 stats__cline_64to32pulldown++; 1309 switch (toff) { 1310 case 0: case 4: 1311 tl_assert(descr & TREE_DESCR_64); 1312 tree[4] = tree[0]; 1313 descr &= ~TREE_DESCR_64; 1314 descr |= (TREE_DESCR_32_1 | TREE_DESCR_32_0); 1315 break; 1316 default: 1317 tl_assert(0); 1318 } 1319 return descr; 1320 } 1321 1322 static UShort pulldown_to_16 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) { 1323 stats__cline_32to16pulldown++; 1324 switch (toff) { 1325 case 0: case 2: 1326 if (!(descr & TREE_DESCR_32_0)) { 1327 descr = pulldown_to_32(tree, 0, descr); 1328 } 1329 tl_assert(descr & TREE_DESCR_32_0); 1330 tree[2] = tree[0]; 1331 descr &= ~TREE_DESCR_32_0; 1332 descr |= (TREE_DESCR_16_1 | TREE_DESCR_16_0); 1333 break; 1334 case 4: case 6: 1335 if (!(descr & TREE_DESCR_32_1)) { 1336 descr = pulldown_to_32(tree, 4, descr); 1337 } 1338 tl_assert(descr & TREE_DESCR_32_1); 1339 tree[6] = tree[4]; 1340 descr &= ~TREE_DESCR_32_1; 1341 descr |= (TREE_DESCR_16_3 | TREE_DESCR_16_2); 1342 break; 1343 default: 1344 tl_assert(0); 1345 } 1346 return descr; 1347 } 1348 1349 static UShort pulldown_to_8 ( /*MOD*/SVal* tree, UWord toff, UShort descr ) { 1350 stats__cline_16to8pulldown++; 1351 switch (toff) { 1352 case 0: case 1: 1353 if (!(descr & TREE_DESCR_16_0)) { 1354 descr = pulldown_to_16(tree, 0, descr); 1355 } 1356 tl_assert(descr & TREE_DESCR_16_0); 1357 tree[1] = tree[0]; 1358 descr &= ~TREE_DESCR_16_0; 1359 descr |= (TREE_DESCR_8_1 | TREE_DESCR_8_0); 1360 break; 1361 case 2: case 3: 1362 if (!(descr & TREE_DESCR_16_1)) { 1363 descr = pulldown_to_16(tree, 2, descr); 1364 } 1365 tl_assert(descr & TREE_DESCR_16_1); 1366 tree[3] = tree[2]; 1367 descr &= ~TREE_DESCR_16_1; 1368 descr |= (TREE_DESCR_8_3 | TREE_DESCR_8_2); 1369 break; 1370 case 4: case 5: 1371 if (!(descr & TREE_DESCR_16_2)) { 1372 descr = pulldown_to_16(tree, 4, descr); 1373 } 1374 tl_assert(descr & TREE_DESCR_16_2); 1375 tree[5] = tree[4]; 1376 descr &= ~TREE_DESCR_16_2; 1377 descr |= (TREE_DESCR_8_5 | TREE_DESCR_8_4); 1378 break; 1379 case 6: case 7: 1380 if (!(descr & TREE_DESCR_16_3)) { 1381 descr = pulldown_to_16(tree, 6, descr); 1382 } 1383 tl_assert(descr & TREE_DESCR_16_3); 1384 tree[7] = tree[6]; 1385 descr &= ~TREE_DESCR_16_3; 1386 descr |= (TREE_DESCR_8_7 | TREE_DESCR_8_6); 1387 break; 1388 default: 1389 tl_assert(0); 1390 } 1391 return descr; 1392 } 1393 1394 1395 static UShort pullup_descr_to_16 ( UShort descr, UWord toff ) { 1396 UShort mask; 1397 switch (toff) { 1398 case 0: 1399 mask = TREE_DESCR_8_1 | TREE_DESCR_8_0; 1400 tl_assert( (descr & mask) == mask ); 1401 descr &= ~mask; 1402 descr |= TREE_DESCR_16_0; 1403 break; 1404 case 2: 1405 mask = TREE_DESCR_8_3 | TREE_DESCR_8_2; 1406 tl_assert( (descr & mask) == mask ); 1407 descr &= ~mask; 1408 descr |= TREE_DESCR_16_1; 1409 break; 1410 case 4: 1411 mask = TREE_DESCR_8_5 | TREE_DESCR_8_4; 1412 tl_assert( (descr & mask) == mask ); 1413 descr &= ~mask; 1414 descr |= TREE_DESCR_16_2; 1415 
break; 1416 case 6: 1417 mask = TREE_DESCR_8_7 | TREE_DESCR_8_6; 1418 tl_assert( (descr & mask) == mask ); 1419 descr &= ~mask; 1420 descr |= TREE_DESCR_16_3; 1421 break; 1422 default: 1423 tl_assert(0); 1424 } 1425 return descr; 1426 } 1427 1428 static UShort pullup_descr_to_32 ( UShort descr, UWord toff ) { 1429 UShort mask; 1430 switch (toff) { 1431 case 0: 1432 if (!(descr & TREE_DESCR_16_0)) 1433 descr = pullup_descr_to_16(descr, 0); 1434 if (!(descr & TREE_DESCR_16_1)) 1435 descr = pullup_descr_to_16(descr, 2); 1436 mask = TREE_DESCR_16_1 | TREE_DESCR_16_0; 1437 tl_assert( (descr & mask) == mask ); 1438 descr &= ~mask; 1439 descr |= TREE_DESCR_32_0; 1440 break; 1441 case 4: 1442 if (!(descr & TREE_DESCR_16_2)) 1443 descr = pullup_descr_to_16(descr, 4); 1444 if (!(descr & TREE_DESCR_16_3)) 1445 descr = pullup_descr_to_16(descr, 6); 1446 mask = TREE_DESCR_16_3 | TREE_DESCR_16_2; 1447 tl_assert( (descr & mask) == mask ); 1448 descr &= ~mask; 1449 descr |= TREE_DESCR_32_1; 1450 break; 1451 default: 1452 tl_assert(0); 1453 } 1454 return descr; 1455 } 1456 1457 static Bool valid_value_is_above_me_32 ( UShort descr, UWord toff ) { 1458 switch (toff) { 1459 case 0: case 4: 1460 return 0 != (descr & TREE_DESCR_64); 1461 default: 1462 tl_assert(0); 1463 } 1464 } 1465 1466 static Bool valid_value_is_below_me_16 ( UShort descr, UWord toff ) { 1467 switch (toff) { 1468 case 0: 1469 return 0 != (descr & (TREE_DESCR_8_1 | TREE_DESCR_8_0)); 1470 case 2: 1471 return 0 != (descr & (TREE_DESCR_8_3 | TREE_DESCR_8_2)); 1472 case 4: 1473 return 0 != (descr & (TREE_DESCR_8_5 | TREE_DESCR_8_4)); 1474 case 6: 1475 return 0 != (descr & (TREE_DESCR_8_7 | TREE_DESCR_8_6)); 1476 default: 1477 tl_assert(0); 1478 } 1479 } 1480 1481 /* ------------ Cache management ------------ */ 1482 1483 static void zsm_flush_cache ( void ) 1484 { 1485 shmem__flush_and_invalidate_scache(); 1486 } 1487 1488 1489 static void zsm_init ( void(*p_rcinc)(SVal), void(*p_rcdec)(SVal) ) 1490 { 1491 tl_assert( sizeof(UWord) == sizeof(Addr) ); 1492 1493 rcinc = p_rcinc; 1494 rcdec = p_rcdec; 1495 1496 tl_assert(map_shmem == NULL); 1497 map_shmem = VG_(newFM)( HG_(zalloc), "libhb.zsm_init.1 (map_shmem)", 1498 HG_(free), 1499 NULL/*unboxed UWord cmp*/); 1500 tl_assert(map_shmem != NULL); 1501 shmem__invalidate_scache(); 1502 1503 /* a SecMap must contain an integral number of CacheLines */ 1504 tl_assert(0 == (N_SECMAP_ARANGE % N_LINE_ARANGE)); 1505 /* also ... a CacheLine holds an integral number of trees */ 1506 tl_assert(0 == (N_LINE_ARANGE % 8)); 1507 } 1508 1509 ///////////////////////////////////////////////////////////////// 1510 ///////////////////////////////////////////////////////////////// 1511 // // 1512 // SECTION END compressed shadow memory // 1513 // // 1514 ///////////////////////////////////////////////////////////////// 1515 ///////////////////////////////////////////////////////////////// 1516 1517 1518 1519 ///////////////////////////////////////////////////////////////// 1520 ///////////////////////////////////////////////////////////////// 1521 // // 1522 // SECTION BEGIN vts primitives // 1523 // // 1524 ///////////////////////////////////////////////////////////////// 1525 ///////////////////////////////////////////////////////////////// 1526 1527 #ifndef __HB_VTS_H 1528 #define __HB_VTS_H 1529 1530 /* VtsIDs can't exceed 30 bits, since they have to be packed into the 1531 lowest 30 bits of an SVal. 
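
   A rough usage sketch for the primitives declared below (the names
   thrA, thrB, a, b are placeholders, not identifiers used in this
   library):

      VTS* a  = VTS__singleton( thrA, 1 );   a's clock is {thrA:1}
      VTS* b  = VTS__singleton( thrB, 1 );   b's clock is {thrB:1}
      VTS* a2 = VTS__tick( thrA, a );        {thrA:2}; 'a' is unchanged
      VTS* j  = VTS__join( a2, b );          {thrA:2, thrB:1}
      ... VTS__cmpLEQ( a2, j ) == NULL ...   NULL means a2 <= j holds

   VTS__singleton, VTS__tick and VTS__join each allocate a fresh VTS, so
   intermediates must eventually be released with VTS__delete.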
*/ 1532 typedef UInt VtsID; 1533 #define VtsID_INVALID 0xFFFFFFFF 1534 1535 /* A VTS contains .ts, its vector clock, and also .id, a field to hold 1536 a backlink for the caller's convenience. Since we have no idea 1537 what to set that to in the library, it always gets set to 1538 VtsID_INVALID. */ 1539 typedef 1540 struct { 1541 VtsID id; 1542 XArray* ts; /* XArray* ScalarTS(abstract) */ 1543 } 1544 VTS; 1545 1546 1547 /* Create a new, empty VTS. */ 1548 static VTS* VTS__new ( void ); 1549 1550 /* Delete this VTS in its entirety. */ 1551 static void VTS__delete ( VTS* vts ); 1552 1553 /* Create a new singleton VTS. */ 1554 static VTS* VTS__singleton ( Thr* thr, ULong tym ); 1555 1556 /* Return a new VTS in which vts[me]++, so to speak. 'vts' itself is 1557 not modified. */ 1558 static VTS* VTS__tick ( Thr* me, VTS* vts ); 1559 1560 /* Return a new VTS constructed as the join (max) of the 2 args. 1561 Neither arg is modified. */ 1562 static VTS* VTS__join ( VTS* a, VTS* b ); 1563 1564 /* Compute the partial ordering relation of the two args. Although we 1565 could be completely general and return an enumeration value (EQ, 1566 LT, GT, UN), in fact we only need LEQ, and so we may as well 1567 hardwire that fact. 1568 1569 Returns NULL iff LEQ(A,B), or non-NULL if not. In the latter case, 1570 the returned Thr* indicates the discovered point for which they are 1571 not. There may be more than one such point, but we only care about 1572 seeing one of them, not all of them. This rather strange 1573 convention is used because sometimes we want to know the actual 1574 index at which they first differ. */ 1575 static Thr* VTS__cmpLEQ ( VTS* a, VTS* b ); 1576 1577 /* Compute an arbitrary structural (total) ordering on the two args, 1578 based on their VCs, so they can be looked up in a table, tree, etc. 1579 Returns -1, 0 or 1. */ 1580 static Word VTS__cmp_structural ( VTS* a, VTS* b ); 1581 1582 /* Debugging only. Display the given VTS in the buffer. */ 1583 static void VTS__show ( HChar* buf, Int nBuf, VTS* vts ); 1584 1585 /* Debugging only. Return vts[index], so to speak. */ 1586 static ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx ); 1587 1588 #endif /* ! __HB_VTS_H */ 1589 1590 1591 /*--------------- to do with Vector Timestamps ---------------*/ 1592 1593 /* Scalar Timestamp */ 1594 typedef 1595 struct { 1596 Thr* thr; 1597 ULong tym; 1598 } 1599 ScalarTS; 1600 1601 1602 static Bool is_sane_VTS ( VTS* vts ) 1603 { 1604 UWord i, n; 1605 ScalarTS *st1, *st2; 1606 if (!vts) return False; 1607 if (!vts->ts) return False; 1608 n = VG_(sizeXA)( vts->ts ); 1609 if (n >= 2) { 1610 for (i = 0; i < n-1; i++) { 1611 st1 = VG_(indexXA)( vts->ts, i ); 1612 st2 = VG_(indexXA)( vts->ts, i+1 ); 1613 if (st1->thr >= st2->thr) 1614 return False; 1615 if (st1->tym == 0 || st2->tym == 0) 1616 return False; 1617 } 1618 } 1619 return True; 1620 } 1621 1622 1623 /* Create a new, empty VTS. 1624 */ 1625 VTS* VTS__new ( void ) 1626 { 1627 VTS* vts; 1628 vts = HG_(zalloc)( "libhb.VTS__new.1", sizeof(VTS) ); 1629 tl_assert(vts); 1630 vts->id = VtsID_INVALID; 1631 vts->ts = VG_(newXA)( HG_(zalloc), "libhb.VTS__new.2", 1632 HG_(free), sizeof(ScalarTS) ); 1633 tl_assert(vts->ts); 1634 return vts; 1635 } 1636 1637 1638 /* Delete this VTS in its entirety. 1639 */ 1640 void VTS__delete ( VTS* vts ) 1641 { 1642 tl_assert(vts); 1643 tl_assert(vts->ts); 1644 VG_(deleteXA)( vts->ts ); 1645 HG_(free)(vts); 1646 } 1647 1648 1649 /* Create a new singleton VTS. 
1650 */ 1651 VTS* VTS__singleton ( Thr* thr, ULong tym ) { 1652 ScalarTS st; 1653 VTS* vts; 1654 tl_assert(thr); 1655 tl_assert(tym >= 1); 1656 vts = VTS__new(); 1657 st.thr = thr; 1658 st.tym = tym; 1659 VG_(addToXA)( vts->ts, &st ); 1660 return vts; 1661 } 1662 1663 1664 /* Return a new VTS in which vts[me]++, so to speak. 'vts' itself is 1665 not modified. 1666 */ 1667 VTS* VTS__tick ( Thr* me, VTS* vts ) 1668 { 1669 ScalarTS* here = NULL; 1670 ScalarTS tmp; 1671 VTS* res; 1672 Word i, n; 1673 1674 stats__vts__tick++; 1675 1676 tl_assert(me); 1677 tl_assert(is_sane_VTS(vts)); 1678 //if (0) VG_(printf)("tick vts thrno %ld szin %d\n", 1679 // (Word)me->errmsg_index, (Int)VG_(sizeXA)(vts) ); 1680 res = VTS__new(); 1681 n = VG_(sizeXA)( vts->ts ); 1682 1683 /* main loop doesn't handle zero-entry case correctly, so 1684 special-case it. */ 1685 if (n == 0) { 1686 tmp.thr = me; 1687 tmp.tym = 1; 1688 VG_(addToXA)( res->ts, &tmp ); 1689 tl_assert(is_sane_VTS(res)); 1690 return res; 1691 } 1692 1693 for (i = 0; i < n; i++) { 1694 here = VG_(indexXA)( vts->ts, i ); 1695 if (me < here->thr) { 1696 /* We just went past 'me', without seeing it. */ 1697 tmp.thr = me; 1698 tmp.tym = 1; 1699 VG_(addToXA)( res->ts, &tmp ); 1700 tmp = *here; 1701 VG_(addToXA)( res->ts, &tmp ); 1702 i++; 1703 break; 1704 } 1705 else if (me == here->thr) { 1706 tmp = *here; 1707 tmp.tym++; 1708 VG_(addToXA)( res->ts, &tmp ); 1709 i++; 1710 break; 1711 } 1712 else /* me > here->thr */ { 1713 tmp = *here; 1714 VG_(addToXA)( res->ts, &tmp ); 1715 } 1716 } 1717 tl_assert(i >= 0 && i <= n); 1718 if (i == n && here && here->thr < me) { 1719 tmp.thr = me; 1720 tmp.tym = 1; 1721 VG_(addToXA)( res->ts, &tmp ); 1722 } else { 1723 for (/*keepgoing*/; i < n; i++) { 1724 here = VG_(indexXA)( vts->ts, i ); 1725 tmp = *here; 1726 VG_(addToXA)( res->ts, &tmp ); 1727 } 1728 } 1729 tl_assert(is_sane_VTS(res)); 1730 //if (0) VG_(printf)("tick vts thrno %ld szou %d\n", 1731 // (Word)me->errmsg_index, (Int)VG_(sizeXA)(res) ); 1732 return res; 1733 } 1734 1735 1736 /* Return a new VTS constructed as the join (max) of the 2 args. 1737 Neither arg is modified. 1738 */ 1739 VTS* VTS__join ( VTS* a, VTS* b ) 1740 { 1741 Word ia, ib, useda, usedb; 1742 ULong tyma, tymb, tymMax; 1743 Thr* thr; 1744 VTS* res; 1745 1746 stats__vts__join++; 1747 1748 tl_assert(a && a->ts); 1749 tl_assert(b && b->ts); 1750 useda = VG_(sizeXA)( a->ts ); 1751 usedb = VG_(sizeXA)( b->ts ); 1752 1753 res = VTS__new(); 1754 ia = ib = 0; 1755 1756 while (1) { 1757 1758 /* This logic is to enumerate triples (thr, tyma, tymb) drawn 1759 from a and b in order, where thr is the next Thr* 1760 occurring in either a or b, and tyma/b are the relevant 1761 scalar timestamps, taking into account implicit zeroes. 
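         For example, joining {T1:3} with {T1:1, T2:5} visits T1 with
         (tyma=3, tymb=1) and then T2 with (tyma=0, tymb=5); taking the
         per-thread maxima gives the result {T1:3, T2:5}.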
*/ 1762 tl_assert(ia >= 0 && ia <= useda); 1763 tl_assert(ib >= 0 && ib <= usedb); 1764 1765 if (ia == useda && ib == usedb) { 1766 /* both empty - done */ 1767 break; 1768 1769 } else if (ia == useda && ib != usedb) { 1770 /* a empty, use up b */ 1771 ScalarTS* tmpb = VG_(indexXA)( b->ts, ib ); 1772 thr = tmpb->thr; 1773 tyma = 0; 1774 tymb = tmpb->tym; 1775 ib++; 1776 1777 } else if (ia != useda && ib == usedb) { 1778 /* b empty, use up a */ 1779 ScalarTS* tmpa = VG_(indexXA)( a->ts, ia ); 1780 thr = tmpa->thr; 1781 tyma = tmpa->tym; 1782 tymb = 0; 1783 ia++; 1784 1785 } else { 1786 /* both not empty; extract lowest-Thr*'d triple */ 1787 ScalarTS* tmpa = VG_(indexXA)( a->ts, ia ); 1788 ScalarTS* tmpb = VG_(indexXA)( b->ts, ib ); 1789 if (tmpa->thr < tmpb->thr) { 1790 /* a has the lowest unconsidered Thr* */ 1791 thr = tmpa->thr; 1792 tyma = tmpa->tym; 1793 tymb = 0; 1794 ia++; 1795 } else if (tmpa->thr > tmpb->thr) { 1796 /* b has the lowest unconsidered Thr* */ 1797 thr = tmpb->thr; 1798 tyma = 0; 1799 tymb = tmpb->tym; 1800 ib++; 1801 } else { 1802 /* they both next mention the same Thr* */ 1803 tl_assert(tmpa->thr == tmpb->thr); 1804 thr = tmpa->thr; /* == tmpb->thr */ 1805 tyma = tmpa->tym; 1806 tymb = tmpb->tym; 1807 ia++; 1808 ib++; 1809 } 1810 } 1811 1812 /* having laboriously determined (thr, tyma, tymb), do something 1813 useful with it. */ 1814 tymMax = tyma > tymb ? tyma : tymb; 1815 if (tymMax > 0) { 1816 ScalarTS st; 1817 st.thr = thr; 1818 st.tym = tymMax; 1819 VG_(addToXA)( res->ts, &st ); 1820 } 1821 1822 } 1823 1824 tl_assert(is_sane_VTS( res )); 1825 1826 return res; 1827 } 1828 1829 1830 /* Determine if 'a' <= 'b', in the partial ordering. Returns NULL if 1831 they are, or the first Thr* for which they are not. This rather 1832 strange convention is used because sometimes we want to know the 1833 actual index at which they first differ. */ 1834 static Thr* VTS__cmpLEQ ( VTS* a, VTS* b ) 1835 { 1836 Word ia, ib, useda, usedb; 1837 ULong tyma, tymb; 1838 1839 stats__vts__cmpLEQ++; 1840 1841 tl_assert(a && a->ts); 1842 tl_assert(b && b->ts); 1843 useda = VG_(sizeXA)( a->ts ); 1844 usedb = VG_(sizeXA)( b->ts ); 1845 1846 ia = ib = 0; 1847 1848 while (1) { 1849 1850 /* This logic is to enumerate doubles (tyma, tymb) drawn 1851 from a and b in order, and tyma/b are the relevant 1852 scalar timestamps, taking into account implicit zeroes. 
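         For example (illustrative): comparing a = [T1:3] against
         b = [T1:2, T2:7], the first pair examined is (3,2) for T1;
         since 3 > 2 the function returns T1 immediately, meaning
         a is not <= b.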
*/ 1853 Thr* thr; 1854 1855 tl_assert(ia >= 0 && ia <= useda); 1856 tl_assert(ib >= 0 && ib <= usedb); 1857 1858 if (ia == useda && ib == usedb) { 1859 /* both empty - done */ 1860 break; 1861 1862 } else if (ia == useda && ib != usedb) { 1863 /* a empty, use up b */ 1864 ScalarTS* tmpb = VG_(indexXA)( b->ts, ib ); 1865 tyma = 0; 1866 tymb = tmpb->tym; 1867 thr = tmpb->thr; 1868 ib++; 1869 1870 } else if (ia != useda && ib == usedb) { 1871 /* b empty, use up a */ 1872 ScalarTS* tmpa = VG_(indexXA)( a->ts, ia ); 1873 tyma = tmpa->tym; 1874 thr = tmpa->thr; 1875 tymb = 0; 1876 ia++; 1877 1878 } else { 1879 /* both not empty; extract lowest-Thr*'d triple */ 1880 ScalarTS* tmpa = VG_(indexXA)( a->ts, ia ); 1881 ScalarTS* tmpb = VG_(indexXA)( b->ts, ib ); 1882 if (tmpa->thr < tmpb->thr) { 1883 /* a has the lowest unconsidered Thr* */ 1884 tyma = tmpa->tym; 1885 thr = tmpa->thr; 1886 tymb = 0; 1887 ia++; 1888 } 1889 else 1890 if (tmpa->thr > tmpb->thr) { 1891 /* b has the lowest unconsidered Thr* */ 1892 tyma = 0; 1893 tymb = tmpb->tym; 1894 thr = tmpb->thr; 1895 ib++; 1896 } else { 1897 /* they both next mention the same Thr* */ 1898 tl_assert(tmpa->thr == tmpb->thr); 1899 tyma = tmpa->tym; 1900 thr = tmpa->thr; 1901 tymb = tmpb->tym; 1902 ia++; 1903 ib++; 1904 } 1905 } 1906 1907 /* having laboriously determined (tyma, tymb), do something 1908 useful with it. */ 1909 if (tyma > tymb) { 1910 /* not LEQ at this index. Quit, since the answer is 1911 determined already. */ 1912 tl_assert(thr); 1913 return thr; 1914 } 1915 } 1916 1917 return NULL; /* all points are LEQ */ 1918 } 1919 1920 1921 /* Compute an arbitrary structural (total) ordering on the two args, 1922 based on their VCs, so they can be looked up in a table, tree, etc. 1923 Returns -1, 0 or 1. (really just 'deriving Ord' :-) This can be 1924 performance critical so there is some effort expended to make it sa 1925 fast as possible. 1926 */ 1927 Word VTS__cmp_structural ( VTS* a, VTS* b ) 1928 { 1929 /* We just need to generate an arbitrary total ordering based on 1930 a->ts and b->ts. Preferably do it in a way which comes across likely 1931 differences relatively quickly. */ 1932 Word i; 1933 Word useda = 0, usedb = 0; 1934 ScalarTS *ctsa = NULL, *ctsb = NULL; 1935 1936 stats__vts__cmp_structural++; 1937 1938 tl_assert(a); 1939 tl_assert(b); 1940 1941 VG_(getContentsXA_UNSAFE)( a->ts, (void**)&ctsa, &useda ); 1942 VG_(getContentsXA_UNSAFE)( b->ts, (void**)&ctsb, &usedb ); 1943 1944 if (LIKELY(useda == usedb)) { 1945 ScalarTS *tmpa = NULL, *tmpb = NULL; 1946 stats__vts__cmp_structural_slow++; 1947 /* Same length vectors. Find the first difference, if any, as 1948 fast as possible. */ 1949 for (i = 0; i < useda; i++) { 1950 tmpa = &ctsa[i]; 1951 tmpb = &ctsb[i]; 1952 if (LIKELY(tmpa->tym == tmpb->tym && tmpa->thr == tmpb->thr)) 1953 continue; 1954 else 1955 break; 1956 } 1957 if (UNLIKELY(i == useda)) { 1958 /* They're identical. */ 1959 return 0; 1960 } else { 1961 tl_assert(i >= 0 && i < useda); 1962 if (tmpa->tym < tmpb->tym) return -1; 1963 if (tmpa->tym > tmpb->tym) return 1; 1964 if (tmpa->thr < tmpb->thr) return -1; 1965 if (tmpa->thr > tmpb->thr) return 1; 1966 /* we just established them as non-identical, hence: */ 1967 } 1968 /*NOTREACHED*/ 1969 tl_assert(0); 1970 } 1971 1972 if (useda < usedb) return -1; 1973 if (useda > usedb) return 1; 1974 /*NOTREACHED*/ 1975 tl_assert(0); 1976 } 1977 1978 1979 /* Debugging only. Display the given VTS in the buffer. 
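   The output has the form "[thr:tym thr:tym ... thr:tym]", with each
   thr printed as a pointer; if the buffer runs short, the tail is
   replaced by " ...]".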
1980 */ 1981 void VTS__show ( HChar* buf, Int nBuf, VTS* vts ) { 1982 ScalarTS* st; 1983 HChar unit[64]; 1984 Word i, n; 1985 Int avail = nBuf; 1986 tl_assert(vts && vts->ts); 1987 tl_assert(nBuf > 16); 1988 buf[0] = '['; 1989 buf[1] = 0; 1990 n = VG_(sizeXA)( vts->ts ); 1991 for (i = 0; i < n; i++) { 1992 tl_assert(avail >= 40); 1993 st = VG_(indexXA)( vts->ts, i ); 1994 VG_(memset)(unit, 0, sizeof(unit)); 1995 VG_(sprintf)(unit, i < n-1 ? "%p:%lld " : "%p:%lld", 1996 st->thr, st->tym); 1997 if (avail < VG_(strlen)(unit) + 40/*let's say*/) { 1998 VG_(strcat)(buf, " ...]"); 1999 buf[nBuf-1] = 0; 2000 return; 2001 } 2002 VG_(strcat)(buf, unit); 2003 avail -= VG_(strlen)(unit); 2004 } 2005 VG_(strcat)(buf, "]"); 2006 buf[nBuf-1] = 0; 2007 } 2008 2009 2010 /* Debugging only. Return vts[index], so to speak. 2011 */ 2012 ULong VTS__indexAt_SLOW ( VTS* vts, Thr* idx ) { 2013 UWord i, n; 2014 stats__vts__indexat_slow++; 2015 tl_assert(vts && vts->ts); 2016 n = VG_(sizeXA)( vts->ts ); 2017 for (i = 0; i < n; i++) { 2018 ScalarTS* st = VG_(indexXA)( vts->ts, i ); 2019 if (st->thr == idx) 2020 return st->tym; 2021 } 2022 return 0; 2023 } 2024 2025 2026 ///////////////////////////////////////////////////////////////// 2027 ///////////////////////////////////////////////////////////////// 2028 // // 2029 // SECTION END vts primitives // 2030 // // 2031 ///////////////////////////////////////////////////////////////// 2032 ///////////////////////////////////////////////////////////////// 2033 2034 2035 2036 ///////////////////////////////////////////////////////////////// 2037 ///////////////////////////////////////////////////////////////// 2038 // // 2039 // SECTION BEGIN main library // 2040 // // 2041 ///////////////////////////////////////////////////////////////// 2042 ///////////////////////////////////////////////////////////////// 2043 2044 2045 ///////////////////////////////////////////////////////// 2046 // // 2047 // VTS set // 2048 // // 2049 ///////////////////////////////////////////////////////// 2050 2051 static WordFM* /* VTS* void void */ vts_set = NULL; 2052 2053 static void vts_set_init ( void ) 2054 { 2055 tl_assert(!vts_set); 2056 vts_set = VG_(newFM)( HG_(zalloc), "libhb.vts_set_init.1", 2057 HG_(free), 2058 (Word(*)(UWord,UWord))VTS__cmp_structural ); 2059 tl_assert(vts_set); 2060 } 2061 2062 /* Given a newly made VTS, look in vts_set to see if we already have 2063 an identical one. If yes, free up this one and return instead a 2064 pointer to the existing one. If no, add this one to the set and 2065 return the same pointer. Caller differentiates the two cases by 2066 comparing returned pointer with the supplied one (although that 2067 does require that the supplied VTS is not already in the set). 2068 */ 2069 static VTS* vts_set__find_and_dealloc__or_add ( VTS* cand ) 2070 { 2071 UWord keyW, valW; 2072 stats__vts_set__fadoa++; 2073 /* lookup cand (by value) */ 2074 if (VG_(lookupFM)( vts_set, &keyW, &valW, (UWord)cand )) { 2075 /* found it */ 2076 tl_assert(valW == 0); 2077 /* if this fails, cand (by ref) was already present (!) */ 2078 tl_assert(keyW != (UWord)cand); 2079 stats__vts_set__fadoa_d++; 2080 VTS__delete(cand); 2081 return (VTS*)keyW; 2082 } else { 2083 /* not present. Add and return pointer to same. 
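         This interning is what makes the VtsIDs handed out by the VTS
         table below usable as keys: structurally equal VTSs always
         collapse onto a single VTS*, and hence onto a single VtsID.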
*/ 2084 VG_(addToFM)( vts_set, (UWord)cand, 0/*val is unused*/ ); 2085 return cand; 2086 } 2087 } 2088 2089 2090 ///////////////////////////////////////////////////////// 2091 // // 2092 // VTS table // 2093 // // 2094 ///////////////////////////////////////////////////////// 2095 2096 static void VtsID__invalidate_caches ( void ); /* fwds */ 2097 2098 /* A type to hold VTS table entries. Invariants: 2099 If .vts == NULL, then this entry is not in use, so: 2100 - .rc == 0 2101 - this entry is on the freelist (unfortunately, does not imply 2102 any constraints on value for .nextfree) 2103 If .vts != NULL, then this entry is in use: 2104 - .vts is findable in vts_set 2105 - .vts->id == this entry number 2106 - no specific value for .rc (even 0 is OK) 2107 - this entry is not on freelist, so .nextfree == VtsID_INVALID 2108 */ 2109 typedef 2110 struct { 2111 VTS* vts; /* vts, in vts_set */ 2112 UWord rc; /* reference count - enough for entire aspace */ 2113 VtsID freelink; /* chain for free entries, VtsID_INVALID at end */ 2114 } 2115 VtsTE; 2116 2117 /* The VTS table. */ 2118 static XArray* /* of VtsTE */ vts_tab = NULL; 2119 2120 /* An index into the VTS table, indicating the start of the list of 2121 free (available for use) entries. If the list is empty, this is 2122 VtsID_INVALID. */ 2123 static VtsID vts_tab_freelist = VtsID_INVALID; 2124 2125 /* Do a GC of vts_tab when the freelist becomes empty AND the size of 2126 vts_tab equals or exceeds this size. After GC, the value here is 2127 set appropriately so as to check for the next GC point. */ 2128 static Word vts_next_GC_at = 1000; 2129 2130 static void vts_tab_init ( void ) 2131 { 2132 vts_tab 2133 = VG_(newXA)( HG_(zalloc), "libhb.vts_tab_init.1", 2134 HG_(free), sizeof(VtsTE) ); 2135 vts_tab_freelist 2136 = VtsID_INVALID; 2137 tl_assert(vts_tab); 2138 } 2139 2140 /* Add ii to the free list, checking that it looks out-of-use. */ 2141 static void add_to_free_list ( VtsID ii ) 2142 { 2143 VtsTE* ie = VG_(indexXA)( vts_tab, ii ); 2144 tl_assert(ie->vts == NULL); 2145 tl_assert(ie->rc == 0); 2146 tl_assert(ie->freelink == VtsID_INVALID); 2147 ie->freelink = vts_tab_freelist; 2148 vts_tab_freelist = ii; 2149 } 2150 2151 /* Get an entry from the free list. This will return VtsID_INVALID if 2152 the free list is empty. */ 2153 static VtsID get_from_free_list ( void ) 2154 { 2155 VtsID ii; 2156 VtsTE* ie; 2157 if (vts_tab_freelist == VtsID_INVALID) 2158 return VtsID_INVALID; 2159 ii = vts_tab_freelist; 2160 ie = VG_(indexXA)( vts_tab, ii ); 2161 tl_assert(ie->vts == NULL); 2162 tl_assert(ie->rc == 0); 2163 vts_tab_freelist = ie->freelink; 2164 return ii; 2165 } 2166 2167 /* Produce a new VtsID that can be used, either by getting it from 2168 the freelist, or, if that is empty, by expanding vts_tab. */ 2169 static VtsID get_new_VtsID ( void ) 2170 { 2171 VtsID ii; 2172 VtsTE te; 2173 ii = get_from_free_list(); 2174 if (ii != VtsID_INVALID) 2175 return ii; 2176 te.vts = NULL; 2177 te.rc = 0; 2178 te.freelink = VtsID_INVALID; 2179 ii = (VtsID)VG_(addToXA)( vts_tab, &te ); 2180 return ii; 2181 } 2182 2183 2184 /* Indirect callback from lib_zsm. */ 2185 static void VtsID__rcinc ( VtsID ii ) 2186 { 2187 VtsTE* ie; 2188 /* VG_(indexXA) does a range check for us */ 2189 ie = VG_(indexXA)( vts_tab, ii ); 2190 tl_assert(ie->vts); /* else it's not in use */ 2191 tl_assert(ie->rc < ~0UL); /* else we can't continue */ 2192 tl_assert(ie->vts->id == ii); 2193 ie->rc++; 2194 } 2195 2196 /* Indirect callback from lib_zsm. 
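   Together with VtsID__rcinc above, this keeps each vts_tab entry's
   .rc tracking the number of shadow-memory SVals that mention its
   VtsID; entries whose count is zero once vts_tab__do_GC has flushed
   the zsm cache are reclaimed by that GC.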
*/ 2197 static void VtsID__rcdec ( VtsID ii ) 2198 { 2199 VtsTE* ie; 2200 /* VG_(indexXA) does a range check for us */ 2201 ie = VG_(indexXA)( vts_tab, ii ); 2202 tl_assert(ie->vts); /* else it's not in use */ 2203 tl_assert(ie->rc > 0); /* else RC snafu */ 2204 tl_assert(ie->vts->id == ii); 2205 ie->rc--; 2206 } 2207 2208 2209 /* Look up 'cand' in our collection of VTSs. If present, deallocate 2210 it and return the VtsID for the pre-existing version. If not 2211 present, add it to both vts_tab and vts_set, allocate a fresh VtsID 2212 for it, and return that. */ 2213 static VtsID vts_tab__find_and_dealloc__or_add ( VTS* cand ) 2214 { 2215 VTS* auld; 2216 tl_assert(cand->id == VtsID_INVALID); 2217 auld = vts_set__find_and_dealloc__or_add(cand); 2218 if (auld != cand) { 2219 /* We already have an Aulde one. Use that. */ 2220 VtsTE* ie; 2221 tl_assert(auld->id != VtsID_INVALID); 2222 ie = VG_(indexXA)( vts_tab, auld->id ); 2223 tl_assert(ie->vts == auld); 2224 return auld->id; 2225 } else { 2226 VtsID ii = get_new_VtsID(); 2227 VtsTE* ie = VG_(indexXA)( vts_tab, ii ); 2228 ie->vts = cand; 2229 ie->rc = 0; 2230 ie->freelink = VtsID_INVALID; 2231 cand->id = ii; 2232 return ii; 2233 } 2234 } 2235 2236 2237 static void show_vts_stats ( HChar* caller ) 2238 { 2239 UWord nSet, nTab, nLive; 2240 ULong totrc; 2241 UWord n, i; 2242 nSet = VG_(sizeFM)( vts_set ); 2243 nTab = VG_(sizeXA)( vts_tab ); 2244 totrc = 0; 2245 nLive = 0; 2246 n = VG_(sizeXA)( vts_tab ); 2247 for (i = 0; i < n; i++) { 2248 VtsTE* ie = VG_(indexXA)( vts_tab, i ); 2249 if (ie->vts) { 2250 nLive++; 2251 totrc += (ULong)ie->rc; 2252 } else { 2253 tl_assert(ie->rc == 0); 2254 } 2255 } 2256 VG_(printf)(" show_vts_stats %s\n", caller); 2257 VG_(printf)(" vts_tab size %4lu\n", nTab); 2258 VG_(printf)(" vts_tab live %4lu\n", nLive); 2259 VG_(printf)(" vts_set size %4lu\n", nSet); 2260 VG_(printf)(" total rc %4llu\n", totrc); 2261 } 2262 2263 /* NOT TO BE CALLED FROM WITHIN libzsm. */ 2264 __attribute__((noinline)) 2265 static void vts_tab__do_GC ( Bool show_stats ) 2266 { 2267 UWord i, nTab, nLive, nFreed; 2268 2269 /* check this is actually necessary. */ 2270 tl_assert(vts_tab_freelist == VtsID_INVALID); 2271 2272 /* empty the caches for partial order checks and binary joins. We 2273 could do better and prune out the entries to be deleted, but it 2274 ain't worth the hassle. */ 2275 VtsID__invalidate_caches(); 2276 2277 /* First, make the reference counts up to date. */ 2278 zsm_flush_cache(); 2279 2280 nTab = VG_(sizeXA)( vts_tab ); 2281 2282 if (show_stats) { 2283 VG_(printf)("<<GC begins at vts_tab size %lu>>\n", nTab); 2284 show_vts_stats("before GC"); 2285 } 2286 2287 /* Now we can inspect the entire vts_tab. Any entries 2288 with zero .rc fields are now no longer in use and can be 2289 free list, removed from vts_set, and deleted. */ 2290 nFreed = 0; 2291 for (i = 0; i < nTab; i++) { 2292 Bool present; 2293 UWord oldK = 0, oldV = 0; 2294 VtsTE* te = VG_(indexXA)( vts_tab, i ); 2295 if (te->vts == NULL) { 2296 tl_assert(te->rc == 0); 2297 continue; /* already on the free list (presumably) */ 2298 } 2299 if (te->rc > 0) 2300 continue; /* in use */ 2301 /* Ok, we got one we can free. */ 2302 tl_assert(te->vts->id == i); 2303 /* first, remove it from vts_set. */ 2304 present = VG_(delFromFM)( vts_set, 2305 &oldK, &oldV, (UWord)te->vts ); 2306 tl_assert(present); /* else it isn't in vts_set ?! 
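                                  Every in-use entry's .vts must be
                                  findable in vts_set, per the VtsTE
                                  invariants above.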
*/ 2307 tl_assert(oldV == 0); /* no info stored in vts_set val fields */ 2308 tl_assert(oldK == (UWord)te->vts); /* else what did delFromFM find?! */ 2309 /* now free the VTS itself */ 2310 VTS__delete(te->vts); 2311 te->vts = NULL; 2312 /* and finally put this entry on the free list */ 2313 tl_assert(te->freelink == VtsID_INVALID); /* can't already be on it */ 2314 add_to_free_list( i ); 2315 nFreed++; 2316 } 2317 2318 /* Now figure out when the next GC should be. We'll allow the 2319 number of VTSs to double before GCing again. Except of course 2320 that since we can't (or, at least, don't) shrink vts_tab, we 2321 can't set the threshhold value smaller than it. */ 2322 tl_assert(nFreed <= nTab); 2323 nLive = nTab - nFreed; 2324 tl_assert(nLive >= 0 && nLive <= nTab); 2325 vts_next_GC_at = 2 * nLive; 2326 if (vts_next_GC_at < nTab) 2327 vts_next_GC_at = nTab; 2328 2329 if (show_stats) { 2330 show_vts_stats("after GC"); 2331 VG_(printf)("<<GC ends, next gc at %ld>>\n", vts_next_GC_at); 2332 } 2333 2334 if (VG_(clo_stats)) { 2335 static UInt ctr = 0; 2336 tl_assert(nTab > 0); 2337 VG_(message)(Vg_DebugMsg, 2338 "libhb: VTS GC: #%u old size %lu live %lu (%2llu%%)\n", 2339 ctr++, nTab, nLive, (100ULL * (ULong)nLive) / (ULong)nTab); 2340 } 2341 } 2342 2343 2344 ///////////////////////////////////////////////////////// 2345 // // 2346 // Vts IDs // 2347 // // 2348 ///////////////////////////////////////////////////////// 2349 2350 ////////////////////////// 2351 static ULong stats__cmpLEQ_queries = 0; 2352 static ULong stats__cmpLEQ_misses = 0; 2353 static ULong stats__join2_queries = 0; 2354 static ULong stats__join2_misses = 0; 2355 2356 static inline UInt ROL32 ( UInt w, Int n ) { 2357 w = (w << n) | (w >> (32-n)); 2358 return w; 2359 } 2360 static inline UInt hash_VtsIDs ( VtsID vi1, VtsID vi2, UInt nTab ) { 2361 UInt hash = ROL32(vi1,19) ^ ROL32(vi2,13); 2362 return hash % nTab; 2363 } 2364 2365 #define N_CMPLEQ_CACHE 1023 2366 static 2367 struct { VtsID vi1; VtsID vi2; Bool leq; } 2368 cmpLEQ_cache[N_CMPLEQ_CACHE]; 2369 2370 #define N_JOIN2_CACHE 1023 2371 static 2372 struct { VtsID vi1; VtsID vi2; VtsID res; } 2373 join2_cache[N_JOIN2_CACHE]; 2374 2375 static void VtsID__invalidate_caches ( void ) { 2376 Int i; 2377 for (i = 0; i < N_CMPLEQ_CACHE; i++) { 2378 cmpLEQ_cache[i].vi1 = VtsID_INVALID; 2379 cmpLEQ_cache[i].vi2 = VtsID_INVALID; 2380 cmpLEQ_cache[i].leq = False; 2381 } 2382 for (i = 0; i < N_JOIN2_CACHE; i++) { 2383 join2_cache[i].vi1 = VtsID_INVALID; 2384 join2_cache[i].vi2 = VtsID_INVALID; 2385 join2_cache[i].res = VtsID_INVALID; 2386 } 2387 } 2388 ////////////////////////// 2389 2390 //static Bool VtsID__is_valid ( VtsID vi ) { 2391 // VtsTE* ve; 2392 // if (vi >= (VtsID)VG_(sizeXA)( vts_tab )) 2393 // return False; 2394 // ve = VG_(indexXA)( vts_tab, vi ); 2395 // if (!ve->vts) 2396 // return False; 2397 // tl_assert(ve->vts->id == vi); 2398 // return True; 2399 //} 2400 2401 static VTS* VtsID__to_VTS ( VtsID vi ) { 2402 VtsTE* te = VG_(indexXA)( vts_tab, vi ); 2403 tl_assert(te->vts); 2404 return te->vts; 2405 } 2406 2407 static void VtsID__pp ( VtsID vi ) { 2408 HChar buf[100]; 2409 VTS* vts = VtsID__to_VTS(vi); 2410 VTS__show( buf, sizeof(buf)-1, vts ); 2411 buf[sizeof(buf)-1] = 0; 2412 VG_(printf)("%s", buf); 2413 } 2414 2415 /* compute partial ordering relation of vi1 and vi2. 
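   Results are memoised in cmpLEQ_cache, indexed by a hash of the two
   VtsIDs; the vi1 == vi2 fast path lives in the inline wrapper below,
   so this worker is only reached with distinct IDs.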
*/ 2416 __attribute__((noinline)) 2417 static Bool VtsID__cmpLEQ_WRK ( VtsID vi1, VtsID vi2 ) { 2418 UInt hash; 2419 Bool leq; 2420 VTS *v1, *v2; 2421 //if (vi1 == vi2) return True; 2422 tl_assert(vi1 != vi2); 2423 ////++ 2424 stats__cmpLEQ_queries++; 2425 hash = hash_VtsIDs(vi1, vi2, N_CMPLEQ_CACHE); 2426 if (cmpLEQ_cache[hash].vi1 == vi1 2427 && cmpLEQ_cache[hash].vi2 == vi2) 2428 return cmpLEQ_cache[hash].leq; 2429 stats__cmpLEQ_misses++; 2430 ////-- 2431 v1 = VtsID__to_VTS(vi1); 2432 v2 = VtsID__to_VTS(vi2); 2433 leq = VTS__cmpLEQ( v1, v2 ) == NULL; 2434 ////++ 2435 cmpLEQ_cache[hash].vi1 = vi1; 2436 cmpLEQ_cache[hash].vi2 = vi2; 2437 cmpLEQ_cache[hash].leq = leq; 2438 ////-- 2439 return leq; 2440 } 2441 static inline Bool VtsID__cmpLEQ ( VtsID vi1, VtsID vi2 ) { 2442 return LIKELY(vi1 == vi2) ? True : VtsID__cmpLEQ_WRK(vi1, vi2); 2443 } 2444 2445 /* compute binary join */ 2446 __attribute__((noinline)) 2447 static VtsID VtsID__join2_WRK ( VtsID vi1, VtsID vi2 ) { 2448 UInt hash; 2449 VtsID res; 2450 VTS *vts1, *vts2, *nyu; 2451 //if (vi1 == vi2) return vi1; 2452 tl_assert(vi1 != vi2); 2453 ////++ 2454 stats__join2_queries++; 2455 hash = hash_VtsIDs(vi1, vi2, N_JOIN2_CACHE); 2456 if (join2_cache[hash].vi1 == vi1 2457 && join2_cache[hash].vi2 == vi2) 2458 return join2_cache[hash].res; 2459 stats__join2_misses++; 2460 ////-- 2461 vts1 = VtsID__to_VTS(vi1); 2462 vts2 = VtsID__to_VTS(vi2); 2463 nyu = VTS__join(vts1,vts2); 2464 res = vts_tab__find_and_dealloc__or_add(nyu); 2465 ////++ 2466 join2_cache[hash].vi1 = vi1; 2467 join2_cache[hash].vi2 = vi2; 2468 join2_cache[hash].res = res; 2469 ////-- 2470 return res; 2471 } 2472 static inline VtsID VtsID__join2 ( VtsID vi1, VtsID vi2 ) { 2473 return LIKELY(vi1 == vi2) ? vi1 : VtsID__join2_WRK(vi1, vi2); 2474 } 2475 2476 /* create a singleton VTS, namely [thr:1] */ 2477 static VtsID VtsID__mk_Singleton ( Thr* thr, ULong tym ) { 2478 VTS* nyu = VTS__singleton(thr,tym); 2479 return vts_tab__find_and_dealloc__or_add(nyu); 2480 } 2481 2482 /* tick operation, creates value 1 if specified index is absent */ 2483 static VtsID VtsID__tick ( VtsID vi, Thr* idx ) { 2484 VTS* vts = VtsID__to_VTS(vi); 2485 VTS* nyu = VTS__tick(idx,vts); 2486 return vts_tab__find_and_dealloc__or_add(nyu); 2487 } 2488 2489 /* index into a VTS (only for assertions) */ 2490 static ULong VtsID__indexAt ( VtsID vi, Thr* idx ) { 2491 VTS* vts = VtsID__to_VTS(vi); 2492 return VTS__indexAt_SLOW( vts, idx ); 2493 } 2494 2495 /* Assuming that !cmpLEQ(vi1, vi2), find the index of the first (or 2496 any, really) element in vi1 which is pointwise greater-than the 2497 corresponding element in vi2. If no such element exists, return 2498 NULL. This needs to be fairly quick since it is called every time 2499 a race is detected. */ 2500 static Thr* VtsID__findFirst_notLEQ ( VtsID vi1, VtsID vi2 ) 2501 { 2502 VTS *vts1, *vts2; 2503 Thr* diffthr; 2504 tl_assert(vi1 != vi2); 2505 vts1 = VtsID__to_VTS(vi1); 2506 vts2 = VtsID__to_VTS(vi2); 2507 tl_assert(vts1 != vts2); 2508 diffthr = VTS__cmpLEQ(vts1, vts2); 2509 tl_assert(diffthr); /* else they are LEQ ! 
*/ 2510 return diffthr; 2511 } 2512 2513 2514 ///////////////////////////////////////////////////////// 2515 // // 2516 // Filters // 2517 // // 2518 ///////////////////////////////////////////////////////// 2519 2520 // baseline: 5, 9 2521 #define FI_LINE_SZB_LOG2 5 2522 #define FI_NUM_LINES_LOG2 10 2523 2524 #define FI_LINE_SZB (1 << FI_LINE_SZB_LOG2) 2525 #define FI_NUM_LINES (1 << FI_NUM_LINES_LOG2) 2526 2527 #define FI_TAG_MASK (~(Addr)(FI_LINE_SZB - 1)) 2528 #define FI_GET_TAG(_a) ((_a) & FI_TAG_MASK) 2529 2530 #define FI_GET_LINENO(_a) ( ((_a) >> FI_LINE_SZB_LOG2) \ 2531 & (Addr)(FI_NUM_LINES-1) ) 2532 2533 2534 /* In the lines, each 8 bytes are treated individually, and are mapped 2535 to a UShort. Regardless of endianness of the underlying machine, 2536 bits 1 and 0 pertain to the lowest address and bits 15 and 14 to 2537 the highest address. 2538 2539 Of each bit pair, the higher numbered bit is set if a R has been 2540 seen, so the actual layout is: 2541 2542 15 14 ... 01 00 2543 2544 R W for addr+7 ... R W for addr+0 2545 2546 So a mask for the R-bits is 0xAAAA and for the W bits is 0x5555. 2547 */ 2548 2549 /* tags are separated from lines. tags are Addrs and are 2550 the base address of the line. */ 2551 typedef 2552 struct { 2553 UShort u16s[FI_LINE_SZB / 8]; /* each UShort covers 8 bytes */ 2554 } 2555 FiLine; 2556 2557 typedef 2558 struct { 2559 Addr tags[FI_NUM_LINES]; 2560 FiLine lines[FI_NUM_LINES]; 2561 } 2562 Filter; 2563 2564 /* Forget everything we know -- clear the filter and let everything 2565 through. This needs to be as fast as possible, since it is called 2566 every time the running thread changes, and every time a thread's 2567 vector clocks change, which can be quite frequent. The obvious 2568 fast way to do this is simply to stuff in tags which we know are 2569 not going to match anything, since they're not aligned to the start 2570 of a line. */ 2571 static void Filter__clear ( Filter* fi, HChar* who ) 2572 { 2573 UWord i; 2574 if (0) VG_(printf)(" Filter__clear(%p, %s)\n", fi, who); 2575 for (i = 0; i < FI_NUM_LINES; i += 8) { 2576 fi->tags[i+0] = 1; /* impossible value -- cannot match */ 2577 fi->tags[i+1] = 1; 2578 fi->tags[i+2] = 1; 2579 fi->tags[i+3] = 1; 2580 fi->tags[i+4] = 1; 2581 fi->tags[i+5] = 1; 2582 fi->tags[i+6] = 1; 2583 fi->tags[i+7] = 1; 2584 } 2585 tl_assert(i == FI_NUM_LINES); 2586 } 2587 2588 /* Clearing an arbitrary range in the filter. Unfortunately 2589 we have to do this due to core-supplied new/die-mem events. */ 2590 2591 static void Filter__clear_1byte ( Filter* fi, Addr a ) 2592 { 2593 Addr atag = FI_GET_TAG(a); /* tag of 'a' */ 2594 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */ 2595 FiLine* line = &fi->lines[lineno]; 2596 UWord loff = (a - atag) / 8; 2597 UShort mask = 0x3 << (2 * (a & 7)); 2598 /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */ 2599 if (LIKELY( fi->tags[lineno] == atag )) { 2600 /* hit. clear the bits. */ 2601 UShort u16 = line->u16s[loff]; 2602 line->u16s[loff] = u16 & ~mask; /* clear them */ 2603 } else { 2604 /* miss. The filter doesn't hold this address, so ignore. */ 2605 } 2606 } 2607 2608 static void Filter__clear_8bytes_aligned ( Filter* fi, Addr a ) 2609 { 2610 Addr atag = FI_GET_TAG(a); /* tag of 'a' */ 2611 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */ 2612 FiLine* line = &fi->lines[lineno]; 2613 UWord loff = (a - atag) / 8; 2614 if (LIKELY( fi->tags[lineno] == atag )) { 2615 line->u16s[loff] = 0; 2616 } else { 2617 /* miss. 
The filter doesn't hold this address, so ignore. */ 2618 } 2619 } 2620 2621 static void Filter__clear_range ( Filter* fi, Addr a, UWord len ) 2622 { 2623 //VG_(printf)("%lu ", len); 2624 /* slowly do part preceding 8-alignment */ 2625 while (UNLIKELY(!VG_IS_8_ALIGNED(a)) && LIKELY(len > 0)) { 2626 Filter__clear_1byte( fi, a ); 2627 a++; 2628 len--; 2629 } 2630 /* vector loop */ 2631 while (len >= 8) { 2632 Filter__clear_8bytes_aligned( fi, a ); 2633 a += 8; 2634 len -= 8; 2635 } 2636 /* slowly do tail */ 2637 while (UNLIKELY(len > 0)) { 2638 Filter__clear_1byte( fi, a ); 2639 a++; 2640 len--; 2641 } 2642 } 2643 2644 2645 /* ------ Read handlers for the filter. ------ */ 2646 2647 static inline Bool Filter__ok_to_skip_crd64 ( Filter* fi, Addr a ) 2648 { 2649 if (UNLIKELY( !VG_IS_8_ALIGNED(a) )) 2650 return False; 2651 { 2652 Addr atag = FI_GET_TAG(a); /* tag of 'a' */ 2653 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */ 2654 FiLine* line = &fi->lines[lineno]; 2655 UWord loff = (a - atag) / 8; 2656 UShort mask = 0xAAAA; 2657 if (LIKELY( fi->tags[lineno] == atag )) { 2658 /* hit. check line and update. */ 2659 UShort u16 = line->u16s[loff]; 2660 Bool ok = (u16 & mask) == mask; /* all R bits set? */ 2661 line->u16s[loff] = u16 | mask; /* set them */ 2662 return ok; 2663 } else { 2664 /* miss. nuke existing line and re-use it. */ 2665 UWord i; 2666 fi->tags[lineno] = atag; 2667 for (i = 0; i < FI_LINE_SZB / 8; i++) 2668 line->u16s[i] = 0; 2669 line->u16s[loff] = mask; 2670 return False; 2671 } 2672 } 2673 } 2674 2675 static inline Bool Filter__ok_to_skip_crd32 ( Filter* fi, Addr a ) 2676 { 2677 if (UNLIKELY( !VG_IS_4_ALIGNED(a) )) 2678 return False; 2679 { 2680 Addr atag = FI_GET_TAG(a); /* tag of 'a' */ 2681 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */ 2682 FiLine* line = &fi->lines[lineno]; 2683 UWord loff = (a - atag) / 8; 2684 UShort mask = 0xAA << (2 * (a & 4)); /* 0xAA00 or 0x00AA */ 2685 if (LIKELY( fi->tags[lineno] == atag )) { 2686 /* hit. check line and update. */ 2687 UShort u16 = line->u16s[loff]; 2688 Bool ok = (u16 & mask) == mask; /* 4 x R bits set? */ 2689 line->u16s[loff] = u16 | mask; /* set them */ 2690 return ok; 2691 } else { 2692 /* miss. nuke existing line and re-use it. */ 2693 UWord i; 2694 fi->tags[lineno] = atag; 2695 for (i = 0; i < FI_LINE_SZB / 8; i++) 2696 line->u16s[i] = 0; 2697 line->u16s[loff] = mask; 2698 return False; 2699 } 2700 } 2701 } 2702 2703 static inline Bool Filter__ok_to_skip_crd16 ( Filter* fi, Addr a ) 2704 { 2705 if (UNLIKELY( !VG_IS_2_ALIGNED(a) )) 2706 return False; 2707 { 2708 Addr atag = FI_GET_TAG(a); /* tag of 'a' */ 2709 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */ 2710 FiLine* line = &fi->lines[lineno]; 2711 UWord loff = (a - atag) / 8; 2712 UShort mask = 0xA << (2 * (a & 6)); 2713 /* mask is A000, 0A00, 00A0 or 000A */ 2714 if (LIKELY( fi->tags[lineno] == atag )) { 2715 /* hit. check line and update. */ 2716 UShort u16 = line->u16s[loff]; 2717 Bool ok = (u16 & mask) == mask; /* 2 x R bits set? */ 2718 line->u16s[loff] = u16 | mask; /* set them */ 2719 return ok; 2720 } else { 2721 /* miss. nuke existing line and re-use it. 
*/ 2722 UWord i; 2723 fi->tags[lineno] = atag; 2724 for (i = 0; i < FI_LINE_SZB / 8; i++) 2725 line->u16s[i] = 0; 2726 line->u16s[loff] = mask; 2727 return False; 2728 } 2729 } 2730 } 2731 2732 static inline Bool Filter__ok_to_skip_crd08 ( Filter* fi, Addr a ) 2733 { 2734 { 2735 Addr atag = FI_GET_TAG(a); /* tag of 'a' */ 2736 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */ 2737 FiLine* line = &fi->lines[lineno]; 2738 UWord loff = (a - atag) / 8; 2739 UShort mask = 0x2 << (2 * (a & 7)); 2740 /* mask is 8000, 2000, 0800, 0200, 0080, 0020, 0008 or 0002 */ 2741 if (LIKELY( fi->tags[lineno] == atag )) { 2742 /* hit. check line and update. */ 2743 UShort u16 = line->u16s[loff]; 2744 Bool ok = (u16 & mask) == mask; /* 1 x R bits set? */ 2745 line->u16s[loff] = u16 | mask; /* set them */ 2746 return ok; 2747 } else { 2748 /* miss. nuke existing line and re-use it. */ 2749 UWord i; 2750 fi->tags[lineno] = atag; 2751 for (i = 0; i < FI_LINE_SZB / 8; i++) 2752 line->u16s[i] = 0; 2753 line->u16s[loff] = mask; 2754 return False; 2755 } 2756 } 2757 } 2758 2759 2760 /* ------ Write handlers for the filter. ------ */ 2761 2762 static inline Bool Filter__ok_to_skip_cwr64 ( Filter* fi, Addr a ) 2763 { 2764 if (UNLIKELY( !VG_IS_8_ALIGNED(a) )) 2765 return False; 2766 { 2767 Addr atag = FI_GET_TAG(a); /* tag of 'a' */ 2768 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */ 2769 FiLine* line = &fi->lines[lineno]; 2770 UWord loff = (a - atag) / 8; 2771 UShort mask = 0xFFFF; 2772 if (LIKELY( fi->tags[lineno] == atag )) { 2773 /* hit. check line and update. */ 2774 UShort u16 = line->u16s[loff]; 2775 Bool ok = (u16 & mask) == mask; /* all R & W bits set? */ 2776 line->u16s[loff] = u16 | mask; /* set them */ 2777 return ok; 2778 } else { 2779 /* miss. nuke existing line and re-use it. */ 2780 UWord i; 2781 fi->tags[lineno] = atag; 2782 for (i = 0; i < FI_LINE_SZB / 8; i++) 2783 line->u16s[i] = 0; 2784 line->u16s[loff] = mask; 2785 return False; 2786 } 2787 } 2788 } 2789 2790 static inline Bool Filter__ok_to_skip_cwr32 ( Filter* fi, Addr a ) 2791 { 2792 if (UNLIKELY( !VG_IS_4_ALIGNED(a) )) 2793 return False; 2794 { 2795 Addr atag = FI_GET_TAG(a); /* tag of 'a' */ 2796 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */ 2797 FiLine* line = &fi->lines[lineno]; 2798 UWord loff = (a - atag) / 8; 2799 UShort mask = 0xFF << (2 * (a & 4)); /* 0xFF00 or 0x00FF */ 2800 if (LIKELY( fi->tags[lineno] == atag )) { 2801 /* hit. check line and update. */ 2802 UShort u16 = line->u16s[loff]; 2803 Bool ok = (u16 & mask) == mask; /* 4 x R & W bits set? */ 2804 line->u16s[loff] = u16 | mask; /* set them */ 2805 return ok; 2806 } else { 2807 /* miss. nuke existing line and re-use it. */ 2808 UWord i; 2809 fi->tags[lineno] = atag; 2810 for (i = 0; i < FI_LINE_SZB / 8; i++) 2811 line->u16s[i] = 0; 2812 line->u16s[loff] = mask; 2813 return False; 2814 } 2815 } 2816 } 2817 2818 static inline Bool Filter__ok_to_skip_cwr16 ( Filter* fi, Addr a ) 2819 { 2820 if (UNLIKELY( !VG_IS_2_ALIGNED(a) )) 2821 return False; 2822 { 2823 Addr atag = FI_GET_TAG(a); /* tag of 'a' */ 2824 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */ 2825 FiLine* line = &fi->lines[lineno]; 2826 UWord loff = (a - atag) / 8; 2827 UShort mask = 0xF << (2 * (a & 6)); 2828 /* mask is F000, 0F00, 00F0 or 000F */ 2829 if (LIKELY( fi->tags[lineno] == atag )) { 2830 /* hit. check line and update. */ 2831 UShort u16 = line->u16s[loff]; 2832 Bool ok = (u16 & mask) == mask; /* 2 x R & W bits set? 
*/ 2833 line->u16s[loff] = u16 | mask; /* set them */ 2834 return ok; 2835 } else { 2836 /* miss. nuke existing line and re-use it. */ 2837 UWord i; 2838 fi->tags[lineno] = atag; 2839 for (i = 0; i < FI_LINE_SZB / 8; i++) 2840 line->u16s[i] = 0; 2841 line->u16s[loff] = mask; 2842 return False; 2843 } 2844 } 2845 } 2846 2847 static inline Bool Filter__ok_to_skip_cwr08 ( Filter* fi, Addr a ) 2848 { 2849 { 2850 Addr atag = FI_GET_TAG(a); /* tag of 'a' */ 2851 UWord lineno = FI_GET_LINENO(a); /* lineno for 'a' */ 2852 FiLine* line = &fi->lines[lineno]; 2853 UWord loff = (a - atag) / 8; 2854 UShort mask = 0x3 << (2 * (a & 7)); 2855 /* mask is C000, 3000, 0C00, 0300, 00C0, 0030, 000C or 0003 */ 2856 if (LIKELY( fi->tags[lineno] == atag )) { 2857 /* hit. check line and update. */ 2858 UShort u16 = line->u16s[loff]; 2859 Bool ok = (u16 & mask) == mask; /* 1 x R bits set? */ 2860 line->u16s[loff] = u16 | mask; /* set them */ 2861 return ok; 2862 } else { 2863 /* miss. nuke existing line and re-use it. */ 2864 UWord i; 2865 fi->tags[lineno] = atag; 2866 for (i = 0; i < FI_LINE_SZB / 8; i++) 2867 line->u16s[i] = 0; 2868 line->u16s[loff] = mask; 2869 return False; 2870 } 2871 } 2872 } 2873 2874 2875 ///////////////////////////////////////////////////////// 2876 // // 2877 // Threads // 2878 // // 2879 ///////////////////////////////////////////////////////// 2880 2881 // QQQ move this somewhere else 2882 typedef struct { ULong ull; ExeContext* ec; } ULong_n_EC; 2883 2884 /* How many of the above records to collect for each thread? Older 2885 ones are dumped when we run out of space. 62.5k requires 1MB per 2886 thread, since each ULong_n_EC record is 16 bytes long. When more 2887 than N_KWs_N_STACKs_PER_THREAD are present, the older half are 2888 deleted to make space. Hence in the worst case we will be able to 2889 produce a stack at least for the last N_KWs_N_STACKs_PER_THREAD / 2 2890 Kw transitions (segments in this thread). For the current setting 2891 that gives a guaranteed stack for at least the last 31.25k 2892 segments. */ 2893 #define N_KWs_N_STACKs_PER_THREAD 62500 2894 2895 2896 struct _Thr { 2897 /* Current VTSs for this thread. They change as we go along. viR 2898 is the VTS to be used for reads, viW for writes. Usually they 2899 are the same, but can differ when we deal with reader-writer 2900 locks. It is always the case that 2901 VtsID__cmpLEQ(viW,viR) == True 2902 that is, viW must be the same, or lagging behind, viR. */ 2903 VtsID viR; 2904 VtsID viW; 2905 2906 /* Is initially False, and is set to true after the thread really 2907 has done a low-level exit. */ 2908 Bool still_alive; 2909 2910 /* A filter that removes references for which we believe that 2911 msmcread/msmcwrite will not change the state, nor report a 2912 race. */ 2913 Filter* filter; 2914 2915 /* opaque (to us) data we hold on behalf of the library's user. */ 2916 void* opaque; 2917 2918 /* The ULongs (scalar Kws) in this accumulate in strictly 2919 increasing order, without duplicates. This is important because 2920 we need to be able to find a given scalar Kw in this array 2921 later, by binary search. 
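      Each entry is a ULong_n_EC, pairing a scalar Kw value with the
      stack (ExeContext) in force when it was recorded; see
      note_local_Kw_n_stack_for below.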
*/ 2922 XArray* /* ULong_n_EC */ local_Kws_n_stacks; 2923 }; 2924 2925 static Thr* Thr__new ( void ) { 2926 Thr* thr = HG_(zalloc)( "libhb.Thr__new.1", sizeof(Thr) ); 2927 thr->viR = VtsID_INVALID; 2928 thr->viW = VtsID_INVALID; 2929 thr->still_alive = True; 2930 thr->filter = HG_(zalloc)( "libhb.Thr__new.2", sizeof(Filter) ); 2931 /* We only really need this at history level 1, but unfortunately 2932 this routine is called before the command line processing is 2933 done (sigh), so we can't rely on HG_(clo_history_level) at this 2934 point. Hence always allocate it. Bah. */ 2935 thr->local_Kws_n_stacks 2936 = VG_(newXA)( HG_(zalloc), 2937 "libhb.Thr__new.3 (local_Kws_and_stacks)", 2938 HG_(free), sizeof(ULong_n_EC) ); 2939 return thr; 2940 } 2941 2942 static void note_local_Kw_n_stack_for ( Thr* thr ) 2943 { 2944 Word nPresent; 2945 ULong_n_EC pair; 2946 tl_assert(thr); 2947 2948 // We only collect this info at history level 1 (approx) 2949 if (HG_(clo_history_level) != 1) 2950 return; 2951 2952 /* This is the scalar Kw for thr. */ 2953 pair.ull = VtsID__indexAt( thr->viW, thr ); 2954 pair.ec = main_get_EC( thr ); 2955 tl_assert(pair.ec); 2956 tl_assert(thr->local_Kws_n_stacks); 2957 2958 /* check that we're not adding duplicates */ 2959 nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks ); 2960 2961 /* Throw away old stacks, if necessary. We can't accumulate stuff 2962 indefinitely. */ 2963 if (nPresent >= N_KWs_N_STACKs_PER_THREAD) { 2964 VG_(dropHeadXA)( thr->local_Kws_n_stacks, nPresent / 2 ); 2965 nPresent = VG_(sizeXA)( thr->local_Kws_n_stacks ); 2966 if (0) 2967 VG_(printf)("LOCAL Kw: thr %p, Kw %llu, ec %p (!!! gc !!!)\n", 2968 thr, pair.ull, pair.ec ); 2969 } 2970 2971 if (nPresent > 0) { 2972 ULong_n_EC* prevPair 2973 = (ULong_n_EC*)VG_(indexXA)( thr->local_Kws_n_stacks, nPresent-1 ); 2974 tl_assert( prevPair->ull <= pair.ull ); 2975 } 2976 2977 if (nPresent == 0) 2978 pair.ec = NULL; 2979 2980 VG_(addToXA)( thr->local_Kws_n_stacks, &pair ); 2981 2982 if (0) 2983 VG_(printf)("LOCAL Kw: thr %p, Kw %llu, ec %p\n", 2984 thr, pair.ull, pair.ec ); 2985 if (0) 2986 VG_(pp_ExeContext)(pair.ec); 2987 } 2988 2989 static Int cmp__ULong_n_EC__by_ULong ( ULong_n_EC* pair1, ULong_n_EC* pair2 ) 2990 { 2991 if (pair1->ull < pair2->ull) return -1; 2992 if (pair1->ull > pair2->ull) return 1; 2993 return 0; 2994 } 2995 2996 2997 ///////////////////////////////////////////////////////// 2998 // // 2999 // Shadow Values // 3000 // // 3001 ///////////////////////////////////////////////////////// 3002 3003 // type SVal, SVal_INVALID and SVal_NOACCESS are defined by 3004 // hb_zsm.h. We have to do everything else here. 3005 3006 /* SVal is 64 bit unsigned int. 
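   The two topmost bits form a tag, tested via SVAL_TAGMASK below,
   which distinguishes the three encodings: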
3007 3008 <---------30---------> <---------30---------> 3009 00 X-----Rmin-VtsID-----X 00 X-----Wmin-VtsID-----X C(Rmin,Wmin) 3010 10 X--------------------X XX X--------------------X A: SVal_NOACCESS 3011 11 0--------------------0 00 0--------------------0 A: SVal_INVALID 3012 3013 */ 3014 #define SVAL_TAGMASK (3ULL << 62) 3015 3016 static inline Bool SVal__isC ( SVal s ) { 3017 return (0ULL << 62) == (s & SVAL_TAGMASK); 3018 } 3019 static inline SVal SVal__mkC ( VtsID rmini, VtsID wmini ) { 3020 //tl_assert(VtsID__is_valid(rmini)); 3021 //tl_assert(VtsID__is_valid(wmini)); 3022 return (((ULong)rmini) << 32) | ((ULong)wmini); 3023 } 3024 static inline VtsID SVal__unC_Rmin ( SVal s ) { 3025 tl_assert(SVal__isC(s)); 3026 return (VtsID)(s >> 32); 3027 } 3028 static inline VtsID SVal__unC_Wmin ( SVal s ) { 3029 tl_assert(SVal__isC(s)); 3030 return (VtsID)(s & 0xFFFFFFFFULL); 3031 } 3032 3033 static inline Bool SVal__isA ( SVal s ) { 3034 return (2ULL << 62) == (s & SVAL_TAGMASK); 3035 } 3036 static inline SVal SVal__mkA ( void ) { 3037 return 2ULL << 62; 3038 } 3039 3040 /* Direct callback from lib_zsm. */ 3041 static void SVal__rcinc ( SVal s ) { 3042 if (SVal__isC(s)) { 3043 VtsID__rcinc( SVal__unC_Rmin(s) ); 3044 VtsID__rcinc( SVal__unC_Wmin(s) ); 3045 } 3046 } 3047 3048 /* Direct callback from lib_zsm. */ 3049 static void SVal__rcdec ( SVal s ) { 3050 if (SVal__isC(s)) { 3051 VtsID__rcdec( SVal__unC_Rmin(s) ); 3052 VtsID__rcdec( SVal__unC_Wmin(s) ); 3053 } 3054 } 3055 3056 3057 ///////////////////////////////////////////////////////// 3058 // // 3059 // A simple group (memory) allocator // 3060 // // 3061 ///////////////////////////////////////////////////////// 3062 3063 //////////////// BEGIN general group allocator 3064 typedef 3065 struct { 3066 UWord elemSzB; /* element size */ 3067 UWord nPerGroup; /* # elems per group */ 3068 void* (*alloc)(HChar*, SizeT); /* group allocator */ 3069 HChar* cc; /* group allocator's cc */ 3070 void (*free)(void*); /* group allocator's free-er (unused) */ 3071 /* XArray of void* (pointers to groups). The groups themselves. 3072 Each element is a pointer to a block of size (elemSzB * 3073 nPerGroup) bytes. */ 3074 XArray* groups; 3075 /* next free element. Is a pointer to an element in one of the 3076 groups pointed to by .groups. */ 3077 void* nextFree; 3078 } 3079 GroupAlloc; 3080 3081 static void init_GroupAlloc ( /*MOD*/GroupAlloc* ga, 3082 UWord elemSzB, 3083 UWord nPerGroup, 3084 void* (*alloc)(HChar*, SizeT), 3085 HChar* cc, 3086 void (*free)(void*) ) 3087 { 3088 tl_assert(0 == (elemSzB % sizeof(UWord))); 3089 tl_assert(elemSzB >= sizeof(UWord)); 3090 tl_assert(nPerGroup >= 100); /* let's say */ 3091 tl_assert(alloc); 3092 tl_assert(cc); 3093 tl_assert(free); 3094 tl_assert(ga); 3095 VG_(memset)(ga, 0, sizeof(*ga)); 3096 ga->elemSzB = elemSzB; 3097 ga->nPerGroup = nPerGroup; 3098 ga->groups = NULL; 3099 ga->alloc = alloc; 3100 ga->cc = cc; 3101 ga->free = free; 3102 ga->groups = VG_(newXA)( alloc, cc, free, sizeof(void*) ); 3103 ga->nextFree = NULL; 3104 tl_assert(ga->groups); 3105 } 3106 3107 /* The freelist is empty. Allocate a new group and put all the new 3108 elements in it onto the freelist. */ 3109 __attribute__((noinline)) 3110 static void gal_add_new_group ( GroupAlloc* ga ) 3111 { 3112 Word i; 3113 UWord* group; 3114 tl_assert(ga); 3115 tl_assert(ga->nextFree == NULL); 3116 group = ga->alloc( ga->cc, ga->elemSzB * ga->nPerGroup ); 3117 tl_assert(group); 3118 /* extend the freelist through the new group. 
Place the freelist 3119 pointer in the first word of each element. That's why the 3120 element size must be at least one word. */ 3121 for (i = ga->nPerGroup-1; i >= 0; i--) { 3122 UChar* elemC = ((UChar*)group) + i * ga->elemSzB; 3123 UWord* elem = (UWord*)elemC; 3124 tl_assert(0 == (((UWord)elem) % sizeof(UWord))); 3125 *elem = (UWord)ga->nextFree; 3126 ga->nextFree = elem; 3127 } 3128 /* and add to our collection of groups */ 3129 VG_(addToXA)( ga->groups, &group ); 3130 } 3131 3132 inline static void* gal_Alloc ( GroupAlloc* ga ) 3133 { 3134 UWord* elem; 3135 if (UNLIKELY(ga->nextFree == NULL)) { 3136 gal_add_new_group(ga); 3137 } 3138 elem = ga->nextFree; 3139 ga->nextFree = (void*)*elem; 3140 *elem = 0; /* unnecessary, but just to be on the safe side */ 3141 return elem; 3142 } 3143 3144 inline static void* gal_Alloc_w_size_check ( GroupAlloc* ga, SizeT n ) 3145 { 3146 tl_assert(n == ga->elemSzB); 3147 return gal_Alloc( ga ); 3148 } 3149 3150 inline static void gal_Free ( GroupAlloc* ga, void* p ) 3151 { 3152 UWord* elem = (UWord*)p; 3153 *elem = (UWord)ga->nextFree; 3154 ga->nextFree = elem; 3155 } 3156 //////////////// END general group allocator 3157 3158 3159 ///////////////////////////////////////////////////////// 3160 // // 3161 // Change-event map2 // 3162 // // 3163 ///////////////////////////////////////////////////////// 3164 3165 #define EVENT_MAP_GC_DISCARD_FRACTION 0.5 3166 3167 /* This is in two parts: 3168 3169 1. A hash table of RCECs. This is a set of reference-counted stack 3170 traces. When the reference count of a stack trace becomes zero, 3171 it is removed from the set and freed up. The intent is to have 3172 a set of stack traces which can be referred to from (2), but to 3173 only represent each one once. The set is indexed/searched by 3174 ordering on the stack trace vectors. 3175 3176 2. A SparseWA of OldRefs. These store information about each old 3177 ref that we need to record. It is indexed by address of the 3178 location for which the information is recorded. For LRU 3179 purposes, each OldRef also contains a generation number, 3180 indicating when it was most recently accessed. 3181 3182 The important part of an OldRef is, however, its accs[] array. 3183 This is an array of N_OLDREF_ACCS which binds (thread, R/W, 3184 size) triples to RCECs. This allows us to collect the last 3185 access-traceback by up to N_OLDREF_ACCS different triples for 3186 this location. The accs[] array is a MTF-array. If a binding 3187 falls off the end, that's too bad -- we will lose info about 3188 that triple's access to this location. 3189 3190 When the SparseWA becomes too big, we can throw away the OldRefs 3191 whose generation numbers are below some threshold; hence doing 3192 approximate LRU discarding. For each discarded OldRef we must 3193 of course decrement the reference count on the all RCECs it 3194 refers to, in order that entries from (1) eventually get 3195 discarded too. 3196 3197 A major improvement in reliability of this mechanism would be to 3198 have a dynamically sized OldRef.accs[] array, so no entries ever 3199 fall off the end. In investigations (Dec 08) it appears that a 3200 major cause for the non-availability of conflicting-access traces 3201 in race reports is caused by the fixed size of this array. I 3202 suspect for most OldRefs, only a few entries are used, but for a 3203 minority of cases there is an overflow, leading to info lossage. 3204 Investigations also suggest this is very workload and scheduling 3205 sensitive. 
Therefore a dynamic sizing would be better. 3206 3207 However, dynamic sizing would defeat the use of a GroupAllocator 3208 for OldRef structures. And that's important for performance. So 3209 it's not straightforward to do. 3210 */ 3211 3212 3213 static UWord stats__ctxt_rcdec1 = 0; 3214 static UWord stats__ctxt_rcdec2 = 0; 3215 static UWord stats__ctxt_rcdec3 = 0; 3216 static UWord stats__ctxt_rcdec_calls = 0; 3217 static UWord stats__ctxt_rcdec_discards = 0; 3218 static UWord stats__ctxt_rcdec1_eq = 0; 3219 3220 static UWord stats__ctxt_tab_curr = 0; 3221 static UWord stats__ctxt_tab_max = 0; 3222 3223 static UWord stats__ctxt_tab_qs = 0; 3224 static UWord stats__ctxt_tab_cmps = 0; 3225 3226 3227 /////////////////////////////////////////////////////// 3228 //// Part (1): A hash table of RCECs 3229 /// 3230 3231 #define N_FRAMES 8 3232 3233 // (UInt) `echo "Reference Counted Execution Context" | md5sum` 3234 #define RCEC_MAGIC 0xab88abb2UL 3235 3236 //#define N_RCEC_TAB 98317 /* prime */ 3237 #define N_RCEC_TAB 196613 /* prime */ 3238 3239 typedef 3240 struct _RCEC { 3241 UWord magic; /* sanity check only */ 3242 struct _RCEC* next; 3243 UWord rc; 3244 UWord rcX; /* used for crosschecking */ 3245 UWord frames_hash; /* hash of all the frames */ 3246 UWord frames[N_FRAMES]; 3247 } 3248 RCEC; 3249 3250 static RCEC** contextTab = NULL; /* hash table of RCEC*s */ 3251 3252 3253 /* Gives an arbitrary total order on RCEC .frames fields */ 3254 static Word RCEC__cmp_by_frames ( RCEC* ec1, RCEC* ec2 ) { 3255 Word i; 3256 tl_assert(ec1 && ec1->magic == RCEC_MAGIC); 3257 tl_assert(ec2 && ec2->magic == RCEC_MAGIC); 3258 if (ec1->frames_hash < ec2->frames_hash) return -1; 3259 if (ec1->frames_hash > ec2->frames_hash) return 1; 3260 for (i = 0; i < N_FRAMES; i++) { 3261 if (ec1->frames[i] < ec2->frames[i]) return -1; 3262 if (ec1->frames[i] > ec2->frames[i]) return 1; 3263 } 3264 return 0; 3265 } 3266 3267 3268 /* Dec the ref of this RCEC. */ 3269 static void ctxt__rcdec ( RCEC* ec ) 3270 { 3271 stats__ctxt_rcdec_calls++; 3272 tl_assert(ec && ec->magic == RCEC_MAGIC); 3273 tl_assert(ec->rc > 0); 3274 ec->rc--; 3275 } 3276 3277 static void ctxt__rcinc ( RCEC* ec ) 3278 { 3279 tl_assert(ec && ec->magic == RCEC_MAGIC); 3280 ec->rc++; 3281 } 3282 3283 3284 //////////// BEGIN RCEC group allocator 3285 static GroupAlloc rcec_group_allocator; 3286 3287 static RCEC* alloc_RCEC ( void ) { 3288 return gal_Alloc ( &rcec_group_allocator ); 3289 } 3290 3291 static void free_RCEC ( RCEC* rcec ) { 3292 tl_assert(rcec->magic == RCEC_MAGIC); 3293 gal_Free( &rcec_group_allocator, rcec ); 3294 } 3295 //////////// END RCEC group allocator 3296 3297 3298 /* Find 'ec' in the RCEC list whose head pointer lives at 'headp' and 3299 move it one step closer the the front of the list, so as to make 3300 subsequent searches for it cheaper. */ 3301 static void move_RCEC_one_step_forward ( RCEC** headp, RCEC* ec ) 3302 { 3303 RCEC *ec0, *ec1, *ec2; 3304 if (ec == *headp) 3305 tl_assert(0); /* already at head of list */ 3306 tl_assert(ec != NULL); 3307 ec0 = *headp; 3308 ec1 = NULL; 3309 ec2 = NULL; 3310 while (True) { 3311 if (ec0 == NULL || ec0 == ec) break; 3312 ec2 = ec1; 3313 ec1 = ec0; 3314 ec0 = ec0->next; 3315 } 3316 tl_assert(ec0 == ec); 3317 if (ec0 != NULL && ec1 != NULL && ec2 != NULL) { 3318 RCEC* tmp; 3319 /* ec0 points to ec, ec1 to its predecessor, and ec2 to ec1's 3320 predecessor. Swap ec0 and ec1, that is, move ec0 one step 3321 closer to the start of the list. 
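         For example (illustrative): a chain A -> B -> C -> D with
         ec == C becomes A -> C -> B -> D.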
*/ 3322 tl_assert(ec2->next == ec1); 3323 tl_assert(ec1->next == ec0); 3324 tmp = ec0->next; 3325 ec2->next = ec0; 3326 ec0->next = ec1; 3327 ec1->next = tmp; 3328 } 3329 else 3330 if (ec0 != NULL && ec1 != NULL && ec2 == NULL) { 3331 /* it's second in the list. */ 3332 tl_assert(*headp == ec1); 3333 tl_assert(ec1->next == ec0); 3334 ec1->next = ec0->next; 3335 ec0->next = ec1; 3336 *headp = ec0; 3337 } 3338 } 3339 3340 3341 /* Find the given RCEC in the tree, and return a pointer to it. Or, 3342 if not present, add the given one to the tree (by making a copy of 3343 it, so the caller can immediately deallocate the original) and 3344 return a pointer to the copy. The caller can safely have 'example' 3345 on its stack, since we will always return a pointer to a copy of 3346 it, not to the original. Note that the inserted node will have .rc 3347 of zero and so the caller must immediatly increment it. */ 3348 __attribute__((noinline)) 3349 static RCEC* ctxt__find_or_add ( RCEC* example ) 3350 { 3351 UWord hent; 3352 RCEC* copy; 3353 tl_assert(example && example->magic == RCEC_MAGIC); 3354 tl_assert(example->rc == 0); 3355 3356 /* Search the hash table to see if we already have it. */ 3357 stats__ctxt_tab_qs++; 3358 hent = example->frames_hash % N_RCEC_TAB; 3359 copy = contextTab[hent]; 3360 while (1) { 3361 if (!copy) break; 3362 tl_assert(copy->magic == RCEC_MAGIC); 3363 stats__ctxt_tab_cmps++; 3364 if (0 == RCEC__cmp_by_frames(copy, example)) break; 3365 copy = copy->next; 3366 } 3367 3368 if (copy) { 3369 tl_assert(copy != example); 3370 /* optimisation: if it's not at the head of its list, move 1 3371 step fwds, to make future searches cheaper */ 3372 if (copy != contextTab[hent]) { 3373 move_RCEC_one_step_forward( &contextTab[hent], copy ); 3374 } 3375 } else { 3376 copy = alloc_RCEC(); 3377 tl_assert(copy != example); 3378 *copy = *example; 3379 copy->next = contextTab[hent]; 3380 contextTab[hent] = copy; 3381 stats__ctxt_tab_curr++; 3382 if (stats__ctxt_tab_curr > stats__ctxt_tab_max) 3383 stats__ctxt_tab_max = stats__ctxt_tab_curr; 3384 } 3385 return copy; 3386 } 3387 3388 static inline UWord ROLW ( UWord w, Int n ) 3389 { 3390 Int bpw = 8 * sizeof(UWord); 3391 w = (w << n) | (w >> (bpw-n)); 3392 return w; 3393 } 3394 3395 __attribute__((noinline)) 3396 static RCEC* get_RCEC ( Thr* thr ) 3397 { 3398 UWord hash, i; 3399 RCEC example; 3400 example.magic = RCEC_MAGIC; 3401 example.rc = 0; 3402 example.rcX = 0; 3403 main_get_stacktrace( thr, &example.frames[0], N_FRAMES ); 3404 hash = 0; 3405 for (i = 0; i < N_FRAMES; i++) { 3406 hash ^= example.frames[i]; 3407 hash = ROLW(hash, 19); 3408 } 3409 example.frames_hash = hash; 3410 return ctxt__find_or_add( &example ); 3411 } 3412 3413 /////////////////////////////////////////////////////// 3414 //// Part (2): 3415 /// A SparseWA guest-addr -> OldRef, that refers to (1) 3416 /// 3417 3418 // (UInt) `echo "Old Reference Information" | md5sum` 3419 #define OldRef_MAGIC 0x30b1f075UL 3420 3421 /* Records an access: a thread and a context. 
The size 3422 (1,2,4,8) and read-or-writeness are also encoded as 3423 follows: bottom bit of .thr is 1 if write, 0 if read 3424 bottom 2 bits of .rcec are encode size: 3425 00 = 1, 01 = 2, 10 = 4, 11 = 8 3426 */ 3427 typedef struct { Thr* thr; RCEC* rcec; } Thr_n_RCEC; 3428 3429 #define N_OLDREF_ACCS 5 3430 3431 typedef 3432 struct { 3433 UWord magic; /* sanity check only */ 3434 UWord gen; /* when most recently accessed */ 3435 /* or free list when not in use */ 3436 /* unused slots in this array have .thr == NULL */ 3437 Thr_n_RCEC accs[N_OLDREF_ACCS]; 3438 } 3439 OldRef; 3440 3441 3442 //////////// BEGIN OldRef group allocator 3443 static GroupAlloc oldref_group_allocator; 3444 3445 static OldRef* alloc_OldRef ( void ) { 3446 return gal_Alloc ( &oldref_group_allocator ); 3447 } 3448 3449 static void free_OldRef ( OldRef* r ) { 3450 tl_assert(r->magic == OldRef_MAGIC); 3451 gal_Free( &oldref_group_allocator, r ); 3452 } 3453 //////////// END OldRef group allocator 3454 3455 3456 static SparseWA* oldrefTree = NULL; /* SparseWA* OldRef* */ 3457 static UWord oldrefGen = 0; /* current LRU generation # */ 3458 static UWord oldrefTreeN = 0; /* # elems in oldrefTree */ 3459 static UWord oldrefGenIncAt = 0; /* inc gen # when size hits this */ 3460 3461 inline static void* ptr_or_UWord ( void* p, UWord w ) { 3462 return (void*)( ((UWord)p) | ((UWord)w) ); 3463 } 3464 inline static void* ptr_and_UWord ( void* p, UWord w ) { 3465 return (void*)( ((UWord)p) & ((UWord)w) ); 3466 } 3467 3468 inline static UInt min_UInt ( UInt a, UInt b ) { 3469 return a < b ? a : b; 3470 } 3471 3472 /* Compare the intervals [a1,a1+n1) and [a2,a2+n2). Return -1 if the 3473 first interval is lower, 1 if the first interval is higher, and 0 3474 if there is any overlap. Redundant paranoia with casting is there 3475 following what looked distinctly like a bug in gcc-4.1.2, in which 3476 some of the comparisons were done signedly instead of 3477 unsignedly. */ 3478 /* Copied from exp-ptrcheck/sg_main.c */ 3479 static Word cmp_nonempty_intervals ( Addr a1, SizeT n1, 3480 Addr a2, SizeT n2 ) { 3481 UWord a1w = (UWord)a1; 3482 UWord n1w = (UWord)n1; 3483 UWord a2w = (UWord)a2; 3484 UWord n2w = (UWord)n2; 3485 tl_assert(n1w > 0 && n2w > 0); 3486 if (a1w + n1w <= a2w) return -1L; 3487 if (a2w + n2w <= a1w) return 1L; 3488 return 0; 3489 } 3490 3491 static void event_map_bind ( Addr a, SizeT szB, Bool isW, Thr* thr ) 3492 { 3493 OldRef* ref; 3494 RCEC* rcec; 3495 Word i, j; 3496 UWord keyW, valW; 3497 Bool b; 3498 3499 rcec = get_RCEC( thr ); 3500 ctxt__rcinc(rcec); 3501 3502 /* encode the size and writeness of the transaction in the bottom 3503 two bits of thr and rcec. */ 3504 thr = ptr_or_UWord(thr, isW ? 1 : 0); 3505 switch (szB) { 3506 /* This doesn't look particularly branch-predictor friendly. */ 3507 case 1: rcec = ptr_or_UWord(rcec, 0); break; 3508 case 2: rcec = ptr_or_UWord(rcec, 1); break; 3509 case 4: rcec = ptr_or_UWord(rcec, 2); break; 3510 case 8: rcec = ptr_or_UWord(rcec, 3); break; 3511 default: tl_assert(0); 3512 } 3513 3514 /* Look in the map to see if we already have this. */ 3515 b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, a ); 3516 3517 if (b) { 3518 3519 /* We already have a record for this address. We now need to 3520 see if we have a stack trace pertaining to this (thread, R/W, 3521 size) triple. 
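         The triple is matched by comparing the tag-encoded .thr pointer
         (thread plus R/W bit in its bottom bit) and the bottom two bits
         of .rcec (the size code), exactly as encoded a few lines above.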
*/ 3522 tl_assert(keyW == a); 3523 ref = (OldRef*)valW; 3524 tl_assert(ref->magic == OldRef_MAGIC); 3525 3526 tl_assert(thr); 3527 for (i = 0; i < N_OLDREF_ACCS; i++) { 3528 if (ref->accs[i].thr != thr) 3529 continue; 3530 /* since .thr encodes both the accessing thread and the 3531 read/writeness, we know now that at least those features 3532 of the access match this entry. So we just need to check 3533 the size indication. Do this by inspecting the lowest 2 bits of 3534 .rcec, which contain the encoded size info. */ 3535 if (ptr_and_UWord(ref->accs[i].rcec,3) != ptr_and_UWord(rcec,3)) 3536 continue; 3537 /* else we have a match, so stop looking. */ 3538 break; 3539 } 3540 3541 if (i < N_OLDREF_ACCS) { 3542 /* thread 'thr' has an entry at index 'i'. Update it. */ 3543 if (i > 0) { 3544 Thr_n_RCEC tmp = ref->accs[i-1]; 3545 ref->accs[i-1] = ref->accs[i]; 3546 ref->accs[i] = tmp; 3547 i--; 3548 } 3549 if (rcec == ref->accs[i].rcec) stats__ctxt_rcdec1_eq++; 3550 stats__ctxt_rcdec1++; 3551 ctxt__rcdec( ptr_and_UWord(ref->accs[i].rcec, ~3) ); 3552 ref->accs[i].rcec = rcec; 3553 tl_assert(ref->accs[i].thr == thr); 3554 } else { 3555 /* No entry for this (thread, R/W, size) triple. Shuffle all 3556 of them down one slot, and put the new entry at the start 3557 of the array. */ 3558 if (ref->accs[N_OLDREF_ACCS-1].thr) { 3559 /* the last slot is in use. We must dec the rc on the 3560 associated rcec. */ 3561 tl_assert(ref->accs[N_OLDREF_ACCS-1].rcec); 3562 stats__ctxt_rcdec2++; 3563 if (0 && 0 == (stats__ctxt_rcdec2 & 0xFFF)) 3564 VG_(printf)("QQQQ %lu overflows\n",stats__ctxt_rcdec2); 3565 ctxt__rcdec( ptr_and_UWord(ref->accs[N_OLDREF_ACCS-1].rcec, ~3) ); 3566 } else { 3567 tl_assert(!ref->accs[N_OLDREF_ACCS-1].rcec); 3568 } 3569 for (j = N_OLDREF_ACCS-1; j >= 1; j--) 3570 ref->accs[j] = ref->accs[j-1]; 3571 ref->accs[0].thr = thr; 3572 ref->accs[0].rcec = rcec; 3573 /* thr==NULL is used to signify an empty slot, so we can't 3574 add a NULL thr. */ 3575 tl_assert(ptr_and_UWord(thr, ~3) != 0); 3576 } 3577 3578 ref->gen = oldrefGen; 3579 3580 } else { 3581 3582 /* We don't have a record for this address. Create a new one. */ 3583 if (oldrefTreeN >= oldrefGenIncAt) { 3584 oldrefGen++; 3585 oldrefGenIncAt = oldrefTreeN + 50000; 3586 if (0) VG_(printf)("oldrefTree: new gen %lu at size %lu\n", 3587 oldrefGen, oldrefTreeN ); 3588 } 3589 3590 ref = alloc_OldRef(); 3591 ref->magic = OldRef_MAGIC; 3592 ref->gen = oldrefGen; 3593 ref->accs[0].rcec = rcec; 3594 ref->accs[0].thr = thr; 3595 /* thr==NULL is used to signify an empty slot, so we can't add a 3596 NULL thr. 
*/ 3597 tl_assert(ptr_and_UWord(thr, ~3) != 0); 3598 for (j = 1; j < N_OLDREF_ACCS; j++) { 3599 ref->accs[j].thr = NULL; 3600 ref->accs[j].rcec = NULL; 3601 } 3602 VG_(addToSWA)( oldrefTree, a, (UWord)ref ); 3603 oldrefTreeN++; 3604 3605 } 3606 } 3607 3608 3609 Bool libhb_event_map_lookup ( /*OUT*/ExeContext** resEC, 3610 /*OUT*/Thr** resThr, 3611 /*OUT*/SizeT* resSzB, 3612 /*OUT*/Bool* resIsW, 3613 Thr* thr, Addr a, SizeT szB, Bool isW ) 3614 { 3615 Word i, j; 3616 OldRef* ref; 3617 UWord keyW, valW; 3618 Bool b; 3619 3620 Thr* cand_thr; 3621 RCEC* cand_rcec; 3622 Bool cand_isW; 3623 SizeT cand_szB; 3624 Addr cand_a; 3625 3626 Addr toCheck[15]; 3627 Int nToCheck = 0; 3628 3629 tl_assert(thr); 3630 tl_assert(szB == 8 || szB == 4 || szB == 2 || szB == 1); 3631 3632 toCheck[nToCheck++] = a; 3633 for (i = -7; i < (Word)szB; i++) { 3634 if (i != 0) 3635 toCheck[nToCheck++] = a + i; 3636 } 3637 tl_assert(nToCheck <= 15); 3638 3639 /* Now see if we can find a suitable matching event for 3640 any of the addresses in toCheck[0 .. nToCheck-1]. */ 3641 for (j = 0; j < nToCheck; j++) { 3642 3643 cand_a = toCheck[j]; 3644 // VG_(printf)("test %ld %p\n", j, cand_a); 3645 3646 b = VG_(lookupSWA)( oldrefTree, &keyW, &valW, cand_a ); 3647 if (!b) 3648 continue; 3649 3650 ref = (OldRef*)valW; 3651 tl_assert(keyW == cand_a); 3652 tl_assert(ref->magic == OldRef_MAGIC); 3653 tl_assert(ref->accs[0].thr); /* first slot must always be used */ 3654 3655 cand_thr = NULL; 3656 cand_rcec = NULL; 3657 cand_isW = False; 3658 cand_szB = 0; 3659 3660 for (i = 0; i < N_OLDREF_ACCS; i++) { 3661 Thr_n_RCEC* cand = &ref->accs[i]; 3662 cand_thr = ptr_and_UWord(cand->thr, ~3); 3663 cand_rcec = ptr_and_UWord(cand->rcec, ~3); 3664 /* Decode the writeness from the bottom bit of .thr. */ 3665 cand_isW = 1 == (UWord)ptr_and_UWord(cand->thr, 1); 3666 /* Decode the size from the bottom two bits of .rcec. */ 3667 switch ((UWord)ptr_and_UWord(cand->rcec, 3)) { 3668 case 0: cand_szB = 1; break; 3669 case 1: cand_szB = 2; break; 3670 case 2: cand_szB = 4; break; 3671 case 3: cand_szB = 8; break; 3672 default: tl_assert(0); 3673 } 3674 3675 if (cand_thr == NULL) 3676 /* This slot isn't in use. Ignore it. */ 3677 continue; 3678 3679 if (cand_thr == thr) 3680 /* This is an access by the same thread, but we're only 3681 interested in accesses from other threads. Ignore. */ 3682 continue; 3683 3684 if ((!cand_isW) && (!isW)) 3685 /* We don't want to report a read racing against another 3686 read; that's stupid. So in this case move on. */ 3687 continue; 3688 3689 if (cmp_nonempty_intervals(a, szB, cand_a, cand_szB) != 0) 3690 /* No overlap with the access we're asking about. Ignore. */ 3691 continue; 3692 3693 /* We have a match. Stop searching. */ 3694 break; 3695 } 3696 3697 tl_assert(i >= 0 && i <= N_OLDREF_ACCS); 3698 3699 if (i < N_OLDREF_ACCS) { 3700 Int n, maxNFrames; 3701 /* return with success */ 3702 tl_assert(cand_thr); 3703 tl_assert(cand_rcec); 3704 tl_assert(cand_rcec->magic == RCEC_MAGIC); 3705 tl_assert(cand_szB >= 1); 3706 /* Count how many non-zero frames we have. */ 3707 maxNFrames = min_UInt(N_FRAMES, VG_(clo_backtrace_size)); 3708 for (n = 0; n < maxNFrames; n++) { 3709 if (0 == cand_rcec->frames[n]) break; 3710 } 3711 *resEC = VG_(make_ExeContext_from_StackTrace)(cand_rcec->frames, n); 3712 *resThr = cand_thr; 3713 *resSzB = cand_szB; 3714 *resIsW = cand_isW; 3715 return True; 3716 } 3717 3718 /* consider next address in toCheck[] */ 3719 } /* for (j = 0; j < nToCheck; j++) */ 3720 3721 /* really didn't find anything. 
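     Note that toCheck[] above covers every start address in the range
     [a-7 .. a+szB-1].  Recorded accesses are at most 8 bytes long, so
     any recorded access overlapping [a, a+szB) must begin at one of
     those addresses; reaching this point therefore means the map holds
     no suitable conflicting access, not merely that we looked in the
     wrong place.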
*/ 3722 return False; 3723 } 3724 3725 static void event_map_init ( void ) 3726 { 3727 Word i; 3728 3729 /* Context (RCEC) group allocator */ 3730 init_GroupAlloc ( &rcec_group_allocator, 3731 sizeof(RCEC), 3732 1000 /* RCECs per group */, 3733 HG_(zalloc), 3734 "libhb.event_map_init.1 (RCEC groups)", 3735 HG_(free) ); 3736 3737 /* Context table */ 3738 tl_assert(!contextTab); 3739 contextTab = HG_(zalloc)( "libhb.event_map_init.2 (context table)", 3740 N_RCEC_TAB * sizeof(RCEC*) ); 3741 tl_assert(contextTab); 3742 for (i = 0; i < N_RCEC_TAB; i++) 3743 contextTab[i] = NULL; 3744 3745 /* Oldref group allocator */ 3746 init_GroupAlloc ( &oldref_group_allocator, 3747 sizeof(OldRef), 3748 1000 /* OldRefs per group */, 3749 HG_(zalloc), 3750 "libhb.event_map_init.3 (OldRef groups)", 3751 HG_(free) ); 3752 3753 /* Oldref tree */ 3754 tl_assert(!oldrefTree); 3755 oldrefTree = VG_(newSWA)( 3756 HG_(zalloc), 3757 "libhb.event_map_init.4 (oldref tree)", 3758 HG_(free) 3759 ); 3760 tl_assert(oldrefTree); 3761 3762 oldrefGen = 0; 3763 oldrefGenIncAt = 0; 3764 oldrefTreeN = 0; 3765 } 3766 3767 static void event_map__check_reference_counts ( Bool before ) 3768 { 3769 RCEC* rcec; 3770 OldRef* oldref; 3771 Word i; 3772 UWord nEnts = 0; 3773 UWord keyW, valW; 3774 3775 /* Set the 'check' reference counts to zero. Also, optionally 3776 check that the real reference counts are non-zero. We allow 3777 these to fall to zero before a GC, but the GC must get rid of 3778 all those that are zero, hence none should be zero after a 3779 GC. */ 3780 for (i = 0; i < N_RCEC_TAB; i++) { 3781 for (rcec = contextTab[i]; rcec; rcec = rcec->next) { 3782 nEnts++; 3783 tl_assert(rcec); 3784 tl_assert(rcec->magic == RCEC_MAGIC); 3785 if (!before) 3786 tl_assert(rcec->rc > 0); 3787 rcec->rcX = 0; 3788 } 3789 } 3790 3791 /* check that the stats are sane */ 3792 tl_assert(nEnts == stats__ctxt_tab_curr); 3793 tl_assert(stats__ctxt_tab_curr <= stats__ctxt_tab_max); 3794 3795 /* visit all the referencing points, inc check ref counts */ 3796 VG_(initIterSWA)( oldrefTree ); 3797 while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) { 3798 oldref = (OldRef*)valW; 3799 tl_assert(oldref->magic == OldRef_MAGIC); 3800 for (i = 0; i < N_OLDREF_ACCS; i++) { 3801 Thr* aThr = ptr_and_UWord(oldref->accs[i].thr, ~3); 3802 RCEC* aRef = ptr_and_UWord(oldref->accs[i].rcec, ~3); 3803 if (aThr) { 3804 tl_assert(aRef); 3805 tl_assert(aRef->magic == RCEC_MAGIC); 3806 aRef->rcX++; 3807 } else { 3808 tl_assert(!aRef); 3809 } 3810 } 3811 } 3812 3813 /* compare check ref counts with actual */ 3814 for (i = 0; i < N_RCEC_TAB; i++) { 3815 for (rcec = contextTab[i]; rcec; rcec = rcec->next) { 3816 tl_assert(rcec->rc == rcec->rcX); 3817 } 3818 } 3819 } 3820 3821 __attribute__((noinline)) 3822 static void event_map_maybe_GC ( void ) 3823 { 3824 OldRef* oldref; 3825 UWord keyW, valW, retained, maxGen; 3826 XArray* refs2del; 3827 Word i, j, n2del; 3828 3829 UWord* genMap = NULL; 3830 UWord genMap_min = 0; 3831 UWord genMap_size = 0; 3832 3833 if (LIKELY(oldrefTreeN < HG_(clo_conflict_cache_size))) 3834 return; 3835 3836 if (0) 3837 VG_(printf)("libhb: event_map GC at size %lu\n", oldrefTreeN); 3838 3839 /* Check for sane command line params. Limit values must match 3840 those in hg_process_cmd_line_option. 
*/ 3841 tl_assert( HG_(clo_conflict_cache_size) >= 10*1000 ); 3842 tl_assert( HG_(clo_conflict_cache_size) <= 30*1000*1000 ); 3843 3844 /* Check our counting is sane (expensive) */ 3845 if (CHECK_CEM) 3846 tl_assert(oldrefTreeN == VG_(sizeSWA)( oldrefTree )); 3847 3848 /* Check the reference counts (expensive) */ 3849 if (CHECK_CEM) 3850 event_map__check_reference_counts( True/*before*/ ); 3851 3852 /* Compute the distribution of generation values in the ref tree. 3853 There are likely only to be a few different generation numbers 3854 in the whole tree, but we don't know what they are. Hence use a 3855 dynamically resized array of counters. The array is genMap[0 3856 .. genMap_size-1], where genMap[0] is the count for the 3857 generation number genMap_min, genMap[1] is the count for 3858 genMap_min+1, etc. If a new number is seen outside the range 3859 [genMap_min .. genMap_min + genMap_size - 1] then the array is 3860 copied into a larger array, and genMap_min and genMap_size are 3861 adjusted accordingly. */ 3862 3863 /* genMap :: generation-number -> count-of-nodes-with-that-number */ 3864 3865 VG_(initIterSWA)( oldrefTree ); 3866 while ( VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) { 3867 3868 UWord ea, key; 3869 oldref = (OldRef*)valW; 3870 key = oldref->gen; 3871 3872 /* BEGIN find 'ea', which is the index in genMap holding the 3873 count for generation number 'key'. */ 3874 if (UNLIKELY(genMap == NULL)) { 3875 /* deal with the first key to be seen, so that the following 3876 cases don't need to handle the complexity of a NULL count 3877 array. */ 3878 genMap_min = key; 3879 genMap_size = 1; 3880 genMap = HG_(zalloc)( "libhb.emmG.1a", 3881 genMap_size * sizeof(UWord) ); 3882 ea = 0; 3883 if (0) VG_(printf)("(%lu) case 1 [%lu .. %lu]\n", 3884 key, genMap_min, genMap_min+genMap_size- 1 ); 3885 } 3886 else 3887 if (LIKELY(key >= genMap_min && key < genMap_min + genMap_size)) { 3888 /* this is the expected (almost-always-happens) case: 'key' 3889 is already mapped in the array. */ 3890 ea = key - genMap_min; 3891 } 3892 else 3893 if (key < genMap_min) { 3894 /* 'key' appears before the start of the current array. 3895 Extend the current array by allocating a larger one and 3896 copying the current one to the upper end of it. */ 3897 Word more; 3898 UWord* map2; 3899 more = genMap_min - key; 3900 tl_assert(more > 0); 3901 map2 = HG_(zalloc)( "libhb.emmG.1b", 3902 (genMap_size + more) * sizeof(UWord) ); 3903 VG_(memcpy)( &map2[more], genMap, genMap_size * sizeof(UWord) ); 3904 HG_(free)( genMap ); 3905 genMap = map2; 3906 genMap_size += more; 3907 genMap_min -= more; 3908 ea = 0; 3909 tl_assert(genMap_min == key); 3910 if (0) VG_(printf)("(%lu) case 2 [%lu .. %lu]\n", 3911 key, genMap_min, genMap_min+genMap_size- 1 ); 3912 } 3913 else { 3914 /* 'key' appears after the end of the current array. Extend 3915 the current array by allocating a larger one and copying 3916 the current one to the lower end of it. */ 3917 Word more; 3918 UWord* map2; 3919 tl_assert(key >= genMap_min + genMap_size); 3920 more = key - (genMap_min + genMap_size) + 1; 3921 tl_assert(more > 0); 3922 map2 = HG_(zalloc)( "libhb.emmG.1c", 3923 (genMap_size + more) * sizeof(UWord) ); 3924 VG_(memcpy)( &map2[0], genMap, genMap_size * sizeof(UWord) ); 3925 HG_(free)( genMap ); 3926 genMap = map2; 3927 genMap_size += more; 3928 ea = genMap_size - 1;; 3929 tl_assert(genMap_min + genMap_size - 1 == key); 3930 if (0) VG_(printf)("(%lu) case 3 [%lu .. 
%lu]\n", 3931 key, genMap_min, genMap_min+genMap_size- 1 ); 3932 } 3933 /* END find 'ea' from 'key' */ 3934 3935 tl_assert(ea >= 0 && ea < genMap_size); 3936 /* and the whole point of this elaborate computation of 'ea' is .. */ 3937 genMap[ea]++; 3938 } 3939 3940 tl_assert(genMap); 3941 tl_assert(genMap_size > 0); 3942 3943 /* Sanity check what we just computed */ 3944 { UWord sum = 0; 3945 for (i = 0; i < genMap_size; i++) { 3946 if (0) VG_(printf)(" xxx: gen %ld has %lu\n", 3947 i + genMap_min, genMap[i] ); 3948 sum += genMap[i]; 3949 } 3950 tl_assert(sum == oldrefTreeN); 3951 } 3952 3953 /* Figure out how many generations to throw away */ 3954 retained = oldrefTreeN; 3955 maxGen = 0; 3956 3957 for (i = 0; i < genMap_size; i++) { 3958 keyW = i + genMap_min; 3959 valW = genMap[i]; 3960 tl_assert(keyW > 0); /* can't allow a generation # 0 */ 3961 if (0) VG_(printf)(" XXX: gen %lu has %lu\n", keyW, valW ); 3962 tl_assert(keyW >= maxGen); 3963 tl_assert(retained >= valW); 3964 if (retained - valW 3965 > (UWord)(HG_(clo_conflict_cache_size) 3966 * EVENT_MAP_GC_DISCARD_FRACTION)) { 3967 retained -= valW; 3968 maxGen = keyW; 3969 } else { 3970 break; 3971 } 3972 } 3973 3974 HG_(free)(genMap); 3975 3976 tl_assert(retained >= 0 && retained <= oldrefTreeN); 3977 3978 /* Now make up a big list of the oldrefTree entries we want to 3979 delete. We can't simultaneously traverse the tree and delete 3980 stuff from it, so first we need to copy them off somewhere 3981 else. (sigh) */ 3982 refs2del = VG_(newXA)( HG_(zalloc), "libhb.emmG.2", 3983 HG_(free), sizeof(Addr) ); 3984 3985 if (retained < oldrefTreeN) { 3986 3987 /* This is the normal (expected) case. We discard any ref whose 3988 generation number <= maxGen. */ 3989 VG_(initIterSWA)( oldrefTree ); 3990 while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) { 3991 oldref = (OldRef*)valW; 3992 tl_assert(oldref->magic == OldRef_MAGIC); 3993 if (oldref->gen <= maxGen) { 3994 VG_(addToXA)( refs2del, &keyW ); 3995 } 3996 } 3997 if (VG_(clo_stats)) { 3998 VG_(message)(Vg_DebugMsg, 3999 "libhb: EvM GC: delete generations %lu and below, " 4000 "retaining %lu entries\n", 4001 maxGen, retained ); 4002 } 4003 4004 } else { 4005 4006 static UInt rand_seed = 0; /* leave as static */ 4007 4008 /* Degenerate case: there's only one generation in the entire 4009 tree, so we need to have some other way of deciding which 4010 refs to throw away. Just throw out half of them randomly. 
*/ 4011 tl_assert(retained == oldrefTreeN); 4012 VG_(initIterSWA)( oldrefTree ); 4013 while (VG_(nextIterSWA)( oldrefTree, &keyW, &valW )) { 4014 UInt n; 4015 oldref = (OldRef*)valW; 4016 tl_assert(oldref->magic == OldRef_MAGIC); 4017 n = VG_(random)( &rand_seed ); 4018 if ((n & 0xFFF) < 0x800) { 4019 VG_(addToXA)( refs2del, &keyW ); 4020 retained--; 4021 } 4022 } 4023 if (VG_(clo_stats)) { 4024 VG_(message)(Vg_DebugMsg, 4025 "libhb: EvM GC: randomly delete half the entries, " 4026 "retaining %lu entries\n", 4027 retained ); 4028 } 4029 4030 } 4031 4032 n2del = VG_(sizeXA)( refs2del ); 4033 tl_assert(n2del == (Word)(oldrefTreeN - retained)); 4034 4035 if (0) VG_(printf)("%s","deleting entries\n"); 4036 for (i = 0; i < n2del; i++) { 4037 Bool b; 4038 Addr ga2del = *(Addr*)VG_(indexXA)( refs2del, i ); 4039 b = VG_(delFromSWA)( oldrefTree, &keyW, &valW, ga2del ); 4040 tl_assert(b); 4041 tl_assert(keyW == ga2del); 4042 oldref = (OldRef*)valW; 4043 for (j = 0; j < N_OLDREF_ACCS; j++) { 4044 Thr* aThr = ptr_and_UWord(oldref->accs[j].thr, ~3); 4045 RCEC* aRef = ptr_and_UWord(oldref->accs[j].rcec, ~3); 4046 if (aRef) { 4047 tl_assert(aThr); 4048 stats__ctxt_rcdec3++; 4049 ctxt__rcdec( aRef ); 4050 } else { 4051 tl_assert(!aThr); 4052 } 4053 } 4054 4055 free_OldRef( oldref ); 4056 } 4057 4058 VG_(deleteXA)( refs2del ); 4059 4060 tl_assert( VG_(sizeSWA)( oldrefTree ) == retained ); 4061 4062 oldrefTreeN = retained; 4063 oldrefGenIncAt = oldrefTreeN; /* start new gen right away */ 4064 4065 /* Throw away all RCECs with zero reference counts */ 4066 for (i = 0; i < N_RCEC_TAB; i++) { 4067 RCEC** pp = &contextTab[i]; 4068 RCEC* p = *pp; 4069 while (p) { 4070 if (p->rc == 0) { 4071 *pp = p->next; 4072 free_RCEC(p); 4073 p = *pp; 4074 tl_assert(stats__ctxt_tab_curr > 0); 4075 stats__ctxt_tab_curr--; 4076 } else { 4077 pp = &p->next; 4078 p = p->next; 4079 } 4080 } 4081 } 4082 4083 /* Check the reference counts (expensive) */ 4084 if (CHECK_CEM) 4085 event_map__check_reference_counts( False/*after*/ ); 4086 4087 //if (0) 4088 //VG_(printf)("XXXX final sizes: oldrefTree %ld, contextTree %ld\n\n", 4089 // VG_(OSetGen_Size)(oldrefTree), VG_(OSetGen_Size)(contextTree)); 4090 4091 } 4092 4093 4094 ///////////////////////////////////////////////////////// 4095 // // 4096 // Core MSM // 4097 // // 4098 ///////////////////////////////////////////////////////// 4099 4100 /* Logic in msmcread/msmcwrite updated/verified after re-analysis, 19 4101 Nov 08, and again after [...], 4102 June 09. */ 4103 4104 static ULong stats__msmcread = 0; 4105 static ULong stats__msmcread_change = 0; 4106 static ULong stats__msmcwrite = 0; 4107 static ULong stats__msmcwrite_change = 0; 4108 4109 /* Some notes on the H1 history mechanism: 4110 4111 Transition rules are: 4112 4113 read_{Kr,Kw}(Cr,Cw) = (Cr, Cr `join` Kw) 4114 write_{Kr,Kw}(Cr,Cw) = (Cr `join` Kw, Cr `join` Kw) 4115 4116 After any access by a thread T to a location L, L's constraint pair 4117 (Cr,Cw) has Cw[T] == T's Kw[T], that is, == T's scalar W-clock. 4118 4119 After a race by thread T conflicting with some previous access by 4120 some other thread U, for a location with constraint (before 4121 processing the later access) (Cr,Cw), then Cw[U] is the segment in 4122 which the previously access lies. 4123 4124 Hence in record_race_info, we pass in Cfailed and Kfailed, which 4125 are compared so as to find out which thread(s) this access 4126 conflicts with. 
Once that is established, we also require the 4127 pre-update Cw for the location, so we can index into it for those 4128 threads, to get the scalar clock values for the point at which the 4129 former accesses were made. (In fact we only bother to do any of 4130 this for an arbitrarily chosen one of the conflicting threads, as 4131 that's simpler, it avoids flooding the user with vast amounts of 4132 mostly useless information, and because the program is wrong if it 4133 contains any races at all -- so we don't really need to show all 4134 conflicting access pairs initially, so long as we only show none if 4135 none exist). 4136 4137 --- 4138 4139 That requires the auxiliary proof that 4140 4141 (Cr `join` Kw)[T] == Kw[T] 4142 4143 Why should that be true? Because for any thread T, Kw[T] >= the 4144 scalar clock value for T known by any other thread. In other 4145 words, because T's value for its own scalar clock is at least as up 4146 to date as the value for it known by any other thread (that is true 4147 for both the R- and W- scalar clocks). Hence no other thread will 4148 be able to feed in a value for that element (indirectly via a 4149 constraint) which will exceed Kw[T], and hence the join cannot 4150 cause that particular element to advance. 4151 */ 4152 4153 __attribute__((noinline)) 4154 static void record_race_info ( Thr* acc_thr, 4155 Addr acc_addr, SizeT szB, Bool isWrite, 4156 VtsID Cfailed, 4157 VtsID Kfailed, 4158 VtsID Cw ) 4159 { 4160 /* Call here to report a race. We just hand it onwards to 4161 HG_(record_error_Race). If that in turn discovers that the 4162 error is going to be collected, then, at history_level 2, that 4163 queries the conflicting-event map. The alternative would be to 4164 query it right here. But that causes a lot of pointless queries 4165 for errors which will shortly be discarded as duplicates, and 4166 can become a performance overhead; so we defer the query until 4167 we know the error is not a duplicate. */ 4168 4169 /* Stacks for the bounds of the (or one of the) conflicting 4170 segment(s). These are only set at history_level 1. */ 4171 ExeContext* hist1_seg_start = NULL; 4172 ExeContext* hist1_seg_end = NULL; 4173 Thread* hist1_conf_thr = NULL; 4174 4175 tl_assert(acc_thr); 4176 tl_assert(acc_thr->opaque); 4177 tl_assert(HG_(clo_history_level) >= 0 && HG_(clo_history_level) <= 2); 4178 4179 if (HG_(clo_history_level) == 1) { 4180 Bool found; 4181 Word firstIx, lastIx; 4182 ULong_n_EC key; 4183 4184 /* At history_level 1, we must round up the relevant stack-pair 4185 for the conflicting segment right now. This is because 4186 deferring it is complex; we can't (easily) put Kfailed and 4187 Cfailed into the XError and wait for later without 4188 getting tied up in difficulties with VtsID reference 4189 counting. So just do it now. */ 4190 Thr* confThr; 4191 ULong confTym = 0; 4192 /* Which thread are we in conflict with? There may be more than 4193 one, in which case VtsID__findFirst_notLEQ selects one arbitrarily 4194 (in fact it's the one with the lowest Thr* value). */ 4195 confThr = VtsID__findFirst_notLEQ( Cfailed, Kfailed ); 4196 /* This must exist! since if it was NULL then there's no 4197 conflict (semantics of return value of 4198 VtsID__findFirst_notLEQ), and msmc{read,write}, which has 4199 called us, just checked exactly this -- that there was in 4200 fact a race. */ 4201 tl_assert(confThr); 4202 4203 /* Get the scalar clock value that the conflicting thread 4204 introduced into the constraint. 
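     As a purely hypothetical illustration: if Cfailed = [T1:4, T2:9]
     and Kfailed = [T1:4, T2:6], then T2's entry in Cfailed exceeds its
     entry in Kfailed, so confThr is T2 and the value we pick up here
     is Cfailed[T2] == 9, T2's scalar clock for the conflicting access.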
A careful examination of the 4205 base machine rules shows that this must be the same as the 4206 conflicting thread's scalar clock when it created this 4207 constraint. Hence we know the scalar clock of the 4208 conflicting thread when the conflicting access was made. */ 4209 confTym = VtsID__indexAt( Cfailed, confThr ); 4210 4211 /* Using this scalar clock, index into the conflicting thread's 4212 collection of stack traces made each time its vector clock 4213 (hence its scalar clock) changed. This gives the stack 4214 traces at the start and end of the conflicting segment (well, 4215 as per comment just above, of one of the conflicting 4216 segments, if there are more than one). */ 4217 key.ull = confTym; 4218 key.ec = NULL; 4219 /* tl_assert(confThr); -- asserted just above */ 4220 tl_assert(confThr->local_Kws_n_stacks); 4221 firstIx = lastIx = 0; 4222 found = VG_(lookupXA_UNSAFE)( 4223 confThr->local_Kws_n_stacks, 4224 &key, &firstIx, &lastIx, 4225 (Int(*)(void*,void*))cmp__ULong_n_EC__by_ULong 4226 ); 4227 if (0) VG_(printf)("record_race_info %u %u %u confThr %p " 4228 "confTym %llu found %d (%lu,%lu)\n", 4229 Cfailed, Kfailed, Cw, 4230 confThr, confTym, found, firstIx, lastIx); 4231 /* We can't indefinitely collect stack traces at VTS 4232 transitions, since we'd eventually run out of memory. Hence 4233 note_local_Kw_n_stack_for will eventually throw away old 4234 ones, which in turn means we might fail to find index value 4235 confTym in the array. */ 4236 if (found) { 4237 ULong_n_EC *pair_start, *pair_end; 4238 pair_start 4239 = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, lastIx ); 4240 hist1_seg_start = pair_start->ec; 4241 if (lastIx+1 < VG_(sizeXA)( confThr->local_Kws_n_stacks )) { 4242 pair_end 4243 = (ULong_n_EC*)VG_(indexXA)( confThr->local_Kws_n_stacks, 4244 lastIx+1 ); 4245 /* from properties of VG_(lookupXA) and the comparison fn used: */ 4246 tl_assert(pair_start->ull < pair_end->ull); 4247 hist1_seg_end = pair_end->ec; 4248 /* Could do a bit better here. It may be that pair_end 4249 doesn't have a stack, but the following entries in the 4250 array have the same scalar Kw and to have a stack. So 4251 we should search a bit further along the array than 4252 lastIx+1 if hist1_seg_end is NULL. */ 4253 } else { 4254 if (confThr->still_alive) 4255 hist1_seg_end = main_get_EC( confThr ); 4256 } 4257 // seg_start could be NULL iff this is the first stack in the thread 4258 //if (seg_start) VG_(pp_ExeContext)(seg_start); 4259 //if (seg_end) VG_(pp_ExeContext)(seg_end); 4260 hist1_conf_thr = confThr->opaque; 4261 } 4262 } 4263 4264 HG_(record_error_Race)( acc_thr->opaque, acc_addr, 4265 szB, isWrite, 4266 hist1_conf_thr, hist1_seg_start, hist1_seg_end ); 4267 } 4268 4269 static Bool is_sane_SVal_C ( SVal sv ) { 4270 Bool leq; 4271 if (!SVal__isC(sv)) return True; 4272 leq = VtsID__cmpLEQ( SVal__unC_Rmin(sv), SVal__unC_Wmin(sv) ); 4273 return leq; 4274 } 4275 4276 4277 /* Compute new state following a read */ 4278 static inline SVal msmcread ( SVal svOld, 4279 /* The following are only needed for 4280 creating error reports. 
*/ 4281 Thr* acc_thr, 4282 Addr acc_addr, SizeT szB ) 4283 { 4284 SVal svNew = SVal_INVALID; 4285 stats__msmcread++; 4286 4287 /* Redundant sanity check on the constraints */ 4288 if (CHECK_MSM) { 4289 tl_assert(is_sane_SVal_C(svOld)); 4290 } 4291 4292 if (LIKELY(SVal__isC(svOld))) { 4293 VtsID tviR = acc_thr->viR; 4294 VtsID tviW = acc_thr->viW; 4295 VtsID rmini = SVal__unC_Rmin(svOld); 4296 VtsID wmini = SVal__unC_Wmin(svOld); 4297 Bool leq = VtsID__cmpLEQ(rmini,tviR); 4298 if (LIKELY(leq)) { 4299 /* no race */ 4300 /* Note: RWLOCK subtlety: use tviW, not tviR */ 4301 svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) ); 4302 goto out; 4303 } else { 4304 /* assert on sanity of constraints. */ 4305 Bool leqxx = VtsID__cmpLEQ(rmini,wmini); 4306 tl_assert(leqxx); 4307 // same as in non-race case 4308 svNew = SVal__mkC( rmini, VtsID__join2(wmini, tviW) ); 4309 record_race_info( acc_thr, acc_addr, szB, False/*!isWrite*/, 4310 rmini, /* Cfailed */ 4311 tviR, /* Kfailed */ 4312 wmini /* Cw */ ); 4313 goto out; 4314 } 4315 } 4316 if (SVal__isA(svOld)) { 4317 /* reading no-access memory (sigh); leave unchanged */ 4318 /* check for no pollution */ 4319 tl_assert(svOld == SVal_NOACCESS); 4320 svNew = SVal_NOACCESS; 4321 goto out; 4322 } 4323 if (0) VG_(printf)("msmcread: bad svOld: 0x%016llx\n", svOld); 4324 tl_assert(0); 4325 4326 out: 4327 if (CHECK_MSM) { 4328 tl_assert(is_sane_SVal_C(svNew)); 4329 } 4330 if (UNLIKELY(svNew != svOld)) { 4331 tl_assert(svNew != SVal_INVALID); 4332 if (HG_(clo_history_level) >= 2 4333 && SVal__isC(svOld) && SVal__isC(svNew)) { 4334 event_map_bind( acc_addr, szB, False/*!isWrite*/, acc_thr ); 4335 stats__msmcread_change++; 4336 } 4337 } 4338 return svNew; 4339 } 4340 4341 4342 /* Compute new state following a write */ 4343 static inline SVal msmcwrite ( SVal svOld, 4344 /* The following are only needed for 4345 creating error reports. */ 4346 Thr* acc_thr, 4347 Addr acc_addr, SizeT szB ) 4348 { 4349 SVal svNew = SVal_INVALID; 4350 stats__msmcwrite++; 4351 4352 /* Redundant sanity check on the constraints */ 4353 if (CHECK_MSM) { 4354 tl_assert(is_sane_SVal_C(svOld)); 4355 } 4356 4357 if (LIKELY(SVal__isC(svOld))) { 4358 VtsID tviW = acc_thr->viW; 4359 VtsID wmini = SVal__unC_Wmin(svOld); 4360 Bool leq = VtsID__cmpLEQ(wmini,tviW); 4361 if (LIKELY(leq)) { 4362 /* no race */ 4363 svNew = SVal__mkC( tviW, tviW ); 4364 goto out; 4365 } else { 4366 VtsID rmini = SVal__unC_Rmin(svOld); 4367 /* assert on sanity of constraints. */ 4368 Bool leqxx = VtsID__cmpLEQ(rmini,wmini); 4369 tl_assert(leqxx); 4370 // same as in non-race case 4371 // proof: in the non-race case, we have 4372 // rmini <= wmini (invar on constraints) 4373 // tviW <= tviR (invar on thread clocks) 4374 // wmini <= tviW (from run-time check) 4375 // hence from transitivity of <= we have 4376 // rmini <= wmini <= tviW 4377 // and so join(rmini,tviW) == tviW 4378 // and join(wmini,tviW) == tviW 4379 // qed. 
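         // Illustration of the failing check, with hypothetical clock
         // values:  wmini = [T1:3, T2:5], tviW = [T1:7, T2:2].
         // Then wmini <= tviW fails because wmini[T2] = 5 > tviW[T2] = 2:
         // the constraint records a write made at T2-time 5, but this
         // thread's W-clock only covers T2 up to time 2, so nothing
         // orders that write before this one, and a race is reported
         // below.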
4380 svNew = SVal__mkC( VtsID__join2(rmini, tviW), 4381 VtsID__join2(wmini, tviW) ); 4382 record_race_info( acc_thr, acc_addr, szB, True/*isWrite*/, 4383 wmini, /* Cfailed */ 4384 tviW, /* Kfailed */ 4385 wmini /* Cw */ ); 4386 goto out; 4387 } 4388 } 4389 if (SVal__isA(svOld)) { 4390 /* writing no-access memory (sigh); leave unchanged */ 4391 /* check for no pollution */ 4392 tl_assert(svOld == SVal_NOACCESS); 4393 svNew = SVal_NOACCESS; 4394 goto out; 4395 } 4396 if (0) VG_(printf)("msmcwrite: bad svOld: 0x%016llx\n", svOld); 4397 tl_assert(0); 4398 4399 out: 4400 if (CHECK_MSM) { 4401 tl_assert(is_sane_SVal_C(svNew)); 4402 } 4403 if (UNLIKELY(svNew != svOld)) { 4404 tl_assert(svNew != SVal_INVALID); 4405 if (HG_(clo_history_level) >= 2 4406 && SVal__isC(svOld) && SVal__isC(svNew)) { 4407 event_map_bind( acc_addr, szB, True/*isWrite*/, acc_thr ); 4408 stats__msmcwrite_change++; 4409 } 4410 } 4411 return svNew; 4412 } 4413 4414 4415 ///////////////////////////////////////////////////////// 4416 // // 4417 // Apply core MSM to specific memory locations // 4418 // // 4419 ///////////////////////////////////////////////////////// 4420 4421 /*------------- ZSM accesses: 8 bit sapply ------------- */ 4422 4423 static void zsm_sapply08__msmcread ( Thr* thr, Addr a ) { 4424 CacheLine* cl; 4425 UWord cloff, tno, toff; 4426 SVal svOld, svNew; 4427 UShort descr; 4428 stats__cline_cread08s++; 4429 cl = get_cacheline(a); 4430 cloff = get_cacheline_offset(a); 4431 tno = get_treeno(a); 4432 toff = get_tree_offset(a); /* == 0 .. 7 */ 4433 descr = cl->descrs[tno]; 4434 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) { 4435 SVal* tree = &cl->svals[tno << 3]; 4436 cl->descrs[tno] = pulldown_to_8(tree, toff, descr); 4437 if (CHECK_ZSM) 4438 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 4439 } 4440 svOld = cl->svals[cloff]; 4441 svNew = msmcread( svOld, thr,a,1 ); 4442 if (CHECK_ZSM) 4443 tl_assert(svNew != SVal_INVALID); 4444 cl->svals[cloff] = svNew; 4445 } 4446 4447 static void zsm_sapply08__msmcwrite ( Thr* thr, Addr a ) { 4448 CacheLine* cl; 4449 UWord cloff, tno, toff; 4450 SVal svOld, svNew; 4451 UShort descr; 4452 stats__cline_cwrite08s++; 4453 cl = get_cacheline(a); 4454 cloff = get_cacheline_offset(a); 4455 tno = get_treeno(a); 4456 toff = get_tree_offset(a); /* == 0 .. 
7 */ 4457 descr = cl->descrs[tno]; 4458 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) { 4459 SVal* tree = &cl->svals[tno << 3]; 4460 cl->descrs[tno] = pulldown_to_8(tree, toff, descr); 4461 if (CHECK_ZSM) 4462 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 4463 } 4464 svOld = cl->svals[cloff]; 4465 svNew = msmcwrite( svOld, thr,a,1 ); 4466 if (CHECK_ZSM) 4467 tl_assert(svNew != SVal_INVALID); 4468 cl->svals[cloff] = svNew; 4469 } 4470 4471 /*------------- ZSM accesses: 16 bit sapply ------------- */ 4472 4473 static void zsm_sapply16__msmcread ( Thr* thr, Addr a ) { 4474 CacheLine* cl; 4475 UWord cloff, tno, toff; 4476 SVal svOld, svNew; 4477 UShort descr; 4478 stats__cline_cread16s++; 4479 if (UNLIKELY(!aligned16(a))) goto slowcase; 4480 cl = get_cacheline(a); 4481 cloff = get_cacheline_offset(a); 4482 tno = get_treeno(a); 4483 toff = get_tree_offset(a); /* == 0, 2, 4 or 6 */ 4484 descr = cl->descrs[tno]; 4485 if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) { 4486 if (valid_value_is_below_me_16(descr, toff)) { 4487 goto slowcase; 4488 } else { 4489 SVal* tree = &cl->svals[tno << 3]; 4490 cl->descrs[tno] = pulldown_to_16(tree, toff, descr); 4491 } 4492 if (CHECK_ZSM) 4493 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 4494 } 4495 svOld = cl->svals[cloff]; 4496 svNew = msmcread( svOld, thr,a,2 ); 4497 if (CHECK_ZSM) 4498 tl_assert(svNew != SVal_INVALID); 4499 cl->svals[cloff] = svNew; 4500 return; 4501 slowcase: /* misaligned, or must go further down the tree */ 4502 stats__cline_16to8splits++; 4503 zsm_sapply08__msmcread( thr, a + 0 ); 4504 zsm_sapply08__msmcread( thr, a + 1 ); 4505 } 4506 4507 static void zsm_sapply16__msmcwrite ( Thr* thr, Addr a ) { 4508 CacheLine* cl; 4509 UWord cloff, tno, toff; 4510 SVal svOld, svNew; 4511 UShort descr; 4512 stats__cline_cwrite16s++; 4513 if (UNLIKELY(!aligned16(a))) goto slowcase; 4514 cl = get_cacheline(a); 4515 cloff = get_cacheline_offset(a); 4516 tno = get_treeno(a); 4517 toff = get_tree_offset(a); /* == 0, 2, 4 or 6 */ 4518 descr = cl->descrs[tno]; 4519 if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) { 4520 if (valid_value_is_below_me_16(descr, toff)) { 4521 goto slowcase; 4522 } else { 4523 SVal* tree = &cl->svals[tno << 3]; 4524 cl->descrs[tno] = pulldown_to_16(tree, toff, descr); 4525 } 4526 if (CHECK_ZSM) 4527 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 4528 } 4529 svOld = cl->svals[cloff]; 4530 svNew = msmcwrite( svOld, thr,a,2 ); 4531 if (CHECK_ZSM) 4532 tl_assert(svNew != SVal_INVALID); 4533 cl->svals[cloff] = svNew; 4534 return; 4535 slowcase: /* misaligned, or must go further down the tree */ 4536 stats__cline_16to8splits++; 4537 zsm_sapply08__msmcwrite( thr, a + 0 ); 4538 zsm_sapply08__msmcwrite( thr, a + 1 ); 4539 } 4540 4541 /*------------- ZSM accesses: 32 bit sapply ------------- */ 4542 4543 static void zsm_sapply32__msmcread ( Thr* thr, Addr a ) { 4544 CacheLine* cl; 4545 UWord cloff, tno, toff; 4546 SVal svOld, svNew; 4547 UShort descr; 4548 stats__cline_cread32s++; 4549 if (UNLIKELY(!aligned32(a))) goto slowcase; 4550 cl = get_cacheline(a); 4551 cloff = get_cacheline_offset(a); 4552 tno = get_treeno(a); 4553 toff = get_tree_offset(a); /* == 0 or 4 */ 4554 descr = cl->descrs[tno]; 4555 if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) { 4556 if (valid_value_is_above_me_32(descr, toff)) { 4557 SVal* tree = &cl->svals[tno << 3]; 4558 cl->descrs[tno] = pulldown_to_32(tree, toff, descr); 4559 } else { 4560 goto slowcase; 4561 } 4562 if (CHECK_ZSM) 4563 tl_assert(is_sane_CacheLine(cl)); /* 
EXPENSIVE */ 4564 } 4565 svOld = cl->svals[cloff]; 4566 svNew = msmcread( svOld, thr,a,4 ); 4567 if (CHECK_ZSM) 4568 tl_assert(svNew != SVal_INVALID); 4569 cl->svals[cloff] = svNew; 4570 return; 4571 slowcase: /* misaligned, or must go further down the tree */ 4572 stats__cline_32to16splits++; 4573 zsm_sapply16__msmcread( thr, a + 0 ); 4574 zsm_sapply16__msmcread( thr, a + 2 ); 4575 } 4576 4577 static void zsm_sapply32__msmcwrite ( Thr* thr, Addr a ) { 4578 CacheLine* cl; 4579 UWord cloff, tno, toff; 4580 SVal svOld, svNew; 4581 UShort descr; 4582 stats__cline_cwrite32s++; 4583 if (UNLIKELY(!aligned32(a))) goto slowcase; 4584 cl = get_cacheline(a); 4585 cloff = get_cacheline_offset(a); 4586 tno = get_treeno(a); 4587 toff = get_tree_offset(a); /* == 0 or 4 */ 4588 descr = cl->descrs[tno]; 4589 if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) { 4590 if (valid_value_is_above_me_32(descr, toff)) { 4591 SVal* tree = &cl->svals[tno << 3]; 4592 cl->descrs[tno] = pulldown_to_32(tree, toff, descr); 4593 } else { 4594 goto slowcase; 4595 } 4596 if (CHECK_ZSM) 4597 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 4598 } 4599 svOld = cl->svals[cloff]; 4600 svNew = msmcwrite( svOld, thr,a,4 ); 4601 if (CHECK_ZSM) 4602 tl_assert(svNew != SVal_INVALID); 4603 cl->svals[cloff] = svNew; 4604 return; 4605 slowcase: /* misaligned, or must go further down the tree */ 4606 stats__cline_32to16splits++; 4607 zsm_sapply16__msmcwrite( thr, a + 0 ); 4608 zsm_sapply16__msmcwrite( thr, a + 2 ); 4609 } 4610 4611 /*------------- ZSM accesses: 64 bit sapply ------------- */ 4612 4613 static void zsm_sapply64__msmcread ( Thr* thr, Addr a ) { 4614 CacheLine* cl; 4615 UWord cloff, tno; 4616 //UWord toff; 4617 SVal svOld, svNew; 4618 UShort descr; 4619 stats__cline_cread64s++; 4620 if (UNLIKELY(!aligned64(a))) goto slowcase; 4621 cl = get_cacheline(a); 4622 cloff = get_cacheline_offset(a); 4623 tno = get_treeno(a); 4624 //toff = get_tree_offset(a); /* == 0, unused */ 4625 descr = cl->descrs[tno]; 4626 if (UNLIKELY( !(descr & TREE_DESCR_64) )) { 4627 goto slowcase; 4628 } 4629 svOld = cl->svals[cloff]; 4630 svNew = msmcread( svOld, thr,a,8 ); 4631 if (CHECK_ZSM) 4632 tl_assert(svNew != SVal_INVALID); 4633 cl->svals[cloff] = svNew; 4634 return; 4635 slowcase: /* misaligned, or must go further down the tree */ 4636 stats__cline_64to32splits++; 4637 zsm_sapply32__msmcread( thr, a + 0 ); 4638 zsm_sapply32__msmcread( thr, a + 4 ); 4639 } 4640 4641 static void zsm_sapply64__msmcwrite ( Thr* thr, Addr a ) { 4642 CacheLine* cl; 4643 UWord cloff, tno; 4644 //UWord toff; 4645 SVal svOld, svNew; 4646 UShort descr; 4647 stats__cline_cwrite64s++; 4648 if (UNLIKELY(!aligned64(a))) goto slowcase; 4649 cl = get_cacheline(a); 4650 cloff = get_cacheline_offset(a); 4651 tno = get_treeno(a); 4652 //toff = get_tree_offset(a); /* == 0, unused */ 4653 descr = cl->descrs[tno]; 4654 if (UNLIKELY( !(descr & TREE_DESCR_64) )) { 4655 goto slowcase; 4656 } 4657 svOld = cl->svals[cloff]; 4658 svNew = msmcwrite( svOld, thr,a,8 ); 4659 if (CHECK_ZSM) 4660 tl_assert(svNew != SVal_INVALID); 4661 cl->svals[cloff] = svNew; 4662 return; 4663 slowcase: /* misaligned, or must go further down the tree */ 4664 stats__cline_64to32splits++; 4665 zsm_sapply32__msmcwrite( thr, a + 0 ); 4666 zsm_sapply32__msmcwrite( thr, a + 4 ); 4667 } 4668 4669 /*--------------- ZSM accesses: 8 bit swrite --------------- */ 4670 4671 static 4672 void zsm_swrite08 ( Addr a, SVal svNew ) { 4673 CacheLine* cl; 4674 UWord cloff, tno, toff; 4675 UShort descr; 4676 
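   /* Unconditionally install svNew for this single byte; unlike the
      sapply functions above, no MSM transition is applied here.  If
      the byte is currently covered by a wider tree node, pull the
      tree down to 8-bit leaves first so that just this one leaf can
      be overwritten. */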
stats__cline_swrite08s++; 4677 cl = get_cacheline(a); 4678 cloff = get_cacheline_offset(a); 4679 tno = get_treeno(a); 4680 toff = get_tree_offset(a); /* == 0 .. 7 */ 4681 descr = cl->descrs[tno]; 4682 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) { 4683 SVal* tree = &cl->svals[tno << 3]; 4684 cl->descrs[tno] = pulldown_to_8(tree, toff, descr); 4685 if (CHECK_ZSM) 4686 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 4687 } 4688 tl_assert(svNew != SVal_INVALID); 4689 cl->svals[cloff] = svNew; 4690 } 4691 4692 /*--------------- ZSM accesses: 16 bit swrite --------------- */ 4693 4694 static 4695 void zsm_swrite16 ( Addr a, SVal svNew ) { 4696 CacheLine* cl; 4697 UWord cloff, tno, toff; 4698 UShort descr; 4699 stats__cline_swrite16s++; 4700 if (UNLIKELY(!aligned16(a))) goto slowcase; 4701 cl = get_cacheline(a); 4702 cloff = get_cacheline_offset(a); 4703 tno = get_treeno(a); 4704 toff = get_tree_offset(a); /* == 0, 2, 4 or 6 */ 4705 descr = cl->descrs[tno]; 4706 if (UNLIKELY( !(descr & (TREE_DESCR_16_0 << toff)) )) { 4707 if (valid_value_is_below_me_16(descr, toff)) { 4708 /* Writing at this level. Need to fix up 'descr'. */ 4709 cl->descrs[tno] = pullup_descr_to_16(descr, toff); 4710 /* At this point, the tree does not match cl->descr[tno] any 4711 more. The assignments below will fix it up. */ 4712 } else { 4713 /* We can't indiscriminately write on the w16 node as in the 4714 w64 case, as that might make the node inconsistent with 4715 its parent. So first, pull down to this level. */ 4716 SVal* tree = &cl->svals[tno << 3]; 4717 cl->descrs[tno] = pulldown_to_16(tree, toff, descr); 4718 if (CHECK_ZSM) 4719 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 4720 } 4721 } 4722 tl_assert(svNew != SVal_INVALID); 4723 cl->svals[cloff + 0] = svNew; 4724 cl->svals[cloff + 1] = SVal_INVALID; 4725 return; 4726 slowcase: /* misaligned */ 4727 stats__cline_16to8splits++; 4728 zsm_swrite08( a + 0, svNew ); 4729 zsm_swrite08( a + 1, svNew ); 4730 } 4731 4732 /*--------------- ZSM accesses: 32 bit swrite --------------- */ 4733 4734 static 4735 void zsm_swrite32 ( Addr a, SVal svNew ) { 4736 CacheLine* cl; 4737 UWord cloff, tno, toff; 4738 UShort descr; 4739 stats__cline_swrite32s++; 4740 if (UNLIKELY(!aligned32(a))) goto slowcase; 4741 cl = get_cacheline(a); 4742 cloff = get_cacheline_offset(a); 4743 tno = get_treeno(a); 4744 toff = get_tree_offset(a); /* == 0 or 4 */ 4745 descr = cl->descrs[tno]; 4746 if (UNLIKELY( !(descr & (TREE_DESCR_32_0 << toff)) )) { 4747 if (valid_value_is_above_me_32(descr, toff)) { 4748 /* We can't indiscriminately write on the w32 node as in the 4749 w64 case, as that might make the node inconsistent with 4750 its parent. So first, pull down to this level. */ 4751 SVal* tree = &cl->svals[tno << 3]; 4752 cl->descrs[tno] = pulldown_to_32(tree, toff, descr); 4753 if (CHECK_ZSM) 4754 tl_assert(is_sane_CacheLine(cl)); /* EXPENSIVE */ 4755 } else { 4756 /* Writing at this level. Need to fix up 'descr'. */ 4757 cl->descrs[tno] = pullup_descr_to_32(descr, toff); 4758 /* At this point, the tree does not match cl->descr[tno] any 4759 more. The assignments below will fix it up. 
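         Specifically, svals[cloff] will receive the new value and
         svals[cloff+1 .. cloff+3] are set to SVal_INVALID, which is
         the layout the pulled-up 32-bit descriptor bit describes.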
*/ 4760 } 4761 } 4762 tl_assert(svNew != SVal_INVALID); 4763 cl->svals[cloff + 0] = svNew; 4764 cl->svals[cloff + 1] = SVal_INVALID; 4765 cl->svals[cloff + 2] = SVal_INVALID; 4766 cl->svals[cloff + 3] = SVal_INVALID; 4767 return; 4768 slowcase: /* misaligned */ 4769 stats__cline_32to16splits++; 4770 zsm_swrite16( a + 0, svNew ); 4771 zsm_swrite16( a + 2, svNew ); 4772 } 4773 4774 /*--------------- ZSM accesses: 64 bit swrite --------------- */ 4775 4776 static 4777 void zsm_swrite64 ( Addr a, SVal svNew ) { 4778 CacheLine* cl; 4779 UWord cloff, tno; 4780 //UWord toff; 4781 stats__cline_swrite64s++; 4782 if (UNLIKELY(!aligned64(a))) goto slowcase; 4783 cl = get_cacheline(a); 4784 cloff = get_cacheline_offset(a); 4785 tno = get_treeno(a); 4786 //toff = get_tree_offset(a); /* == 0, unused */ 4787 cl->descrs[tno] = TREE_DESCR_64; 4788 tl_assert(svNew != SVal_INVALID); 4789 cl->svals[cloff + 0] = svNew; 4790 cl->svals[cloff + 1] = SVal_INVALID; 4791 cl->svals[cloff + 2] = SVal_INVALID; 4792 cl->svals[cloff + 3] = SVal_INVALID; 4793 cl->svals[cloff + 4] = SVal_INVALID; 4794 cl->svals[cloff + 5] = SVal_INVALID; 4795 cl->svals[cloff + 6] = SVal_INVALID; 4796 cl->svals[cloff + 7] = SVal_INVALID; 4797 return; 4798 slowcase: /* misaligned */ 4799 stats__cline_64to32splits++; 4800 zsm_swrite32( a + 0, svNew ); 4801 zsm_swrite32( a + 4, svNew ); 4802 } 4803 4804 /*------------- ZSM accesses: 8 bit sread/scopy ------------- */ 4805 4806 static 4807 SVal zsm_sread08 ( Addr a ) { 4808 CacheLine* cl; 4809 UWord cloff, tno, toff; 4810 UShort descr; 4811 stats__cline_sread08s++; 4812 cl = get_cacheline(a); 4813 cloff = get_cacheline_offset(a); 4814 tno = get_treeno(a); 4815 toff = get_tree_offset(a); /* == 0 .. 7 */ 4816 descr = cl->descrs[tno]; 4817 if (UNLIKELY( !(descr & (TREE_DESCR_8_0 << toff)) )) { 4818 SVal* tree = &cl->svals[tno << 3]; 4819 cl->descrs[tno] = pulldown_to_8(tree, toff, descr); 4820 } 4821 return cl->svals[cloff]; 4822 } 4823 4824 static void zsm_scopy08 ( Addr src, Addr dst, Bool uu_normalise ) { 4825 SVal sv; 4826 stats__cline_scopy08s++; 4827 sv = zsm_sread08( src ); 4828 zsm_swrite08( dst, sv ); 4829 } 4830 4831 4832 /* Block-copy states (needed for implementing realloc()). Note this 4833 doesn't change the filtering arrangements. The caller of 4834 zsm_scopy_range needs to attend to that. */ 4835 4836 static void zsm_scopy_range ( Addr src, Addr dst, SizeT len ) 4837 { 4838 SizeT i; 4839 if (len == 0) 4840 return; 4841 4842 /* assert for non-overlappingness */ 4843 tl_assert(src+len <= dst || dst+len <= src); 4844 4845 /* To be simple, just copy byte by byte. But so as not to wreck 4846 performance for later accesses to dst[0 .. len-1], normalise 4847 destination lines as we finish with them, and also normalise the 4848 line containing the first and last address. */ 4849 for (i = 0; i < len; i++) { 4850 Bool normalise 4851 = get_cacheline_offset( dst+i+1 ) == 0 /* last in line */ 4852 || i == 0 /* first in range */ 4853 || i == len-1; /* last in range */ 4854 zsm_scopy08( src+i, dst+i, normalise ); 4855 } 4856 } 4857 4858 4859 /* For setting address ranges to a given value. Has considerable 4860 sophistication so as to avoid generating large numbers of pointless 4861 cache loads/writebacks for large ranges. */ 4862 4863 /* Do small ranges in-cache, in the obvious way. 
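   The general path peels off leading bytes until the address is
   suitably aligned, issues as many aligned 8-byte writes as possible,
   and then finishes the tail with progressively smaller writes.  For
   example, with hypothetical values a == 0x1003 and len == 13, the
   calls made are

      zsm_swrite08( 0x1003, svNew );   leading byte,  leaving len 12
      zsm_swrite32( 0x1004, svNew );   align to 8,    leaving len 8
      zsm_swrite64( 0x1008, svNew );   aligned body,  leaving len 0

   so even awkwardly aligned small ranges cost only a handful of
   shadow writes.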
*/ 4864 static 4865 void zsm_sset_range_SMALL ( Addr a, SizeT len, SVal svNew ) 4866 { 4867 /* fast track a couple of common cases */ 4868 if (len == 4 && aligned32(a)) { 4869 zsm_swrite32( a, svNew ); 4870 return; 4871 } 4872 if (len == 8 && aligned64(a)) { 4873 zsm_swrite64( a, svNew ); 4874 return; 4875 } 4876 4877 /* be completely general (but as efficient as possible) */ 4878 if (len == 0) return; 4879 4880 if (!aligned16(a) && len >= 1) { 4881 zsm_swrite08( a, svNew ); 4882 a += 1; 4883 len -= 1; 4884 tl_assert(aligned16(a)); 4885 } 4886 if (len == 0) return; 4887 4888 if (!aligned32(a) && len >= 2) { 4889 zsm_swrite16( a, svNew ); 4890 a += 2; 4891 len -= 2; 4892 tl_assert(aligned32(a)); 4893 } 4894 if (len == 0) return; 4895 4896 if (!aligned64(a) && len >= 4) { 4897 zsm_swrite32( a, svNew ); 4898 a += 4; 4899 len -= 4; 4900 tl_assert(aligned64(a)); 4901 } 4902 if (len == 0) return; 4903 4904 if (len >= 8) { 4905 tl_assert(aligned64(a)); 4906 while (len >= 8) { 4907 zsm_swrite64( a, svNew ); 4908 a += 8; 4909 len -= 8; 4910 } 4911 tl_assert(aligned64(a)); 4912 } 4913 if (len == 0) return; 4914 4915 if (len >= 4) 4916 tl_assert(aligned32(a)); 4917 if (len >= 4) { 4918 zsm_swrite32( a, svNew ); 4919 a += 4; 4920 len -= 4; 4921 } 4922 if (len == 0) return; 4923 4924 if (len >= 2) 4925 tl_assert(aligned16(a)); 4926 if (len >= 2) { 4927 zsm_swrite16( a, svNew ); 4928 a += 2; 4929 len -= 2; 4930 } 4931 if (len == 0) return; 4932 4933 if (len >= 1) { 4934 zsm_swrite08( a, svNew ); 4935 //a += 1; 4936 len -= 1; 4937 } 4938 tl_assert(len == 0); 4939 } 4940 4941 4942 /* If we're doing a small range, hand off to zsm_sset_range_SMALL. But 4943 for larger ranges, try to operate directly on the out-of-cache 4944 representation, rather than dragging lines into the cache, 4945 overwriting them, and forcing them out. This turns out to be an 4946 important performance optimisation. 4947 4948 Note that this doesn't change the filtering arrangements. The 4949 caller of zsm_sset_range needs to attend to that. 
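   Concretely, for the larger ranges the range is split into three
   parts: an unaligned prefix [a, cacheline_ROUNDUP(a)), a run of
   whole cache lines, and an unaligned suffix ending at a+len.  The
   prefix and suffix go through zsm_sset_range_SMALL; each whole line
   in the middle is either overwritten in the cache, if it happens to
   be resident, or rewritten directly in its compressed Z
   representation as a line whose dictionary holds the single value
   svNew.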
*/ 4950 4951 static void zsm_sset_range ( Addr a, SizeT len, SVal svNew ) 4952 { 4953 tl_assert(svNew != SVal_INVALID); 4954 stats__cache_make_New_arange += (ULong)len; 4955 4956 if (0 && len > 500) 4957 VG_(printf)("make New ( %#lx, %ld )\n", a, len ); 4958 4959 if (0) { 4960 static UWord n_New_in_cache = 0; 4961 static UWord n_New_not_in_cache = 0; 4962 /* tag is 'a' with the in-line offset masked out, 4963 eg a[31]..a[4] 0000 */ 4964 Addr tag = a & ~(N_LINE_ARANGE - 1); 4965 UWord wix = (a >> N_LINE_BITS) & (N_WAY_NENT - 1); 4966 if (LIKELY(tag == cache_shmem.tags0[wix])) { 4967 n_New_in_cache++; 4968 } else { 4969 n_New_not_in_cache++; 4970 } 4971 if (0 == ((n_New_in_cache + n_New_not_in_cache) % 100000)) 4972 VG_(printf)("shadow_mem_make_New: IN %lu OUT %lu\n", 4973 n_New_in_cache, n_New_not_in_cache ); 4974 } 4975 4976 if (LIKELY(len < 2 * N_LINE_ARANGE)) { 4977 zsm_sset_range_SMALL( a, len, svNew ); 4978 } else { 4979 Addr before_start = a; 4980 Addr aligned_start = cacheline_ROUNDUP(a); 4981 Addr after_start = cacheline_ROUNDDN(a + len); 4982 UWord before_len = aligned_start - before_start; 4983 UWord aligned_len = after_start - aligned_start; 4984 UWord after_len = a + len - after_start; 4985 tl_assert(before_start <= aligned_start); 4986 tl_assert(aligned_start <= after_start); 4987 tl_assert(before_len < N_LINE_ARANGE); 4988 tl_assert(after_len < N_LINE_ARANGE); 4989 tl_assert(get_cacheline_offset(aligned_start) == 0); 4990 if (get_cacheline_offset(a) == 0) { 4991 tl_assert(before_len == 0); 4992 tl_assert(a == aligned_start); 4993 } 4994 if (get_cacheline_offset(a+len) == 0) { 4995 tl_assert(after_len == 0); 4996 tl_assert(after_start == a+len); 4997 } 4998 if (before_len > 0) { 4999 zsm_sset_range_SMALL( before_start, before_len, svNew ); 5000 } 5001 if (after_len > 0) { 5002 zsm_sset_range_SMALL( after_start, after_len, svNew ); 5003 } 5004 stats__cache_make_New_inZrep += (ULong)aligned_len; 5005 5006 while (1) { 5007 Addr tag; 5008 UWord wix; 5009 if (aligned_start >= after_start) 5010 break; 5011 tl_assert(get_cacheline_offset(aligned_start) == 0); 5012 tag = aligned_start & ~(N_LINE_ARANGE - 1); 5013 wix = (aligned_start >> N_LINE_BITS) & (N_WAY_NENT - 1); 5014 if (tag == cache_shmem.tags0[wix]) { 5015 UWord i; 5016 for (i = 0; i < N_LINE_ARANGE / 8; i++) 5017 zsm_swrite64( aligned_start + i * 8, svNew ); 5018 } else { 5019 UWord i; 5020 Word zix; 5021 SecMap* sm; 5022 LineZ* lineZ; 5023 /* This line is not in the cache. Do not force it in; instead 5024 modify it in-place. */ 5025 /* find the Z line to write in and rcdec it or the 5026 associated F line. 
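        Once found, the Z line is reinitialised so that dict[0] holds
        svNew, dict[1..3] are SVal_INVALID and every ix2s[] entry
        refers to dict[0]; rcinc_LineZ then takes out the reference
        counts the freshly written line requires.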
*/ 5027 find_Z_for_writing( &sm, &zix, tag ); 5028 tl_assert(sm); 5029 tl_assert(zix >= 0 && zix < N_SECMAP_ZLINES); 5030 lineZ = &sm->linesZ[zix]; 5031 lineZ->dict[0] = svNew; 5032 lineZ->dict[1] = lineZ->dict[2] = lineZ->dict[3] = SVal_INVALID; 5033 for (i = 0; i < N_LINE_ARANGE/4; i++) 5034 lineZ->ix2s[i] = 0; /* all refer to dict[0] */ 5035 rcinc_LineZ(lineZ); 5036 } 5037 aligned_start += N_LINE_ARANGE; 5038 aligned_len -= N_LINE_ARANGE; 5039 } 5040 tl_assert(aligned_start == after_start); 5041 tl_assert(aligned_len == 0); 5042 } 5043 } 5044 5045 5046 ///////////////////////////////////////////////////////// 5047 // // 5048 // Front-filtering accesses // 5049 // // 5050 ///////////////////////////////////////////////////////// 5051 5052 static UWord stats__f_ac = 0; 5053 static UWord stats__f_sk = 0; 5054 5055 #if 0 5056 # define STATS__F_SHOW \ 5057 do { \ 5058 if (UNLIKELY(0 == (stats__f_ac & 0xFFFFFF))) \ 5059 VG_(printf)("filters: ac %lu sk %lu\n", \ 5060 stats__f_ac, stats__f_sk); \ 5061 } while (0) 5062 #else 5063 # define STATS__F_SHOW /* */ 5064 #endif 5065 5066 void zsm_sapply08_f__msmcwrite ( Thr* thr, Addr a ) { 5067 stats__f_ac++; 5068 STATS__F_SHOW; 5069 if (LIKELY(Filter__ok_to_skip_cwr08(thr->filter, a))) { 5070 stats__f_sk++; 5071 return; 5072 } 5073 zsm_sapply08__msmcwrite(thr, a); 5074 } 5075 5076 void zsm_sapply16_f__msmcwrite ( Thr* thr, Addr a ) { 5077 stats__f_ac++; 5078 STATS__F_SHOW; 5079 if (LIKELY(Filter__ok_to_skip_cwr16(thr->filter, a))) { 5080 stats__f_sk++; 5081 return; 5082 } 5083 zsm_sapply16__msmcwrite(thr, a); 5084 } 5085 5086 void zsm_sapply32_f__msmcwrite ( Thr* thr, Addr a ) { 5087 stats__f_ac++; 5088 STATS__F_SHOW; 5089 if (LIKELY(Filter__ok_to_skip_cwr32(thr->filter, a))) { 5090 stats__f_sk++; 5091 return; 5092 } 5093 zsm_sapply32__msmcwrite(thr, a); 5094 } 5095 5096 void zsm_sapply64_f__msmcwrite ( Thr* thr, Addr a ) { 5097 stats__f_ac++; 5098 STATS__F_SHOW; 5099 if (LIKELY(Filter__ok_to_skip_cwr64(thr->filter, a))) { 5100 stats__f_sk++; 5101 return; 5102 } 5103 zsm_sapply64__msmcwrite(thr, a); 5104 } 5105 5106 void zsm_sapplyNN_f__msmcwrite ( Thr* thr, Addr a, SizeT len ) 5107 { 5108 /* fast track a couple of common cases */ 5109 if (len == 4 && aligned32(a)) { 5110 zsm_sapply32_f__msmcwrite( thr, a ); 5111 return; 5112 } 5113 if (len == 8 && aligned64(a)) { 5114 zsm_sapply64_f__msmcwrite( thr, a ); 5115 return; 5116 } 5117 5118 /* be completely general (but as efficient as possible) */ 5119 if (len == 0) return; 5120 5121 if (!aligned16(a) && len >= 1) { 5122 zsm_sapply08_f__msmcwrite( thr, a ); 5123 a += 1; 5124 len -= 1; 5125 tl_assert(aligned16(a)); 5126 } 5127 if (len == 0) return; 5128 5129 if (!aligned32(a) && len >= 2) { 5130 zsm_sapply16_f__msmcwrite( thr, a ); 5131 a += 2; 5132 len -= 2; 5133 tl_assert(aligned32(a)); 5134 } 5135 if (len == 0) return; 5136 5137 if (!aligned64(a) && len >= 4) { 5138 zsm_sapply32_f__msmcwrite( thr, a ); 5139 a += 4; 5140 len -= 4; 5141 tl_assert(aligned64(a)); 5142 } 5143 if (len == 0) return; 5144 5145 if (len >= 8) { 5146 tl_assert(aligned64(a)); 5147 while (len >= 8) { 5148 zsm_sapply64_f__msmcwrite( thr, a ); 5149 a += 8; 5150 len -= 8; 5151 } 5152 tl_assert(aligned64(a)); 5153 } 5154 if (len == 0) return; 5155 5156 if (len >= 4) 5157 tl_assert(aligned32(a)); 5158 if (len >= 4) { 5159 zsm_sapply32_f__msmcwrite( thr, a ); 5160 a += 4; 5161 len -= 4; 5162 } 5163 if (len == 0) return; 5164 5165 if (len >= 2) 5166 tl_assert(aligned16(a)); 5167 if (len >= 2) { 5168 zsm_sapply16_f__msmcwrite( thr, a ); 5169 
a += 2; 5170 len -= 2; 5171 } 5172 if (len == 0) return; 5173 5174 if (len >= 1) { 5175 zsm_sapply08_f__msmcwrite( thr, a ); 5176 //a += 1; 5177 len -= 1; 5178 } 5179 tl_assert(len == 0); 5180 } 5181 5182 void zsm_sapply08_f__msmcread ( Thr* thr, Addr a ) { 5183 stats__f_ac++; 5184 STATS__F_SHOW; 5185 if (LIKELY(Filter__ok_to_skip_crd08(thr->filter, a))) { 5186 stats__f_sk++; 5187 return; 5188 } 5189 zsm_sapply08__msmcread(thr, a); 5190 } 5191 5192 void zsm_sapply16_f__msmcread ( Thr* thr, Addr a ) { 5193 stats__f_ac++; 5194 STATS__F_SHOW; 5195 if (LIKELY(Filter__ok_to_skip_crd16(thr->filter, a))) { 5196 stats__f_sk++; 5197 return; 5198 } 5199 zsm_sapply16__msmcread(thr, a); 5200 } 5201 5202 void zsm_sapply32_f__msmcread ( Thr* thr, Addr a ) { 5203 stats__f_ac++; 5204 STATS__F_SHOW; 5205 if (LIKELY(Filter__ok_to_skip_crd32(thr->filter, a))) { 5206 stats__f_sk++; 5207 return; 5208 } 5209 zsm_sapply32__msmcread(thr, a); 5210 } 5211 5212 void zsm_sapply64_f__msmcread ( Thr* thr, Addr a ) { 5213 stats__f_ac++; 5214 STATS__F_SHOW; 5215 if (LIKELY(Filter__ok_to_skip_crd64(thr->filter, a))) { 5216 stats__f_sk++; 5217 return; 5218 } 5219 zsm_sapply64__msmcread(thr, a); 5220 } 5221 5222 void zsm_sapplyNN_f__msmcread ( Thr* thr, Addr a, SizeT len ) 5223 { 5224 /* fast track a couple of common cases */ 5225 if (len == 4 && aligned32(a)) { 5226 zsm_sapply32_f__msmcread( thr, a ); 5227 return; 5228 } 5229 if (len == 8 && aligned64(a)) { 5230 zsm_sapply64_f__msmcread( thr, a ); 5231 return; 5232 } 5233 5234 /* be completely general (but as efficient as possible) */ 5235 if (len == 0) return; 5236 5237 if (!aligned16(a) && len >= 1) { 5238 zsm_sapply08_f__msmcread( thr, a ); 5239 a += 1; 5240 len -= 1; 5241 tl_assert(aligned16(a)); 5242 } 5243 if (len == 0) return; 5244 5245 if (!aligned32(a) && len >= 2) { 5246 zsm_sapply16_f__msmcread( thr, a ); 5247 a += 2; 5248 len -= 2; 5249 tl_assert(aligned32(a)); 5250 } 5251 if (len == 0) return; 5252 5253 if (!aligned64(a) && len >= 4) { 5254 zsm_sapply32_f__msmcread( thr, a ); 5255 a += 4; 5256 len -= 4; 5257 tl_assert(aligned64(a)); 5258 } 5259 if (len == 0) return; 5260 5261 if (len >= 8) { 5262 tl_assert(aligned64(a)); 5263 while (len >= 8) { 5264 zsm_sapply64_f__msmcread( thr