/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call graph
   profiling programs.

   Copyright (C) 2003-2010, Josef Weidendorfer (Josef.Weidendorfer (at) gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2010 Nicholas Nethercote (njn (at) valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
   - simulates a write-allocate cache
   - (block --> set) hash function uses simple bit selection
   - handling of references straddling two cache blocks:
     - counts as only one cache access (not two)
     - both blocks hit --> one hit
     - one block hits, the other misses --> one miss
     - both blocks miss --> one miss (not two)
*/

/* Cache configuration */
#include "cg_arch.h"

/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to cost center of the instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
   UInt count;
   UInt mask; /* e.g. for 64 byte line size: 1 bit per 2 bytes */
} line_use;

typedef struct {
   Addr memline, iaddr;
   line_use* dep_use; /* point to higher-level cacheblock for this memline */
   ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   char* name;
   int size;      /* bytes */
   int assoc;
   int line_size; /* bytes */
   Bool sectored; /* prefetch nearside cacheline on read */
   int sets;
   int sets_min_1;
   int line_size_bits;
   int tag_shift;
   UWord tag_mask;
   char desc_line[128];
   UWord* tags;

   /* for cache use */
   int line_size_mask;
   int* line_start_mask;
   int* line_end_mask;
   line_loaded* loaded;
   line_use* use;
} cache_t2;
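
/* Illustration of the use-tracking granularity (see line_use.mask
 * above; assumes a hypothetical 64-byte line): the 32-bit mask gives
 * 1 bit per 2 bytes, so a 4-byte access at line offset 8 sets bits 4
 * and 5 of the mask. The per-offset masks are precomputed in
 * cacheuse_initcache() below.
 */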
/*
 * States of flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, LL;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global variables are set up beforehand by setup_bbcc():
 *
 * - Addr   CLG_(bb_base)   (instruction start address of original BB)
 * - ULong* CLG_(cost_base) (start of cost array for BB)
 */

Addr   CLG_(bb_base);
ULong* CLG_(cost_base);

static InstrInfo* current_ii;

/* Cache use offsets */
/* The offsets are only correct because all per-instruction event sets get
 * the "Use" set added first!
 */
static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
static Int off_LL_AcCost = 2;
static Int off_LL_SpLoss = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
    L1_Hit,
    LL_Hit,
    MemAccess,
    WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
    simcall_type I1_Read;
    simcall_type D1_Read;
    simcall_type D1_Write;
} simulator;

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
  Int i;

  for (i = 0; i < c->sets * c->assoc; i++)
    c->tags[i] = 0;
  if (c->use) {
    for (i = 0; i < c->sets * c->assoc; i++) {
      c->loaded[i].memline  = 0;
      c->loaded[i].use_base = 0;
      c->loaded[i].dep_use  = 0;
      c->loaded[i].iaddr    = 0;
      c->use[i].mask        = 0;
      c->use[i].count       = 0;
      c->tags[i]            = i % c->assoc; /* init lower bits as pointer */
    }
  }
}

static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);
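
   /* A worked example with illustrative numbers (any configuration
    * accepted by check_cache() behaves alike): a 16 KB, 4-way cache
    * with 64 B lines gives
    *   sets           = (16384 / 64) / 4 = 64,
    *   line_size_bits = 6,
    *   tag_shift      = 6 + log2(64) = 12,
    *   tag_mask       = ~0xfff,
    * so address bits [5:0] give the byte within the line, bits [11:6]
    * select the set, and the remaining upper bits form the tag.
    */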
   /* Can bits in tag entries be used for flags?
    * Should always be true, as MIN_LINE_SIZE >= 16 */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
                                 sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
      cacheuse_initcache(c);
   else
      c->use = 0;
   cachesim_clearcache(c);
}


#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Write Through Cache Simulation                       ---*/
/*------------------------------------------------------------*/

/*
 * Simple model: L1 & LL Write Through
 * Does not distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */

static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == set[0])
        return Hit;

    /* If the tag is one other than the MRU, move it into the MRU spot */
    /* and shuffle the rest down.                                      */
    for (i = 1; i < c->assoc; i++) {
        if (tag == set[i]) {
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tag;
            return Hit;
        }
    }

    /* A miss; install this tag as MRU, shuffle rest down. */
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag;

    return Miss;
}
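
/* Illustration of the eviction policy implemented above: each set is
 * kept in LRU order, set[0] being the most recently used way and
 * set[assoc-1] the victim. E.g. with assoc = 4 and tag order (A B C D),
 * a hit on C reorders the set to (C A B D); a miss on E evicts D,
 * giving (E A B C).
 */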
static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
    UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a >> c->tag_shift;

    /* Access entirely within line. */
    if (set1 == set2)
        return cachesim_setref(c, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
        UWord tag2 = (a+size-1) >> c->tag_shift;

        /* the call updates cache structures as a side effect */
        CacheResult res1 = cachesim_setref(c, set1, tag);
        CacheResult res2 = cachesim_setref(c, set2, tag2);
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

    } else {
        VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
        VG_(tool_panic)("item straddles more than two cache sets");
    }
    return Hit;
}

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 Write-through, LL Write-back
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, a miss can evict a dirty line.
 * The dirty state of a cache line is stored in bit 0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 * type (Read/Write), the line gets dirty on a write.
 */
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set, tmp_tag;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == (set[0] & ~CACHELINE_DIRTY)) {
        set[0] |= ref;
        return Hit;
    }
    /* If the tag is one other than the MRU, move it into the MRU spot */
    /* and shuffle the rest down.                                      */
    for (i = 1; i < c->assoc; i++) {
        if (tag == (set[i] & ~CACHELINE_DIRTY)) {
            tmp_tag = set[i] | ref; // update dirty flag
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tmp_tag;
            return Hit;
        }
    }

    /* A miss; install this tag as MRU, shuffle rest down. */
    tmp_tag = set[c->assoc - 1];
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag | ref;

    return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}
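
/* Illustration: Read = 0 and Write = CACHELINE_DIRTY = 1, so the
 * 'set[0] |= ref' above leaves a tag such as 0x12345000 unchanged on
 * a read hit but turns it into 0x12345001 on a write hit; the tag
 * compares mask the flag out again via ~CACHELINE_DIRTY. An evicted
 * tag with bit 0 set is reported as MissDirty, i.e. a write-back of
 * the victim line would be needed.
 */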
static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
    UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a & c->tag_mask;

    /* Access entirely within line. */
    if (set1 == set2)
        return cachesim_setref_wb(c, ref, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
        UWord tag2 = (a+size-1) & c->tag_mask;

        /* the call updates cache structures as a side effect */
        CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
        CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);

        if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

    } else {
        VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
        VG_(tool_panic)("item straddles more than two cache sets");
    }
    return Hit;
}


static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) {
        /* Even for an L1 hit, the write-through L1 passes
         * the write on to the LL to make the LL line dirty.
         * But this causes no latency, so return the hit.
         */
        cachesim_ref_wb( &LL, Write, a, size);
        return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
  int i;
  for(i=0;i<PF_STREAMS;i++)
    pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW Prefetch emulation
 * Start prefetching when detecting sequential access to 3 memory blocks.
 * One stream can be detected per 4k page.
 */
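
/* Illustrative trace (hypothetical addresses, 64 B LL lines, all in
 * the same 4 KB page and thus the same stream): after accesses to
 * blocks 10, 11 and 12, pf_seqblocks is 0, 1 and 2 respectively; the
 * access to block 12 already triggers a prefetch of the line 5 blocks
 * ahead (block 17), and each further sequential block prefetches at
 * the same distance. Downward streams work symmetrically.
 */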
static __inline__
void prefetch_LL_doref(Addr a)
{
  UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
  UInt block = ( a >> LL.line_size_bits);

  if (block != pf_lastblock[stream]) {
    if (pf_seqblocks[stream] == 0) {
      if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
      else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
    }
    else if (pf_seqblocks[stream] >0) {
      if (pf_lastblock[stream] +1 == block) {
        pf_seqblocks[stream]++;
        if (pf_seqblocks[stream] >= 2) {
          prefetch_up++;
          cachesim_ref(&LL, a + 5 * LL.line_size,1);
        }
      }
      else pf_seqblocks[stream] = 0;
    }
    else if (pf_seqblocks[stream] <0) {
      if (pf_lastblock[stream] -1 == block) {
        pf_seqblocks[stream]--;
        if (pf_seqblocks[stream] <= -2) {
          prefetch_down++;
          cachesim_ref(&LL, a - 5 * LL.line_size,1);
        }
      }
      else pf_seqblocks[stream] = 0;
    }
    pf_lastblock[stream] = block;
  }
}

/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}
static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
    prefetch_LL_doref(a);
    if ( cachesim_ref( &D1, a, size) == Hit ) {
        /* Even for an L1 hit, the write-through L1 passes
         * the write on to the LL to make the LL line dirty.
         * But this causes no latency, so return the hit.
         */
        cachesim_ref_wb( &LL, Write, a, size);
        return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
    int i;
    unsigned int start_mask, start_val;
    unsigned int end_mask, end_val;

    c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
                           sizeof(line_use) * c->sets * c->assoc);
    c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                           sizeof(line_loaded) * c->sets * c->assoc);
    c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                    sizeof(int) * c->line_size);
    c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                  sizeof(int) * c->line_size);

    c->line_size_mask = c->line_size-1;

    /* Meaning of line_start_mask/line_end_mask
     * Example: for a given cache line, you get an access starting at
     * byte offset 5 with length 4, so bytes 5 - 8 were touched. For a
     * cache line size of 32, you have 1 bit per byte in the mask:
     *
     *  bit31   bit8 bit5  bit 0
     *      |      |  |    |
     *      11..111111100000   line_start_mask[5]
     *      00..000111111111   line_end_mask[(5+4)-1]
     *
     *  use_mask |= line_start_mask[5] & line_end_mask[8]
     */
    start_val = end_val = ~0;
    if (c->line_size < 32) {
        int bits_per_byte = 32/c->line_size;
        start_mask = (1<<bits_per_byte)-1;
        end_mask   = start_mask << (32-bits_per_byte);
        for(i=0;i<c->line_size;i++) {
            c->line_start_mask[i] = start_val;
            start_val  = start_val & ~start_mask;
            start_mask = start_mask << bits_per_byte;

            c->line_end_mask[c->line_size-i-1] = end_val;
            end_val  = end_val & ~end_mask;
            end_mask = end_mask >> bits_per_byte;
        }
    }
    else {
        int bytes_per_bit = c->line_size/32;
        start_mask = 1;
        end_mask   = 1 << 31;
        for(i=0;i<c->line_size;i++) {
            c->line_start_mask[i] = start_val;
            c->line_end_mask[c->line_size-i-1] = end_val;
            if ( ((i+1)%bytes_per_bit) == 0) {
                start_val   &= ~start_mask;
                end_val     &= ~end_mask;
                start_mask <<= 1;
                end_mask   >>= 1;
            }
        }
    }

    CLG_DEBUG(6, "Config %s:\n", c->desc_line);
    for(i=0;i<c->line_size;i++) {
        CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                  i, c->line_start_mask[i], c->line_end_mask[i]);
    }

    /* We use lower tag bits as offset pointers to cache use info.
     * I.e. some cache parameters don't work.
     */
    if ( (1<<c->tag_shift) < c->assoc) {
        VG_(message)(Vg_DebugMsg,
                     "error: Use associativity < %d for cache use statistics!\n",
                     (1<<c->tag_shift) );
        VG_(tool_panic)("Unsupported cache configuration");
    }
}


/* for I1/D1 caches */
#define CACHEUSE(L) \
 \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
{ \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1); \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
   UWord tag = a & L.tag_mask; \
   UWord tag2; \
   int i, j, idx; \
   UWord *set, tmp_tag; \
   UInt use_mask; \
 \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n", \
             L.name, a, size, set1, set2); \
 \
   /* First case: word entirely within line. */ \
   if (set1 == set2) { \
 \
      set = &(L.tags[set1 * L.assoc]); \
      use_mask = L.line_start_mask[a & L.line_size_mask] & \
                 L.line_end_mask[(a+size-1) & L.line_size_mask]; \
 \
      /* This loop is unrolled for just the first case, which is the most */ \
      /* common.  We can't unroll any further because it would screw up   */ \
      /* if we have a direct-mapped (1-way) cache.                        */ \
      if (tag == (set[0] & L.tag_mask)) { \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
         return L1_Hit; \
      } \
      /* If the tag is one other than the MRU, move it into the MRU spot */ \
      /* and shuffle the rest down.                                      */ \
      for (i = 1; i < L.assoc; i++) { \
         if (tag == (set[i] & L.tag_mask)) { \
            tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            return L1_Hit; \
         } \
      } \
 \
      /* A miss; install this tag as MRU, shuffle rest down. */ \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
      idx = (set1 * L.assoc) + tmp_tag; \
      return update_##L##_use(&L, idx, \
                              use_mask, a &~ L.line_size_mask); \
 \
   /* Second case: word straddles two lines. */ \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
   } else if (((set1 + 1) & (L.sets-1)) == set2) { \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1: L1 miss, 2: LL miss */ \
      set = &(L.tags[set1 * L.assoc]); \
      use_mask = L.line_start_mask[a & L.line_size_mask]; \
      if (tag == (set[0] & L.tag_mask)) { \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
         goto block2; \
      } \
      for (i = 1; i < L.assoc; i++) { \
         if (tag == (set[i] & L.tag_mask)) { \
            tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            goto block2; \
         } \
      } \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
      idx = (set1 * L.assoc) + tmp_tag; \
      miss1 = update_##L##_use(&L, idx, \
                               use_mask, a &~ L.line_size_mask); \
block2: \
      set = &(L.tags[set2 * L.assoc]); \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
      tag2 = (a+size-1) & L.tag_mask; \
      if (tag2 == (set[0] & L.tag_mask)) { \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
         return miss1; \
      } \
      for (i = 1; i < L.assoc; i++) { \
         if (tag2 == (set[i] & L.tag_mask)) { \
            tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            return miss1; \
         } \
      } \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag2 | tmp_tag; \
      idx = (set2 * L.assoc) + tmp_tag; \
      miss2 = update_##L##_use(&L, idx, \
                               use_mask, (a+size-1) &~ L.line_size_mask); \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess : LL_Hit; \
 \
   } else { \
      VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
      VG_(tool_panic)("item straddles more than two cache sets"); \
   } \
   return 0; \
}


/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
  unsigned int c; // store the total here
  const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
  const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

  c = bits;
  c = ((c >> S[0]) & B[0]) + (c & B[0]);
  c = ((c >> S[1]) & B[1]) + (c & B[1]);
  c = ((c >> S[2]) & B[2]) + (c & B[2]);
  c = ((c >> S[3]) & B[3]) + (c & B[3]);
  c = ((c >> S[4]) & B[4]) + (c & B[4]);
  return c;
}

static void update_LL_use(int idx, Addr memline)
{
  line_loaded* loaded = &(LL.loaded[idx]);
  line_use* use = &(LL.use[idx]);
  int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;

  CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
            idx, CLG_(bb_base) + current_ii->instr_offset, memline);
  if (use->count>0) {
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
              use->count, i, use->mask, loaded->memline, loaded->iaddr);
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",
              CLG_(current_state).collect, loaded->use_base);

    if (CLG_(current_state).collect && loaded->use_base) {
      (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
      (loaded->use_base)[off_LL_SpLoss] += i;
    }
  }

  use->count = 0;
  use->mask  = 0;

  loaded->memline = memline;
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
  loaded->use_base = (CLG_(current_state).nonskipped) ?
    CLG_(current_state).nonskipped->skipped :
    CLG_(cost_base) + current_ii->cost_offset;
}
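
/* Worked example for the two use metrics updated above (illustrative
 * numbers, 64-byte lines): if a line is evicted after use->count = 4
 * accesses with 8 of the 32 mask bits set, then
 *   AcCost += 1000 / 4           = 250  (scaled inverse access count)
 *   SpLoss += (32 - 8) * 64 / 32 = 48   (bytes loaded but never used)
 */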
static
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
   UWord* set = &(LL.tags[setNo * LL.assoc]);
   UWord tag  = memline & LL.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);

   if (tag == (set[0] & LL.tag_mask)) {
      idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
      l1_loaded->dep_use = &(LL.use[idx]);

      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
                LL.use[idx].mask, LL.use[idx].count);
      return LL_Hit;
   }
   for (i = 1; i < LL.assoc; i++) {
      if (tag == (set[i] & LL.tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
         l1_loaded->dep_use = &(LL.use[idx]);

         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                   i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
                   LL.use[idx].mask, LL.use[idx].count);
         return LL_Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
   for (j = LL.assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * LL.assoc) + tmp_tag;
   l1_loaded->dep_use = &(LL.use[idx]);

   update_LL_use(idx, memline);

   return MemAccess;
}




#define UPDATE_USE(L) \
 \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
                                           UInt mask, Addr memline) \
{ \
  line_loaded* loaded = &(cache->loaded[idx]); \
  line_use* use = &(cache->use[idx]); \
  int c = ((32 - countBits(use->mask)) * cache->line_size)>>5; \
 \
  CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
            cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
  if (use->count>0) { \
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n", \
              use->count, c, use->mask, loaded->memline, loaded->iaddr); \
    CLG_DEBUG(2, "   collect: %d, use_base %p\n", \
              CLG_(current_state).collect, loaded->use_base); \
 \
    if (CLG_(current_state).collect && loaded->use_base) { \
      (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
      (loaded->use_base)[off_##L##_SpLoss] += c; \
 \
      /* FIXME (?): L1/LL line sizes must be equal! */ \
      loaded->dep_use->mask  |= use->mask; \
      loaded->dep_use->count += use->count; \
    } \
  } \
 \
  use->count = 1; \
  use->mask  = mask; \
  loaded->memline = memline; \
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset; \
  loaded->use_base = (CLG_(current_state).nonskipped) ? \
    CLG_(current_state).nonskipped->skipped : \
    CLG_(cost_base) + current_ii->cost_offset; \
 \
  if (memline == 0) return LL_Hit; \
  return cacheuse_LL_access(memline, loaded); \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
  int i;
  InstrInfo ii = { 0,0,0,0 };

  if (!CLG_(current_state).collect) return;

  CLG_(bb_base)   = 0;
  current_ii      = &ii;
  CLG_(cost_base) = 0;

  /* update usage counters */
  if (I1.use)
    for (i = 0; i < I1.sets * I1.assoc; i++)
      if (I1.loaded[i].use_base)
        update_I1_use( &I1, i, 0,0);

  if (D1.use)
    for (i = 0; i < D1.sets * D1.assoc; i++)
      if (D1.loaded[i].use_base)
        update_D1_use( &D1, i, 0,0);

  if (LL.use)
    for (i = 0; i < LL.sets * LL.assoc; i++)
      if (LL.loaded[i].use_base)
        update_LL_use(i, 0);
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
    switch(r) {
        case WriteBackMemAccess:
            if (clo_simulate_writeback) {
                c1[3]++;
                c2[3]++;
            }
            // fall through

        case MemAccess:
            c1[2]++;
            c2[2]++;
            // fall through

        case LL_Hit:
            c1[1]++;
            c2[1]++;
            // fall through

        default:
            c1[0]++;
            c2[0]++;
    }
}

static
Char* cacheRes(CacheModelResult r)
{
    switch(r) {
    case L1_Hit:    return "L1 Hit ";
    case LL_Hit:    return "LL Hit ";
    case MemAccess: return "LL Miss";
    case WriteBackMemAccess: return "LL Miss (dirty)";
    default:
        tl_assert(0);
    }
    return "??";
}
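
/* Layout of the per-event-group cost arrays incremented by inc_costs()
 * via the fall-throughs above (matching the registration order in
 * CLG_(init_eventsets)()):
 *   [0] accesses, [1] L1 misses, [2] LL misses,
 *   [3] dirty LL misses (write-back simulation only).
 * E.g. a single LL miss increments [0], [1] and [2].
 */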
VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
    CacheModelResult IrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);

    CLG_DEBUG(6, "log_1I0D:  Ir %#lx/%u => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));

    if (CLG_(current_state).collect) {
        ULong* cost_Ir;

        if (CLG_(current_state).nonskipped)
            cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
        else
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + fullOffset(EG_IR) );
    }
}

VG_REGPARM(2)
static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
{
    CacheModelResult Ir1Res, Ir2Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);

    CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
        ULong* skipped_cost_Ir =
            CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);

        inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
        inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
        return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
}

VG_REGPARM(3)
static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
{
    CacheModelResult Ir1Res, Ir2Res, Ir3Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
    current_ii = ii3;
    Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);

    CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
              CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
        ULong* skipped_cost_Ir =
            CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
        inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
        inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
        inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
        return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
    inc_costs(Ir3Res, global_cost_Ir,
              CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
}
/* Instruction doing a read access */

VG_REGPARM(3)
static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dr: Ir %#lx/%u => %s, Dr %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
              data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dr;

        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
            cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
        }
        else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + fullOffset(EG_IR) );
        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


VG_REGPARM(3)
static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DrRes;

    current_ii = ii;
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dr: Dr %#lx/%lu => %s\n",
              data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Dr;

        if (CLG_(current_state).nonskipped)
            cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
        else
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];

        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


/* Instruction doing a write access */

VG_REGPARM(3)
static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DwRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dw: Ir %#lx/%u => %s, Dw %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
              data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dw;

        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
            cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
        }
        else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + fullOffset(EG_IR) );
        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}
VG_REGPARM(3)
static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DwRes;

    current_ii = ii;
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dw: Dw %#lx/%lu => %s\n",
              data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Dw;

        if (CLG_(current_state).nonskipped)
            cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
        else
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];

        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}



/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 })

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;


// Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
// string otherwise.
static Char* check_cache(cache_t* cache)
{
   // Simulator requires line size and set count to be powers of two.
   if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
       (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
   {
      return "Cache set count is not a power of two.\n";
   }

   // Simulator requires line size to be a power of two.
   if (-1 == VG_(log2)(cache->line_size)) {
      return "Cache line size is not a power of two.\n";
   }

   // Then check line size >= 16 -- any smaller and a single instruction could
   // straddle three cache lines, which breaks a simulation assertion and is
   // stupid anyway.
   if (cache->line_size < MIN_LINE_SIZE) {
      return "Cache line size is too small.\n";
   }

   /* Then check cache size > line size (causes seg faults if not). */
   if (cache->size <= cache->line_size) {
      return "Cache size <= line size.\n";
   }

   /* Then check assoc <= (size / line size) (seg faults otherwise). */
   if (cache->assoc > (cache->size / cache->line_size)) {
      return "Cache associativity > (size / line size).\n";
   }

   return NULL;
}

static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
#define DEFINED(L)   (-1 != L.size || -1 != L.assoc || -1 != L.line_size)

   Char* checkRes;

   Bool all_caches_clo_defined =
      (DEFINED(clo_I1_cache) &&
       DEFINED(clo_D1_cache) &&
       DEFINED(clo_LL_cache));

   // Set the cache config (using auto-detection, if supported by the
   // architecture).
   VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );

   // Check the default/auto-detected values.
   checkRes = check_cache(I1c);  tl_assert(!checkRes);
   checkRes = check_cache(D1c);  tl_assert(!checkRes);
   checkRes = check_cache(LLc);  tl_assert(!checkRes);

   // Then replace with any defined on the command line.
   if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
   if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
   if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }

   if (VG_(clo_verbosity) > 1) {
      VG_(umsg)("Cache configuration used:\n");
      VG_(umsg)("  I1: %dB, %d-way, %dB lines\n",
                I1c->size, I1c->assoc, I1c->line_size);
      VG_(umsg)("  D1: %dB, %d-way, %dB lines\n",
                D1c->size, D1c->assoc, D1c->line_size);
      VG_(umsg)("  LL: %dB, %d-way, %dB lines\n",
                LLc->size, LLc->assoc, LLc->line_size);
   }
#undef DEFINED
}
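
/* Illustration (hypothetical option values): --D1=32768,8,64 passes
 * check_cache(): 32768/64/8 = 64 sets, a power of two; 64 is a
 * power-of-two line size >= MIN_LINE_SIZE; 32768 > 64; and assoc
 * 8 <= 32768/64. A non-power-of-two line size or set count would be
 * rejected with the corresponding message above.
 */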
/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
  /* Cache configurations. */
  cache_t I1c, D1c, LLc;

  /* Initialize access handlers */
  if (!CLG_(clo).simulate_cache) {
    CLG_(cachesim).log_1I0D  = 0;
    CLG_(cachesim).log_1I0D_name = "(no function)";
    CLG_(cachesim).log_2I0D  = 0;
    CLG_(cachesim).log_2I0D_name = "(no function)";
    CLG_(cachesim).log_3I0D  = 0;
    CLG_(cachesim).log_3I0D_name = "(no function)";

    CLG_(cachesim).log_1I1Dr = 0;
    CLG_(cachesim).log_1I1Dr_name = "(no function)";
    CLG_(cachesim).log_1I1Dw = 0;
    CLG_(cachesim).log_1I1Dw_name = "(no function)";

    CLG_(cachesim).log_0I1Dr = 0;
    CLG_(cachesim).log_0I1Dr_name = "(no function)";
    CLG_(cachesim).log_0I1Dw = 0;
    CLG_(cachesim).log_0I1Dw_name = "(no function)";
    return;
  }

  /* Configuration of caches is only needed with real cache simulation */
  configure_caches(&I1c, &D1c, &LLc);

  I1.name = "I1";
  D1.name = "D1";
  LL.name = "LL";

  cachesim_initcache(I1c, &I1);
  cachesim_initcache(D1c, &D1);
  cachesim_initcache(LLc, &LL);

  /* the other cache simulators use the standard helpers
   * with dispatching via simulator struct */

  CLG_(cachesim).log_1I0D  = log_1I0D;
  CLG_(cachesim).log_1I0D_name  = "log_1I0D";
  CLG_(cachesim).log_2I0D  = log_2I0D;
  CLG_(cachesim).log_2I0D_name  = "log_2I0D";
  CLG_(cachesim).log_3I0D  = log_3I0D;
  CLG_(cachesim).log_3I0D_name  = "log_3I0D";

  CLG_(cachesim).log_1I1Dr = log_1I1Dr;
  CLG_(cachesim).log_1I1Dw = log_1I1Dw;
  CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
  CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";

  CLG_(cachesim).log_0I1Dr = log_0I1Dr;
  CLG_(cachesim).log_0I1Dw = log_0I1Dw;
  CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
  CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";

  if (clo_collect_cacheuse) {

      /* Output warning for unsupported option combinations */
      if (clo_simulate_hwpref) {
          VG_(message)(Vg_DebugMsg,
                       "warning: prefetch simulation cannot be "
                       "used with cache usage\n");
          clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
          VG_(message)(Vg_DebugMsg,
                       "warning: write-back simulation cannot be "
                       "used with cache usage\n");
          clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      simulator.D1_Write = cacheuse_D1_doRead;
      return;
  }

  if (clo_simulate_hwpref) {
    prefetch_clear();

    if (clo_simulate_writeback) {
      simulator.I1_Read  = prefetch_I1_Read;
      simulator.D1_Read  = prefetch_D1_Read;
      simulator.D1_Write = prefetch_D1_Write;
    }
    else {
      simulator.I1_Read  = prefetch_I1_ref;
      simulator.D1_Read  = prefetch_D1_ref;
      simulator.D1_Write = prefetch_D1_ref;
    }

    return;
  }

  if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
  }
  else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
  }
}
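
/* Summary of the handler dispatch above:
 *   --cacheuse=yes                            -> cacheuse_{I1,D1}_doRead
 *                                                (also used for writes)
 *   --simulate-hwpref=yes + --simulate-wb=yes -> prefetch_{I1,D1}_Read/Write
 *   --simulate-hwpref=yes                     -> prefetch_{I1,D1}_ref
 *   --simulate-wb=yes                         -> cachesim_{I1,D1}_Read/Write
 *   default                                   -> cachesim_{I1,D1}_ref
 */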
/* Clear simulator state. Has to be initialized before. */
static
void cachesim_clear(void)
{
  cachesim_clearcache(&I1);
  cachesim_clearcache(&D1);
  cachesim_clearcache(&LL);

  prefetch_clear();
}


static void cachesim_getdesc(Char* buf)
{
  Int p;
  p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
  p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
  VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
}

static
void cachesim_print_opts(void)
{
  VG_(printf)(
"\n   cache simulator options (their usage enables cache simulation):\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n"
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
              );
}

static void parse_opt ( cache_t* cache,
                        char* opt, Char* optval, UChar kind )
{
   Long i1, i2, i3;
   Char* endptr;
   Char* checkRes;

   // Option argument looks like "65536,2,64".  Extract them.
   i1 = VG_(strtoll10)(optval,   &endptr); if (*endptr != ',')  goto bad;
   i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',')  goto bad;
   i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;

   // Check for overflow.
   cache->size      = (Int)i1;
   cache->assoc     = (Int)i2;
   cache->line_size = (Int)i3;
   if (cache->size      != i1) goto overflow;
   if (cache->assoc     != i2) goto overflow;
   if (cache->line_size != i3) goto overflow;

   checkRes = check_cache(cache);
   if (checkRes) {
      VG_(fmsg)("%s", checkRes);
      goto bad;
   }

   return;

  bad:
   VG_(fmsg_bad_option)(opt, "");

  overflow:
   VG_(fmsg_bad_option)(opt,
      "One of the cache parameters was too large and overflowed.\n");
}

/* Check for command line option for cache configuration.
 * Return False if unknown and not handled.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(Char* arg)
{
   Char* tmp_str;

   if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
   else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
   else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}

   else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
      if (clo_collect_cacheuse) {
         /* Use counters only make sense with fine-grained dumping */
         CLG_(clo).dump_instr = True;
      }
   }

   else if VG_STR_CLO(arg, "--I1", tmp_str)
      parse_opt(&clo_I1_cache, arg, tmp_str, 'i');
   else if VG_STR_CLO(arg, "--D1", tmp_str)
      parse_opt(&clo_D1_cache, arg, tmp_str, '1');
   else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
            VG_STR_CLO(arg, "--LL", tmp_str))
      parse_opt(&clo_LL_cache, arg, tmp_str, '2');
   else
      return False;

   return True;
}

/* Adds commas to ULong, right justifying in a field field_width wide, returns
 * the string in buf.
 */
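
/* Worked example (illustrative values): commify(1234567, 12, buf)
 * yields "   1,234,567", i.e. 7 digits plus 2 commas right-justified
 * in a 12-character field, and returns the significant length 9.
 */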
static
Int commify(ULong n, int field_width, char* buf)
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than its size */
   if (space < 0) space = 0;

   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
    * of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      if ((i>0) && (3 == ++j)) {
         j = 0;
         n_commas--;
         buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++)  buf[i] = ' ';
   return new_len;
}

static
void percentify(Int n, Int ex, Int field_width, char buf[])
{
   int i, len, space;

   VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0;  /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for (     ; i >= 0; i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}
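
/* Worked example (illustrative values): with ex = 10, as used for the
 * D cache rates below, percentify(123, 10, 7, buf) prints 123/10 = 12
 * and 123%10 = 3 as "  12.3%", right-justified in a 7-character field.
 */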
static
void cachesim_printstat(Int l1, Int l2, Int l3)
{
  FullCost total = CLG_(total_cost), D_total = 0;
  ULong LL_total_m, LL_total_mr, LL_total_mw,
        LL_total, LL_total_r, LL_total_w;
  char buf1[RESULTS_BUF_LEN],
       buf2[RESULTS_BUF_LEN],
       buf3[RESULTS_BUF_LEN];
  Int p;

  if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) {
    VG_(message)(Vg_DebugMsg, "Prefetch Up:    %llu\n",
                 prefetch_up);
    VG_(message)(Vg_DebugMsg, "Prefetch Down:  %llu\n",
                 prefetch_down);
    VG_(message)(Vg_DebugMsg, "\n");
  }

  commify(total[fullOffset(EG_IR) +1], l1, buf1);
  VG_(message)(Vg_UserMsg, "I1  misses:    %s\n", buf1);

  commify(total[fullOffset(EG_IR) +2], l1, buf1);
  VG_(message)(Vg_UserMsg, "LLi misses:    %s\n", buf1);

  p = 100;

  if (0 == total[fullOffset(EG_IR)])
    total[fullOffset(EG_IR)] = 1;

  percentify(total[fullOffset(EG_IR)+1] * 100 * p /
             total[fullOffset(EG_IR)], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "I1  miss rate: %s\n", buf1);

  percentify(total[fullOffset(EG_IR)+2] * 100 * p /
             total[fullOffset(EG_IR)], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
  VG_(message)(Vg_UserMsg, "\n");

  /* D cache results.
   * Use the D_refs.rd and D_refs.wr values to determine the
   * width of columns 2 & 3.
   */

  D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
  CLG_(init_cost)( CLG_(sets).full, D_total);
  // we only use the first 3 values of D_total, adding up Dr and Dw costs
  CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
  CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );

  commify( D_total[0], l1, buf1);
  commify(total[fullOffset(EG_DR)], l2, buf2);
  commify(total[fullOffset(EG_DW)], l3, buf3);
  VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  commify( D_total[1], l1, buf1);
  commify(total[fullOffset(EG_DR)+1], l2, buf2);
  commify(total[fullOffset(EG_DW)+1], l3, buf3);
  VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  commify( D_total[2], l1, buf1);
  commify(total[fullOffset(EG_DR)+2], l2, buf2);
  commify(total[fullOffset(EG_DW)+2], l3, buf3);
  VG_(message)(Vg_UserMsg, "LLd misses:    %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  p = 10;

  if (0 == D_total[0])               D_total[0] = 1;
  if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
  if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;

  percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
  percentify(total[fullOffset(EG_DR)+1] * 100 * p /
             total[fullOffset(EG_DR)], p, l2+1, buf2);
  percentify(total[fullOffset(EG_DW)+1] * 100 * p /
             total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )\n",
               buf1, buf2, buf3);

  percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
  percentify(total[fullOffset(EG_DR)+2] * 100 * p /
             total[fullOffset(EG_DR)], p, l2+1, buf2);
  percentify(total[fullOffset(EG_DW)+2] * 100 * p /
             total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s   + %s  )\n",
               buf1, buf2, buf3);
  VG_(message)(Vg_UserMsg, "\n");



  /* LL overall results */

  LL_total =
    total[fullOffset(EG_DR) +1] +
    total[fullOffset(EG_DW) +1] +
    total[fullOffset(EG_IR) +1];
  LL_total_r =
    total[fullOffset(EG_DR) +1] +
    total[fullOffset(EG_IR) +1];
  LL_total_w = total[fullOffset(EG_DW) +1];
  commify(LL_total,   l1, buf1);
  commify(LL_total_r, l2, buf2);
  commify(LL_total_w, l3, buf3);
  VG_(message)(Vg_UserMsg, "LL refs:       %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  LL_total_m =
    total[fullOffset(EG_DR) +2] +
    total[fullOffset(EG_DW) +2] +
    total[fullOffset(EG_IR) +2];
  LL_total_mr =
    total[fullOffset(EG_DR) +2] +
    total[fullOffset(EG_IR) +2];
  LL_total_mw = total[fullOffset(EG_DW) +2];
  commify(LL_total_m,  l1, buf1);
  commify(LL_total_mr, l2, buf2);
  commify(LL_total_mw, l3, buf3);
  VG_(message)(Vg_UserMsg, "LL misses:     %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  percentify(LL_total_m * 100 * p /
             (total[fullOffset(EG_IR)] + D_total[0]), p, l1+1, buf1);
  percentify(LL_total_mr * 100 * p /
             (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
             p, l2+1, buf2);
  percentify(LL_total_mw * 100 * p /
             total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "LL miss rate:  %s (%s   + %s  )\n",
               buf1, buf2, buf3);
}
/*------------------------------------------------------------*/
/*--- Setup for Event sets                                 ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)()
{
    // Event groups from which the event sets are composed;
    // the "Use" group is only used with "cacheuse" simulation
    if (clo_collect_cacheuse)
        CLG_(register_event_group4)(EG_USE,
                                    "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");

    if (!CLG_(clo).simulate_cache)
        CLG_(register_event_group)(EG_IR, "Ir");
    else if (!clo_simulate_writeback) {
        CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
        CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
        CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
    }
    else { // clo_simulate_writeback
        CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
        CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
        CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
    }

    if (CLG_(clo).simulate_branch) {
        CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
        CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
    }

    if (CLG_(clo).collect_bus)
        CLG_(register_event_group)(EG_BUS, "Ge");

    if (CLG_(clo).collect_alloc)
        CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");

    if (CLG_(clo).collect_systime)
        CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");

    // event set used as base for instruction self cost
    CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);

    // event set comprising all event groups, used for inclusive cost
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
    CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);

    CLG_DEBUGIF(1) {
        CLG_DEBUG(1, "EventSets:\n");
        CLG_(print_eventset)(-2, CLG_(sets).base);
        CLG_(print_eventset)(-2, CLG_(sets).full);
    }

    /* Non-existing events are silently ignored */
    CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
    CLG_(append_event)(CLG_(dumpmap), "Ir");
    CLG_(append_event)(CLG_(dumpmap), "Dr");
    CLG_(append_event)(CLG_(dumpmap), "Dw");
    CLG_(append_event)(CLG_(dumpmap), "I1mr");
    CLG_(append_event)(CLG_(dumpmap), "D1mr");
    CLG_(append_event)(CLG_(dumpmap), "D1mw");
    CLG_(append_event)(CLG_(dumpmap), "ILmr");
    CLG_(append_event)(CLG_(dumpmap), "DLmr");
    CLG_(append_event)(CLG_(dumpmap), "DLmw");
    CLG_(append_event)(CLG_(dumpmap), "ILdmr");
    CLG_(append_event)(CLG_(dumpmap), "DLdmr");
    CLG_(append_event)(CLG_(dumpmap), "DLdmw");
    CLG_(append_event)(CLG_(dumpmap), "Bc");
    CLG_(append_event)(CLG_(dumpmap), "Bcm");
    CLG_(append_event)(CLG_(dumpmap), "Bi");
    CLG_(append_event)(CLG_(dumpmap), "Bim");
    CLG_(append_event)(CLG_(dumpmap), "AcCost1");
    CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
    CLG_(append_event)(CLG_(dumpmap), "AcCost2");
    CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
    CLG_(append_event)(CLG_(dumpmap), "Ge");
    CLG_(append_event)(CLG_(dumpmap), "allocCount");
    CLG_(append_event)(CLG_(dumpmap), "allocSize");
    CLG_(append_event)(CLG_(dumpmap), "sysCount");
    CLG_(append_event)(CLG_(dumpmap), "sysTime");
}
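
/* Illustration: with the default write-through simulation the EG_IR
 * group above is registered as (Ir, I1mr, ILmr), so the offsets +1
 * and +2 used in cachesim_printstat() pick the L1 and LL miss
 * counters relative to fullOffset(EG_IR); EG_DR/EG_DW behave alike.
 */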
/* this is called at dump time for every instruction executed */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
                               InstrInfo* ii, ULong exe_count)
{
    if (!CLG_(clo).simulate_cache)
        cost[ fullOffset(EG_IR) ] += exe_count;

    if (ii->eventset)
        CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
                                  ii->eventset, bbcc->cost + ii->cost_offset);
}

static
void cachesim_finish(void)
{
  if (clo_collect_cacheuse)
    cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file                   ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
  .print_opts    = cachesim_print_opts,
  .parse_opt     = cachesim_parse_opt,
  .post_clo_init = cachesim_post_clo_init,
  .clear         = cachesim_clear,
  .getdesc       = cachesim_getdesc,
  .printstat     = cachesim_printstat,
  .add_icost     = cachesim_add_icost,
  .finish        = cachesim_finish,

  /* these will be set by cachesim_post_clo_init */
  .log_1I0D  = 0,
  .log_2I0D  = 0,
  .log_3I0D  = 0,

  .log_1I1Dr = 0,
  .log_1I1Dw = 0,

  .log_0I1Dr = 0,
  .log_0I1Dw = 0,

  .log_1I0D_name  = "(no function)",
  .log_2I0D_name  = "(no function)",
  .log_3I0D_name  = "(no function)",

  .log_1I1Dr_name = "(no function)",
  .log_1I1Dw_name = "(no function)",

  .log_0I1Dr_name = "(no function)",
  .log_0I1Dw_name = "(no function)",
};


/*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
/*--------------------------------------------------------------------*/