Home | History | Annotate | Download | only in esan
      1 //===-- working_set.cpp ---------------------------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file is a part of EfficiencySanitizer, a family of performance tuners.
     11 //
     12 // This file contains working-set-specific code.
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "working_set.h"
     16 #include "esan.h"
     17 #include "esan_circular_buffer.h"
     18 #include "esan_flags.h"
     19 #include "esan_shadow.h"
     20 #include "esan_sideline.h"
     21 #include "sanitizer_common/sanitizer_procmaps.h"
     22 
// We shadow every cache line of app memory with one shadow byte.
// - The highest bit of each shadow byte indicates whether the corresponding
//   cache line has ever been accessed.
// - The lowest bit of each shadow byte indicates whether the corresponding
//   cache line was accessed since the last sample.
// - The other bits are used for working set snapshots at successively
//   lower frequencies: each bit to the left of the lowest bit steps the
//   frequency down by a factor of 2 to the power of
//   getFlags()->snapshot_step.
// Thus, taking snapshot_step == 2 as an example, we have something like this:
//   Bit 0: Since last sample
//   Bit 1: Since last 2^2 samples
//   Bit 2: Since last 2^4 samples
//   Bit 3: ...
//   Bit 7: Ever accessed.
// We live with races in accessing each shadow byte.
typedef unsigned char byte;
     39 
     40 namespace __esan {
     41 
// Our shadow memory assumes that the line size is 64.
static const u32 CacheLineSize = 64;

// See the shadow byte layout description above.
// Index of the shadow-byte bit that records "ever accessed".
static const u32 TotalWorkingSetBitIdx = 7;
// We accumulate to the left until we hit this bit.
// We don't need to accumulate to the final bit as it's set on each ref
// by the compiler instrumentation.
static const u32 MaxAccumBitIdx = 6;
// Index of the shadow-byte bit that records "accessed since the last sample".
static const u32 CurWorkingSetBitIdx = 0;
// Value OR-ed into a shadow byte when its cache line is accessed: it sets
// both the ever-accessed bit and the since-last-sample bit.
static const byte ShadowAccessedVal =
  (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);

// Sideline thread; launched with takeSample as its callback when snapshot
// recording is enabled.
static SidelineThread Thread;
// Number of samples taken so far (incremented once per takeSample call).
// If we use real-time-based timer samples this won't overflow in any realistic
// scenario, but if we switch to some other unit (such as memory accesses) we
// may want to consider a 64-bit int.
static u32 SnapshotNum;

// We store the wset size for each of 8 different sampling frequencies.
static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
// We cannot use static objects as the global destructor is called
// prior to our finalize routine.
// These are each circular buffers, sized up front.
CircularBuffer<u32> SizePerFreq[NumFreq];
// We cannot rely on static initializers (they may run too late) but
// we record the size here for clarity:
// Entry i is the size handed to SizePerFreq[i].initialize().
u32 CircularBufferSizes[NumFreq] = {
  // These are each mmap-ed so our minimum is one page.
  32*1024,
  16*1024,
  8*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
  4*1024,
};
     80 
     81 void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
     82                                   bool IsWrite) {
     83   if (Size == 0)
     84     return;
     85   SIZE_T I = 0;
     86   uptr LineSize = getFlags()->cache_line_size;
     87   // As Addr+Size could overflow at the top of a 32-bit address space,
     88   // we avoid the simpler formula that rounds the start and end.
     89   SIZE_T NumLines = Size / LineSize +
     90     // Add any extra at the start or end adding on an extra line:
     91     (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize;
     92   byte *Shadow = (byte *)appToShadow(Addr);
     93   // Write shadow bytes until we're word-aligned.
     94   while (I < NumLines && (uptr)Shadow % 4 != 0) {
     95     if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
     96       *Shadow |= ShadowAccessedVal;
     97     ++Shadow;
     98     ++I;
     99   }
    100   // Write whole shadow words at a time.
    101   // Using a word-stride loop improves the runtime of a microbenchmark of
    102   // memset calls by 10%.
    103   u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 |
    104     ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
    105   while (I + 4 <= NumLines) {
    106     if ((*(u32*)Shadow & WordValue) != WordValue)
    107       *(u32*)Shadow |= WordValue;
    108     Shadow += 4;
    109     I += 4;
    110   }
    111   // Write any trailing shadow bytes.
    112   while (I < NumLines) {
    113     if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
    114       *Shadow |= ShadowAccessedVal;
    115     ++Shadow;
    116     ++I;
    117   }
    118 }
    119 
    120 // This routine will word-align ShadowStart and ShadowEnd prior to scanning.
    121 // It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
    122 // measures the access during the entire execution and should never be cleared.
    123 static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
    124                                      uptr ShadowEnd) {
    125   u32 WorkingSetSize = 0;
    126   u32 ByteValue = 0x1 << BitIdx;
    127   u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 |
    128     ByteValue << 24;
    129   // Get word aligned start.
    130   ShadowStart = RoundDownTo(ShadowStart, sizeof(u32));
    131   bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
    132   // Do not clear the bit that measures access during the entire execution.
    133   bool Clear = BitIdx < TotalWorkingSetBitIdx;
    134   for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) {
    135     if ((*Ptr & WordValue) != 0) {
    136       byte *BytePtr = (byte *)Ptr;
    137       for (u32 j = 0; j < sizeof(u32); ++j) {
    138         if (BytePtr[j] & ByteValue) {
    139           ++WorkingSetSize;
    140           if (Accum) {
    141             // Accumulate to the lower-frequency bit to the left.
    142             BytePtr[j] |= (ByteValue << 1);
    143           }
    144         }
    145       }
    146       if (Clear) {
    147         // Clear this bit from every shadow byte.
    148         *Ptr &= ~WordValue;
    149       }
    150     }
    151   }
    152   return WorkingSetSize;
    153 }
    154 
    155 // Scan shadow memory to calculate the number of cache lines being accessed,
    156 // i.e., the number of non-zero bits indexed by BitIdx in each shadow byte.
    157 // We also clear the lowest bits (most recent working set snapshot).
    158 // We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
    159 // measures the access during the entire execution and should never be cleared.
    160 static u32 computeWorkingSizeAndReset(u32 BitIdx) {
    161   u32 WorkingSetSize = 0;
    162   MemoryMappingLayout MemIter(true/*cache*/);
    163   uptr Start, End, Prot;
    164   while (MemIter.Next(&Start, &End, nullptr/*offs*/, nullptr/*file*/,
    165                       0/*file size*/, &Prot)) {
    166     VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n",
    167             __FUNCTION__, Start, End, Prot, isAppMem(Start),
    168             isShadowMem(Start));
    169     if (isShadowMem(Start) && (Prot & MemoryMappingLayout::kProtectionWrite)) {
    170       VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Start, End);
    171       WorkingSetSize += countAndClearShadowValues(BitIdx, Start, End);
    172     }
    173   }
    174   return WorkingSetSize;
    175 }
    176 
    177 // This is invoked from a signal handler but in a sideline thread doing nothing
    178 // else so it is a little less fragile than a typical signal handler.
    179 static void takeSample(void *Arg) {
    180   u32 BitIdx = CurWorkingSetBitIdx;
    181   u32 Freq = 1;
    182   ++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
    183   while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) {
    184     u32 NumLines = computeWorkingSizeAndReset(BitIdx);
    185     VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
    186             SnapshotNum, BitIdx, Freq, NumLines);
    187     SizePerFreq[BitIdx].push_back(NumLines);
    188     Freq = Freq << getFlags()->snapshot_step;
    189     BitIdx++;
    190   }
    191 }
    192 
// Initialization that must be done before any instrumented code is executed.
// Checks that the cache_line_size flag equals the CacheLineSize constant this
// file is written for, then calls registerMemoryFaultHandler().
void initializeShadowWorkingSet() {
  CHECK(getFlags()->cache_line_size == CacheLineSize);
  registerMemoryFaultHandler();
}
    198 
    199 void initializeWorkingSet() {
    200   if (getFlags()->record_snapshots) {
    201     for (u32 i = 0; i < NumFreq; ++i)
    202       SizePerFreq[i].initialize(CircularBufferSizes[i]);
    203     Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
    204   }
    205 }
    206 
    207 static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
    208   if (MilliSec > 600000) {
    209     Unit = "min";
    210     return MilliSec / 60000;
    211   } else if (MilliSec > 10000) {
    212     Unit = "sec";
    213     return MilliSec / 1000;
    214   } else {
    215     Unit = "ms";
    216     return MilliSec;
    217   }
    218 }
    219 
    220 static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
    221   // We need a constant to avoid software divide support:
    222   static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
    223   static const u32 MegabyteCachelines = KilobyteCachelines << 10;
    224 
    225   if (NumOfCachelines > 10 * MegabyteCachelines) {
    226     Unit = "MB";
    227     return NumOfCachelines / MegabyteCachelines;
    228   } else if (NumOfCachelines > 10 * KilobyteCachelines) {
    229     Unit = "KB";
    230     return NumOfCachelines / KilobyteCachelines;
    231   } else {
    232     Unit = "Bytes";
    233     return NumOfCachelines * CacheLineSize;
    234   }
    235 }
    236 
    237 void reportWorkingSet() {
    238   const char *Unit;
    239   if (getFlags()->record_snapshots) {
    240     u32 Freq = 1;
    241     Report(" Total number of samples: %u\n", SnapshotNum);
    242     for (u32 i = 0; i < NumFreq; ++i) {
    243       u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
    244       Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
    245       // FIXME: report whether we wrapped around and thus whether we
    246       // have data on the whole run or just the last N samples.
    247       for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
    248         u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
    249         Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
    250                SizePerFreq[i][j]);
    251       }
    252       Freq = Freq << getFlags()->snapshot_step;
    253     }
    254   }
    255 
    256   // Get the working set size for the entire execution.
    257   u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
    258   u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
    259   Report(" %s: the total working set size: %u %s (%u cache lines)\n",
    260          SanitizerToolName, Size, Unit, NumOfCachelines);
    261 }
    262 
    263 int finalizeWorkingSet() {
    264   if (getFlags()->record_snapshots)
    265     Thread.joinThread();
    266   reportWorkingSet();
    267   if (getFlags()->record_snapshots) {
    268     for (u32 i = 0; i < NumFreq; ++i)
    269       SizePerFreq[i].free();
    270   }
    271   return 0;
    272 }
    273 
    274 } // namespace __esan
    275