// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// instrumentation.h: contains the definitions needed to
// instrument code for profiling:
//   ScopedProfilingLabel, RegisterCurrentThreadForProfiling.
//
// profiler.h is only needed to drive the profiler:
//   StartProfiling, FinishProfiling.
//
// See the usage example in profiler.h.
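//
// A minimal sketch of how the two pieces above fit together (illustrative
// only; the authoritative example is the one in profiler.h, and the worker
// function name below is made up):
//
//   void SomeWorkerThreadFunc() {
//     gemmlowp::RegisterCurrentThreadForProfiling();
//     gemmlowp::ScopedProfilingLabel label("worker: main loop");
//     // ... code executed while 'label' is in scope is attributed to
//     // "worker: main loop" in the profile output.
//   }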

#ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_
#define GEMMLOWP_PROFILING_INSTRUMENTATION_H_

#include <pthread.h>
#include <cstdio>

#ifndef GEMMLOWP_USE_STLPORT
#include <cstdint>
#else
#include <stdint.h>
namespace std {
using ::uint8_t;
using ::uint16_t;
using ::uint32_t;
using ::int8_t;
using ::int16_t;
using ::int32_t;
using ::size_t;
using ::uintptr_t;
}
#endif

#include <algorithm>
#include <cassert>
#include <cstdlib>

#ifdef GEMMLOWP_PROFILING
#include <cstring>
#include <set>
#endif

// Ideally we would always use C++11 thread_local; unfortunately, it
// isn't fully supported on Apple yet, so there we fall back to __thread.
#ifdef __APPLE__
#define GEMMLOWP_THREAD_LOCAL static __thread
#define GEMMLOWP_USING_OLD_THREAD_LOCAL
#else
#define GEMMLOWP_THREAD_LOCAL thread_local
#endif

namespace gemmlowp {

inline void ReleaseBuildAssertion(bool condition, const char* msg) {
  if (!condition) {
    fprintf(stderr, "gemmlowp error: %s\n", msg);
    abort();
  }
}

// To be used as the template parameter for GlobalLock.
// GlobalLock<ProfilerLockId> is the profiler's global lock:
// registering threads, starting profiling, finishing profiling, and
// the profiler itself as it samples threads all need to lock it.
struct ProfilerLockId;

// A very plain global lock. Templated in LockId so we can have multiple
// locks, one for each LockId type.
template <typename LockId>
class GlobalLock {
  static pthread_mutex_t* Mutex() {
    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    return &m;
  }

 public:
  static void Lock() { pthread_mutex_lock(Mutex()); }
  static void Unlock() { pthread_mutex_unlock(Mutex()); }
};

// A very simple RAII helper to lock and unlock a GlobalLock
template <typename LockId>
struct AutoGlobalLock {
  AutoGlobalLock() { GlobalLock<LockId>::Lock(); }
  ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); }
};
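
// Typical use (a minimal, illustrative sketch): construct an AutoGlobalLock
// on the stack to hold the profiler global lock for the rest of the scope.
//
//   {
//     AutoGlobalLock<ProfilerLockId> lock;
//     // ... operations that require the profiler global lock ...
//   }  // the lock is released here, when 'lock' goes out of scope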

// MemoryBarrier is purely a compile-time construct; it tells the compiler
// two things:
//   1) It prevents reordering code across it
//      (thanks to the 'volatile' after 'asm').
//   2) It requires the compiler to assume that any value previously
//      read from memory may have changed. Thus it offers an alternative
//      to using 'volatile' variables.
inline void MemoryBarrier() { asm volatile("" ::: "memory"); }
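
// For instance, ProfilingStack below places a MemoryBarrier between the
// store of a label and the increment of 'size', so the compiler cannot
// reorder those stores; the profiler thread, which reads these fields
// asynchronously, should then never observe a 'size' that points past
// the labels actually written.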

// Profiling definitions. Two paths: when profiling is enabled,
// and when profiling is disabled.
#ifdef GEMMLOWP_PROFILING
// This code path is when profiling is enabled.

// A pseudo call stack. Unlike a real call stack, this only
// contains pointers to literal strings that were manually entered
// in the instrumented code (see ScopedProfilingLabel).
struct ProfilingStack {
  static const std::size_t kMaxSize = 15;
  typedef const char* LabelsArrayType[kMaxSize];
  LabelsArrayType labels;
  std::size_t size;

  ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); }

  void Push(const char* label) {
    MemoryBarrier();
    ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow");
    labels[size] = label;
    MemoryBarrier();
    size++;
    MemoryBarrier();
  }

  void Pop() {
    MemoryBarrier();
    ReleaseBuildAssertion(size > 0, "ProfilingStack underflow");
    size--;
    MemoryBarrier();
  }

  void UpdateTop(const char* new_label) {
    MemoryBarrier();
    assert(size);
    labels[size - 1] = new_label;
    MemoryBarrier();
  }

  ProfilingStack& operator=(const ProfilingStack& other) {
    memcpy(this, &other, sizeof(ProfilingStack));
    return *this;
  }

  bool operator==(const ProfilingStack& other) const {
    return !memcmp(this, &other, sizeof(ProfilingStack));
  }
};

static_assert(
    !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)),
    "ProfilingStack should have power-of-two size to fit in cache lines");

struct ThreadInfo;

// The global set of threads being profiled.
inline std::set<ThreadInfo*>& ThreadsUnderProfiling() {
  static std::set<ThreadInfo*> v;
  return v;
}

struct ThreadInfo {
  pthread_key_t key;  // used only to get a callback at thread exit.
  ProfilingStack stack;

  ThreadInfo() {
    pthread_key_create(&key, ThreadExitCallback);
    pthread_setspecific(key, this);
  }

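  // Called through the pthread key destructor when a thread that owns a
  // ThreadInfo exits: removes the thread from the set of threads under
  // profiling so the sampler no longer reads its stack.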
  static void ThreadExitCallback(void* ptr) {
    AutoGlobalLock<ProfilerLockId> lock;
    ThreadInfo* self = static_cast<ThreadInfo*>(ptr);
    ThreadsUnderProfiling().erase(self);
    pthread_key_delete(self->key);
  }
};

inline ThreadInfo& ThreadLocalThreadInfo() {
#ifdef GEMMLOWP_USING_OLD_THREAD_LOCAL
  // We're leaking this ThreadInfo structure because Apple doesn't support
  // non-trivial constructors or destructors for its __thread storage
  // specifier.
  GEMMLOWP_THREAD_LOCAL ThreadInfo* i = nullptr;
  if (i == nullptr) {
    i = new ThreadInfo();
  }
  return *i;
#else
  GEMMLOWP_THREAD_LOCAL ThreadInfo i;
  return i;
#endif
}

// ScopedProfilingLabel is how one instruments code for profiling
// with this profiler. Construct local ScopedProfilingLabel variables,
// passing a literal string describing the local code. Profile
// samples taken while it is in scope will then be annotated with this
// label; that is what "Scoped" refers to (the RAII idiom).
// See the example in profiler.h.
class ScopedProfilingLabel {
  ProfilingStack* profiling_stack_;

 public:
  explicit ScopedProfilingLabel(const char* label)
      : profiling_stack_(&ThreadLocalThreadInfo().stack) {
    profiling_stack_->Push(label);
  }

  ~ScopedProfilingLabel() { profiling_stack_->Pop(); }

  void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); }
};
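
// Example of Update() (an illustrative sketch; the label strings and the
// phase functions are made up): relabel the top of the pseudo-stack without
// popping and re-pushing, e.g. to distinguish phases inside one scope.
//
//   gemmlowp::ScopedProfilingLabel label("phase 1");
//   DoPhase1();                 // samples here are labeled "phase 1"
//   label.Update("phase 2");
//   DoPhase2();                 // samples here are labeled "phase 2"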

// To be called once on each thread to be profiled.
inline void RegisterCurrentThreadForProfiling() {
  AutoGlobalLock<ProfilerLockId> lock;
  ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo());
}

#else  // not GEMMLOWP_PROFILING
// This code path is when profiling is disabled.

// This empty definition of ScopedProfilingLabel ensures that
// it has zero runtime overhead when profiling is disabled.
struct ScopedProfilingLabel {
  explicit ScopedProfilingLabel(const char*) {}
  void Update(const char*) {}
};

inline void RegisterCurrentThreadForProfiling() {}

#endif

}  // end namespace gemmlowp

#endif  // GEMMLOWP_PROFILING_INSTRUMENTATION_H_