1 // Copyright 2015 Google Inc. All Rights Reserved. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // instrumentation.h: contains the definitions needed to 16 // instrument code for profiling: 17 // ScopedProfilingLabel, RegisterCurrentThreadForProfiling. 18 // 19 // profiler.h is only needed to drive the profiler: 20 // StartProfiling, FinishProfiling. 21 // 22 // See the usage example in profiler.h. 23 24 #ifndef GEMMLOWP_PROFILING_INSTRUMENTATION_H_ 25 #define GEMMLOWP_PROFILING_INSTRUMENTATION_H_ 26 27 #include <pthread.h> 28 #include <cstdio> 29 30 #ifndef GEMMLOWP_USE_STLPORT 31 #include <cstdint> 32 #else 33 #include <stdint.h> 34 namespace std { 35 using ::uint8_t; 36 using ::uint16_t; 37 using ::uint32_t; 38 using ::int8_t; 39 using ::int16_t; 40 using ::int32_t; 41 using ::size_t; 42 using ::uintptr_t; 43 } 44 #endif 45 46 #include <algorithm> 47 #include <cassert> 48 #include <cstdlib> 49 50 #ifdef GEMMLOWP_PROFILING 51 #include <cstring> 52 #include <set> 53 #endif 54 55 // We should always use C++11 thread_local; unfortunately that 56 // isn't fully supported on Apple yet. 
#ifdef __APPLE__
// Apple toolchains (at the time this was written) lack full C++11
// thread_local support, so fall back to the GCC __thread extension,
// which only allows trivially-constructible/destructible data.
#define GEMMLOWP_THREAD_LOCAL static __thread
#define GEMMLOWP_USING_OLD_THREAD_LOCAL
#else
#define GEMMLOWP_THREAD_LOCAL thread_local
#endif

namespace gemmlowp {

// Assertion that is kept in release builds: prints |msg| to stderr and
// aborts the process when |condition| is false. Used for conditions that
// must hold even when NDEBUG disables plain assert().
inline void ReleaseBuildAssertion(bool condition, const char* msg) {
  if (!condition) {
    fprintf(stderr, "gemmlowp error: %s\n", msg);
    abort();
  }
}

// To be used as template parameter for GlobalLock.
// GlobalLock<ProfilerLockId> is the profiler global lock:
// registering threads, starting profiling, finishing profiling, and
// the profiler itself as it samples threads, all need to lock it.
struct ProfilerLockId;

// A very plain global lock. Templated in LockId so we can have multiple
// locks, one for each LockId type.
template <typename LockId>
class GlobalLock {
  // Function-local static gives one mutex per LockId instantiation,
  // initialized on first use.
  static pthread_mutex_t* Mutex() {
    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    return &m;
  }

 public:
  static void Lock() { pthread_mutex_lock(Mutex()); }
  static void Unlock() { pthread_mutex_unlock(Mutex()); }
};

// A very simple RAII helper to lock and unlock a GlobalLock
// for the duration of a scope.
template <typename LockId>
struct AutoGlobalLock {
  AutoGlobalLock() { GlobalLock<LockId>::Lock(); }
  ~AutoGlobalLock() { GlobalLock<LockId>::Unlock(); }
};

// MemoryBarrier is purely a compile-time thing; it tells two things
// to the compiler:
//   1) It prevents reordering code across it
//     (thanks to the 'volatile' after 'asm')
//   2) It requires the compiler to assume that any value previously
//     read from memory, may have changed. Thus it offers an alternative
//     to using 'volatile' variables.
// NOTE(review): this is a compiler barrier only, not a CPU barrier — it
// emits no fence instruction. The profiler's sampling presumably relies on
// this being sufficient on the targeted platforms; confirm before porting
// to weaker memory models.
inline void MemoryBarrier() { asm volatile("" ::: "memory"); }

// Profiling definitions. Two paths: when profiling is enabled,
// and when profiling is disabled.
#ifdef GEMMLOWP_PROFILING
// This code path is when profiling is enabled.

// A pseudo-call-stack. Contrary to a real call-stack, this only
// contains pointers to literal strings that were manually entered
// in the instrumented code (see ScopedProfilingLabel).
//
// It is read asynchronously by the profiler thread (see profiler.h),
// which is why the mutating methods below are interleaved with
// MemoryBarrier() calls: the compiler must not reorder the writes to
// |labels| and |size|, so that a concurrent reader never observes a
// size covering a slot whose label has not been stored yet.
struct ProfilingStack {
  static const std::size_t kMaxSize = 15;
  typedef const char* LabelsArrayType[kMaxSize];
  LabelsArrayType labels;  // labels[0..size-1] are the live entries.
  std::size_t size;        // current depth of the pseudo-stack.

  // Zero out the whole struct; safe because all members are POD
  // (an array of pointers and a size_t).
  ProfilingStack() { memset(this, 0, sizeof(ProfilingStack)); }

  // Pushes |label| (must be a string literal, it is stored by pointer
  // and never copied). Aborts on overflow even in release builds.
  void Push(const char* label) {
    MemoryBarrier();
    ReleaseBuildAssertion(size < kMaxSize, "ProfilingStack overflow");
    // Store the label first, then publish it by incrementing size;
    // the barriers keep the compiler from reordering these two writes.
    labels[size] = label;
    MemoryBarrier();
    size++;
    MemoryBarrier();
  }

  // Pops the top entry. Aborts on underflow even in release builds.
  void Pop() {
    MemoryBarrier();
    ReleaseBuildAssertion(size > 0, "ProfilingStack underflow");
    size--;
    MemoryBarrier();
  }

  // Replaces the label of the current top entry in place
  // (see ScopedProfilingLabel::Update). Requires a non-empty stack.
  void UpdateTop(const char* new_label) {
    MemoryBarrier();
    assert(size);
    labels[size - 1] = new_label;
    MemoryBarrier();
  }

  // Raw-byte copy; valid because the struct is POD (see constructor).
  ProfilingStack& operator=(const ProfilingStack& other) {
    memcpy(this, &other, sizeof(ProfilingStack));
    return *this;
  }

  // Raw-byte compare. Note this compares the entire labels array, not
  // just the first |size| entries — stale pointers past the top take
  // part in the comparison. That is consistent here because the
  // constructor zero-fills and Pop never clears popped slots on both
  // operands alike.
  bool operator==(const ProfilingStack& other) const {
    return !memcmp(this, &other, sizeof(ProfilingStack));
  }
};

static_assert(
    !(sizeof(ProfilingStack) & (sizeof(ProfilingStack) - 1)),
    "ProfilingStack should have power-of-two size to fit in cache lines");

struct ThreadInfo;

// The global set of threads being profiled. Guarded by
// GlobalLock<ProfilerLockId> (see the callers: registration, the exit
// callback, and the profiler itself).
inline std::set<ThreadInfo*>& ThreadsUnderProfiling() {
  static std::set<ThreadInfo*> v;
  return v;
}

// Per-thread profiling state. One instance per registered thread, owned
// by that thread's TLS (see ThreadLocalThreadInfo).
struct ThreadInfo {
  pthread_key_t key;  // used only to get a callback at thread exit.
  ProfilingStack stack;

  ThreadInfo() {
    // The pthread key exists solely so that ThreadExitCallback runs with
    // |this| when the thread terminates; the value is never read back
    // via pthread_getspecific.
    pthread_key_create(&key, ThreadExitCallback);
    pthread_setspecific(key, this);
  }

  // Invoked by pthreads at thread exit. Deregisters the dying thread
  // under the profiler lock so the sampler stops looking at its stack,
  // then releases the key.
  static void ThreadExitCallback(void* ptr) {
    AutoGlobalLock<ProfilerLockId> lock;
    ThreadInfo* self = static_cast<ThreadInfo*>(ptr);
    ThreadsUnderProfiling().erase(self);
    pthread_key_delete(self->key);
  }
};

// Returns the calling thread's ThreadInfo, creating it on first use.
inline ThreadInfo& ThreadLocalThreadInfo() {
#ifdef GEMMLOWP_USING_OLD_THREAD_LOCAL
  // We're leaking this ThreadInfo structure, because Apple doesn't support
  // non-trivial constructors or destructors for their __thread type modifier.
  GEMMLOWP_THREAD_LOCAL ThreadInfo* i = nullptr;
  if (i == nullptr) {
    i = new ThreadInfo();
  }
  return *i;
#else
  GEMMLOWP_THREAD_LOCAL ThreadInfo i;
  return i;
#endif
}

// ScopedProfilingLabel is how one instruments code for profiling
// with this profiler. Construct local ScopedProfilingLabel variables,
// passing a literal string describing the local code. Profile
// samples will then be annotated with this label, while it is in scope
// (whence the name --- also known as RAII).
// See the example in profiler.h.
class ScopedProfilingLabel {
  ProfilingStack* profiling_stack_;  // the calling thread's stack; not owned.

 public:
  // |label| must be a string literal (stored by pointer, never copied).
  explicit ScopedProfilingLabel(const char* label)
      : profiling_stack_(&ThreadLocalThreadInfo().stack) {
    profiling_stack_->Push(label);
  }

  ~ScopedProfilingLabel() { profiling_stack_->Pop(); }

  // Renames the current scope without popping/pushing, e.g. to reflect
  // a phase change inside one scope.
  void Update(const char* new_label) { profiling_stack_->UpdateTop(new_label); }
};

// To be called once on each thread to be profiled.
inline void RegisterCurrentThreadForProfiling() {
  AutoGlobalLock<ProfilerLockId> lock;
  ThreadsUnderProfiling().insert(&ThreadLocalThreadInfo());
}

#else  // not GEMMLOWP_PROFILING
// This code path is when profiling is disabled.
230 231 // This empty definition of ScopedProfilingLabel ensures that 232 // it has zero runtime overhead when profiling is disabled. 233 struct ScopedProfilingLabel { 234 explicit ScopedProfilingLabel(const char*) {} 235 void Update(const char*) {} 236 }; 237 238 inline void RegisterCurrentThreadForProfiling() {} 239 240 #endif 241 242 } // end namespace gemmlowp 243 244 #endif // GEMMLOWP_PROFILING_INSTRUMENTATION_H_ 245