// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// profiler.h: a simple sampling profiler that's always just one #include away!
//
// Overview
// ========
//
// This profiler only samples a pseudo-stack, not the actual call stack.
// The code to be profiled needs to be instrumented with
// pseudo-stack "labels", see ScopedProfilingLabel.
// Using pseudo-stacks allows this profiler to be very simple, low-overhead,
// portable, and independent of compilation details such as function inlining
// and frame pointers. The granularity of instrumentation can be freely chosen,
// and it is possible to get some annotate-like detail, i.e. detail within one
// function without splitting it into multiple functions.
//
// This profiler should remain small and simple; its key feature is to fit in
// a single header file so that there should never be a reason to refrain
// from profiling. More complex and feature-rich alternatives are
// readily available. One such alternative,
// https://github.com/bgirard/GeckoProfiler, offers a strict superset of
// this profiler's functionality, including intertwining pseudostacks with
// real call stacks, more annotation options, and advanced visualization.
//
// Usage
// =====
//
// 0. Enable profiling by defining GEMMLOWP_PROFILING.
// When profiling is
// not enabled, the profiling instrumentation from instrumentation.h
// (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
// is still defined but does nothing. Conversely,
// when profiling is not enabled, it is an error to #include the
// present file.
//
// 1. Each thread can opt in to profiling by calling
// RegisterCurrentThreadForProfiling() defined in instrumentation.h.
// This can be done at any time, before or during profiling.
// No sample will be collected from a thread until
// it has called RegisterCurrentThreadForProfiling().
//
// 2. Instrument your code to be profiled with ScopedProfilingLabel,
// which is a RAII helper defined in instrumentation.h. The identifier
// names (some_label, etc) do not matter; what will show up
// in the profile is the string passed to the constructor, which
// must be a literal string. See the full example below.
//
// Note: the overhead of ScopedProfilingLabel is zero when not
// enabling profiling (when not defining GEMMLOWP_PROFILING).
//
// 3. Use the profiler.h interface to control profiling. There are two
// functions: StartProfiling() and FinishProfiling(). They must be
// called on the same thread. FinishProfiling() prints the profile
// on stdout.
//
// Full example
// ============
/*
#define GEMMLOWP_PROFILING
#include "profiling/instrumentation.h"
using namespace gemmlowp;

const int iters = 100000000;
volatile int i;

void Bar() {
  ScopedProfilingLabel label("Bar");
  for (i = 0; i < iters; i++) {}
}

void Foo() {
  ScopedProfilingLabel label("Foo");
  for (i = 0; i < iters; i++) {}
  Bar();
}

void Init() {
  RegisterCurrentThreadForProfiling();
}

#include "profiling/profiler.h"

int main() {
  Init();
  StartProfiling();
  Foo();
  FinishProfiling();
}
*
* Output:
*
gemmlowp profile (1 threads, 304 samples)
100.00% Foo
  51.32% other
  48.68% Bar
0.00% other (outside of any label)
*/
//
// Interpreting results
// ====================
//
// Each node shows the absolute percentage, among all the samples,
// of the number of samples that recorded the given pseudo-stack.
// The percentages are *NOT* relative to the parent node. In addition
// to your own labels, you will also see 'other' nodes that collect
// the remainder of samples under the parent node that didn't fall into
// any of the labelled child nodes. Example:
//
// 20% Foo
//   12% Bar
//   6% Xyz
//   2% other
//
// This means that 20% of all samples were under Foo, of which 12%/20%==60%
// were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not
// under either Bar or Xyz.
//
// Typically, one wants to keep adding ScopedProfilingLabel's until
// the 'other' nodes show low percentages.
//
// Interpreting results with multiple threads
// ==========================================
//
// At each sample, each thread registered for profiling gets sampled once.
136 // So if there is one "main thread" spending its time in MainFunc() and 137 // 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the 138 // samples will be in WorkerFunc, so the profile will look like this: 139 // 140 // 80% WorkerFunc 141 // 20% MainFunc 142 143 #ifndef GEMMLOWP_PROFILING_PROFILER_H_ 144 #define GEMMLOWP_PROFILING_PROFILER_H_ 145 146 #ifndef GEMMLOWP_PROFILING 147 #error Profiling is not enabled! 148 #endif 149 150 #include <vector> 151 152 #include "instrumentation.h" 153 154 namespace gemmlowp { 155 156 // A tree view of a profile. 157 class ProfileTreeView { 158 struct Node { 159 std::vector<Node*> children; 160 const char* label; 161 std::size_t weight; 162 Node() : label(nullptr), weight(0) {} 163 ~Node() { 164 for (auto child : children) { 165 delete child; 166 } 167 } 168 }; 169 170 static bool CompareNodes(Node* n1, Node* n2) { 171 return n1->weight > n2->weight; 172 } 173 174 Node root_; 175 176 void PrintNode(const Node* node, int level) const { 177 if (level) { 178 for (int i = 1; i < level; i++) { 179 printf(" "); 180 } 181 printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label); 182 } 183 for (auto child : node->children) { 184 PrintNode(child, level + 1); 185 } 186 } 187 188 static void AddStackToNode(const ProfilingStack& stack, Node* node, 189 std::size_t level) { 190 node->weight++; 191 if (stack.size == level) { 192 return; 193 } 194 Node* child_to_add_to = nullptr; 195 for (auto child : node->children) { 196 if (child->label == stack.labels[level]) { 197 child_to_add_to = child; 198 break; 199 } 200 } 201 if (!child_to_add_to) { 202 child_to_add_to = new Node; 203 child_to_add_to->label = stack.labels[level]; 204 node->children.push_back(child_to_add_to); 205 } 206 AddStackToNode(stack, child_to_add_to, level + 1); 207 return; 208 } 209 210 void AddStack(const ProfilingStack& stack) { 211 AddStackToNode(stack, &root_, 0); 212 } 213 214 void AddOtherChildrenToNode(Node* node) { 215 
std::size_t top_level_children_weight = 0; 216 for (auto c : node->children) { 217 AddOtherChildrenToNode(c); 218 top_level_children_weight += c->weight; 219 } 220 if (top_level_children_weight) { 221 Node* other_child = new Node; 222 other_child->label = 223 node == &root_ ? "other (outside of any label)" : "other"; 224 other_child->weight = node->weight - top_level_children_weight; 225 node->children.push_back(other_child); 226 } 227 } 228 229 void AddOtherNodes() { AddOtherChildrenToNode(&root_); } 230 231 void SortNode(Node* node) { 232 std::sort(node->children.begin(), node->children.end(), CompareNodes); 233 for (auto child : node->children) { 234 SortNode(child); 235 } 236 } 237 238 void Sort() { SortNode(&root_); } 239 240 public: 241 explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) { 242 for (auto stack : stacks) { 243 AddStack(stack); 244 } 245 AddOtherNodes(); 246 Sort(); 247 } 248 249 void Print() const { 250 printf("\n"); 251 printf("gemmlowp profile (%d threads, %d samples)\n", 252 static_cast<int>(ThreadsUnderProfiling().size()), 253 static_cast<int>(root_.weight)); 254 PrintNode(&root_, 0); 255 printf("\n"); 256 } 257 }; 258 259 // This function is the only place that determines our sampling frequency. 260 inline void WaitOneProfilerTick() { 261 static const int millisecond = 1000000; 262 263 #if defined __arm__ || defined __aarch64__ 264 // Reduced sampling frequency on mobile devices helps limit time and memory 265 // overhead there. 266 static const int interval = 10 * millisecond; 267 #else 268 static const int interval = 1 * millisecond; 269 #endif 270 271 timespec ts; 272 ts.tv_sec = 0; 273 ts.tv_nsec = interval; 274 nanosleep(&ts, nullptr); 275 } 276 277 // This is how we track whether we've already started profiling, 278 // to guard against misuse of the API. 279 inline bool& IsProfiling() { 280 static bool b; 281 return b; 282 } 283 284 // This is how we tell the profiler thread to finish. 
285 inline bool& ProfilerThreadShouldFinish() { 286 static bool b; 287 return b; 288 } 289 290 // The profiler thread. See ProfilerThreadFunc. 291 inline pthread_t& ProfilerThread() { 292 static pthread_t t; 293 return t; 294 } 295 296 // Records a stack from a running thread. 297 // The tricky part is that we're not interrupting the thread. 298 // This is OK because we're looking at a pseudo-stack of labels, 299 // not at the real thread stack, and if the pseudo-stack changes 300 // while we're recording it, we are OK with getting either the 301 // old or the new stack. Note that ProfilingStack::Pop 302 // only decrements the size, and doesn't null the popped label, 303 // so if we're concurrently recording it, it shouldn't change 304 // under our feet until another label is pushed, at which point 305 // we are OK with getting either this new label or the old one. 306 // In the end, the key atomicity property that we are relying on 307 // here is that pointers are changed atomically, and the labels 308 // are pointers (to literal strings). 309 inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) { 310 assert(!dst->size); 311 while (dst->size < thread->stack.size) { 312 dst->labels[dst->size] = thread->stack.labels[dst->size]; 313 dst->size++; 314 MemoryBarrier(); // thread->stack can change at any time 315 } 316 } 317 318 // The profiler thread's entry point. 319 // Note that a separate thread is to be started each time we call 320 // StartProfiling(), and finishes when we call FinishProfiling(). 321 // So here we only need to handle the recording and reporting of 322 // a single profile. 323 inline void* ProfilerThreadFunc(void*) { 324 assert(ProfilerThread() == pthread_self()); 325 326 // Since we only handle one profile per profiler thread, the 327 // profile data (the array of recorded stacks) can be a local variable here. 
328 std::vector<ProfilingStack> stacks; 329 330 while (!ProfilerThreadShouldFinish()) { 331 WaitOneProfilerTick(); 332 { 333 AutoGlobalLock<ProfilerLockId> lock; 334 for (auto t : ThreadsUnderProfiling()) { 335 ProfilingStack s; 336 RecordStack(t, &s); 337 stacks.push_back(s); 338 } 339 } 340 } 341 342 // Profiling is finished and we now report the results. 343 ProfileTreeView(stacks).Print(); 344 345 return nullptr; 346 } 347 348 // Starts recording samples. 349 inline void StartProfiling() { 350 AutoGlobalLock<ProfilerLockId> lock; 351 ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!"); 352 IsProfiling() = true; 353 ProfilerThreadShouldFinish() = false; 354 pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr); 355 } 356 357 // Stops recording samples, and prints a profile tree-view on stdout. 358 inline void FinishProfiling() { 359 { 360 AutoGlobalLock<ProfilerLockId> lock; 361 ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!"); 362 // The ProfilerThreadShouldFinish() mechanism here is really naive and bad, 363 // as the scary comments below should make clear. 364 // Should we use a condition variable? 365 ProfilerThreadShouldFinish() = true; 366 } // must release the lock here to avoid deadlock with profiler thread. 367 pthread_join(ProfilerThread(), nullptr); 368 IsProfiling() = false; // yikes, this should be guarded by the lock! 369 } 370 371 } // namespace gemmlowp 372 373 #endif // GEMMLOWP_PROFILING_PROFILER_H_ 374