Home | History | Annotate | Download | only in profiling
      1 // Copyright 2015 Google Inc. All Rights Reserved.
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //     http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 
     15 // profiler.h: a simple sampling profiler that's always just one #include away!
     16 //
     17 // Overview
     18 // ========
     19 //
     20 // This profiler only samples a pseudo-stack, not the actual call stack.
     21 // The code to be profiled needs to be instrumented with
     22 // pseudo-stack "labels", see ScopedProfilingLabel.
     23 // Using pseudo-stacks allows this profiler to be very simple, low-overhead,
     24 // portable, and independent of compilation details such as function inlining
     25 // and frame pointers. The granularity of instrumentation can be freely chosen,
     26 // and it is possible to get some annotate-like detail, i.e. detail within one
     27 // function without splitting it into multiple functions.
     28 //
     29 // This profiler should remain small and simple; its key feature is to fit in
     30 // a single header file so that there should never be a reason to refrain
     31 // from profiling. More complex and feature-rich alternatives are
     32 // readily available. This one offers a strict superset of its
     33 // functionality: https://github.com/bgirard/GeckoProfiler, including
     34 // intertwining pseudostacks with real call stacks, more annotation options,
     35 // and advanced visualization.
     36 //
     37 // Usage
     38 // =====
     39 //
     40 // 0. Enable profiling by defining GEMMLOWP_PROFILING. When profiling is
     41 //    not enabled, profiling instrumentation from instrumentation.h
     42 //    (ScopedProfilingLabel, RegisterCurrentThreadForProfiling)
     43 //    is still defined but does nothing. On the other hand,
     44 //    when profiling is not enabled, it is an error to #include the
     45 //    present file.
     46 //
     47 // 1. Each thread can opt in to profiling by calling
     48 //    RegisterCurrentThreadForProfiling() defined in instrumentation.h.
     49 //    This can be done at any time, before or during profiling.
     50 //    No sample will be collected from a thread until
     51 //    it has called RegisterCurrentThreadForProfiling().
     52 //
     53 // 2. Instrument your code to be profiled with ScopedProfilingLabel,
     54 //    which is a RAII helper defined in instrumentation.h. The identifier
     55 //    names (some_label, etc) do not matter; what will show up
     56 //    in the profile is the string passed to the constructor, which
     57 //    must be a literal string. See the full example below.
     58 //
     59 //    Note: the overhead of ScopedProfilingLabel is zero when not
     60 //    enabling profiling (when not defining GEMMLOWP_PROFILING).
     61 //
     62 // 3. Use the profiler.h interface to control profiling. There are two
     63 //    functions: StartProfiling() and FinishProfiling(). They must be
     64 //    called on the same thread. FinishProfiling() prints the profile
     65 //    on stdout.
     66 //
     67 // Full example
     68 // ============
     69 /*
     70     #define GEMMLOWP_PROFILING
     71     #include "profiling/instrumentation.h"
     72     using namespace gemmlowp;
     73 
     74     const int iters = 100000000;
     75     volatile int i;
     76 
     77     void Bar() {
     78       ScopedProfilingLabel label("Bar");
     79       for (i = 0; i < iters; i++) {}
     80     }
     81 
     82     void Foo() {
     83       ScopedProfilingLabel label("Foo");
     84       for (i = 0; i < iters; i++) {}
     85       Bar();
     86     }
     87 
     88     void Init() {
     89       RegisterCurrentThreadForProfiling();
     90     }
     91 
     92     #include "profiling/profiler.h"
     93 
     94     int main() {
     95       Init();
     96       StartProfiling();
     97       Foo();
     98       FinishProfiling();
     99     }
    100 *
    101 * Output:
    102 *
    103     gemmlowp profile (1 threads, 304 samples)
    104     100.00% Foo
    105         51.32% other
    106         48.68% Bar
    107     0.00% other (outside of any label)
    108 */
    109 //
    110 // Interpreting results
    111 // ====================
    112 //
    113 //  Each node shows the absolute percentage, among all the samples,
    114 //  of the number of samples that recorded the given pseudo-stack.
    115 //  The percentages are *NOT* relative to the parent node. In addition
    116 //  to your own labels, you will also see 'other' nodes that collect
    117 //  the remainder of samples under the parent node that didn't fall into
    118 //  any of the labelled child nodes. Example:
    119 //
    120 //  20% Foo
    121 //      12% Bar
    122 //      6% Xyz
    123 //      2% other
    124 //
//  This means that 20% of all samples were under Foo, of which 12%/20%==60%
    126 //  were under Bar, 6%/20%==30% were under Xyz, and 2%/20%==10% were not
    127 //  under either Bar or Xyz.
    128 //
    129 //  Typically, one wants to keep adding ScopedProfilingLabel's until
    130 //  the 'other' nodes show low percentages.
    131 //
    132 // Interpreting results with multiple threads
    133 // ==========================================
    134 //
    135 // At each sample, each thread registered for profiling gets sampled once.
    136 // So if there is one "main thread" spending its time in MainFunc() and
    137 // 4 "worker threads" spending time in WorkerFunc(), then 80% (=4/5) of the
    138 // samples will be in WorkerFunc, so the profile will look like this:
    139 //
    140 // 80% WorkerFunc
    141 // 20% MainFunc
    142 
    143 #ifndef GEMMLOWP_PROFILING_PROFILER_H_
    144 #define GEMMLOWP_PROFILING_PROFILER_H_
    145 
    146 #ifndef GEMMLOWP_PROFILING
    147 #error Profiling is not enabled!
    148 #endif
    149 
#include <pthread.h>
#include <time.h>

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

#include "instrumentation.h"
    153 
    154 namespace gemmlowp {
    155 
    156 // A tree view of a profile.
    157 class ProfileTreeView {
    158   struct Node {
    159     std::vector<Node*> children;
    160     const char* label;
    161     std::size_t weight;
    162     Node() : label(nullptr), weight(0) {}
    163     ~Node() {
    164       for (auto child : children) {
    165         delete child;
    166       }
    167     }
    168   };
    169 
    170   static bool CompareNodes(Node* n1, Node* n2) {
    171     return n1->weight > n2->weight;
    172   }
    173 
    174   Node root_;
    175 
    176   void PrintNode(const Node* node, int level) const {
    177     if (level) {
    178       for (int i = 1; i < level; i++) {
    179         printf("    ");
    180       }
    181       printf("%.2f%% %s\n", 100.0f * node->weight / root_.weight, node->label);
    182     }
    183     for (auto child : node->children) {
    184       PrintNode(child, level + 1);
    185     }
    186   }
    187 
    188   static void AddStackToNode(const ProfilingStack& stack, Node* node,
    189                              std::size_t level) {
    190     node->weight++;
    191     if (stack.size == level) {
    192       return;
    193     }
    194     Node* child_to_add_to = nullptr;
    195     for (auto child : node->children) {
    196       if (child->label == stack.labels[level]) {
    197         child_to_add_to = child;
    198         break;
    199       }
    200     }
    201     if (!child_to_add_to) {
    202       child_to_add_to = new Node;
    203       child_to_add_to->label = stack.labels[level];
    204       node->children.push_back(child_to_add_to);
    205     }
    206     AddStackToNode(stack, child_to_add_to, level + 1);
    207     return;
    208   }
    209 
    210   void AddStack(const ProfilingStack& stack) {
    211     AddStackToNode(stack, &root_, 0);
    212   }
    213 
    214   void AddOtherChildrenToNode(Node* node) {
    215     std::size_t top_level_children_weight = 0;
    216     for (auto c : node->children) {
    217       AddOtherChildrenToNode(c);
    218       top_level_children_weight += c->weight;
    219     }
    220     if (top_level_children_weight) {
    221       Node* other_child = new Node;
    222       other_child->label =
    223           node == &root_ ? "other (outside of any label)" : "other";
    224       other_child->weight = node->weight - top_level_children_weight;
    225       node->children.push_back(other_child);
    226     }
    227   }
    228 
    229   void AddOtherNodes() { AddOtherChildrenToNode(&root_); }
    230 
    231   void SortNode(Node* node) {
    232     std::sort(node->children.begin(), node->children.end(), CompareNodes);
    233     for (auto child : node->children) {
    234       SortNode(child);
    235     }
    236   }
    237 
    238   void Sort() { SortNode(&root_); }
    239 
    240  public:
    241   explicit ProfileTreeView(const std::vector<ProfilingStack>& stacks) {
    242     for (auto stack : stacks) {
    243       AddStack(stack);
    244     }
    245     AddOtherNodes();
    246     Sort();
    247   }
    248 
    249   void Print() const {
    250     printf("\n");
    251     printf("gemmlowp profile (%d threads, %d samples)\n",
    252            static_cast<int>(ThreadsUnderProfiling().size()),
    253            static_cast<int>(root_.weight));
    254     PrintNode(&root_, 0);
    255     printf("\n");
    256   }
    257 };
    258 
    259 // This function is the only place that determines our sampling frequency.
    260 inline void WaitOneProfilerTick() {
    261   static const int millisecond = 1000000;
    262 
    263 #if defined __arm__ || defined __aarch64__
    264   // Reduced sampling frequency on mobile devices helps limit time and memory
    265   // overhead there.
    266   static const int interval = 10 * millisecond;
    267 #else
    268   static const int interval = 1 * millisecond;
    269 #endif
    270 
    271   timespec ts;
    272   ts.tv_sec = 0;
    273   ts.tv_nsec = interval;
    274   nanosleep(&ts, nullptr);
    275 }
    276 
    277 // This is how we track whether we've already started profiling,
    278 // to guard against misuse of the API.
    279 inline bool& IsProfiling() {
    280   static bool b;
    281   return b;
    282 }
    283 
    284 // This is how we tell the profiler thread to finish.
    285 inline bool& ProfilerThreadShouldFinish() {
    286   static bool b;
    287   return b;
    288 }
    289 
    290 // The profiler thread. See ProfilerThreadFunc.
    291 inline pthread_t& ProfilerThread() {
    292   static pthread_t t;
    293   return t;
    294 }
    295 
    296 // Records a stack from a running thread.
    297 // The tricky part is that we're not interrupting the thread.
    298 // This is OK because we're looking at a pseudo-stack of labels,
    299 // not at the real thread stack, and if the pseudo-stack changes
    300 // while we're recording it, we are OK with getting either the
    301 // old or the new stack. Note that ProfilingStack::Pop
    302 // only decrements the size, and doesn't null the popped label,
    303 // so if we're concurrently recording it, it shouldn't change
    304 // under our feet until another label is pushed, at which point
    305 // we are OK with getting either this new label or the old one.
    306 // In the end, the key atomicity property that we are relying on
    307 // here is that pointers are changed atomically, and the labels
    308 // are pointers (to literal strings).
inline void RecordStack(const ThreadInfo* thread, ProfilingStack* dst) {
  // The destination stack must start out empty; we only append to it.
  assert(!dst->size);
  // Copy the sampled thread's pseudo-stack one label at a time. The source
  // stack may concurrently grow or shrink (see the comment above this
  // function); re-reading thread->stack.size on every iteration, with a
  // memory barrier after each copied label, ensures we never read labels
  // past the stack's current size.
  while (dst->size < thread->stack.size) {
    dst->labels[dst->size] = thread->stack.labels[dst->size];
    dst->size++;
    MemoryBarrier();  // thread->stack can change at any time
  }
}
    317 
    318 // The profiler thread's entry point.
    319 // Note that a separate thread is to be started each time we call
    320 // StartProfiling(), and finishes when we call FinishProfiling().
    321 // So here we only need to handle the recording and reporting of
    322 // a single profile.
    323 inline void* ProfilerThreadFunc(void*) {
    324   assert(ProfilerThread() == pthread_self());
    325 
    326   // Since we only handle one profile per profiler thread, the
    327   // profile data (the array of recorded stacks) can be a local variable here.
    328   std::vector<ProfilingStack> stacks;
    329 
    330   while (!ProfilerThreadShouldFinish()) {
    331     WaitOneProfilerTick();
    332     {
    333       AutoGlobalLock<ProfilerLockId> lock;
    334       for (auto t : ThreadsUnderProfiling()) {
    335         ProfilingStack s;
    336         RecordStack(t, &s);
    337         stacks.push_back(s);
    338       }
    339     }
    340   }
    341 
    342   // Profiling is finished and we now report the results.
    343   ProfileTreeView(stacks).Print();
    344 
    345   return nullptr;
    346 }
    347 
    348 // Starts recording samples.
    349 inline void StartProfiling() {
    350   AutoGlobalLock<ProfilerLockId> lock;
    351   ReleaseBuildAssertion(!IsProfiling(), "We're already profiling!");
    352   IsProfiling() = true;
    353   ProfilerThreadShouldFinish() = false;
    354   pthread_create(&ProfilerThread(), nullptr, ProfilerThreadFunc, nullptr);
    355 }
    356 
    357 // Stops recording samples, and prints a profile tree-view on stdout.
    358 inline void FinishProfiling() {
    359   {
    360     AutoGlobalLock<ProfilerLockId> lock;
    361     ReleaseBuildAssertion(IsProfiling(), "We weren't profiling!");
    362     // The ProfilerThreadShouldFinish() mechanism here is really naive and bad,
    363     // as the scary comments below should make clear.
    364     // Should we use a condition variable?
    365     ProfilerThreadShouldFinish() = true;
    366   }  // must release the lock here to avoid deadlock with profiler thread.
    367   pthread_join(ProfilerThread(), nullptr);
    368   IsProfiling() = false;  // yikes, this should be guarded by the lock!
    369 }
    370 
    371 }  // namespace gemmlowp
    372 
    373 #endif  // GEMMLOWP_PROFILING_PROFILER_H_
    374