Home | History | Annotate | Download | only in llvm-xray
      1 //===- xray-stacks.cpp: XRay Function Call Stack Accounting ---------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file implements stack-based accounting. It takes XRay traces, and
     11 // collates statistics across these traces to show a breakdown of time spent
     12 // at various points of the stack to provide insight into which functions
     13 // spend the most time in terms of a call stack. We provide a few
     14 // sorting/filtering options for zero'ing in on the useful stacks.
     15 //
     16 //===----------------------------------------------------------------------===//
     17 
     18 #include <forward_list>
     19 #include <numeric>
     20 
     21 #include "func-id-helper.h"
     22 #include "trie-node.h"
     23 #include "xray-registry.h"
     24 #include "llvm/ADT/StringExtras.h"
     25 #include "llvm/Support/CommandLine.h"
     26 #include "llvm/Support/Errc.h"
     27 #include "llvm/Support/ErrorHandling.h"
     28 #include "llvm/Support/FormatAdapters.h"
     29 #include "llvm/Support/FormatVariadic.h"
     30 #include "llvm/XRay/Graph.h"
     31 #include "llvm/XRay/InstrumentationMap.h"
     32 #include "llvm/XRay/Trace.h"
     33 
     34 using namespace llvm;
     35 using namespace llvm::xray;
     36 
     37 static cl::SubCommand Stack("stack", "Call stack accounting");
     38 static cl::list<std::string> StackInputs(cl::Positional,
     39                                          cl::desc("<xray trace>"), cl::Required,
     40                                          cl::sub(Stack), cl::OneOrMore);
     41 
     42 static cl::opt<bool>
     43     StackKeepGoing("keep-going", cl::desc("Keep going on errors encountered"),
     44                    cl::sub(Stack), cl::init(false));
     45 static cl::alias StackKeepGoing2("k", cl::aliasopt(StackKeepGoing),
     46                                  cl::desc("Alias for -keep-going"),
     47                                  cl::sub(Stack));
     48 
     49 // TODO: Does there need to be an option to deduce tail or sibling calls?
     50 
     51 static cl::opt<std::string> StacksInstrMap(
     52     "instr_map",
     53     cl::desc("instrumentation map used to identify function ids. "
     54              "Currently supports elf file instrumentation maps."),
     55     cl::sub(Stack), cl::init(""));
     56 static cl::alias StacksInstrMap2("m", cl::aliasopt(StacksInstrMap),
     57                                  cl::desc("Alias for -instr_map"),
     58                                  cl::sub(Stack));
     59 
     60 static cl::opt<bool>
     61     SeparateThreadStacks("per-thread-stacks",
     62                          cl::desc("Report top stacks within each thread id"),
     63                          cl::sub(Stack), cl::init(false));
     64 
     65 static cl::opt<bool>
     66     AggregateThreads("aggregate-threads",
     67                      cl::desc("Aggregate stack times across threads"),
     68                      cl::sub(Stack), cl::init(false));
     69 
     70 static cl::opt<bool>
     71     DumpAllStacks("all-stacks",
     72                   cl::desc("Dump sum of timings for all stacks. "
     73                            "By default separates stacks per-thread."),
     74                   cl::sub(Stack), cl::init(false));
     75 static cl::alias DumpAllStacksShort("all", cl::aliasopt(DumpAllStacks),
     76                                     cl::desc("Alias for -all-stacks"),
     77                                     cl::sub(Stack));
     78 
     79 // TODO(kpw): Add other interesting formats. Perhaps chrome trace viewer format
     80 // possibly with aggregations or just a linear trace of timings.
     81 enum StackOutputFormat { HUMAN, FLAMETOOL };
     82 
     83 static cl::opt<StackOutputFormat> StacksOutputFormat(
     84     "stack-format",
     85     cl::desc("The format that output stacks should be "
     86              "output in. Only applies with all-stacks."),
     87     cl::values(
     88         clEnumValN(HUMAN, "human",
     89                    "Human readable output. Only valid without -all-stacks."),
     90         clEnumValN(FLAMETOOL, "flame",
     91                    "Format consumable by Brendan Gregg's FlameGraph tool. "
     92                    "Only valid with -all-stacks.")),
     93     cl::sub(Stack), cl::init(HUMAN));
     94 
     95 // Types of values for each stack in a CallTrie.
     96 enum class AggregationType {
     97   TOTAL_TIME,      // The total time spent in a stack and its callees.
     98   INVOCATION_COUNT // The number of times the stack was invoked.
     99 };
    100 
    101 static cl::opt<AggregationType> RequestedAggregation(
    102     "aggregation-type",
    103     cl::desc("The type of aggregation to do on call stacks."),
    104     cl::values(
    105         clEnumValN(
    106             AggregationType::TOTAL_TIME, "time",
    107             "Capture the total time spent in an all invocations of a stack."),
    108         clEnumValN(AggregationType::INVOCATION_COUNT, "count",
    109                    "Capture the number of times a stack was invoked. "
    110                    "In flamegraph mode, this count also includes invocations "
    111                    "of all callees.")),
    112     cl::sub(Stack), cl::init(AggregationType::TOTAL_TIME));
    113 
    114 /// A helper struct to work with formatv and XRayRecords. Makes it easier to
    115 /// use instrumentation map names or addresses in formatted output.
    116 struct format_xray_record : public FormatAdapter<XRayRecord> {
    117   explicit format_xray_record(XRayRecord record,
    118                               const FuncIdConversionHelper &conv)
    119       : FormatAdapter<XRayRecord>(std::move(record)), Converter(&conv) {}
    120   void format(raw_ostream &Stream, StringRef Style) override {
    121     Stream << formatv(
    122         "{FuncId: \"{0}\", ThreadId: \"{1}\", RecordType: \"{2}\"}",
    123         Converter->SymbolOrNumber(Item.FuncId), Item.TId,
    124         DecodeRecordType(Item.RecordType));
    125   }
    126 
    127 private:
    128   Twine DecodeRecordType(uint16_t recordType) {
    129     switch (recordType) {
    130     case 0:
    131       return Twine("Fn Entry");
    132     case 1:
    133       return Twine("Fn Exit");
    134     default:
    135       // TODO: Add Tail exit when it is added to llvm/XRay/XRayRecord.h
    136       return Twine("Unknown");
    137     }
    138   }
    139 
    140   const FuncIdConversionHelper *Converter;
    141 };
    142 
    143 /// The stack command will take a set of XRay traces as arguments, and collects
    144 /// information about the stacks of instrumented functions that appear in the
    145 /// traces. We track the following pieces of information:
    146 ///
    147 ///   - Total time: amount of time/cycles accounted for in the traces.
    148 ///   - Stack count: number of times a specific stack appears in the
    149 ///     traces. Only instrumented functions show up in stacks.
    150 ///   - Cumulative stack time: amount of time spent in a stack accumulated
    151 ///     across the invocations in the traces.
    152 ///   - Cumulative local time: amount of time spent in each instrumented
    153 ///     function showing up in a specific stack, accumulated across the traces.
    154 ///
    155 /// Example output for the kind of data we'd like to provide looks like the
    156 /// following:
    157 ///
    158 ///   Total time: 3.33234 s
    159 ///   Stack ID: ...
    160 ///   Stack Count: 2093
    161 ///   #     Function                  Local Time     (%)      Stack Time     (%)
    162 ///   0     main                         2.34 ms   0.07%      3.33234  s    100%
    163 ///   1     foo()                     3.30000  s  99.02%         3.33  s  99.92%
    164 ///   2     bar()                          30 ms   0.90%           30 ms   0.90%
    165 ///
    166 /// We can also show distributions of the function call durations with
    167 /// statistics at each level of the stack. This works by doing the following
    168 /// algorithm:
    169 ///
    170 ///   1. When unwinding, record the duration of each unwound function associated
    171 ///   with the path up to which the unwinding stops. For example:
    172 ///
    173 ///        Step                         Duration (? means has start time)
    174 ///
    175 ///        push a <start time>           a = ?
    176 ///        push b <start time>           a = ?, a->b = ?
    177 ///        push c <start time>           a = ?, a->b = ?, a->b->c = ?
    178 ///        pop  c <end time>             a = ?, a->b = ?, emit duration(a->b->c)
    179 ///        pop  b <end time>             a = ?, emit duration(a->b)
    180 ///        push c <start time>           a = ?, a->c = ?
    181 ///        pop  c <end time>             a = ?, emit duration(a->c)
    182 ///        pop  a <end time>             emit duration(a)
    183 ///
    184 ///   2. We then account for the various stacks we've collected, and for each of
    185 ///      them will have measurements that look like the following (continuing
    186 ///      with the above simple example):
    187 ///
    188 ///        c : [<id("a->b->c"), [durations]>, <id("a->c"), [durations]>]
    189 ///        b : [<id("a->b"), [durations]>]
    190 ///        a : [<id("a"), [durations]>]
    191 ///
    192 ///      This allows us to compute, for each stack id, and each function that
    193 ///      shows up in the stack,  some important statistics like:
    194 ///
    195 ///        - median
    196 ///        - 99th percentile
    197 ///        - mean + stddev
    198 ///        - count
    199 ///
    200 ///   3. For cases where we don't have durations for some of the higher levels
    201 ///   of the stack (perhaps instrumentation wasn't activated when the stack was
    202 ///   entered), we can mark them appropriately.
    203 ///
    204 ///  Computing this data also allows us to implement lookup by call stack nodes,
    205 ///  so that we can find functions that show up in multiple stack traces and
    206 ///  show the statistical properties of that function in various contexts. We
    207 ///  can compute information similar to the following:
    208 ///
    209 ///    Function: 'c'
    210 ///    Stacks: 2 / 2
    211 ///    Stack ID: ...
    212 ///    Stack Count: ...
    213 ///    #     Function  ...
    214 ///    0     a         ...
    215 ///    1     b         ...
    216 ///    2     c         ...
    217 ///
    218 ///    Stack ID: ...
    219 ///    Stack Count: ...
    220 ///    #     Function  ...
    221 ///    0     a         ...
    222 ///    1     c         ...
    223 ///    ----------------...
    224 ///
    225 ///    Function: 'b'
    226 ///    Stacks:  1 / 2
    227 ///    Stack ID: ...
    228 ///    Stack Count: ...
    229 ///    #     Function  ...
    230 ///    0     a         ...
    231 ///    1     b         ...
    232 ///    2     c         ...
    233 ///
    234 ///
    235 /// To do this we require a Trie data structure that will allow us to represent
    236 /// all the call stacks of instrumented functions in an easily traversible
    237 /// manner when we do the aggregations and lookups. For instrumented call
    238 /// sequences like the following:
    239 ///
    240 ///   a()
    241 ///    b()
    242 ///     c()
    243 ///     d()
    244 ///    c()
    245 ///
    246 /// We will have a representation like so:
    247 ///
    248 ///   a -> b -> c
    249 ///   |    |
    250 ///   |    +--> d
    251 ///   |
    252 ///   +--> c
    253 ///
    254 /// We maintain a sequence of durations on the leaves and in the internal nodes
    255 /// as we go through and process every record from the XRay trace. We also
    256 /// maintain an index of unique functions, and provide a means of iterating
    257 /// through all the instrumented call stacks which we know about.
    258 
    259 struct StackDuration {
    260   llvm::SmallVector<int64_t, 4> TerminalDurations;
    261   llvm::SmallVector<int64_t, 4> IntermediateDurations;
    262 };
    263 
    264 StackDuration mergeStackDuration(const StackDuration &Left,
    265                                  const StackDuration &Right) {
    266   StackDuration Data{};
    267   Data.TerminalDurations.reserve(Left.TerminalDurations.size() +
    268                                  Right.TerminalDurations.size());
    269   Data.IntermediateDurations.reserve(Left.IntermediateDurations.size() +
    270                                      Right.IntermediateDurations.size());
    271   // Aggregate the durations.
    272   for (auto duration : Left.TerminalDurations)
    273     Data.TerminalDurations.push_back(duration);
    274   for (auto duration : Right.TerminalDurations)
    275     Data.TerminalDurations.push_back(duration);
    276 
    277   for (auto duration : Left.IntermediateDurations)
    278     Data.IntermediateDurations.push_back(duration);
    279   for (auto duration : Right.IntermediateDurations)
    280     Data.IntermediateDurations.push_back(duration);
    281   return Data;
    282 }
    283 
    284 using StackTrieNode = TrieNode<StackDuration>;
    285 
    286 template <AggregationType AggType>
    287 std::size_t GetValueForStack(const StackTrieNode *Node);
    288 
    289 // When computing total time spent in a stack, we're adding the timings from
    290 // its callees and the timings from when it was a leaf.
    291 template <>
    292 std::size_t
    293 GetValueForStack<AggregationType::TOTAL_TIME>(const StackTrieNode *Node) {
    294   auto TopSum = std::accumulate(Node->ExtraData.TerminalDurations.begin(),
    295                                 Node->ExtraData.TerminalDurations.end(), 0uLL);
    296   return std::accumulate(Node->ExtraData.IntermediateDurations.begin(),
    297                          Node->ExtraData.IntermediateDurations.end(), TopSum);
    298 }
    299 
    300 // Calculates how many times a function was invoked.
    301 // TODO: Hook up option to produce stacks
    302 template <>
    303 std::size_t
    304 GetValueForStack<AggregationType::INVOCATION_COUNT>(const StackTrieNode *Node) {
    305   return Node->ExtraData.TerminalDurations.size() +
    306          Node->ExtraData.IntermediateDurations.size();
    307 }
    308 
    309 // Make sure there are implementations for each enum value.
    310 template <AggregationType T> struct DependentFalseType : std::false_type {};
    311 
    312 template <AggregationType AggType>
    313 std::size_t GetValueForStack(const StackTrieNode *Node) {
    314   static_assert(DependentFalseType<AggType>::value,
    315                 "No implementation found for aggregation type provided.");
    316   return 0;
    317 }
    318 
    319 class StackTrie {
    320   // Avoid the magic number of 4 propagated through the code with an alias.
    321   // We use this SmallVector to track the root nodes in a call graph.
    322   using RootVector = SmallVector<StackTrieNode *, 4>;
    323 
    324   // We maintain pointers to the roots of the tries we see.
    325   DenseMap<uint32_t, RootVector> Roots;
    326 
    327   // We make sure all the nodes are accounted for in this list.
    328   std::forward_list<StackTrieNode> NodeStore;
    329 
    330   // A map of thread ids to pairs call stack trie nodes and their start times.
    331   DenseMap<uint32_t, SmallVector<std::pair<StackTrieNode *, uint64_t>, 8>>
    332       ThreadStackMap;
    333 
    334   StackTrieNode *createTrieNode(uint32_t ThreadId, int32_t FuncId,
    335                                 StackTrieNode *Parent) {
    336     NodeStore.push_front(StackTrieNode{FuncId, Parent, {}, {{}, {}}});
    337     auto I = NodeStore.begin();
    338     auto *Node = &*I;
    339     if (!Parent)
    340       Roots[ThreadId].push_back(Node);
    341     return Node;
    342   }
    343 
    344   StackTrieNode *findRootNode(uint32_t ThreadId, int32_t FuncId) {
    345     const auto &RootsByThread = Roots[ThreadId];
    346     auto I = find_if(RootsByThread,
    347                      [&](StackTrieNode *N) { return N->FuncId == FuncId; });
    348     return (I == RootsByThread.end()) ? nullptr : *I;
    349   }
    350 
    351 public:
    352   enum class AccountRecordStatus {
    353     OK,              // Successfully processed
    354     ENTRY_NOT_FOUND, // An exit record had no matching call stack entry
    355     UNKNOWN_RECORD_TYPE
    356   };
    357 
    358   struct AccountRecordState {
    359     // We keep track of whether the call stack is currently unwinding.
    360     bool wasLastRecordExit;
    361 
    362     static AccountRecordState CreateInitialState() { return {false}; }
    363   };
    364 
    365   AccountRecordStatus accountRecord(const XRayRecord &R,
    366                                     AccountRecordState *state) {
    367     auto &TS = ThreadStackMap[R.TId];
    368     switch (R.Type) {
    369     case RecordTypes::ENTER:
    370     case RecordTypes::ENTER_ARG: {
    371       state->wasLastRecordExit = false;
    372       // When we encounter a new function entry, we want to record the TSC for
    373       // that entry, and the function id. Before doing so we check the top of
    374       // the stack to see if there are callees that already represent this
    375       // function.
    376       if (TS.empty()) {
    377         auto *Root = findRootNode(R.TId, R.FuncId);
    378         TS.emplace_back(Root ? Root : createTrieNode(R.TId, R.FuncId, nullptr),
    379                         R.TSC);
    380         return AccountRecordStatus::OK;
    381       }
    382 
    383       auto &Top = TS.back();
    384       auto I = find_if(Top.first->Callees,
    385                        [&](StackTrieNode *N) { return N->FuncId == R.FuncId; });
    386       if (I == Top.first->Callees.end()) {
    387         // We didn't find the callee in the stack trie, so we're going to
    388         // add to the stack then set up the pointers properly.
    389         auto N = createTrieNode(R.TId, R.FuncId, Top.first);
    390         Top.first->Callees.emplace_back(N);
    391 
    392         // Top may be invalidated after this statement.
    393         TS.emplace_back(N, R.TSC);
    394       } else {
    395         // We found the callee in the stack trie, so we'll use that pointer
    396         // instead, add it to the stack associated with the TSC.
    397         TS.emplace_back(*I, R.TSC);
    398       }
    399       return AccountRecordStatus::OK;
    400     }
    401     case RecordTypes::EXIT:
    402     case RecordTypes::TAIL_EXIT: {
    403       bool wasLastRecordExit = state->wasLastRecordExit;
    404       state->wasLastRecordExit = true;
    405       // The exit case is more interesting, since we want to be able to deduce
    406       // missing exit records. To do that properly, we need to look up the stack
    407       // and see whether the exit record matches any of the entry records. If it
    408       // does match, we attempt to record the durations as we pop the stack to
    409       // where we see the parent.
    410       if (TS.empty()) {
    411         // Short circuit, and say we can't find it.
    412 
    413         return AccountRecordStatus::ENTRY_NOT_FOUND;
    414       }
    415 
    416       auto FunctionEntryMatch = find_if(
    417           reverse(TS), [&](const std::pair<StackTrieNode *, uint64_t> &E) {
    418             return E.first->FuncId == R.FuncId;
    419           });
    420       auto status = AccountRecordStatus::OK;
    421       if (FunctionEntryMatch == TS.rend()) {
    422         status = AccountRecordStatus::ENTRY_NOT_FOUND;
    423       } else {
    424         // Account for offset of 1 between reverse and forward iterators. We
    425         // want the forward iterator to include the function that is exited.
    426         ++FunctionEntryMatch;
    427       }
    428       auto I = FunctionEntryMatch.base();
    429       for (auto &E : make_range(I, TS.end() - 1))
    430         E.first->ExtraData.IntermediateDurations.push_back(
    431             std::max(E.second, R.TSC) - std::min(E.second, R.TSC));
    432       auto &Deepest = TS.back();
    433       if (wasLastRecordExit)
    434         Deepest.first->ExtraData.IntermediateDurations.push_back(
    435             std::max(Deepest.second, R.TSC) - std::min(Deepest.second, R.TSC));
    436       else
    437         Deepest.first->ExtraData.TerminalDurations.push_back(
    438             std::max(Deepest.second, R.TSC) - std::min(Deepest.second, R.TSC));
    439       TS.erase(I, TS.end());
    440       return status;
    441     }
    442     }
    443     return AccountRecordStatus::UNKNOWN_RECORD_TYPE;
    444   }
    445 
    446   bool isEmpty() const { return Roots.empty(); }
    447 
    448   void printStack(raw_ostream &OS, const StackTrieNode *Top,
    449                   FuncIdConversionHelper &FN) {
    450     // Traverse the pointers up to the parent, noting the sums, then print
    451     // in reverse order (callers at top, callees down bottom).
    452     SmallVector<const StackTrieNode *, 8> CurrentStack;
    453     for (auto *F = Top; F != nullptr; F = F->Parent)
    454       CurrentStack.push_back(F);
    455     int Level = 0;
    456     OS << formatv("{0,-5} {1,-60} {2,+12} {3,+16}\n", "lvl", "function",
    457                   "count", "sum");
    458     for (auto *F :
    459          reverse(make_range(CurrentStack.begin() + 1, CurrentStack.end()))) {
    460       auto Sum = std::accumulate(F->ExtraData.IntermediateDurations.begin(),
    461                                  F->ExtraData.IntermediateDurations.end(), 0LL);
    462       auto FuncId = FN.SymbolOrNumber(F->FuncId);
    463       OS << formatv("#{0,-4} {1,-60} {2,+12} {3,+16}\n", Level++,
    464                     FuncId.size() > 60 ? FuncId.substr(0, 57) + "..." : FuncId,
    465                     F->ExtraData.IntermediateDurations.size(), Sum);
    466     }
    467     auto *Leaf = *CurrentStack.begin();
    468     auto LeafSum =
    469         std::accumulate(Leaf->ExtraData.TerminalDurations.begin(),
    470                         Leaf->ExtraData.TerminalDurations.end(), 0LL);
    471     auto LeafFuncId = FN.SymbolOrNumber(Leaf->FuncId);
    472     OS << formatv("#{0,-4} {1,-60} {2,+12} {3,+16}\n", Level++,
    473                   LeafFuncId.size() > 60 ? LeafFuncId.substr(0, 57) + "..."
    474                                          : LeafFuncId,
    475                   Leaf->ExtraData.TerminalDurations.size(), LeafSum);
    476     OS << "\n";
    477   }
    478 
    479   /// Prints top stacks for each thread.
    480   void printPerThread(raw_ostream &OS, FuncIdConversionHelper &FN) {
    481     for (auto iter : Roots) {
    482       OS << "Thread " << iter.first << ":\n";
    483       print(OS, FN, iter.second);
    484       OS << "\n";
    485     }
    486   }
    487 
    488   /// Prints timing sums for each stack in each threads.
    489   template <AggregationType AggType>
    490   void printAllPerThread(raw_ostream &OS, FuncIdConversionHelper &FN,
    491                          StackOutputFormat format) {
    492     for (auto iter : Roots) {
    493       uint32_t threadId = iter.first;
    494       RootVector &perThreadRoots = iter.second;
    495       bool reportThreadId = true;
    496       printAll<AggType>(OS, FN, perThreadRoots, threadId, reportThreadId);
    497     }
    498   }
    499 
    500   /// Prints top stacks from looking at all the leaves and ignoring thread IDs.
    501   /// Stacks that consist of the same function IDs but were called in different
    502   /// thread IDs are not considered unique in this printout.
    503   void printIgnoringThreads(raw_ostream &OS, FuncIdConversionHelper &FN) {
    504     RootVector RootValues;
    505 
    506     // Function to pull the values out of a map iterator.
    507     using RootsType = decltype(Roots.begin())::value_type;
    508     auto MapValueFn = [](const RootsType &Value) { return Value.second; };
    509 
    510     for (const auto &RootNodeRange :
    511          make_range(map_iterator(Roots.begin(), MapValueFn),
    512                     map_iterator(Roots.end(), MapValueFn))) {
    513       for (auto *RootNode : RootNodeRange)
    514         RootValues.push_back(RootNode);
    515     }
    516 
    517     print(OS, FN, RootValues);
    518   }
    519 
    520   /// Creates a merged list of Tries for unique stacks that disregards their
    521   /// thread IDs.
    522   RootVector mergeAcrossThreads(std::forward_list<StackTrieNode> &NodeStore) {
    523     RootVector MergedByThreadRoots;
    524     for (auto MapIter : Roots) {
    525       const auto &RootNodeVector = MapIter.second;
    526       for (auto *Node : RootNodeVector) {
    527         auto MaybeFoundIter =
    528             find_if(MergedByThreadRoots, [Node](StackTrieNode *elem) {
    529               return Node->FuncId == elem->FuncId;
    530             });
    531         if (MaybeFoundIter == MergedByThreadRoots.end()) {
    532           MergedByThreadRoots.push_back(Node);
    533         } else {
    534           MergedByThreadRoots.push_back(mergeTrieNodes(
    535               **MaybeFoundIter, *Node, nullptr, NodeStore, mergeStackDuration));
    536           MergedByThreadRoots.erase(MaybeFoundIter);
    537         }
    538       }
    539     }
    540     return MergedByThreadRoots;
    541   }
    542 
    543   /// Print timing sums for all stacks merged by Thread ID.
    544   template <AggregationType AggType>
    545   void printAllAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN,
    546                                   StackOutputFormat format) {
    547     std::forward_list<StackTrieNode> AggregatedNodeStore;
    548     RootVector MergedByThreadRoots = mergeAcrossThreads(AggregatedNodeStore);
    549     bool reportThreadId = false;
    550     printAll<AggType>(OS, FN, MergedByThreadRoots,
    551                       /*threadId*/ 0, reportThreadId);
    552   }
    553 
    554   /// Merges the trie by thread id before printing top stacks.
    555   void printAggregatingThreads(raw_ostream &OS, FuncIdConversionHelper &FN) {
    556     std::forward_list<StackTrieNode> AggregatedNodeStore;
    557     RootVector MergedByThreadRoots = mergeAcrossThreads(AggregatedNodeStore);
    558     print(OS, FN, MergedByThreadRoots);
    559   }
    560 
    561   // TODO: Add a format option when more than one are supported.
    562   template <AggregationType AggType>
    563   void printAll(raw_ostream &OS, FuncIdConversionHelper &FN,
    564                 RootVector RootValues, uint32_t ThreadId, bool ReportThread) {
    565     SmallVector<const StackTrieNode *, 16> S;
    566     for (const auto *N : RootValues) {
    567       S.clear();
    568       S.push_back(N);
    569       while (!S.empty()) {
    570         auto *Top = S.pop_back_val();
    571         printSingleStack<AggType>(OS, FN, ReportThread, ThreadId, Top);
    572         for (const auto *C : Top->Callees)
    573           S.push_back(C);
    574       }
    575     }
    576   }
    577 
    578   /// Prints values for stacks in a format consumable for the flamegraph.pl
    579   /// tool. This is a line based format that lists each level in the stack
    580   /// hierarchy in a semicolon delimited form followed by a space and a numeric
    581   /// value. If breaking down by thread, the thread ID will be added as the
    582   /// root level of the stack.
    583   template <AggregationType AggType>
    584   void printSingleStack(raw_ostream &OS, FuncIdConversionHelper &Converter,
    585                         bool ReportThread, uint32_t ThreadId,
    586                         const StackTrieNode *Node) {
    587     if (ReportThread)
    588       OS << "thread_" << ThreadId << ";";
    589     SmallVector<const StackTrieNode *, 5> lineage{};
    590     lineage.push_back(Node);
    591     while (lineage.back()->Parent != nullptr)
    592       lineage.push_back(lineage.back()->Parent);
    593     while (!lineage.empty()) {
    594       OS << Converter.SymbolOrNumber(lineage.back()->FuncId) << ";";
    595       lineage.pop_back();
    596     }
    597     OS << " " << GetValueForStack<AggType>(Node) << "\n";
    598   }
    599 
    600   void print(raw_ostream &OS, FuncIdConversionHelper &FN,
    601              RootVector RootValues) {
    602     // Go through each of the roots, and traverse the call stack, producing the
    603     // aggregates as you go along. Remember these aggregates and stacks, and
    604     // show summary statistics about:
    605     //
    606     //   - Total number of unique stacks
    607     //   - Top 10 stacks by count
    608     //   - Top 10 stacks by aggregate duration
    609     SmallVector<std::pair<const StackTrieNode *, uint64_t>, 11>
    610         TopStacksByCount;
    611     SmallVector<std::pair<const StackTrieNode *, uint64_t>, 11> TopStacksBySum;
    612     auto greater_second =
    613         [](const std::pair<const StackTrieNode *, uint64_t> &A,
    614            const std::pair<const StackTrieNode *, uint64_t> &B) {
    615           return A.second > B.second;
    616         };
    617     uint64_t UniqueStacks = 0;
    618     for (const auto *N : RootValues) {
    619       SmallVector<const StackTrieNode *, 16> S;
    620       S.emplace_back(N);
    621 
    622       while (!S.empty()) {
    623         auto *Top = S.pop_back_val();
    624 
    625         // We only start printing the stack (by walking up the parent pointers)
    626         // when we get to a leaf function.
    627         if (!Top->ExtraData.TerminalDurations.empty()) {
    628           ++UniqueStacks;
    629           auto TopSum =
    630               std::accumulate(Top->ExtraData.TerminalDurations.begin(),
    631                               Top->ExtraData.TerminalDurations.end(), 0uLL);
    632           {
    633             auto E = std::make_pair(Top, TopSum);
    634             TopStacksBySum.insert(std::lower_bound(TopStacksBySum.begin(),
    635                                                    TopStacksBySum.end(), E,
    636                                                    greater_second),
    637                                   E);
    638             if (TopStacksBySum.size() == 11)
    639               TopStacksBySum.pop_back();
    640           }
    641           {
    642             auto E =
    643                 std::make_pair(Top, Top->ExtraData.TerminalDurations.size());
    644             TopStacksByCount.insert(std::lower_bound(TopStacksByCount.begin(),
    645                                                      TopStacksByCount.end(), E,
    646                                                      greater_second),
    647                                     E);
    648             if (TopStacksByCount.size() == 11)
    649               TopStacksByCount.pop_back();
    650           }
    651         }
    652         for (const auto *C : Top->Callees)
    653           S.push_back(C);
    654       }
    655     }
    656 
    657     // Now print the statistics in the end.
    658     OS << "\n";
    659     OS << "Unique Stacks: " << UniqueStacks << "\n";
    660     OS << "Top 10 Stacks by leaf sum:\n\n";
    661     for (const auto &P : TopStacksBySum) {
    662       OS << "Sum: " << P.second << "\n";
    663       printStack(OS, P.first, FN);
    664     }
    665     OS << "\n";
    666     OS << "Top 10 Stacks by leaf count:\n\n";
    667     for (const auto &P : TopStacksByCount) {
    668       OS << "Count: " << P.second << "\n";
    669       printStack(OS, P.first, FN);
    670     }
    671     OS << "\n";
    672   }
    673 };
    674 
    675 std::string CreateErrorMessage(StackTrie::AccountRecordStatus Error,
    676                                const XRayRecord &Record,
    677                                const FuncIdConversionHelper &Converter) {
    678   switch (Error) {
    679   case StackTrie::AccountRecordStatus::ENTRY_NOT_FOUND:
    680     return formatv("Found record {0} with no matching function entry\n",
    681                    format_xray_record(Record, Converter));
    682   default:
    683     return formatv("Unknown error type for record {0}\n",
    684                    format_xray_record(Record, Converter));
    685   }
    686 }
    687 
    688 static CommandRegistration Unused(&Stack, []() -> Error {
    689   // Load each file provided as a command-line argument. For each one of them
    690   // account to a single StackTrie, and just print the whole trie for now.
    691   StackTrie ST;
    692   InstrumentationMap Map;
    693   if (!StacksInstrMap.empty()) {
    694     auto InstrumentationMapOrError = loadInstrumentationMap(StacksInstrMap);
    695     if (!InstrumentationMapOrError)
    696       return joinErrors(
    697           make_error<StringError>(
    698               Twine("Cannot open instrumentation map: ") + StacksInstrMap,
    699               std::make_error_code(std::errc::invalid_argument)),
    700           InstrumentationMapOrError.takeError());
    701     Map = std::move(*InstrumentationMapOrError);
    702   }
    703 
    704   if (SeparateThreadStacks && AggregateThreads)
    705     return make_error<StringError>(
    706         Twine("Can't specify options for per thread reporting and reporting "
    707               "that aggregates threads."),
    708         std::make_error_code(std::errc::invalid_argument));
    709 
    710   if (!DumpAllStacks && StacksOutputFormat != HUMAN)
    711     return make_error<StringError>(
    712         Twine("Can't specify a non-human format without -all-stacks."),
    713         std::make_error_code(std::errc::invalid_argument));
    714 
    715   if (DumpAllStacks && StacksOutputFormat == HUMAN)
    716     return make_error<StringError>(
    717         Twine("You must specify a non-human format when reporting with "
    718               "-all-stacks."),
    719         std::make_error_code(std::errc::invalid_argument));
    720 
    721   symbolize::LLVMSymbolizer::Options Opts(
    722       symbolize::FunctionNameKind::LinkageName, true, true, false, "");
    723   symbolize::LLVMSymbolizer Symbolizer(Opts);
    724   FuncIdConversionHelper FuncIdHelper(StacksInstrMap, Symbolizer,
    725                                       Map.getFunctionAddresses());
    726   // TODO: Someday, support output to files instead of just directly to
    727   // standard output.
    728   for (const auto &Filename : StackInputs) {
    729     auto TraceOrErr = loadTraceFile(Filename);
    730     if (!TraceOrErr) {
    731       if (!StackKeepGoing)
    732         return joinErrors(
    733             make_error<StringError>(
    734                 Twine("Failed loading input file '") + Filename + "'",
    735                 std::make_error_code(std::errc::invalid_argument)),
    736             TraceOrErr.takeError());
    737       logAllUnhandledErrors(TraceOrErr.takeError(), errs(), "");
    738       continue;
    739     }
    740     auto &T = *TraceOrErr;
    741     StackTrie::AccountRecordState AccountRecordState =
    742         StackTrie::AccountRecordState::CreateInitialState();
    743     for (const auto &Record : T) {
    744       auto error = ST.accountRecord(Record, &AccountRecordState);
    745       if (error != StackTrie::AccountRecordStatus::OK) {
    746         if (!StackKeepGoing)
    747           return make_error<StringError>(
    748               CreateErrorMessage(error, Record, FuncIdHelper),
    749               make_error_code(errc::illegal_byte_sequence));
    750         errs() << CreateErrorMessage(error, Record, FuncIdHelper);
    751       }
    752     }
    753   }
    754   if (ST.isEmpty()) {
    755     return make_error<StringError>(
    756         "No instrumented calls were accounted in the input file.",
    757         make_error_code(errc::result_out_of_range));
    758   }
    759 
    760   // Report the stacks in a long form mode for another tool to analyze.
    761   if (DumpAllStacks) {
    762     if (AggregateThreads) {
    763       switch (RequestedAggregation) {
    764       case AggregationType::TOTAL_TIME:
    765         ST.printAllAggregatingThreads<AggregationType::TOTAL_TIME>(
    766             outs(), FuncIdHelper, StacksOutputFormat);
    767         break;
    768       case AggregationType::INVOCATION_COUNT:
    769         ST.printAllAggregatingThreads<AggregationType::INVOCATION_COUNT>(
    770             outs(), FuncIdHelper, StacksOutputFormat);
    771         break;
    772       }
    773     } else {
    774       switch (RequestedAggregation) {
    775       case AggregationType::TOTAL_TIME:
    776         ST.printAllPerThread<AggregationType::TOTAL_TIME>(outs(), FuncIdHelper,
    777                                                           StacksOutputFormat);
    778         break;
    779       case AggregationType::INVOCATION_COUNT:
    780         ST.printAllPerThread<AggregationType::INVOCATION_COUNT>(
    781             outs(), FuncIdHelper, StacksOutputFormat);
    782         break;
    783       }
    784     }
    785     return Error::success();
    786   }
    787 
    788   // We're only outputting top stacks.
    789   if (AggregateThreads) {
    790     ST.printAggregatingThreads(outs(), FuncIdHelper);
    791   } else if (SeparateThreadStacks) {
    792     ST.printPerThread(outs(), FuncIdHelper);
    793   } else {
    794     ST.printIgnoringThreads(outs(), FuncIdHelper);
    795   }
    796   return Error::success();
    797 });
    798