Home | History | Annotate | Download | only in util
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/core/util/stat_summarizer.h"
     17 
     18 #include <iomanip>
     19 #include <map>
     20 #include <queue>
     21 #include <sstream>
     22 #include <string>
     23 
     24 #include "tensorflow/core/framework/step_stats.pb.h"
     25 #include "tensorflow/core/framework/tensor_description.pb.h"
     26 #include "tensorflow/core/framework/tensor_shape.pb.h"
     27 #include "tensorflow/core/lib/strings/str_util.h"
     28 #include "tensorflow/core/platform/env.h"
     29 #include "tensorflow/core/platform/logging.h"
     30 #include "tensorflow/core/platform/types.h"
     31 
     32 namespace tensorflow {
     33 
     34 StatSummarizer::StatSummarizer(const StatSummarizerOptions& options)
     35     : options_(options) {}
     36 
     37 StatSummarizer::StatSummarizer(const tensorflow::GraphDef& tensorflow_graph)
     38     : StatSummarizer(StatSummarizerOptions()) {}
     39 
     40 StatSummarizer::~StatSummarizer() {}
     41 
     42 void StatSummarizer::Reset() {
     43   run_total_us_.Reset();
     44   memory_.Reset();
     45   details_.clear();
     46 }
     47 
     48 void StatSummarizer::Validate(const Detail* detail,
     49                               const NodeExecStats& ns) const {
     50   if (detail->outputs.size() != ns.output_size()) {
     51     LOG(WARNING) << "Number of outputs changed between runs for '"
     52                  << ns.node_name() << "' - was " << detail->outputs.size()
     53                  << ", now " << ns.output_size();
     54   } else {
     55     for (const auto& output : ns.output()) {
     56       const int32 slot = output.slot();
     57       if ((slot < 0) || (slot >= ns.output_size())) {
     58         // This is not a hard error for Switch ops, so just pass.
     59         continue;
     60       }
     61       const auto& stored = detail->outputs[slot];
     62       const auto& current = output.tensor_description();
     63 
     64       bool do_tensors_match =
     65           (stored.dtype() == current.dtype()) &&
     66           (stored.shape().dim_size() == current.shape().dim_size());
     67 
     68       if (do_tensors_match) {
     69         for (int i = 0; i < stored.shape().dim_size(); ++i) {
     70           if (stored.shape().dim(i).size() != current.shape().dim(i).size()) {
     71             do_tensors_match = false;
     72             break;
     73           }
     74         }
     75       }
     76 
     77       if (!do_tensors_match) {
     78         LOG(WARNING) << "Output tensor changed between runs for '"
     79                      << ns.node_name();
     80       }
     81     }
     82   }
     83 }
     84 
     85 namespace {
     86 std::string OpType(const DeviceStepStats& ds, const NodeExecStats& ns) {
     87   // There is no published specification of how DeviceStats and NodeStats
     88   // are filled in. Thus, we live with the fragility of this implementation.
     89   //
     90   // Note that NodeStats.node_name may NOT refer to a node in the Graph.
     91   // This can happen if, either:
     92   // (1) The DeviceStats corresponds to statistics from the GPUTracer
     93   //     logging (which adds devices whose name contains either "/stream"
     94   //     or "/memcpy" to the StepStats), OR
     95   // (2) The graph was partitioned, and thus the NodeStats refers to
     96   //     the SendTensor or RecvTensor operations added.
     97   // For these cases, return "<>" as the "type" of the operation.
     98   //
     99   // The StatSummarizer was initially aimed at CPU execution on mobile, where
    100   // there was no GPUTracing and no graph partitioning, so the conditions above
    101   // do not occur.
    102   //
    103   // It would be nice to have a clearer spec for StepStats so utilities such as
    104   // this class can handle nodes that do not appear in the original graph
    105   // gracefully. Till then, duplicate what is done by:
    106   // https://www.tensorflow.org/code/tensorflow/python/client/timeline.py
    107   // and rely on the unittest.
    108   if (ds.device().find("/stream") != std::string::npos ||
    109       ds.device().find("/memcpy") != std::string::npos) {
    110     // Stats from the GPUTracer, does not correspond to TensorFlow ops.
    111     return "<>";
    112   }
    113   // timeline_label should be of the format: <node_name> = <op_type>(<args>)
    114   // Extract <op_type>.
    115   const std::string sep(" = ");
    116   const std::string& label = ns.timeline_label();
    117   std::string::size_type start = label.find(sep);
    118   if (start == std::string::npos) return "<>";
    119   start += sep.size();
    120   std::string::size_type end = label.find("(", start);
    121   if (end == std::string::npos) return "<>";
    122   return label.substr(start, end - start);
    123 }
    124 }  // namespace
    125 
    126 void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
    127   int64 curr_total_us = 0;
    128   int64 mem_total = 0;
    129 
    130   int64 first_node_start_us =
    131       step_stats.dev_stats(0).node_stats(0).all_start_micros();
    132 
    133   int node_num = 0;
    134   for (const auto& ds : step_stats.dev_stats()) {
    135     for (const auto& ns : ds.node_stats()) {
    136       // NOTE(blackhc): To better support GPUs:
    137       // GPU kernels are duplicated both in /stream:all and their
    138       // /stream:$index. GPU memcpys are duplicated both in /memcpy and their
    139       // /stream:$index. So only keep /stream:all and /memcpy and ignore all
    140       // /stream:$index to only count GPU executions once.
    141       if (ds.device().find("/stream") != std::string::npos &&
    142           ds.device().find("/stream:all") == std::string::npos) {
    143         continue;
    144       }
    145 
    146       std::string name = ns.node_name();
    147       std::string op_type = "<>";
    148       // NOTE(blackhc): we have to ensure that all keys into the detail map
    149       // are unique, so we add [Kernel] or [MemCpy] as a suffix to the name.
    150       // To make the node type summary work better, we prefix "gpu:" to
    151       // the op type when the info is from a /gpu/stream or /memcpy channel.
    152       if (ds.device().find("/stream") != std::string::npos) {
    153         // node_name: name ":" opType
    154         auto parts = str_util::Split(ns.node_name(), ':');
    155         if (parts.size() == 2) {
    156           name = parts[0] + " [Kernel]";
    157           op_type = "gpu:" + parts[1];
    158         }
    159       } else if (ds.device().find("/memcpy") != std::string::npos) {
    160         // node_name: name (":" opType)? ":" memCpyType
    161         auto parts = str_util::Split(ns.node_name(), ':');
    162         if (parts.size() == 2 || parts.size() == 3) {
    163           name = parts.front() + " [MemCpy]";
    164           // We don't care about the actual op type (it might not be available
    165           // for edge_ memcpys). We only care that it's a memcpy for now.
    166           op_type = "gpu:" + parts.back();
    167         }
    168       } else {
    169         op_type = OpType(ds, ns);
    170       }
    171 
    172       ++node_num;
    173       const int64 curr_time = ns.all_end_rel_micros();
    174       curr_total_us += curr_time;
    175       auto result = details_.emplace(name, Detail());
    176       Detail* detail = &(result.first->second);
    177 
    178       detail->start_us.UpdateStat(ns.all_start_micros() - first_node_start_us);
    179       detail->rel_end_us.UpdateStat(curr_time);
    180 
    181       // If this is the first pass, initialize some values.
    182       if (result.second) {
    183         detail->name = name;
    184         detail->type = op_type;
    185 
    186         detail->run_order = node_num;
    187 
    188         detail->outputs.resize(ns.output_size());
    189         for (const auto& output : ns.output()) {
    190           const int32 slot = output.slot();
    191           if ((slot < 0) || (slot >= ns.output_size())) {
    192             // This is not a hard error for Switch ops, so just pass.
    193             continue;
    194           }
    195           detail->outputs[slot] = output.tensor_description();
    196         }
    197 
    198         detail->times_called = 0;
    199       }
    200 
    201       int64 curr_node_mem = 0;
    202       for (const auto& mem : ns.memory()) {
    203         const int64 mem_usage = mem.total_bytes();
    204         curr_node_mem += mem_usage;
    205       }
    206       detail->mem_used.UpdateStat(curr_node_mem);
    207       mem_total += curr_node_mem;
    208 
    209       ++detail->times_called;
    210 
    211       Validate(detail, ns);
    212     }
    213   }
    214 
    215   run_total_us_.UpdateStat(curr_total_us);
    216   memory_.UpdateStat(mem_total);
    217 }
    218 
    219 std::string StatSummarizer::ShortSummary() const {
    220   std::stringstream stream;
    221   stream << "Timings (microseconds): ";
    222   run_total_us_.OutputToStream(&stream);
    223   stream << std::endl;
    224 
    225   stream << "Memory (bytes): ";
    226   memory_.OutputToStream(&stream);
    227   stream << std::endl;
    228 
    229   stream << details_.size() << " nodes observed" << std::endl;
    230   return stream.str();
    231 }
    232 
    233 std::ostream& InitField(std::ostream& stream, int width) {
    234   stream << "\t" << std::right << std::setw(width) << std::fixed
    235          << std::setprecision(3);
    236   return stream;
    237 }
    238 
    239 std::string StatSummarizer::HeaderString(const string& title) const {
    240   std::stringstream stream;
    241 
    242   stream << "============================== " << title
    243          << " ==============================" << std::endl;
    244 
    245   InitField(stream, 24) << "[node type]";
    246   InitField(stream, 9) << "[start]";
    247   InitField(stream, 9) << "[first]";
    248   InitField(stream, 9) << "[avg ms]";
    249   InitField(stream, 8) << "[%]";
    250   InitField(stream, 8) << "[cdf%]";
    251   InitField(stream, 10) << "[mem KB]";
    252   InitField(stream, 9) << "[times called]";
    253   stream << "\t"
    254          << "[Name]";
    255   return stream.str();
    256 }
    257 
    258 std::string StatSummarizer::ColumnString(const Detail& detail,
    259                                          const int64 cumulative_stat_on_node,
    260                                          const Stat<int64>& stat) const {
    261   const double start_ms = detail.start_us.avg() / 1000.0;
    262   const double first_time_ms = detail.rel_end_us.first() / 1000.0;
    263   const double avg_time_ms = detail.rel_end_us.avg() / 1000.0;
    264   const double percentage = detail.rel_end_us.sum() * 100.0 / stat.sum();
    265   const double cdf_percentage = (cumulative_stat_on_node * 100.0f) / stat.sum();
    266   const int64 times_called = detail.times_called / num_runs();
    267 
    268   std::stringstream stream;
    269   InitField(stream, 24) << detail.type;
    270   InitField(stream, 9) << start_ms;
    271   InitField(stream, 9) << first_time_ms;
    272   InitField(stream, 9) << avg_time_ms;
    273   InitField(stream, 7) << percentage << "%";
    274   InitField(stream, 7) << cdf_percentage << "%";
    275   InitField(stream, 10) << detail.mem_used.newest() / 1000.0;
    276   InitField(stream, 9) << times_called;
    277   stream << "\t" << detail.name;
    278 
    279   return stream.str();
    280 }
    281 
    282 void StatSummarizer::OrderNodesByMetric(
    283     SortingMetric metric, std::vector<const Detail*>* details) const {
    284   std::priority_queue<std::pair<string, const Detail*>> sorted_list;
    285   const int num_nodes = details_.size();
    286 
    287   for (const auto& det : details_) {
    288     const Detail* detail = &(det.second);
    289     std::stringstream stream;
    290     stream << std::setw(20) << std::right << std::setprecision(10)
    291            << std::fixed;
    292 
    293     switch (metric) {
    294       case BY_NAME:
    295         stream << detail->name;
    296         break;
    297       case BY_RUN_ORDER:
    298         stream << num_nodes - detail->run_order;
    299         break;
    300       case BY_TIME:
    301         stream << detail->rel_end_us.avg();
    302         break;
    303       case BY_MEMORY:
    304         stream << detail->mem_used.avg();
    305         break;
    306       case BY_TYPE:
    307         stream << detail->type;
    308         break;
    309       default:
    310         stream << "";
    311         break;
    312     }
    313 
    314     sorted_list.emplace(stream.str(), detail);
    315   }
    316 
    317   while (!sorted_list.empty()) {
    318     auto entry = sorted_list.top();
    319     sorted_list.pop();
    320     details->push_back(entry.second);
    321   }
    322 }
    323 
    324 void StatSummarizer::ComputeStatsByType(
    325     std::map<string, int64>* node_type_map_count,
    326     std::map<string, int64>* node_type_map_time,
    327     std::map<string, int64>* node_type_map_memory,
    328     std::map<string, int64>* node_type_map_times_called,
    329     int64* accumulated_us) const {
    330   int64 run_count = run_total_us_.count();
    331 
    332   for (const auto& det : details_) {
    333     const string node_name = det.first;
    334     const Detail& detail = det.second;
    335 
    336     int64 curr_time_val =
    337         static_cast<int64>(detail.rel_end_us.sum() / run_count);
    338     *accumulated_us += curr_time_val;
    339 
    340     int64 curr_memory_val = detail.mem_used.newest();
    341 
    342     const string& node_type = detail.type;
    343 
    344     (*node_type_map_count)[node_type] += 1;
    345     (*node_type_map_time)[node_type] += curr_time_val;
    346     (*node_type_map_memory)[node_type] += curr_memory_val;
    347     (*node_type_map_times_called)[node_type] += detail.times_called / run_count;
    348   }
    349 }
    350 
    351 std::string StatSummarizer::GetStatsByNodeType() const {
    352   std::stringstream stream;
    353 
    354   stream << "============================== Summary by node type "
    355             "=============================="
    356          << std::endl;
    357 
    358   LOG(INFO) << "Number of nodes executed: " << details_.size();
    359 
    360   std::map<string, int64> node_type_map_count;
    361   std::map<string, int64> node_type_map_time;
    362   std::map<string, int64> node_type_map_memory;
    363   std::map<string, int64> node_type_map_times_called;
    364   int64 accumulated_us = 0;
    365 
    366   ComputeStatsByType(&node_type_map_count, &node_type_map_time,
    367                      &node_type_map_memory, &node_type_map_times_called,
    368                      &accumulated_us);
    369 
    370   // Sort them.
    371   std::priority_queue<std::pair<int64, std::pair<string, int64>>> timings;
    372   for (const auto& node_type : node_type_map_time) {
    373     const int64 mem_used = node_type_map_memory[node_type.first];
    374     timings.emplace(node_type.second,
    375                     std::pair<string, int64>(node_type.first, mem_used));
    376   }
    377 
    378   InitField(stream, 24) << "[Node type]";
    379   InitField(stream, 9) << "[count]";
    380   InitField(stream, 10) << "[avg ms]";
    381   InitField(stream, 11) << "[avg %]";
    382   InitField(stream, 11) << "[cdf %]";
    383   InitField(stream, 10) << "[mem KB]";
    384   InitField(stream, 10) << "[times called]";
    385   stream << std::endl;
    386 
    387   float cdf = 0.0f;
    388   while (!timings.empty()) {
    389     auto entry = timings.top();
    390     timings.pop();
    391 
    392     const string node_type = entry.second.first;
    393     const float memory = entry.second.second / 1000.0f;
    394 
    395     const int64 node_type_total_us = entry.first;
    396     const float time_per_run_ms = node_type_total_us / 1000.0f;
    397 
    398     const float percentage =
    399         ((entry.first / static_cast<float>(accumulated_us)) * 100.0f);
    400     cdf += percentage;
    401 
    402     InitField(stream, 24) << node_type;
    403     InitField(stream, 9) << node_type_map_count[node_type];
    404     InitField(stream, 10) << time_per_run_ms;
    405     InitField(stream, 10) << percentage << "%";
    406     InitField(stream, 10) << cdf << "%";
    407     InitField(stream, 10) << memory;
    408     InitField(stream, 9) << node_type_map_times_called[node_type];
    409     stream << std::endl;
    410   }
    411   stream << std::endl;
    412   return stream.str();
    413 }
    414 
    415 std::string StatSummarizer::GetStatsByMetric(const string& title,
    416                                              SortingMetric sorting_metric,
    417                                              int num_stats) const {
    418   std::vector<const Detail*> details;
    419   OrderNodesByMetric(sorting_metric, &details);
    420 
    421   double cumulative_stat_on_node = 0;
    422 
    423   std::stringstream stream;
    424   stream << HeaderString(title) << std::endl;
    425   int stat_num = 0;
    426   for (auto detail : details) {
    427     ++stat_num;
    428     if (num_stats > 0 && stat_num > num_stats) {
    429       break;
    430     }
    431 
    432     // TODO(andrewharp): Make this keep track of the particular metric for cdf.
    433     cumulative_stat_on_node += detail->rel_end_us.sum();
    434     stream << ColumnString(*detail, cumulative_stat_on_node, run_total_us_)
    435            << std::endl;
    436   }
    437   stream << std::endl;
    438   return stream.str();
    439 }
    440 
    441 std::string StatSummarizer::GetOutputString() const {
    442   std::stringstream stream;
    443   if (options_.show_run_order) {
    444     stream << GetStatsByMetric("Run Order", BY_RUN_ORDER,
    445                                options_.run_order_limit);
    446   }
    447   if (options_.show_time) {
    448     stream << GetStatsByMetric("Top by Computation Time", BY_TIME,
    449                                options_.time_limit);
    450   }
    451   if (options_.show_memory) {
    452     stream << GetStatsByMetric("Top by Memory Use", BY_MEMORY,
    453                                options_.memory_limit);
    454   }
    455   if (options_.show_type) {
    456     stream << GetStatsByNodeType();
    457   }
    458   if (options_.show_summary) {
    459     stream << ShortSummary() << std::endl;
    460   }
    461   return stream.str();
    462 }
    463 
    464 void StatSummarizer::PrintStepStats() const {
    465   string output = GetOutputString();
    466   std::istringstream iss(output);
    467   for (std::string line; std::getline(iss, line);) {
    468     LOG(INFO) << line;
    469   }
    470 }
    471 
    472 void StatSummarizer::PrintOutputs() const {
    473   std::priority_queue<
    474       std::pair<int64, const std::pair<const std::string, Detail>*>>
    475       timings;
    476   for (const auto& entry : details_) {
    477     timings.emplace(-entry.second.start_us.avg(), &entry);
    478   }
    479 
    480   LOG(INFO) << "============ Node output tensor sizes in run order ========";
    481   while (!timings.empty()) {
    482     auto entry = timings.top();
    483     timings.pop();
    484     const Detail& detail = entry.second->second;
    485     std::stringstream stream;
    486     stream << entry.second->first << "\t" << detail.outputs.size();
    487     for (const auto& tensor : detail.outputs) {
    488       stream << "\t" << DataTypeString(tensor.dtype());
    489       stream << "\t" << tensor.shape().dim_size();
    490       for (const auto& d : tensor.shape().dim()) {
    491         stream << "\t" << d.size();
    492       }
    493     }
    494     LOG(INFO) << stream.str();
    495   }
    496 }
    497 
    498 }  // namespace tensorflow
    499