/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/util/stat_summarizer.h"

#include <iomanip>
#include <map>
#include <queue>
#include <sstream>
#include <string>

#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

StatSummarizer::StatSummarizer(const StatSummarizerOptions& options)
    : options_(options) {}

StatSummarizer::StatSummarizer(const tensorflow::GraphDef& tensorflow_graph)
    : StatSummarizer(StatSummarizerOptions()) {}

StatSummarizer::~StatSummarizer() {}

void StatSummarizer::Reset() {
  run_total_us_.Reset();
  memory_.Reset();
  details_.clear();
}

void StatSummarizer::Validate(const Detail* detail,
                              const NodeExecStats& ns) const {
  if (detail->outputs.size() != ns.output_size()) {
    LOG(WARNING) << "Number of outputs changed between runs for '"
                 << ns.node_name() << "' - was " << detail->outputs.size()
                 << ", now " << ns.output_size();
  } else {
    for (const auto& output : ns.output()) {
      const int32 slot = output.slot();
      if ((slot < 0) || (slot >= ns.output_size())) {
        // This is not a hard error for Switch ops, so just pass.
        continue;
      }
      const auto& stored = detail->outputs[slot];
      const auto& current = output.tensor_description();

      bool do_tensors_match =
          (stored.dtype() == current.dtype()) &&
          (stored.shape().dim_size() == current.shape().dim_size());

      if (do_tensors_match) {
        for (int i = 0; i < stored.shape().dim_size(); ++i) {
          if (stored.shape().dim(i).size() != current.shape().dim(i).size()) {
            do_tensors_match = false;
            break;
          }
        }
      }

      if (!do_tensors_match) {
        LOG(WARNING) << "Output tensor changed between runs for '"
                     << ns.node_name() << "'";
      }
    }
  }
}

namespace {
std::string OpType(const DeviceStepStats& ds, const NodeExecStats& ns) {
  // There is no published specification of how DeviceStats and NodeStats
  // are filled in. Thus, we live with the fragility of this implementation.
  //
  // Note that NodeStats.node_name may NOT refer to a node in the Graph.
  // This can happen if, either:
  // (1) The DeviceStats corresponds to statistics from the GPUTracer
  //     logging (which adds devices whose name contains either "/stream"
  //     or "/memcpy" to the StepStats), OR
  // (2) The graph was partitioned, and thus the NodeStats refers to
  //     the SendTensor or RecvTensor operations added.
  // For these cases, return "<>" as the "type" of the operation.
  //
  // The StatSummarizer was initially aimed at CPU execution on mobile, where
  // there was no GPUTracing and no graph partitioning, so the conditions above
  // do not occur.
  //
  // It would be nice to have a clearer spec for StepStats so utilities such as
  // this class can handle nodes that do not appear in the original graph
  // gracefully. Till then, duplicate what is done by:
  // https://www.tensorflow.org/code/tensorflow/python/client/timeline.py
  // and rely on the unittest.
  if (ds.device().find("/stream") != std::string::npos ||
      ds.device().find("/memcpy") != std::string::npos) {
    // Stats from the GPUTracer; these do not correspond to TensorFlow ops.
    return "<>";
  }
  // timeline_label should be of the format: <node_name> = <op_type>(<args>)
  // Extract <op_type>.
  const std::string sep(" = ");
  const std::string& label = ns.timeline_label();
  std::string::size_type start = label.find(sep);
  if (start == std::string::npos) return "<>";
  start += sep.size();
  std::string::size_type end = label.find("(", start);
  if (end == std::string::npos) return "<>";
  return label.substr(start, end - start);
}
}  // namespace

void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
  int64 curr_total_us = 0;
  int64 mem_total = 0;

  int64 first_node_start_us =
      step_stats.dev_stats(0).node_stats(0).all_start_micros();

  int node_num = 0;
  for (const auto& ds : step_stats.dev_stats()) {
    for (const auto& ns : ds.node_stats()) {
      // NOTE(blackhc): To better support GPUs:
      // GPU kernels are duplicated both in /stream:all and their
      // /stream:$index. GPU memcpys are duplicated both in /memcpy and their
      // /stream:$index. So only keep /stream:all and /memcpy and ignore all
      // /stream:$index to only count GPU executions once.
      if (ds.device().find("/stream") != std::string::npos &&
          ds.device().find("/stream:all") == std::string::npos) {
        continue;
      }

      std::string name = ns.node_name();
      std::string op_type = "<>";
      // NOTE(blackhc): we have to ensure that all keys into the detail map
      // are unique, so we add [Kernel] or [MemCpy] as a suffix to the name.
      // To make the node type summary work better, we prefix "gpu:" to
      // the op type when the info is from a /gpu/stream or /memcpy channel.
      if (ds.device().find("/stream") != std::string::npos) {
        // node_name: name ":" opType
        auto parts = str_util::Split(ns.node_name(), ':');
        if (parts.size() == 2) {
          name = parts[0] + " [Kernel]";
          op_type = "gpu:" + parts[1];
        }
      } else if (ds.device().find("/memcpy") != std::string::npos) {
        // node_name: name (":" opType)? ":" memCpyType
        auto parts = str_util::Split(ns.node_name(), ':');
        if (parts.size() == 2 || parts.size() == 3) {
          name = parts.front() + " [MemCpy]";
          // We don't care about the actual op type (it might not be available
          // for edge_ memcpys). We only care that it's a memcpy for now.
          op_type = "gpu:" + parts.back();
        }
      } else {
        op_type = OpType(ds, ns);
      }

      ++node_num;
      const int64 curr_time = ns.all_end_rel_micros();
      curr_total_us += curr_time;
      auto result = details_.emplace(name, Detail());
      Detail* detail = &(result.first->second);

      detail->start_us.UpdateStat(ns.all_start_micros() - first_node_start_us);
      detail->rel_end_us.UpdateStat(curr_time);

      // If this is the first pass, initialize some values.
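      // details_.emplace() returns an <iterator, bool> pair; the bool is true
      // only when a new Detail was actually inserted, i.e. the first time this
      // node name is seen across all processed steps.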
      if (result.second) {
        detail->name = name;
        detail->type = op_type;

        detail->run_order = node_num;

        detail->outputs.resize(ns.output_size());
        for (const auto& output : ns.output()) {
          const int32 slot = output.slot();
          if ((slot < 0) || (slot >= ns.output_size())) {
            // This is not a hard error for Switch ops, so just pass.
            continue;
          }
          detail->outputs[slot] = output.tensor_description();
        }

        detail->times_called = 0;
      }

      int64 curr_node_mem = 0;
      for (const auto& mem : ns.memory()) {
        const int64 mem_usage = mem.total_bytes();
        curr_node_mem += mem_usage;
      }
      detail->mem_used.UpdateStat(curr_node_mem);
      mem_total += curr_node_mem;

      ++detail->times_called;

      Validate(detail, ns);
    }
  }

  run_total_us_.UpdateStat(curr_total_us);
  memory_.UpdateStat(mem_total);
}

std::string StatSummarizer::ShortSummary() const {
  std::stringstream stream;
  stream << "Timings (microseconds): ";
  run_total_us_.OutputToStream(&stream);
  stream << std::endl;

  stream << "Memory (bytes): ";
  memory_.OutputToStream(&stream);
  stream << std::endl;

  stream << details_.size() << " nodes observed" << std::endl;
  return stream.str();
}

std::ostream& InitField(std::ostream& stream, int width) {
  stream << "\t" << std::right << std::setw(width) << std::fixed
         << std::setprecision(3);
  return stream;
}

std::string StatSummarizer::HeaderString(const string& title) const {
  std::stringstream stream;

  stream << "============================== " << title
         << " ==============================" << std::endl;

  InitField(stream, 24) << "[node type]";
  InitField(stream, 9) << "[start]";
  InitField(stream, 9) << "[first]";
  InitField(stream, 9) << "[avg ms]";
  InitField(stream, 8) << "[%]";
  InitField(stream, 8) << "[cdf%]";
  InitField(stream, 10) << "[mem KB]";
  InitField(stream, 9) << "[times called]";
  stream << "\t"
         << "[Name]";
  return stream.str();
}

std::string StatSummarizer::ColumnString(const Detail& detail,
                                         const int64 cumulative_stat_on_node,
                                         const Stat<int64>& stat) const {
  const double start_ms = detail.start_us.avg() / 1000.0;
  const double first_time_ms = detail.rel_end_us.first() / 1000.0;
  const double avg_time_ms = detail.rel_end_us.avg() / 1000.0;
  const double percentage = detail.rel_end_us.sum() * 100.0 / stat.sum();
  const double cdf_percentage = (cumulative_stat_on_node * 100.0f) / stat.sum();
  const int64 times_called = detail.times_called / num_runs();

  std::stringstream stream;
  InitField(stream, 24) << detail.type;
  InitField(stream, 9) << start_ms;
  InitField(stream, 9) << first_time_ms;
  InitField(stream, 9) << avg_time_ms;
  InitField(stream, 7) << percentage << "%";
  InitField(stream, 7) << cdf_percentage << "%";
  InitField(stream, 10) << detail.mem_used.newest() / 1000.0;
  InitField(stream, 9) << times_called;
  stream << "\t" << detail.name;

  return stream.str();
}

void StatSummarizer::OrderNodesByMetric(
    SortingMetric metric, std::vector<const Detail*>* details) const {
  std::priority_queue<std::pair<string, const Detail*>> sorted_list;
  const int num_nodes = details_.size();

  for (const auto& det : details_) {
    const Detail* detail = &(det.second);
    std::stringstream stream;
    stream << std::setw(20) << std::right << std::setprecision(10)
           << std::fixed;

    switch (metric) {
      case BY_NAME:
        stream << detail->name;
        break;
      case BY_RUN_ORDER:
        stream << num_nodes - detail->run_order;
        break;
      case BY_TIME:
        stream << detail->rel_end_us.avg();
        break;
      case BY_MEMORY:
        stream << detail->mem_used.avg();
        break;
      case BY_TYPE:
        stream << detail->type;
        break;
      default:
        stream << "";
        break;
    }

    sorted_list.emplace(stream.str(), detail);
  }

  while (!sorted_list.empty()) {
    auto entry = sorted_list.top();
    sorted_list.pop();
    details->push_back(entry.second);
  }
}

void StatSummarizer::ComputeStatsByType(
    std::map<string, int64>* node_type_map_count,
    std::map<string, int64>* node_type_map_time,
    std::map<string, int64>* node_type_map_memory,
    std::map<string, int64>* node_type_map_times_called,
    int64* accumulated_us) const {
  int64 run_count = run_total_us_.count();

  for (const auto& det : details_) {
    const string node_name = det.first;
    const Detail& detail = det.second;

    int64 curr_time_val =
        static_cast<int64>(detail.rel_end_us.sum() / run_count);
    *accumulated_us += curr_time_val;

    int64 curr_memory_val = detail.mem_used.newest();

    const string& node_type = detail.type;

    (*node_type_map_count)[node_type] += 1;
    (*node_type_map_time)[node_type] += curr_time_val;
    (*node_type_map_memory)[node_type] += curr_memory_val;
    (*node_type_map_times_called)[node_type] += detail.times_called / run_count;
  }
}

std::string StatSummarizer::GetStatsByNodeType() const {
  std::stringstream stream;

  stream << "============================== Summary by node type "
            "=============================="
         << std::endl;

  LOG(INFO) << "Number of nodes executed: " << details_.size();

  std::map<string, int64> node_type_map_count;
  std::map<string, int64> node_type_map_time;
  std::map<string, int64> node_type_map_memory;
  std::map<string, int64> node_type_map_times_called;
  int64 accumulated_us = 0;

  ComputeStatsByType(&node_type_map_count, &node_type_map_time,
                     &node_type_map_memory, &node_type_map_times_called,
                     &accumulated_us);

  // Sort them.
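  // Use a max-heap keyed on total time per node type so the most expensive
  // types are emitted first.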
  std::priority_queue<std::pair<int64, std::pair<string, int64>>> timings;
  for (const auto& node_type : node_type_map_time) {
    const int64 mem_used = node_type_map_memory[node_type.first];
    timings.emplace(node_type.second,
                    std::pair<string, int64>(node_type.first, mem_used));
  }

  InitField(stream, 24) << "[Node type]";
  InitField(stream, 9) << "[count]";
  InitField(stream, 10) << "[avg ms]";
  InitField(stream, 11) << "[avg %]";
  InitField(stream, 11) << "[cdf %]";
  InitField(stream, 10) << "[mem KB]";
  InitField(stream, 10) << "[times called]";
  stream << std::endl;

  float cdf = 0.0f;
  while (!timings.empty()) {
    auto entry = timings.top();
    timings.pop();

    const string node_type = entry.second.first;
    const float memory = entry.second.second / 1000.0f;

    const int64 node_type_total_us = entry.first;
    const float time_per_run_ms = node_type_total_us / 1000.0f;

    const float percentage =
        ((entry.first / static_cast<float>(accumulated_us)) * 100.0f);
    cdf += percentage;

    InitField(stream, 24) << node_type;
    InitField(stream, 9) << node_type_map_count[node_type];
    InitField(stream, 10) << time_per_run_ms;
    InitField(stream, 10) << percentage << "%";
    InitField(stream, 10) << cdf << "%";
    InitField(stream, 10) << memory;
    InitField(stream, 9) << node_type_map_times_called[node_type];
    stream << std::endl;
  }
  stream << std::endl;
  return stream.str();
}

std::string StatSummarizer::GetStatsByMetric(const string& title,
                                             SortingMetric sorting_metric,
                                             int num_stats) const {
  std::vector<const Detail*> details;
  OrderNodesByMetric(sorting_metric, &details);

  double cumulative_stat_on_node = 0;

  std::stringstream stream;
  stream << HeaderString(title) << std::endl;
  int stat_num = 0;
  for (auto detail : details) {
    ++stat_num;
    if (num_stats > 0 && stat_num > num_stats) {
      break;
    }

    // TODO(andrewharp): Make this keep track of the particular metric for cdf.
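    // For now the cdf column always accumulates execution time (relative to
    // run_total_us_), regardless of which metric the rows are sorted by.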
    cumulative_stat_on_node += detail->rel_end_us.sum();
    stream << ColumnString(*detail, cumulative_stat_on_node, run_total_us_)
           << std::endl;
  }
  stream << std::endl;
  return stream.str();
}

std::string StatSummarizer::GetOutputString() const {
  std::stringstream stream;
  if (options_.show_run_order) {
    stream << GetStatsByMetric("Run Order", BY_RUN_ORDER,
                               options_.run_order_limit);
  }
  if (options_.show_time) {
    stream << GetStatsByMetric("Top by Computation Time", BY_TIME,
                               options_.time_limit);
  }
  if (options_.show_memory) {
    stream << GetStatsByMetric("Top by Memory Use", BY_MEMORY,
                               options_.memory_limit);
  }
  if (options_.show_type) {
    stream << GetStatsByNodeType();
  }
  if (options_.show_summary) {
    stream << ShortSummary() << std::endl;
  }
  return stream.str();
}

void StatSummarizer::PrintStepStats() const {
  string output = GetOutputString();
  std::istringstream iss(output);
  for (std::string line; std::getline(iss, line);) {
    LOG(INFO) << line;
  }
}

void StatSummarizer::PrintOutputs() const {
  std::priority_queue<
      std::pair<int64, const std::pair<const std::string, Detail>*>>
      timings;
  for (const auto& entry : details_) {
    timings.emplace(-entry.second.start_us.avg(), &entry);
  }

  LOG(INFO) << "============ Node output tensor sizes in run order ========";
  while (!timings.empty()) {
    auto entry = timings.top();
    timings.pop();
    const Detail& detail = entry.second->second;
    std::stringstream stream;
    stream << entry.second->first << "\t" << detail.outputs.size();
    for (const auto& tensor : detail.outputs) {
      stream << "\t" << DataTypeString(tensor.dtype());
      stream << "\t" << tensor.shape().dim_size();
      for (const auto& d : tensor.shape().dim()) {
        stream << "\t" << d.size();
      }
    }
    LOG(INFO) << stream.str();
  }
}

}  // namespace tensorflow