1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/compiler/xla/service/human_readable_profile_builder.h" 17 #include "tensorflow/compiler/xla/metric_table_report.h" 18 #include "tensorflow/compiler/xla/util.h" 19 #include "tensorflow/core/lib/strings/numbers.h" 20 #include "tensorflow/core/lib/strings/strcat.h" 21 #include "tensorflow/core/lib/strings/stringprintf.h" 22 23 namespace xla { 24 25 using tensorflow::strings::Appendf; 26 using tensorflow::strings::HumanReadableElapsedTime; 27 using tensorflow::strings::HumanReadableNumBytes; 28 using tensorflow::strings::Printf; 29 using tensorflow::strings::StrAppend; 30 31 string HumanReadableProfileBuilder::ToString() const { 32 string s; 33 34 Appendf(&s, "Execution profile for %s: (%s @ f_nom)\n", 35 computation_name_.c_str(), 36 HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)).c_str()); 37 38 auto append_op = [&](const OpInfo& op) { 39 string bytes_per_sec; 40 string bytes_per_cycle; 41 if (op.cycles <= 0 || op.bytes_accessed < 0) { 42 bytes_per_sec = "<unknown>"; 43 bytes_per_cycle = "<unknown>"; 44 } else { 45 bytes_per_sec = 46 HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles)); 47 if (op.bytes_accessed > op.cycles) { 48 bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles); 49 } else { 50 bytes_per_cycle = 51 Printf("%.3fB", static_cast<float>(op.bytes_accessed) / op.cycles); 52 } 53 } 54 55 double cycles_percent = 0; 56 if (total_cycles_ > 0) { 57 cycles_percent = op.cycles / static_cast<double>(total_cycles_) * 100; 58 } 59 60 double nsecs = op.cycles / clock_rate_ghz_; 61 Appendf(&s, 62 "%15lld cycles (%6.2f%%) :: %12.1f usec (%12.1f optimal) :: %18s " 63 ":: %18s :: %12s/s :: %12s/cycle :: %s\n", 64 op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles), 65 op.optimal_seconds * 1e6, 66 op.flop_count <= 0 67 ? "<none>" 68 : HumanReadableNumFlops(op.flop_count, nsecs).c_str(), 69 op.transcendental_count <= 0 ? "<none>" 70 : HumanReadableNumTranscendentalOps( 71 op.transcendental_count, nsecs) 72 .c_str(), 73 bytes_per_sec.c_str(), bytes_per_cycle.c_str(), op.name.c_str()); 74 }; 75 76 float optimal_seconds_sum = 0.0; 77 int64 total_flops = 0.; 78 int64 total_transcendentals = 0.; 79 int64 total_bytes = 0; 80 for (const auto& op : op_infos_) { 81 optimal_seconds_sum += op.optimal_seconds; 82 total_flops += op.flop_count; 83 total_transcendentals += op.transcendental_count; 84 total_bytes += op.bytes_accessed; 85 } 86 87 VLOG(1) << "Total floating point ops: " << total_flops; 88 89 append_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops, 90 total_transcendentals, total_bytes, optimal_seconds_sum}); 91 92 // Sort ops in decreasing order of cycles. 93 std::vector<OpInfo> sorted_ops(op_infos_); 94 std::sort( 95 sorted_ops.begin(), sorted_ops.end(), 96 [](const OpInfo& a, const OpInfo& b) { return a.cycles > b.cycles; }); 97 for (const auto& op : sorted_ops) { 98 append_op(op); 99 } 100 101 if (total_cycles_ <= 0) { 102 StrAppend(&s, "****** 0 total cycles ******\n"); 103 } else { 104 // Only show an optimal discrepancy table if at least one value was 105 // specified. Estimates are non-negative, so if the sum is greater than 106 // zero, then at least one summand was greater than zero. 107 if (optimal_seconds_sum > 0) { 108 MetricTableReport table; 109 table.SetMetricName("microseconds above estimated optimum"); 110 table.SetEntryName("ops"); 111 table.SetShowCategoryTable(); 112 float total_discrepancy_in_microseconds = 0.0f; 113 for (const auto& op : sorted_ops) { 114 MetricTableReport::Entry entry; 115 entry.text = op.name; 116 entry.short_text = op.short_name; 117 entry.category_text = op.category; 118 entry.metric = 119 CyclesToMicroseconds(op.cycles) - op.optimal_seconds * 1e6; 120 total_discrepancy_in_microseconds += entry.metric; 121 table.AddEntry(std::move(entry)); 122 } 123 StrAppend(&s, table.MakeReport(total_discrepancy_in_microseconds)); 124 } 125 126 { 127 MetricTableReport table; 128 table.SetMetricName("microseconds"); 129 table.SetEntryName("ops"); 130 table.SetShowCategoryTable(); 131 for (const auto& op : sorted_ops) { 132 MetricTableReport::Entry entry; 133 entry.text = op.name; 134 entry.short_text = op.short_name; 135 entry.category_text = op.category; 136 entry.metric = CyclesToMicroseconds(op.cycles); 137 table.AddEntry(std::move(entry)); 138 } 139 StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_))); 140 } 141 } 142 return s; 143 } 144 145 } // namespace xla 146