/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_node.h"

#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
bool CountAsAcceleratorTime(const string& device) {
  return device.find("stream:all") != device.npos;
}
bool CountAsCPUTime(const string& device) {
  return RE2::FullMatch(device,
                        ".*/(device:gpu|gpu|device:cpu|cpu|device:sycl):\\d+");
}
bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }

// Notes about start and end time from the NodeExecStats proto:
// For GPU, there is no difference between op_end_rel_micros and
// all_end_rel_micros. All are kernel times.
// For CPU, op_end_rel_micros is the kernel time, while all_end_rel_micros
// also includes some post-processing. Also, there is currently no way to
// accurately measure the execution time of async ops.
//
// Notes about device:
// For ops on gpu:
// They appear under three different devices in RunMetadata: 1) gpu:x,
// 2) gpu:x:stream:all and 3) gpu:x:stream:id. 2) is used as a combined view
// of all the different 3). 1) is the op scheduling, pre-processing and
// post-processing time. 3) is the execution time of GPU kernels on a stream.
// For ops on cpu:
// They only appear as cpu:0.
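//
// For illustration only (hypothetical device strings, lowercased as they are
// when matched here):
//   "/job:worker/replica:0/task:0/device:gpu:0" -> CountAsCPUTime, i.e. the
//       op's scheduling and pre/post-processing time.
//   "/device:gpu:0/stream:all" -> CountAsAcceleratorTime (combined kernel
//       time across streams).
//   "/device:gpu:0/stream:3"   -> neither; per-stream entries are not
//       aggregated here.
//   "/device:cpu:0"            -> CountAsCPUTime.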

void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
  devices_.insert(dev);
  if (step_stat.all_start_micros() > 0) {
    if (exec_.all_start_micros() > 0) {
      exec_.set_all_start_micros(
          std::min(static_cast<int64>(exec_.all_start_micros()),
                   static_cast<int64>(step_stat.all_start_micros())));
    } else {
      exec_.set_all_start_micros(step_stat.all_start_micros());
    }
    int64 op_end_rel_micros = step_stat.op_end_rel_micros();
    // Round a very quick execution up to 1 micro to be semantically robust.
    if (op_end_rel_micros == 0) {
      ++op_end_rel_micros;
    }
    exec_.set_latest_end_micros(
        std::max(static_cast<int64>(exec_.latest_end_micros()),
                 step_stat.all_start_micros() + op_end_rel_micros));

    const std::pair<int64, int64> pair =
        std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
    if (CountAsAcceleratorTime(dev)) {
      accelerator_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
    } else if (CountAsCPUTime(dev)) {
      cpu_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
      // In a while-loop, a graph node is executed multiple times under
      // the same name.
      exec_.set_run_count(exec_.run_count() + 1);
    }
  }
}

void ExecStep::AddMemoryStats(const string& dev,
                              const NodeExecStats& step_stat) {
  ExecMemory exec_mem;
  if (step_stat.all_start_micros() > 0) {
    exec_mem.set_memory_micros(step_stat.all_start_micros() +
                               step_stat.op_end_rel_micros());
  } else {
    fprintf(stderr, "%s has no start time, skipping\n",
            step_stat.node_name().c_str());
    return;
  }

  int accelerator_allocator_cnt = 0;
  for (const auto& mem : step_stat.memory()) {
    // TODO(xpan): Fix this hack. Currently the allocator name seems quite
    // ad-hoc.
    if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
      continue;
    }
    ++accelerator_allocator_cnt;
    exec_mem.set_allocator_bytes_in_use(
        std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()),
                 static_cast<int64>(mem.allocator_bytes_in_use())));
    for (const auto& alloc : mem.allocation_records()) {
      allocations_.push_back(alloc);
    }
  }
  if (accelerator_allocator_cnt > 1) {
    fprintf(stderr, "found %d gpu allocators for 1 node\n",
            accelerator_allocator_cnt);
  }

  int64 total_output_bytes = 0;
  for (const auto& output : step_stat.output()) {
    if (output.has_tensor_description() &&
        output.tensor_description().has_allocation_description()) {
      // TODO(xpan): Maybe allocated_bytes.
      int64 output_bytes = std::max(output.tensor_description()
                                        .allocation_description()
                                        .allocated_bytes(),
                                    output.tensor_description()
                                        .allocation_description()
                                        .requested_bytes());
      uint64 output_ptr =
          output.tensor_description().allocation_description().ptr();
      total_output_bytes += output_bytes;

      auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
      mem.set_ptr(output_ptr);
      mem.set_bytes(output_bytes);
    }
  }
  exec_mem.set_output_bytes(total_output_bytes);

  if (step_stat.has_memory_stats()) {
    if (IsPlacedOnCPU(dev)) {
      // Currently we assume ops placed on gpu only allocate memory on gpu.
      exec_mem.set_host_temp_bytes(exec_mem.host_temp_bytes() +
                                   step_stat.memory_stats().temp_memory_size());
      exec_mem.set_host_persistent_bytes(
          exec_mem.host_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    } else {
      exec_mem.set_accelerator_temp_bytes(
          exec_mem.accelerator_temp_bytes() +
          step_stat.memory_stats().temp_memory_size());
      exec_mem.set_accelerator_persistent_bytes(
          exec_mem.accelerator_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    }
  }

  // TODO(xpan): Make this more accurate:
  // High level: memory tracking is suspicious and requires a large-scale
  // clean-up.
  // Investigate the memory usage difference between CPU/GPU with OpViewTest.
  //
  // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
  //    discuss OpKernelContext-related allocations.
  // 2. allocate_output calls allocate_tensor, which is properly tracked in
  //    'NodeExecStats.memory'.
  // 3. allocate_temp is only tracked through record_xxx_temp. It appears
  //    in 'NodeExecStats.memory_stats'.
  // 4. allocate_persistent calls allocate_tensor, which is properly tracked
  //    in 'NodeExecStats.memory'. However, there is no way to count it as
  //    persistent now.
  // 5. record_xxx_persistent is called when allocate_persistent is not used
  //    and hence tracks some complementary bytes. It appears in
  //    'NodeExecStats.memory_stats'. It's suspicious, but we should use it
  //    for now since it covers constant ops.
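  //
  // Aggregation below, for reference:
  //   requested = sum(memory.total_bytes) + persistent + temp
  //   residual  = sum(memory.live_bytes)  + persistent
  //   peak      = sum(memory.peak_bytes)  + persistent + temp
  // where persistent/temp are the host and accelerator bytes recorded above.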
  int64 residual_bytes = 0;
  int64 requested_bytes = 0;
  int64 peak_bytes = 0;
  for (const auto& mem : step_stat.memory()) {
    residual_bytes += mem.live_bytes();
    requested_bytes += mem.total_bytes();
    peak_bytes += mem.peak_bytes();
  }
  residual_bytes += exec_mem.host_persistent_bytes() +
                    exec_mem.accelerator_persistent_bytes();
  requested_bytes += exec_mem.host_persistent_bytes() +
                     exec_mem.accelerator_persistent_bytes() +
                     exec_mem.host_temp_bytes() +
                     exec_mem.accelerator_temp_bytes();
  peak_bytes += exec_mem.host_persistent_bytes() +
                exec_mem.accelerator_persistent_bytes() +
                exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();

  exec_mem.set_requested_bytes(requested_bytes);
  exec_mem.set_residual_bytes(residual_bytes);
  exec_mem.set_peak_bytes(peak_bytes);
  memory_execs_.emplace_back(exec_mem);
}

void TFGraphNode::AddStepStat(int64 step, const string& device,
                              const NodeExecStats& step_stat) {
  string dev = str_util::Lowercase(device);

  // TODO(xpan): Make this more robust?
  // See run_metadata_test.py
  // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
  // It can have some ad-hoc suffix, such as /stream:xx or /memcpy:xx.
  if (IsCanonicalDevice(dev)) {
    if (!node_.canonical_device().empty()) {
      if (node_.canonical_device() != dev) {
        // TODO(xpan): Some RunMetadata nodes appear on multiple devices.
        // Need to address it.
        return;
      }
    } else {
      node_.set_canonical_device(dev);
      // TODO(xpan): Support things other than gpu?
      if (dev.find("sycl") != dev.npos) {
        node_.set_host_device(StringReplace(dev, "device:sycl:\\d+", "cpu:0"));
      } else {
        node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0"));
      }
      AddOpType(node_.canonical_device());
    }
  }

  auto exec = execs_.find(step);
  if (exec == execs_.end()) {
    execs_.insert(std::pair<int64, ExecStep>(step, ExecStep()));
    exec = execs_.find(step);
  }

  exec->second.AddTimeStats(dev, step_stat);

  if (dev == node_.canonical_device()) {
    exec->second.AddMemoryStats(dev, step_stat);
  }
}

int64 ExecStep::exec_micros() const {
  return accelerator_exec_micros() + cpu_exec_micros();
}

int64 ExecStep::accelerator_exec_micros() const {
  int64 total = 0;
  // Normally, an op should only be scheduled on 1 accelerator device.
  // Hence there should generally be 1 element in accelerator_execs_.
  for (const auto& execs : accelerator_execs_) {
    // An op can fire multiple kernels or
    // be scheduled multiple times in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

int64 ExecStep::cpu_exec_micros() const {
  int64 total = 0;
  // Normally, an op can only be scheduled on 1 device.
  for (const auto& execs : cpu_execs_) {
    // An op can be scheduled multiple times in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

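// Shape conversion conventions, for reference: a rank-0 (scalar) shape is
// represented as {1}, so it round-trips through VecToShapeProto as a rank-1
// shape of size 1; an unknown-rank shape maps to an empty vector and back to
// unknown_rank = true.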
std::vector<int64> ShapeProtoToVec(const TensorShapeProto& shape_pb) {
  std::vector<int64> shape_vec;
  if (shape_pb.dim_size() == 0 && !shape_pb.unknown_rank()) {
    // Scalar parameter with empty shape but known rank.
    shape_vec.push_back(1);
  } else {
    for (const auto& d : shape_pb.dim()) {
      shape_vec.push_back(d.size());
    }
  }
  return shape_vec;
}

TensorShapeProto VecToShapeProto(const std::vector<int64>& shape_vec) {
  TensorShapeProto shape_pb;
  if (shape_vec.empty()) {
    shape_pb.set_unknown_rank(true);
    return shape_pb;
  }
  for (const int64 s : shape_vec) {
    shape_pb.add_dim()->set_size(s);
  }
  return shape_pb;
}

bool IsPlacedOnAccelerator(const string& device) {
  return device.find("gpu") != device.npos ||
         device.find("sycl") != device.npos;
}
bool IsPlacedOnCPU(const string& device) {
  return device.find("cpu") != device.npos;
}
}  // namespace tfprof
}  // namespace tensorflow