/* Copyright 2016 The TensorFlow Authors All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/internal/tfprof_node.h"

#include "tensorflow/core/profiler/internal/tfprof_utils.h"

namespace tensorflow {
namespace tfprof {
bool CountAsAcceleratorTime(const string& device) {
  return device.find("stream:all") != device.npos;
}
bool CountAsCPUTime(const string& device) {
  return RE2::FullMatch(device,
                        ".*/(device:gpu|gpu|device:cpu|cpu|device:sycl):\\d+");
}
bool IsCanonicalDevice(const string& device) { return CountAsCPUTime(device); }

// Notes about start and end time from the NodeExecStats proto:
// For GPU, there is no difference between op_end_rel_micros and
// all_end_rel_micros; both are kernel times.
// For CPU, op_end_rel_micros is the kernel time, while all_end_rel_micros
// includes some post-processing. Also, there is currently no way to measure
// the execution time of async ops accurately.
//
// Notes about device:
// For ops on GPU:
// An op appears under three different devices in RunMetadata: 1) gpu:x,
// 2) gpu:x:stream:all and 3) gpu:x:stream:id. 2) is used as a combined view
// of all the different 3). 1) covers the op scheduling, pre-processing and
// post-processing time. 3) is the execution time of GPU kernels on a stream.
// For ops on CPU:
// An op only appears as cpu:0.
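//
// For illustration only (these device strings are hypothetical; the exact
// form depends on the cluster spec): a lowercased device such as
// "/job:worker/replica:0/task:0/device:gpu:0" matches CountAsCPUTime() and
// is treated as a canonical device, while
// "/job:worker/replica:0/task:0/device:gpu:0/stream:all" matches
// CountAsAcceleratorTime() and is counted as accelerator (kernel) time.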

void ExecStep::AddTimeStats(const string& dev, const NodeExecStats& step_stat) {
  devices_.insert(dev);
  if (step_stat.all_start_micros() > 0) {
    if (exec_.all_start_micros() > 0) {
      exec_.set_all_start_micros(
          std::min(static_cast<int64>(exec_.all_start_micros()),
                   static_cast<int64>(step_stat.all_start_micros())));
    } else {
      exec_.set_all_start_micros(step_stat.all_start_micros());
    }
    int64 op_end_rel_micros = step_stat.op_end_rel_micros();
    // Round zero-length executions up to 1 micro so they are still counted.
    if (op_end_rel_micros == 0) {
      ++op_end_rel_micros;
    }
    exec_.set_latest_end_micros(
        std::max(static_cast<int64>(exec_.latest_end_micros()),
                 step_stat.all_start_micros() + op_end_rel_micros));

    const std::pair<int64, int64> pair =
        std::make_pair(step_stat.all_start_micros(), op_end_rel_micros);
    if (CountAsAcceleratorTime(dev)) {
      accelerator_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
    } else if (CountAsCPUTime(dev)) {
      cpu_execs_[dev].push_back(pair);
      op_execs_[dev].push_back(pair);
      // In a while-loop, a graph node can be executed multiple times under
      // the same name.
      exec_.set_run_count(exec_.run_count() + 1);
    }
  }
}
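
// A minimal AddTimeStats illustration with made-up numbers: a NodeExecStats
// reporting all_start_micros = 100 and op_end_rel_micros = 0 on a
// ".../stream:all" device is recorded as the pair (100, 1) in
// accelerator_execs_ (the zero duration is rounded up to 1 micro), and
// latest_end_micros is raised to at least 101.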

void ExecStep::AddMemoryStats(const string& dev,
                              const NodeExecStats& step_stat) {
  ExecMemory exec_mem;
  if (step_stat.all_start_micros() > 0) {
    exec_mem.set_memory_micros(step_stat.all_start_micros() +
                               step_stat.op_end_rel_micros());
  } else {
    fprintf(stderr, "%s has no start time, skipping\n",
            step_stat.node_name().c_str());
    return;
  }

  int accelerator_allocator_cnt = 0;
  for (const auto& mem : step_stat.memory()) {
    // TODO(xpan): Fix this hack. Currently the allocator name seems quite
    // ad-hoc.
    if (mem.allocator_name().find("GPU") == mem.allocator_name().npos) {
      continue;
    }
    ++accelerator_allocator_cnt;
    exec_mem.set_allocator_bytes_in_use(
        std::max(static_cast<int64>(exec_mem.allocator_bytes_in_use()),
                 static_cast<int64>(mem.allocator_bytes_in_use())));
    for (const auto& alloc : mem.allocation_records()) {
      allocations_.push_back(alloc);
    }
  }
  if (accelerator_allocator_cnt > 1) {
    fprintf(stderr, "found %d gpu allocators for 1 node\n",
            accelerator_allocator_cnt);
  }

  int64 total_output_bytes = 0;
  for (const auto& output : step_stat.output()) {
    if (output.has_tensor_description() &&
        output.tensor_description().has_allocation_description()) {
      // TODO(xpan): Maybe allocated_bytes.
      int64 output_bytes = std::max(output.tensor_description()
                                        .allocation_description()
                                        .allocated_bytes(),
                                    output.tensor_description()
                                        .allocation_description()
                                        .requested_bytes());
      uint64 output_ptr =
          output.tensor_description().allocation_description().ptr();
      total_output_bytes += output_bytes;

      auto& mem = (*exec_mem.mutable_output_memory())[output.slot()];
      mem.set_ptr(output_ptr);
      mem.set_bytes(output_bytes);
    }
  }
  exec_mem.set_output_bytes(total_output_bytes);

  if (step_stat.has_memory_stats()) {
    if (IsPlacedOnCPU(dev)) {
      // We currently assume that ops placed on CPU allocate temp/persistent
      // memory only on the host, and ops placed on an accelerator allocate
      // only on the accelerator.
      exec_mem.set_host_temp_bytes(exec_mem.host_temp_bytes() +
                                   step_stat.memory_stats().temp_memory_size());
      exec_mem.set_host_persistent_bytes(
          exec_mem.host_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    } else {
      exec_mem.set_accelerator_temp_bytes(
          exec_mem.accelerator_temp_bytes() +
          step_stat.memory_stats().temp_memory_size());
      exec_mem.set_accelerator_persistent_bytes(
          exec_mem.accelerator_persistent_bytes() +
          step_stat.memory_stats().persistent_memory_size());
    }
  }

  // TODO(xpan): Make this more accurate:
  // High level: Memory tracking is suspicious and requires a large-scale
  // clean up.
  // Investigate the memory usage difference between CPU/GPU with OpViewTest.
  //
  // 1. OpKernelConstruction::allocate_xxx is not traced. Below, we only
  //    discuss OpKernelContext-related allocations.
  // 2. allocate_output calls allocate_tensor, which is properly tracked in
  //    'NodeExecStats.memory'.
  // 3. allocate_temp is only tracked through record_xxx_temp. It appears
  //    in 'NodeExecStats.memory_stats'.
  // 4. allocate_persistent calls allocate_tensor, which is properly tracked
  //    in 'NodeExecStats.memory'. However, there is no way to count it as
  //    persistent now.
  // 5. record_xxx_persistent is called when allocate_persistent
  //    is not used, and hence tracks some complementary bytes. It appears in
  //    'NodeExecStats.memory_stats'. It is suspicious, but we should
  //    use it for now since it covers the constant op.
  int64 residual_bytes = 0;
  int64 requested_bytes = 0;
  int64 peak_bytes = 0;
  for (const auto& mem : step_stat.memory()) {
    residual_bytes += mem.live_bytes();
    requested_bytes += mem.total_bytes();
    peak_bytes += mem.peak_bytes();
  }
  residual_bytes += exec_mem.host_persistent_bytes() +
                    exec_mem.accelerator_persistent_bytes();
  requested_bytes += exec_mem.host_persistent_bytes() +
                     exec_mem.accelerator_persistent_bytes() +
                     exec_mem.host_temp_bytes() +
                     exec_mem.accelerator_temp_bytes();
  peak_bytes += exec_mem.host_persistent_bytes() +
                exec_mem.accelerator_persistent_bytes() +
                exec_mem.host_temp_bytes() + exec_mem.accelerator_temp_bytes();

  exec_mem.set_requested_bytes(requested_bytes);
  exec_mem.set_residual_bytes(residual_bytes);
  exec_mem.set_peak_bytes(peak_bytes);
  memory_execs_.emplace_back(exec_mem);
}
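
// A worked AddMemoryStats example with illustrative (made-up) numbers for an
// op placed on CPU: if 'NodeExecStats.memory' reports total_bytes = 400,
// live_bytes = 100 and peak_bytes = 300, and 'memory_stats' reports 50 temp
// bytes and 20 persistent bytes, then requested_bytes = 400 + 50 + 20 = 470,
// residual_bytes = 100 + 20 = 120, and peak_bytes = 300 + 50 + 20 = 370.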

void TFGraphNode::AddStepStat(int64 step, const string& device,
                              const NodeExecStats& step_stat) {
  string dev = str_util::Lowercase(device);

  // TODO(xpan): Make this more robust?
  // See run_metadata_test.py
  // It can be /job:0/replica:0/xxxx/device:GPU:0, or simply /device:GPU:0.
  // It can have an ad-hoc suffix, such as /stream:xx or /memcpy:xx.
  if (IsCanonicalDevice(dev)) {
    if (!node_.canonical_device().empty()) {
      if (node_.canonical_device() != dev) {
        // TODO(xpan): Some RunMetadata nodes appear on multiple devices.
        // Need to address it.
        return;
      }
    } else {
      node_.set_canonical_device(dev);
      // TODO(xpan): Support things other than gpu?
      if (dev.find("sycl") != dev.npos) {
        node_.set_host_device(StringReplace(dev, "device:sycl:\\d+", "cpu:0"));
      } else {
        node_.set_host_device(StringReplace(dev, "gpu:\\d+", "cpu:0"));
      }
      AddOpType(node_.canonical_device());
    }
  }

  auto exec = execs_.find(step);
  if (exec == execs_.end()) {
    execs_.insert(std::pair<int64, ExecStep>(step, ExecStep()));
    exec = execs_.find(step);
  }

  exec->second.AddTimeStats(dev, step_stat);

  if (dev == node_.canonical_device()) {
    exec->second.AddMemoryStats(dev, step_stat);
  }
}
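
// As an illustration (hypothetical device string): a lowercased device such
// as "/job:worker/replica:0/task:0/device:gpu:0" becomes the canonical
// device, and the regex replacement above derives its host device as
// "/job:worker/replica:0/task:0/device:cpu:0".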

int64 ExecStep::exec_micros() const {
  return accelerator_exec_micros() + cpu_exec_micros();
}

int64 ExecStep::accelerator_exec_micros() const {
  int64 total = 0;
  // Normally, an op should only be scheduled on 1 accelerator device.
  // Hence there should generally be 1 element in accelerator_execs_.
  for (const auto& execs : accelerator_execs_) {
    // An op can fire multiple kernels or
    // be scheduled multiple times in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}

int64 ExecStep::cpu_exec_micros() const {
  int64 total = 0;
  // Normally, an op can only be scheduled on 1 device.
  for (const auto& execs : cpu_execs_) {
    // An op can be scheduled multiple times in a while-loop.
    for (const auto& exec : execs.second) {
      total += exec.second;
    }
  }
  return total;
}
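
// Example with illustrative values: if an op inside a while-loop records the
// pairs (100, 5) and (200, 7) on its stream:all device, then
// accelerator_exec_micros() returns 12, the sum of the durations across
// iterations.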

std::vector<int64> ShapeProtoToVec(const TensorShapeProto& shape_pb) {
  std::vector<int64> shape_vec;
  if (shape_pb.dim_size() == 0 && !shape_pb.unknown_rank()) {
    // Scalar parameter with empty shape but known rank.
    shape_vec.push_back(1);
  } else {
    for (const auto& d : shape_pb.dim()) {
      shape_vec.push_back(d.size());
    }
  }
  return shape_vec;
}

TensorShapeProto VecToShapeProto(const std::vector<int64>& shape_vec) {
  TensorShapeProto shape_pb;
  if (shape_vec.empty()) {
    shape_pb.set_unknown_rank(true);
    return shape_pb;
  }
  for (const int64 s : shape_vec) {
    shape_pb.add_dim()->set_size(s);
  }
  return shape_pb;
}
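
// Illustrative examples: a scalar (rank-0, known-rank) TensorShapeProto
// converts to the vector {1}; a [2, 3] shape converts to {2, 3}. In the other
// direction, an empty vector converts to a proto with unknown_rank = true, so
// the two functions are not exact inverses for scalars and unknown-rank
// shapes.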

bool IsPlacedOnAccelerator(const string& device) {
  return device.find("gpu") != device.npos ||
         device.find("sycl") != device.npos;
}
bool IsPlacedOnCPU(const string& device) {
  return device.find("cpu") != device.npos;
}
}  // namespace tfprof
}  // namespace tensorflow