/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/grappler/costs/utils.h"

#include <stddef.h>
#include <utility>

#include "third_party/eigen3/Eigen/Core"

#if GOOGLE_CUDA
#include "cuda/include/cuda.h"
#include "cuda/include/cuda_runtime_api.h"
#include "cuda/include/cudnn.h"
#endif

#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/grappler/clusters/utils.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/cpu_info.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace tensorflow {
namespace grappler {

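// Returns a TensorProperties placeholder with DT_INVALID dtype and unknown
// rank, used below when nothing is known about an input tensor.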
static OpInfo::TensorProperties UnknownInput() {
  OpInfo::TensorProperties input;
  input.set_dtype(DataType::DT_INVALID);
  input.mutable_shape()->set_unknown_rank(true);
  return input;
}

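// Collects the TensorProto values held by an AttrValue: the single tensor for
// the kTensor case, every tensor in the list for the kList case, and nothing
// for any other value kind.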
static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
  std::vector<TensorProto> tensors;
  switch (attr_value.value_case()) {
    case AttrValue::kTensor: {
      tensors.push_back(attr_value.tensor());
      break;
    }
    case AttrValue::kList: {
      for (const auto& tensor_proto : attr_value.list().tensor()) {
        tensors.push_back(tensor_proto);
      }
      break;
    }
    default: {}
  }
  return tensors;
}

// Annotate the op_info inputs with extra information when possible (e.g. the
// input value if it's known statically).
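// In particular, a statically known Const input value is copied into the
// corresponding TensorProperties, the on-disk file size of a Const filename
// input is recorded as an "input_<i>_filesize" attribute, and the producing
// op of a handle input is recorded as a "parent_<i>_op" attribute.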
static void ExtractExtraProperties(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    OpInfo* op_info) {
  OpRegistry* op_registry = OpRegistry::Global();
  const OpDef* op_def = nullptr;
  auto s = op_registry->LookUpOpDef(node.op(), &op_def);
  if (!s.ok()) {
    op_def = nullptr;
  }

  for (int i = 0; i < node.input_size(); ++i) {
    const string input_name = node.input(i);
    CHECK(!input_name.empty());
    if (IsControlInput(input_name)) {
      continue;
    }
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name = input_tensor_id.first.ToString();

    auto iter = name_to_node.find(input_node_name);
    if (iter == name_to_node.end()) continue;
    const NodeDef* input_node = iter->second;

    if (i >= op_info->inputs_size()) {
      LOG(ERROR) << "OpInfo's inputs don't match the graph! OpInfo: "
                 << op_info->DebugString()
                 << "\nCurrent node: " << node.DebugString()
                 << "\nInput node: " << input_node->DebugString();
    }

    // The value attribute of a Const input is useful for cost prediction.
    if (input_node->op() == "Const" && i < op_info->inputs_size()) {
      auto it = input_node->attr().find("value");
      if (it == input_node->attr().end()) continue;

      const AttrValue& attr_value = it->second;
      std::vector<TensorProto> tensors = ExtractTensors(attr_value);
      if (tensors.empty()) continue;

      const TensorProto& t = tensors[0];
      OpInfo::TensorProperties* input = op_info->mutable_inputs(i);
      *(input->mutable_value()) = t;

      // For a filename input, the file size can also be useful.
      if (op_def && i < op_def->input_arg_size() &&
          op_def->input_arg(i).name().find("filename") != std::string::npos) {
        Tensor tensor;
        if (!tensor.FromProto(t)) {
          continue;
        }
        if (tensor.NumElements() != 1) {
          continue;
        }
        const string filename = tensor.scalar<string>()();

        Env* env = Env::Default();
        FileStatistics stat;
        Status s = env->Stat(filename, &stat);
        if (!s.ok()) {
          continue;
        }
        AttrValue attr;
        attr.set_i(stat.length);
        string attr_key = strings::StrCat("input_", i, "_filesize");
        (*op_info->mutable_attr())[attr_key] = attr;
      }
    }

    // When the input is a handle (e.g. a lookup table handle), the information
    // in the op itself is not sufficient to predict the op memory.
    if (op_def && i < op_def->input_arg_size() &&
        op_def->input_arg(i).name().find("handle") != std::string::npos) {
      string new_key = strings::StrCat("parent_", i, "_op");
      AttrValue attr;
      attr.set_s(input_node->op());
      (*op_info->mutable_attr())[new_key] = attr;
      // TODO(yuefengz): Only the parent node's op name is copied. Copy inputs
      // and attributes when necessary.
    }
  }
}

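// Looks up the dtype and shape of each of `node`'s inputs in the cost graph.
// Control inputs are skipped; inputs whose producer is missing from the cost
// graph (or has no output info) are reported as unknown.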
std::vector<OpInfo::TensorProperties> FindInputFeatures(
    const NodeDef& node,
    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
    const std::unordered_map<string, const NodeDef*>& name_to_node) {
  std::vector<OpInfo::TensorProperties> inputs;
  for (const auto& input_name : node.input()) {
    CHECK(!input_name.empty());
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name = input_tensor_id.first.ToString();
    const int output_index = input_tensor_id.second;

    // Skip control inputs.
    if (output_index == Graph::kControlSlot) {
      continue;
    }

    auto it = name_to_cost.find(input_node_name);
    if (it == name_to_cost.end() || output_index < 0) {
      inputs.push_back(UnknownInput());
    } else {
      const CostGraphDef::Node* input_cost = it->second;
      if (input_cost->output_info_size() == 0) {
        inputs.push_back(UnknownInput());
      } else {
        const CostGraphDef::Node::OutputInfo& output =
            input_cost->output_info(output_index);
        OpInfo::TensorProperties input;
        input.set_dtype(output.dtype());
        *input.mutable_shape() = output.shape();
        inputs.push_back(input);
      }
    }
  }

  return inputs;
}

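// Resolves a fully qualified device name (e.g., for illustration, a string of
// the form "/job:localhost/replica:0/task:0/device:GPU:0") to the properties
// of the corresponding local device. Falls back to an "UNKNOWN" device if the
// name cannot be parsed or the device type is neither GPU nor CPU.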
DeviceProperties GetDeviceInfo(const string& device_str) {
  DeviceNameUtils::ParsedName parsed;
  if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
    if (parsed.type == "GPU") {
      return GetLocalGPUInfo(parsed.id);
    } else if (parsed.type == "CPU") {
      return GetLocalCPUInfo();
    }
  }
  DeviceProperties device;
  device.set_type("UNKNOWN");
  return device;
}

DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
  return GetDeviceInfo(node.device());
}

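// Builds an OpInfo for `node` from its op name, attributes, and the given
// input properties, then annotates it via ExtractExtraProperties. The device
// field is intentionally left unset; callers fill it in separately.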
OpInfo BuildOpInfoWithoutDevice(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    const std::vector<OpInfo::TensorProperties>& inputs) {
  OpInfo op_info;
  op_info.set_op(node.op());
  *op_info.mutable_attr() = node.attr();
  for (auto& input : inputs) {
    *op_info.add_inputs() = input;
  }
  ExtractExtraProperties(node, name_to_node, &op_info);
  return op_info;
}

string GetOpDescription(const OpInfo& op_info) {
  string description = "[";
  description += "Op=" + op_info.op() + ", ";
  description += "input_shapes=[";
  for (auto const& input : op_info.inputs()) {
    description += PartialTensorShape::DebugString(input.shape());
  }
  // Close both the input_shapes list and the outer bracket.
  description += "]]";
  return description;
}

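// Converts a CostGraphDef plus the corresponding GraphDef into an
// OpPerformanceList. Nodes absent from the cost graph (never executed or
// optimized away) are skipped. Costs are converted from the cost graph's
// microseconds to OpPerformance's nanoseconds.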
OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                               const GraphDef& graph) {
  OpPerformanceList ret;
  std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
  std::unordered_map<string, const NodeDef*> name_to_node;
  for (auto& node : cost_graph.node()) {
    name_to_cost[node.name()] = &node;
  }
  for (auto& node : graph.node()) {
    name_to_node[node.name()] = &node;
  }

  for (const auto& node : graph.node()) {
    // Skip the nodes that are not in the cost graph: these are nodes that
    // aren't run, because they aren't in the intersection of the transitive
    // fan-in of a fetch node and the transitive fan-out of an input, or nodes
    // that were optimized away by the optimizer. Since they don't contribute
    // to the execution time we simply discard them.
    auto it = name_to_cost.find(node.name());
    if (it == name_to_cost.end()) {
      continue;
    }
    const CostGraphDef::Node* cost_node = it->second;

    OpPerformance* perf = ret.add_op_performance();
    perf->set_node(node.name());

    std::vector<OpInfo::TensorProperties> inputs =
        FindInputFeatures(node, name_to_cost, name_to_node);
    *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs);
    *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device());

    perf->set_temporary_memory_size(cost_node->temporary_memory_size());
    // Note that CostGraphDef::Node::compute_cost is microseconds, while
    // OpPerformance.compute_cost is nanoseconds.
    perf->set_compute_cost(cost_node->compute_cost() * 1000);
    perf->set_compute_time(cost_node->compute_time() * 1000);
    perf->set_memory_time(cost_node->memory_time() * 1000);

    for (const auto& output_info : cost_node->output_info()) {
      perf->mutable_op_memory()->add_output_memory(output_info.size());
    }

    perf->mutable_op_memory()->set_temp_memory(
        cost_node->temporary_memory_size());
    perf->mutable_op_memory()->set_persistent_memory(
        cost_node->persistent_memory_size());
  }
  return ret;
}

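// Adds one tensor size observation. Sizes are tracked in power-of-two
// buckets: bucket 0 holds size 0 and bucket i (i > 0) holds sizes in
// [2^(i-1), 2^i), with everything beyond the range folded into the last
// bucket (see Index() below).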
void TensorSizeHistogram::Add(const uint64 value) {
  num_elem_++;
  sum_elem_ += value;
  min_ = std::min(min_, value);
  max_ = std::max(max_, value);
  buckets_[Index(value)]++;
}

void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) {
  num_elem_ += src.num_elem_;
  sum_elem_ += src.sum_elem_;
  min_ = std::min(min_, src.min_);
  max_ = std::max(max_, src.max_);
  std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(),
                 buckets_.begin(), std::plus<uint64>());
}

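// Renders the histogram as text: a summary line with count, average, min,
// and max, followed by one row per non-empty bucket showing the bucket's
// byte range, count, percentage, cumulative percentage, and a bar of hash
// marks scaled so that 40 marks correspond to 100%.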
std::string TensorSizeHistogram::ToString() const {
  std::string r;
  char buf[200];
  snprintf(buf, sizeof(buf), "Count: %lld, Average: ", num_elem_);
  r.append(buf);
  r.append(strings::HumanReadableNumBytes(Average()));
  r.append(", Min: ");
  r.append(strings::HumanReadableNumBytes(min_));
  r.append(", Max: ");
  r.append(strings::HumanReadableNumBytes(max_));
  r.append("\n------------------------------------------------------\n");
  const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0;
  uint64 cumul_sum = 0;

  const int size_string_width = 12;
  for (int i = 0; i < buckets_.size(); i++) {
    if (buckets_[i] == 0) continue;
    cumul_sum += buckets_[i];
    r.append("[ ");
    if (i == 0) {
      r.append(size_string_width - 2, ' ');
      r.append("0B");
    } else {
      uint64 left = 1ULL << (i - 1);
      const auto left_string = strings::HumanReadableNumBytes(left);
      r.append(size_string_width - left_string.size(), ' ');
      r.append(left_string);
    }
    r.append(", ");
    uint64 right = 1ULL << i;
    const auto right_string = strings::HumanReadableNumBytes(right);
    r.append(size_string_width - right_string.size(), ' ');
    r.append(right_string);
    snprintf(buf, sizeof(buf), ") %7lld %7.3f%% %7.3f%% ",
             buckets_[i],         // count
             mult * buckets_[i],  // percentage
             mult * cumul_sum);   // cum percentage
    r.append(buf);

    // Add hash marks based on percentage; 40 marks for 100%.
    auto marks = static_cast<int>(
        (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_));
    r.append(marks, '#');
    r.push_back('\n');
  }
  return r;
}

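// Maps a tensor size to its bucket index: 0 -> 0, 1 -> 1, 2-3 -> 2, 4-7 -> 3,
// and so on, capped at kMaxBuckets - 1.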
const int TensorSizeHistogram::Index(const uint64 value) const {
  // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ...
  const auto index = Log2Floor64(value) + 1;
  return std::min(index, kMaxBuckets - 1);
}

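// Reduces a device name to a coarse "/<job>/<device type>" class; for
// illustration, "/job:worker/replica:0/task:0/device:GPU:0" becomes
// "/worker/GPU". Names that use '_' instead of ':' separators are normalized
// before parsing; names that still fail to parse are reported as
// "Unclassified".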
string GetDeviceClassForNonChannelDevice(const string& device_name) {
  DeviceNameUtils::ParsedName parsed_name;
  bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name);
  if (!parsed) {
    string name = str_util::StringReplace(device_name, "/job_", "/job:", true);
    name = str_util::StringReplace(name, "/replica_", "/replica:", true);
    name = str_util::StringReplace(name, "/task_", "/task:", true);
    name = str_util::StringReplace(name, "/device_", "/device:", true);
    name = str_util::StringReplace(name, "GPU_", "GPU:", true);
    name = str_util::StringReplace(name, "CPU_", "CPU:", true);
    name = str_util::StringReplace(name, "gpu_", "gpu:", true);
    name = str_util::StringReplace(name, "cpu_", "cpu:", true);
    parsed = DeviceNameUtils::ParseFullName(name, &parsed_name);
  }
  if (parsed) {
    const string jobname = parsed_name.has_job ? parsed_name.job : "";
    return strings::StrCat("/", jobname, "/", parsed_name.type);
  } else {
    return "Unclassified";
  }
}

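// Like GetDeviceClassForNonChannelDevice, but also understands channel device
// names of the form "Channel..._from_<src device>_to_<dst device>" (the
// convention currently assumed from VirtualScheduler); those are mapped to
// "Channel: <src class> -> <dst class>".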
string GetDeviceClass(const string& device_name) {
  // TODO(dyoon): channel device name follows the convention we currently have
  // in VirtualScheduler. This should be revised with VirtualScheduler as well
  // as VirtualPlacer in the future.
  if (device_name.find("Channel") != string::npos) {
    const string from = "_from_";
    const string to = "_to_";
    const auto from_loc = device_name.find(from);
    const auto to_loc = device_name.find(to);
    const auto src_device_full = device_name.substr(
        from_loc + from.size(), to_loc - (from_loc + from.size()));
    const auto dst_device_full = device_name.substr(to_loc + to.size());
    return strings::StrCat(
        "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full),
        " -> ", GetDeviceClassForNonChannelDevice(dst_device_full));
  } else {
    return GetDeviceClassForNonChannelDevice(device_name);
  }
}

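// Summarizes the tensor sizes recorded in run_metadata's step stats as
// histograms. Per-device histograms are aggregated by device class (see
// GetDeviceClass); if verbosity is set, the per-device histograms are printed
// as well.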
string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
                                     bool verbosity) {
  // TODO(dyoon): print out other stats as needed.
  std::ostringstream output;

  // Tensor size histogram:
  // if verbosity is set, output per-device histograms in addition to the
  // per-class histograms; otherwise, only the per-class histograms.
  std::unordered_map<string, TensorSizeHistogram> device_to_hist_map;
  const auto& step_stats = run_metadata.step_stats();
  for (const auto& dev_stat : step_stats.dev_stats()) {
    const auto& device_name = dev_stat.device();
    auto& hist = device_to_hist_map[device_name];
    for (const auto& node_stat : dev_stat.node_stats()) {
      for (const auto& node_output : node_stat.output()) {
        // TODO(dyoon): Calculate tensor size from tensor_description's dtype
        // and shape, instead of using optional allocation_description.
        const auto size = node_output.tensor_description()
                              .allocation_description()
                              .allocated_bytes();
        hist.Add(size);
      }
    }
  }
  if (verbosity) {
    output << "\n";
    output << "Per device tensor size histogram.\n";
  }

  std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map;
  for (const auto& device_hist : device_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    if (verbosity) {
      output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
    }
    const auto device_class = GetDeviceClass(device_name);
    auto it = device_class_to_hist_map.find(device_class);
    if (it == device_class_to_hist_map.end()) {
      device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist));
    } else {
      it->second.Merge(hist);
    }
  }
  output << "\n";
  output << "Aggregated per device / channel type tensor size histogram:\n";
  for (const auto& device_hist : device_class_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
  }
  output << "\n";

  return output.str();
}

}  // end namespace grappler
}  // end namespace tensorflow