/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/grappler/costs/utils.h"

#include <stddef.h>
#include <utility>

#include "third_party/eigen3/Eigen/Core"

#if GOOGLE_CUDA
#include "cuda/include/cuda.h"
#include "cuda/include/cuda_runtime_api.h"
#include "cuda/include/cudnn.h"
#endif

#include "tensorflow/core/framework/allocation_description.pb.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_def.pb.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_description.pb.h"
#include "tensorflow/core/framework/tensor_shape.pb.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/tensor_id.h"
#include "tensorflow/core/grappler/clusters/utils.h"
#include "tensorflow/core/grappler/utils.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/cpu_info.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/protobuf/config.pb.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace tensorflow {
namespace grappler {

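// Returns a TensorProperties placeholder with DT_INVALID dtype and unknown
// rank, used below when nothing is known about an input tensor.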
static OpInfo::TensorProperties UnknownInput() {
  OpInfo::TensorProperties input;
  input.set_dtype(DataType::DT_INVALID);
  input.mutable_shape()->set_unknown_rank(true);
  return input;
}

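// Collects the TensorProto values held by an AttrValue: the single tensor for
// the kTensor case, every tensor in the list for the kList case, and nothing
// for any other value kind.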
static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) {
  std::vector<TensorProto> tensors;
  switch (attr_value.value_case()) {
    case AttrValue::kTensor: {
      tensors.push_back(attr_value.tensor());
      break;
    }
    case AttrValue::kList: {
      for (const auto& tensor_proto : attr_value.list().tensor()) {
        tensors.push_back(tensor_proto);
      }
      break;
    }
    default: {}
  }
  return tensors;
}

// Annotate the op_info inputs with extra information when possible (e.g. the
// input value if it's known statically).
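// In particular, a statically known Const input value is copied into the
// corresponding TensorProperties, the on-disk file size of a Const filename
// input is recorded as an "input_<i>_filesize" attribute, and the producing
// op of a handle input is recorded as a "parent_<i>_op" attribute.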
static void ExtractExtraProperties(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    OpInfo* op_info) {
  OpRegistry* op_registry = OpRegistry::Global();
  const OpDef* op_def = nullptr;
  auto s = op_registry->LookUpOpDef(node.op(), &op_def);
  if (!s.ok()) {
    op_def = nullptr;
  }

  for (int i = 0; i < node.input_size(); ++i) {
    const string input_name = node.input(i);
    CHECK(!input_name.empty());
    if (IsControlInput(input_name)) {
      continue;
    }
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name = input_tensor_id.first.ToString();

    auto iter = name_to_node.find(input_node_name);
    if (iter == name_to_node.end()) continue;
    const NodeDef* input_node = iter->second;

    if (i >= op_info->inputs_size()) {
      LOG(ERROR) << "OpInfo's inputs don't match the graph! OpInfo: "
                 << op_info->DebugString()
                 << "\nCurrent node: " << node.DebugString()
                 << "\nInput node: " << input_node->DebugString();
    }

    // The value attribute of a Const input is useful for cost prediction.
    if (input_node->op() == "Const" && i < op_info->inputs_size()) {
      auto it = input_node->attr().find("value");
      if (it == input_node->attr().end()) continue;

      const AttrValue& attr_value = it->second;
      std::vector<TensorProto> tensors = ExtractTensors(attr_value);
      if (tensors.empty()) continue;

      const TensorProto& t = tensors[0];
      OpInfo::TensorProperties* input = op_info->mutable_inputs(i);
      *(input->mutable_value()) = t;

      // For a filename input, the file size can also be useful.
      if (op_def && i < op_def->input_arg_size() &&
          op_def->input_arg(i).name().find("filename") != std::string::npos) {
        Tensor tensor;
        if (!tensor.FromProto(t)) {
          continue;
        }
        if (tensor.NumElements() != 1) {
          continue;
        }
        const string filename = tensor.scalar<string>()();

        Env* env = Env::Default();
        FileStatistics stat;
        Status s = env->Stat(filename, &stat);
        if (!s.ok()) {
          continue;
        }
        AttrValue attr;
        attr.set_i(stat.length);
        string attr_key = strings::StrCat("input_", i, "_filesize");
        (*op_info->mutable_attr())[attr_key] = attr;
      }
    }

    // When the input is a handle (e.g. a lookup table handle), the information
    // in the op itself is not sufficient to predict the op memory.
    if (op_def && i < op_def->input_arg_size() &&
        op_def->input_arg(i).name().find("handle") != std::string::npos) {
      string new_key = strings::StrCat("parent_", i, "_op");
      AttrValue attr;
      attr.set_s(input_node->op());
      (*op_info->mutable_attr())[new_key] = attr;
      // TODO(yuefengz): Only the parent node's op name is copied. Copy inputs
      // and attributes when necessary.
    }
  }
}

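// Looks up the dtype and shape of each of `node`'s inputs in the cost graph.
// Control inputs are skipped; inputs whose producer is missing from the cost
// graph (or has no output info) are reported as unknown.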
std::vector<OpInfo::TensorProperties> FindInputFeatures(
    const NodeDef& node,
    const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost,
    const std::unordered_map<string, const NodeDef*>& name_to_node) {
  std::vector<OpInfo::TensorProperties> inputs;
  for (const auto& input_name : node.input()) {
    CHECK(!input_name.empty());
    TensorId input_tensor_id = ParseTensorName(input_name);
    const string input_node_name = input_tensor_id.first.ToString();
    const int output_index = input_tensor_id.second;

    // Skip control inputs.
    if (output_index == Graph::kControlSlot) {
      continue;
    }

    auto it = name_to_cost.find(input_node_name);
    if (it == name_to_cost.end() || output_index < 0) {
      inputs.push_back(UnknownInput());
    } else {
      const CostGraphDef::Node* input_cost = it->second;
      if (input_cost->output_info_size() == 0) {
        inputs.push_back(UnknownInput());
      } else {
        const CostGraphDef::Node::OutputInfo& output =
            input_cost->output_info(output_index);
        OpInfo::TensorProperties input;
        input.set_dtype(output.dtype());
        *input.mutable_shape() = output.shape();
        inputs.push_back(input);
      }
    }
  }

  return inputs;
}

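// Resolves a fully qualified device name (e.g., for illustration, a string of
// the form "/job:localhost/replica:0/task:0/device:GPU:0") to the properties
// of the corresponding local device. Falls back to an "UNKNOWN" device if the
// name cannot be parsed or the device type is neither GPU nor CPU.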
DeviceProperties GetDeviceInfo(const string& device_str) {
  DeviceNameUtils::ParsedName parsed;
  if (DeviceNameUtils::ParseFullName(device_str, &parsed)) {
    if (parsed.type == "GPU") {
      return GetLocalGPUInfo(parsed.id);
    } else if (parsed.type == "CPU") {
      return GetLocalCPUInfo();
    }
  }
  DeviceProperties device;
  device.set_type("UNKNOWN");
  return device;
}

DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) {
  return GetDeviceInfo(node.device());
}

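// Builds an OpInfo for `node` from its op name, attributes, and the given
// input properties, then annotates it via ExtractExtraProperties. The device
// field is intentionally left unset; callers fill it in separately.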
OpInfo BuildOpInfoWithoutDevice(
    const NodeDef& node,
    const std::unordered_map<string, const NodeDef*>& name_to_node,
    const std::vector<OpInfo::TensorProperties>& inputs) {
  OpInfo op_info;
  op_info.set_op(node.op());
  *op_info.mutable_attr() = node.attr();
  for (auto& input : inputs) {
    *op_info.add_inputs() = input;
  }
  ExtractExtraProperties(node, name_to_node, &op_info);
  return op_info;
}

string GetOpDescription(const OpInfo& op_info) {
  string description = "[";
  description += "Op=" + op_info.op() + ", ";
  description += "input_shapes=[";
  for (auto const& input : op_info.inputs()) {
    description += PartialTensorShape::DebugString(input.shape());
  }
  // Close both the input_shapes list and the outer bracket.
  description += "]]";
  return description;
}

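// Converts a CostGraphDef plus the corresponding GraphDef into an
// OpPerformanceList. Nodes absent from the cost graph (never executed or
// optimized away) are skipped. Costs are converted from the cost graph's
// microseconds to OpPerformance's nanoseconds.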
OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph,
                                               const GraphDef& graph) {
  OpPerformanceList ret;
  std::unordered_map<string, const CostGraphDef::Node*> name_to_cost;
  std::unordered_map<string, const NodeDef*> name_to_node;
  for (auto& node : cost_graph.node()) {
    name_to_cost[node.name()] = &node;
  }
  for (auto& node : graph.node()) {
    name_to_node[node.name()] = &node;
  }

  for (const auto& node : graph.node()) {
    // Skip the nodes that are not in the cost graph: these are nodes that
    // aren't run, because they aren't in the intersection of the transitive
    // fan-in of a fetch node and the transitive fan-out of an input, or nodes
    // that were optimized away by the optimizer. Since they don't contribute
    // to the execution time we simply discard them.
    auto it = name_to_cost.find(node.name());
    if (it == name_to_cost.end()) {
      continue;
    }
    const CostGraphDef::Node* cost_node = it->second;

    OpPerformance* perf = ret.add_op_performance();
    perf->set_node(node.name());

    std::vector<OpInfo::TensorProperties> inputs =
        FindInputFeatures(node, name_to_cost, name_to_node);
    *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs);
    *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device());

    perf->set_temporary_memory_size(cost_node->temporary_memory_size());
    // Note that CostGraphDef::Node::compute_cost is microseconds, while
    // OpPerformance.compute_cost is nanoseconds.
    perf->set_compute_cost(cost_node->compute_cost() * 1000);
    perf->set_compute_time(cost_node->compute_time() * 1000);
    perf->set_memory_time(cost_node->memory_time() * 1000);

    for (const auto& output_info : cost_node->output_info()) {
      perf->mutable_op_memory()->add_output_memory(output_info.size());
    }

    perf->mutable_op_memory()->set_temp_memory(
        cost_node->temporary_memory_size());
    perf->mutable_op_memory()->set_persistent_memory(
        cost_node->persistent_memory_size());
  }
  return ret;
}

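// Adds one tensor size observation. Sizes are tracked in power-of-two
// buckets: bucket 0 holds size 0 and bucket i (i > 0) holds sizes in
// [2^(i-1), 2^i), with everything beyond the range folded into the last
// bucket (see Index() below).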
void TensorSizeHistogram::Add(const uint64 value) {
  num_elem_++;
  sum_elem_ += value;
  min_ = std::min(min_, value);
  max_ = std::max(max_, value);
  buckets_[Index(value)]++;
}

void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) {
  num_elem_ += src.num_elem_;
  sum_elem_ += src.sum_elem_;
  min_ = std::min(min_, src.min_);
  max_ = std::max(max_, src.max_);
  std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(),
                 buckets_.begin(), std::plus<uint64>());
}

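// Renders the histogram as text: a summary line with count, average, min,
// and max, followed by one row per non-empty bucket showing the bucket's
// byte range, count, percentage, cumulative percentage, and a bar of hash
// marks scaled so that 40 marks correspond to 100%.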
std::string TensorSizeHistogram::ToString() const {
  std::string r;
  char buf[200];
  snprintf(buf, sizeof(buf), "Count: %lld, Average: ", num_elem_);
  r.append(buf);
  r.append(strings::HumanReadableNumBytes(Average()));
  r.append(", Min: ");
  r.append(strings::HumanReadableNumBytes(min_));
  r.append(", Max: ");
  r.append(strings::HumanReadableNumBytes(max_));
  r.append("\n------------------------------------------------------\n");
  const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0;
  uint64 cumul_sum = 0;

  const int size_string_width = 12;
  for (int i = 0; i < buckets_.size(); i++) {
    if (buckets_[i] == 0) continue;
    cumul_sum += buckets_[i];
    r.append("[ ");
    if (i == 0) {
      r.append(size_string_width - 2, ' ');
      r.append("0B");
    } else {
      uint64 left = 1ULL << (i - 1);
      const auto left_string = strings::HumanReadableNumBytes(left);
      r.append(size_string_width - left_string.size(), ' ');
      r.append(left_string);
    }
    r.append(", ");
    uint64 right = 1ULL << i;
    const auto right_string = strings::HumanReadableNumBytes(right);
    r.append(size_string_width - right_string.size(), ' ');
    r.append(right_string);
    snprintf(buf, sizeof(buf), ") %7lld %7.3f%% %7.3f%% ",
             buckets_[i],         // count
             mult * buckets_[i],  // percentage
             mult * cumul_sum);   // cum percentage
    r.append(buf);

    // Add hash marks based on percentage; 40 marks for 100%.
    auto marks = static_cast<int>(
        (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_));
    r.append(marks, '#');
    r.push_back('\n');
  }
  return r;
}

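// Maps a tensor size to its bucket index: 0 -> 0, 1 -> 1, 2-3 -> 2, 4-7 -> 3,
// and so on, capped at kMaxBuckets - 1.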
const int TensorSizeHistogram::Index(const uint64 value) const {
  // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ...
  const auto index = Log2Floor64(value) + 1;
  return std::min(index, kMaxBuckets - 1);
}

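// Reduces a device name to a coarse "/<job>/<device type>" class; for
// illustration, "/job:worker/replica:0/task:0/device:GPU:0" becomes
// "/worker/GPU". Names that use '_' instead of ':' separators are normalized
// before parsing; names that still fail to parse are reported as
// "Unclassified".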
string GetDeviceClassForNonChannelDevice(const string& device_name) {
  DeviceNameUtils::ParsedName parsed_name;
  bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name);
  if (!parsed) {
    string name = str_util::StringReplace(device_name, "/job_", "/job:", true);
    name = str_util::StringReplace(name, "/replica_", "/replica:", true);
    name = str_util::StringReplace(name, "/task_", "/task:", true);
    name = str_util::StringReplace(name, "/device_", "/device:", true);
    name = str_util::StringReplace(name, "GPU_", "GPU:", true);
    name = str_util::StringReplace(name, "CPU_", "CPU:", true);
    name = str_util::StringReplace(name, "gpu_", "gpu:", true);
    name = str_util::StringReplace(name, "cpu_", "cpu:", true);
    parsed = DeviceNameUtils::ParseFullName(name, &parsed_name);
  }
  if (parsed) {
    const string jobname = parsed_name.has_job ? parsed_name.job : "";
    return strings::StrCat("/", jobname, "/", parsed_name.type);
  } else {
    return "Unclassified";
  }
}

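// Like GetDeviceClassForNonChannelDevice, but also understands channel device
// names of the form "Channel..._from_<src device>_to_<dst device>" (the
// convention currently assumed from VirtualScheduler); those are mapped to
// "Channel: <src class> -> <dst class>".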
string GetDeviceClass(const string& device_name) {
  // TODO(dyoon): channel device name follows the convention we currently have
  // in VirtualScheduler. This should be revised with VirtualScheduler as well
  // as VirtualPlacer in the future.
  if (device_name.find("Channel") != string::npos) {
    const string from = "_from_";
    const string to = "_to_";
    const auto from_loc = device_name.find(from);
    const auto to_loc = device_name.find(to);
    const auto src_device_full = device_name.substr(
        from_loc + from.size(), to_loc - (from_loc + from.size()));
    const auto dst_device_full = device_name.substr(to_loc + to.size());
    return strings::StrCat(
        "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full),
        " -> ", GetDeviceClassForNonChannelDevice(dst_device_full));
  } else {
    return GetDeviceClassForNonChannelDevice(device_name);
  }
}

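// Summarizes the tensor sizes recorded in run_metadata's step stats as
// histograms. Per-device histograms are aggregated by device class (see
// GetDeviceClass); if verbosity is set, the per-device histograms are printed
// as well.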
string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata,
                                     bool verbosity) {
  // TODO(dyoon): print out other stats as needed.
  std::ostringstream output;

  // Tensor size histogram:
  // if verbosity is set, output per-device histograms in addition to the
  // per-class histograms; otherwise, only the per-class histograms.
  std::unordered_map<string, TensorSizeHistogram> device_to_hist_map;
  const auto& step_stats = run_metadata.step_stats();
  for (const auto& dev_stat : step_stats.dev_stats()) {
    const auto& device_name = dev_stat.device();
    auto& hist = device_to_hist_map[device_name];
    for (const auto& node_stat : dev_stat.node_stats()) {
      for (const auto& node_output : node_stat.output()) {
        // TODO(dyoon): Calculate tensor size from tensor_description's dtype
        // and shape, instead of using optional allocation_description.
        const auto size = node_output.tensor_description()
                              .allocation_description()
                              .allocated_bytes();
        hist.Add(size);
      }
    }
  }
  if (verbosity) {
    output << "\n";
    output << "Per device tensor size histogram.\n";
  }

  std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map;
  for (const auto& device_hist : device_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    if (verbosity) {
      output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
    }
    const auto device_class = GetDeviceClass(device_name);
    auto it = device_class_to_hist_map.find(device_class);
    if (it == device_class_to_hist_map.end()) {
      device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist));
    } else {
      it->second.Merge(hist);
    }
  }
  output << "\n";
  output << "Aggregated per device / channel type tensor size histogram:\n";
  for (const auto& device_hist : device_class_to_hist_map) {
    const auto& device_name = device_hist.first;
    const auto& hist = device_hist.second;
    output << "Device: " << device_name << "\n" << hist.ToString() << "\n";
  }
  output << "\n";

  return output.str();
}

}  // end namespace grappler
}  // end namespace tensorflow