Home | History | Annotate | Download | only in graph
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/core/graph/costmodel.h"
     17 
     18 #include <vector>
     19 #include "tensorflow/core/framework/allocation_description.pb.h"
     20 #include "tensorflow/core/framework/cost_graph.pb.h"
     21 #include "tensorflow/core/framework/step_stats.pb.h"
     22 #include "tensorflow/core/framework/tensor_description.pb.h"
     23 #include "tensorflow/core/graph/graph.h"
     24 #include "tensorflow/core/platform/logging.h"
     25 
     26 namespace tensorflow {
     27 namespace {
     28 const Microseconds kDefaultTimeEstimate(1);
     29 const Microseconds kMinTimeEstimate(1);
     30 }  // namespace
     31 
     32 void CostModel::SuppressInfrequent() {
     33   // Find the median of the non-zero counts, and use half of its value
     34   // as the cutoff for a "normal" execution mode node.
     35   if (count_.empty()) return;
     36   std::vector<int32> non_zero;
     37   for (auto v : count_) {
     38     if (v > 0) non_zero.push_back(v);
     39   }
     40   const size_t sz = non_zero.size();
     41   if (sz > 0) {
     42     std::nth_element(non_zero.begin(), non_zero.begin() + sz / 2,
     43                      non_zero.end());
     44     int32 median_value = non_zero[sz / 2];
     45     min_count_ = median_value / 2;
     46     VLOG(1) << "num non_zero vals: " << non_zero.size() << " median_value "
     47             << median_value;
     48   } else {
     49     min_count_ = 1;
     50   }
     51 }
     52 
     53 void CostModel::MergeFromLocal(const Graph& g, const CostModel& cm) {
     54   CHECK(is_global_);
     55   CHECK(!cm.is_global());
     56   for (const Node* n : g.nodes()) {
     57     const int local_id = cm.Id(n);
     58     const int global_id = Id(n);
     59     if (local_id < 0 || global_id < 0) continue;
     60     int num_slots = cm.slot_bytes_[local_id].size();
     61     Ensure(global_id, num_slots);
     62     count_[global_id] += cm.count_[local_id];
     63     time_[global_id] += cm.time_[local_id];
     64     if (num_slots > 0) {
     65       if (slot_bytes_[global_id].empty()) {
     66         slot_bytes_[global_id].resize(num_slots);
     67       } else {
     68         CHECK_EQ(num_slots, slot_bytes_[global_id].size());
     69       }
     70       for (int s = 0; s < num_slots; ++s) {
     71         slot_bytes_[global_id][s] += cm.slot_bytes_[local_id][s];
     72       }
     73     }
     74   }
     75 }
     76 
     77 void CostModel::MergeFromGlobal(const CostModel& cm) {
     78   CHECK(is_global_);
     79   CHECK_EQ(true, cm.is_global());
     80   const int num_nodes = cm.count_.size();
     81   for (int i = num_nodes - 1; i >= 0; --i) {
     82     count_[i] += cm.count_[i];
     83     time_[i] += cm.time_[i];
     84     int num_slots = cm.slot_bytes_[i].size();
     85     Ensure(i, num_slots);
     86     if (num_slots > 0) {
     87       if (slot_bytes_[i].empty()) {
     88         slot_bytes_[i].resize(num_slots);
     89       } else {
     90         CHECK_EQ(num_slots, slot_bytes_[i].size());
     91       }
     92       for (int s = 0; s < num_slots; ++s) {
     93         slot_bytes_[i][s] += cm.slot_bytes_[i][s];
     94       }
     95     }
     96   }
     97 }
     98 
     99 void CostModel::MergeFromStats(const NodeNameToCostIdMap& map,
    100                                const StepStats& ss) {
    101   CHECK(is_global_);
    102   for (auto& ds : ss.dev_stats()) {
    103     for (auto& ns : ds.node_stats()) {
    104       NodeNameToCostIdMap::const_iterator iter = map.find(ns.node_name());
    105       // We don't keep stats for nodes not in the global graph, i.e.
    106       // copy/send/recv nodes, feed/fetch, etc.
    107       if (iter == map.end()) continue;
    108       int32 global_id = iter->second;
    109       Ensure(global_id, ns.output_size());
    110       int64 elapsed_micros = ns.op_end_rel_micros() - ns.op_start_rel_micros();
    111       count_[global_id]++;
    112       time_[global_id] += elapsed_micros;
    113       for (auto& no : ns.output()) {
    114         int si = no.slot();
    115         if (static_cast<size_t>(si) >= slot_bytes_[global_id].size()) {
    116           slot_bytes_[global_id].resize(1 + si);
    117         }
    118         slot_bytes_[global_id][si] +=
    119             no.tensor_description().allocation_description().requested_bytes();
    120       }
    121     }
    122   }
    123 }
    124 
    125 void CostModel::Ensure(int id, int num_outputs) {
    126   if (slot_bytes_.size() <= static_cast<size_t>(id)) {
    127     slot_bytes_.resize(id + 1);
    128     count_.resize(id + 1);
    129     time_.resize(id + 1);
    130     max_mem_usage_.resize(id + 1);
    131     max_exec_time_.resize(id + 1);
    132     output_port_alloc_ids_.resize(id + 1);
    133   }
    134   if (num_outputs > 0) {
    135     auto perslot = &slot_bytes_[id];
    136     auto output_port_alloc_ids = &output_port_alloc_ids_[id];
    137     auto max_mem_usage = &max_mem_usage_[id];
    138 
    139     CHECK_LE(perslot->size(), num_outputs);
    140     DCHECK_EQ(output_port_alloc_ids->size(), perslot->size());
    141     DCHECK_EQ(max_mem_usage->output_port_mem.size(), perslot->size());
    142     DCHECK_EQ(max_mem_usage->output_port_shape.size(), perslot->size());
    143     DCHECK_EQ(max_mem_usage->output_port_type.size(), perslot->size());
    144 
    145     perslot->resize(num_outputs, Bytes(-1));
    146     output_port_alloc_ids->resize(num_outputs, -1);
    147     max_mem_usage->output_port_mem.resize(num_outputs, Bytes(-1));
    148     max_mem_usage->output_port_shape.resize(num_outputs, unknown_shape_);
    149     max_mem_usage->output_port_type.resize(num_outputs, DT_INVALID);
    150   }
    151 }
    152 
    153 void CostModel::SetNumOutputs(const Node* node, int num_outputs) {
    154   const int id = Id(node);
    155   if (id < 0) return;
    156   // Do not resize the number of slots before checking its existing number of
    157   // slots.
    158   Ensure(id, 0);
    159   auto perslot = &slot_bytes_[id];
    160   if (!perslot->empty()) {
    161     CHECK_EQ(num_outputs, perslot->size())
    162         << "Cannot resize slot_bytes, node=" << node->name();
    163   }
    164   Ensure(id, num_outputs);
    165 }
    166 
    167 void CostModel::RecordCount(const Node* node, int count) {
    168   const int id = Id(node);
    169   if (id < 0) return;
    170   CHECK_LT(id, slot_bytes_.size());
    171   count_[id] += count;
    172 }
    173 
    174 int32 CostModel::TotalCount(const Node* node) const {
    175   const int id = Id(node);
    176   if (id < 0) return 0;
    177   return (static_cast<size_t>(id) < slot_bytes_.size()) ? count_[id] : 0;
    178 }
    179 
    180 void CostModel::RecordSize(const Node* node, int slot, Bytes bytes) {
    181   const int id = Id(node);
    182   if (id < 0) return;
    183   CHECK_LT(id, slot_bytes_.size());
    184   auto perslot = &slot_bytes_[id];
    185   CHECK_LT(slot, perslot->size());
    186   auto v = &(*perslot)[slot];
    187   if (*v >= 0) {
    188     *v += bytes;
    189   } else {
    190     *v = bytes;
    191   }
    192 }
    193 
    194 Bytes CostModel::TotalBytes(const Node* node, int slot) const {
    195   const int id = Id(node);
    196   if (id < 0 || static_cast<size_t>(id) >= slot_bytes_.size() ||
    197       slot_bytes_[id].size() <= static_cast<size_t>(slot)) {
    198     return Bytes(0);
    199   }
    200   return slot_bytes_[id][slot];
    201 }
    202 
    203 Bytes CostModel::SizeEstimate(const Node* node, int slot) const {
    204   int32 count = TotalCount(node);
    205   if (count < min_count_) return Bytes(0);
    206   return TotalBytes(node, slot) / std::max(1, TotalCount(node));
    207 }
    208 
    209 void CostModel::RecordTime(const Node* node, Microseconds time) {
    210   const int id = Id(node);
    211   if (id < 0) return;
    212   DCHECK(node->IsOp()) << node->DebugString();
    213   Ensure(id, node->num_outputs());
    214   time_[id] += time;
    215 }
    216 
    217 Microseconds CostModel::TotalTime(const Node* node) const {
    218   DCHECK(node->IsOp()) << node->DebugString();
    219   const int id = Id(node);
    220   if (id < 0 || static_cast<size_t>(id) >= time_.size() ||
    221       time_[id] < Microseconds(0)) {
    222     return Microseconds(0);
    223   }
    224   return time_[id];
    225 }
    226 
    227 Microseconds CostModel::TimeEstimate(const Node* node) const {
    228   int32 count = TotalCount(node);
    229   if (count <= min_count_) return kMinTimeEstimate;
    230   return std::max(kMinTimeEstimate, TotalTime(node) / std::max(1, count));
    231 }
    232 
    233 void CostModel::CheckInitialized(const Graph& graph) const {
    234   for (const Node* n : graph.op_nodes()) {
    235     CHECK(static_cast<size_t>(n->id()) < time_.size() &&
    236           time_[n->id()] >= Microseconds(0))
    237         << ": no time estimate for " << n->DebugString();
    238 
    239     CHECK(static_cast<size_t>(n->id()) < slot_bytes_.size())
    240         << ": no size estimate for " << n->DebugString();
    241     const auto& perslot = slot_bytes_[n->id()];
    242     for (size_t i = 0; i < perslot.size(); i++) {
    243       CHECK_GE(perslot[i], Bytes(0)) << ": no size estimate for output# " << i
    244                                      << " of " << n->DebugString();
    245     }
    246   }
    247 }
    248 
    249 void CostModel::RecordMaxMemorySize(const Node* node, int output_slot,
    250                                     Bytes bytes,
    251                                     const TensorShapeProto& tensor_shape,
    252                                     const DataType& dtype) {
    253   const int id = Id(node);
    254   if (id < 0) return;
    255   if (output_slot >= node->num_outputs()) {
    256     LOG(ERROR) << "Unexpected output slot for node " << node->DebugString()
    257                << ". Got " << output_slot << " but its num_outputs is "
    258                << node->num_outputs();
    259     return;
    260   }
    261   Ensure(id, node->num_outputs());
    262   auto& current_max = max_mem_usage_[id].output_port_mem[output_slot];
    263   // If the memory allocator doesn't track memory usage, let's infer a lower
    264   // bound from the tensor shape and its data type.
    265   if (bytes.value() < 0) {
    266     bytes = MinTensorMemoryUsage(tensor_shape, dtype);
    267   }
    268   if (bytes.value() > current_max.value()) {
    269     current_max = bytes.value();
    270     max_mem_usage_[id].output_port_shape[output_slot] = tensor_shape;
    271     max_mem_usage_[id].output_port_type[output_slot] = dtype;
    272   }
    273 }
    274 
    275 Bytes CostModel::MaxMemorySize(const Node* node, int slot) const {
    276   const int id = Id(node);
    277   if (id < 0 || static_cast<size_t>(id) >= max_mem_usage_.size() ||
    278       max_mem_usage_[id].output_port_mem.size() <= static_cast<size_t>(slot)) {
    279     return Bytes(0);
    280   }
    281   return max_mem_usage_[id].output_port_mem[slot];
    282 }
    283 
    284 const TensorShapeProto& CostModel::MaxMemoryShape(const Node* node,
    285                                                   int slot) const {
    286   const int id = Id(node);
    287   if (id < 0 || static_cast<size_t>(id) >= max_mem_usage_.size() ||
    288       max_mem_usage_[id].output_port_shape.size() <=
    289           static_cast<size_t>(slot)) {
    290     return unknown_shape_;
    291   }
    292   return max_mem_usage_[id].output_port_shape[slot];
    293 }
    294 
    295 DataType CostModel::MaxMemoryType(const Node* node, int slot) const {
    296   const int id = Id(node);
    297   if (id < 0 || static_cast<size_t>(id) >= max_mem_usage_.size() ||
    298       max_mem_usage_[id].output_port_type.size() <= static_cast<size_t>(slot)) {
    299     return DT_INVALID;
    300   }
    301   return max_mem_usage_[id].output_port_type[slot];
    302 }
    303 
    304 Bytes CostModel::TempMemorySize(const Node* node) const {
    305   const int id = Id(node);
    306   if (id < 0) {
    307     return Bytes(0);
    308   }
    309   return max_mem_usage_[id].temp_memory_size;
    310 }
    311 
    312 Bytes CostModel::PersistentMemorySize(const Node* node) const {
    313   const int id = Id(node);
    314   if (id < 0) {
    315     return Bytes(0);
    316   }
    317   return max_mem_usage_[id].persistent_memory_size;
    318 }
    319 
    320 void CostModel::RecordMemoryStats(const Node* node,
    321                                   const MemoryStats& memory_stats) {
    322   const int id = Id(node);
    323   if (id < 0) return;
    324   max_mem_usage_[id].temp_memory_size = memory_stats.temp_memory_size();
    325   max_mem_usage_[id].persistent_memory_size =
    326       memory_stats.persistent_memory_size();
    327   for (int64 alloc_id : memory_stats.persistent_tensor_alloc_ids()) {
    328     if (alloc_id > 0) {
    329       persistent_alloc_ids_.insert(alloc_id);
    330     }
    331   }
    332 }
    333 
    334 void CostModel::RecordMaxExecutionTime(const Node* node, Microseconds time) {
    335   const int id = Id(node);
    336   if (id < 0) return;
    337   Ensure(id, node->num_outputs());
    338   max_exec_time_[id] = std::max(max_exec_time_[id], time);
    339 }
    340 
    341 Microseconds CostModel::MaxExecutionTime(const Node* node) const {
    342   const int id = Id(node);
    343   if (id < 0 || static_cast<size_t>(id) >= max_exec_time_.size()) {
    344     return Microseconds(0);
    345   }
    346   return max_exec_time_[id];
    347 }
    348 
    349 void CostModel::RecordAllocationId(const Node* node, int output_slot,
    350                                    int64 alloc_id) {
    351   const int id = Id(node);
    352   if (id < 0) return;
    353   Ensure(id, node->num_outputs());
    354   output_port_alloc_ids_[id][output_slot] = alloc_id;
    355 }
    356 
    357 int64 CostModel::AllocationId(const Node* node, int slot) const {
    358   const int id = Id(node);
    359   if (id < 0 || static_cast<size_t>(id) >= output_port_alloc_ids_.size() ||
    360       output_port_alloc_ids_[id].size() <= static_cast<size_t>(slot)) {
    361     return -1;
    362   }
    363   return output_port_alloc_ids_[id][slot];
    364 }
    365 
    366 bool CostModel::IsPersistentTensor(const Node* node, int64 alloc_id) const {
    367   if (persistent_alloc_ids_.count(alloc_id) > 0) {
    368     return true;
    369   }
    370   if (persistent_alloc_ids_by_devices_.find(node->assigned_device_name()) ==
    371       persistent_alloc_ids_by_devices_.end()) {
    372     return false;
    373   }
    374   return persistent_alloc_ids_by_devices_.at(node->assigned_device_name())
    375       .count(alloc_id);
    376 }
    377 
    378 Microseconds CostModel::CopyTimeEstimate(Bytes b, double network_latency_millis,
    379                                          double estimated_gbps) {
    380   // TODO(jeff,sanjay): estimate cost based on bandwidth along the
    381   // communication path and the type of transport we are using between
    382   // devices.
    383   //
    384   // We assume the copy time follows a linear model:
    385   //    copy_time = copy_bytes / rate + min_time
    386   int64 copy_bytes = b.value();
    387   const double bytes_per_usec = estimated_gbps * 1000.0 / 8;
    388   const double min_micros = network_latency_millis * 1000.0;
    389   return Microseconds(
    390       static_cast<int64>(copy_bytes / bytes_per_usec + min_micros));
    391 }
    392 
    393 Microseconds CostModel::ComputationTimeEstimate(int64 math_ops) {
    394   // TODO(jeff,sanjay): Eventually we should pass in the type of device
    395   // (GPU vs. CPU) and use that to affect the estimate.
    396 
    397   // We estimate the microseconds using that value.  We divide
    398   // by 1000 to convert the madd number into microseconds (assuming
    399   // roughly 1000 madds per microsecond (~1 GHz for one core)).
    400   return Microseconds(math_ops / 1000);
    401 }
    402 
    403 void CostModel::IncrementUpdateTimes() { update_times_++; }
    404 
    405 int32 CostModel::GetUpdateTimes() const { return update_times_; }
    406 
    407 // ----------------------------------------------------------------------------
    408 // InitCostModel
    409 // ----------------------------------------------------------------------------
    410 
    411 namespace {
    412 
    413 static void AddNodesToCostModel(const Graph& g, CostModel* cost_model) {
    414   for (Node* n : g.nodes()) {
    415     const int num_outputs = n->num_outputs();
    416     cost_model->SetNumOutputs(n, num_outputs);
    417     for (int output = 0; output < num_outputs; output++) {
    418       // Set up an initial bogus estimate for the node's outputs
    419       cost_model->RecordSize(n, output, Bytes(1));
    420     }
    421   }
    422 }
    423 
    424 static void AssignSizes(const Graph& g, CostModel* cost_model) {
    425   for (const Edge* e : g.edges()) {
    426     // Skip if it is a control edge.
    427     if (e->IsControlEdge()) {
    428       continue;
    429     }
    430     Node* src = e->src();
    431 
    432     // TODO(josh11b): Get an estimate from the Op
    433     Bytes size(1);
    434     cost_model->RecordSize(src, e->src_output(), size);
    435   }
    436 }
    437 
    438 // This generates an extremely simple initial guess for the
    439 // computation cost of each node. For ordinary Ops, its value should quickly
    440 // be wiped out by the real runtime measurements.  For other Ops we don't
    441 // actually generate measurements, so suppression of infrequent Ops ends up
    442 // giving them 0 costs.  So, this is not of much consequence except perhaps
    443 // in tests.
    444 static Microseconds TimeEstimateForNode(CostModel* cost_model, Node* n) {
    445   CHECK(n->IsOp());
    446   VLOG(2) << "Node " << n->id() << ": " << n->name()
    447           << " type_string: " << n->type_string();
    448   if (IsConstant(n) || IsVariable(n)) {
    449     return Microseconds(0);
    450   }
    451   return kDefaultTimeEstimate;
    452 }
    453 
    454 static void EstimateComputationCosts(const Graph& g, CostModel* cost_model) {
    455   for (Node* n : g.nodes()) {
    456     if (!n->IsOp()) continue;
    457     cost_model->RecordTime(n, TimeEstimateForNode(cost_model, n));
    458   }
    459 }
    460 
    461 }  // namespace
    462 
    463 void CostModel::InitFromGraph(const Graph& g) {
    464   const int num_node_ids = g.num_node_ids();
    465   slot_bytes_.reserve(num_node_ids);
    466   count_.reserve(num_node_ids);
    467   time_.reserve(num_node_ids);
    468   max_mem_usage_.reserve(num_node_ids);
    469   max_exec_time_.reserve(num_node_ids);
    470   output_port_alloc_ids_.reserve(num_node_ids);
    471 
    472   AddNodesToCostModel(g, this);
    473   AssignSizes(g, this);
    474   EstimateComputationCosts(g, this);
    475   CheckInitialized(g);
    476 }
    477 
    478 void CostModel::AddToCostGraphDef(const Graph* graph,
    479                                   CostGraphDef* cost_graph) const {
    480   std::vector<const Edge*> inputs;
    481   std::vector<const Edge*> control_inputs;
    482   for (const Node* n : graph->nodes()) {
    483     CostGraphDef::Node* cnode = cost_graph->add_node();
    484     cnode->set_name(n->name());
    485     cnode->set_device(n->assigned_device_name());
    486     cnode->set_id(Id(n));
    487 
    488     inputs.clear();
    489     inputs.resize(n->num_inputs(), nullptr);
    490     control_inputs.clear();
    491     for (const Edge* e : n->in_edges()) {
    492       if (e->IsControlEdge()) {
    493         control_inputs.push_back(e);
    494       } else {
    495         inputs[e->dst_input()] = e;
    496       }
    497     }
    498     std::sort(control_inputs.begin(), control_inputs.end(),
    499               [this](Edge const* a, Edge const* b) {
    500                 return Id(a->src()) < Id(b->src());
    501               });
    502 
    503     for (const Edge* e : inputs) {
    504       CostGraphDef::Node::InputInfo* input_info = cnode->add_input_info();
    505       input_info->set_preceding_node(Id(e->src()));
    506       input_info->set_preceding_port(e->src_output());
    507     }
    508 
    509     for (int i = 0; i < n->num_outputs(); i++) {
    510       CostGraphDef::Node::OutputInfo* output_info = cnode->add_output_info();
    511       int64 alloc_id = AllocationId(n, i);
    512       int64 alias_to_input = -1;
    513       for (const Edge* e : inputs) {
    514         int64 input_alloc_id = AllocationId(e->src(), e->src_output());
    515         if (input_alloc_id == alloc_id) {
    516           alias_to_input = e->dst_input();
    517           break;
    518         }
    519       }
    520       output_info->set_alias_input_port(alias_to_input);
    521       output_info->set_dtype(MaxMemoryType(n, i));
    522       *output_info->mutable_shape() = MaxMemoryShape(n, i);
    523       if (alias_to_input < 0 && IsPersistentTensor(n, alloc_id)) {
    524         output_info->set_size(0);
    525       } else {
    526         output_info->set_size(MaxMemorySize(n, i).value());
    527       }
    528     }
    529 
    530     for (const Edge* e : control_inputs) {
    531       cnode->add_control_input(Id(e->src()));
    532     }
    533 
    534     cnode->set_temporary_memory_size(TempMemorySize(n).value());
    535     cnode->set_persistent_memory_size(PersistentMemorySize(n).value());
    536 
    537     cnode->set_compute_cost(MaxExecutionTime(n).value());
    538 
    539     // For now we treat all send nodes as final.
    540     // TODO(yuanbyu): Send nodes for fetches shouldn't be treated as final.
    541     cnode->set_is_final(n->IsSend());
    542   }
    543 }
    544 
    545 void CostModel::WriteSummaryToLog() const {
    546   LOG(INFO) << " min_count_=" << min_count_;
    547   for (size_t i = 0; i < count_.size(); ++i) {
    548     LOG(INFO) << "Node " << i << " count " << count_[i] << " total time "
    549               << time_[i] << " avg time "
    550               << (time_[i] / (std::max(1, count_[i])));
    551   }
    552 }
    553 
    554 Bytes CostModel::MinTensorMemoryUsage(const TensorShapeProto& tensor_shape,
    555                                       const DataType& dtype) {
    556   if (tensor_shape.unknown_rank()) {
    557     return Bytes(-1);
    558   }
    559 
    560   size_t num_coefficients = 1;
    561   for (const TensorShapeProto::Dim& dim : tensor_shape.dim()) {
    562     // If the dimension is unknown, it has to be at least 1
    563     num_coefficients *= std::max<size_t>(dim.size(), 1);
    564   }
    565   return Bytes(num_coefficients * DataTypeSize(dtype));
    566 }
    567 
    568 }  // namespace tensorflow
    569