1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/grappler/costs/utils.h" 17 18 #include <stddef.h> 19 #include <utility> 20 21 #include "third_party/eigen3/Eigen/Core" 22 23 #include "tensorflow/core/common_runtime/gpu/gpu_id.h" 24 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" 25 #include "tensorflow/core/framework/allocation_description.pb.h" 26 #include "tensorflow/core/framework/attr_value.pb.h" 27 #include "tensorflow/core/framework/op.h" 28 #include "tensorflow/core/framework/op_def.pb.h" 29 #include "tensorflow/core/framework/step_stats.pb.h" 30 #include "tensorflow/core/framework/tensor.pb.h" 31 #include "tensorflow/core/framework/tensor_description.pb.h" 32 #include "tensorflow/core/framework/tensor_shape.pb.h" 33 #include "tensorflow/core/framework/types.pb.h" 34 #include "tensorflow/core/graph/graph.h" 35 #include "tensorflow/core/graph/tensor_id.h" 36 #include "tensorflow/core/grappler/clusters/utils.h" 37 #include "tensorflow/core/grappler/utils.h" 38 #include "tensorflow/core/lib/core/bits.h" 39 #include "tensorflow/core/lib/strings/numbers.h" 40 #include "tensorflow/core/lib/strings/strcat.h" 41 #include "tensorflow/core/platform/byte_order.h" 42 #include "tensorflow/core/platform/env.h" 43 #include "tensorflow/core/platform/logging.h" 44 #include "tensorflow/core/platform/protobuf.h" 45 #include "tensorflow/core/protobuf/config.pb.h" 46 #include "tensorflow/core/util/device_name_utils.h" 47 48 namespace tensorflow { 49 namespace grappler { 50 51 static OpInfo::TensorProperties UnknownInput() { 52 OpInfo::TensorProperties input; 53 input.set_dtype(DataType::DT_INVALID); 54 input.mutable_shape()->set_unknown_rank(true); 55 return input; 56 } 57 58 static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) { 59 std::vector<TensorProto> tensors; 60 switch (attr_value.value_case()) { 61 case AttrValue::kTensor: { 62 tensors.push_back(attr_value.tensor()); 63 break; 64 } 65 case AttrValue::kList: { 66 for (const auto& tensor_proto : attr_value.list().tensor()) { 67 tensors.push_back(tensor_proto); 68 } 69 break; 70 } 71 default: { 72 } 73 } 74 return tensors; 75 } 76 77 // Annotate the op_info inputs with extra information when possible (e.g. the 78 // input value if it's known statically). 79 static void ExtractExtraProperties( 80 const NodeDef& node, 81 const std::unordered_map<string, const NodeDef*>& name_to_node, 82 OpInfo* op_info) { 83 OpRegistry* op_registry = OpRegistry::Global(); 84 const OpDef* op_def = nullptr; 85 auto s = op_registry->LookUpOpDef(node.op(), &op_def); 86 if (!s.ok()) { 87 op_def = nullptr; 88 } 89 90 for (int i = 0; i < node.input_size(); ++i) { 91 const string input_name = node.input(i); 92 CHECK(!input_name.empty()); 93 if (IsControlInput(input_name)) { 94 continue; 95 } 96 TensorId input_tensor_id = ParseTensorName(input_name); 97 const string input_node_name(input_tensor_id.first); 98 99 auto iter = name_to_node.find(input_node_name); 100 if (iter == name_to_node.end()) continue; 101 const NodeDef* input_node = iter->second; 102 103 if (i >= op_info->inputs_size()) { 104 LOG(ERROR) << "OpInfo's inputs doesn't match the graph! OpInfo: " 105 << op_info->DebugString() 106 << "\nCurrent node: " << node.DebugString() 107 << "\nInput node: " << input_node->DebugString(); 108 } 109 110 // The value attribute in Const input is useful for cost prediction. 111 if (input_node->op() == "Const" && i < op_info->inputs_size()) { 112 auto it = input_node->attr().find("value"); 113 if (it == input_node->attr().end()) continue; 114 115 const AttrValue& attr_value = it->second; 116 std::vector<TensorProto> tensors = ExtractTensors(attr_value); 117 if (tensors.empty()) continue; 118 119 const TensorProto& t = tensors[0]; 120 OpInfo::TensorProperties* input = op_info->mutable_inputs(i); 121 *(input->mutable_value()) = t; 122 123 // For filename input, the file size can also be useful. 124 if (op_def && i < op_def->input_arg_size() && 125 op_def->input_arg(i).name().find("filename") != string::npos) { 126 Tensor tensor; 127 if (!tensor.FromProto(t)) { 128 continue; 129 } 130 if (tensor.NumElements() != 1) { 131 continue; 132 } 133 const string filename = tensor.scalar<string>()(); 134 135 Env* env = Env::Default(); 136 FileStatistics stat; 137 Status s = env->Stat(filename, &stat); 138 if (!s.ok()) { 139 continue; 140 } 141 AttrValue attr; 142 attr.set_i(stat.length); 143 string attr_key = strings::StrCat("input_", i, "_filesize"); 144 (*op_info->mutable_attr())[attr_key] = attr; 145 } 146 } 147 148 // When the input is a handle (e.g. look up table handle), the information 149 // in the op itself is not sufficient to predict the op memory. 150 if (op_def && i < op_def->input_arg_size() && 151 op_def->input_arg(i).name().find("handle") != string::npos) { 152 string new_key = strings::StrCat("parent_", i, "_op"); 153 AttrValue attr; 154 attr.set_s(input_node->op()); 155 (*op_info->mutable_attr())[new_key] = attr; 156 // TODO(yuefengz): Only parent node's op name is copied. Copy inputs 157 // and attributes when necessary. 158 } 159 } 160 } 161 162 std::vector<OpInfo::TensorProperties> FindInputFeatures( 163 const NodeDef& node, 164 const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost, 165 const std::unordered_map<string, const NodeDef*>& name_to_node) { 166 std::vector<OpInfo::TensorProperties> inputs; 167 for (const auto& input_name : node.input()) { 168 CHECK(!input_name.empty()); 169 TensorId input_tensor_id = ParseTensorName(input_name); 170 const string input_node_name(input_tensor_id.first); 171 const int output_index = input_tensor_id.second; 172 173 // Skip control inputs. 174 if (output_index == Graph::kControlSlot) { 175 continue; 176 } 177 178 auto it = name_to_cost.find(input_node_name); 179 if (it == name_to_cost.end() || output_index < 0) { 180 inputs.push_back(UnknownInput()); 181 } else { 182 const CostGraphDef::Node* input_cost = it->second; 183 if (input_cost->output_info_size() == 0) { 184 inputs.push_back(UnknownInput()); 185 } else { 186 const CostGraphDef::Node::OutputInfo& output = 187 input_cost->output_info(output_index); 188 OpInfo::TensorProperties input; 189 input.set_dtype(output.dtype()); 190 *input.mutable_shape() = output.shape(); 191 inputs.push_back(input); 192 } 193 } 194 } 195 196 return inputs; 197 } 198 199 int64 CalculateTensorSize(const OpInfo::TensorProperties& prop) { 200 int64 size = DataTypeSize(BaseType(prop.dtype())); 201 TensorShapeProto shape = prop.shape(); 202 203 // Can't infer the size if the rank is unknown. It has to be at least a 204 // scalar though. 205 if (shape.unknown_rank()) { 206 VLOG(2) << "CalculateTensorSize() -- unknown rank"; 207 return size; 208 } 209 210 // If one of the dimensions is unknown statically, assume it's at least one. 211 for (int i = 0; i < shape.dim_size(); ++i) { 212 if (shape.dim(i).size() < 0) { 213 shape.mutable_dim(i)->set_size(1); 214 VLOG(2) << "CalculateTensorSize() -- unknown dim: " << i; 215 } 216 } 217 218 int64 num_elems = TensorShape(shape).num_elements(); 219 return num_elems * size; 220 } 221 222 int64 CalculateOutputSize( 223 const std::vector<OpInfo::TensorProperties>& output_properties, 224 const int port_num) { 225 if (port_num < 0) return 4; // 4B for control dependency. 226 227 if (port_num >= output_properties.size()) { 228 LOG(ERROR) << "CalculateOutputSize() -- port_num: " << port_num 229 << " >= output_properties.size(): " << output_properties.size(); 230 return 0; 231 } 232 233 return CalculateTensorSize(output_properties[port_num]); 234 } 235 236 DeviceProperties GetDeviceInfo(const string& device_str) { 237 DeviceProperties unknown; 238 unknown.set_type("UNKNOWN"); 239 240 DeviceNameUtils::ParsedName parsed; 241 if (DeviceNameUtils::ParseFullName(device_str, &parsed)) { 242 if (parsed.type == "GPU") { 243 TfGpuId tf_gpu_id(parsed.id); 244 PlatformGpuId platform_gpu_id; 245 Status s = GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id); 246 if (!s.ok()) { 247 // We are probably running simulation without linking cuda libraries. 248 platform_gpu_id = PlatformGpuId(parsed.id); 249 } 250 return GetLocalGPUInfo(platform_gpu_id); 251 } else if (parsed.type == "CPU") { 252 return GetLocalCPUInfo(); 253 } 254 } 255 return unknown; 256 } 257 258 DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) { 259 return GetDeviceInfo(node.device()); 260 } 261 262 OpInfo BuildOpInfoWithoutDevice( 263 const NodeDef& node, 264 const std::unordered_map<string, const NodeDef*>& name_to_node, 265 const std::vector<OpInfo::TensorProperties>& inputs) { 266 OpInfo op_info; 267 op_info.set_op(node.op()); 268 *op_info.mutable_attr() = node.attr(); 269 for (auto& input : inputs) { 270 *op_info.add_inputs() = input; 271 } 272 ExtractExtraProperties(node, name_to_node, &op_info); 273 return op_info; 274 } 275 276 string GetOpDescription(const OpInfo& op_info) { 277 string description = "["; 278 description += "Op=" + op_info.op() + ", "; 279 description += "input_shapes=["; 280 for (auto const& input : op_info.inputs()) { 281 description += PartialTensorShape::DebugString(input.shape()); 282 } 283 description += "]"; 284 return description; 285 } 286 287 OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph, 288 const GraphDef& graph) { 289 OpPerformanceList ret; 290 std::unordered_map<string, const CostGraphDef::Node*> name_to_cost; 291 std::unordered_map<string, const NodeDef*> name_to_node; 292 for (auto& node : cost_graph.node()) { 293 name_to_cost[node.name()] = &node; 294 } 295 for (auto& node : graph.node()) { 296 name_to_node[node.name()] = &node; 297 } 298 299 for (const auto& node : graph.node()) { 300 // Skip the nodes that are not in the cost graph: these are nodes that 301 // aren't run, because they aren't in the intersection of transitive 302 // fan-in of a fetch node and the transitive fan-out of an input, or nodes 303 // that were optimized away by the optimizer. Since they don't contribute 304 // to the execution time we simply discard them. 305 auto it = name_to_cost.find(node.name()); 306 if (it == name_to_cost.end()) { 307 continue; 308 } 309 const CostGraphDef::Node* cost_node = it->second; 310 311 OpPerformance* perf = ret.add_op_performance(); 312 perf->set_node(node.name()); 313 314 std::vector<OpInfo::TensorProperties> inputs = 315 FindInputFeatures(node, name_to_cost, name_to_node); 316 *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs); 317 *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device()); 318 319 perf->set_temporary_memory_size(cost_node->temporary_memory_size()); 320 // Note that CostGraphDef::Node::compute_cost is microseconds, while 321 // OpPerformance.compute_cost is nanoseconds. 322 perf->set_compute_cost(cost_node->compute_cost() * 1000); 323 perf->set_compute_time(cost_node->compute_time() * 1000); 324 perf->set_memory_time(cost_node->memory_time() * 1000); 325 326 for (const auto& output_info : cost_node->output_info()) { 327 perf->mutable_op_memory()->add_output_memory(output_info.size()); 328 } 329 330 perf->mutable_op_memory()->set_temp_memory( 331 cost_node->temporary_memory_size()); 332 perf->mutable_op_memory()->set_persistent_memory( 333 cost_node->persistent_memory_size()); 334 } 335 return ret; 336 } 337 338 void TensorSizeHistogram::Add(const uint64 value) { 339 num_elem_++; 340 sum_elem_ += value; 341 min_ = std::min(min_, value); 342 max_ = std::max(max_, value); 343 buckets_[Index(value)]++; 344 } 345 346 void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) { 347 num_elem_ += src.num_elem_; 348 sum_elem_ += src.sum_elem_; 349 min_ = std::min(min_, src.min_); 350 max_ = std::max(max_, src.max_); 351 std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(), 352 buckets_.begin(), std::plus<uint64>()); 353 } 354 355 string TensorSizeHistogram::ToString() const { 356 string r; 357 char buf[200]; 358 snprintf(buf, sizeof(buf), "Count: %lld, Average: ", num_elem_); 359 r.append(buf); 360 r.append(strings::HumanReadableNumBytes(Average())); 361 r.append(", Min: "); 362 r.append(strings::HumanReadableNumBytes(min_)); 363 r.append(", Max: "); 364 r.append(strings::HumanReadableNumBytes(max_)); 365 r.append("\n------------------------------------------------------\n"); 366 const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0; 367 uint64 cumul_sum = 0; 368 369 const int size_string_width = 12; 370 for (int i = 0; i < buckets_.size(); i++) { 371 if (buckets_[i] == 0) continue; 372 cumul_sum += buckets_[i]; 373 r.append("[ "); 374 if (i == 0) { 375 r.append(size_string_width - 2, ' '); 376 r.append("0B"); 377 } else { 378 uint64 left = 1ULL << (i - 1); 379 const auto left_string = strings::HumanReadableNumBytes(left); 380 r.append(size_string_width - left_string.size(), ' '); 381 r.append(left_string); 382 } 383 r.append(", "); 384 uint64 right = 1ULL << i; 385 const auto right_string = strings::HumanReadableNumBytes(right); 386 r.append(size_string_width - right_string.size(), ' '); 387 r.append(right_string); 388 snprintf(buf, sizeof(buf), ") %7lld %7.3f%% %7.3f%% ", 389 buckets_[i], // count 390 mult * buckets_[i], // percentage 391 mult * cumul_sum); // cum percentage 392 r.append(buf); 393 394 // Add hash marks based on percentage; 40 marks for 100%. 395 auto marks = static_cast<int>( 396 (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_)); 397 r.append(marks, '#'); 398 r.push_back('\n'); 399 } 400 return r; 401 } 402 403 const int TensorSizeHistogram::Index(const uint64 value) const { 404 // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ... 405 const auto index = Log2Floor64(value) + 1; 406 return std::min(index, kMaxBuckets - 1); 407 } 408 409 string GetDeviceClassForNonChannelDevice(const string& device_name) { 410 DeviceNameUtils::ParsedName parsed_name; 411 bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name); 412 if (!parsed) { 413 string name = str_util::StringReplace(device_name, "/job_", "/job:", true); 414 name = str_util::StringReplace(name, "/replica_", "/replica:", true); 415 name = str_util::StringReplace(name, "/task_", "/task:", true); 416 name = str_util::StringReplace(name, "/device_", "/device:", true); 417 name = str_util::StringReplace(name, "GPU_", "GPU:", true); 418 name = str_util::StringReplace(name, "CPU_", "CPU:", true); 419 name = str_util::StringReplace(name, "gpu_", "gpu:", true); 420 name = str_util::StringReplace(name, "cpu_", "cpu:", true); 421 parsed = DeviceNameUtils::ParseFullName(name, &parsed_name); 422 } 423 if (parsed) { 424 const string jobname = parsed_name.has_job ? parsed_name.job : ""; 425 return strings::StrCat("/", jobname, "/", parsed_name.type); 426 } else { 427 return "Unclassified"; 428 } 429 } 430 431 string GetDeviceClass(const string& device_name) { 432 // TODO(dyoon): channel device name follows the convention we currently have 433 // in VirtualScheduler. This should be revised with VirtualScheduler as well 434 // as VirtualPlacer in the future. 435 if (device_name.find("Channel") != string::npos) { 436 const string from = "_from_"; 437 const string to = "_to_"; 438 const auto from_loc = device_name.find(from); 439 const auto to_loc = device_name.find(to); 440 const auto src_device_full = device_name.substr( 441 from_loc + from.size(), to_loc - (from_loc + from.size())); 442 const auto dst_device_full = device_name.substr(to_loc + to.size()); 443 return strings::StrCat( 444 "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full), 445 " -> ", GetDeviceClassForNonChannelDevice(dst_device_full)); 446 } else { 447 return GetDeviceClassForNonChannelDevice(device_name); 448 } 449 } 450 451 string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata, 452 bool verbosity) { 453 // TODO(dyoon): print out other stats as needed. 454 std::ostringstream output; 455 456 // Tensor size histogram: 457 // if verbosity, it outputs per-device histogram, 458 // otherwise, only per-class histogram. 459 std::unordered_map<string, TensorSizeHistogram> device_to_hist_map; 460 const auto& step_stats = run_metadata.step_stats(); 461 for (const auto& dev_stat : step_stats.dev_stats()) { 462 const auto& device_name = dev_stat.device(); 463 auto& hist = device_to_hist_map[device_name]; 464 for (const auto& node_stat : dev_stat.node_stats()) { 465 for (const auto& node_output : node_stat.output()) { 466 // TODO(dyoon): Calculate tensor size from tensor_description's dtype 467 // and shape, instead of using optional allocation_description. 468 const auto size = node_output.tensor_description() 469 .allocation_description() 470 .allocated_bytes(); 471 hist.Add(size); 472 } 473 } 474 } 475 if (verbosity) { 476 output << "\n"; 477 output << "Per device tensor size histogram.\n"; 478 } 479 480 std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map; 481 for (const auto& device_hist : device_to_hist_map) { 482 const auto& device_name = device_hist.first; 483 const auto& hist = device_hist.second; 484 if (verbosity) { 485 output << "Device: " << device_name << "\n" << hist.ToString() << "\n"; 486 } 487 const auto device_class = GetDeviceClass(device_name); 488 auto it = device_class_to_hist_map.find(device_class); 489 if (it == device_class_to_hist_map.end()) { 490 device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist)); 491 } else { 492 it->second.Merge(hist); 493 } 494 } 495 output << "\n"; 496 output << "Aggregated per device / channel type tensor size histogram:\n"; 497 for (const auto& device_hist : device_class_to_hist_map) { 498 const auto& device_name = device_hist.first; 499 const auto& hist = device_hist.second; 500 output << "Device: " << device_name << "\n" << hist.ToString() << "\n"; 501 } 502 output << "\n"; 503 504 return output.str(); 505 } 506 507 } // end namespace grappler 508 } // end namespace tensorflow 509