1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/grappler/costs/utils.h" 17 18 #include <stddef.h> 19 #include <utility> 20 21 #include "third_party/eigen3/Eigen/Core" 22 23 #if GOOGLE_CUDA 24 #include "cuda/include/cuda.h" 25 #include "cuda/include/cuda_runtime_api.h" 26 #include "cuda/include/cudnn.h" 27 #endif 28 29 #include "tensorflow/core/framework/allocation_description.pb.h" 30 #include "tensorflow/core/framework/attr_value.pb.h" 31 #include "tensorflow/core/framework/op.h" 32 #include "tensorflow/core/framework/op_def.pb.h" 33 #include "tensorflow/core/framework/step_stats.pb.h" 34 #include "tensorflow/core/framework/tensor.pb.h" 35 #include "tensorflow/core/framework/tensor_description.pb.h" 36 #include "tensorflow/core/framework/tensor_shape.pb.h" 37 #include "tensorflow/core/framework/types.pb.h" 38 #include "tensorflow/core/graph/graph.h" 39 #include "tensorflow/core/graph/tensor_id.h" 40 #include "tensorflow/core/grappler/clusters/utils.h" 41 #include "tensorflow/core/grappler/utils.h" 42 #include "tensorflow/core/lib/core/bits.h" 43 #include "tensorflow/core/lib/strings/numbers.h" 44 #include "tensorflow/core/lib/strings/strcat.h" 45 #include "tensorflow/core/platform/cpu_info.h" 46 #include "tensorflow/core/platform/env.h" 47 #include "tensorflow/core/platform/logging.h" 48 #include "tensorflow/core/platform/protobuf.h" 49 #include "tensorflow/core/protobuf/config.pb.h" 50 #include "tensorflow/core/util/device_name_utils.h" 51 52 namespace tensorflow { 53 namespace grappler { 54 55 static OpInfo::TensorProperties UnknownInput() { 56 OpInfo::TensorProperties input; 57 input.set_dtype(DataType::DT_INVALID); 58 input.mutable_shape()->set_unknown_rank(true); 59 return input; 60 } 61 62 static std::vector<TensorProto> ExtractTensors(const AttrValue& attr_value) { 63 std::vector<TensorProto> tensors; 64 switch (attr_value.value_case()) { 65 case AttrValue::kTensor: { 66 tensors.push_back(attr_value.tensor()); 67 break; 68 } 69 case AttrValue::kList: { 70 for (const auto& tensor_proto : attr_value.list().tensor()) { 71 tensors.push_back(tensor_proto); 72 } 73 break; 74 } 75 default: {} 76 } 77 return tensors; 78 } 79 80 // Annotate the op_info inputs with extra information when possible (e.g. the 81 // input value if it's known statically). 82 static void ExtractExtraProperties( 83 const NodeDef& node, 84 const std::unordered_map<string, const NodeDef*>& name_to_node, 85 OpInfo* op_info) { 86 OpRegistry* op_registry = OpRegistry::Global(); 87 const OpDef* op_def = nullptr; 88 auto s = op_registry->LookUpOpDef(node.op(), &op_def); 89 if (!s.ok()) { 90 op_def = nullptr; 91 } 92 93 for (int i = 0; i < node.input_size(); ++i) { 94 const string input_name = node.input(i); 95 CHECK(!input_name.empty()); 96 if (IsControlInput(input_name)) { 97 continue; 98 } 99 TensorId input_tensor_id = ParseTensorName(input_name); 100 const string input_node_name = input_tensor_id.first.ToString(); 101 102 auto iter = name_to_node.find(input_node_name); 103 if (iter == name_to_node.end()) continue; 104 const NodeDef* input_node = iter->second; 105 106 if (i >= op_info->inputs_size()) { 107 LOG(ERROR) << "OpInfo's inputs doesn't match the graph! OpInfo: " 108 << op_info->DebugString() 109 << "\nCurrent node: " << node.DebugString() 110 << "\nInput node: " << input_node->DebugString(); 111 } 112 113 // The value attribute in Const input is useful for cost prediction. 114 if (input_node->op() == "Const" && i < op_info->inputs_size()) { 115 auto it = input_node->attr().find("value"); 116 if (it == input_node->attr().end()) continue; 117 118 const AttrValue& attr_value = it->second; 119 std::vector<TensorProto> tensors = ExtractTensors(attr_value); 120 if (tensors.empty()) continue; 121 122 const TensorProto& t = tensors[0]; 123 OpInfo::TensorProperties* input = op_info->mutable_inputs(i); 124 *(input->mutable_value()) = t; 125 126 // For filename input, the file size can also be useful. 127 if (op_def && i < op_def->input_arg_size() && 128 op_def->input_arg(i).name().find("filename") != std::string::npos) { 129 Tensor tensor; 130 if (!tensor.FromProto(t)) { 131 continue; 132 } 133 if (tensor.NumElements() != 1) { 134 continue; 135 } 136 const string filename = tensor.scalar<string>()(); 137 138 Env* env = Env::Default(); 139 FileStatistics stat; 140 Status s = env->Stat(filename, &stat); 141 if (!s.ok()) { 142 continue; 143 } 144 AttrValue attr; 145 attr.set_i(stat.length); 146 string attr_key = strings::StrCat("input_", i, "_filesize"); 147 (*op_info->mutable_attr())[attr_key] = attr; 148 } 149 } 150 151 // When the input is a handle (e.g. look up table handle), the information 152 // in the op itself is not sufficient to predict the op memory. 153 if (op_def && i < op_def->input_arg_size() && 154 op_def->input_arg(i).name().find("handle") != std::string::npos) { 155 string new_key = strings::StrCat("parent_", i, "_op"); 156 AttrValue attr; 157 attr.set_s(input_node->op()); 158 (*op_info->mutable_attr())[new_key] = attr; 159 // TODO(yuefengz): Only parent node's op name is copied. Copy inputs 160 // and attributes when necessary. 161 } 162 } 163 } 164 165 std::vector<OpInfo::TensorProperties> FindInputFeatures( 166 const NodeDef& node, 167 const std::unordered_map<string, const CostGraphDef::Node*>& name_to_cost, 168 const std::unordered_map<string, const NodeDef*>& name_to_node) { 169 std::vector<OpInfo::TensorProperties> inputs; 170 for (const auto& input_name : node.input()) { 171 CHECK(!input_name.empty()); 172 TensorId input_tensor_id = ParseTensorName(input_name); 173 const string input_node_name = input_tensor_id.first.ToString(); 174 const int output_index = input_tensor_id.second; 175 176 // Skip control inputs. 177 if (output_index == Graph::kControlSlot) { 178 continue; 179 } 180 181 auto it = name_to_cost.find(input_node_name); 182 if (it == name_to_cost.end() || output_index < 0) { 183 inputs.push_back(UnknownInput()); 184 } else { 185 const CostGraphDef::Node* input_cost = it->second; 186 if (input_cost->output_info_size() == 0) { 187 inputs.push_back(UnknownInput()); 188 } else { 189 const CostGraphDef::Node::OutputInfo& output = 190 input_cost->output_info(output_index); 191 OpInfo::TensorProperties input; 192 input.set_dtype(output.dtype()); 193 *input.mutable_shape() = output.shape(); 194 inputs.push_back(input); 195 } 196 } 197 } 198 199 return inputs; 200 } 201 202 DeviceProperties GetDeviceInfo(const string& device_str) { 203 DeviceNameUtils::ParsedName parsed; 204 if (DeviceNameUtils::ParseFullName(device_str, &parsed)) { 205 if (parsed.type == "GPU") { 206 return GetLocalGPUInfo(parsed.id); 207 } else if (parsed.type == "CPU") { 208 return GetLocalCPUInfo(); 209 } 210 } 211 DeviceProperties device; 212 device.set_type("UNKNOWN"); 213 return device; 214 } 215 216 DeviceProperties GetDeviceInfo(const CostGraphDef::Node& node) { 217 return GetDeviceInfo(node.device()); 218 } 219 220 OpInfo BuildOpInfoWithoutDevice( 221 const NodeDef& node, 222 const std::unordered_map<string, const NodeDef*>& name_to_node, 223 const std::vector<OpInfo::TensorProperties>& inputs) { 224 OpInfo op_info; 225 op_info.set_op(node.op()); 226 *op_info.mutable_attr() = node.attr(); 227 for (auto& input : inputs) { 228 *op_info.add_inputs() = input; 229 } 230 ExtractExtraProperties(node, name_to_node, &op_info); 231 return op_info; 232 } 233 234 string GetOpDescription(const OpInfo& op_info) { 235 string description = "["; 236 description += "Op=" + op_info.op() + ", "; 237 description += "input_shapes=["; 238 for (auto const& input : op_info.inputs()) { 239 description += PartialTensorShape::DebugString(input.shape()); 240 } 241 description += "]"; 242 return description; 243 } 244 245 OpPerformanceList CostGraphToOpPerformanceData(const CostGraphDef& cost_graph, 246 const GraphDef& graph) { 247 OpPerformanceList ret; 248 std::unordered_map<string, const CostGraphDef::Node*> name_to_cost; 249 std::unordered_map<string, const NodeDef*> name_to_node; 250 for (auto& node : cost_graph.node()) { 251 name_to_cost[node.name()] = &node; 252 } 253 for (auto& node : graph.node()) { 254 name_to_node[node.name()] = &node; 255 } 256 257 for (const auto& node : graph.node()) { 258 // Skip the nodes that are not in the cost graph: these are nodes that 259 // aren't run, because they aren't in the intersection of transitive 260 // fan-in of a fetch node and the transitive fan-out of an input, or nodes 261 // that were optimized away by the optimizer. Since they don't contribute 262 // to the execution time we simply discard them. 263 auto it = name_to_cost.find(node.name()); 264 if (it == name_to_cost.end()) { 265 continue; 266 } 267 const CostGraphDef::Node* cost_node = it->second; 268 269 OpPerformance* perf = ret.add_op_performance(); 270 perf->set_node(node.name()); 271 272 std::vector<OpInfo::TensorProperties> inputs = 273 FindInputFeatures(node, name_to_cost, name_to_node); 274 *perf->mutable_op() = BuildOpInfoWithoutDevice(node, name_to_node, inputs); 275 *perf->mutable_op()->mutable_device() = GetDeviceInfo(cost_node->device()); 276 277 perf->set_temporary_memory_size(cost_node->temporary_memory_size()); 278 // Note that CostGraphDef::Node::compute_cost is microseconds, while 279 // OpPerformance.compute_cost is nanoseconds. 280 perf->set_compute_cost(cost_node->compute_cost() * 1000); 281 perf->set_compute_time(cost_node->compute_time() * 1000); 282 perf->set_memory_time(cost_node->memory_time() * 1000); 283 284 for (const auto& output_info : cost_node->output_info()) { 285 perf->mutable_op_memory()->add_output_memory(output_info.size()); 286 } 287 288 perf->mutable_op_memory()->set_temp_memory( 289 cost_node->temporary_memory_size()); 290 perf->mutable_op_memory()->set_persistent_memory( 291 cost_node->persistent_memory_size()); 292 } 293 return ret; 294 } 295 296 void TensorSizeHistogram::Add(const uint64 value) { 297 num_elem_++; 298 sum_elem_ += value; 299 min_ = std::min(min_, value); 300 max_ = std::max(max_, value); 301 buckets_[Index(value)]++; 302 } 303 304 void TensorSizeHistogram::Merge(const TensorSizeHistogram& src) { 305 num_elem_ += src.num_elem_; 306 sum_elem_ += src.sum_elem_; 307 min_ = std::min(min_, src.min_); 308 max_ = std::max(max_, src.max_); 309 std::transform(buckets_.begin(), buckets_.end(), src.buckets_.begin(), 310 buckets_.begin(), std::plus<uint64>()); 311 } 312 313 std::string TensorSizeHistogram::ToString() const { 314 std::string r; 315 char buf[200]; 316 snprintf(buf, sizeof(buf), "Count: %lld, Average: ", num_elem_); 317 r.append(buf); 318 r.append(strings::HumanReadableNumBytes(Average())); 319 r.append(", Min: "); 320 r.append(strings::HumanReadableNumBytes(min_)); 321 r.append(", Max: "); 322 r.append(strings::HumanReadableNumBytes(max_)); 323 r.append("\n------------------------------------------------------\n"); 324 const double mult = num_elem_ > 0 ? 100.0 / num_elem_ : 0.0; 325 uint64 cumul_sum = 0; 326 327 const int size_string_width = 12; 328 for (int i = 0; i < buckets_.size(); i++) { 329 if (buckets_[i] == 0) continue; 330 cumul_sum += buckets_[i]; 331 r.append("[ "); 332 if (i == 0) { 333 r.append(size_string_width - 2, ' '); 334 r.append("0B"); 335 } else { 336 uint64 left = 1ULL << (i - 1); 337 const auto left_string = strings::HumanReadableNumBytes(left); 338 r.append(size_string_width - left_string.size(), ' '); 339 r.append(left_string); 340 } 341 r.append(", "); 342 uint64 right = 1ULL << i; 343 const auto right_string = strings::HumanReadableNumBytes(right); 344 r.append(size_string_width - right_string.size(), ' '); 345 r.append(right_string); 346 snprintf(buf, sizeof(buf), ") %7lld %7.3f%% %7.3f%% ", 347 buckets_[i], // count 348 mult * buckets_[i], // percentage 349 mult * cumul_sum); // cum percentage 350 r.append(buf); 351 352 // Add hash marks based on percentage; 40 marks for 100%. 353 auto marks = static_cast<int>( 354 (static_cast<double>(40 * buckets_[i] + (num_elem_ >> 1)) / num_elem_)); 355 r.append(marks, '#'); 356 r.push_back('\n'); 357 } 358 return r; 359 } 360 361 const int TensorSizeHistogram::Index(const uint64 value) const { 362 // Log2Floor64 returns -1 for 0, 0 for 1, 1 for 2-3, 2 for 4-7, ... 363 const auto index = Log2Floor64(value) + 1; 364 return std::min(index, kMaxBuckets - 1); 365 } 366 367 string GetDeviceClassForNonChannelDevice(const string& device_name) { 368 DeviceNameUtils::ParsedName parsed_name; 369 bool parsed = DeviceNameUtils::ParseFullName(device_name, &parsed_name); 370 if (!parsed) { 371 string name = str_util::StringReplace(device_name, "/job_", "/job:", true); 372 name = str_util::StringReplace(name, "/replica_", "/replica:", true); 373 name = str_util::StringReplace(name, "/task_", "/task:", true); 374 name = str_util::StringReplace(name, "/device_", "/device:", true); 375 name = str_util::StringReplace(name, "GPU_", "GPU:", true); 376 name = str_util::StringReplace(name, "CPU_", "CPU:", true); 377 name = str_util::StringReplace(name, "gpu_", "gpu:", true); 378 name = str_util::StringReplace(name, "cpu_", "cpu:", true); 379 parsed = DeviceNameUtils::ParseFullName(name, &parsed_name); 380 } 381 if (parsed) { 382 const string jobname = parsed_name.has_job ? parsed_name.job : ""; 383 return strings::StrCat("/", jobname, "/", parsed_name.type); 384 } else { 385 return "Unclassified"; 386 } 387 } 388 389 string GetDeviceClass(const string& device_name) { 390 // TODO(dyoon): channel device name follows the convention we currently have 391 // in VirtualScheduler. This should be revised with VirtualScheduler as well 392 // as VirtualPlacer in the future. 393 if (device_name.find("Channel") != string::npos) { 394 const string from = "_from_"; 395 const string to = "_to_"; 396 const auto from_loc = device_name.find(from); 397 const auto to_loc = device_name.find(to); 398 const auto src_device_full = device_name.substr( 399 from_loc + from.size(), to_loc - (from_loc + from.size())); 400 const auto dst_device_full = device_name.substr(to_loc + to.size()); 401 return strings::StrCat( 402 "Channel", ": ", GetDeviceClassForNonChannelDevice(src_device_full), 403 " -> ", GetDeviceClassForNonChannelDevice(dst_device_full)); 404 } else { 405 return GetDeviceClassForNonChannelDevice(device_name); 406 } 407 } 408 409 string GetStatsStringFromRunMetadata(const RunMetadata& run_metadata, 410 bool verbosity) { 411 // TODO(dyoon): print out other stats as needed. 412 std::ostringstream output; 413 414 // Tensor size histogram: 415 // if verbosity, it outputs per-device histogram, 416 // otherwise, only per-class histogram. 417 std::unordered_map<string, TensorSizeHistogram> device_to_hist_map; 418 const auto& step_stats = run_metadata.step_stats(); 419 for (const auto& dev_stat : step_stats.dev_stats()) { 420 const auto& device_name = dev_stat.device(); 421 auto& hist = device_to_hist_map[device_name]; 422 for (const auto& node_stat : dev_stat.node_stats()) { 423 for (const auto& node_output : node_stat.output()) { 424 // TODO(dyoon): Calculate tensor size from tensor_description's dtype 425 // and shape, instead of using optional allocation_description. 426 const auto size = node_output.tensor_description() 427 .allocation_description() 428 .allocated_bytes(); 429 hist.Add(size); 430 } 431 } 432 } 433 if (verbosity) { 434 output << "\n"; 435 output << "Per device tensor size histogram.\n"; 436 } 437 438 std::unordered_map<string, TensorSizeHistogram> device_class_to_hist_map; 439 for (const auto& device_hist : device_to_hist_map) { 440 const auto& device_name = device_hist.first; 441 const auto& hist = device_hist.second; 442 if (verbosity) { 443 output << "Device: " << device_name << "\n" << hist.ToString() << "\n"; 444 } 445 const auto device_class = GetDeviceClass(device_name); 446 auto it = device_class_to_hist_map.find(device_class); 447 if (it == device_class_to_hist_map.end()) { 448 device_class_to_hist_map.emplace(device_class, TensorSizeHistogram(hist)); 449 } else { 450 it->second.Merge(hist); 451 } 452 } 453 output << "\n"; 454 output << "Aggregated per device / channel type tensor size histogram:\n"; 455 for (const auto& device_hist : device_class_to_hist_map) { 456 const auto& device_name = device_hist.first; 457 const auto& hist = device_hist.second; 458 output << "Device: " << device_name << "\n" << hist.ToString() << "\n"; 459 } 460 output << "\n"; 461 462 return output.str(); 463 } 464 465 } // end namespace grappler 466 } // end namespace tensorflow 467