1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/debug/debug_graph_utils.h" 17 18 #include "tensorflow/core/common_runtime/memory_types.h" 19 #include "tensorflow/core/framework/kernel_def.pb.h" 20 #include "tensorflow/core/framework/node_def_builder.h" 21 #include "tensorflow/core/framework/op_kernel.h" 22 #include "tensorflow/core/graph/node_builder.h" 23 #include "tensorflow/core/lib/strings/strcat.h" 24 #include "tensorflow/core/protobuf/debug.pb.h" 25 26 namespace tensorflow { 27 28 namespace { 29 30 // TODO(cais): Switch to safe_strtob when available. 31 Status ParseBoolString(const string& bool_str, bool* bool_val) { 32 const string lower_bool_str = str_util::Lowercase(bool_str); 33 if (lower_bool_str == "false" || lower_bool_str == "f" || 34 lower_bool_str == "0") { 35 *bool_val = false; 36 } else if (lower_bool_str == "true" || lower_bool_str == "t" || 37 lower_bool_str == "1") { 38 *bool_val = true; 39 } else { 40 return errors::InvalidArgument("Invalid string for bool value: ", bool_str); 41 } 42 return Status::OK(); 43 } 44 45 } // namespace 46 47 // static 48 Status DebugNodeInserter::InsertNodes( 49 const protobuf::RepeatedPtrField<DebugTensorWatch>& watches, Graph* graph, 50 Device* device) { 51 // TODO(cais): This method is getting too large in size. 52 // Refactor it with helpers. 53 54 if (watches.empty()) { 55 // Nothing to do: Return OK right away. 56 return Status::OK(); 57 } 58 59 // A map from tensor name (e.g., "node_a:0") to list of debug op names 60 // (e.g., {"DebugIdentity", "DebugNanCount"}) 61 std::unordered_map<string, std::vector<string>> tensor_watches; 62 // A map from tensor name to debug_url. 63 std::unordered_map<string, std::vector<string>> tensor_watch_urls; 64 std::unordered_map<string, bool> tensor_tolerate_failures; 65 66 // Cache the proto content for fast lookup later 67 for (const DebugTensorWatch& watch : watches) { 68 if (watch.output_slot() < 0) { 69 // The semantics of output_slot == -1 is that the node is watched only 70 // for completion, but not for output tensor values (see 71 // NodeCompletionCallback in debug_gateway.h). 72 continue; 73 } 74 if (watch.debug_ops().empty()) { 75 continue; 76 } 77 78 string tensor_name = 79 strings::StrCat(watch.node_name(), ":", watch.output_slot()); 80 81 std::vector<string> debug_ops; 82 for (const string& debug_op : watch.debug_ops()) { 83 debug_ops.push_back(debug_op); 84 } 85 86 tensor_watches[tensor_name] = debug_ops; 87 tensor_tolerate_failures[tensor_name] = 88 watch.tolerate_debug_op_creation_failures(); 89 90 std::vector<string> urls; 91 for (const string& url : watch.debug_urls()) { 92 urls.push_back(url); 93 } 94 tensor_watch_urls[tensor_name] = urls; 95 } 96 97 if (tensor_watches.empty()) { 98 return Status::OK(); 99 } 100 101 DeviceType device_type = DeviceType{device->device_type()}; 102 103 // Keep track of all edges to be removed. 104 std::vector<const Edge*> edges_to_remove; 105 106 for (Node* src_node : graph->nodes()) { 107 // Make a map from output slot to outgoing edges from the slot. 108 std::unordered_map<int, std::vector<const Edge*>> output_slot_to_edges; 109 for (const Edge* edge : src_node->out_edges()) { 110 const int src_output = edge->src_output(); 111 if (output_slot_to_edges.find(src_output) == output_slot_to_edges.end()) { 112 output_slot_to_edges[src_output] = {edge}; 113 } else { 114 output_slot_to_edges[src_output].push_back(edge); 115 } 116 } 117 118 // Iterate through all output slots of the node. 119 for (int src_output_slot = 0; src_output_slot < src_node->num_outputs(); 120 ++src_output_slot) { 121 const string tensor_name = 122 strings::StrCat(src_node->name(), ":", src_output_slot); 123 if (tensor_watches.find(tensor_name) == tensor_watches.end()) { 124 // Add debug nodes only for edges with matching source node and source 125 // output slot. 126 continue; 127 } 128 129 // Now we have encountered a watched tensor. We will: 130 // 1) Mark this edge as to be removed, iff this is a non-Reference 131 // tensor 132 // 2) Create a Copy node for the tensor 133 // 3) Add a new edge, from the source tensor to the Copy node 134 // 4) Add a new edge, from the Copy node to the destination node, iff 135 // this is a non-Reference tensor. 136 // 5) Create all the requested debug nodes and their edges to the Copy 137 // node. 138 // 6) Add control edges from the debug nodes to the destination nodes 139 // to ensure that the tensors values exported by the debug nodes 140 // to the debug URLs reflect the values before the execution of 141 // the destination nodes. 142 143 const DataType src_dt = src_node->output_type(src_output_slot); 144 MemoryType memory_type; 145 TF_RETURN_IF_ERROR(MemoryTypeForOutput(device_type, graph, src_node, 146 src_output_slot, &memory_type)); 147 148 // Create the copy node for the watched tensor. 149 Node* copy_node; 150 Status copy_s = CreateCopyNode( 151 graph, device_type, memory_type == HOST_MEMORY, src_node->name(), 152 src_output_slot, src_dt, tensor_name, tensor_watches[tensor_name], 153 tensor_watch_urls[tensor_name], ©_node); 154 if (!copy_s.ok()) { 155 return Status( 156 error::FAILED_PRECONDITION, 157 strings::StrCat("Failed to create Copy/CopyHost node for tensor ", 158 tensor_name, ", due to: ", copy_s.error_message())); 159 } 160 161 // Add edge from watched tensor to the copy node. 162 graph->AddEdge(src_node, src_output_slot, copy_node, 0); 163 164 // Create all requested debug nodes and their edges to the Copy node. 165 std::vector<Node*> debug_nodes; 166 for (size_t i = 0; i < tensor_watches[tensor_name].size(); ++i) { 167 const string& debug_op_name = tensor_watches[tensor_name][i]; 168 169 Node* debug_node; 170 Status debug_s = CreateDebugNode( 171 graph, *device, copy_node->name(), src_dt, tensor_name, 172 tensor_watch_urls[tensor_name], i, debug_op_name, &debug_node); 173 if (debug_s.ok()) { 174 graph->AddEdge(copy_node, 0, debug_node, 0); 175 debug_nodes.push_back(debug_node); 176 } else { 177 if (tensor_tolerate_failures[tensor_name]) { 178 LOG(INFO) << "Tolerating failure to create debug node: " 179 << "tensor name = " << tensor_name << "; " 180 << "debug op name = " << debug_op_name; 181 } else { 182 return Status( 183 error::FAILED_PRECONDITION, 184 strings::StrCat("Failed to create debug node ", debug_op_name, 185 " for tensor ", tensor_name, 186 ", due to: ", debug_s.error_message())); 187 } 188 } 189 } 190 191 // Is the output a reference? 192 const bool is_ref = IsRefType(src_node->output_type(src_output_slot)); 193 194 // Iterate through all outgoing edges attached to the slot. 195 for (const Edge* edge : output_slot_to_edges[src_output_slot]) { 196 // Mark the edge for removal. 197 if (!is_ref) { 198 edges_to_remove.push_back(edge); 199 graph->AddEdge(copy_node, 0, edge->dst(), edge->dst_input()); 200 } 201 202 // Add control edges from the debug nodes to the destination node 203 // to ensure that the debug nodes are executed before the destination 204 // node. Skip Enter and NextIteration ops to avoid hanging. 205 for (Node* debug_node : debug_nodes) { 206 if (!src_node->IsEnter() && !src_node->IsNextIteration()) { 207 graph->AddEdge(debug_node, Graph::kControlSlot, edge->dst(), 208 Graph::kControlSlot); 209 } 210 } 211 } 212 } 213 } 214 215 // Remove all edges marked for removal. 216 for (const Edge* edge : edges_to_remove) { 217 graph->RemoveEdge(edge); 218 } 219 220 return Status::OK(); 221 } 222 223 void DebugNodeInserter::DeparallelizeWhileLoops(Graph* graph, Device* device) { 224 bool deparallelized_a_loop = false; 225 for (Node* node : graph->nodes()) { 226 if (node->IsEnter()) { 227 const AttrValue* parallel_iterations = 228 node->attrs().Find("parallel_iterations"); 229 if (parallel_iterations && parallel_iterations->i() > 1) { 230 deparallelized_a_loop = true; 231 VLOG(1) << "Changing the parallel_iterations attribute of the " 232 << "Enter/RefEnter node \"" << node->name() << "\" on device \"" 233 << device->name() << "\" from " << parallel_iterations->i() 234 << " to 1."; 235 node->AddAttr<int64>("parallel_iterations", 1); 236 } 237 } 238 } 239 if (deparallelized_a_loop) { 240 LOG(INFO) << "For debugging, tfdbg has set the parallel_iterations " 241 << "attribute of all scheduled Enter/RefEnter nodes to 1. (This " 242 << "does not affect subsequent non-debug runs.)"; 243 } 244 } 245 246 // static 247 const string DebugNodeInserter::GetCopyNodeName(const string& node_name, 248 const int output_slot) { 249 // For example, if the watched node is named "node1" and the output slot 250 // is 0, the debug node will be called: __copy_node1_0 251 return strings::StrCat("__copy_", node_name, "_", output_slot); 252 } 253 254 // static 255 const string DebugNodeInserter::GetDebugNodeName(const string& tensor_name, 256 const int debug_op_num, 257 const string& debug_op_name) { 258 // For example, if the watched node is named "node1" and the debug op that 259 // watches the output slot of node1 is of the type "DebugNanCount", the 260 // debug node will be called: __dbg_node1_0_0_DebugNanCount. 261 return strings::StrCat("__dbg_", tensor_name, "_", debug_op_num, "_", 262 debug_op_name); 263 } 264 265 // static 266 Status DebugNodeInserter::CreateCopyNode( 267 Graph* graph, const DeviceType device_type, const bool is_host_memory, 268 const string& src_node_name, const int src_output, const DataType src_dt, 269 const string& tensor_name, const std::vector<string>& debug_ops, 270 const std::vector<string>& debug_urls, Node** copy_node) { 271 const string kGatedGrpcAttributeKey = "gated_grpc"; 272 273 NodeDef node_def; 274 const KernelDef* kdef; 275 276 const string copy_op_name = is_host_memory ? "CopyHost" : "Copy"; 277 const string copy_node_name = GetCopyNodeName(src_node_name, src_output); 278 279 // Cross debug_ops and debug_urls to get the list of debug ops and watches. 280 std::vector<string> debug_ops_spec; 281 for (const string& debug_op : debug_ops) { 282 for (const string& debug_url : debug_urls) { 283 string debug_op_name_proper; 284 std::unordered_map<string, string> custom_attributes; 285 TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op, &debug_op_name_proper, 286 &custom_attributes)); 287 288 bool gated_grpc_value = false; 289 if (custom_attributes.find(kGatedGrpcAttributeKey) != 290 custom_attributes.end()) { 291 TF_RETURN_IF_ERROR(ParseBoolString( 292 custom_attributes[kGatedGrpcAttributeKey], &gated_grpc_value)); 293 } 294 debug_ops_spec.push_back(strings::StrCat(debug_op_name_proper, ";", 295 debug_url, ";", 296 gated_grpc_value ? "1" : "0")); 297 } 298 } 299 300 auto builder = NodeDefBuilder(copy_node_name, copy_op_name) 301 .Input(src_node_name, src_output, src_dt) 302 .Attr("debug_ops_spec", std::move(debug_ops_spec)); 303 304 if (!builder.Finalize(&node_def).ok()) { 305 return Status( 306 error::FAILED_PRECONDITION, 307 strings::StrCat("Failed to create node definition ", "for copy op ", 308 copy_node_name, " on watched tensor ", tensor_name)); 309 } 310 Status s = FindKernelDef(device_type, node_def, &kdef, nullptr); 311 312 if (!s.ok()) { 313 return Status( 314 error::FAILED_PRECONDITION, 315 strings::StrCat("Failed to find kernel definition ", "for copy op ", 316 copy_node_name, " on watched tensor ", tensor_name)); 317 } 318 if (!NodeBuilder(builder).Finalize(graph, copy_node).ok()) { 319 return Status(error::FAILED_PRECONDITION, 320 strings::StrCat("Failed to create copy node ", copy_node_name, 321 " on watched tensor ", tensor_name)); 322 } 323 324 return Status::OK(); 325 } 326 327 // static 328 Status DebugNodeInserter::ParseDebugOpName( 329 const string& debug_op_name, string* debug_op_name_proper, 330 std::unordered_map<string, string>* attributes) { 331 const size_t l_index = debug_op_name.find('('); 332 const size_t r_index = debug_op_name.find(')'); 333 if (l_index == string::npos && r_index == string::npos) { 334 *debug_op_name_proper = debug_op_name; 335 } else { 336 if (l_index == string::npos || l_index == 0 || 337 r_index != debug_op_name.size() - 1) { 338 return errors::InvalidArgument("Malformed debug op name \"", 339 debug_op_name, "\""); 340 } 341 342 *debug_op_name_proper = debug_op_name.substr(0, l_index); 343 string arguments = debug_op_name.substr(l_index + 1, r_index - l_index - 1); 344 345 std::vector<string> attribute_segs = str_util::Split(arguments, ";"); 346 for (const string& attribute_seg : attribute_segs) { 347 StringPiece seg(attribute_seg); 348 str_util::RemoveWhitespaceContext(&seg); 349 if (seg.empty()) { 350 continue; 351 } 352 353 const size_t eq_index = seg.find('='); 354 if (eq_index == string::npos) { 355 return errors::InvalidArgument( 356 "Malformed attributes in debug op name \"", debug_op_name, "\""); 357 } 358 359 const string key = seg.substr(0, eq_index).ToString(); 360 const string value = 361 seg.substr(eq_index + 1, attribute_seg.size() - eq_index - 1) 362 .ToString(); 363 if (key.empty() || value.empty()) { 364 return errors::InvalidArgument( 365 "Malformed attributes in debug op name \"", debug_op_name, "\""); 366 } 367 368 if (attributes->find(key) == attributes->end()) { 369 (*attributes)[key] = value; 370 } else { 371 return errors::InvalidArgument("Duplicate attribute name \"", key, 372 "\" found in the debug op: \"", 373 debug_op_name, "\""); 374 } 375 } 376 } 377 return Status::OK(); 378 } 379 380 // static 381 Status DebugNodeInserter::SetDebugNodeAttributes( 382 Node* debug_node, const std::unordered_map<string, string>& attributes) { 383 std::unordered_set<string> unfulfilled_keys; 384 for (const auto& item : attributes) { 385 unfulfilled_keys.insert(item.first); 386 } 387 388 for (const auto& attr : debug_node->op_def().attr()) { 389 if (attributes.find(attr.name()) != attributes.end()) { 390 const string& attr_value = attributes.at(attr.name()); 391 if (attr.type() == "string") { 392 debug_node->AddAttr<string>(attr.name(), attr_value); 393 } else if (attr.type() == "float") { 394 float float_value = 0.0; 395 if (!::tensorflow::strings::safe_strtof(attr_value.c_str(), 396 &float_value)) { 397 return errors::InvalidArgument( 398 "Invalid value string for float-type attribute ", attr.name(), 399 "of debug node ", debug_node->name(), ": \"", attr_value, "\""); 400 } 401 debug_node->AddAttr<float>(attr.name(), float_value); 402 } else if (attr.type() == "int") { 403 int64 int_value = 0; 404 if (!::tensorflow::strings::safe_strto64(attr_value, &int_value)) { 405 return errors::InvalidArgument( 406 "Invalid value string for int-type attribute ", attr.name(), 407 "of debug node ", debug_node->name(), ": \"", attr_value, "\""); 408 } 409 debug_node->AddAttr<int>(attr.name(), int_value); 410 } else if (attr.type() == "bool") { 411 bool bool_value; 412 if (!ParseBoolString(attr_value, &bool_value).ok()) { 413 return errors::InvalidArgument( 414 "Invalid value string for bool-type attribute ", attr.name(), 415 "of debug node ", debug_node->name(), ": \"", attr_value, "\""); 416 } 417 debug_node->AddAttr<bool>(attr.name(), bool_value); 418 } else { 419 return errors::InvalidArgument( 420 "Unsupported type of custom attribute for debug ops: ", 421 attr.type()); 422 } 423 424 unfulfilled_keys.erase(attr.name()); 425 } 426 } 427 428 if (unfulfilled_keys.empty()) { 429 return Status::OK(); 430 } else { 431 return errors::InvalidArgument( 432 unfulfilled_keys.size(), 433 " attribute key(s) were not valid for debug node ", debug_node->name(), 434 ": ", str_util::Join(unfulfilled_keys, ", ")); 435 } 436 } 437 438 // static 439 Status DebugNodeInserter::CreateDebugNode( 440 Graph* graph, const Device& device, const string& src_copy_node_name, 441 const DataType src_dt, const string& tensor_name, 442 const std::vector<string>& debug_urls, const int debug_op_num, 443 const string& debug_op_name, Node** debug_node) { 444 NodeDef node_def; 445 const KernelDef* kdef; 446 447 string debug_op_name_proper; 448 std::unordered_map<string, string> custom_attributes; 449 TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op_name, &debug_op_name_proper, 450 &custom_attributes)); 451 452 const string debug_node_name = 453 GetDebugNodeName(tensor_name, debug_op_num, debug_op_name_proper); 454 auto builder = NodeDefBuilder(debug_node_name, debug_op_name_proper) 455 .Input(src_copy_node_name, 0, src_dt) 456 .Attr("device_name", device.name()) 457 .Attr("tensor_name", tensor_name) 458 .Attr("debug_urls", debug_urls); 459 460 if (!builder.Finalize(&node_def).ok()) { 461 return errors::FailedPrecondition( 462 "Failed to create node definition for debug op ", debug_op_name_proper, 463 " on watched tensor ", tensor_name); 464 } 465 if (!FindKernelDef(DeviceType(device.device_type()), node_def, &kdef, nullptr) 466 .ok()) { 467 return errors::FailedPrecondition( 468 "Failed to find kernel definition for debug op ", debug_op_name_proper, 469 " on watched tensor ", tensor_name); 470 } 471 if (!NodeBuilder(builder).Finalize(graph, debug_node).ok()) { 472 return errors::FailedPrecondition("Failed to create debug node ", 473 debug_op_name_proper, 474 " on watched tensor ", tensor_name); 475 } 476 477 // Set custom attributes (if any). 478 if (!custom_attributes.empty()) { 479 TF_RETURN_IF_ERROR(SetDebugNodeAttributes(*debug_node, custom_attributes)); 480 } 481 482 return Status::OK(); 483 } 484 485 } // namespace tensorflow 486