Home | History | Annotate | Download | only in debug
      1 /* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/core/debug/debug_graph_utils.h"
     17 
     18 #include "tensorflow/core/common_runtime/memory_types.h"
     19 #include "tensorflow/core/framework/kernel_def.pb.h"
     20 #include "tensorflow/core/framework/node_def_builder.h"
     21 #include "tensorflow/core/framework/op_kernel.h"
     22 #include "tensorflow/core/graph/node_builder.h"
     23 #include "tensorflow/core/lib/strings/strcat.h"
     24 #include "tensorflow/core/protobuf/debug.pb.h"
     25 
     26 namespace tensorflow {
     27 
     28 namespace {
     29 
     30 // TODO(cais): Switch to safe_strtob when available.
     31 Status ParseBoolString(const string& bool_str, bool* bool_val) {
     32   const string lower_bool_str = str_util::Lowercase(bool_str);
     33   if (lower_bool_str == "false" || lower_bool_str == "f" ||
     34       lower_bool_str == "0") {
     35     *bool_val = false;
     36   } else if (lower_bool_str == "true" || lower_bool_str == "t" ||
     37              lower_bool_str == "1") {
     38     *bool_val = true;
     39   } else {
     40     return errors::InvalidArgument("Invalid string for bool value: ", bool_str);
     41   }
     42   return Status::OK();
     43 }
     44 
     45 }  // namespace
     46 
     47 // static
     48 Status DebugNodeInserter::InsertNodes(
     49     const protobuf::RepeatedPtrField<DebugTensorWatch>& watches, Graph* graph,
     50     Device* device) {
     51   // TODO(cais): This method is getting too large in size.
     52   // Refactor it with helpers.
     53 
     54   if (watches.empty()) {
     55     // Nothing to do: Return OK right away.
     56     return Status::OK();
     57   }
     58 
     59   // A map from tensor name (e.g., "node_a:0") to list of debug op names
     60   // (e.g., {"DebugIdentity", "DebugNanCount"})
     61   std::unordered_map<string, std::vector<string>> tensor_watches;
     62   // A map from tensor name to debug_url.
     63   std::unordered_map<string, std::vector<string>> tensor_watch_urls;
     64   std::unordered_map<string, bool> tensor_tolerate_failures;
     65 
     66   // Cache the proto content for fast lookup later
     67   for (const DebugTensorWatch& watch : watches) {
     68     if (watch.output_slot() < 0) {
     69       // The semantics of output_slot == -1 is that the node is watched only
     70       // for completion, but not for output tensor values (see
     71       // NodeCompletionCallback in debug_gateway.h).
     72       continue;
     73     }
     74     if (watch.debug_ops().empty()) {
     75       continue;
     76     }
     77 
     78     string tensor_name =
     79         strings::StrCat(watch.node_name(), ":", watch.output_slot());
     80 
     81     std::vector<string> debug_ops;
     82     for (const string& debug_op : watch.debug_ops()) {
     83       debug_ops.push_back(debug_op);
     84     }
     85 
     86     tensor_watches[tensor_name] = debug_ops;
     87     tensor_tolerate_failures[tensor_name] =
     88         watch.tolerate_debug_op_creation_failures();
     89 
     90     std::vector<string> urls;
     91     for (const string& url : watch.debug_urls()) {
     92       urls.push_back(url);
     93     }
     94     tensor_watch_urls[tensor_name] = urls;
     95   }
     96 
     97   if (tensor_watches.empty()) {
     98     return Status::OK();
     99   }
    100 
    101   DeviceType device_type = DeviceType{device->device_type()};
    102 
    103   // Keep track of all edges to be removed.
    104   std::vector<const Edge*> edges_to_remove;
    105 
    106   for (Node* src_node : graph->nodes()) {
    107     // Make a map from output slot to outgoing edges from the slot.
    108     std::unordered_map<int, std::vector<const Edge*>> output_slot_to_edges;
    109     for (const Edge* edge : src_node->out_edges()) {
    110       const int src_output = edge->src_output();
    111       if (output_slot_to_edges.find(src_output) == output_slot_to_edges.end()) {
    112         output_slot_to_edges[src_output] = {edge};
    113       } else {
    114         output_slot_to_edges[src_output].push_back(edge);
    115       }
    116     }
    117 
    118     // Iterate through all output slots of the node.
    119     for (int src_output_slot = 0; src_output_slot < src_node->num_outputs();
    120          ++src_output_slot) {
    121       const string tensor_name =
    122           strings::StrCat(src_node->name(), ":", src_output_slot);
    123       if (tensor_watches.find(tensor_name) == tensor_watches.end()) {
    124         // Add debug nodes only for edges with matching source node and source
    125         // output slot.
    126         continue;
    127       }
    128 
    129       // Now we have encountered a watched tensor. We will:
    130       //   1) Mark this edge as to be removed, iff this is a non-Reference
    131       //      tensor
    132       //   2) Create a Copy node for the tensor
    133       //   3) Add a new edge, from the source tensor to the Copy node
    134       //   4) Add a new edge, from the Copy node to the destination node, iff
    135       //      this is a non-Reference tensor.
    136       //   5) Create all the requested debug nodes and their edges to the Copy
    137       //      node.
    138       //   6) Add control edges from the debug nodes to the destination nodes
    139       //      to ensure that the tensors values exported by the debug nodes
    140       //      to the debug URLs reflect the values before the execution of
    141       //      the destination nodes.
    142 
    143       const DataType src_dt = src_node->output_type(src_output_slot);
    144       MemoryType memory_type;
    145       TF_RETURN_IF_ERROR(MemoryTypeForOutput(device_type, graph, src_node,
    146                                              src_output_slot, &memory_type));
    147 
    148       // Create the copy node for the watched tensor.
    149       Node* copy_node;
    150       Status copy_s = CreateCopyNode(
    151           graph, device_type, memory_type == HOST_MEMORY, src_node->name(),
    152           src_output_slot, src_dt, tensor_name, tensor_watches[tensor_name],
    153           tensor_watch_urls[tensor_name], &copy_node);
    154       if (!copy_s.ok()) {
    155         return Status(
    156             error::FAILED_PRECONDITION,
    157             strings::StrCat("Failed to create Copy/CopyHost node for tensor ",
    158                             tensor_name, ", due to: ", copy_s.error_message()));
    159       }
    160 
    161       // Add edge from watched tensor to the copy node.
    162       graph->AddEdge(src_node, src_output_slot, copy_node, 0);
    163 
    164       // Create all requested debug nodes and their edges to the Copy node.
    165       std::vector<Node*> debug_nodes;
    166       for (size_t i = 0; i < tensor_watches[tensor_name].size(); ++i) {
    167         const string& debug_op_name = tensor_watches[tensor_name][i];
    168 
    169         Node* debug_node;
    170         Status debug_s = CreateDebugNode(
    171             graph, *device, copy_node->name(), src_dt, tensor_name,
    172             tensor_watch_urls[tensor_name], i, debug_op_name, &debug_node);
    173         if (debug_s.ok()) {
    174           graph->AddEdge(copy_node, 0, debug_node, 0);
    175           debug_nodes.push_back(debug_node);
    176         } else {
    177           if (tensor_tolerate_failures[tensor_name]) {
    178             LOG(INFO) << "Tolerating failure to create debug node: "
    179                       << "tensor name = " << tensor_name << "; "
    180                       << "debug op name = " << debug_op_name;
    181           } else {
    182             return Status(
    183                 error::FAILED_PRECONDITION,
    184                 strings::StrCat("Failed to create debug node ", debug_op_name,
    185                                 " for tensor ", tensor_name,
    186                                 ", due to: ", debug_s.error_message()));
    187           }
    188         }
    189       }
    190 
    191       // Is the output a reference?
    192       const bool is_ref = IsRefType(src_node->output_type(src_output_slot));
    193 
    194       // Iterate through all outgoing edges attached to the slot.
    195       for (const Edge* edge : output_slot_to_edges[src_output_slot]) {
    196         // Mark the edge for removal.
    197         if (!is_ref) {
    198           edges_to_remove.push_back(edge);
    199           graph->AddEdge(copy_node, 0, edge->dst(), edge->dst_input());
    200         }
    201 
    202         // Add control edges from the debug nodes to the destination node
    203         // to ensure that the debug nodes are executed before the destination
    204         // node. Skip Enter and NextIteration ops to avoid hanging.
    205         for (Node* debug_node : debug_nodes) {
    206           if (!src_node->IsEnter() && !src_node->IsNextIteration()) {
    207             graph->AddEdge(debug_node, Graph::kControlSlot, edge->dst(),
    208                            Graph::kControlSlot);
    209           }
    210         }
    211       }
    212     }
    213   }
    214 
    215   // Remove all edges marked for removal.
    216   for (const Edge* edge : edges_to_remove) {
    217     graph->RemoveEdge(edge);
    218   }
    219 
    220   return Status::OK();
    221 }
    222 
    223 void DebugNodeInserter::DeparallelizeWhileLoops(Graph* graph, Device* device) {
    224   bool deparallelized_a_loop = false;
    225   for (Node* node : graph->nodes()) {
    226     if (node->IsEnter()) {
    227       const AttrValue* parallel_iterations =
    228           node->attrs().Find("parallel_iterations");
    229       if (parallel_iterations && parallel_iterations->i() > 1) {
    230         deparallelized_a_loop = true;
    231         VLOG(1) << "Changing the parallel_iterations attribute of the "
    232                 << "Enter/RefEnter node \"" << node->name() << "\" on device \""
    233                 << device->name() << "\" from " << parallel_iterations->i()
    234                 << " to 1.";
    235         node->AddAttr<int64>("parallel_iterations", 1);
    236       }
    237     }
    238   }
    239   if (deparallelized_a_loop) {
    240     LOG(INFO) << "For debugging, tfdbg has set the parallel_iterations "
    241               << "attribute of all scheduled Enter/RefEnter nodes to 1. (This "
    242               << "does not affect subsequent non-debug runs.)";
    243   }
    244 }
    245 
    246 // static
    247 const string DebugNodeInserter::GetCopyNodeName(const string& node_name,
    248                                                 const int output_slot) {
    249   // For example, if the watched node is named "node1" and the output slot
    250   // is 0, the debug node will be called: __copy_node1_0
    251   return strings::StrCat("__copy_", node_name, "_", output_slot);
    252 }
    253 
    254 // static
    255 const string DebugNodeInserter::GetDebugNodeName(const string& tensor_name,
    256                                                  const int debug_op_num,
    257                                                  const string& debug_op_name) {
    258   // For example, if the watched node is named "node1" and the debug op that
    259   // watches the output slot of node1 is of the type "DebugNanCount", the
    260   // debug node will be called: __dbg_node1_0_0_DebugNanCount.
    261   return strings::StrCat("__dbg_", tensor_name, "_", debug_op_num, "_",
    262                          debug_op_name);
    263 }
    264 
    265 // static
    266 Status DebugNodeInserter::CreateCopyNode(
    267     Graph* graph, const DeviceType device_type, const bool is_host_memory,
    268     const string& src_node_name, const int src_output, const DataType src_dt,
    269     const string& tensor_name, const std::vector<string>& debug_ops,
    270     const std::vector<string>& debug_urls, Node** copy_node) {
    271   const string kGatedGrpcAttributeKey = "gated_grpc";
    272 
    273   NodeDef node_def;
    274   const KernelDef* kdef;
    275 
    276   const string copy_op_name = is_host_memory ? "CopyHost" : "Copy";
    277   const string copy_node_name = GetCopyNodeName(src_node_name, src_output);
    278 
    279   // Cross debug_ops and debug_urls to get the list of debug ops and watches.
    280   std::vector<string> debug_ops_spec;
    281   for (const string& debug_op : debug_ops) {
    282     for (const string& debug_url : debug_urls) {
    283       string debug_op_name_proper;
    284       std::unordered_map<string, string> custom_attributes;
    285       TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op, &debug_op_name_proper,
    286                                           &custom_attributes));
    287 
    288       bool gated_grpc_value = false;
    289       if (custom_attributes.find(kGatedGrpcAttributeKey) !=
    290           custom_attributes.end()) {
    291         TF_RETURN_IF_ERROR(ParseBoolString(
    292             custom_attributes[kGatedGrpcAttributeKey], &gated_grpc_value));
    293       }
    294       debug_ops_spec.push_back(strings::StrCat(debug_op_name_proper, ";",
    295                                                debug_url, ";",
    296                                                gated_grpc_value ? "1" : "0"));
    297     }
    298   }
    299 
    300   auto builder = NodeDefBuilder(copy_node_name, copy_op_name)
    301                      .Input(src_node_name, src_output, src_dt)
    302                      .Attr("debug_ops_spec", std::move(debug_ops_spec));
    303 
    304   if (!builder.Finalize(&node_def).ok()) {
    305     return Status(
    306         error::FAILED_PRECONDITION,
    307         strings::StrCat("Failed to create node definition ", "for copy op ",
    308                         copy_node_name, " on watched tensor ", tensor_name));
    309   }
    310   Status s = FindKernelDef(device_type, node_def, &kdef, nullptr);
    311 
    312   if (!s.ok()) {
    313     return Status(
    314         error::FAILED_PRECONDITION,
    315         strings::StrCat("Failed to find kernel definition ", "for copy op ",
    316                         copy_node_name, " on watched tensor ", tensor_name));
    317   }
    318   if (!NodeBuilder(builder).Finalize(graph, copy_node).ok()) {
    319     return Status(error::FAILED_PRECONDITION,
    320                   strings::StrCat("Failed to create copy node ", copy_node_name,
    321                                   " on watched tensor ", tensor_name));
    322   }
    323 
    324   return Status::OK();
    325 }
    326 
    327 // static
    328 Status DebugNodeInserter::ParseDebugOpName(
    329     const string& debug_op_name, string* debug_op_name_proper,
    330     std::unordered_map<string, string>* attributes) {
    331   const size_t l_index = debug_op_name.find('(');
    332   const size_t r_index = debug_op_name.find(')');
    333   if (l_index == string::npos && r_index == string::npos) {
    334     *debug_op_name_proper = debug_op_name;
    335   } else {
    336     if (l_index == string::npos || l_index == 0 ||
    337         r_index != debug_op_name.size() - 1) {
    338       return errors::InvalidArgument("Malformed debug op name \"",
    339                                      debug_op_name, "\"");
    340     }
    341 
    342     *debug_op_name_proper = debug_op_name.substr(0, l_index);
    343     string arguments = debug_op_name.substr(l_index + 1, r_index - l_index - 1);
    344 
    345     std::vector<string> attribute_segs = str_util::Split(arguments, ";");
    346     for (const string& attribute_seg : attribute_segs) {
    347       StringPiece seg(attribute_seg);
    348       str_util::RemoveWhitespaceContext(&seg);
    349       if (seg.empty()) {
    350         continue;
    351       }
    352 
    353       const size_t eq_index = seg.find('=');
    354       if (eq_index == string::npos) {
    355         return errors::InvalidArgument(
    356             "Malformed attributes in debug op name \"", debug_op_name, "\"");
    357       }
    358 
    359       const string key = seg.substr(0, eq_index).ToString();
    360       const string value =
    361           seg.substr(eq_index + 1, attribute_seg.size() - eq_index - 1)
    362               .ToString();
    363       if (key.empty() || value.empty()) {
    364         return errors::InvalidArgument(
    365             "Malformed attributes in debug op name \"", debug_op_name, "\"");
    366       }
    367 
    368       if (attributes->find(key) == attributes->end()) {
    369         (*attributes)[key] = value;
    370       } else {
    371         return errors::InvalidArgument("Duplicate attribute name \"", key,
    372                                        "\" found in the debug op: \"",
    373                                        debug_op_name, "\"");
    374       }
    375     }
    376   }
    377   return Status::OK();
    378 }
    379 
    380 // static
    381 Status DebugNodeInserter::SetDebugNodeAttributes(
    382     Node* debug_node, const std::unordered_map<string, string>& attributes) {
    383   std::unordered_set<string> unfulfilled_keys;
    384   for (const auto& item : attributes) {
    385     unfulfilled_keys.insert(item.first);
    386   }
    387 
    388   for (const auto& attr : debug_node->op_def().attr()) {
    389     if (attributes.find(attr.name()) != attributes.end()) {
    390       const string& attr_value = attributes.at(attr.name());
    391       if (attr.type() == "string") {
    392         debug_node->AddAttr<string>(attr.name(), attr_value);
    393       } else if (attr.type() == "float") {
    394         float float_value = 0.0;
    395         if (!::tensorflow::strings::safe_strtof(attr_value.c_str(),
    396                                                 &float_value)) {
    397           return errors::InvalidArgument(
    398               "Invalid value string for float-type attribute ", attr.name(),
    399               "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
    400         }
    401         debug_node->AddAttr<float>(attr.name(), float_value);
    402       } else if (attr.type() == "int") {
    403         int64 int_value = 0;
    404         if (!::tensorflow::strings::safe_strto64(attr_value, &int_value)) {
    405           return errors::InvalidArgument(
    406               "Invalid value string for int-type attribute ", attr.name(),
    407               "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
    408         }
    409         debug_node->AddAttr<int>(attr.name(), int_value);
    410       } else if (attr.type() == "bool") {
    411         bool bool_value;
    412         if (!ParseBoolString(attr_value, &bool_value).ok()) {
    413           return errors::InvalidArgument(
    414               "Invalid value string for bool-type attribute ", attr.name(),
    415               "of debug node ", debug_node->name(), ": \"", attr_value, "\"");
    416         }
    417         debug_node->AddAttr<bool>(attr.name(), bool_value);
    418       } else {
    419         return errors::InvalidArgument(
    420             "Unsupported type of custom attribute for debug ops: ",
    421             attr.type());
    422       }
    423 
    424       unfulfilled_keys.erase(attr.name());
    425     }
    426   }
    427 
    428   if (unfulfilled_keys.empty()) {
    429     return Status::OK();
    430   } else {
    431     return errors::InvalidArgument(
    432         unfulfilled_keys.size(),
    433         " attribute key(s) were not valid for debug node ", debug_node->name(),
    434         ": ", str_util::Join(unfulfilled_keys, ", "));
    435   }
    436 }
    437 
    438 // static
    439 Status DebugNodeInserter::CreateDebugNode(
    440     Graph* graph, const Device& device, const string& src_copy_node_name,
    441     const DataType src_dt, const string& tensor_name,
    442     const std::vector<string>& debug_urls, const int debug_op_num,
    443     const string& debug_op_name, Node** debug_node) {
    444   NodeDef node_def;
    445   const KernelDef* kdef;
    446 
    447   string debug_op_name_proper;
    448   std::unordered_map<string, string> custom_attributes;
    449   TF_RETURN_IF_ERROR(ParseDebugOpName(debug_op_name, &debug_op_name_proper,
    450                                       &custom_attributes));
    451 
    452   const string debug_node_name =
    453       GetDebugNodeName(tensor_name, debug_op_num, debug_op_name_proper);
    454   auto builder = NodeDefBuilder(debug_node_name, debug_op_name_proper)
    455                      .Input(src_copy_node_name, 0, src_dt)
    456                      .Attr("device_name", device.name())
    457                      .Attr("tensor_name", tensor_name)
    458                      .Attr("debug_urls", debug_urls);
    459 
    460   if (!builder.Finalize(&node_def).ok()) {
    461     return errors::FailedPrecondition(
    462         "Failed to create node definition for debug op ", debug_op_name_proper,
    463         " on watched tensor ", tensor_name);
    464   }
    465   if (!FindKernelDef(DeviceType(device.device_type()), node_def, &kdef, nullptr)
    466            .ok()) {
    467     return errors::FailedPrecondition(
    468         "Failed to find kernel definition for debug op ", debug_op_name_proper,
    469         " on watched tensor ", tensor_name);
    470   }
    471   if (!NodeBuilder(builder).Finalize(graph, debug_node).ok()) {
    472     return errors::FailedPrecondition("Failed to create debug node ",
    473                                       debug_op_name_proper,
    474                                       " on watched tensor ", tensor_name);
    475   }
    476 
    477   // Set custom attributes (if any).
    478   if (!custom_attributes.empty()) {
    479     TF_RETURN_IF_ERROR(SetDebugNodeAttributes(*debug_node, custom_attributes));
    480   }
    481 
    482   return Status::OK();
    483 }
    484 
    485 }  // namespace tensorflow
    486