/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_KERNELS_DEBUG_OP_H_
#define TENSORFLOW_KERNELS_DEBUG_OP_H_

#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#endif
#ifdef TENSORFLOW_USE_SYCL
#include "tensorflow/core/common_runtime/sycl/sycl_util.h"
#endif  // TENSORFLOW_USE_SYCL
#include "tensorflow/core/debug/debug_io_utils.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/strings/stringprintf.h"

namespace tensorflow {

// Copy op for debugging.
// Performs CPU-to-CPU or GPU-to-GPU deep-copying of a tensor, depending on
// the device on which the tensor is allocated.
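// If the input tensor is uninitialized, is not mem-copyable, or the copy
// gate is closed, the op forwards the input tensor to the output without
// making a copy.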
class CopyOp : public OpKernel {
 public:
  explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));

    std::vector<string> debug_ops_spec;
    OP_REQUIRES_OK(context,
                   context->GetAttr("debug_ops_spec", &debug_ops_spec));
    for (const string& debug_op_spec : debug_ops_spec) {
      // Assume debug_op_spec has the format
      // <debug_op>;<debug_url>;<gated_grpc>, e.g.,
      // DebugIdentity;grpc://localhost:3333;1
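      // items[0] is the debug op name (combined with tensor_name_ to form
      // the watch key), items[1] is the debug URL, and items[2] ("1" or "0")
      // indicates whether the watch is gated via gRPC.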
      const std::vector<string> items = str_util::Split(debug_op_spec, ";");
      OP_REQUIRES(
          context, items.size() == 3,
          errors::Internal(
              "Unexpected number of semicolons in debug_ops_spec element: ",
              debug_op_spec));
      debug_op_and_url_specs_.push_back(
          DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]),
                               items[1], items[2] == "1"));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& src_tensor = context->input(0);

    if (src_tensor.IsInitialized() &&
        DataTypeCanUseMemcpy(src_tensor.dtype()) &&
        DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) {
      // Source tensor is initialized, is mem-copyable, and the copy gate is
      // open: make a deep copy.
      Tensor* copied_tensor;
      OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
                                                       &copied_tensor));

#if GOOGLE_CUDA
      Device* device = static_cast<Device*>(context->device());
      // Determine if the input tensor is not on CPU (e.g., on GPU).
      bool off_host_input = device->device_type() == DEVICE_GPU &&
                            !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        DeviceContext* device_ctxt = context->op_device_context();
        // Input is not on host: deep-copy it from GPU to the same GPU.
        Notification done_copy;
        GPUUtil::CopyGPUTensorToSameGPU(
            device, device_ctxt, &src_tensor, copied_tensor,
            [&done_copy](const Status& s) { done_copy.Notify(); });
        done_copy.WaitForNotification();
      } else {
        // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#elif defined(TENSORFLOW_USE_SYCL)
      Device* device = static_cast<Device*>(context->device());
      // Determine if the input tensor is not on CPU (e.g., on a SYCL device).
      const bool off_host_input = device->device_type() == DEVICE_SYCL &&
                                  !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor);
      } else {
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#else
      *copied_tensor = tensor::DeepCopy(src_tensor);
#endif
    } else {
      // Source tensor is not initialized, is not mem-copyable, or the copy
      // gate is closed: forward the input Tensor object without copying.
      context->set_output(0, src_tensor);
    }
  }

  bool IsExpensive() override { return false; }

 private:
  string tensor_name_;
  std::vector<DebugWatchAndURLSpec> debug_op_and_url_specs_;
};

// Base class of all debug ops.
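// Reads the debug_urls, gated_grpc, device_name and tensor_name attributes
// and builds the DebugNodeKey that identifies this watch. Subclasses
// implement Compute() and use PublishTensor() to send the debug signal to
// the configured debug URLs.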
class BaseDebugOp : public OpKernel {
 public:
  explicit BaseDebugOp(const string& debug_op_name,
                       OpKernelConstruction* context)
      : OpKernel(context), debug_op_name_(debug_op_name) {
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
    OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_));

    string device_name;
    string tensor_name;
    OP_REQUIRES_OK(context, context->GetAttr("device_name", &device_name));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name));

    std::vector<string> name_items = str_util::Split(tensor_name, ':');
    string node_name;
    int32 output_slot = 0;
    OP_REQUIRES(context, name_items.size() == 1 || name_items.size() == 2,
                errors::InvalidArgument("Failed to parse tensor name: \"",
                                        tensor_name, "\""));
    if (name_items.size() == 2) {
      node_name = name_items[0];
      OP_REQUIRES(
          context, strings::safe_strto32(name_items[1], &output_slot),
          errors::InvalidArgument("Invalid string value for output_slot: \"",
                                  name_items[1], "\""));
    } else if (name_items.size() == 1) {
      node_name = name_items[0];
    }

    debug_watch_key_.reset(
        new DebugNodeKey(device_name, node_name, output_slot, debug_op_name_));
  }

  bool IsExpensive() override { return false; }

 protected:
  // Apply gRPC gating (if the gated_grpc_ attribute is true).
  //
  // Returns false if and only if all grpc:// debug URLs of the debug op are
  // currently disabled (i.e., gated off), in which case the debug op emits an
  // empty (size {0}) tensor of undefined data type.
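  // Subclasses should call this at the beginning of Compute() and return
  // immediately when it returns false.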
  bool ApplyGrpcGating(OpKernelContext* context) {
    if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen(
                           debug_watch_key_->debug_node_name, debug_urls_)) {
      // The entire node is gated off: Output an empty tensor and avoid
      // expensive computation.
      Tensor* output_tensor;
      TensorShape shape({0});
      if (!context->allocate_output(0, shape, &output_tensor).ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to allocate empty tensor under gated-off state.";
      }
      return false;
    } else {
      return true;
    }
  }

  // Publish a tensor to all debug URLs of the debug op.
  // Log an error if publishing fails.
  void PublishTensor(const Tensor& tensor) {
    if (!debug_urls_.empty()) {
      Status status = DebugIO::PublishDebugTensor(*debug_watch_key_, tensor,
                                                  Env::Default()->NowMicros(),
                                                  debug_urls_, gated_grpc_);
      if (!status.ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to publish debug tensor data to all URLs "
                   << str_util::Join(debug_urls_, ", ")
                   << ", due to: " << status.error_message();
      }
    }
  }

 private:
  const string debug_op_name_;
  std::unique_ptr<DebugNodeKey> debug_watch_key_;
  std::vector<string> debug_urls_;
  bool gated_grpc_;
};

// Identity op for debugging.
//   Output slot 0 carries the debug signal and is always allocated on the
//   host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
//   the debug signal is equal to the input tensor.
class DebugIdentityOp : public BaseDebugOp {
 public:
  explicit DebugIdentityOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugIdentity", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    PublishTensor(context->input(0));
    context->set_output(0, context->input(0));
  }
};

// NaN-counter op for debugging.
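// Outputs a shape-{1} int64 tensor containing the number of NaN values in
// the input tensor, and publishes it to the debug URLs.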
template <typename T>
class DebugNanCountOp : public BaseDebugOp {
 public:
  explicit DebugNanCountOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNanCount", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
    int64 nan_count = 0;

    // If the input is an uninitialized tensor, let nan_count be 0.
    if (input.IsInitialized()) {
      // Count NaNs.
      const TensorShape& input_shape = input.shape();
      const T* input_flat = input.template flat<T>().data();

      for (int64 i = 0; i < input_shape.num_elements(); ++i) {
        if (Eigen::numext::isnan(static_cast<double>(input_flat[i]))) {
          nan_count++;
        }
      }
    }

    TensorShape shape({1});
    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<int64>()(0) = nan_count;
    PublishTensor(*output_tensor);
  }
};

// Numeric summary op for debugging.
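// Outputs a 1-D double tensor of length 14 + ndims whose elements are:
// is_initialized, element count, NaN count, -Inf count, negative count, zero
// count, positive count, +Inf count, min, max, mean, variance, dtype enum
// value, ndims, followed by the dimension sizes. Publishing is skipped when
// mute_if_healthy is set and no NaN, -Inf or +Inf values are found.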
template <typename T>
class DebugNumericSummaryOp : public BaseDebugOp {
 public:
  explicit DebugNumericSummaryOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNumericSummary", context) {
    OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
    OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("mute_if_healthy", &mute_if_healthy_));
  }

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    int64 is_initialized = 0;
    int64 element_count = 0;
    int64 negative_inf_count = 0;
    int64 negative_count = 0;
    int64 zero_count = 0;
    int64 positive_count = 0;
    int64 positive_inf_count = 0;
    int64 nan_count = 0;
    double min = std::numeric_limits<double>::infinity();
    double max = -std::numeric_limits<double>::infinity();
    double sum = 0.0;
    double mean = std::numeric_limits<double>::quiet_NaN();
    double variance = std::numeric_limits<double>::quiet_NaN();

    // Equal to negative_count + zero_count + positive_count.
    int64 non_inf_nan_count = 0;

    const TensorShape& input_shape = input.shape();
    if (input.IsInitialized()) {
      is_initialized = 1;
      const T* input_flat = input.template flat<T>().data();

      element_count = input_shape.num_elements();
      const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_);
      const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_);

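      // When finite custom bounds are given, elements <= lower_bound are
      // counted as -Inf and elements >= upper_bound as +Inf.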
      for (int64 i = 0; i < element_count; ++i) {
        const double x = static_cast<double>(input_flat[i]);
        if (Eigen::numext::isnan(x)) {
          nan_count++;
        } else if (Eigen::numext::isinf(x)) {
          if (x < 0.0) {
            negative_inf_count++;
          } else {
            positive_inf_count++;
          }
        } else {
          if (is_lower_bound_custom && x <= lower_bound_) {
            negative_inf_count++;
          } else if (is_upper_bound_custom && x >= upper_bound_) {
            positive_inf_count++;
          } else if (x < 0.0) {
            negative_count++;
          } else if (x > 0.0) {
            positive_count++;
          } else {
            zero_count++;
          }

          if (x < min) {
            min = x;
          }
          if (x > max) {
            max = x;
          }

          non_inf_nan_count++;
          sum += x;
        }
      }

      if (non_inf_nan_count > 0) {
        mean = sum / non_inf_nan_count;

        // Do a second pass to compute variance.
        variance = 0.0;
        for (int64 i = 0; i < element_count; ++i) {
          const double x = static_cast<double>(input_flat[i]);
          if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) {
            variance += (x - mean) * (x - mean);
          }
        }
        variance /= non_inf_nan_count;
      }
    }

    TensorShape shape({14 + input_shape.dims()});
    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<double>()(0) = static_cast<double>(is_initialized);
    output_tensor->vec<double>()(1) = static_cast<double>(element_count);
    output_tensor->vec<double>()(2) = static_cast<double>(nan_count);
    output_tensor->vec<double>()(3) = static_cast<double>(negative_inf_count);
    output_tensor->vec<double>()(4) = static_cast<double>(negative_count);
    output_tensor->vec<double>()(5) = static_cast<double>(zero_count);
    output_tensor->vec<double>()(6) = static_cast<double>(positive_count);
    output_tensor->vec<double>()(7) = static_cast<double>(positive_inf_count);
    output_tensor->vec<double>()(8) = min;
    output_tensor->vec<double>()(9) = max;
    output_tensor->vec<double>()(10) = mean;
    output_tensor->vec<double>()(11) = variance;

    output_tensor->vec<double>()(12) = static_cast<double>(input.dtype());
    output_tensor->vec<double>()(13) = static_cast<double>(input_shape.dims());
    for (int d = 0; d < input_shape.dims(); ++d) {
      output_tensor->vec<double>()(14 + d) =
          static_cast<double>(input_shape.dim_sizes()[d]);
    }

    bool mute = mute_if_healthy_ && nan_count == 0 && negative_inf_count == 0 &&
                positive_inf_count == 0;
    if (!mute) {
      PublishTensor(*output_tensor);
    }
  }

 private:
  float lower_bound_;
  float upper_bound_;
  bool mute_if_healthy_;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_KERNELS_DEBUG_OP_H_