/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_KERNELS_DEBUG_OP_H_
#define TENSORFLOW_KERNELS_DEBUG_OP_H_

#include <limits>
#include <memory>
#include <vector>

#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#endif
#ifdef TENSORFLOW_USE_SYCL
#include "tensorflow/core/common_runtime/sycl/sycl_util.h"
#endif  // TENSORFLOW_USE_SYCL
#include "tensorflow/core/debug/debug_io_utils.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/lib/core/notification.h"
#include "tensorflow/core/lib/strings/stringprintf.h"

namespace tensorflow {

// Copy op for debugging.
// Performs CPU-to-CPU or GPU-to-GPU deep copying of a tensor, depending on
// the device on which the tensor is allocated.
class CopyOp : public OpKernel {
 public:
  explicit CopyOp(OpKernelConstruction* context) : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name_));

    std::vector<string> debug_ops_spec;
    OP_REQUIRES_OK(context,
                   context->GetAttr("debug_ops_spec", &debug_ops_spec));
    for (const string& debug_op_spec : debug_ops_spec) {
      // Assume debug_op_spec has the format
      // <debug_op>;<debug_url>;<gated_grpc>, e.g.,
      // DebugIdentity;grpc://localhost:3333;1
      const std::vector<string> items = str_util::Split(debug_op_spec, ";");
      OP_REQUIRES(
          context, items.size() == 3,
          errors::Internal(
              "Unexpected number of semicolons in debug_ops_spec element: ",
              debug_op_spec));
      debug_op_and_url_specs_.push_back(
          DebugWatchAndURLSpec(strings::StrCat(tensor_name_, ":", items[0]),
                               items[1], items[2] == "1"));
    }
  }

  void Compute(OpKernelContext* context) override {
    const Tensor& src_tensor = context->input(0);

    if (src_tensor.IsInitialized() &&
        DataTypeCanUseMemcpy(src_tensor.dtype()) &&
        DebugIO::IsCopyNodeGateOpen(debug_op_and_url_specs_)) {
      // Source tensor is initialized and is mem-copyable. Make a copy.
      Tensor* copied_tensor;
      OP_REQUIRES_OK(context, context->allocate_output(0, src_tensor.shape(),
                                                       &copied_tensor));

#if GOOGLE_CUDA
      Device* device = static_cast<Device*>(context->device());
      // Determine if the input tensor is not on CPU (e.g., on GPU).
      bool off_host_input = device->device_type() == DEVICE_GPU &&
                            !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        DeviceContext* device_ctxt = context->op_device_context();
        // Input is not on host: deep-copy it from GPU to the same GPU.
        Notification done_copy;
        GPUUtil::CopyGPUTensorToSameGPU(
            device, device_ctxt, &src_tensor, copied_tensor,
            [&done_copy](const Status& s) { done_copy.Notify(); });
        done_copy.WaitForNotification();
      } else {
        // The input tensor is on the host (CPU): deep-copy from CPU to CPU.
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#elif defined(TENSORFLOW_USE_SYCL)
      Device* device = static_cast<Device*>(context->device());
      // Determine if the input tensor is not on CPU (e.g., on GPU).
      const bool off_host_input = device->device_type() == DEVICE_SYCL &&
                                  !context->input_alloc_attr(0).on_host();

      if (off_host_input) {
        SYCLmemcpy(context->eigen_sycl_device(), src_tensor, copied_tensor);
      } else {
        *copied_tensor = tensor::DeepCopy(src_tensor);
      }
#else
      *copied_tensor = tensor::DeepCopy(src_tensor);
#endif
    } else {
      // Source tensor is NOT initialized, is not mem-copyable, or all
      // copy-node gates are closed: forward the Tensor object instead of
      // deep-copying.
      context->set_output(0, src_tensor);
    }
  }

  bool IsExpensive() override { return false; }

 private:
  string tensor_name_;
  std::vector<DebugWatchAndURLSpec> debug_op_and_url_specs_;
};
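
// An illustrative sketch (not part of this header; URLs and names made up)
// of what the "debug_ops_spec" attribute parsed above might contain for a
// watched tensor named "dense/BiasAdd":
//
//   debug_ops_spec = ["DebugIdentity;grpc://localhost:6064;1",
//                     "DebugNanCount;file:///tmp/tfdbg_dump;0"]
//
// The constructor turns each element into a DebugWatchAndURLSpec whose watch
// key is "<tensor_name>:<debug_op>", e.g. "dense/BiasAdd:DebugIdentity",
// with gated_grpc == true only for the first element (trailing "1").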

// Base class of all debug ops.
class BaseDebugOp : public OpKernel {
 public:
  explicit BaseDebugOp(const string& debug_op_name,
                       OpKernelConstruction* context)
      : OpKernel(context), debug_op_name_(debug_op_name) {
    OP_REQUIRES_OK(context, context->GetAttr("debug_urls", &debug_urls_));
    OP_REQUIRES_OK(context, context->GetAttr("gated_grpc", &gated_grpc_));

    string device_name;
    string tensor_name;
    OP_REQUIRES_OK(context, context->GetAttr("device_name", &device_name));
    OP_REQUIRES_OK(context, context->GetAttr("tensor_name", &tensor_name));

    std::vector<string> name_items = str_util::Split(tensor_name, ':');
    string node_name;
    int32 output_slot = 0;
    OP_REQUIRES(context, name_items.size() == 1 || name_items.size() == 2,
                errors::InvalidArgument("Failed to parse tensor name: \"",
                                        tensor_name, "\""));
    if (name_items.size() == 2) {
      node_name = name_items[0];
      OP_REQUIRES(
          context, strings::safe_strto32(name_items[1], &output_slot),
          errors::InvalidArgument("Invalid string value for output_slot: \"",
                                  name_items[1], "\""));
    } else if (name_items.size() == 1) {
      node_name = name_items[0];
    }

    debug_watch_key_.reset(
        new DebugNodeKey(device_name, node_name, output_slot, debug_op_name_));
  }

  bool IsExpensive() override { return false; }

 protected:
  // Apply gRPC gating (if the gated_grpc attribute is true).
  //
  // Returns false if and only if all grpc:// debug URLs of the debug op are
  // currently disabled (i.e., gated off), in which case the debug op emits
  // an empty (size {0}) tensor of undefined data type.
  bool ApplyGrpcGating(OpKernelContext* context) {
    if (gated_grpc_ && !DebugIO::IsDebugNodeGateOpen(
                           debug_watch_key_->debug_node_name, debug_urls_)) {
      // The entire node is gated off: Output an empty tensor and avoid
      // expensive computation.
      Tensor* output_tensor;
      TensorShape shape({0});
      if (!context->allocate_output(0, shape, &output_tensor).ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to allocate empty tensor under gated-off "
                   << "state.";
      }
      return false;
    } else {
      return true;
    }
  }

  // Publish a tensor to all debug URLs of the debug op.
  // Log an error if the publishing fails.
  void PublishTensor(const Tensor& tensor) {
    if (!debug_urls_.empty()) {
      Status status = DebugIO::PublishDebugTensor(*debug_watch_key_, tensor,
                                                  Env::Default()->NowMicros(),
                                                  debug_urls_, gated_grpc_);
      if (!status.ok()) {
        LOG(ERROR) << "Debug node of watch key "
                   << debug_watch_key_->debug_node_name
                   << " failed to publish debug tensor data to all URLs "
                   << str_util::Join(debug_urls_, ", ")
                   << ", due to: " << status.error_message();
      }
    }
  }

 private:
  const string debug_op_name_;
  std::unique_ptr<DebugNodeKey> debug_watch_key_;
  std::vector<string> debug_urls_;
  bool gated_grpc_;
};
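
// An illustrative (made-up) example of the tensor_name parsing above:
//
//   "hidden1/MatMul:0" -> node_name = "hidden1/MatMul", output_slot = 0
//   "global_step"      -> node_name = "global_step",    output_slot = 0
//
// The resulting DebugNodeKey combines device_name, node_name, output_slot,
// and the debug op name (e.g., "DebugIdentity") into the watch key used for
// gating and publishing.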

// Identity op for debugging.
// Output slot 0 carries the debug signal and is always allocated on the
// host (CPU) as a non-Ref tensor. In the case of DebugIdentityOp,
// the debug signal is equal to the input tensor.
class DebugIdentityOp : public BaseDebugOp {
 public:
  explicit DebugIdentityOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugIdentity", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    PublishTensor(context->input(0));
    context->set_output(0, context->input(0));
  }
};

// NaN-counter op for debugging.
template <typename T>
class DebugNanCountOp : public BaseDebugOp {
 public:
  explicit DebugNanCountOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNanCount", context) {}

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    // Use DT_INT64/int64 to be consistent with TensorShape::num_elements().
    int64 nan_count = 0;

    // If the input is an uninitialized tensor, let nan_count be 0.
    if (input.IsInitialized()) {
      // Count NaNs.
      const TensorShape& input_shape = input.shape();
      const T* input_flat = input.template flat<T>().data();

      for (int64 i = 0; i < input_shape.num_elements(); ++i) {
        if (Eigen::numext::isnan(static_cast<double>(input_flat[i]))) {
          nan_count++;
        }
      }
    }

    TensorShape shape({1});
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<int64>()(0) = nan_count;
    PublishTensor(*output_tensor);
  }
};
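
// A quick illustrative example (values made up): for a float input tensor
//   [[1.0, NaN],
//    [NaN, 3.0]]
// DebugNanCountOp emits the shape-{1} int64 tensor [2], since two of the
// four elements are NaN. Infinities are not counted here; see
// DebugNumericSummaryOp below for inf accounting.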

// Numeric summary op for debugging.
template <typename T>
class DebugNumericSummaryOp : public BaseDebugOp {
 public:
  explicit DebugNumericSummaryOp(OpKernelConstruction* context)
      : BaseDebugOp("DebugNumericSummary", context) {
    OP_REQUIRES_OK(context, context->GetAttr("lower_bound", &lower_bound_));
    OP_REQUIRES_OK(context, context->GetAttr("upper_bound", &upper_bound_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("mute_if_healthy", &mute_if_healthy_));
  }

  void Compute(OpKernelContext* context) override {
    if (!ApplyGrpcGating(context)) {
      return;
    }

    Tensor* output_tensor;
    const Tensor& input = context->input(0);

    int64 is_initialized = 0;
    int64 element_count = 0;
    int64 negative_inf_count = 0;
    int64 negative_count = 0;
    int64 zero_count = 0;
    int64 positive_count = 0;
    int64 positive_inf_count = 0;
    int64 nan_count = 0;
    double min = std::numeric_limits<double>::infinity();
    double max = -std::numeric_limits<double>::infinity();
    double sum = 0.0;
    double mean = std::numeric_limits<double>::quiet_NaN();
    double variance = std::numeric_limits<double>::quiet_NaN();

    // Equal to negative_count + zero_count + positive_count.
    int64 non_inf_nan_count = 0;

    const TensorShape& input_shape = input.shape();
    if (input.IsInitialized()) {
      is_initialized = 1;
      const T* input_flat = input.template flat<T>().data();

      element_count = input_shape.num_elements();
      const bool is_lower_bound_custom = !Eigen::numext::isinf(lower_bound_);
      const bool is_upper_bound_custom = !Eigen::numext::isinf(upper_bound_);

      for (int64 i = 0; i < element_count; ++i) {
        const double x = static_cast<double>(input_flat[i]);
        if (Eigen::numext::isnan(x)) {
          nan_count++;
        } else if (Eigen::numext::isinf(x)) {
          if (x < 0.0) {
            negative_inf_count++;
          } else {
            positive_inf_count++;
          }
        } else {
          // Finite values at or beyond a custom (finite) bound are counted
          // as if they were infinities, but still contribute to min, max,
          // mean and variance below.
          if (is_lower_bound_custom && x <= lower_bound_) {
            negative_inf_count++;
          } else if (is_upper_bound_custom && x >= upper_bound_) {
            positive_inf_count++;
          } else if (x < 0.0) {
            negative_count++;
          } else if (x > 0.0) {
            positive_count++;
          } else {
            zero_count++;
          }

          if (x < min) {
            min = x;
          }
          if (x > max) {
            max = x;
          }

          non_inf_nan_count++;
          sum += x;
        }
      }

      if (non_inf_nan_count > 0) {
        mean = sum / non_inf_nan_count;

        // Do a second pass to compute the (population) variance over all
        // finite, non-NaN elements.
        variance = 0.0;
        for (int64 i = 0; i < element_count; ++i) {
          const double x = static_cast<double>(input_flat[i]);
          if (!Eigen::numext::isnan(x) && !Eigen::numext::isinf(x)) {
            variance += (x - mean) * (x - mean);
          }
        }
        variance /= non_inf_nan_count;
      }
    }

    TensorShape shape({14 + input_shape.dims()});
    OP_REQUIRES_OK(context,
                   context->allocate_output(0, shape, &output_tensor));
    output_tensor->vec<double>()(0) = static_cast<double>(is_initialized);
    output_tensor->vec<double>()(1) = static_cast<double>(element_count);
    output_tensor->vec<double>()(2) = static_cast<double>(nan_count);
    output_tensor->vec<double>()(3) = static_cast<double>(negative_inf_count);
    output_tensor->vec<double>()(4) = static_cast<double>(negative_count);
    output_tensor->vec<double>()(5) = static_cast<double>(zero_count);
    output_tensor->vec<double>()(6) = static_cast<double>(positive_count);
    output_tensor->vec<double>()(7) = static_cast<double>(positive_inf_count);
    output_tensor->vec<double>()(8) = min;
    output_tensor->vec<double>()(9) = max;
    output_tensor->vec<double>()(10) = mean;
    output_tensor->vec<double>()(11) = variance;

    output_tensor->vec<double>()(12) = static_cast<double>(input.dtype());
    output_tensor->vec<double>()(13) = static_cast<double>(input_shape.dims());
    // Use int (the return type of TensorShape::dims()) to avoid a
    // signed/unsigned comparison.
    for (int d = 0; d < input_shape.dims(); ++d) {
      output_tensor->vec<double>()(14 + d) =
          static_cast<double>(input_shape.dim_sizes()[d]);
    }

    const bool mute = mute_if_healthy_ && nan_count == 0 &&
                      negative_inf_count == 0 && positive_inf_count == 0;
    if (!mute) {
      PublishTensor(*output_tensor);
    }
  }

 private:
  float lower_bound_;
  float upper_bound_;
  bool mute_if_healthy_;
};
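
// For reference, the layout of the summary vector written above (length
// 14 + ndims):
//
//   [0]  is_initialized      [1]  element_count       [2]  nan_count
//   [3]  negative_inf_count  [4]  negative_count      [5]  zero_count
//   [6]  positive_count      [7]  positive_inf_count  [8]  min
//   [9]  max                 [10] mean                [11] variance
//   [12] dtype (enum value)  [13] ndims               [14...] dim sizes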

}  // namespace tensorflow

#endif  // TENSORFLOW_KERNELS_DEBUG_OP_H_