1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 // See docs in ../ops/array_ops.cc. 17 18 #include <math.h> 19 #include <algorithm> 20 #include <numeric> 21 22 #include "tensorflow/core/framework/op_kernel.h" 23 #include "tensorflow/core/framework/register_types.h" 24 #include "tensorflow/core/framework/tensor.h" 25 #include "tensorflow/core/framework/types.h" 26 27 #if GOOGLE_CUDA 28 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" 29 #include "tensorflow/core/platform/cuda.h" 30 #endif // GOOGLE_CUDA 31 namespace tensorflow { 32 33 typedef Eigen::ThreadPoolDevice CPUDevice; 34 typedef Eigen::GpuDevice GPUDevice; 35 36 #if GOOGLE_CUDA 37 template <typename T> 38 struct CheckNumericsLaunch { 39 void Run(const GPUDevice& d, const T* data, int size, 40 int abnormal_detected[2]); 41 }; 42 #endif 43 44 namespace { 45 46 template <typename Device, typename T> 47 class CheckNumericsOp; 48 49 // Partial specialization for CPU 50 template <typename T> 51 class CheckNumericsOp<CPUDevice, T> : public OpKernel { 52 public: 53 explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) { 54 // message_ is used as the prefix for the assertion error message. For 55 // instance, this can be the name of the input op that produced the tensor. 56 OP_REQUIRES_OK(context, context->GetAttr("message", &message_)); 57 } 58 59 void Compute(OpKernelContext* context) override { 60 // pass along the input to the output 61 context->set_output(0, context->input(0)); 62 63 auto in = context->input(0).flat<T>(); 64 const T* data = in.data(); 65 const int64 size = in.size(); 66 // Check to see if any element of the tensor is NaN or Inf. 67 int fp_props = 68 std::accumulate(data, data + size, 0, [](const int& x, const T& y) { 69 int result = x; 70 if (Eigen::numext::isinf(y)) { 71 result |= kInfBit; 72 } else if (Eigen::numext::isnan(y)) { 73 result |= kNaNBit; 74 } 75 return result; 76 }); 77 string status; 78 if ((fp_props & kInfBit) && (fp_props & kNaNBit)) { 79 status = "Inf and NaN"; 80 } else { 81 if (fp_props & kInfBit) { 82 status = "Inf"; 83 } 84 if (fp_props & kNaNBit) { 85 status = "NaN"; 86 } 87 } 88 if (!status.empty()) { 89 context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ", 90 status, " values")); 91 } 92 } 93 94 private: 95 string message_; 96 static const int kInfBit = 0x01; 97 static const int kNaNBit = 0x02; 98 }; 99 100 #if GOOGLE_CUDA 101 // Partial specialization for GPU 102 template <typename T> 103 class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel { 104 public: 105 typedef GPUDevice Device; 106 107 explicit CheckNumericsOp(OpKernelConstruction* context) 108 : AsyncOpKernel(context) { 109 // message_ is used as the prefix for the assertion error message. For 110 // instance, this can be the name of the input op that produced the tensor. 111 OP_REQUIRES_OK(context, context->GetAttr("message", &message_)); 112 } 113 114 void ComputeAsync(OpKernelContext* context, DoneCallback done) override { 115 // pass along the input to the output 116 context->set_output(0, context->input(0)); 117 if (context->input(0).NumElements() == 0) { 118 done(); 119 return; 120 } 121 auto input = context->input(0).flat<T>(); 122 123 // Allocate and initialize the elements to hold the check results 124 const int abnormal_detected_size = 2; 125 Tensor abnormal_detected; 126 OP_REQUIRES_OK(context, context->allocate_temp( 127 DT_INT32, TensorShape({abnormal_detected_size}), 128 &abnormal_detected)); 129 130 auto* stream = context->op_device_context()->stream(); 131 OP_REQUIRES_ASYNC(context, stream != nullptr, 132 errors::Internal("No GPU stream available."), done); 133 134 perftools::gputools::DeviceMemoryBase abnormal_detected_ptr( 135 abnormal_detected.flat<int>().data(), 136 abnormal_detected.flat<int>().size()); 137 stream->ThenMemset32(&abnormal_detected_ptr, 0, 138 abnormal_detected.flat<int>().size() * sizeof(int)); 139 140 // Call the Cuda kernels for the numerical checks 141 const Device& d = context->eigen_device<Device>(); 142 CheckNumericsLaunch<T>().Run(d, input.data(), input.size(), 143 abnormal_detected.flat<int>().data()); 144 145 // Copy the results from device to host 146 AllocatorAttributes attr; 147 attr.set_on_host(true); 148 attr.set_gpu_compatible(true); 149 Tensor abnormal_detected_host; 150 OP_REQUIRES_OK_ASYNC( 151 context, 152 context->allocate_temp(DT_INT32, TensorShape({abnormal_detected_size}), 153 &abnormal_detected_host, attr), 154 done); 155 OP_REQUIRES_ASYNC( 156 context, 157 stream 158 ->ThenMemcpy(abnormal_detected_host.flat<int>().data(), 159 abnormal_detected_ptr, 160 abnormal_detected_size * sizeof(int)) 161 .ok(), 162 errors::Internal("cudaMemcpy from device to host failed"), done); 163 164 // We have observed crashes on some network stacks when not holding 165 // this tensor reference. 166 TensorReference abnormal_detected_ref(abnormal_detected); 167 auto check_cb = [this, stream, abnormal_detected_ref, 168 abnormal_detected_host, context, done]() { 169 ::perftools::gputools::cuda::ScopedActivateExecutorContext 170 scoped_activation{stream->parent()}; 171 auto abnormal_detected_host_flat = abnormal_detected_host.flat<int>(); 172 int is_nan = abnormal_detected_host_flat(0); 173 int is_inf = abnormal_detected_host_flat(1); 174 abnormal_detected_ref.Unref(); 175 if (is_nan || is_inf) { 176 string status; 177 LOG(ERROR) << "abnormal_detected_host @" 178 << abnormal_detected_host_flat.data() << " = {" << is_nan 179 << ", " << is_inf << "} " << message_; 180 181 // Results should always be 1 or 0. If we see anything else then 182 // there has been some GPU memory corruption. 183 CHECK_GE(is_nan, 0); 184 CHECK_GE(is_inf, 0); 185 CHECK_LE(is_nan, 1); 186 CHECK_LE(is_inf, 1); 187 188 if (is_nan && is_inf) { 189 status = "Inf and NaN"; 190 } else if (is_nan) { 191 status = "NaN"; 192 } else if (is_inf) { 193 status = "Inf"; 194 } 195 context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ", 196 status, " values")); 197 } 198 done(); 199 }; 200 context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( 201 stream, std::move(check_cb)); 202 } 203 204 private: 205 string message_; 206 }; 207 #endif // GOOGLE_CUDA 208 209 } // namespace 210 211 #define REGISTER_CPU_KERNEL(T) \ 212 REGISTER_KERNEL_BUILDER( \ 213 Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint<T>("T"), \ 214 CheckNumericsOp<CPUDevice, T>); 215 TF_CALL_half(REGISTER_CPU_KERNEL); 216 TF_CALL_float(REGISTER_CPU_KERNEL); 217 TF_CALL_double(REGISTER_CPU_KERNEL); 218 219 #if GOOGLE_CUDA 220 REGISTER_KERNEL_BUILDER( 221 Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"), 222 CheckNumericsOp<GPUDevice, Eigen::half>); 223 REGISTER_KERNEL_BUILDER( 224 Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<float>("T"), 225 CheckNumericsOp<GPUDevice, float>); 226 REGISTER_KERNEL_BUILDER( 227 Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<double>("T"), 228 CheckNumericsOp<GPUDevice, double>); 229 #endif // GOOGLE_CUDA 230 231 } // namespace tensorflow 232