/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.
#include <math.h>
#include <algorithm>
#include <numeric>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"

#if GOOGLE_CUDA
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/platform/cuda.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

#if GOOGLE_CUDA
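// Launcher for the device-side check; the kernel definition is expected to
// live in this op's .cu.cc translation unit. After the launched work
// completes, abnormal_detected[0] is nonzero if a NaN was seen and
// abnormal_detected[1] is nonzero if an Inf was seen.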
template <typename T>
struct CheckNumericsLaunch {
  void Run(const GPUDevice& d, const T* data, int size,
           int abnormal_detected[2]);
};
#endif

namespace {

template <typename Device, typename T>
class CheckNumericsOp;

// Partial specialization for CPU
template <typename T>
class CheckNumericsOp<CPUDevice, T> : public OpKernel {
 public:
  explicit CheckNumericsOp(OpKernelConstruction* context) : OpKernel(context) {
    // message_ is used as the prefix for the assertion error message. For
    // instance, this can be the name of the input op that produced the tensor.
    OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
  }

  void Compute(OpKernelContext* context) override {
    // Pass along the input to the output.
    context->set_output(0, context->input(0));

    auto in = context->input(0).flat<T>();
    const T* data = in.data();
    const int64 size = in.size();
    // Check to see if any element of the tensor is NaN or Inf.
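    // Each element ORs kInfBit and/or kNaNBit into an accumulator, so a
    // single pass over the data classifies the whole tensor.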
    int fp_props =
        std::accumulate(data, data + size, 0, [](const int& x, const T& y) {
          int result = x;
          if (Eigen::numext::isinf(y)) {
            result |= kInfBit;
          } else if (Eigen::numext::isnan(y)) {
            result |= kNaNBit;
          }
          return result;
        });
    string status;
    if ((fp_props & kInfBit) && (fp_props & kNaNBit)) {
      status = "Inf and NaN";
    } else {
      if (fp_props & kInfBit) {
        status = "Inf";
      }
      if (fp_props & kNaNBit) {
        status = "NaN";
      }
    }
    if (!status.empty()) {
      context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
                                                 status, " values"));
    }
  }

 private:
  string message_;
  static const int kInfBit = 0x01;
  static const int kNaNBit = 0x02;
};

#if GOOGLE_CUDA
// Partial specialization for GPU
template <typename T>
class CheckNumericsOp<GPUDevice, T> : public AsyncOpKernel {
 public:
  typedef GPUDevice Device;

  explicit CheckNumericsOp(OpKernelConstruction* context)
      : AsyncOpKernel(context) {
    // message_ is used as the prefix for the assertion error message. For
    // instance, this can be the name of the input op that produced the tensor.
    OP_REQUIRES_OK(context, context->GetAttr("message", &message_));
  }

  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
    // Pass along the input to the output.
    context->set_output(0, context->input(0));
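    // An empty tensor trivially contains no NaN or Inf; finish immediately.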
    if (context->input(0).NumElements() == 0) {
      done();
      return;
    }
    auto input = context->input(0).flat<T>();

    // Allocate and initialize the elements to hold the check results.
    const int abnormal_detected_size = 2;
    Tensor abnormal_detected;
    OP_REQUIRES_OK(context, context->allocate_temp(
                                DT_INT32, TensorShape({abnormal_detected_size}),
                                &abnormal_detected));

    auto* stream = context->op_device_context()->stream();
    OP_REQUIRES_ASYNC(context, stream != nullptr,
                      errors::Internal("No GPU stream available."), done);

    perftools::gputools::DeviceMemoryBase abnormal_detected_ptr(
        abnormal_detected.flat<int>().data(),
        abnormal_detected.flat<int>().size());
    stream->ThenMemset32(&abnormal_detected_ptr, 0,
                         abnormal_detected.flat<int>().size() * sizeof(int));

    // Launch the CUDA kernel for the numerical checks.
    const Device& d = context->eigen_device<Device>();
    CheckNumericsLaunch<T>().Run(d, input.data(), input.size(),
                                 abnormal_detected.flat<int>().data());

    // Copy the results from device to host.
    AllocatorAttributes attr;
    attr.set_on_host(true);
    attr.set_gpu_compatible(true);
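    // Pinned (host-resident, GPU-compatible) memory lets the asynchronous
    // memcpy below write directly into host-visible storage.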
    Tensor abnormal_detected_host;
    OP_REQUIRES_OK_ASYNC(
        context,
        context->allocate_temp(DT_INT32, TensorShape({abnormal_detected_size}),
                               &abnormal_detected_host, attr),
        done);
    OP_REQUIRES_ASYNC(
        context,
        stream
            ->ThenMemcpy(abnormal_detected_host.flat<int>().data(),
                         abnormal_detected_ptr,
                         abnormal_detected_size * sizeof(int))
            .ok(),
        errors::Internal("cudaMemcpy from device to host failed"), done);

    // We have observed crashes on some network stacks when not holding
    // this tensor reference.
    TensorReference abnormal_detected_ref(abnormal_detected);
    auto check_cb = [this, stream, abnormal_detected_ref,
                     abnormal_detected_host, context, done]() {
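      // Make the stream's CUDA context current on this thread; event-manager
      // callback threads are not guaranteed to have one bound.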
      ::perftools::gputools::cuda::ScopedActivateExecutorContext
          scoped_activation{stream->parent()};
      auto abnormal_detected_host_flat = abnormal_detected_host.flat<int>();
      int is_nan = abnormal_detected_host_flat(0);
      int is_inf = abnormal_detected_host_flat(1);
      abnormal_detected_ref.Unref();
      if (is_nan || is_inf) {
        string status;
        LOG(ERROR) << "abnormal_detected_host @"
                   << abnormal_detected_host_flat.data() << " = {" << is_nan
                   << ", " << is_inf << "} " << message_;

        // Results should always be 1 or 0. If we see anything else, then
        // there has been some GPU memory corruption.
        CHECK_GE(is_nan, 0);
        CHECK_GE(is_inf, 0);
        CHECK_LE(is_nan, 1);
        CHECK_LE(is_inf, 1);

        if (is_nan && is_inf) {
          status = "Inf and NaN";
        } else if (is_nan) {
          status = "NaN";
        } else if (is_inf) {
          status = "Inf";
        }
        context->SetStatus(errors::InvalidArgument(message_, " : Tensor had ",
                                                   status, " values"));
      }
      done();
    };
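    // Defer the callback until all work already enqueued on the stream (the
    // memset, the check kernel, and the memcpy) has completed, making the
    // host copy safe to read.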
    context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute(
        stream, std::move(check_cb));
  }

 private:
  string message_;
};
#endif  // GOOGLE_CUDA

}  // namespace

#define REGISTER_CPU_KERNEL(T)                                         \
  REGISTER_KERNEL_BUILDER(                                             \
      Name("CheckNumerics").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      CheckNumericsOp<CPUDevice, T>);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
TF_CALL_double(REGISTER_CPU_KERNEL);
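// Only floating-point types are registered, here and for the GPU below:
// NaN and Inf are only representable in floating-point formats.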

#if GOOGLE_CUDA
REGISTER_KERNEL_BUILDER(
    Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<Eigen::half>("T"),
    CheckNumericsOp<GPUDevice, Eigen::half>);
REGISTER_KERNEL_BUILDER(
    Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<float>("T"),
    CheckNumericsOp<GPUDevice, float>);
REGISTER_KERNEL_BUILDER(
    Name("CheckNumerics").Device(DEVICE_GPU).TypeConstraint<double>("T"),
    CheckNumericsOp<GPUDevice, double>);
#endif  // GOOGLE_CUDA
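
// A minimal usage sketch, assuming the generated C++ client API in
// tensorflow/cc (`root` and `input` are illustrative names, not part of
// this file):
//
//   Scope root = Scope::NewRootScope();
//   auto checked = ops::CheckNumerics(root, input, "input produced by my_op");
//
// The op forwards `input` unchanged and fails the step, with the message as
// the error prefix, if the tensor contains NaN or Inf values.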

}  // namespace tensorflow