Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #define EIGEN_USE_THREADS
     17 
     18 #if GOOGLE_CUDA
     19 #define EIGEN_USE_GPU
     20 #endif  // GOOGLE_CUDA
     21 
     22 #include "tensorflow/core/kernels/quantize_and_dequantize_op.h"
     23 
     24 #include "tensorflow/core/framework/op.h"
     25 #include "tensorflow/core/framework/op_kernel.h"
     26 #include "tensorflow/core/framework/register_types.h"
     27 #include "tensorflow/core/framework/type_traits.h"
     28 #include "tensorflow/core/framework/types.h"
     29 #include "tensorflow/core/lib/core/errors.h"
     30 
     31 namespace tensorflow {
     32 
     33 typedef Eigen::ThreadPoolDevice CPUDevice;
     34 typedef Eigen::GpuDevice GPUDevice;
     35 
     36 // Simulate quantization precision loss in a float tensor by:
     37 // 1. Quantize the tensor to fixed point numbers, which should match the target
     38 //    quantization method when it is used in inference.
     39 // 2. Dequantize it back to floating point numbers for the following ops, most
     40 //    likely matmul.
     41 template <typename Device, typename T>
     42 class QuantizeAndDequantizeV2Op : public OpKernel {
     43  public:
     44   explicit QuantizeAndDequantizeV2Op(OpKernelConstruction* ctx)
     45       : OpKernel(ctx) {
     46     OP_REQUIRES_OK(ctx, ctx->GetAttr("signed_input", &signed_input_));
     47     OP_REQUIRES_OK(ctx, ctx->GetAttr("num_bits", &num_bits_));
     48     OP_REQUIRES(ctx, num_bits_ > 0 && num_bits_ < (signed_input_ ? 62 : 63),
     49                 errors::InvalidArgument("num_bits is out of range: ", num_bits_,
     50                                         " with signed_input_ ", signed_input_));
     51     OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_));
     52 
     53     string round_mode_string;
     54     OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string));
     55     OP_REQUIRES(
     56         ctx,
     57         (round_mode_string == "HALF_UP" || round_mode_string == "HALF_TO_EVEN"),
     58         errors::InvalidArgument("Round mode string must be "
     59                                 "'HALF_UP' or "
     60                                 "'HALF_TO_EVEN', is '" +
     61                                 round_mode_string + "'"));
     62     if (round_mode_string == "HALF_UP") {
     63       round_mode_ = ROUND_HALF_UP;
     64     } else if (round_mode_string == "HALF_TO_EVEN") {
     65       round_mode_ = ROUND_HALF_TO_EVEN;
     66     }
     67   }
     68 
     69   void Compute(OpKernelContext* ctx) override {
     70     const Tensor& input = ctx->input(0);
     71 
     72     Tensor* output = nullptr;
     73     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
     74 
     75     Tensor input_min_tensor;
     76     Tensor input_max_tensor;
     77     if (range_given_) {
     78       input_min_tensor = ctx->input(1);
     79       input_max_tensor = ctx->input(2);
     80       auto min_val = input_min_tensor.scalar<T>()();
     81       auto max_val = input_max_tensor.scalar<T>()();
     82       OP_REQUIRES(ctx, min_val <= max_val,
     83                   errors::InvalidArgument("Invalid range: input_min ", min_val,
     84                                           " > input_max ", max_val));
     85     } else {
     86       OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
     87                                              TensorShape(), &input_min_tensor));
     88       OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
     89                                              TensorShape(), &input_max_tensor));
     90     }
     91 
     92     functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> f;
     93     f(ctx->eigen_device<Device>(), input.flat<T>(), signed_input_, num_bits_,
     94       range_given_, &input_min_tensor, &input_max_tensor, round_mode_,
     95       output->flat<T>());
     96   }
     97 
     98  private:
     99   bool signed_input_;
    100   int num_bits_;
    101   bool range_given_;
    102   QuantizerRoundMode round_mode_;
    103 };
    104 
    105 // Simulate quantization precision loss in a float tensor by:
    106 // 1. Quantize the tensor to fixed point numbers, which should match the target
    107 //    quantization method when it is used in inference.
    108 // 2. Dequantize it back to floating point numbers for the following ops, most
    109 //    likely matmul.
    110 // Almost identical to QuantizeAndDequantizeV2Op, except that num_bits is a
    111 // tensor.
    112 template <typename Device, typename T>
    113 class QuantizeAndDequantizeV3Op : public OpKernel {
    114  public:
    115   explicit QuantizeAndDequantizeV3Op(OpKernelConstruction* ctx)
    116       : OpKernel(ctx) {
    117     OP_REQUIRES_OK(ctx, ctx->GetAttr("signed_input", &signed_input_));
    118     OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_));
    119   }
    120 
    121   void Compute(OpKernelContext* ctx) override {
    122     const Tensor& input = ctx->input(0);
    123 
    124     Tensor* output = nullptr;
    125     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
    126 
    127     Tensor num_bits_tensor;
    128     num_bits_tensor = ctx->input(3);
    129     int num_bits_val = num_bits_tensor.scalar<int32>()();
    130 
    131     OP_REQUIRES(
    132         ctx, num_bits_val > 0 && num_bits_val < (signed_input_ ? 62 : 63),
    133         errors::InvalidArgument("num_bits is out of range: ", num_bits_val,
    134                                 " with signed_input_ ", signed_input_));
    135 
    136     Tensor input_min_tensor;
    137     Tensor input_max_tensor;
    138     if (range_given_) {
    139       input_min_tensor = ctx->input(1);
    140       input_max_tensor = ctx->input(2);
    141       auto min_val = input_min_tensor.scalar<T>()();
    142       auto max_val = input_max_tensor.scalar<T>()();
    143       OP_REQUIRES(ctx, min_val <= max_val,
    144                   errors::InvalidArgument("Invalid range: input_min ", min_val,
    145                                           " > input_max ", max_val));
    146     } else {
    147       OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
    148                                              TensorShape(), &input_min_tensor));
    149       OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
    150                                              TensorShape(), &input_max_tensor));
    151     }
    152 
    153     functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> f;
    154     f(ctx->eigen_device<Device>(), input.flat<T>(), signed_input_, num_bits_val,
    155       range_given_, &input_min_tensor, &input_max_tensor, ROUND_HALF_TO_EVEN,
    156       output->flat<T>());
    157   }
    158 
    159  private:
    160   bool signed_input_;
    161   bool range_given_;
    162 };
    163 
    164 // DEPRECATED: Use QuantizeAndDequantizeV2Op.
    165 template <typename Device, typename T>
    166 class QuantizeAndDequantizeOp : public OpKernel {
    167  public:
    168   explicit QuantizeAndDequantizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    169     OP_REQUIRES_OK(ctx, ctx->GetAttr("signed_input", &signed_input_));
    170     OP_REQUIRES_OK(ctx, ctx->GetAttr("num_bits", &num_bits_));
    171     OP_REQUIRES(ctx, num_bits_ > 0 && num_bits_ < (signed_input_ ? 62 : 63),
    172                 errors::InvalidArgument("num_bits is out of range: ", num_bits_,
    173                                         " with signed_input_ ", signed_input_));
    174     OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_));
    175     OP_REQUIRES_OK(ctx, ctx->GetAttr("input_min", &input_min_));
    176     OP_REQUIRES_OK(ctx, ctx->GetAttr("input_max", &input_max_));
    177     if (range_given_) {
    178       OP_REQUIRES(
    179           ctx, input_min_ <= input_max_,
    180           errors::InvalidArgument("Invalid range: input_min ", input_min_,
    181                                   " > input_max ", input_max_));
    182     }
    183   }
    184 
    185   void Compute(OpKernelContext* ctx) override {
    186     const Tensor& input = ctx->input(0);
    187 
    188     Tensor* output = nullptr;
    189     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output));
    190 
    191     // One global scale.
    192     Tensor input_min_tensor(DataTypeToEnum<T>::value, TensorShape());
    193     Tensor input_max_tensor(DataTypeToEnum<T>::value, TensorShape());
    194     // Initialize the tensors with the values in the Attrs.
    195     input_min_tensor.template scalar<T>()() = static_cast<T>(input_min_);
    196     input_max_tensor.template scalar<T>()() = static_cast<T>(input_max_);
    197 
    198     functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> functor;
    199     functor(ctx->eigen_device<Device>(), input.flat<T>(), signed_input_,
    200             num_bits_, range_given_, &input_min_tensor, &input_max_tensor,
    201             ROUND_HALF_TO_EVEN, output->flat<T>());
    202   }
    203 
    204  private:
    205   bool signed_input_;
    206   int num_bits_;
    207   bool range_given_;
    208   float input_min_;
    209   float input_max_;
    210 };
    211 
    212 // Specialization for CPUDevice.
    213 namespace functor {
    214 template <typename T>
    215 struct QuantizeAndDequantizeOneScaleFunctor<CPUDevice, T> {
    216   void operator()(const CPUDevice& d, typename TTypes<T>::ConstVec input,
    217                   const bool signed_input, const int num_bits,
    218                   const bool range_given, Tensor* input_min_tensor,
    219                   Tensor* input_max_tensor, QuantizerRoundMode round_mode,
    220                   typename TTypes<T>::Vec out) {
    221     QuantizeAndDequantizeOneScaleImpl<CPUDevice, T>::Compute(
    222         d, input, signed_input, num_bits, range_given, input_min_tensor,
    223         input_max_tensor, round_mode, out);
    224   }
    225 };
    226 }  // namespace functor
    227 
    228 #define REGISTER_CPU_KERNEL(T)                                                 \
    229   REGISTER_KERNEL_BUILDER(Name("QuantizeAndDequantizeV2")                      \
    230                               .Device(DEVICE_CPU)                              \
    231                               .TypeConstraint<T>("T"),                         \
    232                           QuantizeAndDequantizeV2Op<CPUDevice, T>);            \
    233   REGISTER_KERNEL_BUILDER(Name("QuantizeAndDequantizeV3")                      \
    234                               .Device(DEVICE_CPU)                              \
    235                               .TypeConstraint<T>("T"),                         \
    236                           QuantizeAndDequantizeV3Op<CPUDevice, T>);            \
    237   REGISTER_KERNEL_BUILDER(                                                     \
    238       Name("QuantizeAndDequantize").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
    239       QuantizeAndDequantizeOp<CPUDevice, T>);
    240 TF_CALL_float(REGISTER_CPU_KERNEL);
    241 TF_CALL_double(REGISTER_CPU_KERNEL);
    242 #undef REGISTER_CPU_KERNEL
    243 
    244 #if GOOGLE_CUDA
    245 #define REGISTER_GPU_KERNEL(T)                                                 \
    246   REGISTER_KERNEL_BUILDER(Name("QuantizeAndDequantizeV2")                      \
    247                               .Device(DEVICE_GPU)                              \
    248                               .HostMemory("input_max")                         \
    249                               .HostMemory("input_min")                         \
    250                               .TypeConstraint<T>("T"),                         \
    251                           QuantizeAndDequantizeV2Op<GPUDevice, T>);            \
    252   REGISTER_KERNEL_BUILDER(Name("QuantizeAndDequantizeV3")                      \
    253                               .Device(DEVICE_GPU)                              \
    254                               .HostMemory("input_max")                         \
    255                               .HostMemory("input_min")                         \
    256                               .HostMemory("num_bits")                          \
    257                               .TypeConstraint<T>("T"),                         \
    258                           QuantizeAndDequantizeV3Op<GPUDevice, T>);            \
    259   REGISTER_KERNEL_BUILDER(                                                     \
    260       Name("QuantizeAndDequantize").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
    261       QuantizeAndDequantizeOp<GPUDevice, T>);
    262 TF_CALL_float(REGISTER_GPU_KERNEL);
    263 TF_CALL_double(REGISTER_GPU_KERNEL);
    264 #undef REGISTER_GPU_KERNEL
    265 #endif
    266 }  // namespace tensorflow
    267