Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
// Implements a quantized eight-bit version of the element-wise multiplication
// (Mul) operation.
     17 
     18 #define EIGEN_USE_THREADS
     19 
     20 #if defined(__ARM_NEON__) || defined(__ARM_NEON)
     21 #define USE_NEON
     22 #include <arm_neon.h>
     23 #endif
     24 
     25 #include "tensorflow/core/framework/op_kernel.h"
     26 #include "tensorflow/core/framework/tensor.h"
     27 #include "tensorflow/core/kernels/meta_support.h"
     28 #include "tensorflow/core/kernels/quantization_utils.h"
     29 #include "tensorflow/core/lib/core/casts.h"
     30 #include "tensorflow/core/lib/core/errors.h"
     31 #include "tensorflow/core/util/bcast.h"
     32 
     33 namespace tensorflow {
     34 namespace {
     35 
     36 template <class T, class Toutput>
     37 void ScalarMultiply(OpKernelContext* context, const T* full_input,
     38                     int32 full_input_offset, int64 num_elements, T scalar_input,
     39                     int32 scalar_input_offset, Toutput* output) {
     40   const int32 scalar_minus_offset =
     41       static_cast<int32>(scalar_input) - scalar_input_offset;
     42   for (int i = 0; i < num_elements; ++i) {
     43     output[i] = (static_cast<int32>(full_input[i]) - full_input_offset) *
     44                 scalar_minus_offset;
     45   }
     46 }
     47 
     48 #ifdef USE_NEON
     49 
     50 template <>
     51 void ScalarMultiply<quint8, qint32>(OpKernelContext* context,
     52                                     const quint8* full_input,
     53                                     int32 full_input_offset, int64 num_elements,
     54                                     quint8 scalar_input,
     55                                     int32 scalar_input_offset, qint32* output) {
     56   const int16 scalar_minus_offset =
     57       static_cast<int16>(scalar_input) - scalar_input_offset;
     58   const int16x4_t scalar_minus_offset_16x4 = vmov_n_s16(scalar_minus_offset);
     59   const uint8x8_t full_input_offset_8x8 = vmov_n_u8(full_input_offset);
     60   // Go through the results in 16-element chunks for NEON acceleration.
     61   int i;
     62   for (i = 0; i < (num_elements - 15); i += 16) {
     63     // Load the tensor inputs.
     64     const uint8* full_input_ptr = &(full_input->value) + i;
     65     const uint8x16_t full_input_8x16 = vld1q_u8(full_input_ptr);
     66 
     67     // Break into two sets of vectors so we can do further calculations
     68     // easily.
     69     const uint8x8_t full_input_high_8x8 = vget_high_u8(full_input_8x16);
     70     const uint8x8_t full_input_low_8x8 = vget_low_u8(full_input_8x16);
     71 
     72     // Subtract off the offset value to get 16-bit results.
     73     const int16x8_t full_input_minus_offset_high_16x8 = vreinterpretq_s16_u16(
     74         vsubl_u8(full_input_high_8x8, full_input_offset_8x8));
     75     const int16x8_t full_input_minus_offset_low_16x8 = vreinterpretq_s16_u16(
     76         vsubl_u8(full_input_low_8x8, full_input_offset_8x8));
     77 
     78     // We have to work with 4-wide vectors, so extract them.
     79     const int16x4_t x_high_high_16x4 =
     80         vget_high_s16(full_input_minus_offset_high_16x8);
     81     const int16x4_t x_high_low_16x4 =
     82         vget_low_s16(full_input_minus_offset_high_16x8);
     83     const int16x4_t x_low_high_16x4 =
     84         vget_high_s16(full_input_minus_offset_low_16x8);
     85     const int16x4_t x_low_low_16x4 =
     86         vget_low_s16(full_input_minus_offset_low_16x8);
     87 
     88     // Perform the multiplication.
     89     const int32x4_t z_high_high_32x4 =
     90         vmull_s16(x_high_high_16x4, scalar_minus_offset_16x4);
     91     const int32x4_t z_high_low_32x4 =
     92         vmull_s16(x_high_low_16x4, scalar_minus_offset_16x4);
     93     const int32x4_t z_low_high_32x4 =
     94         vmull_s16(x_low_high_16x4, scalar_minus_offset_16x4);
     95     const int32x4_t z_low_low_32x4 =
     96         vmull_s16(x_low_low_16x4, scalar_minus_offset_16x4);
     97 
     98     // Write out the results.
     99     int32* output_ptr = &(output->value) + i;
    100     vst1q_s32(output_ptr + 0, z_low_low_32x4);
    101     vst1q_s32(output_ptr + 4, z_low_high_32x4);
    102     vst1q_s32(output_ptr + 8, z_high_low_32x4);
    103     vst1q_s32(output_ptr + 12, z_high_high_32x4);
    104   }
    105   // Finish up any remaining elements that weren't a multiple of 16.
    106   for (; i < num_elements; ++i) {
    107     output[i] = (static_cast<int32>(full_input[i]) - full_input_offset) *
    108                 scalar_minus_offset;
    109   }
    110 }
    111 #endif  // USE_NEON
    112 
    113 template <class T, class Toutput>
    114 void VectorMultiply(OpKernelContext* context, const T* x_data, int32 offset_x,
    115                     const T* y_data, int32 offset_y, int64 num_elements,
    116                     Toutput* output) {
    117   for (int i = 0; i < num_elements; ++i) {
    118     output[i] = (static_cast<int32>(x_data[i]) - offset_x) *
    119                 (static_cast<int32>(y_data[i]) - offset_y);
    120   }
    121 }
    122 
    123 #ifdef USE_NEON
    124 template <>
    125 void VectorMultiply<quint8, qint32>(OpKernelContext* context,
    126                                     const quint8* x_data, int32 offset_x,
    127                                     const quint8* y_data, int32 offset_y,
    128                                     int64 num_elements, qint32* output) {
    129   const uint8x8_t offset_x_8x8 = vmov_n_u8(offset_x);
    130   const uint8x8_t offset_y_8x8 = vmov_n_u8(offset_y);
    131   int i;
    132   // Go through the results in 16-element chunks for NEON acceleration.
    133   for (i = 0; i < (num_elements - 15); i += 16) {
    134     // Load the vector inputs.
    135     const uint8* x_data_ptr = &(x_data->value) + i;
    136     const uint8x16_t x_8x16 = vld1q_u8(x_data_ptr);
    137     const uint8* y_data_ptr = &(y_data->value) + i;
    138     const uint8x16_t y_8x16 = vld1q_u8(y_data_ptr);
    139 
    140     // Break into two sets of vectors so we can do further calculations easily.
    141     const uint8x8_t x_high_8x8 = vget_high_u8(x_8x16);
    142     const uint8x8_t x_low_8x8 = vget_low_u8(x_8x16);
    143     const uint8x8_t y_high_8x8 = vget_high_u8(y_8x16);
    144     const uint8x8_t y_low_8x8 = vget_low_u8(y_8x16);
    145 
    146     // Subtract off the offset values to get 16-bit results.
    147     const int16x8_t x_minus_offset_high_16x8 =
    148         vreinterpretq_s16_u16(vsubl_u8(x_high_8x8, offset_x_8x8));
    149     const int16x8_t x_minus_offset_low_16x8 =
    150         vreinterpretq_s16_u16(vsubl_u8(x_low_8x8, offset_x_8x8));
    151     const int16x8_t y_minus_offset_high_16x8 =
    152         vreinterpretq_s16_u16(vsubl_u8(y_high_8x8, offset_y_8x8));
    153     const int16x8_t y_minus_offset_low_16x8 =
    154         vreinterpretq_s16_u16(vsubl_u8(y_low_8x8, offset_y_8x8));
    155 
    156     // We have to work with 4-wide vectors, so extract them.
    157     const int16x4_t x_high_high_16x4 = vget_high_s16(x_minus_offset_high_16x8);
    158     const int16x4_t x_high_low_16x4 = vget_low_s16(x_minus_offset_high_16x8);
    159     const int16x4_t x_low_high_16x4 = vget_high_s16(x_minus_offset_low_16x8);
    160     const int16x4_t x_low_low_16x4 = vget_low_s16(x_minus_offset_low_16x8);
    161     const int16x4_t y_high_high_16x4 = vget_high_s16(y_minus_offset_high_16x8);
    162     const int16x4_t y_high_low_16x4 = vget_low_s16(y_minus_offset_high_16x8);
    163     const int16x4_t y_low_high_16x4 = vget_high_s16(y_minus_offset_low_16x8);
    164     const int16x4_t y_low_low_16x4 = vget_low_s16(y_minus_offset_low_16x8);
    165 
    166     // Perform the multiplication.
    167     const int32x4_t z_high_high_32x4 =
    168         vmull_s16(x_high_high_16x4, y_high_high_16x4);
    169     const int32x4_t z_high_low_32x4 =
    170         vmull_s16(x_high_low_16x4, y_high_low_16x4);
    171     const int32x4_t z_low_high_32x4 =
    172         vmull_s16(x_low_high_16x4, y_low_high_16x4);
    173     const int32x4_t z_low_low_32x4 = vmull_s16(x_low_low_16x4, y_low_low_16x4);
    174 
    175     // Write out the results.
    176     int32* output_ptr = &(output->value) + i;
    177     vst1q_s32(output_ptr + 0, z_low_low_32x4);
    178     vst1q_s32(output_ptr + 4, z_low_high_32x4);
    179     vst1q_s32(output_ptr + 8, z_high_low_32x4);
    180     vst1q_s32(output_ptr + 12, z_high_high_32x4);
    181   }
    182   for (; i < num_elements; ++i) {
    183     output[i] = (static_cast<int32>(x_data[i]) - offset_x) *
    184                 (static_cast<int32>(y_data[i]) - offset_y);
    185   }
    186 }
    187 #endif  // USE_NEON
    188 
    189 template <class T, class Toutput>
    190 void VectorTensorMultiply(const T* vector_data, int32 vector_offset,
    191                           int64 vector_num_elements, const T* tensor_data,
    192                           int32 tensor_offset, int64 tensor_num_elements,
    193                           Toutput* output) {
    194   for (int i = 0; i < tensor_num_elements; ++i) {
    195     const int64 vector_i = i % vector_num_elements;
    196     output[i] = (static_cast<int32>(vector_data[vector_i]) - vector_offset) *
    197                 (static_cast<int32>(tensor_data[i]) - tensor_offset);
    198   }
    199 }
    200 
    201 #ifdef USE_NEON
    202 template <>
    203 void VectorTensorMultiply<quint8, qint32>(
    204     const quint8* vector_data, int32 vector_offset, int64 vector_num_elements,
    205     const quint8* tensor_data, int32 tensor_offset, int64 tensor_num_elements,
    206     qint32* output) {
    207   const uint8x8_t offset_x_8x8 = vmov_n_u8(vector_offset);
    208   const uint8x8_t offset_y_8x8 = vmov_n_u8(tensor_offset);
    209   CHECK_EQ(0, tensor_num_elements % vector_num_elements);
    210   for (int base_i = 0; base_i < tensor_num_elements;
    211        base_i += vector_num_elements) {
    212     int i = base_i;
    213     const int end_i = base_i + vector_num_elements;
    214     // Go through the results in 16-element chunks for NEON acceleration.
    215     int vector_i;
    216     for (vector_i = 0; vector_i < (vector_num_elements - 15);
    217          vector_i += 16, i += 16) {
    218       // Load the vector inputs.
    219       const uint8* x_data_ptr = &(vector_data->value) + vector_i;
    220       const uint8x16_t x_8x16 = vld1q_u8(x_data_ptr);
    221       const uint8* y_data_ptr = &(tensor_data->value) + i;
    222       const uint8x16_t y_8x16 = vld1q_u8(y_data_ptr);
    223 
    224       // Break into two sets of vectors so we can do further calculations
    225       // easily.
    226       const uint8x8_t x_high_8x8 = vget_high_u8(x_8x16);
    227       const uint8x8_t x_low_8x8 = vget_low_u8(x_8x16);
    228       const uint8x8_t y_high_8x8 = vget_high_u8(y_8x16);
    229       const uint8x8_t y_low_8x8 = vget_low_u8(y_8x16);
    230 
    231       // Subtract off the offset values to get 16-bit results.
    232       const int16x8_t x_minus_offset_high_16x8 =
    233           vreinterpretq_s16_u16(vsubl_u8(x_high_8x8, offset_x_8x8));
    234       const int16x8_t x_minus_offset_low_16x8 =
    235           vreinterpretq_s16_u16(vsubl_u8(x_low_8x8, offset_x_8x8));
    236       const int16x8_t y_minus_offset_high_16x8 =
    237           vreinterpretq_s16_u16(vsubl_u8(y_high_8x8, offset_y_8x8));
    238       const int16x8_t y_minus_offset_low_16x8 =
    239           vreinterpretq_s16_u16(vsubl_u8(y_low_8x8, offset_y_8x8));
    240 
    241       // We have to work with 4-wide vectors, so extract them.
    242       const int16x4_t x_high_high_16x4 =
    243           vget_high_s16(x_minus_offset_high_16x8);
    244       const int16x4_t x_high_low_16x4 = vget_low_s16(x_minus_offset_high_16x8);
    245       const int16x4_t x_low_high_16x4 = vget_high_s16(x_minus_offset_low_16x8);
    246       const int16x4_t x_low_low_16x4 = vget_low_s16(x_minus_offset_low_16x8);
    247       const int16x4_t y_high_high_16x4 =
    248           vget_high_s16(y_minus_offset_high_16x8);
    249       const int16x4_t y_high_low_16x4 = vget_low_s16(y_minus_offset_high_16x8);
    250       const int16x4_t y_low_high_16x4 = vget_high_s16(y_minus_offset_low_16x8);
    251       const int16x4_t y_low_low_16x4 = vget_low_s16(y_minus_offset_low_16x8);
    252 
    253       // Perform the multiplication.
    254       const int32x4_t z_high_high_32x4 =
    255           vmull_s16(x_high_high_16x4, y_high_high_16x4);
    256       const int32x4_t z_high_low_32x4 =
    257           vmull_s16(x_high_low_16x4, y_high_low_16x4);
    258       const int32x4_t z_low_high_32x4 =
    259           vmull_s16(x_low_high_16x4, y_low_high_16x4);
    260       const int32x4_t z_low_low_32x4 =
    261           vmull_s16(x_low_low_16x4, y_low_low_16x4);
    262 
    263       // Write out the results.
    264       int32* output_ptr = &(output->value) + i;
    265       vst1q_s32(output_ptr + 0, z_low_low_32x4);
    266       vst1q_s32(output_ptr + 4, z_low_high_32x4);
    267       vst1q_s32(output_ptr + 8, z_high_low_32x4);
    268       vst1q_s32(output_ptr + 12, z_high_high_32x4);
    269     }
    270     for (; i < end_i; ++i, ++vector_i) {
    271       output[i] = (static_cast<int32>(vector_data[vector_i]) - vector_offset) *
    272                   (static_cast<int32>(tensor_data[i]) - tensor_offset);
    273     }
    274   }
    275 }
    276 #endif  // USE_NEON
    277 
    278 }  // namespace
    279 
    280 template <class T, class Toutput>
    281 class QuantizedMulOp : public OpKernel {
    282  public:
    283   explicit QuantizedMulOp(OpKernelConstruction* context) : OpKernel(context) {}
    284 
    285   void Compute(OpKernelContext* context) override {
    286     const Tensor& x = context->input(0);
    287     const Tensor& y = context->input(1);
    288     const float min_x = context->input(2).flat<float>()(0);
    289     const float max_x = context->input(3).flat<float>()(0);
    290     const float min_y = context->input(4).flat<float>()(0);
    291     const float max_y = context->input(5).flat<float>()(0);
    292 
    293     BCast bcast(BCast::FromShape(x.shape()), BCast::FromShape(y.shape()));
    294     if (!bcast.IsValid()) {
    295       context->SetStatus(errors::InvalidArgument(
    296           "Incompatible shapes: ", x.shape().DebugString(), " vs. ",
    297           y.shape().DebugString()));
    298       return;
    299     }
    300     Tensor* z;
    301     OP_REQUIRES_OK(context, context->allocate_output(
    302                                 0, BCast::ToShape(bcast.output_shape()), &z));
    303 
    304     // Make sure that we have valid quantization ranges for the input buffers.
    305     // If the difference between the min and max is negative or zero, it makes
    306     // it hard to do meaningful intermediate operations on the values.
    307     OP_REQUIRES(context, (max_x > min_x),
    308                 errors::InvalidArgument("max_x must be larger than min_a."));
    309     OP_REQUIRES(context, (max_y > min_y),
    310                 errors::InvalidArgument("max_x must be larger than min_b."));
    311     const int32 offset_x = FloatToQuantizedUnclamped<T>(0.0f, min_x, max_x);
    312     const int32 offset_y = FloatToQuantizedUnclamped<T>(0.0f, min_y, max_y);
    313     const T* x_data = x.flat<T>().data();
    314     const T* y_data = y.flat<T>().data();
    315     Toutput* z_data = z->flat<Toutput>().data();
    316 
    317     const int ndims = bcast.x_reshape().size();
    318     if (ndims <= 1) {
    319       if (x.NumElements() == 1) {
    320         ScalarMultiply<T, Toutput>(context, y_data, offset_y, y.NumElements(),
    321                                    x_data[0], offset_x, z_data);
    322       } else if (y.NumElements() == 1) {
    323         ScalarMultiply<T, Toutput>(context, x_data, offset_x, x.NumElements(),
    324                                    y_data[0], offset_y, z_data);
    325       } else {
    326         VectorMultiply<T, Toutput>(context, x_data, offset_x, y_data, offset_y,
    327                                    x.NumElements(), z_data);
    328       }
    329     } else if (ndims == 2) {
    330       const T* vector_data;
    331       int64 vector_num_elements;
    332       int32 vector_offset;
    333       const T* tensor_data;
    334       int64 tensor_num_elements;
    335       int32 tensor_offset;
    336       if (x.NumElements() < y.NumElements()) {
    337         vector_data = x_data;
    338         vector_num_elements = x.NumElements();
    339         vector_offset = offset_x;
    340         tensor_data = y_data;
    341         tensor_num_elements = y.NumElements();
    342         tensor_offset = offset_y;
    343       } else {
    344         vector_data = y_data;
    345         vector_num_elements = y.NumElements();
    346         vector_offset = offset_y;
    347         tensor_data = x_data;
    348         tensor_num_elements = x.NumElements();
    349         tensor_offset = offset_x;
    350       }
    351       VectorTensorMultiply<T, Toutput>(
    352           vector_data, vector_offset, vector_num_elements, tensor_data,
    353           tensor_offset, tensor_num_elements, z_data);
    354     } else {
    355       LOG(INFO) << "ndims=" << ndims;
    356       LOG(INFO) << "bcast.x_reshape()="
    357                 << TensorShape(bcast.x_reshape()).DebugString();
    358       LOG(INFO) << "bcast.y_reshape()="
    359                 << TensorShape(bcast.y_reshape()).DebugString();
    360       LOG(INFO) << "bcast.x_bcast()="
    361                 << TensorShape(bcast.x_bcast()).DebugString();
    362       LOG(INFO) << "bcast.y_bcast()="
    363                 << TensorShape(bcast.y_bcast()).DebugString();
    364 
    365       context->SetStatus(errors::Unimplemented(
    366           "Broadcast between ", context->input(0).shape().DebugString(),
    367           " and ", context->input(1).shape().DebugString(),
    368           " is not supported yet."));
    369       return;
    370     }
    371 
    372     float min_z_value;
    373     float max_z_value;
    374     QuantizationRangeForMultiplication<T, T, Toutput>(
    375         min_x, max_x, min_y, max_y, &min_z_value, &max_z_value);
    376     Tensor* z_min = nullptr;
    377     OP_REQUIRES_OK(context, context->allocate_output(1, {}, &z_min));
    378     z_min->flat<float>()(0) = min_z_value;
    379 
    380     Tensor* z_max = nullptr;
    381     OP_REQUIRES_OK(context, context->allocate_output(2, {}, &z_max));
    382     z_max->flat<float>()(0) = max_z_value;
    383   }
    384 };
    385 
// Registers the CPU kernel for the "QuantizedMul" op with quint8 inputs
// (T1, T2) and qint32 output (Toutput).
REGISTER_KERNEL_BUILDER(Name("QuantizedMul")
                            .Device(DEVICE_CPU)
                            .TypeConstraint<quint8>("T1")
                            .TypeConstraint<quint8>("T2")
                            .TypeConstraint<qint32>("Toutput"),
                        QuantizedMulOp<quint8, qint32>);
    392 
    393 }  // namespace tensorflow
    394