/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/tools/optimize/subgraph_quantizer.h"

#include <algorithm>
#include <limits>

#include "flatbuffers/flexbuffers.h"
#include "absl/memory/memory.h"
#include "tensorflow/lite/context.h"
#include "tensorflow/lite/core/api/error_reporter.h"
#include "tensorflow/lite/kernels/internal/round.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/model.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/tools/optimize/quantization_utils.h"

namespace tflite {
namespace optimize {
namespace internal {

namespace {
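// Sets per-channel quantization parameters (scales, zero points, quantized
// dimension) on the given tensor, replaces its buffer contents with the
// already quantized data in buffer_data, and updates the tensor type to
// output_type. Fails if the number of zero points does not match the number
// of scales.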
TfLiteStatus AddQuantizationParams(const std::vector<float>& scales,
                                   const std::vector<int64_t>& zero_point,
                                   int quantized_dimension,
                                   const uint8_t* buffer_data,
                                   size_t buffer_size, TensorType output_type,
                                   ModelT* model, TensorT* tensor) {
  tensor->quantization = absl::make_unique<QuantizationParametersT>();
  tensor->quantization->scale.assign(scales.begin(), scales.end());
  if (zero_point.size() != scales.size()) {
    return kTfLiteError;
  }
  tensor->quantization->zero_point.assign(zero_point.begin(), zero_point.end());
  tensor->quantization->quantized_dimension = quantized_dimension;
  model->buffers[tensor->buffer]->data.assign(buffer_data,
                                              buffer_data + buffer_size);
  // Update the tensor type.
  tensor->type = output_type;
  return kTfLiteOk;
}

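// Returns true for builtin ops whose bias input is optional (currently
// Conv2D and DepthwiseConv2D).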
bool OpHasOptionalBiasTensor(BuiltinOperator op_code) {
  return op_code == BuiltinOperator_CONV_2D ||
         op_code == BuiltinOperator_DEPTHWISE_CONV_2D;
}

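// Describes the input-tensor layout of an op that carries a bias: which
// operator input holds the activation, the weights, and the bias, and which
// dimension of the weight tensor indexes the output channels.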
struct OpWithBiasTensors {
  int activation_input_index;
  int weights_input_index;
  int bias_input_index;
  int index_for_channel_in_weights;
};

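// Returns the input-tensor layout for the given op, or nullptr if the op is
// not a supported op with a bias tensor.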
const OpWithBiasTensors* GetInfoForOpWithBiasTensor(BuiltinOperator op_code) {
  if (op_code == BuiltinOperator_CONV_2D) {
    static OpWithBiasTensors op_info = {/* activation_input_index */ 0,
                                        /* weights_input_index */ 1,
                                        /* bias_input_index */ 2,
                                        /* index_for_channel_in_weights */ 0};
    return &op_info;
  }
  if (op_code == BuiltinOperator_DEPTHWISE_CONV_2D) {
    static OpWithBiasTensors op_info = {/* activation_input_index */ 0,
                                        /* weights_input_index */ 1,
                                        /* bias_input_index */ 2,
                                        /* index_for_channel_in_weights */ 3};
    return &op_info;
  }

  return nullptr;
}

// Symmetrically quantizes the given tensor per channel as int8 values.
TfLiteStatus SymmetricPerChannelQuantizeTensor(ModelT* model, TensorT* tensor,
                                               int32_t channel_dim_index,
                                               ErrorReporter* error_reporter) {
  if (tensor->shape.size() != 4) {
    error_reporter->Report("Only dims=4 is supported, tensor dims: %d",
                           static_cast<int>(tensor->shape.size()));
    return kTfLiteError;
  }

  // Get dimensions.
  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));
  const int32_t channel_dim_size = tensor->shape[channel_dim_index];

  // Get input float data.
  BufferT* buffer = model->buffers[tensor->buffer].get();
  float* float_input_data = reinterpret_cast<float*>(buffer->data.data());

  // Create container for output scale and output data.
  std::vector<float> scales(channel_dim_size);
  std::vector<int8_t> final_buffer(num_elements);

  // Quantize the input data with respect to channel_dim_index.
  const std::vector<int> tensor_dims = {tensor->shape[0], tensor->shape[1],
                                        tensor->shape[2], tensor->shape[3]};
  utils::SymmetricPerChannelQuantization(
      float_input_data, tensor_dims, channel_dim_index, &scales, &final_buffer);

  // Set the buffers and output type.
  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
  const size_t buffer_size = num_elements * sizeof(int8_t);
  std::vector<int64_t> zero_point(scales.size(), 0);
  return AddQuantizationParams(scales, zero_point, channel_dim_index,
                               uint8_buffer, buffer_size, TensorType_INT8,
                               model, tensor);
}

// Symmetrically quantizes the bias for ops like Conv and DepthwiseConv.
// The scale of the bias is weight_per_channel_scale[channel] * input_scale.
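// For example, with an input scale of 0.5 and per-channel weight scales of
// {0.1, 0.2}, the bias scales become {0.05, 0.1}; each float bias value is
// divided by its scale, rounded to the nearest integer, and clamped to the
// int32 range.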
TfLiteStatus SymmetricPerChannelBiasQuantize(const TensorT* input_tensor,
                                             const TensorT* weight_tensor,
                                             int channel_dim_index,
                                             ModelT* model, TensorT* tensor,
                                             ErrorReporter* error_reporter) {
  if (tensor->shape.size() != 1) {
    error_reporter->Report(
        "Expected bias tensor to have exactly one dimension.");
    return kTfLiteError;
  }

  if (tensor->type != TensorType_FLOAT32) {
    return kTfLiteOk;
  }

  // TODO(shashishekhar): Make this support scalar biases.
  if (tensor->shape[0] != weight_tensor->shape[channel_dim_index]) {
    error_reporter->Report(
        "Channel mismatch between bias and weight tensors %d vs %d",
        tensor->shape[0], weight_tensor->shape[channel_dim_index]);
    return kTfLiteError;
  }
  int32_t channel_dim_size = tensor->shape[0];
  if (!input_tensor->quantization ||
      input_tensor->quantization->scale.size() != 1) {
    error_reporter->Report("Input tensor missing quantization information");
    return kTfLiteError;
  }
  TF_LITE_ENSURE(error_reporter, weight_tensor->quantization);
  const std::vector<float>& weight_scales = weight_tensor->quantization->scale;

  if (weight_scales.size() != static_cast<size_t>(channel_dim_size)) {
    error_reporter->Report("Mismatched weight scale dimension: %d",
                           static_cast<int>(weight_scales.size()));
    return kTfLiteError;
  }

  // Compute scales.
  std::vector<float> scales(channel_dim_size);
  for (size_t i = 0; i < channel_dim_size; i++) {
    scales[i] = input_tensor->quantization->scale[0] * weight_scales[i];
  }

  BufferT* buffer = model->buffers[tensor->buffer].get();
  float* float_data = reinterpret_cast<float*>(buffer->data.data());
  uint64_t num_elements;
  TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements));

  std::vector<int32_t> final_buffer(num_elements);
  const int32_t kScale = std::numeric_limits<int32_t>::max();

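  // Quantize each bias value to int32 and clamp it to [-INT32_MAX, INT32_MAX].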
  for (int32_t channel_idx = 0; channel_idx < channel_dim_size; channel_idx++) {
    float scaling_factor = scales[channel_idx];
    float scaling_factor_inv = (scaling_factor == 0) ? 0 : 1.0 / scaling_factor;
    const int32_t quantized_value = static_cast<int32_t>(
        TfLiteRound(float_data[channel_idx] * scaling_factor_inv));
    final_buffer[channel_idx] =
        std::min(kScale, std::max(-kScale, quantized_value));
  }

  // Set the buffers and output type.
  uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data());
  size_t buffer_size = num_elements * sizeof(int32_t);
  std::vector<int64_t> zero_point(scales.size(), 0);
  return AddQuantizationParams(scales, zero_point, channel_dim_index,
                               uint8_buffer, buffer_size, TensorType_INT32,
                               model, tensor);
}
}  // namespace

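// Asymmetrically quantizes an activation tensor: derives int8 scale and zero
// point from the tensor's recorded min/max and retypes it as INT8. Fails for
// tensors backed by constant data or missing min/max information; tensors
// that are already non-float are left untouched.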
TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeTensor(
    BuiltinOperator op_code, int32_t tensor_idx) {
  TensorT* tensor = subgraph_->tensors[tensor_idx].get();
  if (tensor->type != TensorType_FLOAT32) {
    return kTfLiteOk;
  }

  if (model_->buffers[tensor->buffer]->data.data() != nullptr) {
    return kTfLiteError;
  }
  if (!tensor->quantization || tensor->quantization->min.empty() ||
      tensor->quantization->max.empty()) {
    error_reporter_->Report(
        "Missing required min/max information for tensor_idx %d of operation: "
        "%s",
        tensor_idx, EnumNameBuiltinOperator(op_code));
    return kTfLiteError;
  }
  utils::GetAsymmetricQuantizationParams(
      tensor->quantization->min[0], tensor->quantization->max[0],
      std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(),
      tensor->quantization.get());
  tensor->type = TensorType_INT8;
  return kTfLiteOk;
}

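// Quantizes a Conv2D/DepthwiseConv2D op: the activation input (when it is a
// subgraph input) and the output are quantized asymmetrically, the weights
// symmetrically per output channel, and the optional bias to int32 using
// input_scale * per-channel weight scale.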
TfLiteStatus SubgraphQuantizer::QuantizeOpWithBias(BuiltinOperator op_code,
                                                   OperatorT* op) {
  auto op_tensor_info = GetInfoForOpWithBiasTensor(op_code);
  if (!op_tensor_info) {
    error_reporter_->Report("Cannot quantize op: %s",
                            EnumNameBuiltinOperator(op_code));
    return kTfLiteError;
  }

  // Conv/Depthwise conv have 2 inputs when there is no bias, 3 otherwise.
  if (op->inputs.size() != 2 && op->inputs.size() != 3) {
    return kTfLiteError;
  }
  auto input_tensor_idx = op->inputs[op_tensor_info->activation_input_index];
  if (IsSubgraphInput(input_tensor_idx)) {
    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, input_tensor_idx));
  }
  auto weights_tensor_idx = op->inputs[op_tensor_info->weights_input_index];

  TensorT* weights_tensor = subgraph_->tensors[weights_tensor_idx].get();
  int weights_channel_index = op_tensor_info->index_for_channel_in_weights;

  auto status = SymmetricPerChannelQuantizeTensor(
      model_, weights_tensor, weights_channel_index, error_reporter_);
  TF_LITE_ENSURE_STATUS(status);

  // If there is bias, quantize it.
  if (op->inputs.size() == 3) {
    auto bias_tensor_idx = op->inputs[op_tensor_info->bias_input_index];
    const TensorT* input_tensor = subgraph_->tensors[input_tensor_idx].get();
    TensorT* bias_tensor = subgraph_->tensors[bias_tensor_idx].get();
    TF_LITE_ENSURE_STATUS(SymmetricPerChannelBiasQuantize(
        input_tensor, weights_tensor, weights_channel_index, model_,
        bias_tensor, error_reporter_));
  }

  if (op->outputs.size() != 1) {
    return kTfLiteError;
  }
  auto output_tensor_idx = op->outputs[0];
  TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, output_tensor_idx));

  return kTfLiteOk;
}

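// Average/max pooling does not widen the range of its input values, so the
// output tensor reuses the input's quantization parameters (with min/max
// nudged to include zero) instead of being quantized independently.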
TfLiteStatus SubgraphQuantizer::PropagateMinMaxForAvgAndMaxPool(
    BuiltinOperator op_code, OperatorT* op) {
  TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1);

  if (IsSubgraphInput(op->inputs[0])) {
    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->inputs[0]));
  }

  auto output_tensor = subgraph_->tensors[op->outputs[0]].get();
  if (output_tensor->type != TensorType_FLOAT32) {
    return kTfLiteOk;
  }
  auto input_tensor = subgraph_->tensors[op->inputs[0]].get();
  if (!input_tensor->quantization) {
    error_reporter_->Report(
        "Missing required min/max information for input of operation: %s",
        EnumNameBuiltinOperator(op_code));
    return kTfLiteError;
  }
  if (input_tensor->quantization->min.size() != 1 ||
      input_tensor->quantization->max.size() != 1 ||
      input_tensor->quantization->scale.size() != 1 ||
      input_tensor->quantization->zero_point.size() != 1) {
    error_reporter_->Report(
        "Invalid quantization information for Op: %s, tensor: %s",
        EnumNameBuiltinOperator(op_code), input_tensor->name.c_str());
    return kTfLiteError;
  }
  auto quant_params = absl::make_unique<QuantizationParametersT>();
  // Nudge min, max to include the floating point zero.
  const float min = std::min(0.f, input_tensor->quantization->min[0]);
  const float max = std::max(0.f, input_tensor->quantization->max[0]);
  quant_params->min.push_back(min);
  quant_params->max.push_back(max);
  quant_params->scale.push_back(input_tensor->quantization->scale[0]);
  quant_params->zero_point.push_back(input_tensor->quantization->zero_point[0]);
  // TODO(shashishekhar): Log a warning here if the existing min/max/scale
  // being overridden differs from the input scale.
  output_tensor->quantization = std::move(quant_params);
  output_tensor->type = TensorType_INT8;
  return kTfLiteOk;
}

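// Quantizes softmax: the input (when it is a subgraph input) gets asymmetric
// int8 parameters from its recorded min/max, while the output uses the fixed
// scale 1/256 and zero point -128, mapping the int8 range [-128, 127] onto
// softmax's [0, 1) output range.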
TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeSoftmax(
    BuiltinOperator op_code, OperatorT* op) {
  TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1);
  TF_LITE_ENSURE_EQ(this->error_reporter_, op->outputs.size(), 1);

  if (IsSubgraphInput(op->inputs[0])) {
    TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->inputs[0]));
  }

  auto output_tensor = subgraph_->tensors[op->outputs[0]].get();
  if (output_tensor->type != TensorType_FLOAT32) {
    return kTfLiteOk;
  }

  // Softmax output is hardcoded to have 1/256 as scale and -128 as zero point.
  output_tensor->type = TensorType_INT8;
  output_tensor->quantization->scale = {1.0f / 256.0f};
  output_tensor->quantization->zero_point = {-128};
  return kTfLiteOk;
}

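// Asymmetrically quantizes any float subgraph inputs feeding this op and all
// of its float outputs.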
TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeInputsAndOutputs(
    BuiltinOperator op_code, OperatorT* op) {
  TF_LITE_ENSURE(this->error_reporter_, !op->inputs.empty());
  TF_LITE_ENSURE(this->error_reporter_, !op->outputs.empty());
  for (size_t input_idx = 0; input_idx < op->inputs.size(); ++input_idx) {
    auto input_tensor = subgraph_->tensors[op->inputs[input_idx]].get();
    if (IsSubgraphInput(op->inputs[input_idx]) &&
        input_tensor->type == TensorType_FLOAT32) {
      TF_LITE_ENSURE_STATUS(
          AsymmetricQuantizeTensor(op_code, op->inputs[input_idx]));
    }
  }

  for (size_t output_idx = 0; output_idx < op->outputs.size(); ++output_idx) {
    auto output_tensor = subgraph_->tensors[op->outputs[output_idx]].get();
    if (output_tensor->type == TensorType_FLOAT32) {
      TF_LITE_ENSURE_STATUS(
          AsymmetricQuantizeTensor(op_code, op->outputs[output_idx]));
    }
  }
  return kTfLiteOk;
}

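// Returns true if tensor_idx is one of the subgraph's input tensors.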
bool SubgraphQuantizer::IsSubgraphInput(int32_t tensor_idx) const {
  return std::find(subgraph_->inputs.begin(), subgraph_->inputs.end(),
                   tensor_idx) != subgraph_->inputs.end();
}

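// Dispatches quantization of a single operator based on its builtin code;
// unsupported ops return an error.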
TfLiteStatus SubgraphQuantizer::QuantizeOperator(int op_idx) {
  OperatorT* op = subgraph_->operators[op_idx].get();
  const BuiltinOperator op_code =
      model_->operator_codes[op->opcode_index]->builtin_code;
  if (OpHasOptionalBiasTensor(op_code)) {
    return QuantizeOpWithBias(op_code, op);
  }
  switch (op_code) {
    case BuiltinOperator_AVERAGE_POOL_2D:
    case BuiltinOperator_MAX_POOL_2D:
      return PropagateMinMaxForAvgAndMaxPool(op_code, op);
    case BuiltinOperator_SQUEEZE:
    case BuiltinOperator_RESHAPE:
    case BuiltinOperator_ADD:
      return AsymmetricQuantizeInputsAndOutputs(op_code, op);
    case BuiltinOperator_SOFTMAX:
      return AsymmetricQuantizeSoftmax(op_code, op);
    default:
      return kTfLiteError;
  }

  return kTfLiteError;
}

}  // namespace internal
}  // namespace optimize
}  // namespace tflite