1 /* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 #include "tensorflow/lite/tools/optimize/subgraph_quantizer.h" 16 17 #include <algorithm> 18 #include <limits> 19 20 #include "flatbuffers/flexbuffers.h" 21 #include "absl/memory/memory.h" 22 #include "tensorflow/lite/context.h" 23 #include "tensorflow/lite/core/api/error_reporter.h" 24 #include "tensorflow/lite/kernels/internal/round.h" 25 #include "tensorflow/lite/kernels/internal/tensor_utils.h" 26 #include "tensorflow/lite/kernels/internal/types.h" 27 #include "tensorflow/lite/model.h" 28 #include "tensorflow/lite/schema/schema_generated.h" 29 #include "tensorflow/lite/tools/optimize/quantization_utils.h" 30 31 namespace tflite { 32 namespace optimize { 33 namespace internal { 34 35 namespace { 36 TfLiteStatus AddQuantizationParams(const std::vector<float>& scales, 37 const std::vector<int64_t>& zero_point, 38 int quantized_dimension, 39 const uint8_t* buffer_data, 40 size_t buffer_size, TensorType output_type, 41 ModelT* model, TensorT* tensor) { 42 tensor->quantization = absl::make_unique<QuantizationParametersT>(); 43 tensor->quantization->scale.assign(scales.begin(), scales.end()); 44 if (zero_point.size() != scales.size()) { 45 return kTfLiteError; 46 } 47 tensor->quantization->zero_point.assign(zero_point.begin(), zero_point.end()); 48 tensor->quantization->quantized_dimension = quantized_dimension; 49 model->buffers[tensor->buffer]->data.assign(buffer_data, 50 buffer_data + buffer_size); 51 // Update the tensor type. 52 tensor->type = output_type; 53 return kTfLiteOk; 54 } 55 56 bool OpHasOptionalBiasTensor(BuiltinOperator op_code) { 57 return op_code == BuiltinOperator_CONV_2D || 58 op_code == BuiltinOperator_DEPTHWISE_CONV_2D; 59 } 60 61 struct OpWithBiasTensors { 62 int activation_input_index; 63 int weights_input_index; 64 int bias_input_index; 65 int index_for_channel_in_weights; 66 }; 67 68 const OpWithBiasTensors* GetInfoForOpWithBiasTensor(BuiltinOperator op_code) { 69 if (op_code == BuiltinOperator_CONV_2D) { 70 static OpWithBiasTensors op_info = {/* activation_input_index */ 0, 71 /* weights_input_index */ 1, 72 /* bias_input_index */ 2, 73 /* index_for_channel_in_weights */ 0}; 74 return &op_info; 75 } 76 if (op_code == BuiltinOperator_DEPTHWISE_CONV_2D) { 77 static OpWithBiasTensors op_info = {/* bias_input_index */ 0, 78 /* bias_input_index */ 1, 79 /* bias_input_index */ 2, 80 /* index_for_channel_in_weights */ 3}; 81 return &op_info; 82 } 83 84 return nullptr; 85 } 86 87 // Symmetrically Quantizes the given tensor as int8 values. 88 TfLiteStatus SymmetricPerChannelQuantizeTensor(ModelT* model, TensorT* tensor, 89 int32_t channel_dim_index, 90 ErrorReporter* error_reporter) { 91 if (tensor->shape.size() != 4) { 92 error_reporter->Report("Only dims=4 is supported, tensor dims: %d", 93 tensor->shape.size()); 94 return kTfLiteError; 95 } 96 97 // Get dimensions. 98 uint64_t num_elements; 99 TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements)); 100 const int32_t channel_dim_size = tensor->shape[channel_dim_index]; 101 102 // Get input float data. 103 BufferT* buffer = model->buffers[tensor->buffer].get(); 104 float* float_input_data = reinterpret_cast<float*>(buffer->data.data()); 105 106 // Create container for output scale and output data. 107 std::vector<float> scales(channel_dim_size); 108 std::vector<int8_t> final_buffer(num_elements); 109 110 // Quantize the input data with respect to channel_dim_index. 111 const std::vector<int> tensor_dims = {tensor->shape[0], tensor->shape[1], 112 tensor->shape[2], tensor->shape[3]}; 113 utils::SymmetricPerChannelQuantization( 114 float_input_data, tensor_dims, channel_dim_index, &scales, &final_buffer); 115 116 // Set the buffers and output type. 117 uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data()); 118 const size_t buffer_size = num_elements * sizeof(int8_t); 119 std::vector<int64_t> zero_point(scales.size(), 0); 120 return AddQuantizationParams(scales, zero_point, channel_dim_index, 121 uint8_buffer, buffer_size, TensorType_INT8, 122 model, tensor); 123 } 124 125 // Symmetrically quantizes the bias for ops like Conv and DepthwiseConv. 126 // The scale of bias if weight_per_channel_scale[channel] * input_scale 127 TfLiteStatus SymmetricPerChannelBiasQuantize(const TensorT* input_tensor, 128 const TensorT* weight_tensor, 129 int channel_dim_index, 130 ModelT* model, TensorT* tensor, 131 ErrorReporter* error_reporter) { 132 if (tensor->shape.size() != 1) { 133 error_reporter->Report("Expected bias tensor shape to be 1."); 134 return kTfLiteError; 135 } 136 137 if (tensor->type != TensorType_FLOAT32) { 138 return kTfLiteOk; 139 } 140 141 // TODO(shashishekhar): Make this support scalar biases. 142 if (tensor->shape[0] != weight_tensor->shape[channel_dim_index]) { 143 error_reporter->Report( 144 "Channel mismatch between bias and weight tensors %d vs %d", 145 tensor->shape[0], weight_tensor->shape[channel_dim_index]); 146 return kTfLiteError; 147 } 148 int32_t channel_dim_size = tensor->shape[0]; 149 if (!input_tensor->quantization || 150 input_tensor->quantization->scale.size() != 1) { 151 error_reporter->Report("Input tensor missing quantization information"); 152 return kTfLiteError; 153 } 154 TF_LITE_ENSURE(error_reporter, weight_tensor->quantization); 155 const std::vector<float>& weight_scales = weight_tensor->quantization->scale; 156 157 if (weight_scales.size() != channel_dim_size) { 158 error_reporter->Report("Mismatch weight scale dimension: %d", 159 weight_scales.size()); 160 return kTfLiteError; 161 } 162 163 // Compute scales. 164 std::vector<float> scales(channel_dim_size); 165 for (size_t i = 0; i < channel_dim_size; i++) { 166 scales[i] = input_tensor->quantization->scale[0] * weight_scales[i]; 167 } 168 169 BufferT* buffer = model->buffers[tensor->buffer].get(); 170 float* float_data = reinterpret_cast<float*>(buffer->data.data()); 171 uint64_t num_elements; 172 TF_LITE_ENSURE_STATUS(utils::NumElements(*tensor, &num_elements)); 173 174 std::vector<int32_t> final_buffer(num_elements); 175 const int32_t kScale = std::numeric_limits<int32_t>::max(); 176 177 for (int32_t channel_idx = 0; channel_idx < channel_dim_size; channel_idx++) { 178 float scaling_factor = scales[channel_idx]; 179 float scaling_factor_inv = (scaling_factor == 0) ? 0 : 1.0 / scaling_factor; 180 const int32_t quantized_value = static_cast<int32_t>( 181 TfLiteRound(float_data[channel_idx] * scaling_factor_inv)); 182 final_buffer[channel_idx] = 183 std::min(kScale, std::max(-kScale, quantized_value)); 184 } 185 186 // Set the buffers and output type. 187 uint8_t* uint8_buffer = reinterpret_cast<uint8_t*>(final_buffer.data()); 188 size_t buffer_size = num_elements * sizeof(int32_t); 189 std::vector<int64_t> zero_point(scales.size(), 0); 190 return AddQuantizationParams(scales, zero_point, channel_dim_index, 191 uint8_buffer, buffer_size, TensorType_INT32, 192 model, tensor); 193 } 194 } // namespace 195 196 TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeTensor( 197 BuiltinOperator op_code, int32_t tensor_idx) { 198 TensorT* tensor = subgraph_->tensors[tensor_idx].get(); 199 if (tensor->type != TensorType_FLOAT32) { 200 return kTfLiteOk; 201 } 202 203 if (model_->buffers[tensor->buffer]->data.data() != nullptr) { 204 return kTfLiteError; 205 } 206 if (!tensor->quantization || tensor->quantization->min.empty() || 207 tensor->quantization->max.empty()) { 208 error_reporter_->Report( 209 "Missing required min/max information for tensor_idx %d of operation: " 210 "%s", 211 tensor_idx, EnumNameBuiltinOperator(op_code)); 212 return kTfLiteError; 213 } 214 utils::GetAsymmetricQuantizationParams( 215 tensor->quantization->min[0], tensor->quantization->max[0], 216 std::numeric_limits<int8_t>::min(), std::numeric_limits<int8_t>::max(), 217 tensor->quantization.get()); 218 tensor->type = TensorType_INT8; 219 return kTfLiteOk; 220 } 221 222 TfLiteStatus SubgraphQuantizer::QuantizeOpWithBias(BuiltinOperator op_code, 223 OperatorT* op) { 224 auto op_tensor_info = GetInfoForOpWithBiasTensor(op_code); 225 if (!op_tensor_info) { 226 error_reporter_->Report("Cannot quantize op: %s", 227 EnumNameBuiltinOperator(op_code)); 228 return kTfLiteError; 229 } 230 231 // Conv/Depthwise conv have 2 inputs when there is no bias, 3 otherwise. 232 if (op->inputs.size() != 2 && op->inputs.size() != 3) { 233 return kTfLiteError; 234 } 235 auto input_tensor_idx = op->inputs[op_tensor_info->activation_input_index]; 236 if (IsSubgraphInput(input_tensor_idx)) { 237 TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, input_tensor_idx)); 238 } 239 auto weights_tensor_idx = op->inputs[op_tensor_info->weights_input_index]; 240 241 TensorT* weights_tensor = subgraph_->tensors[weights_tensor_idx].get(); 242 int weights_channel_index = op_tensor_info->index_for_channel_in_weights; 243 244 auto status = SymmetricPerChannelQuantizeTensor( 245 model_, weights_tensor, weights_channel_index, error_reporter_); 246 TF_LITE_ENSURE_STATUS(status); 247 248 // If there is bias, quantize it. 249 if (op->inputs.size() == 3) { 250 auto bias_tensor_idx = op->inputs[op_tensor_info->bias_input_index]; 251 const TensorT* input_tensor = subgraph_->tensors[input_tensor_idx].get(); 252 TensorT* bias_tensor = subgraph_->tensors[bias_tensor_idx].get(); 253 TF_LITE_ENSURE_STATUS(SymmetricPerChannelBiasQuantize( 254 input_tensor, weights_tensor, weights_channel_index, model_, 255 bias_tensor, error_reporter_)); 256 } 257 258 if (op->outputs.size() != 1) { 259 return kTfLiteError; 260 } 261 auto output_tensor_idx = op->outputs[0]; 262 TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, output_tensor_idx)); 263 264 return kTfLiteOk; 265 } 266 267 TfLiteStatus SubgraphQuantizer::PropagateMinMaxForAvgAndMaxPool( 268 BuiltinOperator op_code, OperatorT* op) { 269 TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1); 270 271 if (IsSubgraphInput(op->inputs[0])) { 272 TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->inputs[0])); 273 } 274 275 auto output_tensor = subgraph_->tensors[op->outputs[0]].get(); 276 if (output_tensor->type != TensorType_FLOAT32) { 277 return kTfLiteOk; 278 } 279 auto input_tensor = subgraph_->tensors[op->inputs[0]].get(); 280 if (!input_tensor->quantization) { 281 error_reporter_->Report( 282 "Missing required min/max information for input of operation: %s", 283 EnumNameBuiltinOperator(op_code)); 284 return kTfLiteError; 285 } 286 if (input_tensor->quantization->min.size() != 1 || 287 input_tensor->quantization->max.size() != 1 || 288 input_tensor->quantization->scale.size() != 1 || 289 input_tensor->quantization->zero_point.size() != 1) { 290 error_reporter_->Report( 291 "Invalid quantization information for Op: %s, tensor: %s", 292 EnumNameBuiltinOperator(op_code), input_tensor->name.c_str()); 293 return kTfLiteError; 294 } 295 auto quant_params = absl::make_unique<QuantizationParametersT>(); 296 // Nudge min, max to include the floating point zero. 297 const float min = std::min(0.f, input_tensor->quantization->min[0]); 298 const float max = std::max(0.f, input_tensor->quantization->max[0]); 299 quant_params->min.push_back(min); 300 quant_params->max.push_back(max); 301 quant_params->scale.push_back(input_tensor->quantization->scale[0]); 302 quant_params->zero_point.push_back(input_tensor->quantization->zero_point[0]); 303 // TODO(shashishekhar): Log a warning here if overriding existing 304 // min/max/scales differ from input scales. 305 output_tensor->quantization = std::move(quant_params); 306 output_tensor->type = TensorType_INT8; 307 return kTfLiteOk; 308 } 309 310 TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeSoftmax( 311 BuiltinOperator op_code, OperatorT* op) { 312 TF_LITE_ENSURE_EQ(this->error_reporter_, op->inputs.size(), 1); 313 TF_LITE_ENSURE_EQ(this->error_reporter_, op->outputs.size(), 1); 314 315 if (IsSubgraphInput(op->inputs[0])) { 316 TF_LITE_ENSURE_STATUS(AsymmetricQuantizeTensor(op_code, op->inputs[0])); 317 } 318 319 auto output_tensor = subgraph_->tensors[op->outputs[0]].get(); 320 if (output_tensor->type != TensorType_FLOAT32) { 321 return kTfLiteOk; 322 } 323 324 // Softmax output is hardcoded to have 1/256 as scale and -128 as zero point. 325 output_tensor->type = TensorType_INT8; 326 output_tensor->quantization->scale = {1.0f / 256.0f}; 327 output_tensor->quantization->zero_point = {-128}; 328 return kTfLiteOk; 329 } 330 331 TfLiteStatus SubgraphQuantizer::AsymmetricQuantizeInputsAndOutputs( 332 BuiltinOperator op_code, OperatorT* op) { 333 TF_LITE_ENSURE(this->error_reporter_, !op->inputs.empty()); 334 TF_LITE_ENSURE(this->error_reporter_, !op->outputs.empty()); 335 for (size_t input_idx = 0; input_idx < op->inputs.size(); ++input_idx) { 336 auto input_tensor = subgraph_->tensors[op->inputs[input_idx]].get(); 337 if (IsSubgraphInput(op->inputs[input_idx]) && 338 input_tensor->type == TensorType_FLOAT32) { 339 TF_LITE_ENSURE_STATUS( 340 AsymmetricQuantizeTensor(op_code, op->inputs[input_idx])); 341 } 342 } 343 344 for (size_t output_idx = 0; output_idx < op->outputs.size(); ++output_idx) { 345 auto output_tensor = subgraph_->tensors[op->outputs[output_idx]].get(); 346 if (output_tensor->type == TensorType_FLOAT32) { 347 TF_LITE_ENSURE_STATUS( 348 AsymmetricQuantizeTensor(op_code, op->outputs[output_idx])); 349 } 350 } 351 return kTfLiteOk; 352 } 353 354 bool SubgraphQuantizer::IsSubgraphInput(int32_t tensor_idx) const { 355 return std::find(subgraph_->inputs.begin(), subgraph_->inputs.end(), 356 tensor_idx) != subgraph_->inputs.end(); 357 } 358 359 TfLiteStatus SubgraphQuantizer::QuantizeOperator(int op_idx) { 360 OperatorT* op = subgraph_->operators[op_idx].get(); 361 const BuiltinOperator op_code = 362 model_->operator_codes[op->opcode_index]->builtin_code; 363 if (OpHasOptionalBiasTensor(op_code)) { 364 return QuantizeOpWithBias(op_code, op); 365 } 366 switch (op_code) { 367 case BuiltinOperator_AVERAGE_POOL_2D: 368 case BuiltinOperator_MAX_POOL_2D: 369 return PropagateMinMaxForAvgAndMaxPool(op_code, op); 370 case BuiltinOperator_SQUEEZE: 371 case BuiltinOperator_RESHAPE: 372 case BuiltinOperator_ADD: 373 return AsymmetricQuantizeInputsAndOutputs(op_code, op); 374 case BuiltinOperator_SOFTMAX: 375 return AsymmetricQuantizeSoftmax(op_code, op); 376 default: 377 return kTfLiteError; 378 } 379 380 return kTfLiteError; 381 } 382 383 } // namespace internal 384 } // namespace optimize 385 } // namespace tflite 386