1 // Example code illustrating the theory exposed in doc/quantization.md 2 3 /* Command line to build and run on x86: 4 5 c++ doc/quantization_example.cc -I . --std=c++11 -msse4.1 -lpthread \ 6 -o /tmp/quantization_example && \ 7 /tmp/quantization_example 8 9 */ 10 11 #include <algorithm> 12 #include <cassert> 13 #include <cmath> 14 #include <cstdint> 15 #include <iostream> 16 #include <random> 17 #include <vector> 18 #include "../public/gemmlowp.h" 19 #include "../public/output_stages.h" 20 21 // We will handle both float and quantized matrices, which we will 22 // represent as gemmlowp::MatrixMap. 23 // We will need to be able to print them. 24 25 // Output a matrix to a std::ostream 26 template <typename tScalar, gemmlowp::MapOrder tOrder> 27 std::ostream& operator<<(std::ostream& s, 28 const gemmlowp::MatrixMap<tScalar, tOrder>& m) { 29 for (int i = 0; i < m.rows(); i++) { 30 for (int j = 0; j < m.cols(); j++) { 31 if (j) { 32 s << '\t'; 33 } 34 s << static_cast<float>(m(i, j)); 35 } 36 s << '\n'; 37 } 38 return s; 39 } 40 41 // Find the min and max value in a float matrix. 42 template <gemmlowp::MapOrder tOrder> 43 void FindMinMax(const gemmlowp::MatrixMap<float, tOrder>& m, float* min, 44 float* max) { 45 *min = *max = m(0, 0); 46 for (int i = 0; i < m.rows(); i++) { 47 for (int j = 0; j < m.cols(); j++) { 48 const float val = m(i, j); 49 *min = std::min(*min, val); 50 *max = std::max(*max, val); 51 } 52 } 53 } 54 55 // A structure to hold quantization parameters 'scale' and 'zero_point' 56 // as discussed in doc/quantization.md. As explained there, the meaning 57 // of these values is as the constants in the quantization equation 58 // 59 // real_value = scale * (quantized_value - zero_point) 60 // 61 // In other words, 'zero_point' is the quantized value that corresponds 62 // to the real value 0, and 'scale' is the difference of real values 63 // corresponding to consecutive quantized values. 64 struct QuantizationParams { 65 float scale; 66 std::uint8_t zero_point; 67 }; 68 69 // Given the min and max values of a float array, return 70 // reasonable quantization parameters to use for this array. 71 QuantizationParams ChooseQuantizationParams(float min, float max) { 72 // We extend the [min, max] interval to ensure that it contains 0. 73 // Otherwise, we would not meet the requirement that 0 be an exactly 74 // representable value. 75 min = std::min(min, 0.f); 76 max = std::max(max, 0.f); 77 78 // the min and max quantized values, as floating-point values 79 const float qmin = 0; 80 const float qmax = 255; 81 82 // First determine the scale. 83 const double scale = (max - min) / (qmax - qmin); 84 85 // Zero-point computation. 86 // First the initial floating-point computation. The zero-point can be 87 // determined from solving an affine equation for any known pair 88 // (real value, corresponding quantized value). 89 // We know two such pairs: (rmin, qmin) and (rmax, qmax). 90 // Let's use the first one here. 91 const double initial_zero_point = qmin - min / scale; 92 93 // Now we need to nudge the zero point to be an integer 94 // (our zero points are integer, and this is motivated by the requirement 95 // to be able to represent the real value "0" exactly as a quantized value, 96 // which is required in multiple places, for example in Im2col with SAME 97 // padding). 98 std::uint8_t nudged_zero_point = 0; 99 if (initial_zero_point < qmin) { 100 nudged_zero_point = qmin; 101 } else if (initial_zero_point > qmax) { 102 nudged_zero_point = qmax; 103 } else { 104 nudged_zero_point = 105 static_cast<std::uint8_t>(std::round(initial_zero_point)); 106 } 107 108 QuantizationParams result; 109 result.scale = scale; 110 result.zero_point = nudged_zero_point; 111 return result; 112 } 113 114 template <gemmlowp::MapOrder tLhsOrder, gemmlowp::MapOrder tRhsOrder, 115 gemmlowp::MapOrder tResultOrder> 116 void FloatMatrixMultiplication( 117 const gemmlowp::MatrixMap<const float, tLhsOrder>& lhs, 118 const gemmlowp::MatrixMap<const float, tRhsOrder>& rhs, 119 gemmlowp::MatrixMap<float, tResultOrder>* result) { 120 assert(lhs.cols() == rhs.rows()); 121 assert(lhs.rows() == result->rows()); 122 assert(rhs.cols() == result->cols()); 123 for (int i = 0; i < lhs.rows(); i++) { 124 for (int k = 0; k < rhs.cols(); k++) { 125 (*result)(i, k) = 0; 126 for (int j = 0; j < lhs.cols(); j++) { 127 (*result)(i, k) += lhs(i, j) * rhs(j, k); 128 } 129 } 130 } 131 } 132 133 void Quantize(const QuantizationParams& qparams, const std::vector<float>& src, 134 std::vector<std::uint8_t>* dst) { 135 assert(src.size() == dst->size()); 136 for (std::size_t i = 0; i < src.size(); i++) { 137 const float real_val = src[i]; 138 const float transformed_val = qparams.zero_point + real_val / qparams.scale; 139 const float clamped_val = std::max(0.f, std::min(255.f, transformed_val)); 140 (*dst)[i] = static_cast<std::uint8_t>(std::round(clamped_val)); 141 } 142 } 143 144 void Dequantize(const QuantizationParams& qparams, 145 const std::vector<std::uint8_t>& src, std::vector<float>* dst) { 146 assert(src.size() == dst->size()); 147 for (std::size_t i = 0; i < src.size(); i++) { 148 const std::uint8_t quantized_val = src[i]; 149 (*dst)[i] = qparams.scale * (quantized_val - qparams.zero_point); 150 } 151 } 152 153 template <typename tScalar, gemmlowp::MapOrder tOrder> 154 class MatrixWithStorage { 155 public: 156 MatrixWithStorage(int rows, int cols) 157 : storage(rows * cols), matrix_map(storage.data(), rows, cols) {} 158 void MakeRandom() { 159 static std::mt19937 random_engine; 160 std::uniform_real_distribution<float> distribution(-1, 1); 161 for (auto& x : storage) { 162 x = static_cast<tScalar>(distribution(random_engine)); 163 } 164 } 165 gemmlowp::MatrixMap<const tScalar, tOrder> ConstMap() const { 166 return gemmlowp::MatrixMap<const tScalar, tOrder>( 167 storage.data(), matrix_map.rows(), matrix_map.cols()); 168 } 169 gemmlowp::MatrixMap<tScalar, tOrder> Map() { 170 return gemmlowp::MatrixMap<tScalar, tOrder>( 171 storage.data(), matrix_map.rows(), matrix_map.cols()); 172 } 173 const std::vector<tScalar>& Storage() const { return storage; } 174 std::vector<tScalar>& Storage() { return storage; } 175 176 private: 177 std::vector<tScalar> storage; 178 gemmlowp::MatrixMap<tScalar, tOrder> matrix_map; 179 }; 180 181 template <typename tScalar, gemmlowp::MapOrder tOrder> 182 std::ostream& operator<<(std::ostream& s, 183 const MatrixWithStorage<tScalar, tOrder>& m) { 184 return s << m.ConstMap(); 185 } 186 187 // Given a real_multiplier in the interval (0, 1), 188 // produces a pair (quantized_multiplier, right_shift) where 189 // quantized_multiplier is an int32 representing a fixed-point value 190 // in the interval [-1, 1) (in practice we only produce positive values) 191 // and right_shift is an amount to shift right by, so that the 192 // floating-point multiplication of some int32 input value by real_multiplier, 193 // 194 // return static_cast<int32>(int32_value * real_multiplier); 195 // 196 // is best approximated by the integer-arithmetic-only code 197 // 198 // return RoundingRightShift( 199 // FixedPointMultiplication(int32_value, quantized_multiplier), 200 // right_shift); 201 // 202 // This is how to obtain the fixed-point multiplier and right shift 203 // parameters to pass to 204 // OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint. 205 // 206 // Note: all this code only needs to run offline to generate the quantized 207 // neural network workload, not at runtime on the 208 // device on which quantized neural networks need to run. So it's not 209 // performance-critical at all. 210 void QuantizeMultiplierSmallerThanOne(float real_multiplier, 211 std::int32_t* quantized_multiplier, 212 int* right_shift) { 213 assert(real_multiplier > 0.f); 214 assert(real_multiplier < 1.f); 215 int s = 0; 216 // We want to bring the real multiplier into the interval [1/2, 1). 217 // We can do so by multiplying it by two, and recording how many times 218 // we multiplied by two so that we can compensate that by a right 219 // shift by the same amount. 220 while (real_multiplier < 0.5f) { 221 real_multiplier *= 2.0f; 222 s++; 223 } 224 // Now that the real multiplier is in [1/2, 1), we convert it 225 // into a fixed-point number. 226 std::int64_t q = 227 static_cast<std::int64_t>(std::round(real_multiplier * (1ll << 31))); 228 assert(q <= (1ll << 31)); 229 // Handle the special case when the real multiplier was so close to 1 230 // that its fixed-point approximation was undistinguishable from 1. 231 // We handle this by dividing it by two, and remembering to decrement 232 // the right shift amount. 233 if (q == (1ll << 31)) { 234 q /= 2; 235 s--; 236 } 237 assert(s >= 0); 238 assert(q <= std::numeric_limits<std::int32_t>::max()); 239 *quantized_multiplier = static_cast<std::int32_t>(q); 240 *right_shift = s; 241 } 242 243 int main() { 244 std::cout.precision(3); 245 246 const int rows = 2; 247 const int depth = 4; 248 const int cols = 3; 249 const auto kOrder = gemmlowp::MapOrder::ColMajor; 250 251 std::cout << "First, let us make some float matrices LHS and RHS, " 252 << "and compute their product.\n" 253 << std::endl; 254 MatrixWithStorage<float, kOrder> float_lhs(rows, depth); 255 float_lhs.MakeRandom(); 256 MatrixWithStorage<float, kOrder> float_rhs(depth, cols); 257 float_rhs.MakeRandom(); 258 MatrixWithStorage<float, kOrder> reference_float_result(rows, cols); 259 auto reference_float_result_map = reference_float_result.Map(); 260 FloatMatrixMultiplication(float_lhs.ConstMap(), float_rhs.ConstMap(), 261 &reference_float_result_map); 262 std::cout << "Here is the float LHS matrix:\n" << float_lhs << std::endl; 263 std::cout << "Here is the float RHS matrix:\n" << float_rhs << std::endl; 264 std::cout << "Here is the float product (LHS * RHS) matrix obtained by " 265 << "ordinary float matrix multiplication, i.e. as far as we are " 266 << "concerned, the REFERENCE RESULT:\n" 267 << reference_float_result << std::endl; 268 269 std::cout 270 << "Now we embark on reproducing this result using " 271 << "quantized arithmetic. The code below splits into two parts: " 272 << "quantization code that only needs to run offline (e.g. to " 273 << "generate a quantized neural network workload), and actual " 274 << "runtime quantized code, which is typically performance-critical " 275 << "and where we typically do not want to use any floating-point " 276 << "arithmetic. We want to clearly distinguish between the two.\n" 277 << std::endl; 278 279 std::cout << "The below is OFFLINE QUANTIZATION CODE. We still use some " 280 << "floating-point arithmetic in the process of generating the " 281 << "quantized workload to be run on-device.\n" 282 << std::endl; 283 284 std::cout 285 << "Now, let us choose quantization parameters for these matrices. " 286 << "You might ask, what good is quantization if we need to pick " 287 << "quantization parameters for the result before we can run the " 288 << "quantized computation to obtain the result? The idea is that we " 289 << "target applications such as neural networks, where unknown results " 290 << "are only allowed to vary within preexisting bounds. In practice, the " 291 << "bounds for the results are typically learned during the neural " 292 "network " 293 << "training process. The min and max of the result do not have to be " 294 << "exact. If they are too broad, we just get lower quantization " 295 "accuracy. " 296 << "If they are too narrow, we just get clamping at the bounds.\n" 297 << std::endl; 298 299 float lhs_min, lhs_max, rhs_min, rhs_max, result_min, result_max; 300 FindMinMax(float_lhs.Map(), &lhs_min, &lhs_max); 301 FindMinMax(float_rhs.Map(), &rhs_min, &rhs_max); 302 FindMinMax(reference_float_result.Map(), &result_min, &result_max); 303 const auto lhs_qparams = ChooseQuantizationParams(lhs_min, lhs_max); 304 const auto rhs_qparams = ChooseQuantizationParams(rhs_min, rhs_max); 305 const auto result_qparams = ChooseQuantizationParams(result_min, result_max); 306 307 std::cout << "For LHS, we have min = " << lhs_min << ", max = " << lhs_max 308 << ", scale = " << lhs_qparams.scale 309 << ", zero_point = " << static_cast<float>(lhs_qparams.zero_point) 310 << std::endl; 311 std::cout << "For RHS, we have min = " << rhs_min << ", max = " << rhs_max 312 << ", scale = " << rhs_qparams.scale 313 << ", zero_point = " << static_cast<float>(rhs_qparams.zero_point) 314 << std::endl; 315 std::cout << "For the result, we have min = " << result_min 316 << ", max = " << result_max << ", scale = " << result_qparams.scale 317 << ", zero_point = " 318 << static_cast<float>(result_qparams.zero_point) << std::endl; 319 320 std::cout << std::endl; 321 322 MatrixWithStorage<std::uint8_t, kOrder> uint8_lhs(rows, depth); 323 MatrixWithStorage<std::uint8_t, kOrder> uint8_rhs(depth, cols); 324 MatrixWithStorage<std::uint8_t, kOrder> actual_uint8_result(rows, cols); 325 326 Quantize(lhs_qparams, float_lhs.Storage(), &uint8_lhs.Storage()); 327 Quantize(rhs_qparams, float_rhs.Storage(), &uint8_rhs.Storage()); 328 329 std::cout << "Quantized uint8 LHS matrix:\n" << uint8_lhs << std::endl; 330 std::cout << "Quantized uint8 RHS matrix:\n" << uint8_rhs << std::endl; 331 332 const int lhs_offset = -lhs_qparams.zero_point; 333 const int rhs_offset = -rhs_qparams.zero_point; 334 const int result_offset = result_qparams.zero_point; 335 336 const float real_multiplier = 337 lhs_qparams.scale * rhs_qparams.scale / result_qparams.scale; 338 std::int32_t quantized_multiplier; 339 int right_shift; 340 QuantizeMultiplierSmallerThanOne(real_multiplier, &quantized_multiplier, 341 &right_shift); 342 343 std::cout << "End of OFFLINE QUANTIZATION CODE.\n" << std::endl; 344 345 std::cout << "The below is ON-DEVICE RUNTIME QUANTIZED CODE. " 346 << "This is the part that is performance-critical and may only " 347 << "use quantized arithmetic.\n" 348 << std::endl; 349 350 gemmlowp::OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint 351 quantize_down_stage; 352 quantize_down_stage.result_offset_after_shift = result_offset; 353 quantize_down_stage.result_fixedpoint_multiplier = quantized_multiplier; 354 quantize_down_stage.result_shift = right_shift; 355 gemmlowp::OutputStageSaturatingCastToUint8 saturating_cast_stage; 356 const auto& output_pipeline = 357 std::make_tuple(quantize_down_stage, saturating_cast_stage); 358 359 auto actual_uint8_result_map = actual_uint8_result.Map(); 360 gemmlowp::GemmContext gemm_context; 361 gemmlowp::GemmWithOutputPipeline<std::uint8_t, std::uint8_t, 362 gemmlowp::DefaultL8R8BitDepthParams>( 363 &gemm_context, uint8_lhs.ConstMap(), uint8_rhs.ConstMap(), 364 &actual_uint8_result_map, lhs_offset, rhs_offset, output_pipeline); 365 366 std::cout << "Quantized uint8 result matrix obtained by quantized " 367 << "multiplication:\n" 368 << actual_uint8_result << std::endl; 369 370 std::cout << "End of ON-DEVICE RUNTIME QUANTIZED CODE.\n" << std::endl; 371 372 MatrixWithStorage<float, kOrder> actual_float_result(rows, cols); 373 Dequantize(result_qparams, actual_uint8_result.Storage(), 374 &actual_float_result.Storage()); 375 std::cout 376 << "Here is the actual float product (LHS * RHS) matrix obtained by " 377 << "dequantizing the above uint8 result, i.e. " 378 << "as far as we are concerned, the ACTUAL RESULT:\n" 379 << actual_float_result << std::endl; 380 381 MatrixWithStorage<float, kOrder> diff_float_result(rows, cols); 382 for (int i = 0; i < rows; i++) { 383 for (int j = 0; j < cols; j++) { 384 diff_float_result.Map()(i, j) = 385 actual_float_result.Map()(i, j) - reference_float_result.Map()(i, j); 386 } 387 } 388 389 std::cout << "Difference between ACTUAL and REFERENCE float results:\n" 390 << diff_float_result << std::endl; 391 }