// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino (at) gmail.com)

#include <assert.h>
#include <math.h>

#include "./vp8enci.h"
#include "./cost.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
#define USE_TDISTO 1

#define MID_ALPHA 64       // neutral value for susceptibility
#define MIN_ALPHA 30       // lowest usable value for susceptibility
#define MAX_ALPHA 100      // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9      // Scaling constant between the sns value and the QP
                           // power-law modulation. Must be strictly less than 1.

#define I4_PENALTY 4000    // Rate-penalty for quick i4/i16 decision

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static const uint8_t kDcTable[128] = {
  4,   5,   6,   7,   8,   9,   10,  10,
  11,  12,  13,  14,  15,  16,  17,  17,
  18,  19,  20,  20,  21,  21,  22,  22,
  23,  23,  24,  25,  25,  26,  27,  28,
  29,  30,  31,  32,  33,  34,  35,  36,
  37,  37,  38,  39,  40,  41,  42,  43,
  44,  45,  46,  46,  47,  48,  49,  50,
  51,  52,  53,  54,  55,  56,  57,  58,
  59,  60,  61,  62,  63,  64,  65,  66,
  67,  68,  69,  70,  71,  72,  73,  74,
  75,  76,  76,  77,  78,  79,  80,  81,
  82,  83,  84,  85,  86,  87,  88,  89,
  91,  93,  95,  96,  98,  100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4,   5,   6,   7,   8,   9,   10,  11,
  12,  13,  14,  15,  16,  17,  18,  19,
  20,  21,  22,  23,  24,  25,  26,  27,
  28,  29,  30,  31,  32,  33,  34,  35,
  36,  37,  38,  39,  40,  41,  42,  43,
  44,  45,  46,  47,  48,  49,  50,  51,
  52,  53,  54,  55,  56,  57,  58,  60,
  62,  64,  66,  68,  70,  72,  74,  76,
  78,  80,  82,  84,  86,  88,  90,  92,
  94,  96,  98,  100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};
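// Both tables map the 7-bit quantizer index to an actual quantizer step. As a
// rough reference point (values read directly from the tables above), a
// mid-range index of 24 yields a DC step of 23 and an AC step of 28; the
// tables grow nearly linearly at first and accelerate toward the high end.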
static const uint16_t kAcTable2[128] = {
  8,   8,   9,   10,  12,  13,  15,  17,
  18,  20,  21,  23,  24,  26,  27,  29,
  31,  32,  34,  35,  37,  38,  40,  41,
  43,  44,  46,  48,  49,  51,  52,  54,
  55,  57,  58,  60,  62,  63,  65,  66,
  68,  69,  71,  72,  74,  75,  77,  79,
  80,  82,  83,  85,  86,  88,  89,  93,
  96,  99,  102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint16_t kCoeffThresh[16] = {
  0,  10, 20, 30,
  10, 20, 30, 30,
  20, 30, 30, 30,
  30, 30, 30, 30
};

// TODO(skal): tune more. Coeff thresholding?
static const uint8_t kBiasMatrices[3][16] = {  // [3] = [luma-ac,luma-dc,chroma]
  { 96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96 },
  { 96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96 },
  { 96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis).
// Hack-ish but helpful for mid-bitrate range. Use with care.
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i;
  int sum = 0;
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
  }
  for (i = 0; i < 16; ++i) {
    const int j = kZigzag[i];
    const int bias = kBiasMatrices[type][j];
    m->iq_[j] = (1 << QFIX) / m->q_[j];
    m->bias_[j] = BIAS(bias);
    // TODO(skal): tune kCoeffThresh[]
    m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8;
    m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11;
    sum += m->q_[j];
  }
  return (sum + 8) >> 4;
}
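// A worked example of the fixed-point setup above, assuming QFIX == 17 and
// BIAS(b) == ((b) << (QFIX - 8)) as defined in vp8enci.h: for an AC step
// q_[j] = 28 and bias = 96, we get
//   iq_[j]      = (1 << 17) / 28               = 4681   (reciprocal, Q17)
//   bias_[j]    = 96 << 9                      = 49152  (rounding offset, Q17)
//   zthresh_[j] = ((256 - 96) * 28 + 127) >> 8 = 17
// i.e. any input coefficient with magnitude <= 17 (about 5/8 of the step)
// is known to quantize to zero and can be skipped outright.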
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q4, q16, quv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q4  = ExpandMatrix(&m->y1_, 0);
    q16 = ExpandMatrix(&m->y2_, 1);
    quv = ExpandMatrix(&m->uv_, 2);

    // TODO: Switch to kLambda*[] tables?
    {
      m->lambda_i4_          = (3 * q4 * q4) >> 7;
      m->lambda_i16_         = (3 * q16 * q16);
      m->lambda_uv_          = (3 * quv * quv) >> 6;
      m->lambda_mode_        = (1 * q4 * q4) >> 7;
      m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
      m->lambda_trellis_i16_ = (q16 * q16) >> 2;
      m->lambda_trellis_uv_  = (quv * quv) << 1;
      m->tlambda_            = (tlambda_scale * q4) >> 5;
    }
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 3

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  const int level0 = enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS)
    const int level = level0 * 256 * enc->dqm_[i].quant_ / 128;
    const int f = level / (256 + enc->dqm_[i].beta_);
    enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
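// For instance (a purely illustrative computation, with beta_ taken as 0):
// with filter_strength = 60 and a segment quantizer of 30, we get
//   level = 60 * 256 * 30 / 128 = 3600,  f = 3600 / 256 = 14,
// which lies inside [FSTRENGTH_CUTOFF, 63] and is kept as fstrength_ = 14.
// A near-lossless segment (quant_ around 4) would yield f = 1 and get
// filtering turned off entirely.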
//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a linear piecewise function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modelling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' between exp_min and exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}
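// To make the mapping concrete (a hand-computed example, not code from the
// encoder): quality = 75 gives Q = 0.75, hence linear_c = 0.75 * 2/3 = 0.50
// and c_base = pow(0.50, 1/3.) ~= 0.794. With a neutral segment (expn == 1.
// in VP8SetSegmentParams below) this lands at q = (int)(127 * (1 - 0.794))
// = 26, i.e. a mid-range quantizer index, as intended for "good" quality.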
static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  const int num_segments = enc->segment_hdr_.num_segments_;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {   // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetic)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are much more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;        // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, if needed

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
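// Example of the UV-delta computation above (hand-computed, for illustration):
// with uv_alpha_ = 100 and sns_strength = 50,
//   dq_uv_ac = (100 - 64) * (6 - (-4)) / (100 - 30) = 5, then 5 * 50 / 100 = 2
// after rescaling, well within [MIN_DQ_UV, MAX_DQ_UV]; and
//   dq_uv_dc = -4 * 50 / 100 = -2,
// i.e. chroma AC ends up quantized a bit more, and chroma DC a bit less,
// than the segment's base quantizer.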
//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const int VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+
// |YYYY| 0
// |YYYY| 4
// |YYYY| 8
// |YYYY| 12
// +----+
// |UUVV| 16
// |UUVV| 20
// +----+

const int VP8Scan[16 + 4 + 4] = {
  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,

  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis

typedef struct {
  int prev;        // best previous
  int level;       // level
  int sign;        // sign of coeff_i
  score_t cost;    // bit cost
  score_t error;   // distortion = sum of (|coeff_i| - level_i * Q_i)^2
  int ctx;         // context (only depends on 'level'. Could be spared.)
} Node;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  // TODO: incorporate the "* 256" in the tables?
  rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + 256 * distortion;
}
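// A note on scales (sketching the trade-off, not encoder-normative): rate and
// distortion live on different fixed-point scales, and lambda bridges them.
// With the formula above, a candidate that costs 50 extra rate units at
// lambda = 200 adds 50 * 200 = 10000 to the score, so it only wins if it
// removes at least 10000 / 256 ~= 39 units of (weighted) distortion.
// The trellis below builds a lattice of such scores: one column per zigzag
// position, NUM_NODES rows of candidate levels per column, and keeps the
// cheapest path ending at each possible end-of-block position.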
static int TrellisQuantizeBlock(const VP8EncIterator* const it,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type];
  CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[17][NUM_NODES];
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int best_node;
  int last = first - 1;
  int n, m, p, nz;

  {
    score_t cost;
    score_t max_error;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = last_costs[VP8EncBands[first]][ctx0][0];

    // compute maximal distortion.
    max_error = 0;
    for (n = first; n < 16; ++n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      max_error += kWeightTrellis[j] * err;
      if (err > thresh) last = n;
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, max_error);

    // initialize source node.
    n = first - 1;
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      NODE(n, m).cost = 0;
      NODE(n, m).error = max_error;
      NODE(n, m).ctx = ctx0;
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const int Q = mtx->q_[j];
    const int iQ = mtx->iq_[j];
    const int B = BIAS(0x00);   // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0;
    if (coeff0 > 2047) coeff0 = 2047;

    level0 = QUANTDIV(coeff0, iQ, B);
    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int delta_error, new_error;
      score_t cur_score = MAX_COST;
      int level = level0 + m;
      int last_proba;

      cur->sign = sign;
      cur->level = level;
      cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
      if (level >= 2048 || level < 0) {   // node is dead?
        cur->cost = MAX_COST;
        continue;
      }
      last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0];

      // Compute delta_error = how much coding this level will
      // subtract as distortion to max_error
      new_error = coeff0 - level * Q;
      delta_error =
          kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error);

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        const Node* const prev = &NODE(n - 1, p);
        const int prev_ctx = prev->ctx;
        const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx];
        const score_t total_error = prev->error - delta_error;
        score_t cost, base_cost, score;

        if (prev->cost >= MAX_COST) {   // dead node?
          continue;
        }

        // Base cost of both terminal/non-terminal
        base_cost = prev->cost + VP8LevelCost(tcost, level);

        // Examine node assuming it's a non-terminal one.
        cost = base_cost;
        if (level && n < 15) {
          cost += VP8BitCost(1, last_proba);
        }
        score = RDScoreTrellis(lambda, cost, total_error);
        if (score < cur_score) {
          cur_score = score;
          cur->cost = cost;
          cur->error = total_error;
          cur->prev = p;
        }

        // Now, record best terminal node (and thus best entry in the graph).
        if (level) {
          cost = base_cost;
          if (n < 15) cost += VP8BitCost(0, last_proba);
          score = RDScoreTrellis(lambda, cost, total_error);
          if (score < best_score) {
            best_score = score;
            best_path[0] = n;   // best eob position
            best_path[1] = m;   // best level
            best_path[2] = p;   // best predecessor
          }
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  // Unwind the best path.
  // Note: best-prev on terminal node is not necessarily equal to the
  // best_prev for non-terminal. So we patch best_path[2] in.
  n = best_path[0];
  best_node = best_path[1];
  NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
  nz = 0;

  for (; n >= first; --n) {
    const Node* const node = &NODE(n, best_node);
    const int j = kZigzag[n];
    out[n] = node->sign ? -node->level : node->level;
    nz |= (node->level != 0);
    in[j] = out[n] * mtx->q_[j];
    best_node = node->prev;
  }
  return nz;
}

#undef NODE
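// To illustrate the candidate set (a sketch under the current settings):
// QUANTDIV() performs the biased fixed-point division (coeff0 * iQ + B) >> QFIX.
// With MIN_DELTA == 0 and MAX_DELTA == 1, each of the (up to) 16 zigzag
// positions therefore proposes just two levels, level0 and level0 + 1, and
// the dynamic programming above picks, per position, the predecessor that
// minimizes rate * lambda + 256 * remaining_error, including the option of
// ending the block early (the "terminal" branch).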
//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; ++n) {
    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(it, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; ++n) {
      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
    }
  }

  // Transform back
  VP8ITransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(it, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; ++n) {
    VP8FTransform(src + VP8Scan[16 + n], ref + VP8Scan[16 + n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; ++n) {
      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
  }
  return (nz << 16);
}
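// Summary of the non-zero (nz) bitmap assembled by the functions above, as
// read from their shifts: bits 0..15 flag the sixteen luma 4x4 sub-blocks,
// bits 16..23 (via the 'nz << 16' in ReconstructUV) flag the eight chroma
// sub-blocks, and bit 24 flags the luma DC (WHT) block of intra16 mode.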
//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  VP8ModeScore rd16;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;   // scratch buffer
    int nz;

    // Reconstruct
    nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);

    // Measure RD-score
    rd16.D = VP8SSE16x16(src, tmp_dst);
    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
            : 0;
    rd16.R = VP8GetCostLuma16(it, &rd16);
    rd16.R += VP8FixedCostsI16[mode];

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, &rd16);
    if (mode == 0 || rd16.score < rd->score) {
      CopyScore(rd, &rd16);
      rd->mode_i16 = mode;
      rd->nz = nz;
      memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
      memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
      SwapOut(it);
    }
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}
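// In other words, the header cost of an i4 mode is context-modeled on its
// left and top neighbours: inside the macroblock those come from the modes[]
// array being built, while on the macroblock's first row/column they fall
// back to the already-decided modes stored in the preds_ plane. The returned
// array is then indexed by the candidate mode itself (mode_costs[mode] below).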
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.score = 211;   // '211' is the value of VP8BitCost(0, 145)
  VP8IteratorStartI4(it);
  do {
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;   // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
      rd_tmp.R += mode_costs[mode];

      SetRDScore(lambda, &rd_tmp);
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    total_header_bits += mode_costs[best_mode];
    if (rd_best.score >= rd->score ||
        total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_])
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
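// Two early-out conditions bound the i4 search above (a reading of the code,
// for orientation): the running rd_best.score is compared against the already
// computed i16 score after every sub-block, and the accumulated mode-header
// cost is capped by enc->max_i4_header_bits_; tripping either one abandons
// intra4x4 for this macroblock and keeps the intra16x16 decision.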
//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;   // scratch buffer
  uint8_t* const dst0 = it->yuv_out_ + U_OFF;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;   // TODO: should we call TDisto? it tends to flatten areas.
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    rd_uv.R += VP8FixedCostsUV[mode];

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      memcpy(dst0, tmp_dst, UV_SIZE);   // TODO: SwapUVOut() ?
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
  const int is_i16 = (it->mb_->type_ == 1);
  score_t best_score = MAX_COST;

  if (try_both_i4_i16 || is_i16) {
    int mode;
    int best_mode = -1;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const uint8_t* const src = it->yuv_in_ + Y_OFF;
      const score_t score = VP8SSE16x16(src, ref);
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
  }
  if (try_both_i4_i16 || !is_i16) {
    uint8_t modes_i4[16];
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    score_t score_i4 = (score_t)I4_PENALTY;

    VP8IteratorStartI4(it);
    do {
      int mode;
      int best_sub_mode = -1;
      score_t best_sub_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];

      // TODO(skal): we don't really need the prediction pixels here,
      // but just the distortion against 'src'.
      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref);
        if (score < best_sub_score) {
          best_sub_mode = mode;
          best_sub_score = score;
        }
      }
      modes_i4[it->i4_] = best_sub_mode;
      score_i4 += best_sub_score;
      if (score_i4 >= best_score) break;
    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
    if (score_i4 < best_score) {
      VP8SetIntra4Mode(it, modes_i4);
    }
  }
}
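// Note on I4_PENALTY (defined at the top of this file): since DistoRefine()
// compares raw SSE scores, the constant 4000 acts as a stand-in for the extra
// header bits that sixteen i4 modes typically cost over a single i16 mode;
// the i4 path must beat the best i16 SSE by at least that margin to be taken.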
//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // For method == 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we refine intra4 or intra16 (but don't
    // re-examine the mode decision).
    DistoRefine(it, (method >= 2));
    SimpleQuantize(it, rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}

#if defined(__cplusplus) || defined(c_plusplus)
}    // extern "C"
#endif