// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino (at) gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "./vp8enci.h"
#include "./cost.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64       // neutral value for susceptibility
#define MIN_ALPHA 30       // lowest usable value for susceptibility
#define MAX_ALPHA 100      // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9      // Scaling constant between the sns value and the QP
                           // power-law modulation. Must be strictly less than 1.

#define I4_PENALTY 14000   // Rate-penalty for quick i4/i16 decision

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 10     // I16 mode
#define FLATNESS_LIMIT_I4  3      // I4 mode
#define FLATNESS_LIMIT_UV  2      // UV mode
#define FLATNESS_PENALTY   140    // roughly ~1 bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)

#define RD_DISTO_MULT 256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif  // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static const uint8_t kDcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 10,
  11, 12, 13, 14, 15, 16, 17, 17,
  18, 19, 20, 20, 21, 21, 22, 22,
  23, 23, 24, 25, 25, 26, 27, 28,
  29, 30, 31, 32, 33, 34, 35, 36,
  37, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 46, 47, 48, 49, 50,
  51, 52, 53, 54, 55, 56, 57, 58,
  59, 60, 61, 62, 63, 64, 65, 66,
  67, 68, 69, 70, 71, 72, 73, 74,
  75, 76, 76, 77, 78, 79, 80, 81,
  82, 83, 84, 85, 86, 87, 88, 89,
  91, 93, 95, 96, 98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4, 5, 6, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16, 17, 18, 19,
  20, 21, 22, 23, 24, 25, 26, 27,
  28, 29, 30, 31, 32, 33, 34, 35,
  36, 37, 38, 39, 40, 41, 42, 43,
  44, 45, 46, 47, 48, 49, 50, 51,
  52, 53, 54, 55, 56, 57, 58, 60,
  62, 64, 66, 68, 70, 72, 74, 76,
  78, 80, 82, 84, 86, 88, 90, 92,
  94, 96, 98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8, 8, 9, 10, 12, 13, 15, 17,
  18, 20, 21, 23, 24, 26, 27, 29,
  31, 32, 34, 35, 37, 38, 40, 41,
  43, 44, 46, 48, 49, 51, 52, 54,
  55, 57, 58, 60, 62, 63, 65, 66,
  68, 69, 71, 72, 74, 75, 77, 79,
  80, 82, 83, 85, 86, 88, 89, 93,
  96, 99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};
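
// A rough reading of these biases, assuming BIAS(b) == ((b) << (QFIX - 8)) as
// defined in vp8enci.h: QUANTDIV() then computes level ~= coeff / q + b / 256,
// so b == 128 would round to nearest. Values below 128 (e.g. 96 for luma DC)
// deliberately lean toward the lower level, trading a little extra distortion
// for cheaper, more zero-heavy coefficients.
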
// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
  0, 30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {  // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}
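
// Worked example, assuming QFIX == 17 and
// QUANTDIV(n, iQ, B) == (((n) * (iQ) + (B)) >> QFIX) as in vp8enci.h:
// with q_[1] = 4 and the luma-AC bias of 110 from kBiasMatrices, we get
//   iq_[1]      = 131072 / 4 = 32768
//   bias_[1]    = BIAS(110) = 110 << 9 = 56320
//   zthresh_[1] = (131071 - 56320) / 32768 = 2
// so a coefficient of 2 quantizes to zero ((2 * 32768 + 56320) >> 17 == 0)
// while 3 quantizes to 1 ((3 * 32768 + 56320) >> 17 == 1), matching the
// zthresh_ definition above. A minimal sketch of the per-coefficient
// quantization these fields drive (illustrative only, hence not compiled):
#if 0
static int QuantizeOneCoeff(int coeff, const VP8Matrix* const mtx, int j) {
  const int sign = (coeff < 0);
  const uint32_t abs_coeff = (sign ? -coeff : coeff) + mtx->sharpen_[j];
  int level = QUANTDIV(abs_coeff, mtx->iq_[j], mtx->bias_[j]);
  if (level > MAX_LEVEL) level = MAX_LEVEL;
  return sign ? -level : level;
}
#endif
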
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q4, q16, quv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q, 0, 127)];

    m->y2_.q_[0] = kDcTable[clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q4  = ExpandMatrix(&m->y1_, 0);
    q16 = ExpandMatrix(&m->y2_, 1);
    quv = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q4 * q4) >> 7;
    m->lambda_i16_         = (3 * q16 * q16);
    m->lambda_uv_          = (3 * quv * quv) >> 6;
    m->lambda_mode_        = (1 * q4 * q4) >> 7;
    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
    m->lambda_trellis_uv_  = (quv * quv) << 1;
    m->tlambda_            = (tlambda_scale * q4) >> 5;

    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a linear piece-wise function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
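
// Example of the resulting mapping: quality 75 gives c = 0.75, hence
// linear_c = 2. * 0.75 - 1. = 0.5 and v = pow(0.5, 1 / 3.) ~= 0.794.
// With a neutral segment exponent (expn == 1.) in VP8SetSegmentParams()
// below, this lands at q = (int)(127. * (1. - 0.794)) == 26.
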
static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  const int num_segments = enc->segment_hdr_.num_segments_;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to account for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
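
  // For instance, with sns_strength == 100 and uv_alpha_ == 100 (UV is very
  // decimatable), the mapping above yields
  //   dq_uv_ac = (100 - 64) * (6 - (-4)) / (100 - 30) = 5
  // which is already within [MIN_DQ_UV, MAX_DQ_UV], so the clip is a no-op.
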
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite a bit more sensitive to high quants (flat
  // DC-blocks tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4-bit signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, if needed

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const int VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const int VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const int VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,  4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,   // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS    // V
};

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D = 0;
  rd->SD = 0;
  rd->R = 0;
  rd->H = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D = src->D;
  dst->SD = src->SD;
  dst->R = src->R;
  dst->H = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D += src->D;
  dst->SD += src->SD;
  dst->R += src->R;
  dst->H += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}
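
// Note on the 'nz' bitfield assembled by the Reconstruct*() functions below:
// bits 0..15 flag the sixteen 4x4 luma blocks, bits 16..23 the eight 4x4
// chroma blocks (U then V), and bit 24 the luma DC (WHT) block.
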
//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis node
typedef struct {
  int8_t prev;    // best previous node
  int8_t sign;    // sign of coeff_i
  int16_t level;  // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
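
// Both helpers above use the usual Lagrangian form
//   score = lambda * rate + RD_DISTO_MULT * distortion
// with rate = R + H (coefficient cost plus mode-header cost) in the full
// VP8ModeScore version. Scores are therefore only comparable when they were
// computed with the same lambda.
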
static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score, last_pos_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;   // default, in case.

      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[n + 1][ctx];
      if (level > MAX_LEVEL || level < 0) {   // node is dead?
        continue;
      }

      // Compute extra rate cost if last coeff's position is < 15
      {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
      }

      {
        // Compute delta_error = how much coding this level will reduce
        // the distortion, relative to leaving the coefficient at zero.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store the best finding in the current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;           // best eob position
          best_path[1] = m;           // best node index
          best_path[2] = best_prev;   // best predecessor
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}

#undef NODE
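
// A sketch of the search above, for intuition: for each position n in scan
// order there are NUM_NODES == 2 candidate levels (level0 and level0 + 1,
// given MIN_DELTA == 0 / MAX_DELTA == 1). Every node keeps the cheapest link
// to one of the predecessor nodes at n - 1, and every non-zero node also bids
// as end-of-block against the running best_score; the winning terminal node
// is finally unwound through the ->prev links. The 'coeff_type' argument
// matches the callers below: 0 for i16 luma AC (hence first == 1), 2 for
// chroma, 3 for i4 luma; the i16 DC block goes through
// VP8EncQuantizeBlockWHT() instead.
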
//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost, where
// RD-cost = lambda * Rate + RD_DISTO_MULT * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the average
  // delta between each sub-4x4 block.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[4]);
  const int v2 = abs(DCs[5]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
  score_t score = 0;
  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
    int i;
    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
      score += (levels[i] != 0);
      if (score > thresh) return 0;
    }
    levels += 16;
  }
  return 1;
}
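
// Example of how this drives the flatness penalty below: with
// FLATNESS_LIMIT_I16 == 10, an i16 macroblock whose sixteen 4x4 blocks carry
// at most 10 non-zero AC levels in total counts as flat, and every non-DC
// prediction mode then has its rate R inflated by FLATNESS_PENALTY * 16,
// i.e. roughly one extra bit per block.
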
static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (mode > 0 &&
        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid a flat area being mispredicted by a complex mode
      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  score_t score_i4 = (score_t)I4_PENALTY;
  int16_t tmp_levels[16][16];
  uint8_t modes_i4[16];
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = VP8SSE16x16(src, ref);
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref);
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {   // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, tmp_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, modes_i4);
    memcpy(rd->y_ac_levels, tmp_levels, sizeof(tmp_levels));
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref);
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point
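
// A rough map of how rd_opt drives VP8Decimate() below (assuming the
// VP8RDLevel enum from vp8enci.h orders NONE < BASIC < TRELLIS < TRELLIS_ALL):
//   RD_OPT_NONE        -> RefineUsingDistortion() only (SSE-based decisions).
//   RD_OPT_BASIC       -> full RD mode search, no trellis quantization.
//   RD_OPT_TRELLIS     -> full RD mode search, then one final trellis-
//                         quantized reconstruction pass (SimpleQuantize()).
//   RD_OPT_TRELLIS_ALL -> trellis quantization inside the mode search itself.
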
int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {  // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}