1 // Copyright 2011 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // Quantization 11 // 12 // Author: Skal (pascal.massimino (at) gmail.com) 13 14 #include <assert.h> 15 #include <math.h> 16 #include <stdlib.h> // for abs() 17 18 #include "./vp8enci.h" 19 #include "./cost.h" 20 21 #define DO_TRELLIS_I4 1 22 #define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate. 23 #define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth. 24 #define USE_TDISTO 1 25 26 #define MID_ALPHA 64 // neutral value for susceptibility 27 #define MIN_ALPHA 30 // lowest usable value for susceptibility 28 #define MAX_ALPHA 100 // higher meaningful value for susceptibility 29 30 #define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP 31 // power-law modulation. Must be strictly less than 1. 32 33 #define I4_PENALTY 4000 // Rate-penalty for quick i4/i16 decision 34 35 // number of non-zero coeffs below which we consider the block very flat 36 // (and apply a penalty to complex predictions) 37 #define FLATNESS_LIMIT_I16 10 // I16 mode 38 #define FLATNESS_LIMIT_I4 3 // I4 mode 39 #define FLATNESS_LIMIT_UV 2 // UV mode 40 #define FLATNESS_PENALTY 140 // roughly ~1bit per block 41 42 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8) 43 44 // #define DEBUG_BLOCK 45 46 //------------------------------------------------------------------------------ 47 48 #if defined(DEBUG_BLOCK) 49 50 #include <stdio.h> 51 #include <stdlib.h> 52 53 static void PrintBlockInfo(const VP8EncIterator* const it, 54 const VP8ModeScore* const rd) { 55 int i, j; 56 const int is_i16 = (it->mb_->type_ == 1); 57 printf("SOURCE / OUTPUT / ABS DELTA\n"); 58 for (j = 0; j < 24; ++j) { 59 if (j == 16) printf("\n"); // newline before the U/V block 60 for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]); 61 printf(" "); 62 for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]); 63 printf(" "); 64 for (i = 0; i < 16; ++i) { 65 printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS])); 66 } 67 printf("\n"); 68 } 69 printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n", 70 (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz, 71 (int)rd->score); 72 if (is_i16) { 73 printf("Mode: %d\n", rd->mode_i16); 74 printf("y_dc_levels:"); 75 for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]); 76 printf("\n"); 77 } else { 78 printf("Modes[16]: "); 79 for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]); 80 printf("\n"); 81 } 82 printf("y_ac_levels:\n"); 83 for (j = 0; j < 16; ++j) { 84 for (i = is_i16 ? 1 : 0; i < 16; ++i) { 85 printf("%4d ", rd->y_ac_levels[j][i]); 86 } 87 printf("\n"); 88 } 89 printf("\n"); 90 printf("uv_levels (mode=%d):\n", rd->mode_uv); 91 for (j = 0; j < 8; ++j) { 92 for (i = 0; i < 16; ++i) { 93 printf("%4d ", rd->uv_levels[j][i]); 94 } 95 printf("\n"); 96 } 97 } 98 99 #endif // DEBUG_BLOCK 100 101 //------------------------------------------------------------------------------ 102 103 static WEBP_INLINE int clip(int v, int m, int M) { 104 return v < m ? m : v > M ? M : v; 105 } 106 107 static const uint8_t kZigzag[16] = { 108 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 109 }; 110 111 static const uint8_t kDcTable[128] = { 112 4, 5, 6, 7, 8, 9, 10, 10, 113 11, 12, 13, 14, 15, 16, 17, 17, 114 18, 19, 20, 20, 21, 21, 22, 22, 115 23, 23, 24, 25, 25, 26, 27, 28, 116 29, 30, 31, 32, 33, 34, 35, 36, 117 37, 37, 38, 39, 40, 41, 42, 43, 118 44, 45, 46, 46, 47, 48, 49, 50, 119 51, 52, 53, 54, 55, 56, 57, 58, 120 59, 60, 61, 62, 63, 64, 65, 66, 121 67, 68, 69, 70, 71, 72, 73, 74, 122 75, 76, 76, 77, 78, 79, 80, 81, 123 82, 83, 84, 85, 86, 87, 88, 89, 124 91, 93, 95, 96, 98, 100, 101, 102, 125 104, 106, 108, 110, 112, 114, 116, 118, 126 122, 124, 126, 128, 130, 132, 134, 136, 127 138, 140, 143, 145, 148, 151, 154, 157 128 }; 129 130 static const uint16_t kAcTable[128] = { 131 4, 5, 6, 7, 8, 9, 10, 11, 132 12, 13, 14, 15, 16, 17, 18, 19, 133 20, 21, 22, 23, 24, 25, 26, 27, 134 28, 29, 30, 31, 32, 33, 34, 35, 135 36, 37, 38, 39, 40, 41, 42, 43, 136 44, 45, 46, 47, 48, 49, 50, 51, 137 52, 53, 54, 55, 56, 57, 58, 60, 138 62, 64, 66, 68, 70, 72, 74, 76, 139 78, 80, 82, 84, 86, 88, 90, 92, 140 94, 96, 98, 100, 102, 104, 106, 108, 141 110, 112, 114, 116, 119, 122, 125, 128, 142 131, 134, 137, 140, 143, 146, 149, 152, 143 155, 158, 161, 164, 167, 170, 173, 177, 144 181, 185, 189, 193, 197, 201, 205, 209, 145 213, 217, 221, 225, 229, 234, 239, 245, 146 249, 254, 259, 264, 269, 274, 279, 284 147 }; 148 149 static const uint16_t kAcTable2[128] = { 150 8, 8, 9, 10, 12, 13, 15, 17, 151 18, 20, 21, 23, 24, 26, 27, 29, 152 31, 32, 34, 35, 37, 38, 40, 41, 153 43, 44, 46, 48, 49, 51, 52, 54, 154 55, 57, 58, 60, 62, 63, 65, 66, 155 68, 69, 71, 72, 74, 75, 77, 79, 156 80, 82, 83, 85, 86, 88, 89, 93, 157 96, 99, 102, 105, 108, 111, 114, 117, 158 120, 124, 127, 130, 133, 136, 139, 142, 159 145, 148, 151, 155, 158, 161, 164, 167, 160 170, 173, 176, 179, 184, 189, 193, 198, 161 203, 207, 212, 217, 221, 226, 230, 235, 162 240, 244, 249, 254, 258, 263, 268, 274, 163 280, 286, 292, 299, 305, 311, 317, 323, 164 330, 336, 342, 348, 354, 362, 370, 379, 165 385, 393, 401, 409, 416, 424, 432, 440 166 }; 167 168 static const uint8_t kBiasMatrices[3][2] = { // [luma-ac,luma-dc,chroma][dc,ac] 169 { 96, 110 }, { 96, 108 }, { 110, 115 } 170 }; 171 172 // Sharpening by (slightly) raising the hi-frequency coeffs. 173 // Hack-ish but helpful for mid-bitrate range. Use with care. 174 #define SHARPEN_BITS 11 // number of descaling bits for sharpening bias 175 static const uint8_t kFreqSharpening[16] = { 176 0, 30, 60, 90, 177 30, 60, 90, 90, 178 60, 90, 90, 90, 179 90, 90, 90, 90 180 }; 181 182 //------------------------------------------------------------------------------ 183 // Initialize quantization parameters in VP8Matrix 184 185 // Returns the average quantizer 186 static int ExpandMatrix(VP8Matrix* const m, int type) { 187 int i, sum; 188 for (i = 0; i < 2; ++i) { 189 const int is_ac_coeff = (i > 0); 190 const int bias = kBiasMatrices[type][is_ac_coeff]; 191 m->iq_[i] = (1 << QFIX) / m->q_[i]; 192 m->bias_[i] = BIAS(bias); 193 // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is: 194 // * zero if coeff <= zthresh 195 // * non-zero if coeff > zthresh 196 m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i]; 197 } 198 for (i = 2; i < 16; ++i) { 199 m->q_[i] = m->q_[1]; 200 m->iq_[i] = m->iq_[1]; 201 m->bias_[i] = m->bias_[1]; 202 m->zthresh_[i] = m->zthresh_[1]; 203 } 204 for (sum = 0, i = 0; i < 16; ++i) { 205 if (type == 0) { // we only use sharpening for AC luma coeffs 206 m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS; 207 } else { 208 m->sharpen_[i] = 0; 209 } 210 sum += m->q_[i]; 211 } 212 return (sum + 8) >> 4; 213 } 214 215 static void SetupMatrices(VP8Encoder* enc) { 216 int i; 217 const int tlambda_scale = 218 (enc->method_ >= 4) ? enc->config_->sns_strength 219 : 0; 220 const int num_segments = enc->segment_hdr_.num_segments_; 221 for (i = 0; i < num_segments; ++i) { 222 VP8SegmentInfo* const m = &enc->dqm_[i]; 223 const int q = m->quant_; 224 int q4, q16, quv; 225 m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)]; 226 m->y1_.q_[1] = kAcTable[clip(q, 0, 127)]; 227 228 m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2; 229 m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)]; 230 231 m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)]; 232 m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)]; 233 234 q4 = ExpandMatrix(&m->y1_, 0); 235 q16 = ExpandMatrix(&m->y2_, 1); 236 quv = ExpandMatrix(&m->uv_, 2); 237 238 m->lambda_i4_ = (3 * q4 * q4) >> 7; 239 m->lambda_i16_ = (3 * q16 * q16); 240 m->lambda_uv_ = (3 * quv * quv) >> 6; 241 m->lambda_mode_ = (1 * q4 * q4) >> 7; 242 m->lambda_trellis_i4_ = (7 * q4 * q4) >> 3; 243 m->lambda_trellis_i16_ = (q16 * q16) >> 2; 244 m->lambda_trellis_uv_ = (quv *quv) << 1; 245 m->tlambda_ = (tlambda_scale * q4) >> 5; 246 247 m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto 248 m->max_edge_ = 0; 249 } 250 } 251 252 //------------------------------------------------------------------------------ 253 // Initialize filtering parameters 254 255 // Very small filter-strength values have close to no visual effect. So we can 256 // save a little decoding-CPU by turning filtering off for these. 257 #define FSTRENGTH_CUTOFF 2 258 259 static void SetupFilterStrength(VP8Encoder* const enc) { 260 int i; 261 // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering. 262 const int level0 = 5 * enc->config_->filter_strength; 263 for (i = 0; i < NUM_MB_SEGMENTS; ++i) { 264 VP8SegmentInfo* const m = &enc->dqm_[i]; 265 // We focus on the quantization of AC coeffs. 266 const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2; 267 const int base_strength = 268 VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep); 269 // Segments with lower complexity ('beta') will be less filtered. 270 const int f = base_strength * level0 / (256 + m->beta_); 271 m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f; 272 } 273 // We record the initial strength (mainly for the case of 1-segment only). 274 enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_; 275 enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0); 276 enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness; 277 } 278 279 //------------------------------------------------------------------------------ 280 281 // Note: if you change the values below, remember that the max range 282 // allowed by the syntax for DQ_UV is [-16,16]. 283 #define MAX_DQ_UV (6) 284 #define MIN_DQ_UV (-4) 285 286 // We want to emulate jpeg-like behaviour where the expected "good" quality 287 // is around q=75. Internally, our "good" middle is around c=50. So we 288 // map accordingly using linear piece-wise function 289 static double QualityToCompression(double c) { 290 const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.; 291 // The file size roughly scales as pow(quantizer, 3.). Actually, the 292 // exponent is somewhere between 2.8 and 3.2, but we're mostly interested 293 // in the mid-quant range. So we scale the compressibility inversely to 294 // this power-law: quant ~= compression ^ 1/3. This law holds well for 295 // low quant. Finer modeling for high-quant would make use of kAcTable[] 296 // more explicitly. 297 const double v = pow(linear_c, 1 / 3.); 298 return v; 299 } 300 301 static double QualityToJPEGCompression(double c, double alpha) { 302 // We map the complexity 'alpha' and quality setting 'c' to a compression 303 // exponent empirically matched to the compression curve of libjpeg6b. 304 // On average, the WebP output size will be roughly similar to that of a 305 // JPEG file compressed with same quality factor. 306 const double amin = 0.30; 307 const double amax = 0.85; 308 const double exp_min = 0.4; 309 const double exp_max = 0.9; 310 const double slope = (exp_min - exp_max) / (amax - amin); 311 // Linearly interpolate 'expn' from exp_min to exp_max 312 // in the [amin, amax] range. 313 const double expn = (alpha > amax) ? exp_min 314 : (alpha < amin) ? exp_max 315 : exp_max + slope * (alpha - amin); 316 const double v = pow(c, expn); 317 return v; 318 } 319 320 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1, 321 const VP8SegmentInfo* const S2) { 322 return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_); 323 } 324 325 static void SimplifySegments(VP8Encoder* const enc) { 326 int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 }; 327 const int num_segments = enc->segment_hdr_.num_segments_; 328 int num_final_segments = 1; 329 int s1, s2; 330 for (s1 = 1; s1 < num_segments; ++s1) { // find similar segments 331 const VP8SegmentInfo* const S1 = &enc->dqm_[s1]; 332 int found = 0; 333 // check if we already have similar segment 334 for (s2 = 0; s2 < num_final_segments; ++s2) { 335 const VP8SegmentInfo* const S2 = &enc->dqm_[s2]; 336 if (SegmentsAreEquivalent(S1, S2)) { 337 found = 1; 338 break; 339 } 340 } 341 map[s1] = s2; 342 if (!found) { 343 if (num_final_segments != s1) { 344 enc->dqm_[num_final_segments] = enc->dqm_[s1]; 345 } 346 ++num_final_segments; 347 } 348 } 349 if (num_final_segments < num_segments) { // Remap 350 int i = enc->mb_w_ * enc->mb_h_; 351 while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_]; 352 enc->segment_hdr_.num_segments_ = num_final_segments; 353 // Replicate the trailing segment infos (it's mostly cosmetics) 354 for (i = num_final_segments; i < num_segments; ++i) { 355 enc->dqm_[i] = enc->dqm_[num_final_segments - 1]; 356 } 357 } 358 } 359 360 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) { 361 int i; 362 int dq_uv_ac, dq_uv_dc; 363 const int num_segments = enc->segment_hdr_.num_segments_; 364 const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.; 365 const double Q = quality / 100.; 366 const double c_base = enc->config_->emulate_jpeg_size ? 367 QualityToJPEGCompression(Q, enc->alpha_ / 255.) : 368 QualityToCompression(Q); 369 for (i = 0; i < num_segments; ++i) { 370 // We modulate the base coefficient to accommodate for the quantization 371 // susceptibility and allow denser segments to be quantized more. 372 const double expn = 1. - amp * enc->dqm_[i].alpha_; 373 const double c = pow(c_base, expn); 374 const int q = (int)(127. * (1. - c)); 375 assert(expn > 0.); 376 enc->dqm_[i].quant_ = clip(q, 0, 127); 377 } 378 379 // purely indicative in the bitstream (except for the 1-segment case) 380 enc->base_quant_ = enc->dqm_[0].quant_; 381 382 // fill-in values for the unused segments (required by the syntax) 383 for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) { 384 enc->dqm_[i].quant_ = enc->base_quant_; 385 } 386 387 // uv_alpha_ is normally spread around ~60. The useful range is 388 // typically ~30 (quite bad) to ~100 (ok to decimate UV more). 389 // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv. 390 dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV) 391 / (MAX_ALPHA - MIN_ALPHA); 392 // we rescale by the user-defined strength of adaptation 393 dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100; 394 // and make it safe. 395 dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV); 396 // We also boost the dc-uv-quant a little, based on sns-strength, since 397 // U/V channels are quite more reactive to high quants (flat DC-blocks 398 // tend to appear, and are unpleasant). 399 dq_uv_dc = -4 * enc->config_->sns_strength / 100; 400 dq_uv_dc = clip(dq_uv_dc, -15, 15); // 4bit-signed max allowed 401 402 enc->dq_y1_dc_ = 0; // TODO(skal): dq-lum 403 enc->dq_y2_dc_ = 0; 404 enc->dq_y2_ac_ = 0; 405 enc->dq_uv_dc_ = dq_uv_dc; 406 enc->dq_uv_ac_ = dq_uv_ac; 407 408 SetupFilterStrength(enc); // initialize segments' filtering, eventually 409 410 if (num_segments > 1) SimplifySegments(enc); 411 412 SetupMatrices(enc); // finalize quantization matrices 413 } 414 415 //------------------------------------------------------------------------------ 416 // Form the predictions in cache 417 418 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index 419 const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 }; 420 const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 }; 421 422 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index 423 const int VP8I4ModeOffsets[NUM_BMODES] = { 424 I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 425 }; 426 427 void VP8MakeLuma16Preds(const VP8EncIterator* const it) { 428 const uint8_t* const left = it->x_ ? it->y_left_ : NULL; 429 const uint8_t* const top = it->y_ ? it->y_top_ : NULL; 430 VP8EncPredLuma16(it->yuv_p_, left, top); 431 } 432 433 void VP8MakeChroma8Preds(const VP8EncIterator* const it) { 434 const uint8_t* const left = it->x_ ? it->u_left_ : NULL; 435 const uint8_t* const top = it->y_ ? it->uv_top_ : NULL; 436 VP8EncPredChroma8(it->yuv_p_, left, top); 437 } 438 439 void VP8MakeIntra4Preds(const VP8EncIterator* const it) { 440 VP8EncPredLuma4(it->yuv_p_, it->i4_top_); 441 } 442 443 //------------------------------------------------------------------------------ 444 // Quantize 445 446 // Layout: 447 // +----+ 448 // |YYYY| 0 449 // |YYYY| 4 450 // |YYYY| 8 451 // |YYYY| 12 452 // +----+ 453 // |UUVV| 16 454 // |UUVV| 20 455 // +----+ 456 457 const int VP8Scan[16] = { // Luma 458 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, 459 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, 460 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, 461 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS, 462 }; 463 464 static const int VP8ScanUV[4 + 4] = { 465 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U 466 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V 467 }; 468 469 //------------------------------------------------------------------------------ 470 // Distortion measurement 471 472 static const uint16_t kWeightY[16] = { 473 38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2 474 }; 475 476 static const uint16_t kWeightTrellis[16] = { 477 #if USE_TDISTO == 0 478 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 479 #else 480 30, 27, 19, 11, 481 27, 24, 17, 10, 482 19, 17, 12, 8, 483 11, 10, 8, 6 484 #endif 485 }; 486 487 // Init/Copy the common fields in score. 488 static void InitScore(VP8ModeScore* const rd) { 489 rd->D = 0; 490 rd->SD = 0; 491 rd->R = 0; 492 rd->H = 0; 493 rd->nz = 0; 494 rd->score = MAX_COST; 495 } 496 497 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) { 498 dst->D = src->D; 499 dst->SD = src->SD; 500 dst->R = src->R; 501 dst->H = src->H; 502 dst->nz = src->nz; // note that nz is not accumulated, but just copied. 503 dst->score = src->score; 504 } 505 506 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) { 507 dst->D += src->D; 508 dst->SD += src->SD; 509 dst->R += src->R; 510 dst->H += src->H; 511 dst->nz |= src->nz; // here, new nz bits are accumulated. 512 dst->score += src->score; 513 } 514 515 //------------------------------------------------------------------------------ 516 // Performs trellis-optimized quantization. 517 518 // Trellis node 519 typedef struct { 520 int8_t prev; // best previous node 521 int8_t sign; // sign of coeff_i 522 int16_t level; // level 523 } Node; 524 525 // Score state 526 typedef struct { 527 score_t score; // partial RD score 528 const uint16_t* costs; // shortcut to cost tables 529 } ScoreState; 530 531 // If a coefficient was quantized to a value Q (using a neutral bias), 532 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA] 533 // We don't test negative values though. 534 #define MIN_DELTA 0 // how much lower level to try 535 #define MAX_DELTA 1 // how much higher 536 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA) 537 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA]) 538 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA]) 539 540 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) { 541 // TODO: incorporate the "* 256" in the tables? 542 rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD); 543 } 544 545 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate, 546 score_t distortion) { 547 return rate * lambda + 256 * distortion; 548 } 549 550 static int TrellisQuantizeBlock(const VP8Encoder* const enc, 551 int16_t in[16], int16_t out[16], 552 int ctx0, int coeff_type, 553 const VP8Matrix* const mtx, 554 int lambda) { 555 const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type]; 556 const CostArray* const costs = enc->proba_.level_cost_[coeff_type]; 557 const int first = (coeff_type == 0) ? 1 : 0; 558 Node nodes[16][NUM_NODES]; 559 ScoreState score_states[2][NUM_NODES]; 560 ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA); 561 ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA); 562 int best_path[3] = {-1, -1, -1}; // store best-last/best-level/best-previous 563 score_t best_score; 564 int n, m, p, last; 565 566 { 567 score_t cost; 568 const int thresh = mtx->q_[1] * mtx->q_[1] / 4; 569 const int last_proba = probas[VP8EncBands[first]][ctx0][0]; 570 571 // compute the position of the last interesting coefficient 572 last = first - 1; 573 for (n = 15; n >= first; --n) { 574 const int j = kZigzag[n]; 575 const int err = in[j] * in[j]; 576 if (err > thresh) { 577 last = n; 578 break; 579 } 580 } 581 // we don't need to go inspect up to n = 16 coeffs. We can just go up 582 // to last + 1 (inclusive) without losing much. 583 if (last < 15) ++last; 584 585 // compute 'skip' score. This is the max score one can do. 586 cost = VP8BitCost(0, last_proba); 587 best_score = RDScoreTrellis(lambda, cost, 0); 588 589 // initialize source node. 590 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) { 591 const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0; 592 ss_cur[m].score = RDScoreTrellis(lambda, rate, 0); 593 ss_cur[m].costs = costs[VP8EncBands[first]][ctx0]; 594 } 595 } 596 597 // traverse trellis. 598 for (n = first; n <= last; ++n) { 599 const int j = kZigzag[n]; 600 const uint32_t Q = mtx->q_[j]; 601 const uint32_t iQ = mtx->iq_[j]; 602 const uint32_t B = BIAS(0x00); // neutral bias 603 // note: it's important to take sign of the _original_ coeff, 604 // so we don't have to consider level < 0 afterward. 605 const int sign = (in[j] < 0); 606 const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j]; 607 int level0 = QUANTDIV(coeff0, iQ, B); 608 if (level0 > MAX_LEVEL) level0 = MAX_LEVEL; 609 610 { // Swap current and previous score states 611 ScoreState* const tmp = ss_cur; 612 ss_cur = ss_prev; 613 ss_prev = tmp; 614 } 615 616 // test all alternate level values around level0. 617 for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) { 618 Node* const cur = &NODE(n, m); 619 int level = level0 + m; 620 const int ctx = (level > 2) ? 2 : level; 621 const int band = VP8EncBands[n + 1]; 622 score_t base_score, last_pos_score; 623 score_t best_cur_score = MAX_COST; 624 int best_prev = 0; // default, in case 625 626 ss_cur[m].score = MAX_COST; 627 ss_cur[m].costs = costs[band][ctx]; 628 if (level > MAX_LEVEL || level < 0) { // node is dead? 629 continue; 630 } 631 632 // Compute extra rate cost if last coeff's position is < 15 633 { 634 const score_t last_pos_cost = 635 (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0; 636 last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0); 637 } 638 639 { 640 // Compute delta_error = how much coding this level will 641 // subtract to max_error as distortion. 642 // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2 643 const int new_error = coeff0 - level * Q; 644 const int delta_error = 645 kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0); 646 base_score = RDScoreTrellis(lambda, 0, delta_error); 647 } 648 649 // Inspect all possible non-dead predecessors. Retain only the best one. 650 for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) { 651 // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically 652 // eliminated since their score can't be better than the current best. 653 const score_t cost = VP8LevelCost(ss_prev[p].costs, level); 654 // Examine node assuming it's a non-terminal one. 655 const score_t score = 656 base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0); 657 if (score < best_cur_score) { 658 best_cur_score = score; 659 best_prev = p; 660 } 661 } 662 // Store best finding in current node. 663 cur->sign = sign; 664 cur->level = level; 665 cur->prev = best_prev; 666 ss_cur[m].score = best_cur_score; 667 668 // Now, record best terminal node (and thus best entry in the graph). 669 if (level != 0) { 670 const score_t score = best_cur_score + last_pos_score; 671 if (score < best_score) { 672 best_score = score; 673 best_path[0] = n; // best eob position 674 best_path[1] = m; // best node index 675 best_path[2] = best_prev; // best predecessor 676 } 677 } 678 } 679 } 680 681 // Fresh start 682 memset(in + first, 0, (16 - first) * sizeof(*in)); 683 memset(out + first, 0, (16 - first) * sizeof(*out)); 684 if (best_path[0] == -1) { 685 return 0; // skip! 686 } 687 688 { 689 // Unwind the best path. 690 // Note: best-prev on terminal node is not necessarily equal to the 691 // best_prev for non-terminal. So we patch best_path[2] in. 692 int nz = 0; 693 int best_node = best_path[1]; 694 n = best_path[0]; 695 NODE(n, best_node).prev = best_path[2]; // force best-prev for terminal 696 697 for (; n >= first; --n) { 698 const Node* const node = &NODE(n, best_node); 699 const int j = kZigzag[n]; 700 out[n] = node->sign ? -node->level : node->level; 701 nz |= node->level; 702 in[j] = out[n] * mtx->q_[j]; 703 best_node = node->prev; 704 } 705 return (nz != 0); 706 } 707 } 708 709 #undef NODE 710 711 //------------------------------------------------------------------------------ 712 // Performs: difference, transform, quantize, back-transform, add 713 // all at once. Output is the reconstructed block in *yuv_out, and the 714 // quantized levels in *levels. 715 716 static int ReconstructIntra16(VP8EncIterator* const it, 717 VP8ModeScore* const rd, 718 uint8_t* const yuv_out, 719 int mode) { 720 const VP8Encoder* const enc = it->enc_; 721 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode]; 722 const uint8_t* const src = it->yuv_in_ + Y_OFF; 723 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_]; 724 int nz = 0; 725 int n; 726 int16_t tmp[16][16], dc_tmp[16]; 727 728 for (n = 0; n < 16; ++n) { 729 VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]); 730 } 731 VP8FTransformWHT(tmp[0], dc_tmp); 732 nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24; 733 734 if (DO_TRELLIS_I16 && it->do_trellis_) { 735 int x, y; 736 VP8IteratorNzToBytes(it); 737 for (y = 0, n = 0; y < 4; ++y) { 738 for (x = 0; x < 4; ++x, ++n) { 739 const int ctx = it->top_nz_[x] + it->left_nz_[y]; 740 const int non_zero = 741 TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0, 742 &dqm->y1_, dqm->lambda_trellis_i16_); 743 it->top_nz_[x] = it->left_nz_[y] = non_zero; 744 rd->y_ac_levels[n][0] = 0; 745 nz |= non_zero << n; 746 } 747 } 748 } else { 749 for (n = 0; n < 16; ++n) { 750 // Zero-out the first coeff, so that: a) nz is correct below, and 751 // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified. 752 tmp[n][0] = 0; 753 nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n; 754 assert(rd->y_ac_levels[n][0] == 0); 755 } 756 } 757 758 // Transform back 759 VP8TransformWHT(dc_tmp, tmp[0]); 760 for (n = 0; n < 16; n += 2) { 761 VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1); 762 } 763 764 return nz; 765 } 766 767 static int ReconstructIntra4(VP8EncIterator* const it, 768 int16_t levels[16], 769 const uint8_t* const src, 770 uint8_t* const yuv_out, 771 int mode) { 772 const VP8Encoder* const enc = it->enc_; 773 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode]; 774 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_]; 775 int nz = 0; 776 int16_t tmp[16]; 777 778 VP8FTransform(src, ref, tmp); 779 if (DO_TRELLIS_I4 && it->do_trellis_) { 780 const int x = it->i4_ & 3, y = it->i4_ >> 2; 781 const int ctx = it->top_nz_[x] + it->left_nz_[y]; 782 nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_, 783 dqm->lambda_trellis_i4_); 784 } else { 785 nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_); 786 } 787 VP8ITransform(ref, tmp, yuv_out, 0); 788 return nz; 789 } 790 791 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, 792 uint8_t* const yuv_out, int mode) { 793 const VP8Encoder* const enc = it->enc_; 794 const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode]; 795 const uint8_t* const src = it->yuv_in_ + U_OFF; 796 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_]; 797 int nz = 0; 798 int n; 799 int16_t tmp[8][16]; 800 801 for (n = 0; n < 8; ++n) { 802 VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]); 803 } 804 if (DO_TRELLIS_UV && it->do_trellis_) { 805 int ch, x, y; 806 for (ch = 0, n = 0; ch <= 2; ch += 2) { 807 for (y = 0; y < 2; ++y) { 808 for (x = 0; x < 2; ++x, ++n) { 809 const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y]; 810 const int non_zero = 811 TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2, 812 &dqm->uv_, dqm->lambda_trellis_uv_); 813 it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero; 814 nz |= non_zero << n; 815 } 816 } 817 } 818 } else { 819 for (n = 0; n < 8; ++n) { 820 nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n; 821 } 822 } 823 824 for (n = 0; n < 8; n += 2) { 825 VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1); 826 } 827 return (nz << 16); 828 } 829 830 //------------------------------------------------------------------------------ 831 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost. 832 // Pick the mode is lower RD-cost = Rate + lambda * Distortion. 833 834 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) { 835 // We look at the first three AC coefficients to determine what is the average 836 // delta between each sub-4x4 block. 837 const int v0 = abs(DCs[1]); 838 const int v1 = abs(DCs[4]); 839 const int v2 = abs(DCs[5]); 840 int max_v = (v0 > v1) ? v1 : v0; 841 max_v = (v2 > max_v) ? v2 : max_v; 842 if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v; 843 } 844 845 static void SwapPtr(uint8_t** a, uint8_t** b) { 846 uint8_t* const tmp = *a; 847 *a = *b; 848 *b = tmp; 849 } 850 851 static void SwapOut(VP8EncIterator* const it) { 852 SwapPtr(&it->yuv_out_, &it->yuv_out2_); 853 } 854 855 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) { 856 score_t score = 0; 857 while (num_blocks-- > 0) { // TODO(skal): refine positional scoring? 858 int i; 859 for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC 860 score += (levels[i] != 0); 861 if (score > thresh) return 0; 862 } 863 levels += 16; 864 } 865 return 1; 866 } 867 868 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) { 869 const int kNumBlocks = 16; 870 VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_]; 871 const int lambda = dqm->lambda_i16_; 872 const int tlambda = dqm->tlambda_; 873 const uint8_t* const src = it->yuv_in_ + Y_OFF; 874 VP8ModeScore rd16; 875 int mode; 876 877 rd->mode_i16 = -1; 878 for (mode = 0; mode < NUM_PRED_MODES; ++mode) { 879 uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF; // scratch buffer 880 int nz; 881 882 // Reconstruct 883 nz = ReconstructIntra16(it, &rd16, tmp_dst, mode); 884 885 // Measure RD-score 886 rd16.D = VP8SSE16x16(src, tmp_dst); 887 rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) 888 : 0; 889 rd16.H = VP8FixedCostsI16[mode]; 890 rd16.R = VP8GetCostLuma16(it, &rd16); 891 if (mode > 0 && 892 IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) { 893 // penalty to avoid flat area to be mispredicted by complex mode 894 rd16.R += FLATNESS_PENALTY * kNumBlocks; 895 } 896 897 // Since we always examine Intra16 first, we can overwrite *rd directly. 898 SetRDScore(lambda, &rd16); 899 if (mode == 0 || rd16.score < rd->score) { 900 CopyScore(rd, &rd16); 901 rd->mode_i16 = mode; 902 rd->nz = nz; 903 memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels)); 904 memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels)); 905 SwapOut(it); 906 } 907 } 908 SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision. 909 VP8SetIntra16Mode(it, rd->mode_i16); 910 911 // we have a blocky macroblock (only DCs are non-zero) with fairly high 912 // distortion, record max delta so we can later adjust the minimal filtering 913 // strength needed to smooth these blocks out. 914 if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) { 915 StoreMaxDelta(dqm, rd->y_dc_levels); 916 } 917 } 918 919 //------------------------------------------------------------------------------ 920 921 // return the cost array corresponding to the surrounding prediction modes. 922 static const uint16_t* GetCostModeI4(VP8EncIterator* const it, 923 const uint8_t modes[16]) { 924 const int preds_w = it->enc_->preds_w_; 925 const int x = (it->i4_ & 3), y = it->i4_ >> 2; 926 const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1]; 927 const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4]; 928 return VP8FixedCostsI4[top][left]; 929 } 930 931 static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) { 932 const VP8Encoder* const enc = it->enc_; 933 const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_]; 934 const int lambda = dqm->lambda_i4_; 935 const int tlambda = dqm->tlambda_; 936 const uint8_t* const src0 = it->yuv_in_ + Y_OFF; 937 uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF; 938 int total_header_bits = 0; 939 VP8ModeScore rd_best; 940 941 if (enc->max_i4_header_bits_ == 0) { 942 return 0; 943 } 944 945 InitScore(&rd_best); 946 rd_best.H = 211; // '211' is the value of VP8BitCost(0, 145) 947 SetRDScore(dqm->lambda_mode_, &rd_best); 948 VP8IteratorStartI4(it); 949 do { 950 const int kNumBlocks = 1; 951 VP8ModeScore rd_i4; 952 int mode; 953 int best_mode = -1; 954 const uint8_t* const src = src0 + VP8Scan[it->i4_]; 955 const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4); 956 uint8_t* best_block = best_blocks + VP8Scan[it->i4_]; 957 uint8_t* tmp_dst = it->yuv_p_ + I4TMP; // scratch buffer. 958 959 InitScore(&rd_i4); 960 VP8MakeIntra4Preds(it); 961 for (mode = 0; mode < NUM_BMODES; ++mode) { 962 VP8ModeScore rd_tmp; 963 int16_t tmp_levels[16]; 964 965 // Reconstruct 966 rd_tmp.nz = 967 ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_; 968 969 // Compute RD-score 970 rd_tmp.D = VP8SSE4x4(src, tmp_dst); 971 rd_tmp.SD = 972 tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY)) 973 : 0; 974 rd_tmp.H = mode_costs[mode]; 975 rd_tmp.R = VP8GetCostLuma4(it, tmp_levels); 976 if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) { 977 rd_tmp.R += FLATNESS_PENALTY * kNumBlocks; 978 } 979 980 SetRDScore(lambda, &rd_tmp); 981 if (best_mode < 0 || rd_tmp.score < rd_i4.score) { 982 CopyScore(&rd_i4, &rd_tmp); 983 best_mode = mode; 984 SwapPtr(&tmp_dst, &best_block); 985 memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels)); 986 } 987 } 988 SetRDScore(dqm->lambda_mode_, &rd_i4); 989 AddScore(&rd_best, &rd_i4); 990 if (rd_best.score >= rd->score) { 991 return 0; 992 } 993 total_header_bits += (int)rd_i4.H; // <- equal to mode_costs[best_mode]; 994 if (total_header_bits > enc->max_i4_header_bits_) { 995 return 0; 996 } 997 // Copy selected samples if not in the right place already. 998 if (best_block != best_blocks + VP8Scan[it->i4_]) { 999 VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]); 1000 } 1001 rd->modes_i4[it->i4_] = best_mode; 1002 it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0); 1003 } while (VP8IteratorRotateI4(it, best_blocks)); 1004 1005 // finalize state 1006 CopyScore(rd, &rd_best); 1007 VP8SetIntra4Mode(it, rd->modes_i4); 1008 SwapOut(it); 1009 memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels)); 1010 return 1; // select intra4x4 over intra16x16 1011 } 1012 1013 //------------------------------------------------------------------------------ 1014 1015 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { 1016 const int kNumBlocks = 8; 1017 const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_]; 1018 const int lambda = dqm->lambda_uv_; 1019 const uint8_t* const src = it->yuv_in_ + U_OFF; 1020 uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF; // scratch buffer 1021 uint8_t* const dst0 = it->yuv_out_ + U_OFF; 1022 VP8ModeScore rd_best; 1023 int mode; 1024 1025 rd->mode_uv = -1; 1026 InitScore(&rd_best); 1027 for (mode = 0; mode < NUM_PRED_MODES; ++mode) { 1028 VP8ModeScore rd_uv; 1029 1030 // Reconstruct 1031 rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode); 1032 1033 // Compute RD-score 1034 rd_uv.D = VP8SSE16x8(src, tmp_dst); 1035 rd_uv.SD = 0; // TODO: should we call TDisto? it tends to flatten areas. 1036 rd_uv.H = VP8FixedCostsUV[mode]; 1037 rd_uv.R = VP8GetCostUV(it, &rd_uv); 1038 if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) { 1039 rd_uv.R += FLATNESS_PENALTY * kNumBlocks; 1040 } 1041 1042 SetRDScore(lambda, &rd_uv); 1043 if (mode == 0 || rd_uv.score < rd_best.score) { 1044 CopyScore(&rd_best, &rd_uv); 1045 rd->mode_uv = mode; 1046 memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels)); 1047 memcpy(dst0, tmp_dst, UV_SIZE); // TODO: SwapUVOut() ? 1048 } 1049 } 1050 VP8SetIntraUVMode(it, rd->mode_uv); 1051 AddScore(rd, &rd_best); 1052 } 1053 1054 //------------------------------------------------------------------------------ 1055 // Final reconstruction and quantization. 1056 1057 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) { 1058 const VP8Encoder* const enc = it->enc_; 1059 const int is_i16 = (it->mb_->type_ == 1); 1060 int nz = 0; 1061 1062 if (is_i16) { 1063 nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]); 1064 } else { 1065 VP8IteratorStartI4(it); 1066 do { 1067 const int mode = 1068 it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_]; 1069 const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_]; 1070 uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_]; 1071 VP8MakeIntra4Preds(it); 1072 nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_], 1073 src, dst, mode) << it->i4_; 1074 } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF)); 1075 } 1076 1077 nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_); 1078 rd->nz = nz; 1079 } 1080 1081 // Refine intra16/intra4 sub-modes based on distortion only (not rate). 1082 static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) { 1083 const int is_i16 = (it->mb_->type_ == 1); 1084 score_t best_score = MAX_COST; 1085 1086 if (try_both_i4_i16 || is_i16) { 1087 int mode; 1088 int best_mode = -1; 1089 for (mode = 0; mode < NUM_PRED_MODES; ++mode) { 1090 const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode]; 1091 const uint8_t* const src = it->yuv_in_ + Y_OFF; 1092 const score_t score = VP8SSE16x16(src, ref); 1093 if (score < best_score) { 1094 best_mode = mode; 1095 best_score = score; 1096 } 1097 } 1098 VP8SetIntra16Mode(it, best_mode); 1099 } 1100 if (try_both_i4_i16 || !is_i16) { 1101 uint8_t modes_i4[16]; 1102 // We don't evaluate the rate here, but just account for it through a 1103 // constant penalty (i4 mode usually needs more bits compared to i16). 1104 score_t score_i4 = (score_t)I4_PENALTY; 1105 1106 VP8IteratorStartI4(it); 1107 do { 1108 int mode; 1109 int best_sub_mode = -1; 1110 score_t best_sub_score = MAX_COST; 1111 const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_]; 1112 1113 // TODO(skal): we don't really need the prediction pixels here, 1114 // but just the distortion against 'src'. 1115 VP8MakeIntra4Preds(it); 1116 for (mode = 0; mode < NUM_BMODES; ++mode) { 1117 const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode]; 1118 const score_t score = VP8SSE4x4(src, ref); 1119 if (score < best_sub_score) { 1120 best_sub_mode = mode; 1121 best_sub_score = score; 1122 } 1123 } 1124 modes_i4[it->i4_] = best_sub_mode; 1125 score_i4 += best_sub_score; 1126 if (score_i4 >= best_score) break; 1127 } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF)); 1128 if (score_i4 < best_score) { 1129 VP8SetIntra4Mode(it, modes_i4); 1130 } 1131 } 1132 } 1133 1134 //------------------------------------------------------------------------------ 1135 // Entry point 1136 1137 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, 1138 VP8RDLevel rd_opt) { 1139 int is_skipped; 1140 const int method = it->enc_->method_; 1141 1142 InitScore(rd); 1143 1144 // We can perform predictions for Luma16x16 and Chroma8x8 already. 1145 // Luma4x4 predictions needs to be done as-we-go. 1146 VP8MakeLuma16Preds(it); 1147 VP8MakeChroma8Preds(it); 1148 1149 if (rd_opt > RD_OPT_NONE) { 1150 it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL); 1151 PickBestIntra16(it, rd); 1152 if (method >= 2) { 1153 PickBestIntra4(it, rd); 1154 } 1155 PickBestUV(it, rd); 1156 if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now 1157 it->do_trellis_ = 1; 1158 SimpleQuantize(it, rd); 1159 } 1160 } else { 1161 // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower). 1162 // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode). 1163 DistoRefine(it, (method >= 2)); 1164 SimpleQuantize(it, rd); 1165 } 1166 is_skipped = (rd->nz == 0); 1167 VP8SetSkip(it, is_skipped); 1168 return is_skipped; 1169 } 1170 1171