// File: enc/quant.c (code-browser listing header removed)
      1 // Copyright 2011 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 //   Quantization
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include <assert.h>
     15 #include <math.h>
     16 #include <stdlib.h>  // for abs()
     17 
     18 #include "./vp8enci.h"
     19 #include "./cost.h"
     20 
     21 #define DO_TRELLIS_I4  1
     22 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
     23 #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
     24 #define USE_TDISTO 1
     25 
     26 #define MID_ALPHA 64      // neutral value for susceptibility
     27 #define MIN_ALPHA 30      // lowest usable value for susceptibility
     28 #define MAX_ALPHA 100     // higher meaningful value for susceptibility
     29 
     30 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
     31                           // power-law modulation. Must be strictly less than 1.
     32 
     33 #define I4_PENALTY 14000  // Rate-penalty for quick i4/i16 decision
     34 
     35 // number of non-zero coeffs below which we consider the block very flat
     36 // (and apply a penalty to complex predictions)
     37 #define FLATNESS_LIMIT_I16 10      // I16 mode
     38 #define FLATNESS_LIMIT_I4  3       // I4 mode
     39 #define FLATNESS_LIMIT_UV  2       // UV mode
     40 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
     41 
     42 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
     43 
     44 #define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
     45 
     46 // #define DEBUG_BLOCK
     47 
     48 //------------------------------------------------------------------------------
     49 
     50 #if defined(DEBUG_BLOCK)
     51 
     52 #include <stdio.h>
     53 #include <stdlib.h>
     54 
// Debug helper (enabled via DEBUG_BLOCK): dumps the current macroblock's
// source samples, reconstructed samples and their absolute differences to
// stdout, followed by the rate-distortion fields and quantized levels of
// the supplied mode score 'rd'.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);   // 1 means intra-16x16 mode
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  // 16x16 luma block: source, reconstruction, then per-pixel |delta|.
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  // U and V planes are stored side by side (columns 0..7 = U, 8..15 = V).
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf("   ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  // Rate-distortion summary: distortion, spectral distortion, rate, header
  // bits, non-zero flags, and combined score.
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    // for i16 blocks, coefficient 0 (DC) lives in y_dc_levels, so skip it
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}
    122 
    123 #endif   // DEBUG_BLOCK
    124 
    125 //------------------------------------------------------------------------------
    126 
    127 static WEBP_INLINE int clip(int v, int m, int M) {
    128   return v < m ? m : v > M ? M : v;
    129 }
    130 
// Zigzag scan order: maps scan position n -> natural (raster) coefficient
// index within a 4x4 block.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

// DC quantizer steps, indexed by a clipped quantizer index in [0, 127].
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// AC quantizer steps, indexed by a clipped quantizer index in [0, 127].
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// AC quantizer steps used for the Y2 (second-order / WHT) matrix — see its
// use for y2_.q_[1] in SetupMatrices().
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

// Quantization rounding biases, consumed by ExpandMatrix() through BIAS().
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
// Per-coefficient sharpening strength (natural order): zero for DC,
// increasing toward high frequencies.
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
    205 
    206 //------------------------------------------------------------------------------
    207 // Initialize quantization parameters in VP8Matrix
    208 
    209 // Returns the average quantizer
    210 static int ExpandMatrix(VP8Matrix* const m, int type) {
    211   int i, sum;
    212   for (i = 0; i < 2; ++i) {
    213     const int is_ac_coeff = (i > 0);
    214     const int bias = kBiasMatrices[type][is_ac_coeff];
    215     m->iq_[i] = (1 << QFIX) / m->q_[i];
    216     m->bias_[i] = BIAS(bias);
    217     // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    218     //   * zero if coeff <= zthresh
    219     //   * non-zero if coeff > zthresh
    220     m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
    221   }
    222   for (i = 2; i < 16; ++i) {
    223     m->q_[i] = m->q_[1];
    224     m->iq_[i] = m->iq_[1];
    225     m->bias_[i] = m->bias_[1];
    226     m->zthresh_[i] = m->zthresh_[1];
    227   }
    228   for (sum = 0, i = 0; i < 16; ++i) {
    229     if (type == 0) {  // we only use sharpening for AC luma coeffs
    230       m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    231     } else {
    232       m->sharpen_[i] = 0;
    233     }
    234     sum += m->q_[i];
    235   }
    236   return (sum + 8) >> 4;
    237 }
    238 
// Finalizes, for each active segment, the three quantization matrices
// (y1: luma, y2: luma-DC/WHT, uv: chroma) and the lambda multipliers used
// by the various rate-distortion decisions.
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  // t-lambda (texture-distortion weight) is only enabled for method >= 4,
  // scaled by the user's sns_strength setting.
  const int tlambda_scale =
    (enc->method_ >= 4) ? enc->config_->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q4, q16, quv;
    // Seed the DC/AC quantizer steps from the per-plane delta-q's.
    // NOTE(review): the uv-DC index is clipped to 117 (not 127) —
    // presumably per the VP8 spec's chroma-DC clamp; confirm against spec.
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    // q4/q16/quv are the average quantizers of each expanded matrix.
    q4  = ExpandMatrix(&m->y1_, 0);
    q16 = ExpandMatrix(&m->y2_, 1);
    quv = ExpandMatrix(&m->uv_, 2);

    // Empirical lambda scalings (roughly quadratic in the quantizer) for
    // the different mode decisions and trellis passes.
    m->lambda_i4_          = (3 * q4 * q4) >> 7;
    m->lambda_i16_         = (3 * q16 * q16);
    m->lambda_uv_          = (3 * quv * quv) >> 6;
    m->lambda_mode_        = (1 * q4 * q4) >> 7;
    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
    m->lambda_trellis_uv_  = (quv *quv) << 1;
    m->tlambda_            = (tlambda_scale * q4) >> 5;

    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;
  }
}
    275 
    276 //------------------------------------------------------------------------------
    277 // Initialize filtering parameters
    278 
    279 // Very small filter-strength values have close to no visual effect. So we can
    280 // save a little decoding-CPU by turning filtering off for these.
    281 #define FSTRENGTH_CUTOFF 2
    282 
// Derives each segment's loop-filter strength from the global
// filter_strength config, the segment's AC quantizer step, and its
// complexity ('beta'), then seeds the filter header fields.
static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    // Clamp to [0, 63]; below FSTRENGTH_CUTOFF filtering is turned off
    // entirely to save decoder CPU.
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
    302 
    303 //------------------------------------------------------------------------------
    304 
    305 // Note: if you change the values below, remember that the max range
    306 // allowed by the syntax for DQ_UV is [-16,16].
    307 #define MAX_DQ_UV (6)
    308 #define MIN_DQ_UV (-4)
    309 
    310 // We want to emulate jpeg-like behaviour where the expected "good" quality
    311 // is around q=75. Internally, our "good" middle is around c=50. So we
    312 // map accordingly using linear piece-wise function
    313 static double QualityToCompression(double c) {
    314   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
    315   // The file size roughly scales as pow(quantizer, 3.). Actually, the
    316   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
    317   // in the mid-quant range. So we scale the compressibility inversely to
    318   // this power-law: quant ~= compression ^ 1/3. This law holds well for
    319   // low quant. Finer modeling for high-quant would make use of kAcTable[]
    320   // more explicitly.
    321   const double v = pow(linear_c, 1 / 3.);
    322   return v;
    323 }
    324 
    325 static double QualityToJPEGCompression(double c, double alpha) {
    326   // We map the complexity 'alpha' and quality setting 'c' to a compression
    327   // exponent empirically matched to the compression curve of libjpeg6b.
    328   // On average, the WebP output size will be roughly similar to that of a
    329   // JPEG file compressed with same quality factor.
    330   const double amin = 0.30;
    331   const double amax = 0.85;
    332   const double exp_min = 0.4;
    333   const double exp_max = 0.9;
    334   const double slope = (exp_min - exp_max) / (amax - amin);
    335   // Linearly interpolate 'expn' from exp_min to exp_max
    336   // in the [amin, amax] range.
    337   const double expn = (alpha > amax) ? exp_min
    338                     : (alpha < amin) ? exp_max
    339                     : exp_max + slope * (alpha - amin);
    340   const double v = pow(c, expn);
    341   return v;
    342 }
    343 
    344 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
    345                                  const VP8SegmentInfo* const S2) {
    346   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
    347 }
    348 
    349 static void SimplifySegments(VP8Encoder* const enc) {
    350   int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
    351   const int num_segments = enc->segment_hdr_.num_segments_;
    352   int num_final_segments = 1;
    353   int s1, s2;
    354   for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    355     const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    356     int found = 0;
    357     // check if we already have similar segment
    358     for (s2 = 0; s2 < num_final_segments; ++s2) {
    359       const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
    360       if (SegmentsAreEquivalent(S1, S2)) {
    361         found = 1;
    362         break;
    363       }
    364     }
    365     map[s1] = s2;
    366     if (!found) {
    367       if (num_final_segments != s1) {
    368         enc->dqm_[num_final_segments] = enc->dqm_[s1];
    369       }
    370       ++num_final_segments;
    371     }
    372   }
    373   if (num_final_segments < num_segments) {  // Remap
    374     int i = enc->mb_w_ * enc->mb_h_;
    375     while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    376     enc->segment_hdr_.num_segments_ = num_final_segments;
    377     // Replicate the trailing segment infos (it's mostly cosmetics)
    378     for (i = num_final_segments; i < num_segments; ++i) {
    379       enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    380     }
    381   }
    382 }
    383 
// Computes, from the target 'quality' in [0..100]: the per-segment
// quantizers, the indicative base quantizer, the UV delta-quantizers,
// the per-segment filter strengths, and finally the quantization matrices.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  // 'amp' controls how strongly a segment's susceptibility (alpha_)
  // modulates its quantizer; SNS_TO_DQ < 1 keeps expn positive below.
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
    438 
    439 //------------------------------------------------------------------------------
    440 // Form the predictions in cache
    441 
// Offsets into the iterator's yuv_p_ prediction cache, one per prediction
// mode (used e.g. as 'it->yuv_p_ + VP8I16ModeOffsets[mode]').
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const int VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
    450 
    451 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
    452   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
    453   const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
    454   VP8EncPredLuma16(it->yuv_p_, left, top);
    455 }
    456 
    457 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
    458   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
    459   const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
    460   VP8EncPredChroma8(it->yuv_p_, left, top);
    461 }
    462 
// Fills the prediction cache with the intra-4x4 predictions for the current
// sub-block, using the top/top-right context cached in i4_top_
// (see VP8I4ModeOffsets for where each mode's samples land).
void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}
    466 
    467 //------------------------------------------------------------------------------
    468 // Quantize
    469 
    470 // Layout:
    471 // +----+----+
    472 // |YYYY|UUVV| 0
    473 // |YYYY|UUVV| 4
    474 // |YYYY|....| 8
    475 // |YYYY|....| 12
    476 // +----+----+
    477 
// Pixel offsets of the sixteen 4x4 luma sub-blocks within the packed
// macroblock layout above, in raster order.
const int VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

// Pixel offsets of the 4 U then 4 V sub-blocks (U and V planes are stored
// side by side in the layout above).
static const int VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
    489 
    490 //------------------------------------------------------------------------------
    491 // Distortion measurement
    492 
// Perceptual weights for luma distortion (lower-frequency coefficients
// weigh more).
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

// Per-coefficient distortion weights for trellis quantization, indexed in
// natural (raster) order — see kWeightTrellis[j] with j = kZigzag[n] in
// TrellisQuantizeBlock(). Uniform when USE_TDISTO is disabled.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
    507 
    508 // Init/Copy the common fields in score.
// Resets the common accumulation fields of 'rd'; score starts at MAX_COST
// so that any real candidate compares strictly better.
static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
  rd->H  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}
    517 
// Copies the common score fields from 'src' into 'dst' (mode fields and
// level arrays are left untouched).
static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->H  = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}
    526 
// Accumulates the common score fields of 'src' into 'dst' (used when
// summing per-sub-block contributions into a macroblock total).
static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->H  += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}
    535 
    536 //------------------------------------------------------------------------------
    537 // Performs trellis-optimized quantization.
    538 
// Trellis node: one candidate quantization level for a given coefficient
// position, remembering which predecessor node yielded the best score.
typedef struct {
  int8_t prev;            // best previous node
  int8_t sign;            // sign of coeff_i
  int16_t level;          // level
} Node;

// Score state: rolling per-node state while sweeping the trellis.
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;
    551 
    552 // If a coefficient was quantized to a value Q (using a neutral bias),
    553 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
    554 // We don't test negative values though.
    555 #define MIN_DELTA 0   // how much lower level to try
    556 #define MAX_DELTA 1   // how much higher
    557 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
    558 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
    559 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
    560 
// Combines the rate (R: coefficient bits + H: header bits) and distortion
// (D + SD) fields of 'rd' into its final score: rate * lambda +
// RD_DISTO_MULT * distortion.
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}
    564 
// Scalar rate-distortion score used inside the trellis search:
// rate * lambda + RD_DISTO_MULT * distortion.
static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
    569 
// Trellis (Viterbi-style) quantization of one block of 16 coefficients.
// For each coefficient, the levels [level0 - MIN_DELTA .. level0 + MAX_DELTA]
// around the neutral-bias quantization 'level0' are considered, and the path
// minimizing rate * lambda + RD_DISTO_MULT * distortion is retained.
// 'in' holds the coefficients in natural order; on return it is overwritten
// with the dequantized values of the chosen levels, and 'out' receives the
// levels in zigzag order. 'ctx0' is the initial coding context and
// 'coeff_type' selects the probability/cost tables (type 0 starts at
// coefficient 1, skipping DC). Returns non-zero if any level is non-zero;
// 0 means the whole block should be skipped.
static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    // threshold below which a coefficient is considered negligible
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q  = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;   // coding context saturates at 2
      const int band = VP8EncBands[n + 1];
      score_t base_score, last_pos_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;   // default, in case

      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[n + 1][ctx];
      if (level > MAX_LEVEL || level < 0) {   // node is dead?
        continue;
      }

      // Compute extra rate cost if last coeff's position is < 15
      {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract to max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;                     // best eob position
          best_path[1] = m;                     // best node index
          best_path[2] = best_prev;             // best predecessor
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];   // dequantize back into 'in'
      best_node = node->prev;
    }
    return (nz != 0);
  }
}
    729 
    730 #undef NODE
    731 
    732 //------------------------------------------------------------------------------
    733 // Performs: difference, transform, quantize, back-transform, add
    734 // all at once. Output is the reconstructed block in *yuv_out, and the
    735 // quantized levels in *levels.
    736 
// Reconstructs the full 16x16 luma macroblock for intra16 prediction 'mode':
// forward-transforms the source/prediction residual, routes the sixteen DC
// coefficients through a separate Walsh-Hadamard (WHT) block, quantizes the
// AC coefficients per 4x4 sub-block (with optional trellis optimization),
// then inverse-transforms back into 'yuv_out'. Quantized levels are stored
// in rd->y_dc_levels / rd->y_ac_levels.
// Returns the non-zero-coefficient bitmap: bits 0..15 for the sixteen AC
// blocks, bit 24 for the DC (WHT) block.
static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  // Forward transform the residual, two 4x4 blocks at a time.
  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // Gather the 16 DC coefficients into their own WHT-transformed block.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);   // refresh top/left non-zero contexts
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        // Context for trellis = number of non-zero neighbors (top + left).
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;   // DC is carried by the WHT block above
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
    788 
// Reconstructs a single 4x4 luma block for intra4 prediction 'mode':
// forward transform of the src/ref residual, quantization (trellis when
// enabled), and inverse transform into 'yuv_out'. 'levels' receives the
// quantized coefficients. Returns non-zero iff any coefficient survived
// quantization.
static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    // Trellis context = number of non-zero neighbors for this sub-block.
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}
    812 
// Reconstructs the two 8x8 chroma planes (U and V) for prediction 'mode':
// forward transform, quantization (trellis for UV is disabled by default,
// see DO_TRELLIS_UV), inverse transform into 'yuv_out'. Quantized levels go
// into rd->uv_levels. Returns the 8-bit non-zero map shifted left by 16,
// i.e. already positioned in the chroma section of the macroblock nz field.
static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  // Forward transform, two 4x4 blocks at a time (blocks 0..3 = U, 4..7 = V).
  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {   // ch = 0 (U), 2 (V)
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);   // pre-shift into the chroma nz bit range
}
    851 
    852 //------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
    855 
    856 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
    857   // We look at the first three AC coefficients to determine what is the average
    858   // delta between each sub-4x4 block.
    859   const int v0 = abs(DCs[1]);
    860   const int v1 = abs(DCs[4]);
    861   const int v2 = abs(DCs[5]);
    862   int max_v = (v0 > v1) ? v1 : v0;
    863   max_v = (v2 > max_v) ? v2 : max_v;
    864   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
    865 }
    866 
    867 static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
    868   VP8ModeScore* const tmp = *a;
    869   *a = *b;
    870   *b = tmp;
    871 }
    872 
    873 static void SwapPtr(uint8_t** a, uint8_t** b) {
    874   uint8_t* const tmp = *a;
    875   *a = *b;
    876   *b = tmp;
    877 }
    878 
    879 static void SwapOut(VP8EncIterator* const it) {
    880   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
    881 }
    882 
    883 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
    884   score_t score = 0;
    885   while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
    886     int i;
    887     for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
    888       score += (levels[i] != 0);
    889       if (score > thresh) return 0;
    890     }
    891     levels += 16;
    892   }
    893   return 1;
    894 }
    895 
// Evaluates every intra16 luma prediction mode with full rate-distortion
// scoring. The winning mode, quantized levels and score are stored in *rd,
// and the winning reconstruction ends up in it->yuv_out_ (via buffer swaps).
static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score: D = SSE distortion, SD = texture distortion
    // (weighted by tlambda, when enabled), H = fixed mode-signaling cost,
    // R = coefficient cost.
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (mode > 0 &&
        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid flat area to be mispredicted by complex mode
      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      // New best: ping-pong both the score holders and the out/out2 buffers,
      // so the best reconstruction is always the one in it->yuv_out_.
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {   // odd number of swaps: copy the winner back into *rd
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
    947 
    948 //------------------------------------------------------------------------------
    949 
    950 // return the cost array corresponding to the surrounding prediction modes.
    951 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
    952                                      const uint8_t modes[16]) {
    953   const int preds_w = it->enc_->preds_w_;
    954   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
    955   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
    956   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
    957   return VP8FixedCostsI4[top][left];
    958 }
    959 
// Searches the best intra4x4 mode for each of the 16 sub-blocks, accumulating
// the RD-score and comparing it against the intra16 result already stored in
// *rd. Returns 1 (and commits modes, levels, samples and score) when intra4
// wins; returns 0 — leaving *rd untouched — when intra4 is worse or when the
// header-bit budget (enc->max_i4_header_bits_) is exhausted.
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {   // intra4 disabled altogether
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check: partial score (without coefficient cost) already
      // worse than the current best — skip the expensive cost evaluation.
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Ping-pong so 'best_block' always points at the best samples.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {   // already worse than intra16: bail out
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);   // best samples were assembled in yuv_out2_: swap into place
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
   1052 
   1053 //------------------------------------------------------------------------------
   1054 
// Evaluates every chroma prediction mode with RD scoring, picks the best,
// adds its score into *rd and leaves the winning reconstructed 16x8 chroma
// samples in it->yuv_out_.
static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      // penalty to avoid a flat area being mispredicted by a complex mode
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      // Ping-pong so 'dst' always points at the best samples so far.
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
}
   1097 
   1098 //------------------------------------------------------------------------------
   1099 // Final reconstruction and quantization.
   1100 
// Quantizes and reconstructs the macroblock using the intra modes that have
// already been decided (it->mb_->type_ and it->preds_); no mode search is
// performed. Fills rd->nz and the level arrays, and writes the reconstructed
// samples into it->yuv_out_. Used e.g. to apply a final trellis pass over
// the winning modes (see VP8Decimate with RD_OPT_TRELLIS).
static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);   // intra16x16 vs intra4x4
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {   // reconstruct each 4x4 sub-block with its pre-selected mode
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}
   1124 
// Refine intra16/intra4 sub-modes based on distortion only (not rate).
// When 'try_both_modes' is set, intra16 and intra4 are both evaluated by SSE
// (intra4 carries a fixed I4_PENALTY as a rate surrogate) and the cheaper
// one wins; otherwise the pre-decided macroblock type is kept. When
// 'refine_uv_mode' is set, the chroma mode is also re-picked by SSE.
// Results (levels, nz, score) are stored in *rd and the reconstruction is
// left in it->yuv_out_.
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  score_t score_i4 = (score_t)I4_PENALTY;   // fixed rate-penalty for intra4
  int16_t tmp_levels[16][16];
  uint8_t modes_i4[16];
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = VP8SSE16x16(src, ref);
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref);
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, tmp_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, modes_i4);
    memcpy(rd->y_ac_levels, tmp_levels, sizeof(tmp_levels));
    SwapOut(it);   // intra4 samples were built in yuv_out2_: swap into place
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref);
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}
   1216 
   1217 //------------------------------------------------------------------------------
   1218 // Entry point
   1219 
   1220 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
   1221                 VP8RDLevel rd_opt) {
   1222   int is_skipped;
   1223   const int method = it->enc_->method_;
   1224 
   1225   InitScore(rd);
   1226 
   1227   // We can perform predictions for Luma16x16 and Chroma8x8 already.
   1228   // Luma4x4 predictions needs to be done as-we-go.
   1229   VP8MakeLuma16Preds(it);
   1230   VP8MakeChroma8Preds(it);
   1231 
   1232   if (rd_opt > RD_OPT_NONE) {
   1233     it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
   1234     PickBestIntra16(it, rd);
   1235     if (method >= 2) {
   1236       PickBestIntra4(it, rd);
   1237     }
   1238     PickBestUV(it, rd);
   1239     if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
   1240       it->do_trellis_ = 1;
   1241       SimpleQuantize(it, rd);
   1242     }
   1243   } else {
   1244     // At this point we have heuristically decided intra16 / intra4.
   1245     // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
   1246     // For method <= 1, we don't re-examine the decision but just go ahead with
   1247     // quantization/reconstruction.
   1248     RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
   1249   }
   1250   is_skipped = (rd->nz == 0);
   1251   VP8SetSkip(it, is_skipped);
   1252   return is_skipped;
   1253 }
   1254