// libwebp encoder source (src/enc/quant.c) — extracted from a code-viewer page.
      1 // Copyright 2011 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 //   Quantization
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include <assert.h>
     15 #include <math.h>
     16 #include <stdlib.h>  // for abs()
     17 
     18 #include "./vp8enci.h"
     19 #include "./cost.h"
     20 
     21 #define DO_TRELLIS_I4  1
     22 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
     23 #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
     24 #define USE_TDISTO 1
     25 
     26 #define MID_ALPHA 64      // neutral value for susceptibility
     27 #define MIN_ALPHA 30      // lowest usable value for susceptibility
     28 #define MAX_ALPHA 100     // higher meaningful value for susceptibility
     29 
     30 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
     31                           // power-law modulation. Must be strictly less than 1.
     32 
     33 #define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
     34 
     35 // number of non-zero coeffs below which we consider the block very flat
     36 // (and apply a penalty to complex predictions)
     37 #define FLATNESS_LIMIT_I16 10      // I16 mode
     38 #define FLATNESS_LIMIT_I4  3       // I4 mode
     39 #define FLATNESS_LIMIT_UV  2       // UV mode
     40 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
     41 
     42 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
     43 
     44 // #define DEBUG_BLOCK
     45 
     46 //------------------------------------------------------------------------------
     47 
     48 #if defined(DEBUG_BLOCK)
     49 
     50 #include <stdio.h>
     51 #include <stdlib.h>
     52 
// Debug-only helper (enabled via DEBUG_BLOCK): dumps the current macroblock's
// source samples, reconstructed samples, per-pixel absolute error, the RD
// score fields, and the quantized coefficient levels to stdout.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);   // i16x16 vs i4x4 macroblock
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  // rows 0..15 are luma; rows 16..23 are the U/V block (cf. 'Layout' below)
  for (j = 0; j < 24; ++j) {
    if (j == 16) printf("\n");   // newline before the U/V block
    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    // in i16 mode the DC coefficient lives in y_dc_levels, so skip index 0
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}
     98 
     99 #endif   // DEBUG_BLOCK
    100 
    101 //------------------------------------------------------------------------------
    102 
    103 static WEBP_INLINE int clip(int v, int m, int M) {
    104   return v < m ? m : v > M ? M : v;
    105 }
    106 
// Zigzag scan order: maps scan position -> raster index within a 4x4 block.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
    110 
// DC quantizer step, indexed by the clipped quantizer index in [0..127].
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};
    129 
// AC quantizer step (used for Y1 and UV planes, cf. SetupMatrices()),
// indexed by the clipped quantizer index in [0..127].
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};
    148 
// AC quantizer step for the Y2 plane (luma-DC / WHT coefficients, cf. the
// kAcTable2 use in SetupMatrices()), indexed by quantizer index in [0..127].
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};
    167 
// Quantization rounding biases, fed to the BIAS() macro in ExpandMatrix().
// Rows are indexed by matrix 'type' (0 = y1/luma-ac, 1 = y2/luma-dc,
// 2 = chroma), columns by coefficient class [dc, ac].
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};
    171 
// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
// Per-coefficient sharpening strength, in raster order within the 4x4 block
// (zero for DC, increasing with frequency).
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
    181 
    182 //------------------------------------------------------------------------------
    183 // Initialize quantization parameters in VP8Matrix
    184 
    185 // Returns the average quantizer
    186 static int ExpandMatrix(VP8Matrix* const m, int type) {
    187   int i, sum;
    188   for (i = 0; i < 2; ++i) {
    189     const int is_ac_coeff = (i > 0);
    190     const int bias = kBiasMatrices[type][is_ac_coeff];
    191     m->iq_[i] = (1 << QFIX) / m->q_[i];
    192     m->bias_[i] = BIAS(bias);
    193     // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    194     //   * zero if coeff <= zthresh
    195     //   * non-zero if coeff > zthresh
    196     m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
    197   }
    198   for (i = 2; i < 16; ++i) {
    199     m->q_[i] = m->q_[1];
    200     m->iq_[i] = m->iq_[1];
    201     m->bias_[i] = m->bias_[1];
    202     m->zthresh_[i] = m->zthresh_[1];
    203   }
    204   for (sum = 0, i = 0; i < 16; ++i) {
    205     if (type == 0) {  // we only use sharpening for AC luma coeffs
    206       m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    207     } else {
    208       m->sharpen_[i] = 0;
    209     }
    210     sum += m->q_[i];
    211   }
    212   return (sum + 8) >> 4;
    213 }
    214 
// For each active segment, builds the three quantization matrices
// (y1 = luma, y2 = luma-DC/WHT, uv = chroma) from the segment quantizer and
// the per-plane deltas, then derives the rate-distortion lambda multipliers.
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  // TDisto-based lambda is only enabled for methods >= 4.
  const int tlambda_scale =
    (enc->method_ >= 4) ? enc->config_->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q4, q16, quv;   // average quantizer steps for i4 / i16 / uv
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    // NOTE(review): the uv-dc index is clipped to 117 (not 127) here —
    // presumably the max useful uv-dc quantizer index; confirm vs. VP8 spec.
    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q4  = ExpandMatrix(&m->y1_, 0);
    q16 = ExpandMatrix(&m->y2_, 1);
    quv = ExpandMatrix(&m->uv_, 2);

    // Empirically-tuned lambdas, all scaling roughly as quantizer^2.
    m->lambda_i4_          = (3 * q4 * q4) >> 7;
    m->lambda_i16_         = (3 * q16 * q16);
    m->lambda_uv_          = (3 * quv * quv) >> 6;
    m->lambda_mode_        = (1 * q4 * q4) >> 7;
    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
    m->lambda_trellis_uv_  = (quv *quv) << 1;
    m->tlambda_            = (tlambda_scale * q4) >> 5;

    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;
  }
}
    251 
    252 //------------------------------------------------------------------------------
    253 // Initialize filtering parameters
    254 
    255 // Very small filter-strength values have close to no visual effect. So we can
    256 // save a little decoding-CPU by turning filtering off for these.
    257 #define FSTRENGTH_CUTOFF 2
    258 
// Derives each segment's in-loop filter strength from the user-level
// filter_strength setting, the segment's AC quantizer step and its
// complexity ('beta'), then records the header-level filter parameters.
static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    // clamp to [0..63], zeroing near-invisible strengths (FSTRENGTH_CUTOFF)
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
    278 
    279 //------------------------------------------------------------------------------
    280 
    281 // Note: if you change the values below, remember that the max range
    282 // allowed by the syntax for DQ_UV is [-16,16].
    283 #define MAX_DQ_UV (6)
    284 #define MIN_DQ_UV (-4)
    285 
    286 // We want to emulate jpeg-like behaviour where the expected "good" quality
    287 // is around q=75. Internally, our "good" middle is around c=50. So we
    288 // map accordingly using linear piece-wise function
    289 static double QualityToCompression(double c) {
    290   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
    291   // The file size roughly scales as pow(quantizer, 3.). Actually, the
    292   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
    293   // in the mid-quant range. So we scale the compressibility inversely to
    294   // this power-law: quant ~= compression ^ 1/3. This law holds well for
    295   // low quant. Finer modeling for high-quant would make use of kAcTable[]
    296   // more explicitly.
    297   const double v = pow(linear_c, 1 / 3.);
    298   return v;
    299 }
    300 
    301 static double QualityToJPEGCompression(double c, double alpha) {
    302   // We map the complexity 'alpha' and quality setting 'c' to a compression
    303   // exponent empirically matched to the compression curve of libjpeg6b.
    304   // On average, the WebP output size will be roughly similar to that of a
    305   // JPEG file compressed with same quality factor.
    306   const double amin = 0.30;
    307   const double amax = 0.85;
    308   const double exp_min = 0.4;
    309   const double exp_max = 0.9;
    310   const double slope = (exp_min - exp_max) / (amax - amin);
    311   // Linearly interpolate 'expn' from exp_min to exp_max
    312   // in the [amin, amax] range.
    313   const double expn = (alpha > amax) ? exp_min
    314                     : (alpha < amin) ? exp_max
    315                     : exp_max + slope * (alpha - amin);
    316   const double v = pow(c, expn);
    317   return v;
    318 }
    319 
    320 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
    321                                  const VP8SegmentInfo* const S2) {
    322   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
    323 }
    324 
// Merges segments that ended up with equivalent parameters: compacts
// enc->dqm_[], rewrites every macroblock's segment id through a remap table
// and updates the header's segment count.
static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };   // old-id -> new-id
  const int num_segments = enc->segment_hdr_.num_segments_;
  int num_final_segments = 1;   // segment #0 is always kept
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    // if not found, s2 == num_final_segments here, i.e. the very slot that
    // segment s1 is about to be compacted into.
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}
    359 
// Distributes the quantizers: derives each segment's quantizer from the
// global 'quality' (optionally emulating libjpeg's size curve), modulated
// by the segment's susceptibility; then computes the UV quantizer deltas,
// initializes filter strengths, merges equivalent segments and finalizes
// the quantization matrices.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
    414 
    415 //------------------------------------------------------------------------------
    416 // Form the predictions in cache
    417 
// Offsets into it->yuv_p_ where each mode's predicted samples are stored
// (cf. their use in ReconstructIntra16() / ReconstructIntra4()).
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const int VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
    426 
    427 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
    428   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
    429   const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
    430   VP8EncPredLuma16(it->yuv_p_, left, top);
    431 }
    432 
    433 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
    434   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
    435   const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
    436   VP8EncPredChroma8(it->yuv_p_, left, top);
    437 }
    438 
// Computes the intra4x4 predictions for the current sub-block into
// it->yuv_p_, using the cached top/top-right samples in it->i4_top_.
void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}
    442 
    443 //------------------------------------------------------------------------------
    444 // Quantize
    445 
    446 // Layout:
    447 // +----+
    448 // |YYYY| 0
    449 // |YYYY| 4
    450 // |YYYY| 8
    451 // |YYYY| 12
    452 // +----+
    453 // |UUVV| 16
    454 // |UUVV| 20
    455 // +----+
    456 
// Sample offset of each 4x4 luma sub-block inside the macroblock,
// in raster order (cf. the Layout diagram above).
const int VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};
    463 
// Sample offset of each 4x4 chroma sub-block: four U blocks then four V
// blocks, matching the side-by-side UUVV layout above.
static const int VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
    468 
    469 //------------------------------------------------------------------------------
    470 // Distortion measurement
    471 
// Per-coefficient distortion weights for luma (4x4, raster order);
// presumably feeds the spectral-distortion metric — confirm at call sites.
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};
    475 
// Per-coefficient distortion weights used by TrellisQuantizeBlock()
// (flat when the TDisto metric is disabled).
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
    486 
// Init/Copy the common fields in score.
// Resets all rate/distortion accumulators and marks the score as worst-case
// so any real candidate will improve on it.
static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
  rd->H  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}
    496 
// Copies the common score fields (D/SD/R/H/nz/score) from 'src' to 'dst'.
static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->H  = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}
    505 
// Accumulates 'src' score fields into 'dst' (nz bits are OR-ed in).
static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->H  += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}
    514 
    515 //------------------------------------------------------------------------------
    516 // Performs trellis-optimized quantization.
    517 
// Trellis node
typedef struct {
  int8_t prev;            // best previous node
  int8_t sign;            // sign of coeff_i
  int16_t level;          // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
// Accessors into the nodes[] / score_states[] arrays, shifted by MIN_DELTA
// so a delta of -MIN_DELTA maps to array index 0.
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
    539 
// Combines rate (R + H) and distortion (D + SD) into rd->score,
// weighted by 'lambda'.
static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  // TODO: incorporate the "* 256" in the tables?
  rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
}
    544 
// Elementary rate-distortion cost used during trellis traversal.
static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + 256 * distortion;
}
    549 
// Quantizes one 4x4 block with a trellis (Viterbi-style) search: for each
// coefficient, level alternatives around the neutral-bias quantization are
// scored for rate + weighted distortion under 'lambda', and the best path
// (including the "skip all" option) is kept. On output, 'out' holds the
// chosen levels in zigzag order, 'in' is overwritten with the corresponding
// dequantized coefficients, and the return value is 1 iff any level is
// non-zero. 'ctx0' is the initial coding context, 'first' skips the DC
// coefficient for luma-AC blocks (coeff_type == 0).
static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  // double-buffered per-level score states, swapped at each position
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q  = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score, last_pos_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;   // default, in case

      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[band][ctx];
      if (level > MAX_LEVEL || level < 0) {   // node is dead?
        continue;
      }

      // Compute extra rate cost if last coeff's position is < 15
      {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract to max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;                     // best eob position
          best_path[1] = m;                     // best node index
          best_path[2] = best_prev;             // best predecessor
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];   // dequantized reconstruction
      best_node = node->prev;
    }
    return (nz != 0);
  }
}
    708 
    709 #undef NODE
    710 
    711 //------------------------------------------------------------------------------
    712 // Performs: difference, transform, quantize, back-transform, add
    713 // all at once. Output is the reconstructed block in *yuv_out, and the
    714 // quantized levels in *levels.
    715 
// Reconstructs the i16x16 luma macroblock for prediction 'mode':
// forward-transforms the residual, quantizes the WHT/DC block and the 16 AC
// blocks (with trellis when enabled), then inverse-transforms into yuv_out.
// Returns the non-zero-coefficient bitmap (bit n for AC block n, bit 24 for
// the DC/WHT block).
static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; ++n) {
    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // The 16 DC coeffs are transformed/quantized separately through the WHT.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;   // DC is carried by y_dc_levels instead
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; ++n) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = 0;
      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  // NOTE(review): n += 2 with trailing arg '1' — presumably VP8ITransform
  // processes two adjacent blocks per call here; confirm against dsp code.
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
    766 
// Reconstructs a single 4x4 luma block for intra4 prediction 'mode':
// forward-transforms the src/prediction residual, quantizes it into
// 'levels' (trellis-based when enabled), and inverse-transforms into
// 'yuv_out'. Returns non-zero if the block has a non-zero coefficient.
static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;   // sub-block position in the MB
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}
    790 
// Reconstructs the 8x8 U and V chroma planes for prediction 'mode':
// per-4x4 residual transform, quantization (trellis path compiled out by
// default, see DO_TRELLIS_UV), and inverse transform into 'yuv_out'.
// Returns the non-zero mask of the 8 chroma blocks shifted into
// bits 16..23, matching the chroma slots of the macroblock nz mask.
static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; ++n) {
    VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    // ch is 0 for U and 2 for V; the nz context arrays hold chroma at 4+ch.
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; ++n) {
      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {   // inverse transform two blocks at a time
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);   // chroma occupies bits 16..23 of the macroblock mask
}
    829 
    830 //------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost: score = Rate + lambda * Distortion.
    833 
    834 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
    835   // We look at the first three AC coefficients to determine what is the average
    836   // delta between each sub-4x4 block.
    837   const int v0 = abs(DCs[1]);
    838   const int v1 = abs(DCs[4]);
    839   const int v2 = abs(DCs[5]);
    840   int max_v = (v0 > v1) ? v1 : v0;
    841   max_v = (v2 > max_v) ? v2 : max_v;
    842   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
    843 }
    844 
    845 static void SwapPtr(uint8_t** a, uint8_t** b) {
    846   uint8_t* const tmp = *a;
    847   *a = *b;
    848   *b = tmp;
    849 }
    850 
    851 static void SwapOut(VP8EncIterator* const it) {
    852   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
    853 }
    854 
    855 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
    856   score_t score = 0;
    857   while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
    858     int i;
    859     for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
    860       score += (levels[i] != 0);
    861       if (score > thresh) return 0;
    862     }
    863     levels += 16;
    864   }
    865   return 1;
    866 }
    867 
// Exhaustive RD search over the intra16 luma prediction modes. The winning
// mode's reconstruction ends up in yuv_out_ (via SwapOut), and *rd receives
// its score, quantized levels, mode and non-zero mask.
static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  VP8ModeScore rd16;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
    int nz;

    // Reconstruct
    nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);

    // Measure RD-score: D = SSE distortion, SD = weighted spectral
    // distortion (skipped if tlambda == 0), H = mode signaling cost,
    // R = coefficient coding cost.
    rd16.D = VP8SSE16x16(src, tmp_dst);
    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
            : 0;
    rd16.H = VP8FixedCostsI16[mode];
    rd16.R = VP8GetCostLuma16(it, &rd16);
    if (mode > 0 &&
        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid flat area to be mispredicted by complex mode
      rd16.R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, &rd16);
    if (mode == 0 || rd16.score < rd->score) {
      CopyScore(rd, &rd16);
      rd->mode_i16 = mode;
      rd->nz = nz;
      memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
      memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
      SwapOut(it);   // best-so-far reconstruction now lives in yuv_out_
    }
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
    918 
    919 //------------------------------------------------------------------------------
    920 
// Returns the cost array corresponding to the surrounding prediction modes:
// VP8FixedCostsI4 is indexed by the modes of the top and left neighboring
// 4x4 blocks. For blocks on the macroblock's left/top edge the neighbor
// mode is read from the 'preds_' plane (note the negative offsets, which
// reach into the previously-coded neighbor macroblocks); otherwise it comes
// from the modes[] already chosen inside this macroblock.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}
    930 
// Tries intra4x4 coding for the whole macroblock, one 4x4 sub-block at a
// time, comparing the accumulated score against the intra16 result already
// stored in *rd. Returns 1 (and updates *rd, the mode array and yuv_out_)
// if intra4 wins; returns 0 early if it cannot beat intra16 or exceeds the
// header-bit budget.
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;   // zero budget: intra4 is effectively disabled
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;   // penalize complex modes
      }

      SetRDScore(lambda, &rd_tmp);
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // Ping-pong tmp_dst/best_block so that the best reconstruction
        // so far is always the one 'best_block' points to.
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;   // partial intra4 cost already exceeds intra16: give up
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;   // header-bit budget exceeded
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);   // reconstructed samples were accumulated in yuv_out2_
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
   1012 
   1013 //------------------------------------------------------------------------------
   1014 
// Exhaustive RD search over the chroma (UV) prediction modes. The winning
// reconstruction is copied into yuv_out_, the winning mode and levels are
// stored in *rd, and the chroma score is added to rd's running total.
static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;  // scratch buffer
  uint8_t* const dst0 = it->yuv_out_ + U_OFF;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;   // penalize complex modes
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      memcpy(dst0, tmp_dst, UV_SIZE);   //  TODO: SwapUVOut() ?
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);   // chroma contributes to the macroblock total
}
   1053 
   1054 //------------------------------------------------------------------------------
   1055 // Final reconstruction and quantization.
   1056 
// Final reconstruction/quantization using the already-chosen prediction
// modes (no mode search): quantizes luma (i16 or i4 depending on the
// macroblock type) and chroma directly into yuv_out_, and stores the
// resulting non-zero mask in rd->nz.
static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);   // type_ == 1 -> intra16 path
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      // Retrieve the per-4x4 mode previously stored in the prediction plane.
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
  rd->nz = nz;
}
   1080 
// Refine intra16/intra4 sub-modes based on distortion only (not rate).
// With 'try_both_i4_i16' set, both codings are evaluated and the cheaper
// one (in SSE, with a fixed I4_PENALTY handicap for intra4) is selected;
// otherwise only the sub-modes of the already-chosen type are refined.
static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
  const int is_i16 = (it->mb_->type_ == 1);
  score_t best_score = MAX_COST;

  if (try_both_i4_i16 || is_i16) {
    // Pick the intra16 mode with the smallest SSE against the source.
    int mode;
    int best_mode = -1;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const uint8_t* const src = it->yuv_in_ + Y_OFF;
      const score_t score = VP8SSE16x16(src, ref);
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
  }
  if (try_both_i4_i16 || !is_i16) {
    uint8_t modes_i4[16];
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    score_t score_i4 = (score_t)I4_PENALTY;

    VP8IteratorStartI4(it);
    do {
      int mode;
      int best_sub_mode = -1;
      score_t best_sub_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];

      // TODO(skal): we don't really need the prediction pixels here,
      // but just the distortion against 'src'.
      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref);
        if (score < best_sub_score) {
          best_sub_mode = mode;
          best_sub_score = score;
        }
      }
      modes_i4[it->i4_] = best_sub_mode;
      score_i4 += best_sub_score;
      // Early out: the running i4 total only grows, so stop once it can
      // no longer beat the i16 score.
      if (score_i4 >= best_score) break;
    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
    if (score_i4 < best_score) {
      VP8SetIntra4Mode(it, modes_i4);   // intra4 wins on pure distortion
    }
  }
}
   1133 
   1134 //------------------------------------------------------------------------------
   1135 // Entry point
   1136 
// Entry point: selects the intra prediction modes for the current
// macroblock (RD-based or distortion-only, depending on 'rd_opt' and the
// encoder method), reconstructs it, and returns whether the macroblock is
// skippable (no non-zero coefficient at all).
int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    // Full rate-distortion search; trellis quantization inside the mode
    // loops only at the highest setting.
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
    DistoRefine(it, (method >= 2));
    SimpleQuantize(it, rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}
   1170 
   1171