// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
//   Quantization
//
// Author: Skal (pascal.massimino (at) gmail.com)

#include <assert.h>
#include <math.h>

#include "./vp8enci.h"
#include "./cost.h"

#define DO_TRELLIS_I4  1
#define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV  0   // disable trellis for UV: risky and not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64      // neutral value for susceptibility
#define MIN_ALPHA 30      // lowest usable value for susceptibility
#define MAX_ALPHA 100     // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                          // power-law modulation. Must be strictly less than 1.

#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
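// For illustration (values are hypothetical): MULT_8B(64, 300)
// = (64 * 300 + 128) >> 8 = 75, i.e. a rounded 8-bit fixed-point multiply.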

#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};

static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint16_t kCoeffThresh[16] = {
  0,  10, 20, 30,
  10, 20, 30, 30,
  20, 30, 30, 30,
  30, 30, 30, 30
};

// TODO(skal): tune more. Coeff thresholding?
static const uint8_t kBiasMatrices[3][16] = {  // [3] = [luma-ac,luma-dc,chroma]
  { 96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96 },
  { 96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96 },
  { 96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96,
    96, 96, 96, 96 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis).
// Hack-ish but helpful for mid-bitrate range. Use with care.
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i;
  int sum = 0;
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
  }
  for (i = 0; i < 16; ++i) {
    const int j = kZigzag[i];
    const int bias = kBiasMatrices[type][j];
    m->iq_[j] = (1 << QFIX) / m->q_[j];
    m->bias_[j] = BIAS(bias);
    // TODO(skal): tune kCoeffThresh[]
    m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8;
    m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11;
    sum += m->q_[j];
  }
  return (sum + 8) >> 4;
}
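
// Illustrative sketch (not compiled): how the fields precomputed above are
// typically consumed when quantizing a single coefficient. It assumes the
// usual vp8enci.h definitions, i.e. QFIX = 17, BIAS(b) = (b) << (QFIX - 8)
// and QUANTDIV(n, iQ, B) = ((n) * (iQ) + (B)) >> QFIX. Names below are
// hypothetical.
#if 0
static int QuantizeCoeffSketch(int coeff, int j, const VP8Matrix* const mtx) {
  const int sign = (coeff < 0);
  const int abs_coeff = (sign ? -coeff : coeff) + mtx->sharpen_[j];
  int level = 0;
  if (abs_coeff > mtx->zthresh_[j]) {   // cheap zero-shortcut
    // fixed-point division by q_[j], rounded with the per-coeff bias
    level = QUANTDIV(abs_coeff, mtx->iq_[j], mtx->bias_[j]);
  }
  return sign ? -level : level;         // dequantized value is level * q_[j]
}
#endif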

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
    (enc->method_ >= 4) ? enc->config_->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q4, q16, quv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q4  = ExpandMatrix(&m->y1_, 0);
    q16 = ExpandMatrix(&m->y2_, 1);
    quv = ExpandMatrix(&m->uv_, 2);

    // TODO: Switch to kLambda*[] tables?
    {
      m->lambda_i4_  = (3 * q4 * q4) >> 7;
      m->lambda_i16_ = (3 * q16 * q16);
      m->lambda_uv_  = (3 * quv * quv) >> 6;
      m->lambda_mode_    = (1 * q4 * q4) >> 7;
      m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
      m->lambda_trellis_i16_ = (q16 * q16) >> 2;
      m->lambda_trellis_uv_  = (quv * quv) << 1;
      m->tlambda_            = (tlambda_scale * q4) >> 5;
    }
  }
}

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 3

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  const int level0 = enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS)
    const int level = level0 * 256 * enc->dqm_[i].quant_ / 128;
    const int f = level / (256 + enc->dqm_[i].beta_);
    enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
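
// Worked example (hypothetical values): with filter_strength = 60, quant_ = 40
// and beta_ = 0, level = 60 * 256 * 40 / 128 = 4800 and f = 4800 / 256 = 18,
// which falls between FSTRENGTH_CUTOFF and 63, so fstrength_ = 18.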

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a piecewise-linear function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modelling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
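
// Worked example: quality 75 gives c = 0.75, hence linear_c = 2. * 0.75 - 1.
// = 0.5 and v = 0.5^(1/3) ~= 0.794, the base compressibility later used by
// VP8SetSegmentParams().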

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}
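
// Worked example (hypothetical alpha): slope = (0.4 - 0.9) / (0.85 - 0.30)
// ~= -0.909, so alpha = 0.5 yields expn ~= 0.9 - 0.909 * (0.5 - 0.30) ~= 0.72
// and v = c^0.72.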

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  const int num_segments = enc->segment_hdr_.num_segments_;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetic)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to account for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are more reactive to high quantizers (flat DC blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, if needed

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
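
// Illustrative numbers (hypothetical): with quality = 75 and a segment of
// neutral susceptibility (alpha_ = 0), c_base ~= 0.794 (see
// QualityToCompression() above), expn = 1., c = c_base and
// quant_ = (int)(127. * (1. - 0.794)) = 26.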

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const int VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const int VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const int VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+
// |YYYY| 0
// |YYYY| 4
// |YYYY| 8
// |YYYY| 12
// +----+
// |UUVV| 16
// |UUVV| 20
// +----+

const int VP8Scan[16 + 4 + 4] = {
  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,

  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
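
// For illustration: block n = 5 is the luma 4x4 block at position (1, 1) in
// block units, so VP8Scan[5] = 4 + 4 * BPS, i.e. 4 pixels to the right and
// 4 rows down from the macroblock origin in the BPS-strided work buffer.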

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D  = 0;
  rd->SD = 0;
  rd->R  = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  = src->D;
  dst->SD = src->SD;
  dst->R  = src->R;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
  dst->D  += src->D;
  dst->SD += src->SD;
  dst->R  += src->R;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// Trellis

typedef struct {
  int prev;        // best previous
  int level;       // level
  int sign;        // sign of coeff_i
  score_t cost;    // bit cost
  score_t error;   // distortion = sum of (|coeff_i| - level_i * Q_i)^2
  int ctx;         // context (only depends on 'level'. Could be spared.)
} Node;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities in [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])
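
// With MIN_DELTA = 0 and MAX_DELTA = 1, NUM_NODES is 2: for each coefficient
// the trellis below only compares the neutrally-quantized level0 against
// level0 + 1.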

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  // TODO: incorporate the "* 256" in the tables?
  rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + 256 * distortion;
}
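
// Both scores combine rate and distortion on a common fixed-point scale:
// score = rate * lambda + 256 * distortion. Informally, a candidate that
// saves dD units of distortion is worth taking if it costs no more than
// 256 * dD / lambda extra rate units (an interpretation aid, not encoder
// logic).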

static int TrellisQuantizeBlock(const VP8EncIterator* const it,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type];
  CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type];
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[17][NUM_NODES];
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int best_node;
  int last = first - 1;
  int n, m, p, nz;

  {
    score_t cost;
    score_t max_error;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = last_costs[VP8EncBands[first]][ctx0][0];

    // compute maximal distortion.
    max_error = 0;
    for (n = first; n < 16; ++n) {
      const int j  = kZigzag[n];
      const int err = in[j] * in[j];
      max_error += kWeightTrellis[j] * err;
      if (err > thresh) last = n;
    }
    // we don't need to inspect all 16 coeffs: going up to last + 1
    // (inclusive) loses very little.
    if (last < 15) ++last;

    // compute the 'skip' score, i.e. the score of coding no coefficient at
    // all. This is the score to beat.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, max_error);

    // initialize source node.
    n = first - 1;
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      NODE(n, m).cost = 0;
      NODE(n, m).error = max_error;
      NODE(n, m).ctx = ctx0;
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j  = kZigzag[n];
    const int Q  = mtx->q_[j];
    const int iQ = mtx->iq_[j];
    const int B = BIAS(0x00);     // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0;
    if (coeff0 > 2047) coeff0 = 2047;

    level0 = QUANTDIV(coeff0, iQ, B);
    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int delta_error, new_error;
      score_t cur_score = MAX_COST;
      int level = level0 + m;
      int last_proba;

      cur->sign = sign;
      cur->level = level;
      cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
      if (level >= 2048 || level < 0) {   // node is dead?
        cur->cost = MAX_COST;
        continue;
      }
      last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0];

      // Compute delta_error = how much distortion coding this level
      // removes from max_error.
      new_error = coeff0 - level * Q;
      delta_error =
        kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error);

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        const Node* const prev = &NODE(n - 1, p);
        const int prev_ctx = prev->ctx;
        const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx];
        const score_t total_error = prev->error - delta_error;
        score_t cost, base_cost, score;

        if (prev->cost >= MAX_COST) {   // dead node?
          continue;
        }

        // Base cost of both terminal/non-terminal
        base_cost = prev->cost + VP8LevelCost(tcost, level);

        // Examine node assuming it's a non-terminal one.
        cost = base_cost;
        if (level && n < 15) {
          cost += VP8BitCost(1, last_proba);
        }
        score = RDScoreTrellis(lambda, cost, total_error);
        if (score < cur_score) {
          cur_score = score;
          cur->cost  = cost;
          cur->error = total_error;
          cur->prev  = p;
        }

        // Now, record the best terminal node (and thus the best entry in the graph).
        if (level) {
          cost = base_cost;
          if (n < 15) cost += VP8BitCost(0, last_proba);
          score = RDScoreTrellis(lambda, cost, total_error);
          if (score < best_score) {
            best_score = score;
            best_path[0] = n;   // best eob position
            best_path[1] = m;   // best level
            best_path[2] = p;   // best predecessor
          }
        }
      }
    }
  }

  // Fresh start
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  // Unwind the best path.
  // Note: best-prev on the terminal node is not necessarily equal to the
  // best-prev for a non-terminal one. So we patch best_path[2] in.
  n = best_path[0];
  best_node = best_path[1];
  NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
  nz = 0;

  for (; n >= first; --n) {
    const Node* const node = &NODE(n, best_node);
    const int j = kZigzag[n];
    out[n] = node->sign ? -node->level : node->level;
    nz |= (node->level != 0);
    in[j] = out[n] * mtx->q_[j];
    best_node = node->prev;
  }
  return nz;
}

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; ++n) {
    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
           TrellisQuantizeBlock(it, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; ++n) {
      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
    }
  }

  // Transform back
  VP8ITransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* const it,
                             int16_t levels[16],
                             const uint8_t* const src,
                             uint8_t* const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(it, tmp, levels, ctx, 3, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; ++n) {
    VP8FTransform(src + VP8Scan[16 + n], ref + VP8Scan[16 + n], tmp[n]);
  }
  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
            TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
                                 &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; ++n) {
      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF;
  VP8ModeScore rd16;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
    int nz;

    // Reconstruct
    nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);

    // Measure RD-score
    rd16.D = VP8SSE16x16(src, tmp_dst);
    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
            : 0;
    rd16.R = VP8GetCostLuma16(it, &rd16);
    rd16.R += VP8FixedCostsI16[mode];

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, &rd16);
    if (mode == 0 || rd16.score < rd->score) {
      CopyScore(rd, &rd16);
      rd->mode_i16 = mode;
      rd->nz = nz;
      memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
      memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
      SwapOut(it);
    }
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);
}

//------------------------------------------------------------------------------

// Return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.score = 211;  // '211' is the value of VP8BitCost(0, 145)
  VP8IteratorStartI4(it);
  do {
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
      rd_tmp.R += mode_costs[mode];

      SetRDScore(lambda, &rd_tmp);
      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    total_header_bits += mode_costs[best_mode];
    if (rd_best.score >= rd->score ||
        total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_])
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF;
  uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;  // scratch buffer
  uint8_t* const dst0 = it->yuv_out_ + U_OFF;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    rd_uv.R += VP8FixedCostsUV[mode];

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      memcpy(dst0, tmp_dst, UV_SIZE);   //  TODO: SwapUVOut() ?
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
  const int is_i16 = (it->mb_->type_ == 1);
  score_t best_score = MAX_COST;

  if (try_both_i4_i16 || is_i16) {
    int mode;
    int best_mode = -1;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const uint8_t* const src = it->yuv_in_ + Y_OFF;
      const score_t score = VP8SSE16x16(src, ref);
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
  }
  if (try_both_i4_i16 || !is_i16) {
    uint8_t modes_i4[16];
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    score_t score_i4 = (score_t)I4_PENALTY;

    VP8IteratorStartI4(it);
    do {
      int mode;
      int best_sub_mode = -1;
      score_t best_sub_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];

      // TODO(skal): we don't really need the prediction pixels here,
      // but just the distortion against 'src'.
      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref);
        if (score < best_sub_score) {
          best_sub_mode = mode;
          best_sub_score = score;
        }
      }
      modes_i4[it->i4_] = best_sub_mode;
      score_i4 += best_sub_score;
      if (score_i4 >= best_score) break;
    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
    if (score_i4 < best_score) {
      VP8SetIntra4Mode(it, modes_i4);
    }
  }
}
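
// Note: when both i4 and i16 are examined, the macroblock is switched to
// intra4x4 only if the summed 4x4 SSE plus the fixed I4_PENALTY rate proxy
// beats the best intra16x16 SSE found above.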

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as we go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // For method == 2, pick the best intra4/intra16 based on SSE (a tad slower).
    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
    DistoRefine(it, (method >= 2));
    SimpleQuantize(it, rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}

#if defined(__cplusplus) || defined(c_plusplus)
}    // extern "C"
#endif