Home | History | Annotate | Download | only in enc
      1 // Copyright 2011 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 //   Quantization
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include <assert.h>
     15 #include <math.h>
     16 #include <stdlib.h>  // for abs()
     17 
     18 #include "src/enc/vp8i_enc.h"
     19 #include "src/enc/cost_enc.h"
     20 
     21 #define DO_TRELLIS_I4  1
     22 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
     23 #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
     24 #define USE_TDISTO 1
     25 
     26 #define MID_ALPHA 64      // neutral value for susceptibility
     27 #define MIN_ALPHA 30      // lowest usable value for susceptibility
     28 #define MAX_ALPHA 100     // higher meaningful value for susceptibility
     29 
     30 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
     31                           // power-law modulation. Must be strictly less than 1.
     32 
     33 // number of non-zero coeffs below which we consider the block very flat
     34 // (and apply a penalty to complex predictions)
     35 #define FLATNESS_LIMIT_I16 10      // I16 mode
     36 #define FLATNESS_LIMIT_I4  3       // I4 mode
     37 #define FLATNESS_LIMIT_UV  2       // UV mode
     38 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
     39 
     40 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
     41 
     42 #define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
     43 
     44 // #define DEBUG_BLOCK
     45 
     46 //------------------------------------------------------------------------------
     47 
     48 #if defined(DEBUG_BLOCK)
     49 
     50 #include <stdio.h>
     51 #include <stdlib.h>
     52 
     53 static void PrintBlockInfo(const VP8EncIterator* const it,
     54                            const VP8ModeScore* const rd) {
     55   int i, j;
     56   const int is_i16 = (it->mb_->type_ == 1);
     57   const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
     58   const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
     59   const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
     60   const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
     61   printf("SOURCE / OUTPUT / ABS DELTA\n");
     62   for (j = 0; j < 16; ++j) {
     63     for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
     64     printf("     ");
     65     for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
     66     printf("     ");
     67     for (i = 0; i < 16; ++i) {
     68       printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
     69     }
     70     printf("\n");
     71   }
     72   printf("\n");   // newline before the U/V block
     73   for (j = 0; j < 8; ++j) {
     74     for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
     75     printf(" ");
     76     for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
     77     printf("    ");
     78     for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
     79     printf(" ");
     80     for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
     81     printf("   ");
     82     for (i = 0; i < 8; ++i) {
     83       printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
     84     }
     85     printf(" ");
     86     for (i = 8; i < 16; ++i) {
     87       printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
     88     }
     89     printf("\n");
     90   }
     91   printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
     92     (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
     93     (int)rd->score);
     94   if (is_i16) {
     95     printf("Mode: %d\n", rd->mode_i16);
     96     printf("y_dc_levels:");
     97     for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
     98     printf("\n");
     99   } else {
    100     printf("Modes[16]: ");
    101     for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    102     printf("\n");
    103   }
    104   printf("y_ac_levels:\n");
    105   for (j = 0; j < 16; ++j) {
    106     for (i = is_i16 ? 1 : 0; i < 16; ++i) {
    107       printf("%4d ", rd->y_ac_levels[j][i]);
    108     }
    109     printf("\n");
    110   }
    111   printf("\n");
    112   printf("uv_levels (mode=%d):\n", rd->mode_uv);
    113   for (j = 0; j < 8; ++j) {
    114     for (i = 0; i < 16; ++i) {
    115       printf("%4d ", rd->uv_levels[j][i]);
    116     }
    117     printf("\n");
    118   }
    119 }
    120 
    121 #endif   // DEBUG_BLOCK
    122 
    123 //------------------------------------------------------------------------------
    124 
    125 static WEBP_INLINE int clip(int v, int m, int M) {
    126   return v < m ? m : v > M ? M : v;
    127 }
    128 
    129 static const uint8_t kZigzag[16] = {
    130   0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
    131 };
    132 
    133 static const uint8_t kDcTable[128] = {
    134   4,     5,   6,   7,   8,   9,  10,  10,
    135   11,   12,  13,  14,  15,  16,  17,  17,
    136   18,   19,  20,  20,  21,  21,  22,  22,
    137   23,   23,  24,  25,  25,  26,  27,  28,
    138   29,   30,  31,  32,  33,  34,  35,  36,
    139   37,   37,  38,  39,  40,  41,  42,  43,
    140   44,   45,  46,  46,  47,  48,  49,  50,
    141   51,   52,  53,  54,  55,  56,  57,  58,
    142   59,   60,  61,  62,  63,  64,  65,  66,
    143   67,   68,  69,  70,  71,  72,  73,  74,
    144   75,   76,  76,  77,  78,  79,  80,  81,
    145   82,   83,  84,  85,  86,  87,  88,  89,
    146   91,   93,  95,  96,  98, 100, 101, 102,
    147   104, 106, 108, 110, 112, 114, 116, 118,
    148   122, 124, 126, 128, 130, 132, 134, 136,
    149   138, 140, 143, 145, 148, 151, 154, 157
    150 };
    151 
    152 static const uint16_t kAcTable[128] = {
    153   4,     5,   6,   7,   8,   9,  10,  11,
    154   12,   13,  14,  15,  16,  17,  18,  19,
    155   20,   21,  22,  23,  24,  25,  26,  27,
    156   28,   29,  30,  31,  32,  33,  34,  35,
    157   36,   37,  38,  39,  40,  41,  42,  43,
    158   44,   45,  46,  47,  48,  49,  50,  51,
    159   52,   53,  54,  55,  56,  57,  58,  60,
    160   62,   64,  66,  68,  70,  72,  74,  76,
    161   78,   80,  82,  84,  86,  88,  90,  92,
    162   94,   96,  98, 100, 102, 104, 106, 108,
    163   110, 112, 114, 116, 119, 122, 125, 128,
    164   131, 134, 137, 140, 143, 146, 149, 152,
    165   155, 158, 161, 164, 167, 170, 173, 177,
    166   181, 185, 189, 193, 197, 201, 205, 209,
    167   213, 217, 221, 225, 229, 234, 239, 245,
    168   249, 254, 259, 264, 269, 274, 279, 284
    169 };
    170 
    171 static const uint16_t kAcTable2[128] = {
    172   8,     8,   9,  10,  12,  13,  15,  17,
    173   18,   20,  21,  23,  24,  26,  27,  29,
    174   31,   32,  34,  35,  37,  38,  40,  41,
    175   43,   44,  46,  48,  49,  51,  52,  54,
    176   55,   57,  58,  60,  62,  63,  65,  66,
    177   68,   69,  71,  72,  74,  75,  77,  79,
    178   80,   82,  83,  85,  86,  88,  89,  93,
    179   96,   99, 102, 105, 108, 111, 114, 117,
    180   120, 124, 127, 130, 133, 136, 139, 142,
    181   145, 148, 151, 155, 158, 161, 164, 167,
    182   170, 173, 176, 179, 184, 189, 193, 198,
    183   203, 207, 212, 217, 221, 226, 230, 235,
    184   240, 244, 249, 254, 258, 263, 268, 274,
    185   280, 286, 292, 299, 305, 311, 317, 323,
    186   330, 336, 342, 348, 354, 362, 370, 379,
    187   385, 393, 401, 409, 416, 424, 432, 440
    188 };
    189 
    190 static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
    191   { 96, 110 }, { 96, 108 }, { 110, 115 }
    192 };
    193 
    194 // Sharpening by (slightly) raising the hi-frequency coeffs.
    195 // Hack-ish but helpful for mid-bitrate range. Use with care.
    196 #define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
    197 static const uint8_t kFreqSharpening[16] = {
    198   0,  30, 60, 90,
    199   30, 60, 90, 90,
    200   60, 90, 90, 90,
    201   90, 90, 90, 90
    202 };
    203 
    204 //------------------------------------------------------------------------------
    205 // Initialize quantization parameters in VP8Matrix
    206 
    207 // Returns the average quantizer
    208 static int ExpandMatrix(VP8Matrix* const m, int type) {
    209   int i, sum;
    210   for (i = 0; i < 2; ++i) {
    211     const int is_ac_coeff = (i > 0);
    212     const int bias = kBiasMatrices[type][is_ac_coeff];
    213     m->iq_[i] = (1 << QFIX) / m->q_[i];
    214     m->bias_[i] = BIAS(bias);
    215     // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    216     //   * zero if coeff <= zthresh
    217     //   * non-zero if coeff > zthresh
    218     m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
    219   }
    220   for (i = 2; i < 16; ++i) {
    221     m->q_[i] = m->q_[1];
    222     m->iq_[i] = m->iq_[1];
    223     m->bias_[i] = m->bias_[1];
    224     m->zthresh_[i] = m->zthresh_[1];
    225   }
    226   for (sum = 0, i = 0; i < 16; ++i) {
    227     if (type == 0) {  // we only use sharpening for AC luma coeffs
    228       m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    229     } else {
    230       m->sharpen_[i] = 0;
    231     }
    232     sum += m->q_[i];
    233   }
    234   return (sum + 8) >> 4;
    235 }
    236 
    237 static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
    238 
    239 static void SetupMatrices(VP8Encoder* enc) {
    240   int i;
    241   const int tlambda_scale =
    242     (enc->method_ >= 4) ? enc->config_->sns_strength
    243                         : 0;
    244   const int num_segments = enc->segment_hdr_.num_segments_;
    245   for (i = 0; i < num_segments; ++i) {
    246     VP8SegmentInfo* const m = &enc->dqm_[i];
    247     const int q = m->quant_;
    248     int q_i4, q_i16, q_uv;
    249     m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    250     m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];
    251 
    252     m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    253     m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];
    254 
    255     m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    256     m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];
    257 
    258     q_i4  = ExpandMatrix(&m->y1_, 0);
    259     q_i16 = ExpandMatrix(&m->y2_, 1);
    260     q_uv  = ExpandMatrix(&m->uv_, 2);
    261 
    262     m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    263     m->lambda_i16_         = (3 * q_i16 * q_i16);
    264     m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    265     m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    266     m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    267     m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    268     m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    269     m->tlambda_            = (tlambda_scale * q_i4) >> 5;
    270 
    271     // none of these constants should be < 1
    272     CheckLambdaValue(&m->lambda_i4_);
    273     CheckLambdaValue(&m->lambda_i16_);
    274     CheckLambdaValue(&m->lambda_uv_);
    275     CheckLambdaValue(&m->lambda_mode_);
    276     CheckLambdaValue(&m->lambda_trellis_i4_);
    277     CheckLambdaValue(&m->lambda_trellis_i16_);
    278     CheckLambdaValue(&m->lambda_trellis_uv_);
    279     CheckLambdaValue(&m->tlambda_);
    280 
    281     m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    282     m->max_edge_  = 0;
    283 
    284     m->i4_penalty_ = 1000 * q_i4 * q_i4;
    285   }
    286 }
    287 
    288 //------------------------------------------------------------------------------
    289 // Initialize filtering parameters
    290 
    291 // Very small filter-strength values have close to no visual effect. So we can
    292 // save a little decoding-CPU by turning filtering off for these.
    293 #define FSTRENGTH_CUTOFF 2
    294 
    295 static void SetupFilterStrength(VP8Encoder* const enc) {
    296   int i;
    297   // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
    298   const int level0 = 5 * enc->config_->filter_strength;
    299   for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    300     VP8SegmentInfo* const m = &enc->dqm_[i];
    301     // We focus on the quantization of AC coeffs.
    302     const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    303     const int base_strength =
    304         VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    305     // Segments with lower complexity ('beta') will be less filtered.
    306     const int f = base_strength * level0 / (256 + m->beta_);
    307     m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
    308   }
    309   // We record the initial strength (mainly for the case of 1-segment only).
    310   enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
    311   enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
    312   enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
    313 }
    314 
    315 //------------------------------------------------------------------------------
    316 
    317 // Note: if you change the values below, remember that the max range
    318 // allowed by the syntax for DQ_UV is [-16,16].
    319 #define MAX_DQ_UV (6)
    320 #define MIN_DQ_UV (-4)
    321 
    322 // We want to emulate jpeg-like behaviour where the expected "good" quality
    323 // is around q=75. Internally, our "good" middle is around c=50. So we
    324 // map accordingly using linear piece-wise function
    325 static double QualityToCompression(double c) {
    326   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
    327   // The file size roughly scales as pow(quantizer, 3.). Actually, the
    328   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
    329   // in the mid-quant range. So we scale the compressibility inversely to
    330   // this power-law: quant ~= compression ^ 1/3. This law holds well for
    331   // low quant. Finer modeling for high-quant would make use of kAcTable[]
    332   // more explicitly.
    333   const double v = pow(linear_c, 1 / 3.);
    334   return v;
    335 }
    336 
    337 static double QualityToJPEGCompression(double c, double alpha) {
    338   // We map the complexity 'alpha' and quality setting 'c' to a compression
    339   // exponent empirically matched to the compression curve of libjpeg6b.
    340   // On average, the WebP output size will be roughly similar to that of a
    341   // JPEG file compressed with same quality factor.
    342   const double amin = 0.30;
    343   const double amax = 0.85;
    344   const double exp_min = 0.4;
    345   const double exp_max = 0.9;
    346   const double slope = (exp_min - exp_max) / (amax - amin);
    347   // Linearly interpolate 'expn' from exp_min to exp_max
    348   // in the [amin, amax] range.
    349   const double expn = (alpha > amax) ? exp_min
    350                     : (alpha < amin) ? exp_max
    351                     : exp_max + slope * (alpha - amin);
    352   const double v = pow(c, expn);
    353   return v;
    354 }
    355 
    356 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
    357                                  const VP8SegmentInfo* const S2) {
    358   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
    359 }
    360 
    361 static void SimplifySegments(VP8Encoder* const enc) {
    362   int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
    363   // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
    364   // explicit check is needed to avoid a spurious warning about 'i' exceeding
    365   // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
    366   const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
    367                                ? enc->segment_hdr_.num_segments_
    368                                : NUM_MB_SEGMENTS;
    369   int num_final_segments = 1;
    370   int s1, s2;
    371   for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    372     const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    373     int found = 0;
    374     // check if we already have similar segment
    375     for (s2 = 0; s2 < num_final_segments; ++s2) {
    376       const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
    377       if (SegmentsAreEquivalent(S1, S2)) {
    378         found = 1;
    379         break;
    380       }
    381     }
    382     map[s1] = s2;
    383     if (!found) {
    384       if (num_final_segments != s1) {
    385         enc->dqm_[num_final_segments] = enc->dqm_[s1];
    386       }
    387       ++num_final_segments;
    388     }
    389   }
    390   if (num_final_segments < num_segments) {  // Remap
    391     int i = enc->mb_w_ * enc->mb_h_;
    392     while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    393     enc->segment_hdr_.num_segments_ = num_final_segments;
    394     // Replicate the trailing segment infos (it's mostly cosmetics)
    395     for (i = num_final_segments; i < num_segments; ++i) {
    396       enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    397     }
    398   }
    399 }
    400 
    401 void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
    402   int i;
    403   int dq_uv_ac, dq_uv_dc;
    404   const int num_segments = enc->segment_hdr_.num_segments_;
    405   const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
    406   const double Q = quality / 100.;
    407   const double c_base = enc->config_->emulate_jpeg_size ?
    408       QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
    409       QualityToCompression(Q);
    410   for (i = 0; i < num_segments; ++i) {
    411     // We modulate the base coefficient to accommodate for the quantization
    412     // susceptibility and allow denser segments to be quantized more.
    413     const double expn = 1. - amp * enc->dqm_[i].alpha_;
    414     const double c = pow(c_base, expn);
    415     const int q = (int)(127. * (1. - c));
    416     assert(expn > 0.);
    417     enc->dqm_[i].quant_ = clip(q, 0, 127);
    418   }
    419 
    420   // purely indicative in the bitstream (except for the 1-segment case)
    421   enc->base_quant_ = enc->dqm_[0].quant_;
    422 
    423   // fill-in values for the unused segments (required by the syntax)
    424   for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    425     enc->dqm_[i].quant_ = enc->base_quant_;
    426   }
    427 
    428   // uv_alpha_ is normally spread around ~60. The useful range is
    429   // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
    430   // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
    431   dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
    432                                           / (MAX_ALPHA - MIN_ALPHA);
    433   // we rescale by the user-defined strength of adaptation
    434   dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
    435   // and make it safe.
    436   dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
    437   // We also boost the dc-uv-quant a little, based on sns-strength, since
    438   // U/V channels are quite more reactive to high quants (flat DC-blocks
    439   // tend to appear, and are unpleasant).
    440   dq_uv_dc = -4 * enc->config_->sns_strength / 100;
    441   dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
    442 
    443   enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
    444   enc->dq_y2_dc_ = 0;
    445   enc->dq_y2_ac_ = 0;
    446   enc->dq_uv_dc_ = dq_uv_dc;
    447   enc->dq_uv_ac_ = dq_uv_ac;
    448 
    449   SetupFilterStrength(enc);   // initialize segments' filtering, eventually
    450 
    451   if (num_segments > 1) SimplifySegments(enc);
    452 
    453   SetupMatrices(enc);         // finalize quantization matrices
    454 }
    455 
    456 //------------------------------------------------------------------------------
    457 // Form the predictions in cache
    458 
    459 // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
    460 const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
    461 const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };
    462 
    463 // Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
    464 const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
    465   I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
    466 };
    467 
    468 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
    469   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
    470   const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
    471   VP8EncPredLuma16(it->yuv_p_, left, top);
    472 }
    473 
    474 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
    475   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
    476   const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
    477   VP8EncPredChroma8(it->yuv_p_, left, top);
    478 }
    479 
    480 void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
    481   VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
    482 }
    483 
    484 //------------------------------------------------------------------------------
    485 // Quantize
    486 
    487 // Layout:
    488 // +----+----+
    489 // |YYYY|UUVV| 0
    490 // |YYYY|UUVV| 4
    491 // |YYYY|....| 8
    492 // |YYYY|....| 12
    493 // +----+----+
    494 
    495 const uint16_t VP8Scan[16] = {  // Luma
    496   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
    497   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
    498   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
    499   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
    500 };
    501 
    502 static const uint16_t VP8ScanUV[4 + 4] = {
    503   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
    504   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
    505 };
    506 
    507 //------------------------------------------------------------------------------
    508 // Distortion measurement
    509 
    510 static const uint16_t kWeightY[16] = {
    511   38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
    512 };
    513 
    514 static const uint16_t kWeightTrellis[16] = {
    515 #if USE_TDISTO == 0
    516   16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
    517 #else
    518   30, 27, 19, 11,
    519   27, 24, 17, 10,
    520   19, 17, 12,  8,
    521   11, 10,  8,  6
    522 #endif
    523 };
    524 
    525 // Init/Copy the common fields in score.
    526 static void InitScore(VP8ModeScore* const rd) {
    527   rd->D  = 0;
    528   rd->SD = 0;
    529   rd->R  = 0;
    530   rd->H  = 0;
    531   rd->nz = 0;
    532   rd->score = MAX_COST;
    533 }
    534 
    535 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
    536   dst->D  = src->D;
    537   dst->SD = src->SD;
    538   dst->R  = src->R;
    539   dst->H  = src->H;
    540   dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
    541   dst->score = src->score;
    542 }
    543 
    544 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
    545   dst->D  += src->D;
    546   dst->SD += src->SD;
    547   dst->R  += src->R;
    548   dst->H  += src->H;
    549   dst->nz |= src->nz;     // here, new nz bits are accumulated.
    550   dst->score += src->score;
    551 }
    552 
    553 //------------------------------------------------------------------------------
    554 // Performs trellis-optimized quantization.
    555 
    556 // Trellis node
    557 typedef struct {
    558   int8_t prev;            // best previous node
    559   int8_t sign;            // sign of coeff_i
    560   int16_t level;          // level
    561 } Node;
    562 
    563 // Score state
    564 typedef struct {
    565   score_t score;          // partial RD score
    566   const uint16_t* costs;  // shortcut to cost tables
    567 } ScoreState;
    568 
    569 // If a coefficient was quantized to a value Q (using a neutral bias),
    570 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
    571 // We don't test negative values though.
    572 #define MIN_DELTA 0   // how much lower level to try
    573 #define MAX_DELTA 1   // how much higher
    574 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
    575 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
    576 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
    577 
    578 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
    579   rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
    580 }
    581 
    582 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
    583                                           score_t distortion) {
    584   return rate * lambda + RD_DISTO_MULT * distortion;
    585 }
    586 
    587 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
    588                                 int16_t in[16], int16_t out[16],
    589                                 int ctx0, int coeff_type,
    590                                 const VP8Matrix* const mtx,
    591                                 int lambda) {
    592   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
    593   CostArrayPtr const costs =
    594       (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
    595   const int first = (coeff_type == 0) ? 1 : 0;
    596   Node nodes[16][NUM_NODES];
    597   ScoreState score_states[2][NUM_NODES];
    598   ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
    599   ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
    600   int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
    601   score_t best_score;
    602   int n, m, p, last;
    603 
    604   {
    605     score_t cost;
    606     const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    607     const int last_proba = probas[VP8EncBands[first]][ctx0][0];
    608 
    609     // compute the position of the last interesting coefficient
    610     last = first - 1;
    611     for (n = 15; n >= first; --n) {
    612       const int j = kZigzag[n];
    613       const int err = in[j] * in[j];
    614       if (err > thresh) {
    615         last = n;
    616         break;
    617       }
    618     }
    619     // we don't need to go inspect up to n = 16 coeffs. We can just go up
    620     // to last + 1 (inclusive) without losing much.
    621     if (last < 15) ++last;
    622 
    623     // compute 'skip' score. This is the max score one can do.
    624     cost = VP8BitCost(0, last_proba);
    625     best_score = RDScoreTrellis(lambda, cost, 0);
    626 
    627     // initialize source node.
    628     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
    629       const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
    630       ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
    631       ss_cur[m].costs = costs[first][ctx0];
    632     }
    633   }
    634 
    635   // traverse trellis.
    636   for (n = first; n <= last; ++n) {
    637     const int j = kZigzag[n];
    638     const uint32_t Q  = mtx->q_[j];
    639     const uint32_t iQ = mtx->iq_[j];
    640     const uint32_t B = BIAS(0x00);     // neutral bias
    641     // note: it's important to take sign of the _original_ coeff,
    642     // so we don't have to consider level < 0 afterward.
    643     const int sign = (in[j] < 0);
    644     const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    645     int level0 = QUANTDIV(coeff0, iQ, B);
    646     int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    647     if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    648     if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
    649 
    650     {   // Swap current and previous score states
    651       ScoreState* const tmp = ss_cur;
    652       ss_cur = ss_prev;
    653       ss_prev = tmp;
    654     }
    655 
    656     // test all alternate level values around level0.
    657     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
    658       Node* const cur = &NODE(n, m);
    659       int level = level0 + m;
    660       const int ctx = (level > 2) ? 2 : level;
    661       const int band = VP8EncBands[n + 1];
    662       score_t base_score;
    663       score_t best_cur_score = MAX_COST;
    664       int best_prev = 0;   // default, in case
    665 
    666       ss_cur[m].score = MAX_COST;
    667       ss_cur[m].costs = costs[n + 1][ctx];
    668       if (level < 0 || level > thresh_level) {
    669         // Node is dead.
    670         continue;
    671       }
    672 
    673       {
    674         // Compute delta_error = how much coding this level will
    675         // subtract to max_error as distortion.
    676         // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
    677         const int new_error = coeff0 - level * Q;
    678         const int delta_error =
    679             kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
    680         base_score = RDScoreTrellis(lambda, 0, delta_error);
    681       }
    682 
    683       // Inspect all possible non-dead predecessors. Retain only the best one.
    684       for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
    685         // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
    686         // eliminated since their score can't be better than the current best.
    687         const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
    688         // Examine node assuming it's a non-terminal one.
    689         const score_t score =
    690             base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
    691         if (score < best_cur_score) {
    692           best_cur_score = score;
    693           best_prev = p;
    694         }
    695       }
    696       // Store best finding in current node.
    697       cur->sign = sign;
    698       cur->level = level;
    699       cur->prev = best_prev;
    700       ss_cur[m].score = best_cur_score;
    701 
    702       // Now, record best terminal node (and thus best entry in the graph).
    703       if (level != 0) {
    704         const score_t last_pos_cost =
    705             (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
    706         const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
    707         const score_t score = best_cur_score + last_pos_score;
    708         if (score < best_score) {
    709           best_score = score;
    710           best_path[0] = n;                     // best eob position
    711           best_path[1] = m;                     // best node index
    712           best_path[2] = best_prev;             // best predecessor
    713         }
    714       }
    715     }
    716   }
    717 
    718   // Fresh start
    719   memset(in + first, 0, (16 - first) * sizeof(*in));
    720   memset(out + first, 0, (16 - first) * sizeof(*out));
    721   if (best_path[0] == -1) {
    722     return 0;   // skip!
    723   }
    724 
    725   {
    726     // Unwind the best path.
    727     // Note: best-prev on terminal node is not necessarily equal to the
    728     // best_prev for non-terminal. So we patch best_path[2] in.
    729     int nz = 0;
    730     int best_node = best_path[1];
    731     n = best_path[0];
    732     NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
    733 
    734     for (; n >= first; --n) {
    735       const Node* const node = &NODE(n, best_node);
    736       const int j = kZigzag[n];
    737       out[n] = node->sign ? -node->level : node->level;
    738       nz |= node->level;
    739       in[j] = out[n] * mtx->q_[j];
    740       best_node = node->prev;
    741     }
    742     return (nz != 0);
    743   }
    744 }
    745 
    746 #undef NODE
    747 
    748 //------------------------------------------------------------------------------
    749 // Performs: difference, transform, quantize, back-transform, add
    750 // all at once. Output is the reconstructed block in *yuv_out, and the
    751 // quantized levels in *levels.
    752 
    753 static int ReconstructIntra16(VP8EncIterator* const it,
    754                               VP8ModeScore* const rd,
    755                               uint8_t* const yuv_out,
    756                               int mode) {
    757   const VP8Encoder* const enc = it->enc_;
    758   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
    759   const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    760   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
    761   int nz = 0;
    762   int n;
    763   int16_t tmp[16][16], dc_tmp[16];
    764 
    765   for (n = 0; n < 16; n += 2) {
    766     VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
    767   }
    768   VP8FTransformWHT(tmp[0], dc_tmp);
    769   nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
    770 
    771   if (DO_TRELLIS_I16 && it->do_trellis_) {
    772     int x, y;
    773     VP8IteratorNzToBytes(it);
    774     for (y = 0, n = 0; y < 4; ++y) {
    775       for (x = 0; x < 4; ++x, ++n) {
    776         const int ctx = it->top_nz_[x] + it->left_nz_[y];
    777         const int non_zero =
    778             TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
    779                                  &dqm->y1_, dqm->lambda_trellis_i16_);
    780         it->top_nz_[x] = it->left_nz_[y] = non_zero;
    781         rd->y_ac_levels[n][0] = 0;
    782         nz |= non_zero << n;
    783       }
    784     }
    785   } else {
    786     for (n = 0; n < 16; n += 2) {
    787       // Zero-out the first coeff, so that: a) nz is correct below, and
    788       // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
    789       tmp[n][0] = tmp[n + 1][0] = 0;
    790       nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
    791       assert(rd->y_ac_levels[n + 0][0] == 0);
    792       assert(rd->y_ac_levels[n + 1][0] == 0);
    793     }
    794   }
    795 
    796   // Transform back
    797   VP8TransformWHT(dc_tmp, tmp[0]);
    798   for (n = 0; n < 16; n += 2) {
    799     VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
    800   }
    801 
    802   return nz;
    803 }
    804 
    805 static int ReconstructIntra4(VP8EncIterator* const it,
    806                              int16_t levels[16],
    807                              const uint8_t* const src,
    808                              uint8_t* const yuv_out,
    809                              int mode) {
    810   const VP8Encoder* const enc = it->enc_;
    811   const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
    812   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
    813   int nz = 0;
    814   int16_t tmp[16];
    815 
    816   VP8FTransform(src, ref, tmp);
    817   if (DO_TRELLIS_I4 && it->do_trellis_) {
    818     const int x = it->i4_ & 3, y = it->i4_ >> 2;
    819     const int ctx = it->top_nz_[x] + it->left_nz_[y];
    820     nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
    821                               dqm->lambda_trellis_i4_);
    822   } else {
    823     nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
    824   }
    825   VP8ITransform(ref, tmp, yuv_out, 0);
    826   return nz;
    827 }
    828 
    829 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
    830                          uint8_t* const yuv_out, int mode) {
    831   const VP8Encoder* const enc = it->enc_;
    832   const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
    833   const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    834   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
    835   int nz = 0;
    836   int n;
    837   int16_t tmp[8][16];
    838 
    839   for (n = 0; n < 8; n += 2) {
    840     VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
    841   }
    842   if (DO_TRELLIS_UV && it->do_trellis_) {
    843     int ch, x, y;
    844     for (ch = 0, n = 0; ch <= 2; ch += 2) {
    845       for (y = 0; y < 2; ++y) {
    846         for (x = 0; x < 2; ++x, ++n) {
    847           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
    848           const int non_zero =
    849               TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
    850                                    &dqm->uv_, dqm->lambda_trellis_uv_);
    851           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
    852           nz |= non_zero << n;
    853         }
    854       }
    855     }
    856   } else {
    857     for (n = 0; n < 8; n += 2) {
    858       nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    859     }
    860   }
    861 
    862   for (n = 0; n < 8; n += 2) {
    863     VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
    864   }
    865   return (nz << 16);
    866 }
    867 
    868 //------------------------------------------------------------------------------
    869 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
    870 // Pick the mode is lower RD-cost = Rate + lambda * Distortion.
    871 
    872 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
    873   // We look at the first three AC coefficients to determine what is the average
    874   // delta between each sub-4x4 block.
    875   const int v0 = abs(DCs[1]);
    876   const int v1 = abs(DCs[2]);
    877   const int v2 = abs(DCs[4]);
    878   int max_v = (v1 > v0) ? v1 : v0;
    879   max_v = (v2 > max_v) ? v2 : max_v;
    880   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
    881 }
    882 
    883 static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
    884   VP8ModeScore* const tmp = *a;
    885   *a = *b;
    886   *b = tmp;
    887 }
    888 
    889 static void SwapPtr(uint8_t** a, uint8_t** b) {
    890   uint8_t* const tmp = *a;
    891   *a = *b;
    892   *b = tmp;
    893 }
    894 
    895 static void SwapOut(VP8EncIterator* const it) {
    896   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
    897 }
    898 
    899 static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
    900   score_t score = 0;
    901   while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
    902     int i;
    903     for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
    904       score += (levels[i] != 0);
    905       if (score > thresh) return 0;
    906     }
    907     levels += 16;
    908   }
    909   return 1;
    910 }
    911 
    912 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
    913   const int kNumBlocks = 16;
    914   VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
    915   const int lambda = dqm->lambda_i16_;
    916   const int tlambda = dqm->tlambda_;
    917   const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    918   VP8ModeScore rd_tmp;
    919   VP8ModeScore* rd_cur = &rd_tmp;
    920   VP8ModeScore* rd_best = rd;
    921   int mode;
    922 
    923   rd->mode_i16 = -1;
    924   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    925     uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    926     rd_cur->mode_i16 = mode;
    927 
    928     // Reconstruct
    929     rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
    930 
    931     // Measure RD-score
    932     rd_cur->D = VP8SSE16x16(src, tmp_dst);
    933     rd_cur->SD =
    934         tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    935     rd_cur->H = VP8FixedCostsI16[mode];
    936     rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    937     if (mode > 0 &&
    938         IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
    939       // penalty to avoid flat area to be mispredicted by complex mode
    940       rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
    941     }
    942 
    943     // Since we always examine Intra16 first, we can overwrite *rd directly.
    944     SetRDScore(lambda, rd_cur);
    945     if (mode == 0 || rd_cur->score < rd_best->score) {
    946       SwapModeScore(&rd_cur, &rd_best);
    947       SwapOut(it);
    948     }
    949   }
    950   if (rd_best != rd) {
    951     memcpy(rd, rd_best, sizeof(*rd));
    952   }
    953   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
    954   VP8SetIntra16Mode(it, rd->mode_i16);
    955 
    956   // we have a blocky macroblock (only DCs are non-zero) with fairly high
    957   // distortion, record max delta so we can later adjust the minimal filtering
    958   // strength needed to smooth these blocks out.
    959   if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    960     StoreMaxDelta(dqm, rd->y_dc_levels);
    961   }
    962 }
    963 
    964 //------------------------------------------------------------------------------
    965 
    966 // return the cost array corresponding to the surrounding prediction modes.
    967 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
    968                                      const uint8_t modes[16]) {
    969   const int preds_w = it->enc_->preds_w_;
    970   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
    971   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
    972   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
    973   return VP8FixedCostsI4[top][left];
    974 }
    975 
    976 static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
    977   const VP8Encoder* const enc = it->enc_;
    978   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
    979   const int lambda = dqm->lambda_i4_;
    980   const int tlambda = dqm->tlambda_;
    981   const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
    982   uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
    983   int total_header_bits = 0;
    984   VP8ModeScore rd_best;
    985 
    986   if (enc->max_i4_header_bits_ == 0) {
    987     return 0;
    988   }
    989 
    990   InitScore(&rd_best);
    991   rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
    992   SetRDScore(dqm->lambda_mode_, &rd_best);
    993   VP8IteratorStartI4(it);
    994   do {
    995     const int kNumBlocks = 1;
    996     VP8ModeScore rd_i4;
    997     int mode;
    998     int best_mode = -1;
    999     const uint8_t* const src = src0 + VP8Scan[it->i4_];
   1000     const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
   1001     uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
   1002     uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.
   1003 
   1004     InitScore(&rd_i4);
   1005     VP8MakeIntra4Preds(it);
   1006     for (mode = 0; mode < NUM_BMODES; ++mode) {
   1007       VP8ModeScore rd_tmp;
   1008       int16_t tmp_levels[16];
   1009 
   1010       // Reconstruct
   1011       rd_tmp.nz =
   1012           ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
   1013 
   1014       // Compute RD-score
   1015       rd_tmp.D = VP8SSE4x4(src, tmp_dst);
   1016       rd_tmp.SD =
   1017           tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
   1018                   : 0;
   1019       rd_tmp.H = mode_costs[mode];
   1020 
   1021       // Add flatness penalty
   1022       if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
   1023         rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
   1024       } else {
   1025         rd_tmp.R = 0;
   1026       }
   1027 
   1028       // early-out check
   1029       SetRDScore(lambda, &rd_tmp);
   1030       if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
   1031 
   1032       // finish computing score
   1033       rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
   1034       SetRDScore(lambda, &rd_tmp);
   1035 
   1036       if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
   1037         CopyScore(&rd_i4, &rd_tmp);
   1038         best_mode = mode;
   1039         SwapPtr(&tmp_dst, &best_block);
   1040         memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
   1041                sizeof(rd_best.y_ac_levels[it->i4_]));
   1042       }
   1043     }
   1044     SetRDScore(dqm->lambda_mode_, &rd_i4);
   1045     AddScore(&rd_best, &rd_i4);
   1046     if (rd_best.score >= rd->score) {
   1047       return 0;
   1048     }
   1049     total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
   1050     if (total_header_bits > enc->max_i4_header_bits_) {
   1051       return 0;
   1052     }
   1053     // Copy selected samples if not in the right place already.
   1054     if (best_block != best_blocks + VP8Scan[it->i4_]) {
   1055       VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
   1056     }
   1057     rd->modes_i4[it->i4_] = best_mode;
   1058     it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
   1059   } while (VP8IteratorRotateI4(it, best_blocks));
   1060 
   1061   // finalize state
   1062   CopyScore(rd, &rd_best);
   1063   VP8SetIntra4Mode(it, rd->modes_i4);
   1064   SwapOut(it);
   1065   memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
   1066   return 1;   // select intra4x4 over intra16x16
   1067 }
   1068 
   1069 //------------------------------------------------------------------------------
   1070 
   1071 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
   1072   const int kNumBlocks = 8;
   1073   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   1074   const int lambda = dqm->lambda_uv_;
   1075   const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
   1076   uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
   1077   uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
   1078   uint8_t* dst = dst0;
   1079   VP8ModeScore rd_best;
   1080   int mode;
   1081 
   1082   rd->mode_uv = -1;
   1083   InitScore(&rd_best);
   1084   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
   1085     VP8ModeScore rd_uv;
   1086 
   1087     // Reconstruct
   1088     rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
   1089 
   1090     // Compute RD-score
   1091     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
   1092     rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
   1093     rd_uv.H  = VP8FixedCostsUV[mode];
   1094     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
   1095     if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
   1096       rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
   1097     }
   1098 
   1099     SetRDScore(lambda, &rd_uv);
   1100     if (mode == 0 || rd_uv.score < rd_best.score) {
   1101       CopyScore(&rd_best, &rd_uv);
   1102       rd->mode_uv = mode;
   1103       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
   1104       SwapPtr(&dst, &tmp_dst);
   1105     }
   1106   }
   1107   VP8SetIntraUVMode(it, rd->mode_uv);
   1108   AddScore(rd, &rd_best);
   1109   if (dst != dst0) {   // copy 16x8 block if needed
   1110     VP8Copy16x8(dst, dst0);
   1111   }
   1112 }
   1113 
   1114 //------------------------------------------------------------------------------
   1115 // Final reconstruction and quantization.
   1116 
   1117 static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
   1118   const VP8Encoder* const enc = it->enc_;
   1119   const int is_i16 = (it->mb_->type_ == 1);
   1120   int nz = 0;
   1121 
   1122   if (is_i16) {
   1123     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
   1124   } else {
   1125     VP8IteratorStartI4(it);
   1126     do {
   1127       const int mode =
   1128           it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
   1129       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
   1130       uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
   1131       VP8MakeIntra4Preds(it);
   1132       nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
   1133                               src, dst, mode) << it->i4_;
   1134     } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
   1135   }
   1136 
   1137   nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
   1138   rd->nz = nz;
   1139 }
   1140 
   1141 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
   1142 static void RefineUsingDistortion(VP8EncIterator* const it,
   1143                                   int try_both_modes, int refine_uv_mode,
   1144                                   VP8ModeScore* const rd) {
   1145   score_t best_score = MAX_COST;
   1146   int nz = 0;
   1147   int mode;
   1148   int is_i16 = try_both_modes || (it->mb_->type_ == 1);
   1149 
   1150   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   1151   // Some empiric constants, of approximate order of magnitude.
   1152   const int lambda_d_i16 = 106;
   1153   const int lambda_d_i4 = 11;
   1154   const int lambda_d_uv = 120;
   1155   score_t score_i4 = dqm->i4_penalty_;
   1156   score_t i4_bit_sum = 0;
   1157   const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
   1158                                            : MAX_COST;  // no early-out allowed
   1159 
   1160   if (is_i16) {   // First, evaluate Intra16 distortion
   1161     int best_mode = -1;
   1162     const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
   1163     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
   1164       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
   1165       const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
   1166                           + VP8FixedCostsI16[mode] * lambda_d_i16;
   1167       if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
   1168         continue;
   1169       }
   1170       if (score < best_score) {
   1171         best_mode = mode;
   1172         best_score = score;
   1173       }
   1174     }
   1175     VP8SetIntra16Mode(it, best_mode);
   1176     // we'll reconstruct later, if i16 mode actually gets selected
   1177   }
   1178 
   1179   // Next, evaluate Intra4
   1180   if (try_both_modes || !is_i16) {
   1181     // We don't evaluate the rate here, but just account for it through a
   1182     // constant penalty (i4 mode usually needs more bits compared to i16).
   1183     is_i16 = 0;
   1184     VP8IteratorStartI4(it);
   1185     do {
   1186       int best_i4_mode = -1;
   1187       score_t best_i4_score = MAX_COST;
   1188       const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
   1189       const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
   1190 
   1191       VP8MakeIntra4Preds(it);
   1192       for (mode = 0; mode < NUM_BMODES; ++mode) {
   1193         const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
   1194         const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
   1195                             + mode_costs[mode] * lambda_d_i4;
   1196         if (score < best_i4_score) {
   1197           best_i4_mode = mode;
   1198           best_i4_score = score;
   1199         }
   1200       }
   1201       i4_bit_sum += mode_costs[best_i4_mode];
   1202       rd->modes_i4[it->i4_] = best_i4_mode;
   1203       score_i4 += best_i4_score;
   1204       if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
   1205         // Intra4 won't be better than Intra16. Bail out and pick Intra16.
   1206         is_i16 = 1;
   1207         break;
   1208       } else {  // reconstruct partial block inside yuv_out2_ buffer
   1209         uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
   1210         nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
   1211                                 src, tmp_dst, best_i4_mode) << it->i4_;
   1212       }
   1213     } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
   1214   }
   1215 
   1216   // Final reconstruction, depending on which mode is selected.
   1217   if (!is_i16) {
   1218     VP8SetIntra4Mode(it, rd->modes_i4);
   1219     SwapOut(it);
   1220     best_score = score_i4;
   1221   } else {
   1222     nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
   1223   }
   1224 
   1225   // ... and UV!
   1226   if (refine_uv_mode) {
   1227     int best_mode = -1;
   1228     score_t best_uv_score = MAX_COST;
   1229     const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
   1230     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
   1231       const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
   1232       const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
   1233                           + VP8FixedCostsUV[mode] * lambda_d_uv;
   1234       if (score < best_uv_score) {
   1235         best_mode = mode;
   1236         best_uv_score = score;
   1237       }
   1238     }
   1239     VP8SetIntraUVMode(it, best_mode);
   1240   }
   1241   nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
   1242 
   1243   rd->nz = nz;
   1244   rd->score = best_score;
   1245 }
   1246 
   1247 //------------------------------------------------------------------------------
   1248 // Entry point
   1249 
   1250 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
   1251                 VP8RDLevel rd_opt) {
   1252   int is_skipped;
   1253   const int method = it->enc_->method_;
   1254 
   1255   InitScore(rd);
   1256 
   1257   // We can perform predictions for Luma16x16 and Chroma8x8 already.
   1258   // Luma4x4 predictions needs to be done as-we-go.
   1259   VP8MakeLuma16Preds(it);
   1260   VP8MakeChroma8Preds(it);
   1261 
   1262   if (rd_opt > RD_OPT_NONE) {
   1263     it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
   1264     PickBestIntra16(it, rd);
   1265     if (method >= 2) {
   1266       PickBestIntra4(it, rd);
   1267     }
   1268     PickBestUV(it, rd);
   1269     if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
   1270       it->do_trellis_ = 1;
   1271       SimpleQuantize(it, rd);
   1272     }
   1273   } else {
   1274     // At this point we have heuristically decided intra16 / intra4.
   1275     // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
   1276     // For method <= 1, we don't re-examine the decision but just go ahead with
   1277     // quantization/reconstruction.
   1278     RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
   1279   }
   1280   is_skipped = (rd->nz == 0);
   1281   VP8SetSkip(it, is_skipped);
   1282   return is_skipped;
   1283 }
   1284