// (code-browser navigation header removed; file belongs to libwebp's src/enc/ directory)
      1 // Copyright 2011 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 //   Quantization
     11 //
     12 // Author: Skal (pascal.massimino (at) gmail.com)
     13 
     14 #include <assert.h>
     15 #include <math.h>
     16 #include <stdlib.h>  // for abs()
     17 
     18 #include "src/dsp/quant.h"
     19 #include "src/enc/vp8i_enc.h"
     20 #include "src/enc/cost_enc.h"
     21 
     22 #define DO_TRELLIS_I4  1
     23 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
     24 #define DO_TRELLIS_UV  0   // disable trellis for UV. Risky. Not worth.
     25 #define USE_TDISTO 1
     26 
     27 #define MID_ALPHA 64      // neutral value for susceptibility
     28 #define MIN_ALPHA 30      // lowest usable value for susceptibility
     29 #define MAX_ALPHA 100     // higher meaningful value for susceptibility
     30 
     31 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
     32                           // power-law modulation. Must be strictly less than 1.
     33 
     34 // number of non-zero coeffs below which we consider the block very flat
     35 // (and apply a penalty to complex predictions)
     36 #define FLATNESS_LIMIT_I16 10      // I16 mode
     37 #define FLATNESS_LIMIT_I4  3       // I4 mode
     38 #define FLATNESS_LIMIT_UV  2       // UV mode
     39 #define FLATNESS_PENALTY   140     // roughly ~1bit per block
     40 
     41 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
     42 
     43 #define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
     44 
     45 // #define DEBUG_BLOCK
     46 
     47 //------------------------------------------------------------------------------
     48 
     49 #if defined(DEBUG_BLOCK)
     50 
     51 #include <stdio.h>
     52 #include <stdlib.h>
     53 
// Debug-only helper (compiled when DEBUG_BLOCK is defined): dumps the current
// macroblock to stdout -- source vs reconstructed samples with their absolute
// differences, the rate-distortion fields, and the quantized levels.
static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);  // 1 = intra-16x16 macroblock
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;     // source luma
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;   // reconstructed luma
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;    // source chroma
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;  // reconstructed chroma
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  // 16x16 luma block: input row, output row, then per-sample |in - out|.
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf("     ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  // 8x8 chroma: columns 0..7 hold U, columns 8..15 hold V (see cache layout).
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf("    ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf("   ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  // Rate-distortion summary: distortion, spectral distortion, rate, header
  // bits, non-zero flags and combined score.
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
    (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  // For i16 blocks the DC (coeff #0) is coded separately, so start at 1.
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}
    121 
    122 #endif   // DEBUG_BLOCK
    123 
    124 //------------------------------------------------------------------------------
    125 
    126 static WEBP_INLINE int clip(int v, int m, int M) {
    127   return v < m ? m : v > M ? M : v;
    128 }
    129 
// Zigzag scan: kZigzag[n] is the raster position (within the 4x4 block) of
// the coefficient at scan index n.
static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
    133 
// DC quantizer steps, indexed by the clipped quantizer index [0..127]
// (used for the y1/y2/uv DC entries in SetupMatrices()).
static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

// AC quantizer steps for luma/chroma, indexed by quantizer index [0..127]
// (also used by SetupFilterStrength() to derive the filter level).
static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

// AC quantizer steps for the y2 (luma-DC / WHT) matrix, indexed by the
// clipped quantizer index [0..127].
static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};
    190 
// Rounding biases fed to BIAS() in ExpandMatrix(), per matrix type:
// type 0 = y1 (luma), type 1 = y2 (luma-DC/WHT), type 2 = uv (chroma),
// with one value for the DC entry and one for the AC entries.
static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
// Per-coefficient sharpening amount, indexed by raster position in the 4x4
// block (larger values toward high frequencies); scaled by the quantizer
// step in ExpandMatrix().
static const uint8_t kFreqSharpening[16] = {
  0,  30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
    204 
    205 //------------------------------------------------------------------------------
    206 // Initialize quantization parameters in VP8Matrix
    207 
    208 // Returns the average quantizer
    209 static int ExpandMatrix(VP8Matrix* const m, int type) {
    210   int i, sum;
    211   for (i = 0; i < 2; ++i) {
    212     const int is_ac_coeff = (i > 0);
    213     const int bias = kBiasMatrices[type][is_ac_coeff];
    214     m->iq_[i] = (1 << QFIX) / m->q_[i];
    215     m->bias_[i] = BIAS(bias);
    216     // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    217     //   * zero if coeff <= zthresh
    218     //   * non-zero if coeff > zthresh
    219     m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
    220   }
    221   for (i = 2; i < 16; ++i) {
    222     m->q_[i] = m->q_[1];
    223     m->iq_[i] = m->iq_[1];
    224     m->bias_[i] = m->bias_[1];
    225     m->zthresh_[i] = m->zthresh_[1];
    226   }
    227   for (sum = 0, i = 0; i < 16; ++i) {
    228     if (type == 0) {  // we only use sharpening for AC luma coeffs
    229       m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    230     } else {
    231       m->sharpen_[i] = 0;
    232     }
    233     sum += m->q_[i];
    234   }
    235   return (sum + 8) >> 4;
    236 }
    237 
    238 static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }
    239 
// Finalizes the quantization data of every active segment: fills the y1/y2/uv
// matrices from the quantizer tables (with the global dq deltas applied),
// expands them via ExpandMatrix(), and derives the lambda multipliers used by
// the mode decisions and trellis passes.
static void SetupMatrices(VP8Encoder* enc) {
  int i;
  // tlambda_ (spectral-distortion weight) is only enabled for method >= 4,
  // scaled by the configured sns_strength.
  const int tlambda_scale =
    (enc->method_ >= 4) ? enc->config_->sns_strength
                        : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    // NOTE(review): upper clip is 117 here (not 127) -- presumably the
    // maximum allowed chroma-DC quantizer index; confirm against RFC 6386.
    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    // Average quantizer steps, reused below to scale the lambdas.
    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}
    288 
    289 //------------------------------------------------------------------------------
    290 // Initialize filtering parameters
    291 
    292 // Very small filter-strength values have close to no visual effect. So we can
    293 // save a little decoding-CPU by turning filtering off for these.
    294 #define FSTRENGTH_CUTOFF 2
    295 
// Computes the loop-filter strength of every segment from its quantizer and
// complexity ('beta_'), then seeds the global filter header fields from the
// first segment and the user configuration.
static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    // Clamp to [0, 63]; strengths below FSTRENGTH_CUTOFF are turned off
    // entirely (close to no visual effect, saves decoding CPU).
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
    315 
    316 //------------------------------------------------------------------------------
    317 
    318 // Note: if you change the values below, remember that the max range
    319 // allowed by the syntax for DQ_UV is [-16,16].
    320 #define MAX_DQ_UV (6)
    321 #define MIN_DQ_UV (-4)
    322 
    323 // We want to emulate jpeg-like behaviour where the expected "good" quality
    324 // is around q=75. Internally, our "good" middle is around c=50. So we
    325 // map accordingly using linear piece-wise function
    326 static double QualityToCompression(double c) {
    327   const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
    328   // The file size roughly scales as pow(quantizer, 3.). Actually, the
    329   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
    330   // in the mid-quant range. So we scale the compressibility inversely to
    331   // this power-law: quant ~= compression ^ 1/3. This law holds well for
    332   // low quant. Finer modeling for high-quant would make use of kAcTable[]
    333   // more explicitly.
    334   const double v = pow(linear_c, 1 / 3.);
    335   return v;
    336 }
    337 
    338 static double QualityToJPEGCompression(double c, double alpha) {
    339   // We map the complexity 'alpha' and quality setting 'c' to a compression
    340   // exponent empirically matched to the compression curve of libjpeg6b.
    341   // On average, the WebP output size will be roughly similar to that of a
    342   // JPEG file compressed with same quality factor.
    343   const double amin = 0.30;
    344   const double amax = 0.85;
    345   const double exp_min = 0.4;
    346   const double exp_max = 0.9;
    347   const double slope = (exp_min - exp_max) / (amax - amin);
    348   // Linearly interpolate 'expn' from exp_min to exp_max
    349   // in the [amin, amax] range.
    350   const double expn = (alpha > amax) ? exp_min
    351                     : (alpha < amin) ? exp_max
    352                     : exp_max + slope * (alpha - amin);
    353   const double v = pow(c, expn);
    354   return v;
    355 }
    356 
    357 static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
    358                                  const VP8SegmentInfo* const S2) {
    359   return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
    360 }
    361 
// Merges segments that carry identical parameters (same quant_/fstrength_),
// compacting dqm_[] in place and remapping every macroblock's segment id,
// so fewer segments are actually transmitted.
static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    // On a match, s2 is the equivalent slot; otherwise the loop left
    // s2 == num_final_segments, which is the slot s1 is copied into below.
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}
    401 
// Distributes quantizers to all segments from the global 'quality' setting:
// maps quality to a base compression factor, modulates it per segment by its
// susceptibility 'alpha_', derives the UV quantizer deltas, then finalizes
// the filtering parameters and quantization matrices.
void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  // Optionally emulate libjpeg's size/quality curve instead of our own.
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to accommodate for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));   // higher compression -> lower q
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are quite more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // initialize segments' filtering, eventually

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);         // finalize quantization matrices
}
    456 
    457 //------------------------------------------------------------------------------
    458 // Form the predictions in cache
    459 
// Offsets into the 'yuv_p_' prediction cache for each prediction mode.
// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};
    468 
    469 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
    470   const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
    471   const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
    472   VP8EncPredLuma16(it->yuv_p_, left, top);
    473 }
    474 
    475 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
    476   const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
    477   const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
    478   VP8EncPredChroma8(it->yuv_p_, left, top);
    479 }
    480 
// Computes the intra-4x4 predictions for the current sub-block into the
// 'yuv_p_' cache, using the boundary samples held in 'i4_top_'.
void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}
    484 
    485 //------------------------------------------------------------------------------
    486 // Quantize
    487 
    488 // Layout:
    489 // +----+----+
    490 // |YYYY|UUVV| 0
    491 // |YYYY|UUVV| 4
    492 // |YYYY|....| 8
    493 // |YYYY|....| 12
    494 // +----+----+
    495 
// Offset (x + y * BPS) of each 4x4 sub-block inside the macroblock,
// in raster order (see the cache layout diagram above).
const uint16_t VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

// Same, for the four U then four V sub-blocks.
static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
  8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
};
    507 
    508 //------------------------------------------------------------------------------
    509 // Distortion measurement
    510 
// Luma distortion weights, favoring low frequencies.
// NOTE(review): not referenced within this chunk -- presumably used by a
// weighted-distortion metric elsewhere in the file; confirm before relying
// on the ordering.
static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

// Per-coefficient weights applied to the squared quantization error in
// TrellisQuantizeBlock(), indexed by raster position within the 4x4 block.
// Flat (all 16) when TDisto is disabled.
static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};
    525 
    526 // Init/Copy the common fields in score.
    527 static void InitScore(VP8ModeScore* const rd) {
    528   rd->D  = 0;
    529   rd->SD = 0;
    530   rd->R  = 0;
    531   rd->H  = 0;
    532   rd->nz = 0;
    533   rd->score = MAX_COST;
    534 }
    535 
    536 static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
    537   dst->D  = src->D;
    538   dst->SD = src->SD;
    539   dst->R  = src->R;
    540   dst->H  = src->H;
    541   dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
    542   dst->score = src->score;
    543 }
    544 
    545 static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
    546   dst->D  += src->D;
    547   dst->SD += src->SD;
    548   dst->R  += src->R;
    549   dst->H  += src->H;
    550   dst->nz |= src->nz;     // here, new nz bits are accumulated.
    551   dst->score += src->score;
    552 }
    553 
    554 //------------------------------------------------------------------------------
    555 // Performs trellis-optimized quantization.
    556 
// Trellis node: one candidate quantization level for one coefficient.
typedef struct {
  int8_t prev;            // index of the best previous node in the trellis
  int8_t sign;            // sign of the original coefficient coeff_i
  int16_t level;          // candidate (non-negative) quantized level
} Node;

// Score state: rolling per-column state while traversing the trellis.
typedef struct {
  score_t score;          // partial RD score accumulated so far
  const uint16_t* costs;  // shortcut to the level-cost table for this context
} ScoreState;
    569 
    570 // If a coefficient was quantized to a value Q (using a neutral bias),
    571 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
    572 // We don't test negative values though.
    573 #define MIN_DELTA 0   // how much lower level to try
    574 #define MAX_DELTA 1   // how much higher
    575 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
    576 #define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
    577 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
    578 
    579 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
    580   rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
    581 }
    582 
    583 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
    584                                           score_t distortion) {
    585   return rate * lambda + RD_DISTO_MULT * distortion;
    586 }
    587 
// Trellis-optimized quantization of one 16-coefficient block.
// 'in' holds the raw transform coefficients (natural order) and is rewritten
// with the dequantized values of the retained levels; 'out' receives the
// chosen quantized levels in zigzag order. 'ctx0' is the initial entropy
// context, 'coeff_type' selects the probability/cost tables, 'mtx' is the
// quantization matrix and 'lambda' the rate multiplier of the RD score.
// Returns non-zero if at least one level is non-zero.
static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  // coeff_type 0 blocks start at coeff #1 (their DC is handled separately).
  const int first = (coeff_type == 0) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q  = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    // thresh_level is the highest level candidate worth considering.
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score = MAX_COST;
      int best_prev = 0;   // default, in case

      ss_cur[m].score = MAX_COST;
      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract to max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        const score_t score =
            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        const score_t score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;                     // best eob position
          best_path[1] = m;                     // best node index
          best_path[2] = best_prev;             // best predecessor
        }
      }
    }
  }

  // Fresh start: clear both arrays before writing back the chosen path.
  memset(in + first, 0, (16 - first) * sizeof(*in));
  memset(out + first, 0, (16 - first) * sizeof(*out));
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];   // dequantized value, natural order
      best_node = node->prev;
    }
    return (nz != 0);
  }
}
    746 
    747 #undef NODE
    748 
    749 //------------------------------------------------------------------------------
    750 // Performs: difference, transform, quantize, back-transform, add
    751 // all at once. Output is the reconstructed block in *yuv_out, and the
    752 // quantized levels in *levels.
    753 
// Reconstructs the whole 16x16 luma macroblock for intra16 prediction 'mode'.
// Returns a bitmask of the non-zero quantized blocks: bits 0..15 for the
// sixteen 4x4 AC blocks, bit 24 for the DC (WHT) block.
static int ReconstructIntra16(VP8EncIterator* const it,
                              VP8ModeScore* const rd,
                              uint8_t* const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  // Forward-transform the prediction residual of all sixteen 4x4 sub-blocks
  // (two blocks per call).
  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  // The sixteen DC coefficients go through a separate Walsh-Hadamard
  // transform and are quantized with the y2 matrix; its nz flag lands on
  // bit 24 of the returned mask.
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    // Trellis path: each block is quantized with a coding context derived
    // from the top/left non-zero flags, which are updated as we go.
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero =
            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
                                 &dqm->y1_, dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;   // DC is coded separately (y_dc_levels)
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}
    805 
    806 static int ReconstructIntra4(VP8EncIterator* const it,
    807                              int16_t levels[16],
    808                              const uint8_t* const src,
    809                              uint8_t* const yuv_out,
    810                              int mode) {
    811   const VP8Encoder* const enc = it->enc_;
    812   const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
    813   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
    814   int nz = 0;
    815   int16_t tmp[16];
    816 
    817   VP8FTransform(src, ref, tmp);
    818   if (DO_TRELLIS_I4 && it->do_trellis_) {
    819     const int x = it->i4_ & 3, y = it->i4_ >> 2;
    820     const int ctx = it->top_nz_[x] + it->left_nz_[y];
    821     nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
    822                               dqm->lambda_trellis_i4_);
    823   } else {
    824     nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
    825   }
    826   VP8ITransform(ref, tmp, yuv_out, 0);
    827   return nz;
    828 }
    829 
    830 //------------------------------------------------------------------------------
    831 // DC-error diffusion
    832 
    833 // Diffusion weights. We under-correct a bit (15/16th of the error is actually
    834 // diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
    835 #define C1 7    // fraction of error sent to the 4x4 block below
    836 #define C2 8    // fraction of error sent to the 4x4 block on the right
    837 #define DSHIFT 4
    838 #define DSCALE 1   // storage descaling, needed to make the error fit int8_t
    839 
    840 // Quantize as usual, but also compute and return the quantization error.
    841 // Error is already divided by DSHIFT.
    842 static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
    843   int V = *v;
    844   const int sign = (V < 0);
    845   if (sign) V = -V;
    846   if (V > (int)mtx->zthresh_[0]) {
    847     const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    848     const int err = (V - qV);
    849     *v = sign ? -qV : qV;
    850     return (sign ? -err : err) >> DSCALE;
    851   }
    852   *v = 0;
    853   return (sign ? -V : V) >> DSCALE;
    854 }
    855 
// Diffuses the DC quantization error between the four 4x4 sub-blocks of each
// chroma channel: the DCs are quantized in sequence, and a weighted fraction
// (C1, C2) of each block's error is mixed into its not-yet-quantized
// neighbors. The outgoing errors {err1,err2,err3} are saved in rd->derr so
// StoreDiffusionErrors() can hand them to the next macroblocks.
static void CorrectDCValues(const VP8EncIterator* const it,
                            const VP8Matrix* const mtx,
                            int16_t tmp[][16], VP8ModeScore* const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
  // left[1] | tmp[2]   tmp[3]        err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {   // ch=0: U plane, ch=1: V plane
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];   // this channel's four blocks
    int err0, err1, err2, err3;
    // Mix in the diffused errors coming from the top/left neighbors before
    // quantizing each DC, then propagate the fresh error downstream.
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}
    888 
    889 static void StoreDiffusionErrors(VP8EncIterator* const it,
    890                                  const VP8ModeScore* const rd) {
    891   int ch;
    892   for (ch = 0; ch <= 1; ++ch) {
    893     int8_t* const top = it->top_derr_[it->x_][ch];
    894     int8_t* const left = it->left_derr_[ch];
    895     left[0] = rd->derr[ch][0];            // restore err1
    896     left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
    897     top[0]  = rd->derr[ch][1];            //     ... err2
    898     top[1]  = rd->derr[ch][2] - left[1];  //     ... 1/4th of err3.
    899   }
    900 }
    901 
    902 #undef C1
    903 #undef C2
    904 #undef DSHIFT
    905 #undef DSCALE
    906 
    907 //------------------------------------------------------------------------------
    908 
// Reconstructs the 16x8 chroma (U+V) area for prediction 'mode': forward
// transform, optional DC error diffusion, quantization (trellis is compiled
// out by default, see DO_TRELLIS_UV), then inverse transform into 'yuv_out'.
// Returns the non-zero bitmask of the eight 4x4 chroma blocks shifted to
// bits 16..23, matching the chroma position in the macroblock nz layout.
static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                         uint8_t* const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  // Diffuse the DC quantization error between sub-blocks, when enabled.
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    // Trellis path (dead code by default since DO_TRELLIS_UV == 0).
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {   // ch=0: U, ch=2: V
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero =
              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
                                   &dqm->uv_, dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}
    949 
    950 //------------------------------------------------------------------------------
    951 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
    952 // Pick the mode is lower RD-cost = Rate + lambda * Distortion.
    953 
    954 static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
    955   // We look at the first three AC coefficients to determine what is the average
    956   // delta between each sub-4x4 block.
    957   const int v0 = abs(DCs[1]);
    958   const int v1 = abs(DCs[2]);
    959   const int v2 = abs(DCs[4]);
    960   int max_v = (v1 > v0) ? v1 : v0;
    961   max_v = (v2 > max_v) ? v2 : max_v;
    962   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
    963 }
    964 
    965 static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
    966   VP8ModeScore* const tmp = *a;
    967   *a = *b;
    968   *b = tmp;
    969 }
    970 
    971 static void SwapPtr(uint8_t** a, uint8_t** b) {
    972   uint8_t* const tmp = *a;
    973   *a = *b;
    974   *b = tmp;
    975 }
    976 
    977 static void SwapOut(VP8EncIterator* const it) {
    978   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
    979 }
    980 
// Evaluates every intra16 prediction mode, reconstructing each candidate
// into the scratch buffer and keeping the one with the lowest RD score.
// On return, *rd holds the winning mode's score/levels and it->yuv_out_
// holds its reconstruction.
static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  // 'rd_cur'/'rd_best' ping-pong between rd_tmp and *rd, so the best result
  // so far is never overwritten by the next candidate.
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score: distortion D, texture distortion SD, header bits H,
    // residual bits R.
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (mode > 0 &&
        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
      // penalty to avoid flat area to be mispredicted by complex mode
      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);   // keep the best reconstruction in yuv_out_
    }
  }
  if (rd_best != rd) {
    // The winner ended up in rd_tmp: copy it back into *rd.
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record max delta so we can later adjust the minimal filtering
  // strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}
   1032 
   1033 //------------------------------------------------------------------------------
   1034 
   1035 // return the cost array corresponding to the surrounding prediction modes.
   1036 static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
   1037                                      const uint8_t modes[16]) {
   1038   const int preds_w = it->enc_->preds_w_;
   1039   const int x = (it->i4_ & 3), y = it->i4_ >> 2;
   1040   const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
   1041   const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
   1042   return VP8FixedCostsI4[top][left];
   1043 }
   1044 
// Evaluates the sixteen 4x4 sub-blocks in scan order, picking for each the
// best of the NUM_BMODES intra4 predictors by RD score. Returns 1 when
// intra4x4 beats the intra16 score already stored in *rd (*rd is then
// updated and the reconstruction swapped into it->yuv_out_), 0 otherwise.
static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;   // intra4 disabled altogether
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;   // best score for the current sub-block
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add flatness penalty
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check, before paying for the full residual cost
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        // keep the winning reconstruction by swapping scratch <-> best
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;   // already worse than the intra16 candidate: bail out
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;   // mode-signalling cost budget exceeded
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}
   1137 
   1138 //------------------------------------------------------------------------------
   1139 
// Evaluates every chroma prediction mode and records the winner's mode,
// levels and (when error diffusion is active) DC diffusion errors in *rd.
// The best reconstruction ends up at it->yuv_out_ + U_OFF_ENC.
static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;    // tracks where the best reconstruction lives
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D  = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H  = VP8FixedCostsUV[mode];
    rd_uv.R  = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);   // keep the best reconstruction in 'dst'
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);   // accumulate chroma score into the total
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}
   1188 
   1189 //------------------------------------------------------------------------------
   1190 // Final reconstruction and quantization.
   1191 
// Quantizes and reconstructs the macroblock using the already-selected
// intra modes (no mode search), storing the non-zero block bitmask in
// rd->nz. Used e.g. for the final trellis pass after RD-opt mode decision.
static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    // Walk the sixteen 4x4 sub-blocks, reading each block's mode from the
    // prediction map and reconstructing in place.
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}
   1215 
// Refine intra16/intra4 sub-modes based on distortion only (not rate):
// modes are scored as SSE * RD_DISTO_MULT plus a fixed header cost weighted
// by empiric lambda constants. Also refines the UV mode when
// 'refine_uv_mode' is set, performs the final reconstruction, and fills
// rd->nz / rd->score.
static void RefineUsingDistortion(VP8EncIterator* const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empiric constants, of approximate order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      // skip modes whose signalling cost alone exceeds the header budget
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }
      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);   // the i4 reconstruction was built in yuv_out2_
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}
   1321 
   1322 //------------------------------------------------------------------------------
   1323 // Entry point
   1324 
   1325 int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
   1326                 VP8RDLevel rd_opt) {
   1327   int is_skipped;
   1328   const int method = it->enc_->method_;
   1329 
   1330   InitScore(rd);
   1331 
   1332   // We can perform predictions for Luma16x16 and Chroma8x8 already.
   1333   // Luma4x4 predictions needs to be done as-we-go.
   1334   VP8MakeLuma16Preds(it);
   1335   VP8MakeChroma8Preds(it);
   1336 
   1337   if (rd_opt > RD_OPT_NONE) {
   1338     it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
   1339     PickBestIntra16(it, rd);
   1340     if (method >= 2) {
   1341       PickBestIntra4(it, rd);
   1342     }
   1343     PickBestUV(it, rd);
   1344     if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
   1345       it->do_trellis_ = 1;
   1346       SimpleQuantize(it, rd);
   1347     }
   1348   } else {
   1349     // At this point we have heuristically decided intra16 / intra4.
   1350     // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
   1351     // For method <= 1, we don't re-examine the decision but just go ahead with
   1352     // quantization/reconstruction.
   1353     RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
   1354   }
   1355   is_skipped = (rd->nz == 0);
   1356   VP8SetSkip(it, is_skipped);
   1357   return is_skipped;
   1358 }
   1359