/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <math.h>
#include <stdio.h>

#include "./vp9_rtcd.h"

#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/bitops.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/system_state.h"

#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_seg_common.h"

#include "vp9/encoder/vp9_cost.h"
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vp9/encoder/vp9_rd.h"
#include "vp9/encoder/vp9_tokenize.h"

#define RD_THRESH_POW 1.25

// Factor to weigh the rate for switchable interp filters.
#define SWITCHABLE_INTERP_RATE_FACTOR 1

void vp9_rd_cost_reset(RD_COST *rd_cost) {
  rd_cost->rate = INT_MAX;
  rd_cost->dist = INT64_MAX;
  rd_cost->rdcost = INT64_MAX;
}

void vp9_rd_cost_init(RD_COST *rd_cost) {
  rd_cost->rate = 0;
  rd_cost->dist = 0;
  rd_cost->rdcost = 0;
}

// The baseline rd thresholds for breaking out of the rd loop for
// certain modes are assumed to be based on 8x8 blocks.
// This table is used to correct for block size.
// The factors here are << 2 (2 = x0.5, 32 = x8 etc).
static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
};
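// For example, BLOCK_8X8 maps to 4 (x1.0, the 8x8 baseline), BLOCK_16X16 to
// 8 (x2.0) and BLOCK_64X64 to 32 (x8.0), so larger blocks use proportionally
// larger break-out thresholds.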

static void fill_mode_costs(VP9_COMP *cpi) {
  const FRAME_CONTEXT *const fc = cpi->common.fc;
  int i, j;

  for (i = 0; i < INTRA_MODES; ++i) {
    for (j = 0; j < INTRA_MODES; ++j) {
      vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
                      vp9_intra_mode_tree);
    }
  }

  vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
  for (i = 0; i < INTRA_MODES; ++i) {
    vp9_cost_tokens(cpi->intra_uv_mode_cost[KEY_FRAME][i],
                    vp9_kf_uv_mode_prob[i], vp9_intra_mode_tree);
    vp9_cost_tokens(cpi->intra_uv_mode_cost[INTER_FRAME][i],
                    fc->uv_mode_prob[i], vp9_intra_mode_tree);
  }

  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) {
    vp9_cost_tokens(cpi->switchable_interp_costs[i],
                    fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
  }

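  // Cost of signalling each tx_size k for a maximum allowed size i:
  // accumulate vp9_cost_one() for every smaller size that is passed over and
  // a terminating vp9_cost_zero() unless k is already the largest size.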
  for (i = TX_8X8; i < TX_SIZES; ++i) {
    for (j = 0; j < TX_SIZE_CONTEXTS; ++j) {
      const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs);
      int k;
      for (k = 0; k <= i; ++k) {
        int cost = 0;
        int m;
        for (m = 0; m <= k - (k == i); ++m) {
          if (m == k)
            cost += vp9_cost_zero(tx_probs[m]);
          else
            cost += vp9_cost_one(tx_probs[m]);
        }
        cpi->tx_size_cost[i - 1][j][k] = cost;
      }
    }
  }
}

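// Builds the per-transform-size coefficient token cost tables from the
// modelled probabilities. The [0]/[1] index pairs a full-tree cost table with
// the vp9_cost_tokens_skip() variant, which leaves out the cost of the root
// EOB-vs-not decision for use in contexts where that decision is not coded;
// the assert below checks that the EOB entry itself matches in both tables.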
static void fill_token_costs(vp9_coeff_cost *c,
                             vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
  int i, j, k, l;
  TX_SIZE t;
  for (t = TX_4X4; t <= TX_32X32; ++t)
    for (i = 0; i < PLANE_TYPES; ++i)
      for (j = 0; j < REF_TYPES; ++j)
        for (k = 0; k < COEF_BANDS; ++k)
          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
            vpx_prob probs[ENTROPY_NODES];
            vp9_model_to_full_probs(p[t][i][j][k][l], probs);
            vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs, vp9_coef_tree);
            vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
                                 vp9_coef_tree);
            assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
                   c[t][i][j][k][1][l][EOB_TOKEN]);
          }
}

// Values are now correlated to quantizer.
static int sad_per_bit16lut_8[QINDEX_RANGE];
static int sad_per_bit4lut_8[QINDEX_RANGE];

#if CONFIG_VP9_HIGHBITDEPTH
static int sad_per_bit16lut_10[QINDEX_RANGE];
static int sad_per_bit4lut_10[QINDEX_RANGE];
static int sad_per_bit16lut_12[QINDEX_RANGE];
static int sad_per_bit4lut_12[QINDEX_RANGE];
#endif

static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range,
                            vpx_bit_depth_t bit_depth) {
  int i;
  // Initialize the sad lut tables using a formulaic calculation for now.
  // This is to make it easier to resolve the impact of experimental changes
  // to the quantizer tables.
  for (i = 0; i < range; i++) {
    const double q = vp9_convert_qindex_to_q(i, bit_depth);
    bit16lut[i] = (int)(0.0418 * q + 2.4107);
    bit4lut[i] = (int)(0.063 * q + 2.742);
  }
}

void vp9_init_me_luts(void) {
  init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE,
                  VPX_BITS_8);
#if CONFIG_VP9_HIGHBITDEPTH
  init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE,
                  VPX_BITS_10);
  init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE,
                  VPX_BITS_12);
#endif
}

static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12,
                                         8,  8,  4,  4,  2,  2,  1,  0 };

// Note that the element below for frame type "USE_BUF_FRAME", which indicates
// that the show frame flag is set, should never be used: no real frame is
// encoded in that case, so this code should not be reached. A dummy value is
// inserted only so that the table has the right number of entries.
static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { 128, 144, 128,
                                                              128, 144, 144 };

int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) {
  // largest dc_quant is 21387, therefore rdmult should always fit in int32_t
  const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth);
  uint32_t rdmult = q * q;

  if (cpi->common.frame_type != KEY_FRAME) {
    if (qindex < 128)
      rdmult = rdmult * 4;
    else if (qindex < 190)
      rdmult = rdmult * 4 + rdmult / 2;
    else
      rdmult = rdmult * 3;
  } else {
    if (qindex < 64)
      rdmult = rdmult * 4;
    else if (qindex <= 128)
      rdmult = rdmult * 3 + rdmult / 2;
    else if (qindex < 190)
      rdmult = rdmult * 4 + rdmult / 2;
    else
      rdmult = rdmult * 7 + rdmult / 2;
  }
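  // At 10 and 12 bits the dc quantizer is 4x and 16x the 8-bit value, so
  // q * q is scaled back down by 16 (>> 4) and 256 (>> 8) respectively to
  // keep rdmult on the 8-bit scale.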
#if CONFIG_VP9_HIGHBITDEPTH
  switch (cpi->common.bit_depth) {
    case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(rdmult, 4); break;
    case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(rdmult, 8); break;
    default: break;
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH
  return rdmult > 0 ? rdmult : 1;
}

static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) {
  int64_t rdmult_64 = rdmult;
  if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
    const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
    const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index];
    const int gfu_boost = cpi->multi_layer_arf
                              ? gf_group->gfu_boost[gf_group->index]
                              : cpi->rc.gfu_boost;
    const int boost_index = VPXMIN(15, (gfu_boost / 100));

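    // Both factor tables are in Q7 (128 == 1.0): scale by the frame type
    // factor, then add a further boost-dependent fraction, with larger
    // gfu_boost values mapping to smaller additions.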
    rdmult_64 = (rdmult_64 * rd_frame_type_factor[frame_type]) >> 7;
    rdmult_64 += ((rdmult_64 * rd_boost_factor[boost_index]) >> 7);
  }
  return (int)rdmult_64;
}

int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
  int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex);
  return modulate_rdmult(cpi, rdmult);
}

int vp9_get_adaptive_rdmult(const VP9_COMP *cpi, double beta) {
  int rdmult =
      vp9_compute_rd_mult_based_on_qindex(cpi, cpi->common.base_qindex);
  rdmult = (int)((double)rdmult / beta);
  rdmult = rdmult > 0 ? rdmult : 1;
  return modulate_rdmult(cpi, rdmult);
}

static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
  double q;
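  // Normalize the dc quantizer to a roughly bit-depth independent scale
  // before applying the power curve (the quantizer tables scale by 4x at
  // 10 bits and 16x at 12 bits relative to 8 bits).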
#if CONFIG_VP9_HIGHBITDEPTH
  switch (bit_depth) {
    case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break;
    case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break;
    default:
      assert(bit_depth == VPX_BITS_12);
      q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0;
      break;
  }
#else
  (void)bit_depth;
  q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0;
#endif  // CONFIG_VP9_HIGHBITDEPTH
  // TODO(debargha): Adjust the function below.
  return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
}

void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
#if CONFIG_VP9_HIGHBITDEPTH
  switch (cpi->common.bit_depth) {
    case VPX_BITS_8:
      x->sadperbit16 = sad_per_bit16lut_8[qindex];
      x->sadperbit4 = sad_per_bit4lut_8[qindex];
      break;
    case VPX_BITS_10:
      x->sadperbit16 = sad_per_bit16lut_10[qindex];
      x->sadperbit4 = sad_per_bit4lut_10[qindex];
      break;
    default:
      assert(cpi->common.bit_depth == VPX_BITS_12);
      x->sadperbit16 = sad_per_bit16lut_12[qindex];
      x->sadperbit4 = sad_per_bit4lut_12[qindex];
      break;
  }
#else
  (void)cpi;
  x->sadperbit16 = sad_per_bit16lut_8[qindex];
  x->sadperbit4 = sad_per_bit4lut_8[qindex];
#endif  // CONFIG_VP9_HIGHBITDEPTH
}

static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) {
  int i, bsize, segment_id;

  for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
    const int qindex =
        clamp(vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex) +
                  cm->y_dc_delta_q,
              0, MAXQ);
    const int q = compute_rd_thresh_factor(qindex, cm->bit_depth);

    for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
      // Threshold here seems unnecessarily harsh but fine given actual
      // range of values used for cpi->sf.thresh_mult[].
      const int t = q * rd_thresh_block_size_factor[bsize];
      const int thresh_max = INT_MAX / t;
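      // thresh_max guards the thresh_mult * t multiplications below against
      // signed integer overflow.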

      if (bsize >= BLOCK_8X8) {
        for (i = 0; i < MAX_MODES; ++i)
          rd->threshes[segment_id][bsize][i] = rd->thresh_mult[i] < thresh_max
                                                   ? rd->thresh_mult[i] * t / 4
                                                   : INT_MAX;
      } else {
        for (i = 0; i < MAX_REFS; ++i)
          rd->threshes[segment_id][bsize][i] =
              rd->thresh_mult_sub8x8[i] < thresh_max
                  ? rd->thresh_mult_sub8x8[i] * t / 4
                  : INT_MAX;
      }
    }
  }
}

void vp9_initialize_rd_consts(VP9_COMP *cpi) {
  VP9_COMMON *const cm = &cpi->common;
  MACROBLOCK *const x = &cpi->td.mb;
  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
  RD_OPT *const rd = &cpi->rd;
  int i;

  vpx_clear_system_state();

  rd->RDDIV = RDDIV_BITS;  // In bits (to multiply D by 128).
  rd->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
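  // RDMULT and RDDIV are combined into the rate-distortion cost used
  // throughout the mode and partition search (see the RDCOST() macro in
  // vp9_rd.h).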

  set_error_per_bit(x, rd->RDMULT);

  x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
                       cm->frame_type != KEY_FRAME)
                          ? 0
                          : 1;

  set_block_thresholds(cm, rd);
  set_partition_probs(cm, xd);

  if (cpi->oxcf.pass == 1) {
    if (!frame_is_intra_only(cm))
      vp9_build_nmv_cost_table(
          x->nmvjointcost,
          cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
          &cm->fc->nmvc, cm->allow_high_precision_mv);
  } else {
    if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME)
      fill_token_costs(x->token_costs, cm->fc->coef_probs);

    if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
        cm->frame_type == KEY_FRAME) {
      for (i = 0; i < PARTITION_CONTEXTS; ++i)
        vp9_cost_tokens(cpi->partition_cost[i], get_partition_probs(xd, i),
                        vp9_partition_tree);
    }

    if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
        cm->frame_type == KEY_FRAME) {
      fill_mode_costs(cpi);

      if (!frame_is_intra_only(cm)) {
        vp9_build_nmv_cost_table(
            x->nmvjointcost,
            cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost,
            &cm->fc->nmvc, cm->allow_high_precision_mv);

        for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
          vp9_cost_tokens((int *)cpi->inter_mode_cost[i],
                          cm->fc->inter_mode_probs[i], vp9_inter_mode_tree);
      }
    }
  }
}

// NOTE: The tables below must be of the same size.

// The functions described below are sampled at the four most significant
// bits of x^2 + 8 / 256.

// Normalized rate:
// This table models the rate for a Laplacian source with given variance
// when quantized with a uniform quantizer with given stepsize. The
// closed form expression is:
// Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
// where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
// and H(x) is the binary entropy function.
static const int rate_tab_q10[] = {
  65536, 6086, 5574, 5275, 5063, 4899, 4764, 4651, 4553, 4389, 4255, 4142, 4044,
  3958,  3881, 3811, 3748, 3635, 3538, 3453, 3376, 3307, 3244, 3186, 3133, 3037,
  2952,  2877, 2809, 2747, 2690, 2638, 2589, 2501, 2423, 2353, 2290, 2232, 2179,
  2130,  2084, 2001, 1928, 1862, 1802, 1748, 1698, 1651, 1608, 1530, 1460, 1398,
  1342,  1290, 1243, 1199, 1159, 1086, 1021, 963,  911,  864,  821,  781,  745,
  680,   623,  574,  530,  490,  455,  424,  395,  345,  304,  269,  239,  213,
  190,   171,  154,  126,  104,  87,   73,   61,   52,   44,   38,   28,   21,
  16,    12,   10,   8,    6,    5,    3,    2,    1,    1,    1,    0,    0,
};

// Normalized distortion:
// This table models the normalized distortion for a Laplacian source
// with given variance when quantized with a uniform quantizer
// with given stepsize. The closed form expression is:
// Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
// where x = qpstep / sqrt(variance).
// Note the actual distortion is Dn * variance.
static const int dist_tab_q10[] = {
  0,    0,    1,    1,    1,    2,    2,    2,    3,    3,    4,    5,    5,
  6,    7,    7,    8,    9,    11,   12,   13,   15,   16,   17,   18,   21,
  24,   26,   29,   31,   34,   36,   39,   44,   49,   54,   59,   64,   69,
  73,   78,   88,   97,   106,  115,  124,  133,  142,  151,  167,  184,  200,
  215,  231,  245,  260,  274,  301,  327,  351,  375,  397,  418,  439,  458,
  495,  528,  559,  587,  613,  637,  659,  680,  717,  749,  777,  801,  823,
  842,  859,  874,  899,  919,  936,  949,  960,  969,  977,  983,  994,  1001,
  1006, 1010, 1013, 1015, 1017, 1018, 1020, 1022, 1022, 1023, 1023, 1023, 1024,
};
static const int xsq_iq_q10[] = {
  0,      4,      8,      12,     16,     20,     24,     28,     32,
  40,     48,     56,     64,     72,     80,     88,     96,     112,
  128,    144,    160,    176,    192,    208,    224,    256,    288,
  320,    352,    384,    416,    448,    480,    544,    608,    672,
  736,    800,    864,    928,    992,    1120,   1248,   1376,   1504,
  1632,   1760,   1888,   2016,   2272,   2528,   2784,   3040,   3296,
  3552,   3808,   4064,   4576,   5088,   5600,   6112,   6624,   7136,
  7648,   8160,   9184,   10208,  11232,  12256,  13280,  14304,  15328,
  16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,  32736,
  36832,  40928,  45024,  49120,  53216,  57312,  61408,  65504,  73696,
  81888,  90080,  98272,  106464, 114656, 122848, 131040, 147424, 163808,
  180192, 196576, 212960, 229344, 245728,
};

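// Looks up the normalized rate and distortion for a given x^2 value (Q10) by
// piecewise-linear interpolation: the MSB-based index selects the bracketing
// pair of xsq_iq_q10[] sample points, and a_q10/b_q10 are the interpolation
// weights within that interval.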
static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
  const int tmp = (xsq_q10 >> 2) + 8;
  const int k = get_msb(tmp) - 3;
  const int xq = (k << 3) + ((tmp >> k) & 0x7);
  const int one_q10 = 1 << 10;
  const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
  const int b_q10 = one_q10 - a_q10;
  *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
  *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
}

static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE],
                              int r_q10[MAX_MB_PLANE],
                              int d_q10[MAX_MB_PLANE]) {
  int i;
  const int one_q10 = 1 << 10;
  for (i = 0; i < MAX_MB_PLANE; ++i) {
    const int tmp = (xsq_q10[i] >> 2) + 8;
    const int k = get_msb(tmp) - 3;
    const int xq = (k << 3) + ((tmp >> k) & 0x7);
    const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k);
    const int b_q10 = one_q10 - a_q10;
    r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
    d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
  }
}

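// Clamp inputs to just below the last xsq_iq_q10[] sample point (245728) so
// that the xq + 1 table lookups in model_rd_norm() stay in bounds.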
static const uint32_t MAX_XSQ_Q10 = 245727;

void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2,
                                  unsigned int qstep, int *rate,
                                  int64_t *dist) {
  // This function models the rate and distortion for a Laplacian
  // source with given variance when quantized with a uniform quantizer
  // with given stepsize. The closed form expressions are in:
  // Hang and Chen, "Source Model for transform video coder and its
  // application - Part I: Fundamental Theory", IEEE Trans. Circ.
  // Sys. for Video Tech., April 1997.
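  // Below, xsq_q10 is (qstep^2 << n_log2) / var in Q10, i.e. x^2 with the
  // variance expressed per sample; the + (var >> 1) term rounds the division
  // to nearest.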
  if (var == 0) {
    *rate = 0;
    *dist = 0;
  } else {
    int d_q10, r_q10;
    const uint64_t xsq_q10_64 =
        (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
    const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
    model_rd_norm(xsq_q10, &r_q10, &d_q10);
    *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - VP9_PROB_COST_SHIFT);
    *dist = (var * (int64_t)d_q10 + 512) >> 10;
  }
}

// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where
// vectors are of length MAX_MB_PLANE and all elements of var are non-zero.
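// Note: the results are accumulated into *rate_sum and *dist_sum, so the
// caller is expected to initialize them.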
void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
                                      unsigned int n_log2[MAX_MB_PLANE],
                                      unsigned int qstep[MAX_MB_PLANE],
                                      int64_t *rate_sum, int64_t *dist_sum) {
  int i;
  int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE];
  for (i = 0; i < MAX_MB_PLANE; ++i) {
    const uint64_t xsq_q10_64 =
        (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) /
        var[i];
    xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
  }
  model_rd_norm_vec(xsq_q10, r_q10, d_q10);
  for (i = 0; i < MAX_MB_PLANE; ++i) {
    int rate =
        ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT);
    int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10;
    *rate_sum += rate;
    *dist_sum += dist;
  }
}

void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                              const struct macroblockd_plane *pd,
                              ENTROPY_CONTEXT t_above[16],
                              ENTROPY_CONTEXT t_left[16]) {
  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
  const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
  const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
  const ENTROPY_CONTEXT *const above = pd->above_context;
  const ENTROPY_CONTEXT *const left = pd->left_context;

  int i;
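  // For transforms larger than 4x4, the 4x4 contexts covered by each
  // transform block are collapsed into a single flag by reading them as one
  // wider load: the result is nonzero if any of the covered contexts is set.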
  switch (tx_size) {
    case TX_4X4:
      memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
      memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
      break;
    case TX_8X8:
      for (i = 0; i < num_4x4_w; i += 2)
        t_above[i] = !!*(const uint16_t *)&above[i];
      for (i = 0; i < num_4x4_h; i += 2)
        t_left[i] = !!*(const uint16_t *)&left[i];
      break;
    case TX_16X16:
      for (i = 0; i < num_4x4_w; i += 4)
        t_above[i] = !!*(const uint32_t *)&above[i];
      for (i = 0; i < num_4x4_h; i += 4)
        t_left[i] = !!*(const uint32_t *)&left[i];
      break;
    default:
      assert(tx_size == TX_32X32);
      for (i = 0; i < num_4x4_w; i += 8)
        t_above[i] = !!*(const uint64_t *)&above[i];
      for (i = 0; i < num_4x4_h; i += 8)
        t_left[i] = !!*(const uint64_t *)&left[i];
      break;
  }
}

void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
                 int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
  int i;
  int zero_seen = 0;
  int best_index = 0;
  int best_sad = INT_MAX;
  int this_sad = INT_MAX;
  int max_mv = 0;
  int near_same_nearest;
  uint8_t *src_y_ptr = x->plane[0].src.buf;
  uint8_t *ref_y_ptr;
  const int num_mv_refs =
      MAX_MV_REF_CANDIDATES + (block_size < x->max_partition_size);

  MV pred_mv[3];
  pred_mv[0] = x->mbmi_ext->ref_mvs[ref_frame][0].as_mv;
  pred_mv[1] = x->mbmi_ext->ref_mvs[ref_frame][1].as_mv;
  pred_mv[2] = x->pred_mv[ref_frame];
  assert(num_mv_refs <= (int)(sizeof(pred_mv) / sizeof(pred_mv[0])));

  near_same_nearest = x->mbmi_ext->ref_mvs[ref_frame][0].as_int ==
                      x->mbmi_ext->ref_mvs[ref_frame][1].as_int;

  // Get the sad for each candidate reference mv.
  for (i = 0; i < num_mv_refs; ++i) {
    const MV *this_mv = &pred_mv[i];
    int fp_row, fp_col;
    if (this_mv->row == INT16_MAX || this_mv->col == INT16_MAX) continue;
    if (i == 1 && near_same_nearest) continue;
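    // Round the 1/8-pel candidate mv to full-pel units (ties are rounded
    // away from zero) to form the offset into the reference buffer.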
    fp_row = (this_mv->row + 3 + (this_mv->row >= 0)) >> 3;
    fp_col = (this_mv->col + 3 + (this_mv->col >= 0)) >> 3;
    max_mv = VPXMAX(max_mv, VPXMAX(abs(this_mv->row), abs(this_mv->col)) >> 3);

    if (fp_row == 0 && fp_col == 0 && zero_seen) continue;
    zero_seen |= (fp_row == 0 && fp_col == 0);

    ref_y_ptr = &ref_y_buffer[ref_y_stride * fp_row + fp_col];
    // Find sad for current vector.
    this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
                                           ref_y_ptr, ref_y_stride);
    // Note if it is the best so far.
    if (this_sad < best_sad) {
      best_sad = this_sad;
      best_index = i;
    }
  }

  // Note the index of the mv that worked best in the reference list.
  x->mv_best_ref_index[ref_frame] = best_index;
  x->max_mv_context[ref_frame] = max_mv;
  x->pred_mv_sad[ref_frame] = best_sad;
}

void vp9_setup_pred_block(const MACROBLOCKD *xd,
                          struct buf_2d dst[MAX_MB_PLANE],
                          const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                          const struct scale_factors *scale,
                          const struct scale_factors *scale_uv) {
  int i;

  dst[0].buf = src->y_buffer;
  dst[0].stride = src->y_stride;
  dst[1].buf = src->u_buffer;
  dst[2].buf = src->v_buffer;
  dst[1].stride = dst[2].stride = src->uv_stride;

  for (i = 0; i < MAX_MB_PLANE; ++i) {
    setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
                     i ? scale_uv : scale, xd->plane[i].subsampling_x,
                     xd->plane[i].subsampling_y);
  }
}

int vp9_raster_block_offset(BLOCK_SIZE plane_bsize, int raster_block,
                            int stride) {
  const int bw = b_width_log2_lookup[plane_bsize];
  const int y = 4 * (raster_block >> bw);
  const int x = 4 * (raster_block & ((1 << bw) - 1));
  return y * stride + x;
}

int16_t *vp9_raster_block_offset_int16(BLOCK_SIZE plane_bsize, int raster_block,
                                       int16_t *base) {
  const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
  return base + vp9_raster_block_offset(plane_bsize, raster_block, stride);
}

YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
                                             int ref_frame) {
  const VP9_COMMON *const cm = &cpi->common;
  const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
  const int ref_idx = get_ref_frame_buf_idx(cpi, ref_frame);
  assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
  return (scaled_idx != ref_idx && scaled_idx != INVALID_IDX)
             ? &cm->buffer_pool->frame_bufs[scaled_idx].buf
             : NULL;
}

int vp9_get_switchable_rate(const VP9_COMP *cpi, const MACROBLOCKD *const xd) {
  const MODE_INFO *const mi = xd->mi[0];
  const int ctx = get_pred_context_switchable_interp(xd);
  return SWITCHABLE_INTERP_RATE_FACTOR *
         cpi->switchable_interp_costs[ctx][mi->interp_filter];
}

void vp9_set_rd_speed_thresholds(VP9_COMP *cpi) {
  int i;
  RD_OPT *const rd = &cpi->rd;
  SPEED_FEATURES *const sf = &cpi->sf;

  // Set baseline threshold values.
  for (i = 0; i < MAX_MODES; ++i)
    rd->thresh_mult[i] = cpi->oxcf.mode == BEST ? -500 : 0;

  if (sf->adaptive_rd_thresh) {
    rd->thresh_mult[THR_NEARESTMV] = 300;
    rd->thresh_mult[THR_NEARESTG] = 300;
    rd->thresh_mult[THR_NEARESTA] = 300;
  } else {
    rd->thresh_mult[THR_NEARESTMV] = 0;
    rd->thresh_mult[THR_NEARESTG] = 0;
    rd->thresh_mult[THR_NEARESTA] = 0;
  }

  rd->thresh_mult[THR_DC] += 1000;

  rd->thresh_mult[THR_NEWMV] += 1000;
  rd->thresh_mult[THR_NEWA] += 1000;
  rd->thresh_mult[THR_NEWG] += 1000;

  rd->thresh_mult[THR_NEARMV] += 1000;
  rd->thresh_mult[THR_NEARA] += 1000;
  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;

  rd->thresh_mult[THR_TM] += 1000;

  rd->thresh_mult[THR_COMP_NEARLA] += 1500;
  rd->thresh_mult[THR_COMP_NEWLA] += 2000;
  rd->thresh_mult[THR_NEARG] += 1000;
  rd->thresh_mult[THR_COMP_NEARGA] += 1500;
  rd->thresh_mult[THR_COMP_NEWGA] += 2000;

  rd->thresh_mult[THR_ZEROMV] += 2000;
  rd->thresh_mult[THR_ZEROG] += 2000;
  rd->thresh_mult[THR_ZEROA] += 2000;
  rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
  rd->thresh_mult[THR_COMP_ZEROGA] += 2500;

  rd->thresh_mult[THR_H_PRED] += 2000;
  rd->thresh_mult[THR_V_PRED] += 2000;
  rd->thresh_mult[THR_D45_PRED] += 2500;
  rd->thresh_mult[THR_D135_PRED] += 2500;
  rd->thresh_mult[THR_D117_PRED] += 2500;
  rd->thresh_mult[THR_D153_PRED] += 2500;
  rd->thresh_mult[THR_D207_PRED] += 2500;
  rd->thresh_mult[THR_D63_PRED] += 2500;
}

void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
  static const int thresh_mult[2][MAX_REFS] = {
    { 2500, 2500, 2500, 4500, 4500, 2500 },
    { 2000, 2000, 2000, 4000, 4000, 2000 }
  };
  RD_OPT *const rd = &cpi->rd;
  const int idx = cpi->oxcf.mode == BEST;
  memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
}

void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
                               int bsize, int best_mode_index) {
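  // Adaptive mode-pruning thresholds: the factor for the winning mode decays
  // by 1/16 of its value while every other mode's factor grows by
  // RD_THRESH_INC, capped at rd_thresh * RD_THRESH_MAX_FACT. The update is
  // applied to the neighbouring block sizes [bsize - 1, bsize + 2] as well.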
  if (rd_thresh > 0) {
    const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
    int mode;
    for (mode = 0; mode < top_mode; ++mode) {
      const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4);
      const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_64X64);
      BLOCK_SIZE bs;
      for (bs = min_size; bs <= max_size; ++bs) {
        int *const fact = &factor_buf[bs][mode];
        if (mode == best_mode_index) {
          *fact -= (*fact >> 4);
        } else {
          *fact = VPXMIN(*fact + RD_THRESH_INC, rd_thresh * RD_THRESH_MAX_FACT);
        }
      }
    }
  }
}

int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize,
                               int qindex, int qdelta) {
  // Reduce the intra cost penalty for small blocks (<=16x16).
  int reduction_fac =
      (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0;

  if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh)
    // Don't reduce intra cost penalty if estimated noise level is high.
    reduction_fac = 0;

  // Always use VPX_BITS_8 as input here because the penalty is applied
  // to rate not distortion so we want a consistent penalty for all bit
  // depths. If the actual bit depth were passed in here then the value
  // returned by vp9_dc_quant() would scale with the bit depth and we would
  // then need to apply inverse scaling to correct back to a bit depth
  // independent rate penalty.
  return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac;
}
    746