Home | History | Annotate | Download | only in encoder
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <assert.h>
     12 #include <limits.h>
     13 #include <math.h>
     14 #include <stdio.h>
     15 
     16 #include "./vp9_rtcd.h"
     17 
     18 #include "vpx_mem/vpx_mem.h"
     19 
     20 #include "vp9/common/vp9_common.h"
     21 #include "vp9/common/vp9_entropy.h"
     22 #include "vp9/common/vp9_entropymode.h"
     23 #include "vp9/common/vp9_idct.h"
     24 #include "vp9/common/vp9_mvref_common.h"
     25 #include "vp9/common/vp9_pragmas.h"
     26 #include "vp9/common/vp9_pred_common.h"
     27 #include "vp9/common/vp9_quant_common.h"
     28 #include "vp9/common/vp9_reconinter.h"
     29 #include "vp9/common/vp9_reconintra.h"
     30 #include "vp9/common/vp9_seg_common.h"
     31 #include "vp9/common/vp9_systemdependent.h"
     32 
     33 #include "vp9/encoder/vp9_cost.h"
     34 #include "vp9/encoder/vp9_encodemb.h"
     35 #include "vp9/encoder/vp9_encodemv.h"
     36 #include "vp9/encoder/vp9_mcomp.h"
     37 #include "vp9/encoder/vp9_onyx_int.h"
     38 #include "vp9/encoder/vp9_quantize.h"
     39 #include "vp9/encoder/vp9_ratectrl.h"
     40 #include "vp9/encoder/vp9_rdopt.h"
     41 #include "vp9/encoder/vp9_tokenize.h"
     42 #include "vp9/encoder/vp9_variance.h"
     43 
// NOTE(review): RD_THRESH_MAX_FACT and RD_THRESH_INC are not referenced in
// the visible part of this file; their consumers need to be confirmed.
#define RD_THRESH_MAX_FACT 64
#define RD_THRESH_INC      1
// Exponent applied to (dc quantizer / 4) in compute_rd_thresh_factor().
#define RD_THRESH_POW      1.25
// Divisor used to derive x->errorperbit from cpi->RDMULT in
// vp9_initialize_rd_consts().
#define RD_MULT_EPB_RATIO  64

/* Factor to weigh the rate for switchable interp filters */
#define SWITCHABLE_INTERP_RATE_FACTOR 1

// NOTE(review): presumably per-reference-frame bit masks over the entries of
// vp9_mode_order; the code that tests them is outside the visible part of the
// file -- confirm before changing either the masks or the table order.
#define LAST_FRAME_MODE_MASK    0xFFEDCD60
#define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
#define ALT_REF_MODE_MASK       0xFFC648D0

// NOTE(review): not referenced in the visible part of this file.
#define MIN_EARLY_TERM_INDEX    3
     57 
// One entry of vp9_mode_order: a prediction mode paired with the reference
// frame(s) it predicts from. In that table the second reference is NONE for
// every single-reference and intra entry.
typedef struct {
  MB_PREDICTION_MODE mode;
  MV_REFERENCE_FRAME ref_frame[2];
} MODE_DEFINITION;
     62 
// One entry of vp9_ref_order: a reference frame pair with no associated
// prediction mode. In that table the second reference is NONE for every
// single-reference and intra entry.
typedef struct {
  MV_REFERENCE_FRAME ref_frame[2];
} REF_DEFINITION;
     66 
// State shared between txfm_rd_in_plane() and its per-transform-block
// callback block_rd_txfm().
struct rdcost_block_args {
  // Macroblock being evaluated.
  MACROBLOCK *x;
  // Working copies of the above/left entropy contexts; filled by
  // vp9_get_entropy_contexts() and updated by cost_coeffs() via rate_block().
  ENTROPY_CONTEXT t_above[16];
  ENTROPY_CONTEXT t_left[16];
  // Results for the most recently processed transform block
  // (written by rate_block() and dist_block()).
  int rate;
  int64_t dist;
  int64_t sse;
  // Running totals over all transform blocks processed so far
  // (accumulated in block_rd_txfm()).
  int this_rate;
  int64_t this_dist;
  int64_t this_sse;
  int64_t this_rd;
  // RD budget: once this_rd exceeds it, block_rd_txfm() sets skip.
  int64_t best_rd;
  // Non-zero once the budget was exceeded; block_rd_txfm() then returns
  // immediately for every remaining block.
  int skip;
  // Forwarded unchanged to cost_coeffs().
  int use_fast_coef_costing;
  // Scan order selected by get_scan() in txfm_rd_in_plane().
  const scan_order *so;
};
     83 
// Table of (prediction mode, reference frame pair) candidates with MAX_MODES
// entries. cpi->rd_thresh_mult[] and cpi->rd_threshes[][][] are iterated over
// the same MAX_MODES range in set_block_thresholds().
// NOTE(review): presumably the array order is the order in which the mode
// search visits candidates, and the *_MODE_MASK constants index into it; the
// consumer is outside the visible part of the file -- do not reorder without
// confirming.
const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
  {NEARESTMV, {LAST_FRAME,   NONE}},
  {NEARESTMV, {ALTREF_FRAME, NONE}},
  {NEARESTMV, {GOLDEN_FRAME, NONE}},

  {DC_PRED,   {INTRA_FRAME,  NONE}},

  {NEWMV,     {LAST_FRAME,   NONE}},
  {NEWMV,     {ALTREF_FRAME, NONE}},
  {NEWMV,     {GOLDEN_FRAME, NONE}},

  {NEARMV,    {LAST_FRAME,   NONE}},
  {NEARMV,    {ALTREF_FRAME, NONE}},
  {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
  {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},

  {TM_PRED,   {INTRA_FRAME,  NONE}},

  {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
  {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
  {NEARMV,    {GOLDEN_FRAME, NONE}},
  {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
  {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},

  {ZEROMV,    {LAST_FRAME,   NONE}},
  {ZEROMV,    {GOLDEN_FRAME, NONE}},
  {ZEROMV,    {ALTREF_FRAME, NONE}},
  {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
  {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},

  {H_PRED,    {INTRA_FRAME,  NONE}},
  {V_PRED,    {INTRA_FRAME,  NONE}},
  {D135_PRED, {INTRA_FRAME,  NONE}},
  {D207_PRED, {INTRA_FRAME,  NONE}},
  {D153_PRED, {INTRA_FRAME,  NONE}},
  {D63_PRED,  {INTRA_FRAME,  NONE}},
  {D117_PRED, {INTRA_FRAME,  NONE}},
  {D45_PRED,  {INTRA_FRAME,  NONE}},
};
    123 
// Table of reference frame pairs with MAX_REFS entries.
// cpi->rd_thresh_mult_sub8x8[] and cpi->rd_thresh_sub8x8[][][] are iterated
// over the same MAX_REFS range in set_block_thresholds().
// NOTE(review): presumably used by the sub-8x8 search, whose code is outside
// the visible part of the file -- confirm before reordering.
const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
  {{LAST_FRAME,   NONE}},
  {{GOLDEN_FRAME, NONE}},
  {{ALTREF_FRAME, NONE}},
  {{LAST_FRAME,   ALTREF_FRAME}},
  {{GOLDEN_FRAME, ALTREF_FRAME}},
  {{INTRA_FRAME,  NONE}},
};
    132 
// The baseline rd thresholds for breaking out of the rd loop for
// certain modes are assumed to be based on 8x8 blocks.
// This table is used to correct for block size.
// The factors are stored multiplied by 4 (2 = x0.5, 32 = x8 etc.);
// set_block_thresholds() divides by 4 again after applying them.
static int rd_thresh_block_size_factor[BLOCK_SIZES] =
  {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
    139 
    140 static int raster_block_offset(BLOCK_SIZE plane_bsize,
    141                                int raster_block, int stride) {
    142   const int bw = b_width_log2(plane_bsize);
    143   const int y = 4 * (raster_block >> bw);
    144   const int x = 4 * (raster_block & ((1 << bw) - 1));
    145   return y * stride + x;
    146 }
    147 static int16_t* raster_block_offset_int16(BLOCK_SIZE plane_bsize,
    148                                           int raster_block, int16_t *base) {
    149   const int stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
    150   return base + raster_block_offset(plane_bsize, raster_block, stride);
    151 }
    152 
    153 static void fill_mode_costs(VP9_COMP *cpi) {
    154   MACROBLOCK *const x = &cpi->mb;
    155   const FRAME_CONTEXT *const fc = &cpi->common.fc;
    156   int i, j;
    157 
    158   for (i = 0; i < INTRA_MODES; i++)
    159     for (j = 0; j < INTRA_MODES; j++)
    160       vp9_cost_tokens((int *)x->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
    161                       vp9_intra_mode_tree);
    162 
    163   // TODO(rbultje) separate tables for superblock costing?
    164   vp9_cost_tokens(x->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
    165   vp9_cost_tokens(x->intra_uv_mode_cost[KEY_FRAME],
    166                   vp9_kf_uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
    167   vp9_cost_tokens(x->intra_uv_mode_cost[INTER_FRAME],
    168                   fc->uv_mode_prob[TM_PRED], vp9_intra_mode_tree);
    169 
    170   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
    171     vp9_cost_tokens((int *)x->switchable_interp_costs[i],
    172                     fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
    173 }
    174 
    175 static void fill_token_costs(vp9_coeff_cost *c,
    176                              vp9_coeff_probs_model (*p)[PLANE_TYPES]) {
    177   int i, j, k, l;
    178   TX_SIZE t;
    179   for (t = TX_4X4; t <= TX_32X32; ++t)
    180     for (i = 0; i < PLANE_TYPES; ++i)
    181       for (j = 0; j < REF_TYPES; ++j)
    182         for (k = 0; k < COEF_BANDS; ++k)
    183           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
    184             vp9_prob probs[ENTROPY_NODES];
    185             vp9_model_to_full_probs(p[t][i][j][k][l], probs);
    186             vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
    187                             vp9_coef_tree);
    188             vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
    189                                  vp9_coef_tree);
    190             assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
    191                    c[t][i][j][k][1][l][EOB_TOKEN]);
    192           }
    193 }
    194 
// Boost applied to the RD multiplier in vp9_compute_rd_mult(), in units of
// 1/16 of the base multiplier. Indexed by cpi->twopass.next_iiratio, with
// ratios above 31 using the last entry.
static const int rd_iifactor[32] = {
  4, 4, 3, 2, 1, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
};
    201 
// 3* dc_qlookup[Q]*dc_qlookup[Q];

/* values are now correlated to quantizer */
// Per-qindex lookup tables, filled by vp9_init_me_luts() and copied into
// cpi->mb.sadperbit16 / cpi->mb.sadperbit4 by vp9_initialize_me_consts().
static int sad_per_bit16lut[QINDEX_RANGE];
static int sad_per_bit4lut[QINDEX_RANGE];
    207 
    208 void vp9_init_me_luts() {
    209   int i;
    210 
    211   // Initialize the sad lut tables using a formulaic calculation for now
    212   // This is to make it easier to resolve the impact of experimental changes
    213   // to the quantizer tables.
    214   for (i = 0; i < QINDEX_RANGE; i++) {
    215     const double q = vp9_convert_qindex_to_q(i);
    216     sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107);
    217     sad_per_bit4lut[i] = (int)(0.063 * q + 2.742);
    218   }
    219 }
    220 
    221 int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
    222   const int q = vp9_dc_quant(qindex, 0);
    223   // TODO(debargha): Adjust the function below
    224   int rdmult = 88 * q * q / 25;
    225   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
    226     if (cpi->twopass.next_iiratio > 31)
    227       rdmult += (rdmult * rd_iifactor[31]) >> 4;
    228     else
    229       rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
    230   }
    231   return rdmult;
    232 }
    233 
    234 static int compute_rd_thresh_factor(int qindex) {
    235   // TODO(debargha): Adjust the function below
    236   const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
    237   return MAX(q, 8);
    238 }
    239 
    240 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
    241   cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
    242   cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
    243 }
    244 
    245 static void set_block_thresholds(VP9_COMP *cpi) {
    246   const VP9_COMMON *const cm = &cpi->common;
    247   int i, bsize, segment_id;
    248 
    249   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
    250     const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id,
    251                                             cm->base_qindex) + cm->y_dc_delta_q,
    252                              0, MAXQ);
    253     const int q = compute_rd_thresh_factor(qindex);
    254 
    255     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
    256       // Threshold here seems unnecessarily harsh but fine given actual
    257       // range of values used for cpi->sf.thresh_mult[].
    258       const int t = q * rd_thresh_block_size_factor[bsize];
    259       const int thresh_max = INT_MAX / t;
    260 
    261       for (i = 0; i < MAX_MODES; ++i)
    262         cpi->rd_threshes[segment_id][bsize][i] =
    263             cpi->rd_thresh_mult[i] < thresh_max ? cpi->rd_thresh_mult[i] * t / 4
    264                                             : INT_MAX;
    265 
    266       for (i = 0; i < MAX_REFS; ++i) {
    267         cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
    268             cpi->rd_thresh_mult_sub8x8[i] < thresh_max
    269                 ? cpi->rd_thresh_mult_sub8x8[i] * t / 4
    270                 : INT_MAX;
    271       }
    272     }
    273   }
    274 }
    275 
    276 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
    277   VP9_COMMON *const cm = &cpi->common;
    278   MACROBLOCK *const x = &cpi->mb;
    279   int i;
    280 
    281   vp9_clear_system_state();
    282 
    283   cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
    284   cpi->RDMULT = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
    285 
    286   x->errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
    287   x->errorperbit += (x->errorperbit == 0);
    288 
    289   x->select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
    290                          cm->frame_type != KEY_FRAME) ? 0 : 1;
    291 
    292   set_block_thresholds(cpi);
    293 
    294   if (!cpi->sf.use_nonrd_pick_mode || cm->frame_type == KEY_FRAME) {
    295     fill_token_costs(x->token_costs, cm->fc.coef_probs);
    296 
    297     for (i = 0; i < PARTITION_CONTEXTS; i++)
    298       vp9_cost_tokens(x->partition_cost[i], get_partition_probs(cm, i),
    299                       vp9_partition_tree);
    300   }
    301 
    302   if (!cpi->sf.use_nonrd_pick_mode || (cm->current_video_frame & 0x07) == 1 ||
    303       cm->frame_type == KEY_FRAME) {
    304     fill_mode_costs(cpi);
    305 
    306     if (!frame_is_intra_only(cm)) {
    307       vp9_build_nmv_cost_table(x->nmvjointcost,
    308                                cm->allow_high_precision_mv ? x->nmvcost_hp
    309                                                            : x->nmvcost,
    310                                &cm->fc.nmvc, cm->allow_high_precision_mv);
    311 
    312       for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
    313         vp9_cost_tokens((int *)x->inter_mode_cost[i],
    314                         cm->fc.inter_mode_probs[i], vp9_inter_mode_tree);
    315     }
    316   }
    317 }
    318 
// Largest xsq_q10 value passed to model_rd_norm(); one less than the final
// entry (245728) of that function's xsq_iq_q10 table.
// vp9_model_rd_from_var_lapndz() clamps its input to this value.
static const int MAX_XSQ_Q10 = 245727;
    320 
// Looks up the normalized rate (*r_q10) and normalized distortion (*d_q10),
// both in Q10, for a normalized squared step size xsq_q10 (Q10) by linear
// interpolation between neighbouring table entries.
// Callers must keep xsq_q10 within [0, MAX_XSQ_Q10] so that both xq and
// xq + 1 are valid table indices.
static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) {
  // NOTE: The tables below must be of the same size

  // The functions described below are sampled at the four most significant
  // bits of x^2 + 8 / 256

  // Normalized rate
  // This table models the rate for a Laplacian
  // source with given variance when quantized with a uniform quantizer
  // with given stepsize. The closed form expression is:
  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
  // and H(x) is the binary entropy function.
  static const int rate_tab_q10[] = {
    65536,  6086,  5574,  5275,  5063,  4899,  4764,  4651,
     4553,  4389,  4255,  4142,  4044,  3958,  3881,  3811,
     3748,  3635,  3538,  3453,  3376,  3307,  3244,  3186,
     3133,  3037,  2952,  2877,  2809,  2747,  2690,  2638,
     2589,  2501,  2423,  2353,  2290,  2232,  2179,  2130,
     2084,  2001,  1928,  1862,  1802,  1748,  1698,  1651,
     1608,  1530,  1460,  1398,  1342,  1290,  1243,  1199,
     1159,  1086,  1021,   963,   911,   864,   821,   781,
      745,   680,   623,   574,   530,   490,   455,   424,
      395,   345,   304,   269,   239,   213,   190,   171,
      154,   126,   104,    87,    73,    61,    52,    44,
       38,    28,    21,    16,    12,    10,     8,     6,
        5,     3,     2,     1,     1,     1,     0,     0,
  };
  // Normalized distortion
  // This table models the normalized distortion for a Laplacian
  // source with given variance when quantized with a uniform quantizer
  // with given stepsize. The closed form expression is:
  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
  // where x = qpstep / sqrt(variance)
  // Note the actual distortion is Dn * variance.
  static const int dist_tab_q10[] = {
       0,     0,     1,     1,     1,     2,     2,     2,
       3,     3,     4,     5,     5,     6,     7,     7,
       8,     9,    11,    12,    13,    15,    16,    17,
      18,    21,    24,    26,    29,    31,    34,    36,
      39,    44,    49,    54,    59,    64,    69,    73,
      78,    88,    97,   106,   115,   124,   133,   142,
     151,   167,   184,   200,   215,   231,   245,   260,
     274,   301,   327,   351,   375,   397,   418,   439,
     458,   495,   528,   559,   587,   613,   637,   659,
     680,   717,   749,   777,   801,   823,   842,   859,
     874,   899,   919,   936,   949,   960,   969,   977,
     983,   994,  1001,  1006,  1010,  1013,  1015,  1017,
    1018,  1020,  1022,  1022,  1023,  1023,  1023,  1024,
  };
  // Sample positions (values of xsq_q10) at which the two tables above are
  // tabulated. Each row of eight entries doubles the spacing of the previous
  // row: 4, 8, 16, ...
  static const int xsq_iq_q10[] = {
         0,      4,      8,     12,     16,     20,     24,     28,
        32,     40,     48,     56,     64,     72,     80,     88,
        96,    112,    128,    144,    160,    176,    192,    208,
       224,    256,    288,    320,    352,    384,    416,    448,
       480,    544,    608,    672,    736,    800,    864,    928,
       992,   1120,   1248,   1376,   1504,   1632,   1760,   1888,
      2016,   2272,   2528,   2784,   3040,   3296,   3552,   3808,
      4064,   4576,   5088,   5600,   6112,   6624,   7136,   7648,
      8160,   9184,  10208,  11232,  12256,  13280,  14304,  15328,
     16352,  18400,  20448,  22496,  24544,  26592,  28640,  30688,
     32736,  36832,  40928,  45024,  49120,  53216,  57312,  61408,
     65504,  73696,  81888,  90080,  98272, 106464, 114656, 122848,
    131040, 147424, 163808, 180192, 196576, 212960, 229344, 245728,
  };
  /*
  static const int tab_size = sizeof(rate_tab_q10) / sizeof(rate_tab_q10[0]);
  assert(sizeof(dist_tab_q10) / sizeof(dist_tab_q10[0]) == tab_size);
  assert(sizeof(xsq_iq_q10) / sizeof(xsq_iq_q10[0]) == tab_size);
  assert(MAX_XSQ_Q10 + 1 == xsq_iq_q10[tab_size - 1]);
  */
  // tmp is at least 8 for non-negative input, so k (the table row) is >= 0.
  int tmp = (xsq_q10 >> 2) + 8;
  int k = get_msb(tmp) - 3;
  // Row k holds eight entries; the three bits below the MSB pick the column.
  int xq = (k << 3) + ((tmp >> k) & 0x7);
  const int one_q10 = 1 << 10;
  // Fractional position (Q10) between sample xq and sample xq + 1; the
  // spacing between neighbouring samples in row k is 1 << (2 + k).
  const int a_q10 = ((xsq_q10 - xsq_iq_q10[xq]) << 10) >> (2 + k);
  const int b_q10 = one_q10 - a_q10;
  // Linear interpolation between the two neighbouring table entries.
  *r_q10 = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10;
  *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10;
}
    401 
    402 void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
    403                                   unsigned int qstep, int *rate,
    404                                   int64_t *dist) {
    405   // This function models the rate and distortion for a Laplacian
    406   // source with given variance when quantized with a uniform quantizer
    407   // with given stepsize. The closed form expressions are in:
    408   // Hang and Chen, "Source Model for transform video coder and its
    409   // application - Part I: Fundamental Theory", IEEE Trans. Circ.
    410   // Sys. for Video Tech., April 1997.
    411   if (var == 0) {
    412     *rate = 0;
    413     *dist = 0;
    414   } else {
    415     int d_q10, r_q10;
    416     const uint64_t xsq_q10_64 =
    417         ((((uint64_t)qstep * qstep * n) << 10) + (var >> 1)) / var;
    418     const int xsq_q10 = xsq_q10_64 > MAX_XSQ_Q10 ?
    419                         MAX_XSQ_Q10 : (int)xsq_q10_64;
    420     model_rd_norm(xsq_q10, &r_q10, &d_q10);
    421     *rate = (n * r_q10 + 2) >> 2;
    422     *dist = (var * (int64_t)d_q10 + 512) >> 10;
    423   }
    424 }
    425 
    426 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
    427                             MACROBLOCK *x, MACROBLOCKD *xd,
    428                             int *out_rate_sum, int64_t *out_dist_sum) {
    429   // Note our transform coeffs are 8 times an orthogonal transform.
    430   // Hence quantizer step is also 8 times. To get effective quantizer
    431   // we need to divide by 8 before sending to modeling function.
    432   int i;
    433   int64_t rate_sum = 0;
    434   int64_t dist_sum = 0;
    435   const int ref = xd->mi[0]->mbmi.ref_frame[0];
    436   unsigned int sse;
    437 
    438   for (i = 0; i < MAX_MB_PLANE; ++i) {
    439     struct macroblock_plane *const p = &x->plane[i];
    440     struct macroblockd_plane *const pd = &xd->plane[i];
    441     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
    442 
    443     (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
    444                               pd->dst.buf, pd->dst.stride, &sse);
    445 
    446     if (i == 0)
    447       x->pred_sse[ref] = sse;
    448 
    449     // Fast approximate the modelling function.
    450     if (cpi->speed > 4) {
    451       int64_t rate;
    452       int64_t dist;
    453       int64_t square_error = sse;
    454       int quantizer = (pd->dequant[1] >> 3);
    455 
    456       if (quantizer < 120)
    457         rate = (square_error * (280 - quantizer)) >> 8;
    458       else
    459         rate = 0;
    460       dist = (square_error * quantizer) >> 8;
    461       rate_sum += rate;
    462       dist_sum += dist;
    463     } else {
    464       int rate;
    465       int64_t dist;
    466       vp9_model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
    467                                    pd->dequant[1] >> 3, &rate, &dist);
    468       rate_sum += rate;
    469       dist_sum += dist;
    470     }
    471   }
    472 
    473   *out_rate_sum = (int)rate_sum;
    474   *out_dist_sum = dist_sum << 4;
    475 }
    476 
    477 static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
    478                                  TX_SIZE tx_size,
    479                                  MACROBLOCK *x, MACROBLOCKD *xd,
    480                                  int *out_rate_sum, int64_t *out_dist_sum,
    481                                  int *out_skip) {
    482   int j, k;
    483   BLOCK_SIZE bs;
    484   const struct macroblock_plane *const p = &x->plane[0];
    485   const struct macroblockd_plane *const pd = &xd->plane[0];
    486   const int width = 4 * num_4x4_blocks_wide_lookup[bsize];
    487   const int height = 4 * num_4x4_blocks_high_lookup[bsize];
    488   int rate_sum = 0;
    489   int64_t dist_sum = 0;
    490   const int t = 4 << tx_size;
    491 
    492   if (tx_size == TX_4X4) {
    493     bs = BLOCK_4X4;
    494   } else if (tx_size == TX_8X8) {
    495     bs = BLOCK_8X8;
    496   } else if (tx_size == TX_16X16) {
    497     bs = BLOCK_16X16;
    498   } else if (tx_size == TX_32X32) {
    499     bs = BLOCK_32X32;
    500   } else {
    501     assert(0);
    502   }
    503 
    504   *out_skip = 1;
    505   for (j = 0; j < height; j += t) {
    506     for (k = 0; k < width; k += t) {
    507       int rate;
    508       int64_t dist;
    509       unsigned int sse;
    510       cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride,
    511                          &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride,
    512                          &sse);
    513       // sse works better than var, since there is no dc prediction used
    514       vp9_model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3,
    515                                    &rate, &dist);
    516       rate_sum += rate;
    517       dist_sum += dist;
    518       *out_skip &= (rate < 1024);
    519     }
    520   }
    521 
    522   *out_rate_sum = rate_sum;
    523   *out_dist_sum = dist_sum << 4;
    524 }
    525 
    526 int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
    527                           intptr_t block_size, int64_t *ssz) {
    528   int i;
    529   int64_t error = 0, sqcoeff = 0;
    530 
    531   for (i = 0; i < block_size; i++) {
    532     const int diff = coeff[i] - dqcoeff[i];
    533     error +=  diff * diff;
    534     sqcoeff += coeff[i] * coeff[i];
    535   }
    536 
    537   *ssz = sqcoeff;
    538   return error;
    539 }
    540 
/* Number of coefficients in each band, one row per transform size. Each
 * row sums to the number of coefficients in the transform (16, 64, 256,
 * 1024).
 *
 * The trailing '0' is a terminator which is used inside cost_coeffs() to
 * decide whether to include cost of a trailing EOB node or not (i.e. we
 * can skip this if the last coefficient in this transform block, e.g. the
 * 16th coefficient in a 4x4 block or the 64th coefficient in a 8x8 block,
 * were non-zero). */
static const int16_t band_counts[TX_SIZES][8] = {
  { 1, 2, 3, 4,  3,   16 - 13, 0 },
  { 1, 2, 3, 4, 11,   64 - 21, 0 },
  { 1, 2, 3, 4, 11,  256 - 21, 0 },
  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
// Returns the bit cost of the quantized coefficients of one transform block.
//
//   x, plane, block  identify the transform block; its eob and qcoeff are
//                    read from x->plane[plane].
//   A, L             above/left entropy contexts for this block. They are
//                    read to form the initial context and both overwritten
//                    with 1 if the block has any coefficient (eob > 0),
//                    else 0.
//   tx_size          transform size; must match the size in the mode info.
//   scan, nb         scan order and neighbor table used to derive contexts.
//   use_fast_coef_costing
//                    if non-zero, the context of each AC token (and of the
//                    final EOB) is taken as !prev_t instead of being derived
//                    from the neighbors via get_coef_context().
static INLINE int cost_coeffs(MACROBLOCK *x,
                              int plane, int block,
                              ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                              TX_SIZE tx_size,
                              const int16_t *scan, const int16_t *nb,
                              int use_fast_coef_costing) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
  const struct macroblock_plane *p = &x->plane[plane];
  const struct macroblockd_plane *pd = &xd->plane[plane];
  const PLANE_TYPE type = pd->plane_type;
  // Start at the second band: the first band holds only the DC coefficient,
  // which is costed separately below.
  const int16_t *band_count = &band_counts[tx_size][1];
  const int eob = p->eobs[block];
  const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
  // Points at the cost table of the current band; advanced with
  // ++token_costs whenever a band is exhausted.
  unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
                   x->token_costs[tx_size][type][is_inter_block(mbmi)];
  uint8_t token_cache[32 * 32];
  int pt = combine_entropy_contexts(*A, *L);
  int c, cost;
  // Check for consistency of tx_size with mode info
  assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
                              : get_uv_tx_size(mbmi) == tx_size);

  if (eob == 0) {
    // single eob token
    cost = token_costs[0][0][pt][EOB_TOKEN];
    c = 0;
  } else {
    int band_left = *band_count++;

    // dc token
    int v = qcoeff[0];
    int prev_t = vp9_dct_value_tokens_ptr[v].token;
    cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
    token_cache[0] = vp9_pt_energy_class[prev_t];
    ++token_costs;

    // ac tokens
    for (c = 1; c < eob; c++) {
      const int rc = scan[c];
      int t;

      v = qcoeff[rc];
      t = vp9_dct_value_tokens_ptr[v].token;
      // The second table index is !prev_t, i.e. 1 when the previous token
      // value was 0.
      if (use_fast_coef_costing) {
        cost += (*token_costs)[!prev_t][!prev_t][t] + vp9_dct_value_cost_ptr[v];
      } else {
        pt = get_coef_context(nb, token_cache, c);
        cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
        token_cache[rc] = vp9_pt_energy_class[t];
      }
      prev_t = t;
      if (!--band_left) {
        band_left = *band_count++;
        ++token_costs;
      }
    }

    // eob token
    // band_left is 0 only when the terminator entry of band_counts was
    // reached, i.e. every coefficient position is coded and no EOB follows.
    if (band_left) {
      if (use_fast_coef_costing) {
        cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
      } else {
        pt = get_coef_context(nb, token_cache, c);
        cost += (*token_costs)[0][pt][EOB_TOKEN];
      }
    }
  }

  // Both contexts become 1 if at least one coefficient was coded, else 0.
  *A = *L = (c > 0);

  return cost;
}
    626 static void dist_block(int plane, int block, TX_SIZE tx_size,
    627                        struct rdcost_block_args* args) {
    628   const int ss_txfrm_size = tx_size << 1;
    629   MACROBLOCK* const x = args->x;
    630   MACROBLOCKD* const xd = &x->e_mbd;
    631   const struct macroblock_plane *const p = &x->plane[plane];
    632   const struct macroblockd_plane *const pd = &xd->plane[plane];
    633   int64_t this_sse;
    634   int shift = tx_size == TX_32X32 ? 0 : 2;
    635   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
    636   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    637   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
    638                                &this_sse) >> shift;
    639   args->sse  = this_sse >> shift;
    640 
    641   if (x->skip_encode && !is_inter_block(&xd->mi[0]->mbmi)) {
    642     // TODO(jingning): tune the model to better capture the distortion.
    643     int64_t p = (pd->dequant[1] * pd->dequant[1] *
    644                     (1 << ss_txfrm_size)) >> (shift + 2);
    645     args->dist += (p >> 4);
    646     args->sse  += p;
    647   }
    648 }
    649 
    650 static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
    651                        TX_SIZE tx_size, struct rdcost_block_args* args) {
    652   int x_idx, y_idx;
    653   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x_idx, &y_idx);
    654 
    655   args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
    656                            args->t_left + y_idx, tx_size,
    657                            args->so->scan, args->so->neighbors,
    658                            args->use_fast_coef_costing);
    659 }
    660 
    661 static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
    662                           TX_SIZE tx_size, void *arg) {
    663   struct rdcost_block_args *args = arg;
    664   MACROBLOCK *const x = args->x;
    665   MACROBLOCKD *const xd = &x->e_mbd;
    666   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
    667   int64_t rd1, rd2, rd;
    668 
    669   if (args->skip)
    670     return;
    671 
    672   if (!is_inter_block(mbmi))
    673     vp9_encode_block_intra(x, plane, block, plane_bsize, tx_size, &mbmi->skip);
    674   else
    675     vp9_xform_quant(x, plane, block, plane_bsize, tx_size);
    676 
    677   dist_block(plane, block, tx_size, args);
    678   rate_block(plane, block, plane_bsize, tx_size, args);
    679   rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
    680   rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
    681 
    682   // TODO(jingning): temporarily enabled only for luma component
    683   rd = MIN(rd1, rd2);
    684   if (plane == 0)
    685     x->zcoeff_blk[tx_size][block] = !x->plane[plane].eobs[block] ||
    686                                     (rd1 > rd2 && !xd->lossless);
    687 
    688   args->this_rate += args->rate;
    689   args->this_dist += args->dist;
    690   args->this_sse  += args->sse;
    691   args->this_rd += rd;
    692 
    693   if (args->this_rd > args->best_rd) {
    694     args->skip = 1;
    695     return;
    696   }
    697 }
    698 
    699 void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
    700                               const struct macroblockd_plane *pd,
    701                               ENTROPY_CONTEXT t_above[16],
    702                               ENTROPY_CONTEXT t_left[16]) {
    703   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
    704   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
    705   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
    706   const ENTROPY_CONTEXT *const above = pd->above_context;
    707   const ENTROPY_CONTEXT *const left = pd->left_context;
    708 
    709   int i;
    710   switch (tx_size) {
    711     case TX_4X4:
    712       vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
    713       vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
    714       break;
    715     case TX_8X8:
    716       for (i = 0; i < num_4x4_w; i += 2)
    717         t_above[i] = !!*(const uint16_t *)&above[i];
    718       for (i = 0; i < num_4x4_h; i += 2)
    719         t_left[i] = !!*(const uint16_t *)&left[i];
    720       break;
    721     case TX_16X16:
    722       for (i = 0; i < num_4x4_w; i += 4)
    723         t_above[i] = !!*(const uint32_t *)&above[i];
    724       for (i = 0; i < num_4x4_h; i += 4)
    725         t_left[i] = !!*(const uint32_t *)&left[i];
    726       break;
    727     case TX_32X32:
    728       for (i = 0; i < num_4x4_w; i += 8)
    729         t_above[i] = !!*(const uint64_t *)&above[i];
    730       for (i = 0; i < num_4x4_h; i += 8)
    731         t_left[i] = !!*(const uint64_t *)&left[i];
    732       break;
    733     default:
    734       assert(0 && "Invalid transform size.");
    735   }
    736 }
    737 
    738 static void txfm_rd_in_plane(MACROBLOCK *x,
    739                              int *rate, int64_t *distortion,
    740                              int *skippable, int64_t *sse,
    741                              int64_t ref_best_rd, int plane,
    742                              BLOCK_SIZE bsize, TX_SIZE tx_size,
    743                              int use_fast_coef_casting) {
    744   MACROBLOCKD *const xd = &x->e_mbd;
    745   const struct macroblockd_plane *const pd = &xd->plane[plane];
    746   struct rdcost_block_args args = { 0 };
    747   args.x = x;
    748   args.best_rd = ref_best_rd;
    749   args.use_fast_coef_costing = use_fast_coef_casting;
    750 
    751   if (plane == 0)
    752     xd->mi[0]->mbmi.tx_size = tx_size;
    753 
    754   vp9_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
    755 
    756   args.so = get_scan(xd, tx_size, pd->plane_type, 0);
    757 
    758   vp9_foreach_transformed_block_in_plane(xd, bsize, plane,
    759                                          block_rd_txfm, &args);
    760   if (args.skip) {
    761     *rate       = INT_MAX;
    762     *distortion = INT64_MAX;
    763     *sse        = INT64_MAX;
    764     *skippable  = 0;
    765   } else {
    766     *distortion = args.this_dist;
    767     *rate       = args.this_rate;
    768     *sse        = args.this_sse;
    769     *skippable  = vp9_is_skippable_in_plane(x, bsize, plane);
    770   }
    771 }
    772 
    773 static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
    774                                      int *rate, int64_t *distortion,
    775                                      int *skip, int64_t *sse,
    776                                      int64_t ref_best_rd,
    777                                      BLOCK_SIZE bs) {
    778   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
    779   VP9_COMMON *const cm = &cpi->common;
    780   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
    781   MACROBLOCKD *const xd = &x->e_mbd;
    782   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
    783 
    784   mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
    785 
    786   txfm_rd_in_plane(x, rate, distortion, skip,
    787                    &sse[mbmi->tx_size], ref_best_rd, 0, bs,
    788                    mbmi->tx_size, cpi->sf.use_fast_coef_costing);
    789   cpi->tx_stepdown_count[0]++;
    790 }
    791 
    792 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
    793                                      int (*r)[2], int *rate,
    794                                      int64_t *d, int64_t *distortion,
    795                                      int *s, int *skip,
    796                                      int64_t tx_cache[TX_MODES],
    797                                      BLOCK_SIZE bs) {
    798   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
    799   VP9_COMMON *const cm = &cpi->common;
    800   MACROBLOCKD *const xd = &x->e_mbd;
    801   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
    802   vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
    803   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
    804                              {INT64_MAX, INT64_MAX},
    805                              {INT64_MAX, INT64_MAX},
    806                              {INT64_MAX, INT64_MAX}};
    807   int n, m;
    808   int s0, s1;
    809   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
    810   int64_t best_rd = INT64_MAX;
    811   TX_SIZE best_tx = TX_4X4;
    812 
    813   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
    814   assert(skip_prob > 0);
    815   s0 = vp9_cost_bit(skip_prob, 0);
    816   s1 = vp9_cost_bit(skip_prob, 1);
    817 
    818   for (n = TX_4X4; n <= max_tx_size; n++) {
    819     r[n][1] = r[n][0];
    820     if (r[n][0] < INT_MAX) {
    821       for (m = 0; m <= n - (n == max_tx_size); m++) {
    822         if (m == n)
    823           r[n][1] += vp9_cost_zero(tx_probs[m]);
    824         else
    825           r[n][1] += vp9_cost_one(tx_probs[m]);
    826       }
    827     }
    828     if (d[n] == INT64_MAX) {
    829       rd[n][0] = rd[n][1] = INT64_MAX;
    830     } else if (s[n]) {
    831       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
    832     } else {
    833       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
    834       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
    835     }
    836 
    837     if (rd[n][1] < best_rd) {
    838       best_tx = n;
    839       best_rd = rd[n][1];
    840     }
    841   }
    842   mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
    843                       best_tx : MIN(max_tx_size, max_mode_tx_size);
    844 
    845 
    846   *distortion = d[mbmi->tx_size];
    847   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
    848   *skip       = s[mbmi->tx_size];
    849 
    850   tx_cache[ONLY_4X4] = rd[TX_4X4][0];
    851   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
    852   tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
    853   tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
    854 
    855   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
    856     tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
    857     cpi->tx_stepdown_count[0]++;
    858   } else if (max_tx_size >= TX_16X16 && best_tx == TX_16X16) {
    859     tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
    860     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
    861   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
    862     tx_cache[TX_MODE_SELECT] = rd[TX_8X8][1];
    863     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
    864   } else {
    865     tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1];
    866     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
    867   }
    868 }
    869 
    870 static int64_t scaled_rd_cost(int rdmult, int rddiv,
    871                               int rate, int64_t dist, double scale) {
    872   return (int64_t) (RDCOST(rdmult, rddiv, rate, dist) * scale);
    873 }
    874 
    875 static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
    876                                           int (*r)[2], int *rate,
    877                                           int64_t *d, int64_t *distortion,
    878                                           int *s, int *skip, int64_t *sse,
    879                                           int64_t ref_best_rd,
    880                                           BLOCK_SIZE bs) {
    881   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
    882   VP9_COMMON *const cm = &cpi->common;
    883   MACROBLOCKD *const xd = &x->e_mbd;
    884   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
    885   vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
    886   int64_t rd[TX_SIZES][2] = {{INT64_MAX, INT64_MAX},
    887                              {INT64_MAX, INT64_MAX},
    888                              {INT64_MAX, INT64_MAX},
    889                              {INT64_MAX, INT64_MAX}};
    890   int n, m;
    891   int s0, s1;
    892   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
    893   const TX_SIZE max_mode_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
    894   int64_t best_rd = INT64_MAX;
    895   TX_SIZE best_tx = TX_4X4;
    896 
    897   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
    898   assert(skip_prob > 0);
    899   s0 = vp9_cost_bit(skip_prob, 0);
    900   s1 = vp9_cost_bit(skip_prob, 1);
    901 
    902   for (n = TX_4X4; n <= max_tx_size; n++) {
    903     double scale = scale_rd[n];
    904     r[n][1] = r[n][0];
    905     for (m = 0; m <= n - (n == max_tx_size); m++) {
    906       if (m == n)
    907         r[n][1] += vp9_cost_zero(tx_probs[m]);
    908       else
    909         r[n][1] += vp9_cost_one(tx_probs[m]);
    910     }
    911     if (s[n]) {
    912       rd[n][0] = rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, s1, d[n],
    913                                            scale);
    914     } else {
    915       rd[n][0] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][0] + s0, d[n],
    916                                 scale);
    917       rd[n][1] = scaled_rd_cost(x->rdmult, x->rddiv, r[n][1] + s0, d[n],
    918                                 scale);
    919     }
    920     if (rd[n][1] < best_rd) {
    921       best_rd = rd[n][1];
    922       best_tx = n;
    923     }
    924   }
    925 
    926   mbmi->tx_size = cm->tx_mode == TX_MODE_SELECT ?
    927                       best_tx : MIN(max_tx_size, max_mode_tx_size);
    928 
    929   // Actually encode using the chosen mode if a model was used, but do not
    930   // update the r, d costs
    931   txfm_rd_in_plane(x, rate, distortion, skip,
    932                    &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size,
    933                    cpi->sf.use_fast_coef_costing);
    934 
    935   if (max_tx_size == TX_32X32 && best_tx == TX_32X32) {
    936     cpi->tx_stepdown_count[0]++;
    937   } else if (max_tx_size >= TX_16X16 &&  best_tx == TX_16X16) {
    938     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
    939   } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
    940     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
    941   } else {
    942     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
    943   }
    944 }
    945 
    946 static void inter_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
    947                                   int64_t *distortion, int *skip,
    948                                   int64_t *psse, BLOCK_SIZE bs,
    949                                   int64_t txfm_cache[TX_MODES],
    950                                   int64_t ref_best_rd) {
    951   int r[TX_SIZES][2], s[TX_SIZES];
    952   int64_t d[TX_SIZES], sse[TX_SIZES];
    953   MACROBLOCKD *xd = &x->e_mbd;
    954   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
    955   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
    956   TX_SIZE tx_size;
    957 
    958   assert(bs == mbmi->sb_type);
    959 
    960   vp9_subtract_plane(x, bs, 0);
    961 
    962   if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
    963     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
    964     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
    965                              ref_best_rd, bs);
    966     if (psse)
    967       *psse = sse[mbmi->tx_size];
    968     return;
    969   }
    970 
    971   if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER) {
    972     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
    973       model_rd_for_sb_y_tx(cpi, bs, tx_size, x, xd,
    974                            &r[tx_size][0], &d[tx_size], &s[tx_size]);
    975     choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
    976                                   skip, sse, ref_best_rd, bs);
    977   } else {
    978     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
    979       txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
    980                        &s[tx_size], &sse[tx_size],
    981                        ref_best_rd, 0, bs, tx_size,
    982                        cpi->sf.use_fast_coef_costing);
    983     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
    984                              skip, txfm_cache, bs);
    985   }
    986   if (psse)
    987     *psse = sse[mbmi->tx_size];
    988 }
    989 
    990 static void intra_super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
    991                                   int64_t *distortion, int *skip,
    992                                   int64_t *psse, BLOCK_SIZE bs,
    993                                   int64_t txfm_cache[TX_MODES],
    994                                   int64_t ref_best_rd) {
    995   int64_t sse[TX_SIZES];
    996   MACROBLOCKD *xd = &x->e_mbd;
    997   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
    998 
    999   assert(bs == mbmi->sb_type);
   1000   if (cpi->sf.tx_size_search_method != USE_FULL_RD) {
   1001     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
   1002     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
   1003                              ref_best_rd, bs);
   1004   } else {
   1005     int r[TX_SIZES][2], s[TX_SIZES];
   1006     int64_t d[TX_SIZES];
   1007     TX_SIZE tx_size;
   1008     for (tx_size = TX_4X4; tx_size <= max_txsize_lookup[bs]; ++tx_size)
   1009       txfm_rd_in_plane(x, &r[tx_size][0], &d[tx_size],
   1010                        &s[tx_size], &sse[tx_size],
   1011                        ref_best_rd, 0, bs, tx_size,
   1012                        cpi->sf.use_fast_coef_costing);
   1013     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
   1014                              skip, txfm_cache, bs);
   1015   }
   1016   if (psse)
   1017     *psse = sse[mbmi->tx_size];
   1018 }
   1019 
   1020 
   1021 static int conditional_skipintra(MB_PREDICTION_MODE mode,
   1022                                  MB_PREDICTION_MODE best_intra_mode) {
   1023   if (mode == D117_PRED &&
   1024       best_intra_mode != V_PRED &&
   1025       best_intra_mode != D135_PRED)
   1026     return 1;
   1027   if (mode == D63_PRED &&
   1028       best_intra_mode != V_PRED &&
   1029       best_intra_mode != D45_PRED)
   1030     return 1;
   1031   if (mode == D207_PRED &&
   1032       best_intra_mode != H_PRED &&
   1033       best_intra_mode != D45_PRED)
   1034     return 1;
   1035   if (mode == D153_PRED &&
   1036       best_intra_mode != H_PRED &&
   1037       best_intra_mode != D135_PRED)
   1038     return 1;
   1039   return 0;
   1040 }
   1041 
   1042 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   1043                                      MB_PREDICTION_MODE *best_mode,
   1044                                      const int *bmode_costs,
   1045                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
   1046                                      int *bestrate, int *bestratey,
   1047                                      int64_t *bestdistortion,
   1048                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
   1049   MB_PREDICTION_MODE mode;
   1050   MACROBLOCKD *const xd = &x->e_mbd;
   1051   int64_t best_rd = rd_thresh;
   1052 
   1053   struct macroblock_plane *p = &x->plane[0];
   1054   struct macroblockd_plane *pd = &xd->plane[0];
   1055   const int src_stride = p->src.stride;
   1056   const int dst_stride = pd->dst.stride;
   1057   const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
   1058                                                             src_stride)];
   1059   uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
   1060                                                        dst_stride)];
   1061   ENTROPY_CONTEXT ta[2], tempa[2];
   1062   ENTROPY_CONTEXT tl[2], templ[2];
   1063 
   1064   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   1065   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   1066   int idx, idy;
   1067   uint8_t best_dst[8 * 8];
   1068 
   1069   assert(ib < 4);
   1070 
   1071   vpx_memcpy(ta, a, sizeof(ta));
   1072   vpx_memcpy(tl, l, sizeof(tl));
   1073   xd->mi[0]->mbmi.tx_size = TX_4X4;
   1074 
   1075   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
   1076     int64_t this_rd;
   1077     int ratey = 0;
   1078     int64_t distortion = 0;
   1079     int rate = bmode_costs[mode];
   1080 
   1081     if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
   1082       continue;
   1083 
   1084     // Only do the oblique modes if the best so far is
   1085     // one of the neighboring directional modes
   1086     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
   1087       if (conditional_skipintra(mode, *best_mode))
   1088           continue;
   1089     }
   1090 
   1091     vpx_memcpy(tempa, ta, sizeof(ta));
   1092     vpx_memcpy(templ, tl, sizeof(tl));
   1093 
   1094     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
   1095       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
   1096         const int block = ib + idy * 2 + idx;
   1097         const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
   1098         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
   1099         int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
   1100                                                             p->src_diff);
   1101         int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
   1102         xd->mi[0]->bmi[block].as_mode = mode;
   1103         vp9_predict_intra_block(xd, block, 1,
   1104                                 TX_4X4, mode,
   1105                                 x->skip_encode ? src : dst,
   1106                                 x->skip_encode ? src_stride : dst_stride,
   1107                                 dst, dst_stride, idx, idy, 0);
   1108         vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
   1109 
   1110         if (xd->lossless) {
   1111           const scan_order *so = &vp9_default_scan_orders[TX_4X4];
   1112           vp9_fwht4x4(src_diff, coeff, 8);
   1113           vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
   1114           ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
   1115                                so->scan, so->neighbors,
   1116                                cpi->sf.use_fast_coef_costing);
   1117           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
   1118             goto next;
   1119           vp9_iwht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, dst_stride,
   1120                           p->eobs[block]);
   1121         } else {
   1122           int64_t unused;
   1123           const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block);
   1124           const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type];
   1125           vp9_fht4x4(src_diff, coeff, 8, tx_type);
   1126           vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
   1127           ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
   1128                              so->scan, so->neighbors,
   1129                              cpi->sf.use_fast_coef_costing);
   1130           distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
   1131                                         16, &unused) >> 2;
   1132           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
   1133             goto next;
   1134           vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block),
   1135                          dst, dst_stride, p->eobs[block]);
   1136         }
   1137       }
   1138     }
   1139 
   1140     rate += ratey;
   1141     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
   1142 
   1143     if (this_rd < best_rd) {
   1144       *bestrate = rate;
   1145       *bestratey = ratey;
   1146       *bestdistortion = distortion;
   1147       best_rd = this_rd;
   1148       *best_mode = mode;
   1149       vpx_memcpy(a, tempa, sizeof(tempa));
   1150       vpx_memcpy(l, templ, sizeof(templ));
   1151       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
   1152         vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
   1153                    num_4x4_blocks_wide * 4);
   1154     }
   1155   next:
   1156     {}
   1157   }
   1158 
   1159   if (best_rd >= rd_thresh || x->skip_encode)
   1160     return best_rd;
   1161 
   1162   for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
   1163     vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
   1164                num_4x4_blocks_wide * 4);
   1165 
   1166   return best_rd;
   1167 }
   1168 
   1169 static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP *cpi, MACROBLOCK *mb,
   1170                                             int *rate, int *rate_y,
   1171                                             int64_t *distortion,
   1172                                             int64_t best_rd) {
   1173   int i, j;
   1174   const MACROBLOCKD *const xd = &mb->e_mbd;
   1175   MODE_INFO *const mic = xd->mi[0];
   1176   const MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
   1177   const MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
   1178   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   1179   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   1180   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   1181   int idx, idy;
   1182   int cost = 0;
   1183   int64_t total_distortion = 0;
   1184   int tot_rate_y = 0;
   1185   int64_t total_rd = 0;
   1186   ENTROPY_CONTEXT t_above[4], t_left[4];
   1187   const int *bmode_costs = mb->mbmode_cost;
   1188 
   1189   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
   1190   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
   1191 
   1192   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   1193   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
   1194     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
   1195       MB_PREDICTION_MODE best_mode = DC_PRED;
   1196       int r = INT_MAX, ry = INT_MAX;
   1197       int64_t d = INT64_MAX, this_rd = INT64_MAX;
   1198       i = idy * 2 + idx;
   1199       if (cpi->common.frame_type == KEY_FRAME) {
   1200         const MB_PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, i);
   1201         const MB_PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, i);
   1202 
   1203         bmode_costs  = mb->y_mode_costs[A][L];
   1204       }
   1205 
   1206       this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
   1207                                       t_above + idx, t_left + idy, &r, &ry, &d,
   1208                                       bsize, best_rd - total_rd);
   1209       if (this_rd >= best_rd - total_rd)
   1210         return INT64_MAX;
   1211 
   1212       total_rd += this_rd;
   1213       cost += r;
   1214       total_distortion += d;
   1215       tot_rate_y += ry;
   1216 
   1217       mic->bmi[i].as_mode = best_mode;
   1218       for (j = 1; j < num_4x4_blocks_high; ++j)
   1219         mic->bmi[i + j * 2].as_mode = best_mode;
   1220       for (j = 1; j < num_4x4_blocks_wide; ++j)
   1221         mic->bmi[i + j].as_mode = best_mode;
   1222 
   1223       if (total_rd >= best_rd)
   1224         return INT64_MAX;
   1225     }
   1226   }
   1227 
   1228   *rate = cost;
   1229   *rate_y = tot_rate_y;
   1230   *distortion = total_distortion;
   1231   mic->mbmi.mode = mic->bmi[3].as_mode;
   1232 
   1233   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
   1234 }
   1235 
   1236 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   1237                                       int *rate, int *rate_tokenonly,
   1238                                       int64_t *distortion, int *skippable,
   1239                                       BLOCK_SIZE bsize,
   1240                                       int64_t tx_cache[TX_MODES],
   1241                                       int64_t best_rd) {
   1242   MB_PREDICTION_MODE mode;
   1243   MB_PREDICTION_MODE mode_selected = DC_PRED;
   1244   MACROBLOCKD *const xd = &x->e_mbd;
   1245   MODE_INFO *const mic = xd->mi[0];
   1246   int this_rate, this_rate_tokenonly, s;
   1247   int64_t this_distortion, this_rd;
   1248   TX_SIZE best_tx = TX_4X4;
   1249   int i;
   1250   int *bmode_costs = x->mbmode_cost;
   1251 
   1252   if (cpi->sf.tx_size_search_method == USE_FULL_RD)
   1253     for (i = 0; i < TX_MODES; i++)
   1254       tx_cache[i] = INT64_MAX;
   1255 
   1256   /* Y Search for intra prediction mode */
   1257   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
   1258     int64_t local_tx_cache[TX_MODES];
   1259     MODE_INFO *above_mi = xd->mi[-xd->mi_stride];
   1260     MODE_INFO *left_mi = xd->left_available ? xd->mi[-1] : NULL;
   1261 
   1262     if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
   1263       continue;
   1264 
   1265     if (cpi->common.frame_type == KEY_FRAME) {
   1266       const MB_PREDICTION_MODE A = vp9_above_block_mode(mic, above_mi, 0);
   1267       const MB_PREDICTION_MODE L = vp9_left_block_mode(mic, left_mi, 0);
   1268 
   1269       bmode_costs = x->y_mode_costs[A][L];
   1270     }
   1271     mic->mbmi.mode = mode;
   1272 
   1273     intra_super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
   1274         &s, NULL, bsize, local_tx_cache, best_rd);
   1275 
   1276     if (this_rate_tokenonly == INT_MAX)
   1277       continue;
   1278 
   1279     this_rate = this_rate_tokenonly + bmode_costs[mode];
   1280     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
   1281 
   1282     if (this_rd < best_rd) {
   1283       mode_selected   = mode;
   1284       best_rd         = this_rd;
   1285       best_tx         = mic->mbmi.tx_size;
   1286       *rate           = this_rate;
   1287       *rate_tokenonly = this_rate_tokenonly;
   1288       *distortion     = this_distortion;
   1289       *skippable      = s;
   1290     }
   1291 
   1292     if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
   1293       for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
   1294         const int64_t adj_rd = this_rd + local_tx_cache[i] -
   1295             local_tx_cache[cpi->common.tx_mode];
   1296         if (adj_rd < tx_cache[i]) {
   1297           tx_cache[i] = adj_rd;
   1298         }
   1299       }
   1300     }
   1301   }
   1302 
   1303   mic->mbmi.mode = mode_selected;
   1304   mic->mbmi.tx_size = best_tx;
   1305 
   1306   return best_rd;
   1307 }
   1308 
   1309 static void super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x,
   1310                              int *rate, int64_t *distortion, int *skippable,
   1311                              int64_t *sse, BLOCK_SIZE bsize,
   1312                              int64_t ref_best_rd) {
   1313   MACROBLOCKD *const xd = &x->e_mbd;
   1314   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   1315   TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
   1316   int plane;
   1317   int pnrate = 0, pnskip = 1;
   1318   int64_t pndist = 0, pnsse = 0;
   1319 
   1320   if (ref_best_rd < 0)
   1321     goto term;
   1322 
   1323   if (is_inter_block(mbmi)) {
   1324     int plane;
   1325     for (plane = 1; plane < MAX_MB_PLANE; ++plane)
   1326       vp9_subtract_plane(x, bsize, plane);
   1327   }
   1328 
   1329   *rate = 0;
   1330   *distortion = 0;
   1331   *sse = 0;
   1332   *skippable = 1;
   1333 
   1334   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
   1335     txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
   1336                      ref_best_rd, plane, bsize, uv_txfm_size,
   1337                      cpi->sf.use_fast_coef_costing);
   1338     if (pnrate == INT_MAX)
   1339       goto term;
   1340     *rate += pnrate;
   1341     *distortion += pndist;
   1342     *sse += pnsse;
   1343     *skippable &= pnskip;
   1344   }
   1345   return;
   1346 
   1347   term:
   1348   *rate = INT_MAX;
   1349   *distortion = INT64_MAX;
   1350   *sse = INT64_MAX;
   1351   *skippable = 0;
   1352   return;
   1353 }
   1354 
   1355 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
   1356                                        PICK_MODE_CONTEXT *ctx,
   1357                                        int *rate, int *rate_tokenonly,
   1358                                        int64_t *distortion, int *skippable,
   1359                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
   1360   MACROBLOCKD *xd = &x->e_mbd;
   1361   MB_PREDICTION_MODE mode;
   1362   MB_PREDICTION_MODE mode_selected = DC_PRED;
   1363   int64_t best_rd = INT64_MAX, this_rd;
   1364   int this_rate_tokenonly, this_rate, s;
   1365   int64_t this_distortion, this_sse;
   1366 
   1367   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
   1368     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
   1369       continue;
   1370 
   1371     xd->mi[0]->mbmi.uv_mode = mode;
   1372 
   1373     super_block_uvrd(cpi, x, &this_rate_tokenonly,
   1374                      &this_distortion, &s, &this_sse, bsize, best_rd);
   1375     if (this_rate_tokenonly == INT_MAX)
   1376       continue;
   1377     this_rate = this_rate_tokenonly +
   1378                 x->intra_uv_mode_cost[cpi->common.frame_type][mode];
   1379     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
   1380 
   1381     if (this_rd < best_rd) {
   1382       mode_selected   = mode;
   1383       best_rd         = this_rd;
   1384       *rate           = this_rate;
   1385       *rate_tokenonly = this_rate_tokenonly;
   1386       *distortion     = this_distortion;
   1387       *skippable      = s;
   1388       if (!x->select_txfm_size) {
   1389         int i;
   1390         struct macroblock_plane *const p = x->plane;
   1391         struct macroblockd_plane *const pd = xd->plane;
   1392         for (i = 1; i < MAX_MB_PLANE; ++i) {
   1393           p[i].coeff    = ctx->coeff_pbuf[i][2];
   1394           p[i].qcoeff   = ctx->qcoeff_pbuf[i][2];
   1395           pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
   1396           p[i].eobs    = ctx->eobs_pbuf[i][2];
   1397 
   1398           ctx->coeff_pbuf[i][2]   = ctx->coeff_pbuf[i][0];
   1399           ctx->qcoeff_pbuf[i][2]  = ctx->qcoeff_pbuf[i][0];
   1400           ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
   1401           ctx->eobs_pbuf[i][2]    = ctx->eobs_pbuf[i][0];
   1402 
   1403           ctx->coeff_pbuf[i][0]   = p[i].coeff;
   1404           ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
   1405           ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
   1406           ctx->eobs_pbuf[i][0]    = p[i].eobs;
   1407         }
   1408       }
   1409     }
   1410   }
   1411 
   1412   xd->mi[0]->mbmi.uv_mode = mode_selected;
   1413   return best_rd;
   1414 }
   1415 
   1416 static int64_t rd_sbuv_dcpred(const VP9_COMP *cpi, MACROBLOCK *x,
   1417                               int *rate, int *rate_tokenonly,
   1418                               int64_t *distortion, int *skippable,
   1419                               BLOCK_SIZE bsize) {
   1420   const VP9_COMMON *cm = &cpi->common;
   1421   int64_t unused;
   1422 
   1423   x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
   1424   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
   1425                    skippable, &unused, bsize, INT64_MAX);
   1426   *rate = *rate_tokenonly + x->intra_uv_mode_cost[cm->frame_type][DC_PRED];
   1427   return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
   1428 }
   1429 
   1430 static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   1431                                  BLOCK_SIZE bsize, TX_SIZE max_tx_size,
   1432                                  int *rate_uv, int *rate_uv_tokenonly,
   1433                                  int64_t *dist_uv, int *skip_uv,
   1434                                  MB_PREDICTION_MODE *mode_uv) {
   1435   MACROBLOCK *const x = &cpi->mb;
   1436 
   1437   // Use an estimated rd for uv_intra based on DC_PRED if the
   1438   // appropriate speed flag is set.
   1439   if (cpi->sf.use_uv_intra_rd_estimate) {
   1440     rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv,
   1441                    skip_uv, bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   1442   // Else do a proper rd search for each possible transform size that may
   1443   // be considered in the main rd loop.
   1444   } else {
   1445     rd_pick_intra_sbuv_mode(cpi, x, ctx,
   1446                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
   1447                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
   1448   }
   1449   *mode_uv = x->e_mbd.mi[0]->mbmi.uv_mode;
   1450 }
   1451 
   1452 static int cost_mv_ref(const VP9_COMP *cpi, MB_PREDICTION_MODE mode,
   1453                        int mode_context) {
   1454   const MACROBLOCK *const x = &cpi->mb;
   1455   const int segment_id = x->e_mbd.mi[0]->mbmi.segment_id;
   1456 
   1457   // Don't account for mode here if segment skip is enabled.
   1458   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
   1459     assert(is_inter_mode(mode));
   1460     return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
   1461   } else {
   1462     return 0;
   1463   }
   1464 }
   1465 
        // Forward declaration: the definition is not in this part of the file.
        // rd_check_segment_txsize() below calls it for compound-prediction
        // NEWMV, passing the per-mode frame_mv row and the label's seg_mvs
        // row, and afterwards copies frame_mv back into seg_mvs.
   1466 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   1467                                 BLOCK_SIZE bsize,
   1468                                 int_mv *frame_mv,
   1469                                 int mi_row, int mi_col,
   1470                                 int_mv single_newmv[MAX_REF_FRAMES],
   1471                                 int *rate_mv);
   1472 
   1473 static int labels2mode(VP9_COMP *cpi, MACROBLOCKD *xd, int i,
   1474                        MB_PREDICTION_MODE mode,
   1475                        int_mv this_mv[2],
   1476                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
   1477                        int_mv seg_mvs[MAX_REF_FRAMES],
   1478                        int_mv *best_ref_mv[2],
   1479                        const int *mvjcost, int *mvcost[2]) {
   1480   MODE_INFO *const mic = xd->mi[0];
   1481   const MB_MODE_INFO *const mbmi = &mic->mbmi;
   1482   int thismvcost = 0;
   1483   int idx, idy;
   1484   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
   1485   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   1486   const int is_compound = has_second_ref(mbmi);
   1487 
   1488   // the only time we should do costing for new motion vector or mode
   1489   // is when we are on a new label  (jbb May 08, 2007)
   1490   switch (mode) {
   1491     case NEWMV:
   1492       this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
   1493       thismvcost += vp9_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
   1494                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
   1495       if (is_compound) {
   1496         this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
   1497         thismvcost += vp9_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
   1498                                       mvjcost, mvcost, MV_COST_WEIGHT_SUB);
   1499       }
   1500       break;
   1501     case NEARESTMV:
   1502       this_mv[0].as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
   1503       if (is_compound)
   1504         this_mv[1].as_int = frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
   1505       break;
   1506     case NEARMV:
   1507       this_mv[0].as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
   1508       if (is_compound)
   1509         this_mv[1].as_int = frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
   1510       break;
   1511     case ZEROMV:
   1512       this_mv[0].as_int = 0;
   1513       if (is_compound)
   1514         this_mv[1].as_int = 0;
   1515       break;
   1516     default:
   1517       break;
   1518   }
   1519 
   1520   mic->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
   1521   if (is_compound)
   1522     mic->bmi[i].as_mv[1].as_int = this_mv[1].as_int;
   1523 
   1524   mic->bmi[i].as_mode = mode;
   1525 
   1526   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
   1527     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
   1528       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
   1529                  &mic->bmi[i], sizeof(mic->bmi[i]));
   1530 
   1531   return cost_mv_ref(cpi, mode, mbmi->mode_context[mbmi->ref_frame[0]]) +
   1532             thismvcost;
   1533 }
   1534 
   1535 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   1536                                        MACROBLOCK *x,
   1537                                        int64_t best_yrd,
   1538                                        int i,
   1539                                        int *labelyrate,
   1540                                        int64_t *distortion, int64_t *sse,
   1541                                        ENTROPY_CONTEXT *ta,
   1542                                        ENTROPY_CONTEXT *tl,
   1543                                        int mi_row, int mi_col) {
   1544   int k;
   1545   MACROBLOCKD *xd = &x->e_mbd;
   1546   struct macroblockd_plane *const pd = &xd->plane[0];
   1547   struct macroblock_plane *const p = &x->plane[0];
   1548   MODE_INFO *const mi = xd->mi[0];
   1549   const BLOCK_SIZE plane_bsize = get_plane_block_size(mi->mbmi.sb_type, pd);
   1550   const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   1551   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   1552   int idx, idy;
   1553 
   1554   const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
   1555                                                              p->src.stride)];
   1556   uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
   1557                                                         pd->dst.stride)];
   1558   int64_t thisdistortion = 0, thissse = 0;
   1559   int thisrate = 0, ref;
   1560   const scan_order *so = &vp9_default_scan_orders[TX_4X4];
   1561   const int is_compound = has_second_ref(&mi->mbmi);
   1562   const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
   1563 
   1564   for (ref = 0; ref < 1 + is_compound; ++ref) {
   1565     const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
   1566                                                pd->pre[ref].stride)];
   1567     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
   1568                               dst, pd->dst.stride,
   1569                               &mi->bmi[i].as_mv[ref].as_mv,
   1570                               &xd->block_refs[ref]->sf, width, height, ref,
   1571                               kernel, MV_PRECISION_Q3,
   1572                               mi_col * MI_SIZE + 4 * (i % 2),
   1573                               mi_row * MI_SIZE + 4 * (i / 2));
   1574   }
   1575 
   1576   vp9_subtract_block(height, width,
   1577                      raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
   1578                      src, p->src.stride,
   1579                      dst, pd->dst.stride);
   1580 
   1581   k = i;
   1582   for (idy = 0; idy < height / 4; ++idy) {
   1583     for (idx = 0; idx < width / 4; ++idx) {
   1584       int64_t ssz, rd, rd1, rd2;
   1585       int16_t* coeff;
   1586 
   1587       k += (idy * 2 + idx);
   1588       coeff = BLOCK_OFFSET(p->coeff, k);
   1589       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
   1590                     coeff, 8);
   1591       vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
   1592       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
   1593                                         16, &ssz);
   1594       thissse += ssz;
   1595       thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
   1596                               so->scan, so->neighbors,
   1597                               cpi->sf.use_fast_coef_costing);
   1598       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
   1599       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
   1600       rd = MIN(rd1, rd2);
   1601       if (rd >= best_yrd)
   1602         return INT64_MAX;
   1603     }
   1604   }
   1605 
   1606   *distortion = thisdistortion >> 2;
   1607   *labelyrate = thisrate;
   1608   *sse = thissse >> 2;
   1609 
   1610   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
   1611 }
   1612 
        // Rate-distortion record for one (sub-block, inter mode) candidate, as
        // filled in by rd_check_segment_txsize().
   1613 typedef struct {
        // End-of-block value copied from the luma plane's eobs[] for this
        // sub-block after it has been encoded.
   1614   int eobs;
        // Mode/vector rate from labels2mode(); byrate is added to it once the
        // candidate has been encoded.
   1615   int brate;
        // Rate reported by encode_inter_mb_segment() (its labelyrate output).
   1616   int byrate;
        // Distortion reported by encode_inter_mb_segment().
   1617   int64_t bdist;
        // SSE reported by encode_inter_mb_segment().
   1618   int64_t bsse;
        // RD cost of the candidate; INT64_MAX while not evaluated or rejected.
   1619   int64_t brdcost;
        // Motion vector per reference ([1] is written only for compound).
   1620   int_mv mvs[2];
        // Above / left entropy contexts: seeded from the running contexts
        // before encoding and handed to encode_inter_mb_segment().
   1621   ENTROPY_CONTEXT ta[2];
   1622   ENTROPY_CONTEXT tl[2];
   1623 } SEG_RDSTAT;
   1624 
        // State and result of one sub-block mode search
        // (see rd_check_segment_txsize() and rd_pick_best_mbsegmentation()).
   1625 typedef struct {
        // Reference vectors used for motion search and vector costing;
        // [1] is the second reference's vector.
   1626   int_mv *ref_mv[2];
        // Vector the NEWMV search start point is derived from: initialised to
        // the first reference vector and, outside the best-quality modes,
        // replaced by an earlier sub-block's vector.
   1627   int_mv mvp;
   1628
        // In: RD cost to beat. Out: total RD cost of the chosen modes, or
        // INT64_MAX when the search was abandoned.
   1629   int64_t segment_rd;
        // Totals over all sub-blocks for the chosen modes: rate, distortion,
        // SSE, and the sum of the byrate fields.
   1630   int r;
   1631   int64_t d;
   1632   int64_t sse;
   1633   int segment_yrate;
        // Mode chosen for each of the four 4x4 positions.
   1634   MB_PREDICTION_MODE modes[4];
        // Per-position, per-mode candidate records.
   1635   SEG_RDSTAT rdstat[4][INTER_MODES];
        // Threshold from which the per-label NEWMV search cut-off is derived.
   1636   int mvthresh;
   1637 } BEST_SEG_INFO;
   1638 
   1639 static INLINE int mv_check_bounds(const MACROBLOCK *x, const MV *mv) {
   1640   return (mv->row >> 3) < x->mv_row_min ||
   1641          (mv->row >> 3) > x->mv_row_max ||
   1642          (mv->col >> 3) < x->mv_col_min ||
   1643          (mv->col >> 3) > x->mv_col_max;
   1644 }
   1645 
   1646 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
   1647   MB_MODE_INFO *const mbmi = &x->e_mbd.mi[0]->mbmi;
   1648   struct macroblock_plane *const p = &x->plane[0];
   1649   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
   1650 
   1651   p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
   1652   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
   1653   pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
   1654                                                        pd->pre[0].stride)];
   1655   if (has_second_ref(mbmi))
   1656     pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
   1657                                                          pd->pre[1].stride)];
   1658 }
   1659 
   1660 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
   1661                                   struct buf_2d orig_pre[2]) {
   1662   MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
   1663   x->plane[0].src = orig_src;
   1664   x->e_mbd.plane[0].pre[0] = orig_pre[0];
   1665   if (has_second_ref(mbmi))
   1666     x->e_mbd.plane[0].pre[1] = orig_pre[1];
   1667 }
   1668 
   1669 static INLINE int mv_has_subpel(const MV *mv) {
   1670   return (mv->row & 0x0F) || (mv->col & 0x0F);
   1671 }
   1672 
   1673 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way encode zero motion.
   1674 // TODO(aconverse): Find out if this is still productive then clean up or remove
   1675 static int check_best_zero_mv(
   1676     const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
   1677     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
   1678     int disable_inter_mode_mask, int this_mode, int ref_frame,
   1679     int second_ref_frame) {
   1680   if (!(disable_inter_mode_mask & (1 << INTER_OFFSET(ZEROMV))) &&
   1681       (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
   1682       frame_mv[this_mode][ref_frame].as_int == 0 &&
   1683       (second_ref_frame == NONE ||
   1684        frame_mv[this_mode][second_ref_frame].as_int == 0)) {
   1685     int rfc = mode_context[ref_frame];
   1686     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
   1687     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
   1688     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
   1689 
   1690     if (this_mode == NEARMV) {
   1691       if (c1 > c3) return 0;
   1692     } else if (this_mode == NEARESTMV) {
   1693       if (c2 > c3) return 0;
   1694     } else {
   1695       assert(this_mode == ZEROMV);
   1696       if (second_ref_frame == NONE) {
   1697         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frame].as_int == 0) ||
   1698             (c3 >= c1 && frame_mv[NEARMV][ref_frame].as_int == 0))
   1699           return 0;
   1700       } else {
   1701         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frame].as_int == 0 &&
   1702              frame_mv[NEARESTMV][second_ref_frame].as_int == 0) ||
   1703             (c3 >= c1 && frame_mv[NEARMV][ref_frame].as_int == 0 &&
   1704              frame_mv[NEARMV][second_ref_frame].as_int == 0))
   1705           return 0;
   1706       }
   1707     }
   1708   }
   1709   return 1;
   1710 }
   1711 
        // Searches the inter prediction modes for each sub-block ("label") of
        // the current block and records the outcome in bsi_buf[filter_idx].
        //
        // For every label the modes NEARESTMV..NEWMV are tried in turn. For
        // single-reference NEWMV with no cached vector in seg_mvs a motion
        // search is run first. Each candidate's rate, distortion and RD cost
        // are stored in bsi->rdstat[label][mode]; the cheapest mode is applied
        // with labels2mode() and its figures are added to the running totals.
        //
        // On normal completion bsi->r, d, sse, segment_yrate, segment_rd and
        // modes[] hold the totals and choices. If a label has no usable mode,
        // or the running RD cost exceeds the incoming bsi->segment_rd, the
        // brdcost entries of the remaining labels and bsi->segment_rd are set
        // to INT64_MAX and the function returns early.
        //
        // seg_mvs caches NEWMV vectors per label and reference frame; it is
        // both read and updated here.
   1712 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   1713                                     const TileInfo *const tile,
   1714                                     BEST_SEG_INFO *bsi_buf, int filter_idx,
   1715                                     int_mv seg_mvs[4][MAX_REF_FRAMES],
   1716                                     int mi_row, int mi_col) {
   1717   int k, br = 0, idx, idy;
   1718   int64_t bd = 0, block_sse = 0;
   1719   MB_PREDICTION_MODE this_mode;
   1720   MACROBLOCKD *xd = &x->e_mbd;
   1721   VP9_COMMON *cm = &cpi->common;
   1722   MODE_INFO *mi = xd->mi[0];
   1723   MB_MODE_INFO *const mbmi = &mi->mbmi;
   1724   struct macroblock_plane *const p = &x->plane[0];
   1725   struct macroblockd_plane *const pd = &xd->plane[0];
   1726   const int label_count = 4;
   1727   int64_t this_segment_rd = 0;
   1728   int label_mv_thresh;
   1729   int segmentyrate = 0;
   1730   const BLOCK_SIZE bsize = mbmi->sb_type;
   1731   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   1732   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   1733   vp9_variance_fn_ptr_t *v_fn_ptr = &cpi->fn_ptr[bsize];
   1734   ENTROPY_CONTEXT t_above[2], t_left[2];
   1735   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
   1736   int mode_idx;
   1737   int subpelmv = 1, have_ref = 0;
   1738   const int has_second_rf = has_second_ref(mbmi);
   1739   const int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
   1740
        // Working copies of the entropy contexts; they are advanced to the
        // chosen mode's contexts after each label is decided.
   1741   vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
   1742   vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
   1743
   1744   // The multiplier below is currently 1, which (per the original note)
   1745   // keeps the mv threshold roughly equal to what it is for macroblocks.
   1746   // A multiplier of 64 would make the threshold so large that mvs
   1747   // would very rarely be checked on segments.
   1748   label_mv_thresh = 1 * bsi->mvthresh / label_count;
   1749
   1750   // Visit each label of the 8x8 block in raster order; the loop strides
        // skip the positions already covered by a wider or taller label.
   1751   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
   1752     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
   1753       // TODO(jingning,rbultje): rewrite the rate-distortion optimization
   1754       // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
   1755       int_mv mode_mv[MB_MODE_COUNT][2];
   1756       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   1757       MB_PREDICTION_MODE mode_selected = ZEROMV;
   1758       int64_t best_rd = INT64_MAX;
   1759       const int i = idy * 2 + idx;
   1760       int ref;
   1761
        // Candidate vectors for this label: zero, plus the nearest / near
        // vectors supplied by vp9_append_sub8x8_mvs_for_idx().
   1762       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
   1763         const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
   1764         frame_mv[ZEROMV][frame].as_int = 0;
   1765         vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, i, ref, mi_row, mi_col,
   1766                                       &frame_mv[NEARESTMV][frame],
   1767                                       &frame_mv[NEARMV][frame]);
   1768       }
   1769
   1770       // search for the best motion vector on this segment
   1771       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
   1772         const struct buf_2d orig_src = x->plane[0].src;
   1773         struct buf_2d orig_pre[2];
   1774
   1775         mode_idx = INTER_OFFSET(this_mode);
        // Marked "not evaluated" until a cost is actually computed below.
   1776         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
   1777         if (disable_inter_mode_mask & (1 << mode_idx))
   1778           continue;
   1779
   1780         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
   1781                                 disable_inter_mode_mask,
   1782                                 this_mode, mbmi->ref_frame[0],
   1783                                 mbmi->ref_frame[1]))
   1784           continue;
   1785
        // Every candidate starts from the same entropy contexts.
   1786         vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
   1787         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
   1788                    sizeof(bsi->rdstat[i][mode_idx].ta));
   1789         vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
   1790                    sizeof(bsi->rdstat[i][mode_idx].tl));
   1791
   1792         // motion search for newmv (single predictor case only)
   1793         if (!has_second_rf && this_mode == NEWMV &&
   1794             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
   1795           int_mv *const new_mv = &mode_mv[NEWMV][0];
   1796           int step_param = 0;
   1797           int further_steps;
   1798           int thissme, bestsme = INT_MAX;
   1799           int sadpb = x->sadperbit4;
   1800           MV mvp_full;
   1801           int max_mv;
   1802
   1803           /* Is the best so far sufficiently good that we can't justify
   1804            * doing a new motion search? */
   1805           if (best_rd < label_mv_thresh)
   1806             break;
   1807
   1808           if (cpi->oxcf.mode != MODE_SECONDPASS_BEST &&
   1809               cpi->oxcf.mode != MODE_BESTQUALITY) {
   1810             // use previous block's result as next block's MV predictor.
   1811             if (i > 0) {
   1812               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
   1813               if (i == 2)
   1814                 bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
   1815             }
   1816           }
   1817           if (i == 0)
   1818             max_mv = x->max_mv_context[mbmi->ref_frame[0]];
   1819           else
   1820             max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
   1821
   1822           if (cpi->sf.auto_mv_step_size && cm->show_frame) {
   1823             // Take wtd average of the step_params based on the last frame's
   1824             // max mv magnitude and the best ref mvs of the current block for
   1825             // the given reference.
   1826             step_param = (vp9_init_search_range(cpi, max_mv) +
   1827                           cpi->mv_step_param) >> 1;
   1828           } else {
   1829             step_param = cpi->mv_step_param;
   1830           }
   1831
   1832           mvp_full.row = bsi->mvp.as_mv.row >> 3;
   1833           mvp_full.col = bsi->mvp.as_mv.col >> 3;
   1834
   1835           if (cpi->sf.adaptive_motion_search && cm->show_frame) {
   1836             mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3;
   1837             mvp_full.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3;
   1838             step_param = MAX(step_param, 8);
   1839           }
   1840
   1841           further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
   1842           // adjust src pointer for this block
   1843           mi_buf_shift(x, i);
   1844
   1845           vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
   1846
        // Full-pixel search using the configured search method. After
        // the hex / square / bigdia searches the result is re-scored
        // with vp9_get_mvpred_var().
   1847           if (cpi->sf.search_method == HEX) {
   1848             bestsme = vp9_hex_search(x, &mvp_full,
   1849                                      step_param,
   1850                                      sadpb, 1, v_fn_ptr, 1,
   1851                                      &bsi->ref_mv[0]->as_mv,
   1852                                      &new_mv->as_mv);
   1853             if (bestsme < INT_MAX)
   1854               bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
   1855                                            &bsi->ref_mv[0]->as_mv,
   1856                                            v_fn_ptr, 1);
   1857           } else if (cpi->sf.search_method == SQUARE) {
   1858             bestsme = vp9_square_search(x, &mvp_full,
   1859                                         step_param,
   1860                                         sadpb, 1, v_fn_ptr, 1,
   1861                                         &bsi->ref_mv[0]->as_mv,
   1862                                         &new_mv->as_mv);
   1863             if (bestsme < INT_MAX)
   1864               bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
   1865                                            &bsi->ref_mv[0]->as_mv,
   1866                                            v_fn_ptr, 1);
   1867           } else if (cpi->sf.search_method == BIGDIA) {
   1868             bestsme = vp9_bigdia_search(x, &mvp_full,
   1869                                         step_param,
   1870                                         sadpb, 1, v_fn_ptr, 1,
   1871                                         &bsi->ref_mv[0]->as_mv,
   1872                                         &new_mv->as_mv);
   1873             if (bestsme < INT_MAX)
   1874               bestsme = vp9_get_mvpred_var(x, &new_mv->as_mv,
   1875                                            &bsi->ref_mv[0]->as_mv,
   1876                                            v_fn_ptr, 1);
   1877           } else {
   1878             bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
   1879                                              sadpb, further_steps, 0, v_fn_ptr,
   1880                                              &bsi->ref_mv[0]->as_mv,
   1881                                              &new_mv->as_mv);
   1882           }
   1883
   1884           // Should we do a full search (best quality only)
   1885           if (cpi->oxcf.mode == MODE_BESTQUALITY ||
   1886               cpi->oxcf.mode == MODE_SECONDPASS_BEST) {
   1887             int_mv *const best_mv = &mi->bmi[i].as_mv[0];
   1888             /* Check if mvp_full is within the range. */
   1889             clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max,
   1890                      x->mv_row_min, x->mv_row_max);
   1891             thissme = cpi->full_search_sad(x, &mvp_full,
   1892                                            sadpb, 16, v_fn_ptr,
   1893                                            x->nmvjointcost, x->mvcost,
   1894                                            &bsi->ref_mv[0]->as_mv,
   1895                                            &best_mv->as_mv);
   1896             if (thissme < bestsme) {
   1897               bestsme = thissme;
   1898               new_mv->as_int = best_mv->as_int;
   1899             } else {
   1900               // The full search result is actually worse so re-instate the
   1901               // previous best vector
   1902               best_mv->as_int = new_mv->as_int;
   1903             }
   1904           }
   1905
        // Refine the full-pixel result to fractional precision.
   1906           if (bestsme < INT_MAX) {
   1907             int distortion;
   1908             cpi->find_fractional_mv_step(x,
   1909                                          &new_mv->as_mv,
   1910                                          &bsi->ref_mv[0]->as_mv,
   1911                                          cm->allow_high_precision_mv,
   1912                                          x->errorperbit, v_fn_ptr,
   1913                                          cpi->sf.subpel_force_stop,
   1914                                          cpi->sf.subpel_iters_per_step,
   1915                                          x->nmvjointcost, x->mvcost,
   1916                                          &distortion,
   1917                                          &x->pred_sse[mbmi->ref_frame[0]]);
   1918
   1919             // save motion search result for use in compound prediction
   1920             seg_mvs[i][mbmi->ref_frame[0]].as_int = new_mv->as_int;
   1921           }
   1922
   1923           if (cpi->sf.adaptive_motion_search)
   1924             x->pred_mv[mbmi->ref_frame[0]].as_int = new_mv->as_int;
   1925
   1926           // restore src pointers
   1927           mi_buf_restore(x, orig_src, orig_pre);
   1928         }
   1929
        // Compound prediction needs a cached vector for both references.
   1930         if (has_second_rf) {
   1931           if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
   1932               seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
   1933             continue;
   1934         }
   1935
        // Compound NEWMV with the EIGHTTAP filter: optionally run the joint
        // search and store its vectors back into seg_mvs.
   1936         if (has_second_rf && this_mode == NEWMV &&
   1937             mbmi->interp_filter == EIGHTTAP) {
   1938           // adjust src pointers
   1939           mi_buf_shift(x, i);
   1940           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
   1941             int rate_mv;
   1942             joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
   1943                                 mi_row, mi_col, seg_mvs[i],
   1944                                 &rate_mv);
   1945             seg_mvs[i][mbmi->ref_frame[0]].as_int =
   1946                 frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
   1947             seg_mvs[i][mbmi->ref_frame[1]].as_int =
   1948                 frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
   1949           }
   1950           // restore src pointers
   1951           mi_buf_restore(x, orig_src, orig_pre);
   1952         }
   1953
        // Applies the mode to mi->bmi[] and fills mode_mv[this_mode];
        // the return value is the mode + vector rate.
   1954         bsi->rdstat[i][mode_idx].brate =
   1955             labels2mode(cpi, xd, i, this_mode, mode_mv[this_mode], frame_mv,
   1956                         seg_mvs[i], bsi->ref_mv, x->nmvjointcost, x->mvcost);
   1957
        // Record the vector(s), also for the neighbouring position(s) that
        // a wider / taller label covers.
   1958         for (ref = 0; ref < 1 + has_second_rf; ++ref) {
   1959           bsi->rdstat[i][mode_idx].mvs[ref].as_int =
   1960               mode_mv[this_mode][ref].as_int;
   1961           if (num_4x4_blocks_wide > 1)
   1962             bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int =
   1963                 mode_mv[this_mode][ref].as_int;
   1964           if (num_4x4_blocks_high > 1)
   1965             bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
   1966                 mode_mv[this_mode][ref].as_int;
   1967         }
   1968
   1969         // Trap vectors that reach beyond the UMV borders
   1970         if (mv_check_bounds(x, &mode_mv[this_mode][0].as_mv) ||
   1971             (has_second_rf &&
   1972              mv_check_bounds(x, &mode_mv[this_mode][1].as_mv)))
   1973           continue;
   1974
        // For filter_idx > 0, try to reuse the record stored in an earlier
        // bsi_buf entry instead of re-encoding: this requires a vector
        // with no sub-pixel part (per mv_has_subpel()) that equals the
        // one recorded there. NOTE(review): the entries presumably
        // correspond to interpolation filters tried in order - confirm
        // against the caller.
   1975         if (filter_idx > 0) {
   1976           BEST_SEG_INFO *ref_bsi = bsi_buf;
   1977           subpelmv = 0;
   1978           have_ref = 1;
   1979
   1980           for (ref = 0; ref < 1 + has_second_rf; ++ref) {
   1981             subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
   1982             have_ref &= mode_mv[this_mode][ref].as_int ==
   1983                 ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
   1984           }
   1985
   1986           if (filter_idx > 1 && !subpelmv && !have_ref) {
   1987             ref_bsi = bsi_buf + 1;
   1988             have_ref = 1;
   1989             for (ref = 0; ref < 1 + has_second_rf; ++ref)
   1990               have_ref &= mode_mv[this_mode][ref].as_int ==
   1991                   ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
   1992           }
   1993
   1994           if (!subpelmv && have_ref &&
   1995               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
   1996             vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
   1997                        sizeof(SEG_RDSTAT));
   1998             if (num_4x4_blocks_wide > 1)
   1999               bsi->rdstat[i + 1][mode_idx].eobs =
   2000                   ref_bsi->rdstat[i + 1][mode_idx].eobs;
   2001             if (num_4x4_blocks_high > 1)
   2002               bsi->rdstat[i + 2][mode_idx].eobs =
   2003                   ref_bsi->rdstat[i + 2][mode_idx].eobs;
   2004
   2005             if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
   2006               mode_selected = this_mode;
   2007               best_rd = bsi->rdstat[i][mode_idx].brdcost;
   2008             }
   2009             continue;
   2010           }
   2011         }
   2012
        // Encode the label; the budget passed in is what remains of the
        // incoming segment_rd after the labels already decided.
   2013         bsi->rdstat[i][mode_idx].brdcost =
   2014             encode_inter_mb_segment(cpi, x,
   2015                                     bsi->segment_rd - this_segment_rd, i,
   2016                                     &bsi->rdstat[i][mode_idx].byrate,
   2017                                     &bsi->rdstat[i][mode_idx].bdist,
   2018                                     &bsi->rdstat[i][mode_idx].bsse,
   2019                                     bsi->rdstat[i][mode_idx].ta,
   2020                                     bsi->rdstat[i][mode_idx].tl,
   2021                                     mi_row, mi_col);
        // Fold the mode/vector rate into the cost and total rate, and
        // remember the end-of-block values produced by this encode.
   2022         if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
   2023           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
   2024                                             bsi->rdstat[i][mode_idx].brate, 0);
   2025           bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
   2026           bsi->rdstat[i][mode_idx].eobs = p->eobs[i];
   2027           if (num_4x4_blocks_wide > 1)
   2028             bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1];
   2029           if (num_4x4_blocks_high > 1)
   2030             bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2];
   2031         }
   2032
   2033         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
   2034           mode_selected = this_mode;
   2035           best_rd = bsi->rdstat[i][mode_idx].brdcost;
   2036         }
   2037       } /*for each 4x4 mode*/
   2038
        // No mode produced a finite cost for this label: invalidate the
        // remaining labels and abandon the search.
   2039       if (best_rd == INT64_MAX) {
   2040         int iy, midx;
   2041         for (iy = i + 1; iy < 4; ++iy)
   2042           for (midx = 0; midx < INTER_MODES; ++midx)
   2043             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
   2044         bsi->segment_rd = INT64_MAX;
   2045         return;
   2046       }
   2047
        // Commit the winner: adopt its entropy contexts and re-apply its
        // mode and vector(s) to mi->bmi[].
   2048       mode_idx = INTER_OFFSET(mode_selected);
   2049       vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
   2050       vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
   2051
   2052       labels2mode(cpi, xd, i, mode_selected, mode_mv[mode_selected],
   2053                   frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
   2054                   x->mvcost);
   2055
   2056       br += bsi->rdstat[i][mode_idx].brate;
   2057       bd += bsi->rdstat[i][mode_idx].bdist;
   2058       block_sse += bsi->rdstat[i][mode_idx].bsse;
   2059       segmentyrate += bsi->rdstat[i][mode_idx].byrate;
   2060       this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
   2061
        // Running total already exceeds the cost to beat: abandon.
   2062       if (this_segment_rd > bsi->segment_rd) {
   2063         int iy, midx;
   2064         for (iy = i + 1; iy < 4; ++iy)
   2065           for (midx = 0; midx < INTER_MODES; ++midx)
   2066             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
   2067         bsi->segment_rd = INT64_MAX;
   2068         return;
   2069       }
   2070     }
   2071   } /* for each label */
   2072
   2073   bsi->r = br;
   2074   bsi->d = bd;
   2075   bsi->segment_yrate = segmentyrate;
   2076   bsi->segment_rd = this_segment_rd;
   2077   bsi->sse = block_sse;
   2078
   2079   // update the coding decisions
   2080   for (k = 0; k < 4; ++k)
   2081     bsi->modes[k] = mi->bmi[k].as_mode;
   2082 }
   2083 
   2084 static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   2085                                            const TileInfo *const tile,
   2086                                            int_mv *best_ref_mv,
   2087                                            int_mv *second_best_ref_mv,
   2088                                            int64_t best_rd,
   2089                                            int *returntotrate,
   2090                                            int *returnyrate,
   2091                                            int64_t *returndistortion,
   2092                                            int *skippable, int64_t *psse,
   2093                                            int mvthresh,
   2094                                            int_mv seg_mvs[4][MAX_REF_FRAMES],
   2095                                            BEST_SEG_INFO *bsi_buf,
   2096                                            int filter_idx,
   2097                                            int mi_row, int mi_col) {
   2098   int i;
   2099   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
   2100   MACROBLOCKD *xd = &x->e_mbd;
   2101   MODE_INFO *mi = xd->mi[0];
   2102   MB_MODE_INFO *mbmi = &mi->mbmi;
   2103   int mode_idx;
   2104 
   2105   vp9_zero(*bsi);
   2106 
   2107   bsi->segment_rd = best_rd;
   2108   bsi->ref_mv[0] = best_ref_mv;
   2109   bsi->ref_mv[1] = second_best_ref_mv;
   2110   bsi->mvp.as_int = best_ref_mv->as_int;
   2111   bsi->mvthresh = mvthresh;
   2112 
   2113   for (i = 0; i < 4; i++)
   2114     bsi->modes[i] = ZEROMV;
   2115 
   2116   rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs,
   2117                           mi_row, mi_col);
   2118 
   2119   if (bsi->segment_rd > best_rd)
   2120     return INT64_MAX;
   2121   /* set it to the best */
   2122   for (i = 0; i < 4; i++) {
   2123     mode_idx = INTER_OFFSET(bsi->modes[i]);
   2124     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
   2125     if (has_second_ref(mbmi))
   2126       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
   2127     x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
   2128     mi->bmi[i].as_mode = bsi->modes[i];
   2129   }
   2130 
   2131   /*
   2132    * used to set mbmi->mv.as_int
   2133    */
   2134   *returntotrate = bsi->r;
   2135   *returndistortion = bsi->d;
   2136   *returnyrate = bsi->segment_yrate;
   2137   *skippable = vp9_is_skippable_in_plane(x, BLOCK_8X8, 0);
   2138   *psse = bsi->sse;
   2139   mbmi->mode = bsi->modes[3];
   2140 
   2141   return bsi->segment_rd;
   2142 }
   2143 
   2144 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
   2145                     uint8_t *ref_y_buffer, int ref_y_stride,
   2146                     int ref_frame, BLOCK_SIZE block_size ) {
   2147   MACROBLOCKD *xd = &x->e_mbd;
   2148   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   2149   int_mv this_mv;
   2150   int i;
   2151   int zero_seen = 0;
   2152   int best_index = 0;
   2153   int best_sad = INT_MAX;
   2154   int this_sad = INT_MAX;
   2155   int max_mv = 0;
   2156 
   2157   uint8_t *src_y_ptr = x->plane[0].src.buf;
   2158   uint8_t *ref_y_ptr;
   2159   int row_offset, col_offset;
   2160   int num_mv_refs = MAX_MV_REF_CANDIDATES +
   2161                     (cpi->sf.adaptive_motion_search &&
   2162                      cpi->common.show_frame &&
   2163                      block_size < cpi->sf.max_partition_size);
   2164 
   2165   int_mv pred_mv[3];
   2166   pred_mv[0] = mbmi->ref_mvs[ref_frame][0];
   2167   pred_mv[1] = mbmi->ref_mvs[ref_frame][1];
   2168   pred_mv[2] = x->pred_mv[ref_frame];
   2169 
   2170   // Get the sad for each candidate reference mv
   2171   for (i = 0; i < num_mv_refs; i++) {
   2172     this_mv.as_int = pred_mv[i].as_int;
   2173 
   2174     max_mv = MAX(max_mv,
   2175                  MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
   2176     // only need to check zero mv once
   2177     if (!this_mv.as_int && zero_seen)
   2178       continue;
   2179 
   2180     zero_seen = zero_seen || !this_mv.as_int;
   2181 
   2182     row_offset = this_mv.as_mv.row >> 3;
   2183     col_offset = this_mv.as_mv.col >> 3;
   2184     ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
   2185 
   2186     // Find sad for current vector.
   2187     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
   2188                                            ref_y_ptr, ref_y_stride,
   2189                                            0x7fffffff);
   2190 
   2191     // Note if it is the best so far.
   2192     if (this_sad < best_sad) {
   2193       best_sad = this_sad;
   2194       best_index = i;
   2195     }
   2196   }
   2197 
   2198   // Note the index of the mv that worked best in the reference list.
   2199   x->mv_best_ref_index[ref_frame] = best_index;
   2200   x->max_mv_context[ref_frame] = max_mv;
   2201   x->pred_mv_sad[ref_frame] = best_sad;
   2202 }
   2203 
   2204 static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
   2205                                      unsigned int *ref_costs_single,
   2206                                      unsigned int *ref_costs_comp,
   2207                                      vp9_prob *comp_mode_p) {
   2208   VP9_COMMON *const cm = &cpi->common;
   2209   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   2210   int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
   2211                                              SEG_LVL_REF_FRAME);
   2212   if (seg_ref_active) {
   2213     vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
   2214     vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
   2215     *comp_mode_p = 128;
   2216   } else {
   2217     vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
   2218     vp9_prob comp_inter_p = 128;
   2219 
   2220     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
   2221       comp_inter_p = vp9_get_reference_mode_prob(cm, xd);
   2222       *comp_mode_p = comp_inter_p;
   2223     } else {
   2224       *comp_mode_p = 128;
   2225     }
   2226 
   2227     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
   2228 
   2229     if (cm->reference_mode != COMPOUND_REFERENCE) {
   2230       vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
   2231       vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
   2232       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
   2233 
   2234       if (cm->reference_mode == REFERENCE_MODE_SELECT)
   2235         base_cost += vp9_cost_bit(comp_inter_p, 0);
   2236 
   2237       ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
   2238           ref_costs_single[ALTREF_FRAME] = base_cost;
   2239       ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
   2240       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
   2241       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
   2242       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
   2243       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
   2244     } else {
   2245       ref_costs_single[LAST_FRAME]   = 512;
   2246       ref_costs_single[GOLDEN_FRAME] = 512;
   2247       ref_costs_single[ALTREF_FRAME] = 512;
   2248     }
   2249     if (cm->reference_mode != SINGLE_REFERENCE) {
   2250       vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
   2251       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
   2252 
   2253       if (cm->reference_mode == REFERENCE_MODE_SELECT)
   2254         base_cost += vp9_cost_bit(comp_inter_p, 1);
   2255 
   2256       ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
   2257       ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
   2258     } else {
   2259       ref_costs_comp[LAST_FRAME]   = 512;
   2260       ref_costs_comp[GOLDEN_FRAME] = 512;
   2261     }
   2262   }
   2263 }
   2264 
   2265 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   2266                          int mode_index,
   2267                          int_mv *ref_mv,
   2268                          int_mv *second_ref_mv,
   2269                          int64_t comp_pred_diff[REFERENCE_MODES],
   2270                          int64_t tx_size_diff[TX_MODES],
   2271                          int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
   2272   MACROBLOCKD *const xd = &x->e_mbd;
   2273 
   2274   // Take a snapshot of the coding context so it can be
   2275   // restored if we decide to encode this way
   2276   ctx->skip = x->skip;
   2277   ctx->best_mode_index = mode_index;
   2278   ctx->mic = *xd->mi[0];
   2279 
   2280   ctx->best_ref_mv[0].as_int = ref_mv->as_int;
   2281   ctx->best_ref_mv[1].as_int = second_ref_mv->as_int;
   2282 
   2283   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
   2284   ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
   2285   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
   2286 
   2287   vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
   2288   vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
   2289              sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
   2290 }
   2291 
   2292 static void setup_pred_block(const MACROBLOCKD *xd,
   2293                              struct buf_2d dst[MAX_MB_PLANE],
   2294                              const YV12_BUFFER_CONFIG *src,
   2295                              int mi_row, int mi_col,
   2296                              const struct scale_factors *scale,
   2297                              const struct scale_factors *scale_uv) {
   2298   int i;
   2299 
   2300   dst[0].buf = src->y_buffer;
   2301   dst[0].stride = src->y_stride;
   2302   dst[1].buf = src->u_buffer;
   2303   dst[2].buf = src->v_buffer;
   2304   dst[1].stride = dst[2].stride = src->uv_stride;
   2305 #if CONFIG_ALPHA
   2306   dst[3].buf = src->alpha_buffer;
   2307   dst[3].stride = src->alpha_stride;
   2308 #endif
   2309 
   2310   // TODO(jkoleszar): Make scale factors per-plane data
   2311   for (i = 0; i < MAX_MB_PLANE; i++) {
   2312     setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
   2313                      i ? scale_uv : scale,
   2314                      xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
   2315   }
   2316 }
   2317 
   2318 void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
   2319                             const TileInfo *const tile,
   2320                             MV_REFERENCE_FRAME ref_frame,
   2321                             BLOCK_SIZE block_size,
   2322                             int mi_row, int mi_col,
   2323                             int_mv frame_nearest_mv[MAX_REF_FRAMES],
   2324                             int_mv frame_near_mv[MAX_REF_FRAMES],
   2325                             struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
   2326   const VP9_COMMON *cm = &cpi->common;
   2327   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   2328   MACROBLOCKD *const xd = &x->e_mbd;
   2329   MODE_INFO *const mi = xd->mi[0];
   2330   int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame];
   2331   const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
   2332 
   2333   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   2334   // use the UV scaling factors.
   2335   setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
   2336 
   2337   // Gets an initial list of candidate vectors from neighbours and orders them
   2338   vp9_find_mv_refs(cm, xd, tile, mi, ref_frame, candidates, mi_row, mi_col);
   2339 
   2340   // Candidate refinement carried out at encoder and decoder
   2341   vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates,
   2342                         &frame_nearest_mv[ref_frame],
   2343                         &frame_near_mv[ref_frame]);
   2344 
   2345   // Further refinement that is encode side only to test the top few candidates
   2346   // in full and choose the best as the centre point for subsequent searches.
   2347   // The current implementation doesn't support scaling.
   2348   if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8)
   2349     mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride,
   2350             ref_frame, block_size);
   2351 }
   2352 
   2353 const YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(const VP9_COMP *cpi,
   2354                                                    int ref_frame) {
   2355   const VP9_COMMON *const cm = &cpi->common;
   2356   const int ref_idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
   2357   const int scaled_idx = cpi->scaled_ref_idx[ref_frame - 1];
   2358   return (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf : NULL;
   2359 }
   2360 
   2361 static INLINE int get_switchable_rate(const MACROBLOCK *x) {
   2362   const MACROBLOCKD *const xd = &x->e_mbd;
   2363   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   2364   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   2365   return SWITCHABLE_INTERP_RATE_FACTOR *
   2366              x->switchable_interp_costs[ctx][mbmi->interp_filter];
   2367 }
   2368 
   2369 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   2370                                  const TileInfo *const tile,
   2371                                  BLOCK_SIZE bsize,
   2372                                  int mi_row, int mi_col,
   2373                                  int_mv *tmp_mv, int *rate_mv) {
   2374   MACROBLOCKD *xd = &x->e_mbd;
   2375   VP9_COMMON *cm = &cpi->common;
   2376   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   2377   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   2378   int bestsme = INT_MAX;
   2379   int further_steps, step_param;
   2380   int sadpb = x->sadperbit16;
   2381   MV mvp_full;
   2382   int ref = mbmi->ref_frame[0];
   2383   MV ref_mv = mbmi->ref_mvs[ref][0].as_mv;
   2384 
   2385   int tmp_col_min = x->mv_col_min;
   2386   int tmp_col_max = x->mv_col_max;
   2387   int tmp_row_min = x->mv_row_min;
   2388   int tmp_row_max = x->mv_row_max;
   2389 
   2390   const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi,
   2391                                                                         ref);
   2392 
   2393   MV pred_mv[3];
   2394   pred_mv[0] = mbmi->ref_mvs[ref][0].as_mv;
   2395   pred_mv[1] = mbmi->ref_mvs[ref][1].as_mv;
   2396   pred_mv[2] = x->pred_mv[ref].as_mv;
   2397 
   2398   if (scaled_ref_frame) {
   2399     int i;
   2400     // Swap out the reference frame for a version that's been scaled to
   2401     // match the resolution of the current frame, allowing the existing
   2402     // motion search code to be used without additional modifications.
   2403     for (i = 0; i < MAX_MB_PLANE; i++)
   2404       backup_yv12[i] = xd->plane[i].pre[0];
   2405 
   2406     vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   2407   }
   2408 
   2409   vp9_set_mv_search_range(x, &ref_mv);
   2410 
   2411   // Work out the size of the first step in the mv step search.
   2412   // 0 here is maximum length first step. 1 is MAX >> 1 etc.
   2413   if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
   2414     // Take wtd average of the step_params based on the last frame's
   2415     // max mv magnitude and that based on the best ref mvs of the current
   2416     // block for the given reference.
   2417     step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
   2418                   cpi->mv_step_param) >> 1;
   2419   } else {
   2420     step_param = cpi->mv_step_param;
   2421   }
   2422 
   2423   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
   2424       cpi->common.show_frame) {
   2425     int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
   2426                                                        b_width_log2(bsize)));
   2427     step_param = MAX(step_param, boffset);
   2428   }
   2429 
   2430   if (cpi->sf.adaptive_motion_search) {
   2431     int bwl = b_width_log2_lookup[bsize];
   2432     int bhl = b_height_log2_lookup[bsize];
   2433     int i;
   2434     int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
   2435 
   2436     if (tlevel < 5)
   2437       step_param += 2;
   2438 
   2439     for (i = LAST_FRAME; i <= ALTREF_FRAME && cpi->common.show_frame; ++i) {
   2440       if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
   2441         x->pred_mv[ref].as_int = 0;
   2442         tmp_mv->as_int = INVALID_MV;
   2443 
   2444         if (scaled_ref_frame) {
   2445           int i;
   2446           for (i = 0; i < MAX_MB_PLANE; i++)
   2447             xd->plane[i].pre[0] = backup_yv12[i];
   2448         }
   2449         return;
   2450       }
   2451     }
   2452   }
   2453 
   2454   mvp_full = pred_mv[x->mv_best_ref_index[ref]];
   2455 
   2456   mvp_full.col >>= 3;
   2457   mvp_full.row >>= 3;
   2458 
   2459   // Further step/diamond searches as necessary
   2460   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
   2461 
   2462   if (cpi->sf.search_method == FAST_DIAMOND) {
   2463     bestsme = vp9_fast_dia_search(x, &mvp_full, step_param, sadpb, 0,
   2464                                   &cpi->fn_ptr[bsize], 1,
   2465                                   &ref_mv, &tmp_mv->as_mv);
   2466     if (bestsme < INT_MAX)
   2467       bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
   2468                                    &cpi->fn_ptr[bsize], 1);
   2469   } else if (cpi->sf.search_method == FAST_HEX) {
   2470     bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0,
   2471                                   &cpi->fn_ptr[bsize], 1,
   2472                                   &ref_mv, &tmp_mv->as_mv);
   2473     if (bestsme < INT_MAX)
   2474       bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
   2475                                    &cpi->fn_ptr[bsize], 1);
   2476   } else if (cpi->sf.search_method == HEX) {
   2477     bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1,
   2478                              &cpi->fn_ptr[bsize], 1,
   2479                              &ref_mv, &tmp_mv->as_mv);
   2480     if (bestsme < INT_MAX)
   2481       bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
   2482                                    &cpi->fn_ptr[bsize], 1);
   2483   } else if (cpi->sf.search_method == SQUARE) {
   2484     bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1,
   2485                                 &cpi->fn_ptr[bsize], 1,
   2486                                 &ref_mv, &tmp_mv->as_mv);
   2487     if (bestsme < INT_MAX)
   2488       bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
   2489                                    &cpi->fn_ptr[bsize], 1);
   2490   } else if (cpi->sf.search_method == BIGDIA) {
   2491     bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1,
   2492                                 &cpi->fn_ptr[bsize], 1,
   2493                                 &ref_mv, &tmp_mv->as_mv);
   2494     if (bestsme < INT_MAX)
   2495       bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv,
   2496                                    &cpi->fn_ptr[bsize], 1);
   2497   } else {
   2498     bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
   2499                                      sadpb, further_steps, 1,
   2500                                      &cpi->fn_ptr[bsize],
   2501                                      &ref_mv, &tmp_mv->as_mv);
   2502   }
   2503 
   2504   x->mv_col_min = tmp_col_min;
   2505   x->mv_col_max = tmp_col_max;
   2506   x->mv_row_min = tmp_row_min;
   2507   x->mv_row_max = tmp_row_max;
   2508 
   2509   if (bestsme < INT_MAX) {
   2510     int dis;  /* TODO: use dis in distortion calculation later. */
   2511     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
   2512                                  cm->allow_high_precision_mv,
   2513                                  x->errorperbit,
   2514                                  &cpi->fn_ptr[bsize],
   2515                                  cpi->sf.subpel_force_stop,
   2516                                  cpi->sf.subpel_iters_per_step,
   2517                                  x->nmvjointcost, x->mvcost,
   2518                                  &dis, &x->pred_sse[ref]);
   2519   }
   2520   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
   2521                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   2522 
   2523   if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
   2524     x->pred_mv[ref].as_int = tmp_mv->as_int;
   2525 
   2526   if (scaled_ref_frame) {
   2527     int i;
   2528     for (i = 0; i < MAX_MB_PLANE; i++)
   2529       xd->plane[i].pre[0] = backup_yv12[i];
   2530   }
   2531 }
   2532 
   2533 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   2534                                 BLOCK_SIZE bsize,
   2535                                 int_mv *frame_mv,
   2536                                 int mi_row, int mi_col,
   2537                                 int_mv single_newmv[MAX_REF_FRAMES],
   2538                                 int *rate_mv) {
   2539   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
   2540   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
   2541   MACROBLOCKD *xd = &x->e_mbd;
   2542   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   2543   const int refs[2] = { mbmi->ref_frame[0],
   2544                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
   2545   int_mv ref_mv[2];
   2546   int ite, ref;
   2547   // Prediction buffer from second frame.
   2548   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
   2549   const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
   2550 
   2551   // Do joint motion search in compound mode to get more accurate mv.
   2552   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
   2553   struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
   2554   int last_besterr[2] = {INT_MAX, INT_MAX};
   2555   const YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
   2556     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
   2557     vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
   2558   };
   2559 
   2560   for (ref = 0; ref < 2; ++ref) {
   2561     ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
   2562 
   2563     if (scaled_ref_frame[ref]) {
   2564       int i;
   2565       // Swap out the reference frame for a version that's been scaled to
   2566       // match the resolution of the current frame, allowing the existing
   2567       // motion search code to be used without additional modifications.
   2568       for (i = 0; i < MAX_MB_PLANE; i++)
   2569         backup_yv12[ref][i] = xd->plane[i].pre[ref];
   2570       vp9_setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col,
   2571                            NULL);
   2572     }
   2573 
   2574     frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
   2575   }
   2576 
   2577   // Allow joint search multiple times iteratively for each ref frame
   2578   // and break out the search loop if it couldn't find better mv.
   2579   for (ite = 0; ite < 4; ite++) {
   2580     struct buf_2d ref_yv12[2];
   2581     int bestsme = INT_MAX;
   2582     int sadpb = x->sadperbit16;
   2583     int_mv tmp_mv;
   2584     int search_range = 3;
   2585 
   2586     int tmp_col_min = x->mv_col_min;
   2587     int tmp_col_max = x->mv_col_max;
   2588     int tmp_row_min = x->mv_row_min;
   2589     int tmp_row_max = x->mv_row_max;
   2590     int id = ite % 2;
   2591 
   2592     // Initialized here because of compiler problem in Visual Studio.
   2593     ref_yv12[0] = xd->plane[0].pre[0];
   2594     ref_yv12[1] = xd->plane[0].pre[1];
   2595 
   2596     // Get pred block from second frame.
   2597     vp9_build_inter_predictor(ref_yv12[!id].buf,
   2598                               ref_yv12[!id].stride,
   2599                               second_pred, pw,
   2600                               &frame_mv[refs[!id]].as_mv,
   2601                               &xd->block_refs[!id]->sf,
   2602                               pw, ph, 0,
   2603                               kernel, MV_PRECISION_Q3,
   2604                               mi_col * MI_SIZE, mi_row * MI_SIZE);
   2605 
   2606     // Compound motion search on first ref frame.
   2607     if (id)
   2608       xd->plane[0].pre[0] = ref_yv12[id];
   2609     vp9_set_mv_search_range(x, &ref_mv[id].as_mv);
   2610 
   2611     // Use mv result from single mode as mvp.
   2612     tmp_mv.as_int = frame_mv[refs[id]].as_int;
   2613 
   2614     tmp_mv.as_mv.col >>= 3;
   2615     tmp_mv.as_mv.row >>= 3;
   2616 
   2617     // Small-range full-pixel motion search
   2618     bestsme = vp9_refining_search_8p_c(x, &tmp_mv.as_mv, sadpb,
   2619                                        search_range,
   2620                                        &cpi->fn_ptr[bsize],
   2621                                        x->nmvjointcost, x->mvcost,
   2622                                        &ref_mv[id].as_mv, second_pred,
   2623                                        pw, ph);
   2624     if (bestsme < INT_MAX)
   2625       bestsme = vp9_get_mvpred_av_var(x, &tmp_mv.as_mv, &ref_mv[id].as_mv,
   2626                                       second_pred, &cpi->fn_ptr[bsize], 1);
   2627 
   2628     x->mv_col_min = tmp_col_min;
   2629     x->mv_col_max = tmp_col_max;
   2630     x->mv_row_min = tmp_row_min;
   2631     x->mv_row_max = tmp_row_max;
   2632 
   2633     if (bestsme < INT_MAX) {
   2634       int dis; /* TODO: use dis in distortion calculation later. */
   2635       unsigned int sse;
   2636       bestsme = cpi->find_fractional_mv_step_comp(
   2637           x, &tmp_mv.as_mv,
   2638           &ref_mv[id].as_mv,
   2639           cpi->common.allow_high_precision_mv,
   2640           x->errorperbit,
   2641           &cpi->fn_ptr[bsize],
   2642           0, cpi->sf.subpel_iters_per_step,
   2643           x->nmvjointcost, x->mvcost,
   2644           &dis, &sse, second_pred,
   2645           pw, ph);
   2646     }
   2647 
   2648     if (id)
   2649       xd->plane[0].pre[0] = scaled_first_yv12;
   2650 
   2651     if (bestsme < last_besterr[id]) {
   2652       frame_mv[refs[id]].as_int = tmp_mv.as_int;
   2653       last_besterr[id] = bestsme;
   2654     } else {
   2655       break;
   2656     }
   2657   }
   2658 
   2659   *rate_mv = 0;
   2660 
   2661   for (ref = 0; ref < 2; ++ref) {
   2662     if (scaled_ref_frame[ref]) {
   2663       // restore the predictor
   2664       int i;
   2665       for (i = 0; i < MAX_MB_PLANE; i++)
   2666         xd->plane[i].pre[ref] = backup_yv12[ref][i];
   2667     }
   2668 
   2669     *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
   2670                                 &mbmi->ref_mvs[refs[ref]][0].as_mv,
   2671                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   2672   }
   2673 
   2674   vpx_free(second_pred);
   2675 }
   2676 
   2677 static INLINE void restore_dst_buf(MACROBLOCKD *xd,
   2678                                    uint8_t *orig_dst[MAX_MB_PLANE],
   2679                                    int orig_dst_stride[MAX_MB_PLANE]) {
   2680   int i;
   2681   for (i = 0; i < MAX_MB_PLANE; i++) {
   2682     xd->plane[i].dst.buf = orig_dst[i];
   2683     xd->plane[i].dst.stride = orig_dst_stride[i];
   2684   }
   2685 }
   2686 
   2687 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   2688                                  const TileInfo *const tile,
   2689                                  BLOCK_SIZE bsize,
   2690                                  int64_t txfm_cache[],
   2691                                  int *rate2, int64_t *distortion,
   2692                                  int *skippable,
   2693                                  int *rate_y, int64_t *distortion_y,
   2694                                  int *rate_uv, int64_t *distortion_uv,
   2695                                  int *mode_excluded, int *disable_skip,
   2696                                  INTERP_FILTER *best_filter,
   2697                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
   2698                                  int mi_row, int mi_col,
   2699                                  int_mv single_newmv[MAX_REF_FRAMES],
   2700                                  int64_t *psse,
   2701                                  const int64_t ref_best_rd) {
   2702   VP9_COMMON *cm = &cpi->common;
   2703   MACROBLOCKD *xd = &x->e_mbd;
   2704   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   2705   const int is_comp_pred = has_second_ref(mbmi);
   2706   const int num_refs = is_comp_pred ? 2 : 1;
   2707   const int this_mode = mbmi->mode;
   2708   int_mv *frame_mv = mode_mv[this_mode];
   2709   int i;
   2710   int refs[2] = { mbmi->ref_frame[0],
   2711     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   2712   int_mv cur_mv[2];
   2713   int64_t this_rd = 0;
   2714   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
   2715   int pred_exists = 0;
   2716   int intpel_mv;
   2717   int64_t rd, best_rd = INT64_MAX;
   2718   int best_needs_copy = 0;
   2719   uint8_t *orig_dst[MAX_MB_PLANE];
   2720   int orig_dst_stride[MAX_MB_PLANE];
   2721   int rs = 0;
   2722 
   2723   if (is_comp_pred) {
   2724     if (frame_mv[refs[0]].as_int == INVALID_MV ||
   2725         frame_mv[refs[1]].as_int == INVALID_MV)
   2726       return INT64_MAX;
   2727   }
   2728 
   2729   if (this_mode == NEWMV) {
   2730     int rate_mv;
   2731     if (is_comp_pred) {
   2732       // Initialize mv using single prediction mode result.
   2733       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
   2734       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
   2735 
   2736       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
   2737         joint_motion_search(cpi, x, bsize, frame_mv,
   2738                             mi_row, mi_col, single_newmv, &rate_mv);
   2739       } else {
   2740         rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
   2741                                    &mbmi->ref_mvs[refs[0]][0].as_mv,
   2742                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   2743         rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
   2744                                    &mbmi->ref_mvs[refs[1]][0].as_mv,
   2745                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   2746       }
   2747       *rate2 += rate_mv;
   2748     } else {
   2749       int_mv tmp_mv;
   2750       single_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
   2751                            &tmp_mv, &rate_mv);
   2752       if (tmp_mv.as_int == INVALID_MV)
   2753         return INT64_MAX;
   2754       *rate2 += rate_mv;
   2755       frame_mv[refs[0]].as_int =
   2756           xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
   2757       single_newmv[refs[0]].as_int = tmp_mv.as_int;
   2758     }
   2759   }
   2760 
   2761   for (i = 0; i < num_refs; ++i) {
   2762     cur_mv[i] = frame_mv[refs[i]];
   2763     // Clip "next_nearest" so that it does not extend to far out of image
   2764     if (this_mode != NEWMV)
   2765       clamp_mv2(&cur_mv[i].as_mv, xd);
   2766 
   2767     if (mv_check_bounds(x, &cur_mv[i].as_mv))
   2768       return INT64_MAX;
   2769     mbmi->mv[i].as_int = cur_mv[i].as_int;
   2770   }
   2771 
   2772   // do first prediction into the destination buffer. Do the next
   2773   // prediction into a temporary buffer. Then keep track of which one
   2774   // of these currently holds the best predictor, and use the other
   2775   // one for future predictions. In the end, copy from tmp_buf to
   2776   // dst if necessary.
   2777   for (i = 0; i < MAX_MB_PLANE; i++) {
   2778     orig_dst[i] = xd->plane[i].dst.buf;
   2779     orig_dst_stride[i] = xd->plane[i].dst.stride;
   2780   }
   2781 
   2782   /* We don't include the cost of the second reference here, because there
   2783    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
   2784    * words if you present them in that order, the second one is always known
   2785    * if the first is known */
   2786   *rate2 += cost_mv_ref(cpi, this_mode, mbmi->mode_context[refs[0]]);
   2787 
   2788   if (!(*mode_excluded))
   2789     *mode_excluded = is_comp_pred ? cm->reference_mode == SINGLE_REFERENCE
   2790                                   : cm->reference_mode == COMPOUND_REFERENCE;
   2791 
   2792   pred_exists = 0;
   2793   // Are all MVs integer pel for Y and UV
   2794   intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
   2795   if (is_comp_pred)
   2796     intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
   2797 
   2798   // Search for best switchable filter by checking the variance of
   2799   // pred error irrespective of whether the filter will be used
   2800   cpi->mask_filter_rd = 0;
   2801   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
   2802     cpi->rd_filter_cache[i] = INT64_MAX;
   2803 
   2804   if (cm->interp_filter != BILINEAR) {
   2805     *best_filter = EIGHTTAP;
   2806     if (x->source_variance <
   2807         cpi->sf.disable_filter_search_var_thresh) {
   2808       *best_filter = EIGHTTAP;
   2809     } else {
   2810       int newbest;
   2811       int tmp_rate_sum = 0;
   2812       int64_t tmp_dist_sum = 0;
   2813 
   2814       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
   2815         int j;
   2816         int64_t rs_rd;
   2817         mbmi->interp_filter = i;
   2818         rs = get_switchable_rate(x);
   2819         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
   2820 
   2821         if (i > 0 && intpel_mv) {
   2822           rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
   2823           cpi->rd_filter_cache[i] = rd;
   2824           cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
   2825               MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
   2826           if (cm->interp_filter == SWITCHABLE)
   2827             rd += rs_rd;
   2828           cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, rd);
   2829         } else {
   2830           int rate_sum = 0;
   2831           int64_t dist_sum = 0;
   2832           if ((cm->interp_filter == SWITCHABLE &&
   2833                (!i || best_needs_copy)) ||
   2834               (cm->interp_filter != SWITCHABLE &&
   2835                (cm->interp_filter == mbmi->interp_filter ||
   2836                 (i == 0 && intpel_mv)))) {
   2837             restore_dst_buf(xd, orig_dst, orig_dst_stride);
   2838           } else {
   2839             for (j = 0; j < MAX_MB_PLANE; j++) {
   2840               xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
   2841               xd->plane[j].dst.stride = 64;
   2842             }
   2843           }
   2844           vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   2845           model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
   2846 
   2847           rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
   2848           cpi->rd_filter_cache[i] = rd;
   2849           cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
   2850               MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
   2851           if (cm->interp_filter == SWITCHABLE)
   2852             rd += rs_rd;
   2853           cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, rd);
   2854 
   2855           if (i == 0 && intpel_mv) {
   2856             tmp_rate_sum = rate_sum;
   2857             tmp_dist_sum = dist_sum;
   2858           }
   2859         }
   2860 
   2861         if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
   2862           if (rd / 2 > ref_best_rd) {
   2863             restore_dst_buf(xd, orig_dst, orig_dst_stride);
   2864             return INT64_MAX;
   2865           }
   2866         }
   2867         newbest = i == 0 || rd < best_rd;
   2868 
   2869         if (newbest) {
   2870           best_rd = rd;
   2871           *best_filter = mbmi->interp_filter;
   2872           if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
   2873             best_needs_copy = !best_needs_copy;
   2874         }
   2875 
   2876         if ((cm->interp_filter == SWITCHABLE && newbest) ||
   2877             (cm->interp_filter != SWITCHABLE &&
   2878              cm->interp_filter == mbmi->interp_filter)) {
   2879           pred_exists = 1;
   2880         }
   2881       }
   2882       restore_dst_buf(xd, orig_dst, orig_dst_stride);
   2883     }
   2884   }
   2885   // Set the appropriate filter
   2886   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
   2887       cm->interp_filter : *best_filter;
   2888   rs = cm->interp_filter == SWITCHABLE ? get_switchable_rate(x) : 0;
   2889 
   2890   if (pred_exists) {
   2891     if (best_needs_copy) {
   2892       // again temporarily set the buffers to local memory to prevent a memcpy
   2893       for (i = 0; i < MAX_MB_PLANE; i++) {
   2894         xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
   2895         xd->plane[i].dst.stride = 64;
   2896       }
   2897     }
   2898   } else {
   2899     // Handles the special case when a filter that is not in the
   2900     // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
   2901     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   2902   }
   2903 
   2904   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
   2905     int tmp_rate;
   2906     int64_t tmp_dist;
   2907     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
   2908     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
   2909     // if current pred_error modeled rd is substantially more than the best
   2910     // so far, do not bother doing full rd
   2911     if (rd / 2 > ref_best_rd) {
   2912       restore_dst_buf(xd, orig_dst, orig_dst_stride);
   2913       return INT64_MAX;
   2914     }
   2915   }
   2916 
   2917   if (cm->interp_filter == SWITCHABLE)
   2918     *rate2 += get_switchable_rate(x);
   2919 
   2920   if (!is_comp_pred) {
   2921     if (!x->in_active_map) {
   2922       if (psse)
   2923         *psse = 0;
   2924       *distortion = 0;
   2925       x->skip = 1;
   2926     } else if (cpi->allow_encode_breakout && x->encode_breakout) {
   2927       const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
   2928       const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
   2929       unsigned int var, sse;
   2930       // Skipping threshold for ac.
   2931       unsigned int thresh_ac;
   2932       // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
   2933       // Use extreme low threshold for static frames to limit skipping.
   2934       const unsigned int max_thresh = (cpi->allow_encode_breakout ==
   2935                                       ENCODE_BREAKOUT_LIMITED) ? 128 : 36000;
   2936       // The encode_breakout input
   2937       const unsigned int min_thresh =
   2938           MIN(((unsigned int)x->encode_breakout << 4), max_thresh);
   2939 
   2940       // Calculate threshold according to dequant value.
   2941       thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
   2942       thresh_ac = clamp(thresh_ac, min_thresh, max_thresh);
   2943 
   2944       var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
   2945                                    xd->plane[0].dst.buf,
   2946                                    xd->plane[0].dst.stride, &sse);
   2947 
   2948       // Adjust threshold according to partition size.
   2949       thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
   2950           b_height_log2_lookup[bsize]);
   2951 
   2952       // Y skipping condition checking
   2953       if (sse < thresh_ac || sse == 0) {
   2954         // Skipping threshold for dc
   2955         unsigned int thresh_dc;
   2956 
   2957         thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
   2958 
   2959         // dc skipping checking
   2960         if ((sse - var) < thresh_dc || sse == var) {
   2961           unsigned int sse_u, sse_v;
   2962           unsigned int var_u, var_v;
   2963 
   2964           var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
   2965                                           x->plane[1].src.stride,
   2966                                           xd->plane[1].dst.buf,
   2967                                           xd->plane[1].dst.stride, &sse_u);
   2968 
   2969           // U skipping condition checking
   2970           if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
   2971               (sse_u - var_u < thresh_dc || sse_u == var_u)) {
   2972             var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
   2973                                             x->plane[2].src.stride,
   2974                                             xd->plane[2].dst.buf,
   2975                                             xd->plane[2].dst.stride, &sse_v);
   2976 
   2977             // V skipping condition checking
   2978             if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
   2979                 (sse_v - var_v < thresh_dc || sse_v == var_v)) {
   2980               x->skip = 1;
   2981 
   2982               // The cost of skip bit needs to be added.
   2983               *rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
   2984 
   2985               // Scaling factor for SSE from spatial domain to frequency domain
   2986               // is 16. Adjust distortion accordingly.
   2987               *distortion_uv = (sse_u + sse_v) << 4;
   2988               *distortion = (sse << 4) + *distortion_uv;
   2989 
   2990               *disable_skip = 1;
   2991               this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
   2992             }
   2993           }
   2994         }
   2995       }
   2996     }
   2997   }
   2998 
   2999   if (!x->skip) {
   3000     int skippable_y, skippable_uv;
   3001     int64_t sseuv = INT64_MAX;
   3002     int64_t rdcosty = INT64_MAX;
   3003 
   3004     // Y cost and distortion
   3005     inter_super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
   3006                           bsize, txfm_cache, ref_best_rd);
   3007 
   3008     if (*rate_y == INT_MAX) {
   3009       *rate2 = INT_MAX;
   3010       *distortion = INT64_MAX;
   3011       restore_dst_buf(xd, orig_dst, orig_dst_stride);
   3012       return INT64_MAX;
   3013     }
   3014 
   3015     *rate2 += *rate_y;
   3016     *distortion += *distortion_y;
   3017 
   3018     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
   3019     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
   3020 
   3021     super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
   3022                      bsize, ref_best_rd - rdcosty);
   3023     if (*rate_uv == INT_MAX) {
   3024       *rate2 = INT_MAX;
   3025       *distortion = INT64_MAX;
   3026       restore_dst_buf(xd, orig_dst, orig_dst_stride);
   3027       return INT64_MAX;
   3028     }
   3029 
   3030     *psse += sseuv;
   3031     *rate2 += *rate_uv;
   3032     *distortion += *distortion_uv;
   3033     *skippable = skippable_y && skippable_uv;
   3034   }
   3035 
   3036   restore_dst_buf(xd, orig_dst, orig_dst_stride);
   3037   return this_rd;  // if 0, this will be re-calculated by caller
   3038 }
   3039 
   3040 static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   3041                            int max_plane) {
   3042   struct macroblock_plane *const p = x->plane;
   3043   struct macroblockd_plane *const pd = x->e_mbd.plane;
   3044   int i;
   3045 
   3046   for (i = 0; i < max_plane; ++i) {
   3047     p[i].coeff    = ctx->coeff_pbuf[i][1];
   3048     p[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
   3049     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
   3050     p[i].eobs    = ctx->eobs_pbuf[i][1];
   3051 
   3052     ctx->coeff_pbuf[i][1]   = ctx->coeff_pbuf[i][0];
   3053     ctx->qcoeff_pbuf[i][1]  = ctx->qcoeff_pbuf[i][0];
   3054     ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0];
   3055     ctx->eobs_pbuf[i][1]    = ctx->eobs_pbuf[i][0];
   3056 
   3057     ctx->coeff_pbuf[i][0]   = p[i].coeff;
   3058     ctx->qcoeff_pbuf[i][0]  = p[i].qcoeff;
   3059     ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
   3060     ctx->eobs_pbuf[i][0]    = p[i].eobs;
   3061   }
   3062 }
   3063 
   3064 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   3065                                int *returnrate, int64_t *returndist,
   3066                                BLOCK_SIZE bsize,
   3067                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
   3068   VP9_COMMON *const cm = &cpi->common;
   3069   MACROBLOCKD *const xd = &x->e_mbd;
   3070   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
   3071   int y_skip = 0, uv_skip = 0;
   3072   int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
   3073   TX_SIZE max_uv_tx_size;
   3074   x->skip_encode = 0;
   3075   ctx->skip = 0;
   3076   xd->mi[0]->mbmi.ref_frame[0] = INTRA_FRAME;
   3077 
   3078   if (bsize >= BLOCK_8X8) {
   3079     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
   3080                                &dist_y, &y_skip, bsize, tx_cache,
   3081                                best_rd) >= best_rd) {
   3082       *returnrate = INT_MAX;
   3083       return;
   3084     }
   3085     max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
   3086     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
   3087                             &dist_uv, &uv_skip, bsize, max_uv_tx_size);
   3088   } else {
   3089     y_skip = 0;
   3090     if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
   3091                                      &dist_y, best_rd) >= best_rd) {
   3092       *returnrate = INT_MAX;
   3093       return;
   3094     }
   3095     max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize);
   3096     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
   3097                             &dist_uv, &uv_skip, BLOCK_8X8, max_uv_tx_size);
   3098   }
   3099 
   3100   if (y_skip && uv_skip) {
   3101     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
   3102                   vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
   3103     *returndist = dist_y + dist_uv;
   3104     vp9_zero(ctx->tx_rd_diff);
   3105   } else {
   3106     int i;
   3107     *returnrate = rate_y + rate_uv + vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
   3108     *returndist = dist_y + dist_uv;
   3109     if (cpi->sf.tx_size_search_method == USE_FULL_RD)
   3110       for (i = 0; i < TX_MODES; i++) {
   3111         if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
   3112           ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
   3113         else
   3114           ctx->tx_rd_diff[i] = 0;
   3115       }
   3116   }
   3117 
   3118   ctx->mic = *xd->mi[0];
   3119 }
   3120 
   3121 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   3122                                   const TileInfo *const tile,
   3123                                   int mi_row, int mi_col,
   3124                                   int *returnrate,
   3125                                   int64_t *returndistortion,
   3126                                   BLOCK_SIZE bsize,
   3127                                   PICK_MODE_CONTEXT *ctx,
   3128                                   int64_t best_rd_so_far) {
   3129   VP9_COMMON *const cm = &cpi->common;
   3130   MACROBLOCKD *const xd = &x->e_mbd;
   3131   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   3132   const struct segmentation *const seg = &cm->seg;
   3133   MB_PREDICTION_MODE this_mode;
   3134   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   3135   unsigned char segment_id = mbmi->segment_id;
   3136   int comp_pred, i;
   3137   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   3138   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   3139   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
   3140   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
   3141                                     VP9_ALT_FLAG };
   3142   int64_t best_rd = best_rd_so_far;
   3143   int64_t best_tx_rd[TX_MODES];
   3144   int64_t best_tx_diff[TX_MODES];
   3145   int64_t best_pred_diff[REFERENCE_MODES];
   3146   int64_t best_pred_rd[REFERENCE_MODES];
   3147   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   3148   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   3149   MB_MODE_INFO best_mbmode = { 0 };
   3150   int mode_index, best_mode_index = 0;
   3151   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   3152   vp9_prob comp_mode_p;
   3153   int64_t best_intra_rd = INT64_MAX;
   3154   int64_t best_inter_rd = INT64_MAX;
   3155   MB_PREDICTION_MODE best_intra_mode = DC_PRED;
   3156   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
   3157   INTERP_FILTER tmp_best_filter = SWITCHABLE;
   3158   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
   3159   int64_t dist_uv[TX_SIZES];
   3160   int skip_uv[TX_SIZES];
   3161   MB_PREDICTION_MODE mode_uv[TX_SIZES];
   3162   int64_t mode_distortions[MB_MODE_COUNT] = {-1};
   3163   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
   3164   const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
   3165   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
   3166   int best_skip2 = 0;
   3167   int mode_skip_mask = 0;
   3168   int mode_skip_start = cpi->sf.mode_skip_start + 1;
   3169   const int *const rd_threshes = cpi->rd_threshes[segment_id][bsize];
   3170   const int *const rd_thresh_freq_fact = cpi->rd_thresh_freq_fact[bsize];
   3171   const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
   3172   const int intra_y_mode_mask =
   3173       cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
   3174   int disable_inter_mode_mask = cpi->sf.disable_inter_mode_mask[bsize];
   3175 
   3176   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   3177 
   3178   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
   3179                            &comp_mode_p);
   3180 
   3181   for (i = 0; i < REFERENCE_MODES; ++i)
   3182     best_pred_rd[i] = INT64_MAX;
   3183   for (i = 0; i < TX_MODES; i++)
   3184     best_tx_rd[i] = INT64_MAX;
   3185   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
   3186     best_filter_rd[i] = INT64_MAX;
   3187   for (i = 0; i < TX_SIZES; i++)
   3188     rate_uv_intra[i] = INT_MAX;
   3189   for (i = 0; i < MAX_REF_FRAMES; ++i)
   3190     x->pred_sse[i] = INT_MAX;
   3191 
   3192   *returnrate = INT_MAX;
   3193 
   3194   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
   3195     x->pred_mv_sad[ref_frame] = INT_MAX;
   3196     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
   3197       vp9_setup_buffer_inter(cpi, x, tile,
   3198                              ref_frame, bsize, mi_row, mi_col,
   3199                              frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
   3200     }
   3201     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   3202     frame_mv[ZEROMV][ref_frame].as_int = 0;
   3203   }
   3204 
   3205   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
   3206     // All modes from vp9_mode_order that use this frame as any ref
   3207     static const int ref_frame_mask_all[] = {
   3208         0x0, 0x123291, 0x25c444, 0x39b722
   3209     };
   3210     // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
   3211     // this frame as their primary ref
   3212     static const int ref_frame_mask_fixedmv[] = {
   3213         0x0, 0x121281, 0x24c404, 0x080102
   3214     };
   3215     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
   3216       // Skip modes for missing references
   3217       mode_skip_mask |= ref_frame_mask_all[ref_frame];
   3218     } else if (cpi->sf.reference_masking) {
   3219       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
   3220         // Skip fixed mv modes for poor references
   3221         if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
   3222           mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
   3223           break;
   3224         }
   3225       }
   3226     }
   3227     // If the segment reference frame feature is enabled....
   3228     // then do nothing if the current ref frame is not allowed..
   3229     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
   3230         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
   3231       mode_skip_mask |= ref_frame_mask_all[ref_frame];
   3232     }
   3233   }
   3234 
   3235   // If the segment skip feature is enabled....
   3236   // then do nothing if the current mode is not allowed..
   3237   if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
   3238     const int inter_non_zero_mode_mask = 0x1F7F7;
   3239     mode_skip_mask |= inter_non_zero_mode_mask;
   3240   }
   3241 
   3242   // Disable this drop out case if the ref frame
   3243   // segment level feature is enabled for this segment. This is to
   3244   // prevent the possibility that we end up unable to pick any mode.
   3245   if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
   3246     // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
   3247     // unless ARNR filtering is enabled in which case we want
   3248     // an unfiltered alternative. We allow near/nearest as well
   3249     // because they may result in zero-zero MVs but be cheaper.
   3250     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
   3251       const int altref_zero_mask =
   3252           ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
   3253       mode_skip_mask |= altref_zero_mask;
   3254       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
   3255         mode_skip_mask |= (1 << THR_NEARA);
   3256       if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
   3257         mode_skip_mask |= (1 << THR_NEARESTA);
   3258     }
   3259   }
   3260 
   3261   // TODO(JBB): This is to make up for the fact that we don't have sad
   3262   // functions that work when the block size reads outside the umv.  We
   3263   // should fix this either by making the motion search just work on
   3264   // a representative block in the boundary ( first ) and then implement a
   3265   // function that does sads when inside the border..
   3266   if ((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) {
   3267     const int new_modes_mask =
   3268         (1 << THR_NEWMV) | (1 << THR_NEWG) | (1 << THR_NEWA) |
   3269         (1 << THR_COMP_NEWLA) | (1 << THR_COMP_NEWGA);
   3270     mode_skip_mask |= new_modes_mask;
   3271   }
   3272 
   3273   if (bsize > cpi->sf.max_intra_bsize) {
   3274     mode_skip_mask |= 0xFF30808;
   3275   }
   3276 
   3277   if (!x->in_active_map) {
   3278     int mode_index;
   3279     assert(cpi->ref_frame_flags & VP9_LAST_FLAG);
   3280     if (frame_mv[NEARESTMV][LAST_FRAME].as_int == 0)
   3281       mode_index = THR_NEARESTMV;
   3282     else if (frame_mv[NEARMV][LAST_FRAME].as_int == 0)
   3283       mode_index = THR_NEARMV;
   3284     else
   3285       mode_index = THR_ZEROMV;
   3286     mode_skip_mask = ~(1 << mode_index);
   3287     mode_skip_start = MAX_MODES;
   3288     disable_inter_mode_mask = 0;
   3289   }
   3290 
   3291   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
   3292     int mode_excluded = 0;
   3293     int64_t this_rd = INT64_MAX;
   3294     int disable_skip = 0;
   3295     int compmode_cost = 0;
   3296     int rate2 = 0, rate_y = 0, rate_uv = 0;
   3297     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
   3298     int skippable = 0;
   3299     int64_t tx_cache[TX_MODES];
   3300     int i;
   3301     int this_skip2 = 0;
   3302     int64_t total_sse = INT64_MAX;
   3303     int early_term = 0;
   3304 
   3305     // Look at the reference frame of the best mode so far and set the
   3306     // skip mask to look at a subset of the remaining modes.
   3307     if (mode_index == mode_skip_start) {
   3308       switch (vp9_mode_order[best_mode_index].ref_frame[0]) {
   3309         case INTRA_FRAME:
   3310           break;
   3311         case LAST_FRAME:
   3312           mode_skip_mask |= LAST_FRAME_MODE_MASK;
   3313           break;
   3314         case GOLDEN_FRAME:
   3315           mode_skip_mask |= GOLDEN_FRAME_MODE_MASK;
   3316           break;
   3317         case ALTREF_FRAME:
   3318           mode_skip_mask |= ALT_REF_MODE_MASK;
   3319           break;
   3320         case NONE:
   3321         case MAX_REF_FRAMES:
   3322           assert(0 && "Invalid Reference frame");
   3323       }
   3324     }
   3325     if (mode_skip_mask & (1 << mode_index))
   3326       continue;
   3327 
   3328     // Test best rd so far against threshold for trying this mode.
   3329     if (best_rd < ((int64_t)rd_threshes[mode_index] *
   3330                   rd_thresh_freq_fact[mode_index] >> 5) ||
   3331         rd_threshes[mode_index] == INT_MAX)
   3332      continue;
   3333 
   3334     this_mode = vp9_mode_order[mode_index].mode;
   3335     ref_frame = vp9_mode_order[mode_index].ref_frame[0];
   3336     if (ref_frame != INTRA_FRAME &&
   3337         disable_inter_mode_mask & (1 << INTER_OFFSET(this_mode)))
   3338       continue;
   3339     second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
   3340 
   3341     comp_pred = second_ref_frame > INTRA_FRAME;
   3342     if (comp_pred) {
   3343       if ((mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA) &&
   3344           vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
   3345         continue;
   3346       if ((mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH) &&
   3347           ref_frame != best_inter_ref_frame &&
   3348           second_ref_frame != best_inter_ref_frame)
   3349         continue;
   3350       mode_excluded = cm->reference_mode == SINGLE_REFERENCE;
   3351     } else {
   3352       if (ref_frame != INTRA_FRAME)
   3353         mode_excluded = cm->reference_mode == COMPOUND_REFERENCE;
   3354     }
   3355 
   3356     if (ref_frame == INTRA_FRAME) {
   3357       if (!(intra_y_mode_mask & (1 << this_mode)))
   3358         continue;
   3359       if (this_mode != DC_PRED) {
   3360         // Disable intra modes other than DC_PRED for blocks with low variance
   3361         // Threshold for intra skipping based on source variance
   3362         // TODO(debargha): Specialize the threshold for super block sizes
   3363         const unsigned int skip_intra_var_thresh = 64;
   3364         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
   3365             x->source_variance < skip_intra_var_thresh)
   3366           continue;
   3367         // Only search the oblique modes if the best so far is
   3368         // one of the neighboring directional modes
   3369         if ((mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
   3370             (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
   3371           if (vp9_mode_order[best_mode_index].ref_frame[0] > INTRA_FRAME)
   3372             continue;
   3373         }
   3374         if (mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
   3375           if (conditional_skipintra(this_mode, best_intra_mode))
   3376               continue;
   3377         }
   3378       }
   3379     } else {
   3380       if (x->in_active_map &&
   3381           !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
   3382         if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
   3383                                 disable_inter_mode_mask, this_mode, ref_frame,
   3384                                 second_ref_frame))
   3385           continue;
   3386     }
   3387 
   3388     mbmi->mode = this_mode;
   3389     mbmi->uv_mode = x->in_active_map ? DC_PRED : this_mode;
   3390     mbmi->ref_frame[0] = ref_frame;
   3391     mbmi->ref_frame[1] = second_ref_frame;
   3392     // Evaluate all sub-pel filters irrespective of whether we can use
   3393     // them for this frame.
   3394     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
   3395                                                           : cm->interp_filter;
   3396     x->skip = 0;
   3397     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
   3398 
   3399     // Select prediction reference frames.
   3400     for (i = 0; i < MAX_MB_PLANE; i++) {
   3401       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
   3402       if (comp_pred)
   3403         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
   3404     }
   3405 
   3406     for (i = 0; i < TX_MODES; ++i)
   3407       tx_cache[i] = INT64_MAX;
   3408 
   3409 #ifdef MODE_TEST_HIT_STATS
   3410     // TEST/DEBUG CODE
   3411     // Keep a rcord of the number of test hits at each size
   3412     cpi->mode_test_hits[bsize]++;
   3413 #endif
   3414 
   3415     if (ref_frame == INTRA_FRAME) {
   3416       TX_SIZE uv_tx;
   3417       intra_super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
   3418                             bsize, tx_cache, best_rd);
   3419 
   3420       if (rate_y == INT_MAX)
   3421         continue;
   3422 
   3423       uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize);
   3424       if (rate_uv_intra[uv_tx] == INT_MAX) {
   3425         choose_intra_uv_mode(cpi, ctx, bsize, uv_tx,
   3426                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
   3427                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
   3428       }
   3429 
   3430       rate_uv = rate_uv_tokenonly[uv_tx];
   3431       distortion_uv = dist_uv[uv_tx];
   3432       skippable = skippable && skip_uv[uv_tx];
   3433       mbmi->uv_mode = mode_uv[uv_tx];
   3434 
   3435       rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
   3436       if (this_mode != DC_PRED && this_mode != TM_PRED)
   3437         rate2 += intra_cost_penalty;
   3438       distortion2 = distortion_y + distortion_uv;
   3439     } else {
   3440       this_rd = handle_inter_mode(cpi, x, tile, bsize,
   3441                                   tx_cache,
   3442                                   &rate2, &distortion2, &skippable,
   3443                                   &rate_y, &distortion_y,
   3444                                   &rate_uv, &distortion_uv,
   3445                                   &mode_excluded, &disable_skip,
   3446                                   &tmp_best_filter, frame_mv,
   3447                                   mi_row, mi_col,
   3448                                   single_newmv, &total_sse, best_rd);
   3449       if (this_rd == INT64_MAX)
   3450         continue;
   3451 
   3452       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
   3453 
   3454       if (cm->reference_mode == REFERENCE_MODE_SELECT)
   3455         rate2 += compmode_cost;
   3456     }
   3457 
   3458     // Estimate the reference frame signaling cost and add it
   3459     // to the rolling cost variable.
   3460     if (comp_pred) {
   3461       rate2 += ref_costs_comp[ref_frame];
   3462     } else {
   3463       rate2 += ref_costs_single[ref_frame];
   3464     }
   3465 
   3466     if (!disable_skip) {
   3467       // Test for the condition where skip block will be activated
   3468       // because there are no non zero coefficients and make any
   3469       // necessary adjustment for rate. Ignore if skip is coded at
   3470       // segment level as the cost won't have been added in.
   3471       // Is Mb level skip allowed (i.e. not coded at segment level).
   3472       const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
   3473                                                          SEG_LVL_SKIP);
   3474 
   3475       if (skippable) {
   3476         // Back out the coefficient coding costs
   3477         rate2 -= (rate_y + rate_uv);
   3478         // for best yrd calculation
   3479         rate_uv = 0;
   3480 
   3481         if (mb_skip_allowed) {
   3482           int prob_skip_cost;
   3483 
   3484           // Cost the skip mb case
   3485           vp9_prob skip_prob = vp9_get_skip_prob(cm, xd);
   3486           if (skip_prob) {
   3487             prob_skip_cost = vp9_cost_bit(skip_prob, 1);
   3488             rate2 += prob_skip_cost;
   3489           }
   3490         }
   3491       } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
   3492         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
   3493             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
   3494           // Add in the cost of the no skip flag.
   3495           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
   3496         } else {
   3497           // FIXME(rbultje) make this work for splitmv also
   3498           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
   3499           distortion2 = total_sse;
   3500           assert(total_sse >= 0);
   3501           rate2 -= (rate_y + rate_uv);
   3502           rate_y = 0;
   3503           rate_uv = 0;
   3504           this_skip2 = 1;
   3505         }
   3506       } else if (mb_skip_allowed) {
   3507         // Add in the cost of the no skip flag.
   3508         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
   3509       }
   3510 
   3511       // Calculate the final RD estimate for this mode.
   3512       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   3513     }
   3514 
   3515     if (ref_frame == INTRA_FRAME) {
   3516     // Keep record of best intra rd
   3517       if (this_rd < best_intra_rd) {
   3518         best_intra_rd = this_rd;
   3519         best_intra_mode = mbmi->mode;
   3520       }
   3521     } else {
   3522       // Keep record of best inter rd with single reference
   3523       if (!comp_pred && !mode_excluded && this_rd < best_inter_rd) {
   3524         best_inter_rd = this_rd;
   3525         best_inter_ref_frame = ref_frame;
   3526       }
   3527     }
   3528 
   3529     if (!disable_skip && ref_frame == INTRA_FRAME) {
   3530       for (i = 0; i < REFERENCE_MODES; ++i)
   3531         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
   3532       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
   3533         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
   3534     }
   3535 
   3536     // Store the respective mode distortions for later use.
   3537     if (mode_distortions[this_mode] == -1
   3538         || distortion2 < mode_distortions[this_mode]) {
   3539       mode_distortions[this_mode] = distortion2;
   3540     }
   3541 
   3542     // Did this mode help.. i.e. is it the new best mode
   3543     if (this_rd < best_rd || x->skip) {
   3544       int max_plane = MAX_MB_PLANE;
   3545       if (!mode_excluded) {
   3546         // Note index of best mode so far
   3547         best_mode_index = mode_index;
   3548 
   3549         if (ref_frame == INTRA_FRAME) {
   3550           /* required for left and above block mv */
   3551           mbmi->mv[0].as_int = 0;
   3552           max_plane = 1;
   3553         }
   3554 
   3555         *returnrate = rate2;
   3556         *returndistortion = distortion2;
   3557         best_rd = this_rd;
   3558         best_mbmode = *mbmi;
   3559         best_skip2 = this_skip2;
   3560         if (!x->select_txfm_size)
   3561           swap_block_ptr(x, ctx, max_plane);
   3562         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
   3563                    sizeof(uint8_t) * ctx->num_4x4_blk);
   3564 
   3565         // TODO(debargha): enhance this test with a better distortion prediction
   3566         // based on qp, activity mask and history
   3567         if ((mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
   3568             (mode_index > MIN_EARLY_TERM_INDEX)) {
   3569           const int qstep = xd->plane[0].dequant[1];
   3570           // TODO(debargha): Enhance this by specializing for each mode_index
   3571           int scale = 4;
   3572           if (x->source_variance < UINT_MAX) {
   3573             const int var_adjust = (x->source_variance < 16);
   3574             scale -= var_adjust;
   3575           }
   3576           if (ref_frame > INTRA_FRAME &&
   3577               distortion2 * scale < qstep * qstep) {
   3578             early_term = 1;
   3579           }
   3580         }
   3581       }
   3582     }
   3583 
   3584     /* keep record of best compound/single-only prediction */
   3585     if (!disable_skip && ref_frame != INTRA_FRAME) {
   3586       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
   3587 
   3588       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
   3589         single_rate = rate2 - compmode_cost;
   3590         hybrid_rate = rate2;
   3591       } else {
   3592         single_rate = rate2;
   3593         hybrid_rate = rate2 + compmode_cost;
   3594       }
   3595 
   3596       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
   3597       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
   3598 
   3599       if (!comp_pred) {
   3600         if (single_rd < best_pred_rd[SINGLE_REFERENCE]) {
   3601           best_pred_rd[SINGLE_REFERENCE] = single_rd;
   3602         }
   3603       } else {
   3604         if (single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
   3605           best_pred_rd[COMPOUND_REFERENCE] = single_rd;
   3606         }
   3607       }
   3608       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
   3609         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
   3610 
   3611       /* keep record of best filter type */
   3612       if (!mode_excluded && cm->interp_filter != BILINEAR) {
   3613         int64_t ref = cpi->rd_filter_cache[cm->interp_filter == SWITCHABLE ?
   3614                               SWITCHABLE_FILTERS : cm->interp_filter];
   3615 
   3616         for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
   3617           int64_t adj_rd;
   3618           if (ref == INT64_MAX)
   3619             adj_rd = 0;
   3620           else if (cpi->rd_filter_cache[i] == INT64_MAX)
   3621             // when early termination is triggered, the encoder does not have
   3622             // access to the rate-distortion cost. it only knows that the cost
   3623             // should be above the maximum valid value. hence it takes the known
   3624             // maximum plus an arbitrary constant as the rate-distortion cost.
   3625             adj_rd = cpi->mask_filter_rd - ref + 10;
   3626           else
   3627             adj_rd = cpi->rd_filter_cache[i] - ref;
   3628 
   3629           adj_rd += this_rd;
   3630           best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
   3631         }
   3632       }
   3633     }
   3634 
   3635     /* keep record of best txfm size */
   3636     if (bsize < BLOCK_32X32) {
   3637       if (bsize < BLOCK_16X16)
   3638         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
   3639 
   3640       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
   3641     }
   3642     if (!mode_excluded && this_rd != INT64_MAX) {
   3643       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
   3644         int64_t adj_rd = INT64_MAX;
   3645         adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
   3646 
   3647         if (adj_rd < best_tx_rd[i])
   3648           best_tx_rd[i] = adj_rd;
   3649       }
   3650     }
   3651 
   3652     if (early_term)
   3653       break;
   3654 
   3655     if (x->skip && !comp_pred)
   3656       break;
   3657   }
   3658 
   3659   if (best_rd >= best_rd_so_far)
   3660     return INT64_MAX;
   3661 
   3662   // If we used an estimate for the uv intra rd in the loop above...
   3663   if (cpi->sf.use_uv_intra_rd_estimate) {
   3664     // Do Intra UV best rd mode selection if best mode choice above was intra.
   3665     if (vp9_mode_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
   3666       TX_SIZE uv_tx_size;
   3667       *mbmi = best_mbmode;
   3668       uv_tx_size = get_uv_tx_size(mbmi);
   3669       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
   3670                               &rate_uv_tokenonly[uv_tx_size],
   3671                               &dist_uv[uv_tx_size],
   3672                               &skip_uv[uv_tx_size],
   3673                               bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize,
   3674                               uv_tx_size);
   3675     }
   3676   }
   3677 
   3678   assert((cm->interp_filter == SWITCHABLE) ||
   3679          (cm->interp_filter == best_mbmode.interp_filter) ||
   3680          !is_inter_block(&best_mbmode));
   3681 
   3682   // Updating rd_thresh_freq_fact[] here means that the different
   3683   // partition/block sizes are handled independently based on the best
   3684   // choice for the current partition. It may well be better to keep a scaled
   3685   // best rd so far value and update rd_thresh_freq_fact based on the mode/size
   3686   // combination that wins out.
   3687   if (cpi->sf.adaptive_rd_thresh) {
   3688     for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
   3689       int *const fact = &cpi->rd_thresh_freq_fact[bsize][mode_index];
   3690 
   3691       if (mode_index == best_mode_index) {
   3692         *fact -= (*fact >> 3);
   3693       } else {
   3694         *fact = MIN(*fact + RD_THRESH_INC,
   3695                     cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
   3696       }
   3697     }
   3698   }
   3699 
   3700   // macroblock modes
   3701   *mbmi = best_mbmode;
   3702   x->skip |= best_skip2;
   3703 
   3704   for (i = 0; i < REFERENCE_MODES; ++i) {
   3705     if (best_pred_rd[i] == INT64_MAX)
   3706       best_pred_diff[i] = INT_MIN;
   3707     else
   3708       best_pred_diff[i] = best_rd - best_pred_rd[i];
   3709   }
   3710 
   3711   if (!x->skip) {
   3712     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
   3713       if (best_filter_rd[i] == INT64_MAX)
   3714         best_filter_diff[i] = 0;
   3715       else
   3716         best_filter_diff[i] = best_rd - best_filter_rd[i];
   3717     }
   3718     if (cm->interp_filter == SWITCHABLE)
   3719       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
   3720     for (i = 0; i < TX_MODES; i++) {
   3721       if (best_tx_rd[i] == INT64_MAX)
   3722         best_tx_diff[i] = 0;
   3723       else
   3724         best_tx_diff[i] = best_rd - best_tx_rd[i];
   3725     }
   3726   } else {
   3727     vp9_zero(best_filter_diff);
   3728     vp9_zero(best_tx_diff);
   3729   }
   3730 
   3731   if (!x->in_active_map) {
   3732     assert(mbmi->ref_frame[0] == LAST_FRAME);
   3733     assert(mbmi->ref_frame[1] == NONE);
   3734     assert(mbmi->mode == NEARESTMV ||
   3735            mbmi->mode == NEARMV ||
   3736            mbmi->mode == ZEROMV);
   3737     assert(frame_mv[mbmi->mode][LAST_FRAME].as_int == 0);
   3738     assert(mbmi->mode == mbmi->uv_mode);
   3739   }
   3740 
   3741   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   3742   store_coding_context(x, ctx, best_mode_index,
   3743                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
   3744                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
   3745                                       mbmi->ref_frame[1]][0],
   3746                        best_pred_diff, best_tx_diff, best_filter_diff);
   3747 
   3748   return best_rd;
   3749 }
   3750 
   3751 
   3752 int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   3753                                       const TileInfo *const tile,
   3754                                       int mi_row, int mi_col,
   3755                                       int *returnrate,
   3756                                       int64_t *returndistortion,
   3757                                       BLOCK_SIZE bsize,
   3758                                       PICK_MODE_CONTEXT *ctx,
   3759                                       int64_t best_rd_so_far) {
   3760   VP9_COMMON *cm = &cpi->common;
   3761   MACROBLOCKD *xd = &x->e_mbd;
   3762   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   3763   const struct segmentation *seg = &cm->seg;
   3764   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   3765   unsigned char segment_id = mbmi->segment_id;
   3766   int comp_pred, i;
   3767   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   3768   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   3769   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
   3770                                     VP9_ALT_FLAG };
   3771   int64_t best_rd = best_rd_so_far;
   3772   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   3773   int64_t best_tx_rd[TX_MODES];
   3774   int64_t best_tx_diff[TX_MODES];
   3775   int64_t best_pred_diff[REFERENCE_MODES];
   3776   int64_t best_pred_rd[REFERENCE_MODES];
   3777   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   3778   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   3779   MB_MODE_INFO best_mbmode = { 0 };
   3780   int mode_index, best_mode_index = 0;
   3781   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   3782   vp9_prob comp_mode_p;
   3783   int64_t best_inter_rd = INT64_MAX;
   3784   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
   3785   INTERP_FILTER tmp_best_filter = SWITCHABLE;
   3786   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
   3787   int64_t dist_uv[TX_SIZES];
   3788   int skip_uv[TX_SIZES];
   3789   MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 };
   3790   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
   3791   int_mv seg_mvs[4][MAX_REF_FRAMES];
   3792   b_mode_info best_bmodes[4];
   3793   int best_skip2 = 0;
   3794   int ref_frame_mask = 0;
   3795   int mode_skip_mask = 0;
   3796 
   3797   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   3798   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
   3799 
   3800   for (i = 0; i < 4; i++) {
   3801     int j;
   3802     for (j = 0; j < MAX_REF_FRAMES; j++)
   3803       seg_mvs[i][j].as_int = INVALID_MV;
   3804   }
   3805 
   3806   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
   3807                            &comp_mode_p);
   3808 
   3809   for (i = 0; i < REFERENCE_MODES; ++i)
   3810     best_pred_rd[i] = INT64_MAX;
   3811   for (i = 0; i < TX_MODES; i++)
   3812     best_tx_rd[i] = INT64_MAX;
   3813   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
   3814     best_filter_rd[i] = INT64_MAX;
   3815   for (i = 0; i < TX_SIZES; i++)
   3816     rate_uv_intra[i] = INT_MAX;
   3817 
   3818   *returnrate = INT_MAX;
   3819 
   3820   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
   3821     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
   3822       vp9_setup_buffer_inter(cpi, x, tile,
   3823                              ref_frame, bsize, mi_row, mi_col,
   3824                              frame_mv[NEARESTMV], frame_mv[NEARMV],
   3825                              yv12_mb);
   3826     }
   3827     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   3828     frame_mv[ZEROMV][ref_frame].as_int = 0;
   3829   }
   3830 
   3831   for (ref_frame = LAST_FRAME;
   3832        ref_frame <= ALTREF_FRAME && cpi->sf.reference_masking; ++ref_frame) {
   3833     int i;
   3834     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
   3835       if ((x->pred_mv_sad[ref_frame] >> 1) > x->pred_mv_sad[i]) {
   3836         ref_frame_mask |= (1 << ref_frame);
   3837         break;
   3838       }
   3839     }
   3840   }
   3841 
   3842   for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
   3843     int mode_excluded = 0;
   3844     int64_t this_rd = INT64_MAX;
   3845     int disable_skip = 0;
   3846     int compmode_cost = 0;
   3847     int rate2 = 0, rate_y = 0, rate_uv = 0;
   3848     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
   3849     int skippable = 0;
   3850     int64_t tx_cache[TX_MODES];
   3851     int i;
   3852     int this_skip2 = 0;
   3853     int64_t total_sse = INT_MAX;
   3854     int early_term = 0;
   3855 
   3856     for (i = 0; i < TX_MODES; ++i)
   3857       tx_cache[i] = INT64_MAX;
   3858 
   3859     x->skip = 0;
   3860     ref_frame = vp9_ref_order[mode_index].ref_frame[0];
   3861     second_ref_frame = vp9_ref_order[mode_index].ref_frame[1];
   3862 
   3863     // Look at the reference frame of the best mode so far and set the
   3864     // skip mask to look at a subset of the remaining modes.
   3865     if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
   3866       if (mode_index == 3) {
   3867         switch (vp9_ref_order[best_mode_index].ref_frame[0]) {
   3868           case INTRA_FRAME:
   3869             mode_skip_mask = 0;
   3870             break;
   3871           case LAST_FRAME:
   3872             mode_skip_mask = 0x0010;
   3873             break;
   3874           case GOLDEN_FRAME:
   3875             mode_skip_mask = 0x0008;
   3876             break;
   3877           case ALTREF_FRAME:
   3878             mode_skip_mask = 0x0000;
   3879             break;
   3880           case NONE:
   3881           case MAX_REF_FRAMES:
   3882             assert(0 && "Invalid Reference frame");
   3883         }
   3884       }
   3885       if (mode_skip_mask & (1 << mode_index))
   3886         continue;
   3887     }
   3888 
   3889     // Test best rd so far against threshold for trying this mode.
   3890     if ((best_rd <
   3891          ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
   3892           cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) ||
   3893         cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX)
   3894       continue;
   3895 
   3896     // Do not allow compound prediction if the segment level reference
   3897     // frame feature is in use as in this case there can only be one reference.
   3898     if ((second_ref_frame > INTRA_FRAME) &&
   3899          vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
   3900       continue;
   3901 
   3902     mbmi->ref_frame[0] = ref_frame;
   3903     mbmi->ref_frame[1] = second_ref_frame;
   3904 
   3905     if (!(ref_frame == INTRA_FRAME
   3906         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
   3907       continue;
   3908     }
   3909     if (!(second_ref_frame == NONE
   3910         || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
   3911       continue;
   3912     }
   3913 
   3914     comp_pred = second_ref_frame > INTRA_FRAME;
   3915     if (comp_pred) {
   3916       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
   3917         if (vp9_ref_order[best_mode_index].ref_frame[0] == INTRA_FRAME)
   3918           continue;
   3919       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
   3920         if (ref_frame != best_inter_ref_frame &&
   3921             second_ref_frame != best_inter_ref_frame)
   3922           continue;
   3923     }
   3924 
   3925     // TODO(jingning, jkoleszar): scaling reference frame not supported for
   3926     // sub8x8 blocks.
   3927     if (ref_frame > 0 && vp9_is_scaled(&cm->frame_refs[ref_frame - 1].sf))
   3928       continue;
   3929 
   3930     if (second_ref_frame > 0 &&
   3931         vp9_is_scaled(&cm->frame_refs[second_ref_frame - 1].sf))
   3932       continue;
   3933 
   3934     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
   3935     mbmi->uv_mode = DC_PRED;
   3936 
   3937     // Evaluate all sub-pel filters irrespective of whether we can use
   3938     // them for this frame.
   3939     mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
   3940                                                           : cm->interp_filter;
   3941 
   3942     if (comp_pred) {
   3943       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
   3944         continue;
   3945 
   3946       mode_excluded = mode_excluded ? mode_excluded
   3947                                     : cm->reference_mode == SINGLE_REFERENCE;
   3948     } else {
   3949       if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
   3950         mode_excluded = mode_excluded ?
   3951             mode_excluded : cm->reference_mode == COMPOUND_REFERENCE;
   3952       }
   3953     }
   3954 
   3955     // Select prediction reference frames.
   3956     for (i = 0; i < MAX_MB_PLANE; i++) {
   3957       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
   3958       if (comp_pred)
   3959         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
   3960     }
   3961 
   3962     // If the segment reference frame feature is enabled....
   3963     // then do nothing if the current ref frame is not allowed..
   3964     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
   3965         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
   3966             (int)ref_frame) {
   3967       continue;
   3968     // If the segment skip feature is enabled....
   3969     // then do nothing if the current mode is not allowed..
   3970     } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
   3971                ref_frame != INTRA_FRAME) {
   3972       continue;
   3973     // Disable this drop out case if the ref frame
   3974     // segment level feature is enabled for this segment. This is to
   3975     // prevent the possibility that we end up unable to pick any mode.
   3976     } else if (!vp9_segfeature_active(seg, segment_id,
   3977                                       SEG_LVL_REF_FRAME)) {
   3978       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
   3979       // unless ARNR filtering is enabled in which case we want
   3980       // an unfiltered alternative. We allow near/nearest as well
   3981       // because they may result in zero-zero MVs but be cheaper.
   3982       if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
   3983         continue;
   3984     }
   3985 
   3986 #ifdef MODE_TEST_HIT_STATS
   3987     // TEST/DEBUG CODE
   3988     // Keep a record of the number of test hits at each size
   3989     cpi->mode_test_hits[bsize]++;
   3990 #endif
   3991 
   3992     if (ref_frame == INTRA_FRAME) {
   3993       int rate;
   3994       mbmi->tx_size = TX_4X4;
   3995       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
   3996                                        &distortion_y, best_rd) >= best_rd)
   3997         continue;
   3998       rate2 += rate;
   3999       rate2 += intra_cost_penalty;
   4000       distortion2 += distortion_y;
   4001 
   4002       if (rate_uv_intra[TX_4X4] == INT_MAX) {
   4003         choose_intra_uv_mode(cpi, ctx, bsize, TX_4X4,
   4004                              &rate_uv_intra[TX_4X4],
   4005                              &rate_uv_tokenonly[TX_4X4],
   4006                              &dist_uv[TX_4X4], &skip_uv[TX_4X4],
   4007                              &mode_uv[TX_4X4]);
   4008       }
   4009       rate2 += rate_uv_intra[TX_4X4];
   4010       rate_uv = rate_uv_tokenonly[TX_4X4];
   4011       distortion2 += dist_uv[TX_4X4];
   4012       distortion_uv = dist_uv[TX_4X4];
   4013       mbmi->uv_mode = mode_uv[TX_4X4];
   4014       tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   4015       for (i = 0; i < TX_MODES; ++i)
   4016         tx_cache[i] = tx_cache[ONLY_4X4];
   4017     } else {
   4018       int rate;
   4019       int64_t distortion;
   4020       int64_t this_rd_thresh;
   4021       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
   4022       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
   4023       int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
   4024       int tmp_best_skippable = 0;
   4025       int switchable_filter_index;
   4026       int_mv *second_ref = comp_pred ?
   4027                              &mbmi->ref_mvs[second_ref_frame][0] : NULL;
   4028       b_mode_info tmp_best_bmodes[16];
   4029       MB_MODE_INFO tmp_best_mbmode;
   4030       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
   4031       int pred_exists = 0;
   4032       int uv_skippable;
   4033 
   4034       this_rd_thresh = (ref_frame == LAST_FRAME) ?
   4035           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] :
   4036           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR];
   4037       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
   4038           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh;
   4039       xd->mi[0]->mbmi.tx_size = TX_4X4;
   4040 
   4041       cpi->mask_filter_rd = 0;
   4042       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
   4043         cpi->rd_filter_cache[i] = INT64_MAX;
   4044 
   4045       if (cm->interp_filter != BILINEAR) {
   4046         tmp_best_filter = EIGHTTAP;
   4047         if (x->source_variance <
   4048             cpi->sf.disable_filter_search_var_thresh) {
   4049           tmp_best_filter = EIGHTTAP;
   4050         } else if (cpi->sf.adaptive_pred_interp_filter == 1 &&
   4051                    ctx->pred_interp_filter < SWITCHABLE) {
   4052           tmp_best_filter = ctx->pred_interp_filter;
   4053         } else if (cpi->sf.adaptive_pred_interp_filter == 2) {
   4054           tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
   4055                               ctx->pred_interp_filter : 0;
   4056         } else {
   4057           for (switchable_filter_index = 0;
   4058                switchable_filter_index < SWITCHABLE_FILTERS;
   4059                ++switchable_filter_index) {
   4060             int newbest, rs;
   4061             int64_t rs_rd;
   4062             mbmi->interp_filter = switchable_filter_index;
   4063             tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
   4064                                                  &mbmi->ref_mvs[ref_frame][0],
   4065                                                  second_ref,
   4066                                                  best_yrd,
   4067                                                  &rate, &rate_y, &distortion,
   4068                                                  &skippable, &total_sse,
   4069                                                  (int)this_rd_thresh, seg_mvs,
   4070                                                  bsi, switchable_filter_index,
   4071                                                  mi_row, mi_col);
   4072 
   4073             if (tmp_rd == INT64_MAX)
   4074               continue;
   4075             rs = get_switchable_rate(x);
   4076             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
   4077             cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
   4078             cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
   4079                 MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
   4080                     tmp_rd + rs_rd);
   4081             if (cm->interp_filter == SWITCHABLE)
   4082               tmp_rd += rs_rd;
   4083 
   4084             cpi->mask_filter_rd = MAX(cpi->mask_filter_rd, tmp_rd);
   4085 
   4086             newbest = (tmp_rd < tmp_best_rd);
   4087             if (newbest) {
   4088               tmp_best_filter = mbmi->interp_filter;
   4089               tmp_best_rd = tmp_rd;
   4090             }
   4091             if ((newbest && cm->interp_filter == SWITCHABLE) ||
   4092                 (mbmi->interp_filter == cm->interp_filter &&
   4093                  cm->interp_filter != SWITCHABLE)) {
   4094               tmp_best_rdu = tmp_rd;
   4095               tmp_best_rate = rate;
   4096               tmp_best_ratey = rate_y;
   4097               tmp_best_distortion = distortion;
   4098               tmp_best_sse = total_sse;
   4099               tmp_best_skippable = skippable;
   4100               tmp_best_mbmode = *mbmi;
   4101               for (i = 0; i < 4; i++) {
   4102                 tmp_best_bmodes[i] = xd->mi[0]->bmi[i];
   4103                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
   4104               }
   4105               pred_exists = 1;
   4106               if (switchable_filter_index == 0 &&
   4107                   cpi->sf.use_rd_breakout &&
   4108                   best_rd < INT64_MAX) {
   4109                 if (tmp_best_rdu / 2 > best_rd) {
   4110                   // skip searching the other filters if the first is
   4111                   // already substantially larger than the best so far
   4112                   tmp_best_filter = mbmi->interp_filter;
   4113                   tmp_best_rdu = INT64_MAX;
   4114                   break;
   4115                 }
   4116               }
   4117             }
   4118           }  // switchable_filter_index loop
   4119         }
   4120       }
   4121 
   4122       if (tmp_best_rdu == INT64_MAX && pred_exists)
   4123         continue;
   4124 
   4125       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
   4126                              tmp_best_filter : cm->interp_filter);
   4127       if (!pred_exists) {
   4128         // Handles the special case when a filter that is not in the
   4129         // switchable list (bilinear, 6-tap) is indicated at the frame level
   4130         tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
   4131                      &mbmi->ref_mvs[ref_frame][0],
   4132                      second_ref,
   4133                      best_yrd,
   4134                      &rate, &rate_y, &distortion,
   4135                      &skippable, &total_sse,
   4136                      (int)this_rd_thresh, seg_mvs,
   4137                      bsi, 0,
   4138                      mi_row, mi_col);
   4139         if (tmp_rd == INT64_MAX)
   4140           continue;
   4141       } else {
   4142         total_sse = tmp_best_sse;
   4143         rate = tmp_best_rate;
   4144         rate_y = tmp_best_ratey;
   4145         distortion = tmp_best_distortion;
   4146         skippable = tmp_best_skippable;
   4147         *mbmi = tmp_best_mbmode;
   4148         for (i = 0; i < 4; i++)
   4149           xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
   4150       }
   4151 
   4152       rate2 += rate;
   4153       distortion2 += distortion;
   4154 
   4155       if (cm->interp_filter == SWITCHABLE)
   4156         rate2 += get_switchable_rate(x);
   4157 
   4158       if (!mode_excluded)
   4159         mode_excluded = comp_pred ? cm->reference_mode == SINGLE_REFERENCE
   4160                                   : cm->reference_mode == COMPOUND_REFERENCE;
   4161 
   4162       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
   4163 
   4164       tmp_best_rdu = best_rd -
   4165           MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
   4166               RDCOST(x->rdmult, x->rddiv, 0, total_sse));
   4167 
   4168       if (tmp_best_rdu > 0) {
   4169         // If even the 'Y' rd value of split is higher than best so far
   4170         // then don't bother looking at UV
   4171         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
   4172                                         BLOCK_8X8);
   4173         super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
   4174                          &uv_sse, BLOCK_8X8, tmp_best_rdu);
   4175         if (rate_uv == INT_MAX)
   4176           continue;
   4177         rate2 += rate_uv;
   4178         distortion2 += distortion_uv;
   4179         skippable = skippable && uv_skippable;
   4180         total_sse += uv_sse;
   4181 
   4182         tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   4183         for (i = 0; i < TX_MODES; ++i)
   4184           tx_cache[i] = tx_cache[ONLY_4X4];
   4185       }
   4186     }
   4187 
   4188     if (cm->reference_mode == REFERENCE_MODE_SELECT)
   4189       rate2 += compmode_cost;
   4190 
   4191     // Estimate the reference frame signaling cost and add it
   4192     // to the rolling cost variable.
   4193     if (second_ref_frame > INTRA_FRAME) {
   4194       rate2 += ref_costs_comp[ref_frame];
   4195     } else {
   4196       rate2 += ref_costs_single[ref_frame];
   4197     }
   4198 
   4199     if (!disable_skip) {
   4200       // Test for the condition where skip block will be activated
   4201       // because there are no non-zero coefficients and make any
   4202       // necessary adjustment for rate. Ignore if skip is coded at
   4203       // segment level as the cost won't have been added in.
   4204       // Is MB level skip allowed (i.e. not coded at segment level).
   4205       const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
   4206                                                          SEG_LVL_SKIP);
   4207 
   4208       if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
   4209         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
   4210             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
   4211           // Add in the cost of the no skip flag.
   4212           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
   4213         } else {
   4214           // FIXME(rbultje) make this work for splitmv also
   4215           rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
   4216           distortion2 = total_sse;
   4217           assert(total_sse >= 0);
   4218           rate2 -= (rate_y + rate_uv);
   4219           rate_y = 0;
   4220           rate_uv = 0;
   4221           this_skip2 = 1;
   4222         }
   4223       } else if (mb_skip_allowed) {
   4224         // Add in the cost of the no skip flag.
   4225         rate2 += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
   4226       }
   4227 
   4228       // Calculate the final RD estimate for this mode.
   4229       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   4230     }
   4231 
   4232     // Keep record of best inter rd with single reference
   4233     if (is_inter_block(&xd->mi[0]->mbmi) &&
   4234         !has_second_ref(&xd->mi[0]->mbmi) &&
   4235         !mode_excluded &&
   4236         this_rd < best_inter_rd) {
   4237       best_inter_rd = this_rd;
   4238       best_inter_ref_frame = ref_frame;
   4239     }
   4240 
   4241     if (!disable_skip && ref_frame == INTRA_FRAME) {
   4242       for (i = 0; i < REFERENCE_MODES; ++i)
   4243         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
   4244       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
   4245         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
   4246     }
   4247 
   4248     // Did this mode help, i.e. is it the new best mode?
   4249     if (this_rd < best_rd || x->skip) {
   4250       if (!mode_excluded) {
   4251         int max_plane = MAX_MB_PLANE;
   4252         // Note index of best mode so far
   4253         best_mode_index = mode_index;
   4254 
   4255         if (ref_frame == INTRA_FRAME) {
   4256           /* required for left and above block mv */
   4257           mbmi->mv[0].as_int = 0;
   4258           max_plane = 1;
   4259         }
   4260 
   4261         *returnrate = rate2;
   4262         *returndistortion = distortion2;
   4263         best_rd = this_rd;
   4264         best_yrd = best_rd -
   4265                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
   4266         best_mbmode = *mbmi;
   4267         best_skip2 = this_skip2;
   4268         if (!x->select_txfm_size)
   4269           swap_block_ptr(x, ctx, max_plane);
   4270         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
   4271                    sizeof(uint8_t) * ctx->num_4x4_blk);
   4272 
   4273         for (i = 0; i < 4; i++)
   4274           best_bmodes[i] = xd->mi[0]->bmi[i];
   4275 
   4276         // TODO(debargha): enhance this test with a better distortion prediction
   4277         // based on qp, activity mask and history
   4278         if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
   4279             (mode_index > MIN_EARLY_TERM_INDEX)) {
   4280           const int qstep = xd->plane[0].dequant[1];
   4281           // TODO(debargha): Enhance this by specializing for each mode_index
   4282           int scale = 4;
   4283           if (x->source_variance < UINT_MAX) {
   4284             const int var_adjust = (x->source_variance < 16);
   4285             scale -= var_adjust;
   4286           }
   4287           if (ref_frame > INTRA_FRAME &&
   4288               distortion2 * scale < qstep * qstep) {
   4289             early_term = 1;
   4290           }
   4291         }
   4292       }
   4293     }
   4294 
   4295     /* keep record of best compound/single-only prediction */
   4296     if (!disable_skip && ref_frame != INTRA_FRAME) {
   4297       int64_t single_rd, hybrid_rd, single_rate, hybrid_rate;
   4298 
   4299       if (cm->reference_mode == REFERENCE_MODE_SELECT) {
   4300         single_rate = rate2 - compmode_cost;
   4301         hybrid_rate = rate2;
   4302       } else {
   4303         single_rate = rate2;
   4304         hybrid_rate = rate2 + compmode_cost;
   4305       }
   4306 
   4307       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
   4308       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
   4309 
   4310       if (second_ref_frame <= INTRA_FRAME &&
   4311           single_rd < best_pred_rd[SINGLE_REFERENCE]) {
   4312         best_pred_rd[SINGLE_REFERENCE] = single_rd;
   4313       } else if (second_ref_frame > INTRA_FRAME &&
   4314                  single_rd < best_pred_rd[COMPOUND_REFERENCE]) {
   4315         best_pred_rd[COMPOUND_REFERENCE] = single_rd;
   4316       }
   4317       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
   4318         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
   4319     }
   4320 
   4321     /* keep record of best filter type */
   4322     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
   4323         cm->interp_filter != BILINEAR) {
   4324       int64_t ref = cpi->rd_filter_cache[cm->interp_filter == SWITCHABLE ?
   4325                               SWITCHABLE_FILTERS : cm->interp_filter];
   4326       int64_t adj_rd;
   4327       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
   4328         if (ref == INT64_MAX)
   4329           adj_rd = 0;
   4330         else if (cpi->rd_filter_cache[i] == INT64_MAX)
   4331           // When early termination is triggered, the encoder does not have
   4332           // access to the rate-distortion cost. It only knows that the cost
   4333           // should be above the maximum valid value. Hence it takes the known
   4334           // maximum plus an arbitrary constant as the rate-distortion cost.
   4335           adj_rd = cpi->mask_filter_rd - ref + 10;
   4336         else
   4337           adj_rd = cpi->rd_filter_cache[i] - ref;
   4338 
   4339         adj_rd += this_rd;
   4340         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
   4341       }
   4342     }
   4343 
   4344     /* keep record of best txfm size */
   4345     if (bsize < BLOCK_32X32) {
   4346       if (bsize < BLOCK_16X16) {
   4347         tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
   4348         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
   4349       }
   4350       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
   4351     }
   4352     if (!mode_excluded && this_rd != INT64_MAX) {
   4353       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
   4354         int64_t adj_rd = INT64_MAX;
   4355         if (ref_frame > INTRA_FRAME)
   4356           adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
   4357         else
   4358           adj_rd = this_rd;
   4359 
   4360         if (adj_rd < best_tx_rd[i])
   4361           best_tx_rd[i] = adj_rd;
   4362       }
   4363     }
   4364 
   4365     if (early_term)
   4366       break;
   4367 
   4368     if (x->skip && !comp_pred)
   4369       break;
   4370   }
   4371 
   4372   if (best_rd >= best_rd_so_far)
   4373     return INT64_MAX;
   4374 
   4375   // If we used an estimate for the uv intra rd in the loop above...
   4376   if (cpi->sf.use_uv_intra_rd_estimate) {
   4377     // Do Intra UV best rd mode selection if best mode choice above was intra.
   4378     if (vp9_ref_order[best_mode_index].ref_frame[0] == INTRA_FRAME) {
   4379       TX_SIZE uv_tx_size;
   4380       *mbmi = best_mbmode;
   4381       uv_tx_size = get_uv_tx_size(mbmi);
   4382       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
   4383                               &rate_uv_tokenonly[uv_tx_size],
   4384                               &dist_uv[uv_tx_size],
   4385                               &skip_uv[uv_tx_size],
   4386                               BLOCK_8X8, uv_tx_size);
   4387     }
   4388   }
   4389 
   4390   if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
   4391     *returnrate = INT_MAX;
   4392     *returndistortion = INT64_MAX;
   4393     return best_rd;
   4394   }
   4395 
   4396   assert((cm->interp_filter == SWITCHABLE) ||
   4397          (cm->interp_filter == best_mbmode.interp_filter) ||
   4398          !is_inter_block(&best_mbmode));
   4399 
   4400   // Updating rd_thresh_freq_fact[] here means that the different
   4401   // partition/block sizes are handled independently based on the best
   4402   // choice for the current partition. It may well be better to keep a scaled
   4403   // best rd so far value and update rd_thresh_freq_fact based on the mode/size
   4404   // combination that wins out.
   4405   if (cpi->sf.adaptive_rd_thresh) {
   4406     for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
   4407       int *const fact = &cpi->rd_thresh_freq_sub8x8[bsize][mode_index];
   4408 
   4409       if (mode_index == best_mode_index) {
   4410         *fact -= (*fact >> 3);
   4411       } else {
   4412         *fact = MIN(*fact + RD_THRESH_INC,
   4413                     cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT);
   4414       }
   4415     }
   4416   }
   4417 
   4418   // macroblock modes
   4419   *mbmi = best_mbmode;
   4420   x->skip |= best_skip2;
   4421   if (!is_inter_block(&best_mbmode)) {
   4422     for (i = 0; i < 4; i++)
   4423       xd->mi[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
   4424   } else {
   4425     for (i = 0; i < 4; ++i)
   4426       vpx_memcpy(&xd->mi[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
   4427 
   4428     mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
   4429     mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
   4430   }
   4431 
   4432   for (i = 0; i < REFERENCE_MODES; ++i) {
   4433     if (best_pred_rd[i] == INT64_MAX)
   4434       best_pred_diff[i] = INT_MIN;
   4435     else
   4436       best_pred_diff[i] = best_rd - best_pred_rd[i];
   4437   }
   4438 
   4439   if (!x->skip) {
   4440     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
   4441       if (best_filter_rd[i] == INT64_MAX)
   4442         best_filter_diff[i] = 0;
   4443       else
   4444         best_filter_diff[i] = best_rd - best_filter_rd[i];
   4445     }
   4446     if (cm->interp_filter == SWITCHABLE)
   4447       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
   4448   } else {
   4449     vp9_zero(best_filter_diff);
   4450   }
   4451 
   4452   if (!x->skip) {
   4453     for (i = 0; i < TX_MODES; i++) {
   4454       if (best_tx_rd[i] == INT64_MAX)
   4455         best_tx_diff[i] = 0;
   4456       else
   4457         best_tx_diff[i] = best_rd - best_tx_rd[i];
   4458     }
   4459   } else {
   4460     vp9_zero(best_tx_diff);
   4461   }
   4462 
   4463   set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
   4464   store_coding_context(x, ctx, best_mode_index,
   4465                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
   4466                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
   4467                                       mbmi->ref_frame[1]][0],
   4468                        best_pred_diff, best_tx_diff, best_filter_diff);
   4469 
   4470   return best_rd;
   4471 }
   4472