Home | History | Annotate | Download | only in encoder
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <stdio.h>
     12 #include <math.h>
     13 #include <limits.h>
     14 #include <assert.h>
     15 
     16 #include "vp9/common/vp9_pragmas.h"
     17 #include "vp9/encoder/vp9_tokenize.h"
     18 #include "vp9/encoder/vp9_treewriter.h"
     19 #include "vp9/encoder/vp9_onyx_int.h"
     20 #include "vp9/encoder/vp9_modecosts.h"
     21 #include "vp9/encoder/vp9_encodeintra.h"
     22 #include "vp9/common/vp9_entropymode.h"
     23 #include "vp9/common/vp9_reconinter.h"
     24 #include "vp9/common/vp9_reconintra.h"
     25 #include "vp9/common/vp9_findnearmv.h"
     26 #include "vp9/common/vp9_quant_common.h"
     27 #include "vp9/encoder/vp9_encodemb.h"
     28 #include "vp9/encoder/vp9_quantize.h"
     29 #include "vp9/encoder/vp9_variance.h"
     30 #include "vp9/encoder/vp9_mcomp.h"
     31 #include "vp9/encoder/vp9_rdopt.h"
     32 #include "vp9/encoder/vp9_ratectrl.h"
     33 #include "vpx_mem/vpx_mem.h"
     34 #include "vp9/common/vp9_systemdependent.h"
     35 #include "vp9/encoder/vp9_encodemv.h"
     36 #include "vp9/common/vp9_seg_common.h"
     37 #include "vp9/common/vp9_pred_common.h"
     38 #include "vp9/common/vp9_entropy.h"
     39 #include "./vp9_rtcd.h"
     40 #include "vp9/common/vp9_mvref_common.h"
     41 #include "vp9/common/vp9_common.h"
     42 
     43 #define INVALID_MV 0x80008000
     44 
     45 /* Factor to weigh the rate for switchable interp filters */
     46 #define SWITCHABLE_INTERP_RATE_FACTOR 1
     47 
     48 #define LAST_FRAME_MODE_MASK    0xFFEDCD60
     49 #define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
     50 #define ALT_REF_MODE_MASK       0xFFC648D0
     51 
     52 #define MIN_EARLY_TERM_INDEX    3
     53 
// Table of candidate (prediction mode, reference frame, second reference
// frame) combinations, MAX_MODES entries long. A second reference of NONE
// marks a single-reference (or intra) candidate; any other value marks a
// compound candidate using two reference frames.
// NOTE(review): the entry index presumably matches the index used for
// cpi->sf.thresh_mult[] and cpi->rd_threshes[][][] -- confirm against the
// mode search loop.
const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
  {NEARESTMV, LAST_FRAME,   NONE},
  {NEARESTMV, ALTREF_FRAME, NONE},
  {NEARESTMV, GOLDEN_FRAME, NONE},

  {DC_PRED,   INTRA_FRAME,  NONE},

  {NEWMV,     LAST_FRAME,   NONE},
  {NEWMV,     ALTREF_FRAME, NONE},
  {NEWMV,     GOLDEN_FRAME, NONE},

  {NEARMV,    LAST_FRAME,   NONE},
  {NEARMV,    ALTREF_FRAME, NONE},
  {NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},

  {TM_PRED,   INTRA_FRAME,  NONE},

  {NEARMV,    LAST_FRAME,   ALTREF_FRAME},
  {NEWMV,     LAST_FRAME,   ALTREF_FRAME},
  {NEARMV,    GOLDEN_FRAME, NONE},
  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},

  {ZEROMV,    LAST_FRAME,   NONE},
  {ZEROMV,    GOLDEN_FRAME, NONE},
  {ZEROMV,    ALTREF_FRAME, NONE},
  {ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},

  {H_PRED,    INTRA_FRAME,  NONE},
  {V_PRED,    INTRA_FRAME,  NONE},
  {D135_PRED, INTRA_FRAME,  NONE},
  {D207_PRED, INTRA_FRAME,  NONE},
  {D153_PRED, INTRA_FRAME,  NONE},
  {D63_PRED,  INTRA_FRAME,  NONE},
  {D117_PRED, INTRA_FRAME,  NONE},
  {D45_PRED,  INTRA_FRAME,  NONE},
};
     93 
// Table of candidate (reference frame, second reference frame) pairs,
// MAX_REFS entries long: three single-reference entries, two compound
// entries that pair a frame with ALTREF_FRAME, and finally intra.
// NOTE(review): presumably indexed in step with cpi->sf.thresh_mult_sub8x8[]
// and cpi->rd_thresh_sub8x8[][][] -- confirm against the sub-8x8 search loop.
const REF_DEFINITION vp9_ref_order[MAX_REFS] = {
  {LAST_FRAME,   NONE},
  {GOLDEN_FRAME, NONE},
  {ALTREF_FRAME, NONE},
  {LAST_FRAME,   ALTREF_FRAME},
  {GOLDEN_FRAME, ALTREF_FRAME},
  {INTRA_FRAME,  NONE},
};
    102 
// The baseline rd thresholds for breaking out of the rd loop for
// certain modes are assumed to be based on 8x8 blocks.
// This table is used to correct for block size, one entry per BLOCK_SIZE.
// The factors are stored multiplied by 4 (2 = x0.5, 4 = x1, 32 = x8 etc.);
// set_block_thresholds() divides by 4 after applying them.
static int rd_thresh_block_size_factor[BLOCK_SIZES] =
  {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};
    109 
    110 #define RD_THRESH_MAX_FACT 64
    111 #define RD_THRESH_INC      1
    112 #define RD_THRESH_POW      1.25
    113 #define RD_MULT_EPB_RATIO  64
    114 
    115 #define MV_COST_WEIGHT      108
    116 #define MV_COST_WEIGHT_SUB  120
    117 
    118 static void fill_token_costs(vp9_coeff_cost *c,
    119                              vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
    120   int i, j, k, l;
    121   TX_SIZE t;
    122   for (t = TX_4X4; t <= TX_32X32; t++)
    123     for (i = 0; i < BLOCK_TYPES; i++)
    124       for (j = 0; j < REF_TYPES; j++)
    125         for (k = 0; k < COEF_BANDS; k++)
    126           for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
    127             vp9_prob probs[ENTROPY_NODES];
    128             vp9_model_to_full_probs(p[t][i][j][k][l], probs);
    129             vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
    130                             vp9_coef_tree);
    131             vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
    132                                  vp9_coef_tree);
    133             assert(c[t][i][j][k][0][l][DCT_EOB_TOKEN] ==
    134                    c[t][i][j][k][1][l][DCT_EOB_TOKEN]);
    135           }
    136 }
    137 
// Boost factors (in 1/16 units) applied to the rd multiplier by
// vp9_compute_rd_mult() in the second pass on non-key frames. The table is
// indexed by cpi->twopass.next_iiratio, with values above 31 using entry 31.
// Only the first five entries are non-zero.
static const int rd_iifactor[32] = {
  4, 4, 3, 2, 1, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
};
    144 
    145 // 3* dc_qlookup[Q]*dc_qlookup[Q];
    146 
    147 /* values are now correlated to quantizer */
    148 static int sad_per_bit16lut[QINDEX_RANGE];
    149 static int sad_per_bit4lut[QINDEX_RANGE];
    150 
    151 void vp9_init_me_luts() {
    152   int i;
    153 
    154   // Initialize the sad lut tables using a formulaic calculation for now
    155   // This is to make it easier to resolve the impact of experimental changes
    156   // to the quantizer tables.
    157   for (i = 0; i < QINDEX_RANGE; i++) {
    158     sad_per_bit16lut[i] =
    159       (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
    160     sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
    161   }
    162 }
    163 
    164 int vp9_compute_rd_mult(VP9_COMP *cpi, int qindex) {
    165   const int q = vp9_dc_quant(qindex, 0);
    166   // TODO(debargha): Adjust the function below
    167   int rdmult = 88 * q * q / 25;
    168   if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
    169     if (cpi->twopass.next_iiratio > 31)
    170       rdmult += (rdmult * rd_iifactor[31]) >> 4;
    171     else
    172       rdmult += (rdmult * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
    173   }
    174   return rdmult;
    175 }
    176 
    177 static int compute_rd_thresh_factor(int qindex) {
    178   int q;
    179   // TODO(debargha): Adjust the function below
    180   q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12);
    181   if (q < 8)
    182     q = 8;
    183   return q;
    184 }
    185 
    186 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
    187   cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
    188   cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
    189 }
    190 
    191 static void set_block_thresholds(VP9_COMP *cpi) {
    192   int i, bsize, segment_id;
    193   VP9_COMMON *cm = &cpi->common;
    194 
    195   for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) {
    196     int q;
    197     int segment_qindex = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
    198     segment_qindex = clamp(segment_qindex + cm->y_dc_delta_q, 0, MAXQ);
    199     q = compute_rd_thresh_factor(segment_qindex);
    200 
    201     for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
    202       // Threshold here seem unecessarily harsh but fine given actual
    203       // range of values used for cpi->sf.thresh_mult[]
    204       int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);
    205 
    206       for (i = 0; i < MAX_MODES; ++i) {
    207         if (cpi->sf.thresh_mult[i] < thresh_max) {
    208           cpi->rd_threshes[segment_id][bsize][i] =
    209               cpi->sf.thresh_mult[i] * q *
    210               rd_thresh_block_size_factor[bsize] / 4;
    211         } else {
    212           cpi->rd_threshes[segment_id][bsize][i] = INT_MAX;
    213         }
    214       }
    215 
    216       for (i = 0; i < MAX_REFS; ++i) {
    217         if (cpi->sf.thresh_mult_sub8x8[i] < thresh_max) {
    218           cpi->rd_thresh_sub8x8[segment_id][bsize][i] =
    219               cpi->sf.thresh_mult_sub8x8[i] * q *
    220               rd_thresh_block_size_factor[bsize] / 4;
    221         } else {
    222           cpi->rd_thresh_sub8x8[segment_id][bsize][i] = INT_MAX;
    223         }
    224       }
    225     }
    226   }
    227 }
    228 
    229 void vp9_initialize_rd_consts(VP9_COMP *cpi) {
    230   VP9_COMMON *cm = &cpi->common;
    231   int qindex, i;
    232 
    233   vp9_clear_system_state();  // __asm emms;
    234 
    235   // Further tests required to see if optimum is different
    236   // for key frames, golden frames and arf frames.
    237   // if (cpi->common.refresh_golden_frame ||
    238   //     cpi->common.refresh_alt_ref_frame)
    239   qindex = clamp(cm->base_qindex + cm->y_dc_delta_q, 0, MAXQ);
    240 
    241   cpi->RDDIV = RDDIV_BITS;  // in bits (to multiply D by 128)
    242   cpi->RDMULT = vp9_compute_rd_mult(cpi, qindex);
    243 
    244   cpi->mb.errorperbit = cpi->RDMULT / RD_MULT_EPB_RATIO;
    245   cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);
    246 
    247   vp9_set_speed_features(cpi);
    248 
    249   cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
    250                               cm->frame_type != KEY_FRAME) ?
    251                              0 : 1;
    252 
    253   set_block_thresholds(cpi);
    254 
    255   fill_token_costs(cpi->mb.token_costs, cm->fc.coef_probs);
    256 
    257   for (i = 0; i < PARTITION_CONTEXTS; i++)
    258     vp9_cost_tokens(cpi->mb.partition_cost[i], get_partition_probs(cm, i),
    259                     vp9_partition_tree);
    260 
    261   /*rough estimate for costing*/
    262   vp9_init_mode_costs(cpi);
    263 
    264   if (!frame_is_intra_only(cm)) {
    265     vp9_build_nmv_cost_table(
    266         cpi->mb.nmvjointcost,
    267         cm->allow_high_precision_mv ? cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
    268         &cm->fc.nmvc,
    269         cm->allow_high_precision_mv, 1, 1);
    270 
    271     for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
    272       MB_PREDICTION_MODE m;
    273 
    274       for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
    275         cpi->mb.inter_mode_cost[i][INTER_OFFSET(m)] =
    276             cost_token(vp9_inter_mode_tree,
    277                        cm->fc.inter_mode_probs[i],
    278                        &vp9_inter_mode_encodings[INTER_OFFSET(m)]);
    279     }
    280   }
    281 }
    282 
    283 static INLINE void linear_interpolate2(double x, int ntab, int inv_step,
    284                                        const double *tab1, const double *tab2,
    285                                        double *v1, double *v2) {
    286   double y = x * inv_step;
    287   int d = (int) y;
    288   if (d >= ntab - 1) {
    289     *v1 = tab1[ntab - 1];
    290     *v2 = tab2[ntab - 1];
    291   } else {
    292     double a = y - d;
    293     *v1 = tab1[d] * (1 - a) + tab1[d + 1] * a;
    294     *v2 = tab2[d] * (1 - a) + tab2[d + 1] * a;
    295   }
    296 }
    297 
    298 static void model_rd_norm(double x, double *R, double *D) {
    299   static const int inv_tab_step = 8;
    300   static const int tab_size = 120;
    301   // NOTE: The tables below must be of the same size
    302   //
    303   // Normalized rate
    304   // This table models the rate for a Laplacian source
    305   // source with given variance when quantized with a uniform quantizer
    306   // with given stepsize. The closed form expression is:
    307   // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
    308   // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
    309   // and H(x) is the binary entropy function.
    310   static const double rate_tab[] = {
    311     64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194,
    312     2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206,
    313     1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708,
    314     0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412,
    315     0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236,
    316     0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132,
    317     0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073,
    318     0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040,
    319     0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022,
    320     0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012,
    321     0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006,
    322     0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003,
    323     0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002,
    324     0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
    325     0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000,
    326   };
    327   // Normalized distortion
    328   // This table models the normalized distortion for a Laplacian source
    329   // source with given variance when quantized with a uniform quantizer
    330   // with given stepsize. The closed form expression is:
    331   // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
    332   // where x = qpstep / sqrt(variance)
    333   // Note the actual distortion is Dn * variance.
    334   static const double dist_tab[] = {
    335     0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061,
    336     0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242,
    337     0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458,
    338     0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645,
    339     0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780,
    340     0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870,
    341     0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925,
    342     0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957,
    343     0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976,
    344     0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987,
    345     0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993,
    346     0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996,
    347     0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998,
    348     0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999,
    349     0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000,
    350   };
    351   /*
    352   assert(sizeof(rate_tab) == tab_size * sizeof(rate_tab[0]);
    353   assert(sizeof(dist_tab) == tab_size * sizeof(dist_tab[0]);
    354   assert(sizeof(rate_tab) == sizeof(dist_tab));
    355   */
    356   assert(x >= 0.0);
    357   linear_interpolate2(x, tab_size, inv_tab_step,
    358                       rate_tab, dist_tab, R, D);
    359 }
    360 
    361 static void model_rd_from_var_lapndz(int var, int n, int qstep,
    362                                      int *rate, int64_t *dist) {
    363   // This function models the rate and distortion for a Laplacian
    364   // source with given variance when quantized with a uniform quantizer
    365   // with given stepsize. The closed form expressions are in:
    366   // Hang and Chen, "Source Model for transform video coder and its
    367   // application - Part I: Fundamental Theory", IEEE Trans. Circ.
    368   // Sys. for Video Tech., April 1997.
    369   vp9_clear_system_state();
    370   if (var == 0 || n == 0) {
    371     *rate = 0;
    372     *dist = 0;
    373   } else {
    374     double D, R;
    375     double s2 = (double) var / n;
    376     double x = qstep / sqrt(s2);
    377     model_rd_norm(x, &R, &D);
    378     *rate = (int)((n << 8) * R + 0.5);
    379     *dist = (int)(var * D + 0.5);
    380   }
    381   vp9_clear_system_state();
    382 }
    383 
    384 static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
    385                             MACROBLOCK *x, MACROBLOCKD *xd,
    386                             int *out_rate_sum, int64_t *out_dist_sum) {
    387   // Note our transform coeffs are 8 times an orthogonal transform.
    388   // Hence quantizer step is also 8 times. To get effective quantizer
    389   // we need to divide by 8 before sending to modeling function.
    390   int i, rate_sum = 0, dist_sum = 0;
    391 
    392   for (i = 0; i < MAX_MB_PLANE; ++i) {
    393     struct macroblock_plane *const p = &x->plane[i];
    394     struct macroblockd_plane *const pd = &xd->plane[i];
    395     const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
    396     unsigned int sse;
    397     int rate;
    398     int64_t dist;
    399     (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
    400                               pd->dst.buf, pd->dst.stride, &sse);
    401     // sse works better than var, since there is no dc prediction used
    402     model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
    403                              pd->dequant[1] >> 3, &rate, &dist);
    404 
    405     rate_sum += rate;
    406     dist_sum += (int)dist;
    407   }
    408 
    409   *out_rate_sum = rate_sum;
    410   *out_dist_sum = dist_sum << 4;
    411 }
    412 
    413 static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
    414                                  TX_SIZE tx_size,
    415                                  MACROBLOCK *x, MACROBLOCKD *xd,
    416                                  int *out_rate_sum, int64_t *out_dist_sum,
    417                                  int *out_skip) {
    418   int j, k;
    419   BLOCK_SIZE bs;
    420   struct macroblock_plane *const p = &x->plane[0];
    421   struct macroblockd_plane *const pd = &xd->plane[0];
    422   const int width = 4 << num_4x4_blocks_wide_lookup[bsize];
    423   const int height = 4 << num_4x4_blocks_high_lookup[bsize];
    424   int rate_sum = 0;
    425   int64_t dist_sum = 0;
    426   const int t = 4 << tx_size;
    427 
    428   if (tx_size == TX_4X4) {
    429     bs = BLOCK_4X4;
    430   } else if (tx_size == TX_8X8) {
    431     bs = BLOCK_8X8;
    432   } else if (tx_size == TX_16X16) {
    433     bs = BLOCK_16X16;
    434   } else if (tx_size == TX_32X32) {
    435     bs = BLOCK_32X32;
    436   } else {
    437     assert(0);
    438   }
    439 
    440   *out_skip = 1;
    441   for (j = 0; j < height; j += t) {
    442     for (k = 0; k < width; k += t) {
    443       int rate;
    444       int64_t dist;
    445       unsigned int sse;
    446       cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride,
    447                          &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride,
    448                          &sse);
    449       // sse works better than var, since there is no dc prediction used
    450       model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, &rate, &dist);
    451       rate_sum += rate;
    452       dist_sum += dist;
    453       *out_skip &= (rate < 1024);
    454     }
    455   }
    456 
    457   *out_rate_sum = rate_sum;
    458   *out_dist_sum = dist_sum << 4;
    459 }
    460 
    461 int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
    462                           intptr_t block_size, int64_t *ssz) {
    463   int i;
    464   int64_t error = 0, sqcoeff = 0;
    465 
    466   for (i = 0; i < block_size; i++) {
    467     int this_diff = coeff[i] - dqcoeff[i];
    468     error += (unsigned)this_diff * this_diff;
    469     sqcoeff += (unsigned) coeff[i] * coeff[i];
    470   }
    471 
    472   *ssz = sqcoeff;
    473   return error;
    474 }
    475 
/* Number of coefficients in each coefficient band, per transform size.
 * The trailing '0' is a terminator which is used inside cost_coeffs() to
 * decide whether to include the cost of a trailing EOB node or not (i.e. we
 * can skip this if the last coefficient in this transform block, e.g. the
 * 16th coefficient in a 4x4 block or the 64th coefficient in an 8x8 block,
 * was non-zero). Each row has 7 explicit entries; the 8th element is
 * implicitly zero-initialized. */
static const int16_t band_counts[TX_SIZES][8] = {
  { 1, 2, 3, 4,  3,   16 - 13, 0 },
  { 1, 2, 3, 4, 11,   64 - 21, 0 },
  { 1, 2, 3, 4, 11,  256 - 21, 0 },
  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
    487 
// Returns the token cost of coding the quantized coefficients of one
// transform block. `A` and `L` point to the above/left entropy contexts for
// the block; they are read to form the initial context and both are
// overwritten with 1 if the block has any coded coefficients (eob > 0) and
// 0 otherwise. `scan` gives the coefficient scan order and `nb` the
// neighbor table passed to get_coef_context().
static INLINE int cost_coeffs(MACROBLOCK *x,
                              int plane, int block,
                              ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                              TX_SIZE tx_size,
                              const int16_t *scan, const int16_t *nb) {
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
  struct macroblockd_plane *pd = &xd->plane[plane];
  const PLANE_TYPE type = pd->plane_type;
  // Start at band 1: the single coefficient of band 0 (the DC coefficient)
  // is costed separately before the loop below.
  const int16_t *band_count = &band_counts[tx_size][1];
  const int eob = pd->eobs[block];
  const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
  // 0 for intra blocks, 1 for inter blocks.
  const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
  // Cost table for this transform size / plane type / ref type; the
  // pointer is advanced by one each time the scan moves to the next band.
  unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
                   x->token_costs[tx_size][type][ref];
  // Reduce the neighboring contexts to 0/1 flags.
  const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
  uint8_t *p_tok = x->token_cache;
  int pt = combine_entropy_contexts(above_ec, left_ec);
  int c, cost;

  // Check for consistency of tx_size with mode info
  assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->tx_size == tx_size
                                      : get_uv_tx_size(mbmi) == tx_size);

  if (eob == 0) {
    // single eob token
    cost = token_costs[0][0][pt][DCT_EOB_TOKEN];
    c = 0;
  } else {
    int band_left = *band_count++;

    // dc token
    int v = qcoeff_ptr[0];
    int prev_t = vp9_dct_value_tokens_ptr[v].token;
    cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
    p_tok[0] = vp9_pt_energy_class[prev_t];
    ++token_costs;

    // ac tokens
    for (c = 1; c < eob; c++) {
      const int rc = scan[c];
      int t;

      v = qcoeff_ptr[rc];
      t = vp9_dct_value_tokens_ptr[v].token;
      pt = get_coef_context(nb, p_tok, c);
      // The second index is 1 when the previous token index was 0.
      cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
      p_tok[rc] = vp9_pt_energy_class[t];
      prev_t = t;
      // Band exhausted: load the next band's count and cost table.
      if (!--band_left) {
        band_left = *band_count++;
        ++token_costs;
      }
    }

    // eob token
    // band_left is 0 only when the terminator entry of band_counts[] was
    // loaded, i.e. every coefficient position was coded, so no EOB token
    // is costed in that case.
    if (band_left) {
      pt = get_coef_context(nb, p_tok, c);
      cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
    }
  }

  // is eob first coefficient;
  // c is 0 when eob == 0 and equals eob otherwise.
  *A = *L = (c > 0);

  return cost;
}
    555 
    556 static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
    557   const int ss_txfrm_size = tx_size << 1;
    558   struct rdcost_block_args* args = arg;
    559   MACROBLOCK* const x = args->x;
    560   MACROBLOCKD* const xd = &x->e_mbd;
    561   struct macroblock_plane *const p = &x->plane[plane];
    562   struct macroblockd_plane *const pd = &xd->plane[plane];
    563   int64_t this_sse;
    564   int shift = args->tx_size == TX_32X32 ? 0 : 2;
    565   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
    566   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    567   args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
    568                                &this_sse) >> shift;
    569   args->sse  = this_sse >> shift;
    570 
    571   if (x->skip_encode &&
    572       xd->mi_8x8[0]->mbmi.ref_frame[0] == INTRA_FRAME) {
    573     // TODO(jingning): tune the model to better capture the distortion.
    574     int64_t p = (pd->dequant[1] * pd->dequant[1] *
    575                     (1 << ss_txfrm_size)) >> (shift + 2);
    576     args->dist += (p >> 4);
    577     args->sse  += p;
    578   }
    579 }
    580 
    581 static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
    582                        TX_SIZE tx_size, void *arg) {
    583   struct rdcost_block_args* args = arg;
    584 
    585   int x_idx, y_idx;
    586   txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx);
    587 
    588   args->rate = cost_coeffs(args->x, plane, block, args->t_above + x_idx,
    589                            args->t_left + y_idx, args->tx_size,
    590                            args->scan, args->nb);
    591 }
    592 
    593 static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
    594                            TX_SIZE tx_size, void *arg) {
    595   struct rdcost_block_args *args = arg;
    596   MACROBLOCK *const x = args->x;
    597   MACROBLOCKD *const xd = &x->e_mbd;
    598   struct encode_b_args encode_args = {x, NULL};
    599   int64_t rd1, rd2, rd;
    600 
    601   if (args->skip)
    602     return;
    603 
    604   if (!is_inter_block(&xd->mi_8x8[0]->mbmi))
    605     vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args);
    606   else
    607     vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args);
    608 
    609   dist_block(plane, block, tx_size, args);
    610   rate_block(plane, block, plane_bsize, tx_size, args);
    611   rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
    612   rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
    613 
    614   // TODO(jingning): temporarily enabled only for luma component
    615   rd = MIN(rd1, rd2);
    616   if (!xd->lossless && plane == 0)
    617     x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
    618 
    619   args->this_rate += args->rate;
    620   args->this_dist += args->dist;
    621   args->this_sse  += args->sse;
    622   args->this_rd += rd;
    623 
    624   if (args->this_rd > args->best_rd) {
    625     args->skip = 1;
    626     return;
    627   }
    628 }
    629 
    630 void vp9_get_entropy_contexts(TX_SIZE tx_size,
    631     ENTROPY_CONTEXT t_above[16], ENTROPY_CONTEXT t_left[16],
    632     const ENTROPY_CONTEXT *above, const ENTROPY_CONTEXT *left,
    633     int num_4x4_w, int num_4x4_h) {
    634   int i;
    635   switch (tx_size) {
    636     case TX_4X4:
    637       vpx_memcpy(t_above, above, sizeof(ENTROPY_CONTEXT) * num_4x4_w);
    638       vpx_memcpy(t_left, left, sizeof(ENTROPY_CONTEXT) * num_4x4_h);
    639       break;
    640     case TX_8X8:
    641       for (i = 0; i < num_4x4_w; i += 2)
    642         t_above[i] = !!*(const uint16_t *)&above[i];
    643       for (i = 0; i < num_4x4_h; i += 2)
    644         t_left[i] = !!*(const uint16_t *)&left[i];
    645       break;
    646     case TX_16X16:
    647       for (i = 0; i < num_4x4_w; i += 4)
    648         t_above[i] = !!*(const uint32_t *)&above[i];
    649       for (i = 0; i < num_4x4_h; i += 4)
    650         t_left[i] = !!*(const uint32_t *)&left[i];
    651       break;
    652     case TX_32X32:
    653       for (i = 0; i < num_4x4_w; i += 8)
    654         t_above[i] = !!*(const uint64_t *)&above[i];
    655       for (i = 0; i < num_4x4_h; i += 8)
    656         t_left[i] = !!*(const uint64_t *)&left[i];
    657       break;
    658     default:
    659       assert(!"Invalid transform size.");
    660   }
    661 }
    662 
    663 static void init_rdcost_stack(MACROBLOCK *x, TX_SIZE tx_size,
    664                               const int num_4x4_w, const int num_4x4_h,
    665                               const int64_t ref_rdcost,
    666                               struct rdcost_block_args *arg) {
    667   vpx_memset(arg, 0, sizeof(struct rdcost_block_args));
    668   arg->x = x;
    669   arg->tx_size = tx_size;
    670   arg->bw = num_4x4_w;
    671   arg->bh = num_4x4_h;
    672   arg->best_rd = ref_rdcost;
    673 }
    674 
    675 static void txfm_rd_in_plane(MACROBLOCK *x,
    676                              struct rdcost_block_args *rd_stack,
    677                              int *rate, int64_t *distortion,
    678                              int *skippable, int64_t *sse,
    679                              int64_t ref_best_rd, int plane,
    680                              BLOCK_SIZE bsize, TX_SIZE tx_size) {
    681   MACROBLOCKD *const xd = &x->e_mbd;
    682   struct macroblockd_plane *const pd = &xd->plane[plane];
    683   const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
    684   const int num_4x4_w = num_4x4_blocks_wide_lookup[bs];
    685   const int num_4x4_h = num_4x4_blocks_high_lookup[bs];
    686 
    687   init_rdcost_stack(x, tx_size, num_4x4_w, num_4x4_h,
    688                     ref_best_rd, rd_stack);
    689   if (plane == 0)
    690     xd->mi_8x8[0]->mbmi.tx_size = tx_size;
    691 
    692   vp9_get_entropy_contexts(tx_size, rd_stack->t_above, rd_stack->t_left,
    693                            pd->above_context, pd->left_context,
    694                            num_4x4_w, num_4x4_h);
    695 
    696   get_scan(xd, tx_size, pd->plane_type, 0, &rd_stack->scan, &rd_stack->nb);
    697 
    698   foreach_transformed_block_in_plane(xd, bsize, plane,
    699                                      block_yrd_txfm, rd_stack);
    700   if (rd_stack->skip) {
    701     *rate       = INT_MAX;
    702     *distortion = INT64_MAX;
    703     *sse        = INT64_MAX;
    704     *skippable  = 0;
    705   } else {
    706     *distortion = rd_stack->this_dist;
    707     *rate       = rd_stack->this_rate;
    708     *sse        = rd_stack->this_sse;
    709     *skippable  = vp9_is_skippable_in_plane(xd, bsize, plane);
    710   }
    711 }
    712 
    713 static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
    714                                      int *rate, int64_t *distortion,
    715                                      int *skip, int64_t *sse,
    716                                      int64_t ref_best_rd,
    717                                      BLOCK_SIZE bs) {
    718   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
    719   VP9_COMMON *const cm = &cpi->common;
    720   const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
    721   MACROBLOCKD *const xd = &x->e_mbd;
    722   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
    723 
    724   mbmi->tx_size = MIN(max_tx_size, largest_tx_size);
    725 
    726   txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
    727                    &sse[mbmi->tx_size], ref_best_rd, 0, bs,
    728                    mbmi->tx_size);
    729   cpi->tx_stepdown_count[0]++;
    730 }
    731 
    732 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
    733                                      int (*r)[2], int *rate,
    734                                      int64_t *d, int64_t *distortion,
    735                                      int *s, int *skip,
    736                                      int64_t tx_cache[TX_MODES],
    737                                      BLOCK_SIZE bs) {
    738   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
    739   VP9_COMMON *const cm = &cpi->common;
    740   MACROBLOCKD *const xd = &x->e_mbd;
    741   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
    742   vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
    743   int64_t rd[TX_SIZES][2];
    744   int n, m;
    745   int s0, s1;
    746 
    747   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
    748 
    749   for (n = TX_4X4; n <= max_tx_size; n++) {
    750     r[n][1] = r[n][0];
    751     if (r[n][0] == INT_MAX)
    752       continue;
    753     for (m = 0; m <= n - (n == max_tx_size); m++) {
    754       if (m == n)
    755         r[n][1] += vp9_cost_zero(tx_probs[m]);
    756       else
    757         r[n][1] += vp9_cost_one(tx_probs[m]);
    758     }
    759   }
    760 
    761   assert(skip_prob > 0);
    762   s0 = vp9_cost_bit(skip_prob, 0);
    763   s1 = vp9_cost_bit(skip_prob, 1);
    764 
    765   for (n = TX_4X4; n <= max_tx_size; n++) {
    766     if (d[n] == INT64_MAX) {
    767       rd[n][0] = rd[n][1] = INT64_MAX;
    768       continue;
    769     }
    770     if (s[n]) {
    771       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
    772     } else {
    773       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
    774       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
    775     }
    776   }
    777 
    778   if (max_tx_size == TX_32X32 &&
    779       (cm->tx_mode == ALLOW_32X32 ||
    780        (cm->tx_mode == TX_MODE_SELECT &&
    781         rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
    782         rd[TX_32X32][1] < rd[TX_4X4][1]))) {
    783     mbmi->tx_size = TX_32X32;
    784   } else if (max_tx_size >= TX_16X16 &&
    785              (cm->tx_mode == ALLOW_16X16 ||
    786               cm->tx_mode == ALLOW_32X32 ||
    787               (cm->tx_mode == TX_MODE_SELECT &&
    788                rd[TX_16X16][1] < rd[TX_8X8][1] &&
    789                rd[TX_16X16][1] < rd[TX_4X4][1]))) {
    790     mbmi->tx_size = TX_16X16;
    791   } else if (cm->tx_mode == ALLOW_8X8 ||
    792              cm->tx_mode == ALLOW_16X16 ||
    793              cm->tx_mode == ALLOW_32X32 ||
    794            (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
    795     mbmi->tx_size = TX_8X8;
    796   } else {
    797     mbmi->tx_size = TX_4X4;
    798   }
    799 
    800   *distortion = d[mbmi->tx_size];
    801   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
    802   *skip       = s[mbmi->tx_size];
    803 
    804   tx_cache[ONLY_4X4] = rd[TX_4X4][0];
    805   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
    806   tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
    807   tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
    808   if (max_tx_size == TX_32X32 &&
    809       rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
    810       rd[TX_32X32][1] < rd[TX_4X4][1])
    811     tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
    812   else if (max_tx_size >= TX_16X16 &&
    813            rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
    814     tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
    815   else
    816     tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
    817                                  rd[TX_4X4][1] : rd[TX_8X8][1];
    818 
    819   if (max_tx_size == TX_32X32 &&
    820       rd[TX_32X32][1] < rd[TX_16X16][1] &&
    821       rd[TX_32X32][1] < rd[TX_8X8][1] &&
    822       rd[TX_32X32][1] < rd[TX_4X4][1]) {
    823     cpi->tx_stepdown_count[0]++;
    824   } else if (max_tx_size >= TX_16X16 &&
    825              rd[TX_16X16][1] < rd[TX_8X8][1] &&
    826              rd[TX_16X16][1] < rd[TX_4X4][1]) {
    827     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
    828   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
    829     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
    830   } else {
    831     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
    832   }
    833 }
    834 
    835 static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
    836                                           int (*r)[2], int *rate,
    837                                           int64_t *d, int64_t *distortion,
    838                                           int *s, int *skip, int64_t *sse,
    839                                           int64_t ref_best_rd,
    840                                           BLOCK_SIZE bs) {
    841   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
    842   VP9_COMMON *const cm = &cpi->common;
    843   MACROBLOCKD *const xd = &x->e_mbd;
    844   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
    845   vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
    846   int64_t rd[TX_SIZES][2];
    847   int n, m;
    848   int s0, s1;
    849   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
    850   // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
    851 
    852   const vp9_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc.tx_probs);
    853 
    854   // for (n = TX_4X4; n <= max_txfm_size; n++)
    855   //   r[n][0] = (r[n][0] * scale_r[n]);
    856 
    857   for (n = TX_4X4; n <= max_tx_size; n++) {
    858     r[n][1] = r[n][0];
    859     for (m = 0; m <= n - (n == max_tx_size); m++) {
    860       if (m == n)
    861         r[n][1] += vp9_cost_zero(tx_probs[m]);
    862       else
    863         r[n][1] += vp9_cost_one(tx_probs[m]);
    864     }
    865   }
    866 
    867   assert(skip_prob > 0);
    868   s0 = vp9_cost_bit(skip_prob, 0);
    869   s1 = vp9_cost_bit(skip_prob, 1);
    870 
    871   for (n = TX_4X4; n <= max_tx_size; n++) {
    872     if (s[n]) {
    873       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
    874     } else {
    875       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
    876       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
    877     }
    878   }
    879   for (n = TX_4X4; n <= max_tx_size; n++) {
    880     rd[n][0] = (int64_t)(scale_rd[n] * rd[n][0]);
    881     rd[n][1] = (int64_t)(scale_rd[n] * rd[n][1]);
    882   }
    883 
    884   if (max_tx_size == TX_32X32 &&
    885       (cm->tx_mode == ALLOW_32X32 ||
    886        (cm->tx_mode == TX_MODE_SELECT &&
    887         rd[TX_32X32][1] <= rd[TX_16X16][1] &&
    888         rd[TX_32X32][1] <= rd[TX_8X8][1] &&
    889         rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
    890     mbmi->tx_size = TX_32X32;
    891   } else if (max_tx_size >= TX_16X16 &&
    892              (cm->tx_mode == ALLOW_16X16 ||
    893               cm->tx_mode == ALLOW_32X32 ||
    894               (cm->tx_mode == TX_MODE_SELECT &&
    895                rd[TX_16X16][1] <= rd[TX_8X8][1] &&
    896                rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
    897     mbmi->tx_size = TX_16X16;
    898   } else if (cm->tx_mode == ALLOW_8X8 ||
    899              cm->tx_mode == ALLOW_16X16 ||
    900              cm->tx_mode == ALLOW_32X32 ||
    901            (cm->tx_mode == TX_MODE_SELECT &&
    902             rd[TX_8X8][1] <= rd[TX_4X4][1])) {
    903     mbmi->tx_size = TX_8X8;
    904   } else {
    905     mbmi->tx_size = TX_4X4;
    906   }
    907 
    908   // Actually encode using the chosen mode if a model was used, but do not
    909   // update the r, d costs
    910   txfm_rd_in_plane(x, &cpi->rdcost_stack, rate, distortion, skip,
    911                    &sse[mbmi->tx_size], ref_best_rd, 0, bs, mbmi->tx_size);
    912 
    913   if (max_tx_size == TX_32X32 &&
    914       rd[TX_32X32][1] <= rd[TX_16X16][1] &&
    915       rd[TX_32X32][1] <= rd[TX_8X8][1] &&
    916       rd[TX_32X32][1] <= rd[TX_4X4][1]) {
    917     cpi->tx_stepdown_count[0]++;
    918   } else if (max_tx_size >= TX_16X16 &&
    919              rd[TX_16X16][1] <= rd[TX_8X8][1] &&
    920              rd[TX_16X16][1] <= rd[TX_4X4][1]) {
    921     cpi->tx_stepdown_count[max_tx_size - TX_16X16]++;
    922   } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
    923     cpi->tx_stepdown_count[max_tx_size - TX_8X8]++;
    924   } else {
    925     cpi->tx_stepdown_count[max_tx_size - TX_4X4]++;
    926   }
    927 }
    928 
    929 static void super_block_yrd(VP9_COMP *cpi,
    930                             MACROBLOCK *x, int *rate, int64_t *distortion,
    931                             int *skip, int64_t *psse, BLOCK_SIZE bs,
    932                             int64_t txfm_cache[TX_MODES],
    933                             int64_t ref_best_rd) {
    934   int r[TX_SIZES][2], s[TX_SIZES];
    935   int64_t d[TX_SIZES], sse[TX_SIZES];
    936   MACROBLOCKD *xd = &x->e_mbd;
    937   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
    938   struct rdcost_block_args *rdcost_stack = &cpi->rdcost_stack;
    939   const int b_inter_mode = is_inter_block(mbmi);
    940 
    941   assert(bs == mbmi->sb_type);
    942   if (b_inter_mode)
    943     vp9_subtract_sby(x, bs);
    944 
    945   if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
    946       (cpi->sf.tx_size_search_method != USE_FULL_RD &&
    947        !b_inter_mode)) {
    948     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
    949     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
    950                              ref_best_rd, bs);
    951     if (psse)
    952       *psse = sse[mbmi->tx_size];
    953     return;
    954   }
    955 
    956   if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
    957       b_inter_mode) {
    958     if (bs >= BLOCK_32X32)
    959       model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
    960                            &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
    961     if (bs >= BLOCK_16X16)
    962       model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
    963                            &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
    964 
    965     model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
    966                          &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
    967 
    968     model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
    969                          &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
    970 
    971     choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
    972                                   skip, sse, ref_best_rd, bs);
    973   } else {
    974     if (bs >= BLOCK_32X32)
    975       txfm_rd_in_plane(x, rdcost_stack, &r[TX_32X32][0], &d[TX_32X32],
    976                        &s[TX_32X32], &sse[TX_32X32],
    977                        ref_best_rd, 0, bs, TX_32X32);
    978     if (bs >= BLOCK_16X16)
    979       txfm_rd_in_plane(x, rdcost_stack, &r[TX_16X16][0], &d[TX_16X16],
    980                        &s[TX_16X16], &sse[TX_16X16],
    981                        ref_best_rd, 0, bs, TX_16X16);
    982     txfm_rd_in_plane(x, rdcost_stack, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
    983                      &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
    984     txfm_rd_in_plane(x, rdcost_stack, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
    985                      &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
    986     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
    987                              skip, txfm_cache, bs);
    988   }
    989   if (psse)
    990     *psse = sse[mbmi->tx_size];
    991 }
    992 
    993 static int conditional_skipintra(MB_PREDICTION_MODE mode,
    994                                  MB_PREDICTION_MODE best_intra_mode) {
    995   if (mode == D117_PRED &&
    996       best_intra_mode != V_PRED &&
    997       best_intra_mode != D135_PRED)
    998     return 1;
    999   if (mode == D63_PRED &&
   1000       best_intra_mode != V_PRED &&
   1001       best_intra_mode != D45_PRED)
   1002     return 1;
   1003   if (mode == D207_PRED &&
   1004       best_intra_mode != H_PRED &&
   1005       best_intra_mode != D45_PRED)
   1006     return 1;
   1007   if (mode == D153_PRED &&
   1008       best_intra_mode != H_PRED &&
   1009       best_intra_mode != D135_PRED)
   1010     return 1;
   1011   return 0;
   1012 }
   1013 
   1014 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   1015                                      MB_PREDICTION_MODE *best_mode,
   1016                                      int *bmode_costs,
   1017                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
   1018                                      int *bestrate, int *bestratey,
   1019                                      int64_t *bestdistortion,
   1020                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
   1021   MB_PREDICTION_MODE mode;
   1022   MACROBLOCKD *xd = &x->e_mbd;
   1023   int64_t best_rd = rd_thresh;
   1024   int rate = 0;
   1025   int64_t distortion;
   1026   struct macroblock_plane *p = &x->plane[0];
   1027   struct macroblockd_plane *pd = &xd->plane[0];
   1028   const int src_stride = p->src.stride;
   1029   const int dst_stride = pd->dst.stride;
   1030   uint8_t *src_init = raster_block_offset_uint8(BLOCK_8X8, ib,
   1031                                                 p->src.buf, src_stride);
   1032   uint8_t *dst_init = raster_block_offset_uint8(BLOCK_8X8, ib,
   1033                                                 pd->dst.buf, dst_stride);
   1034   int16_t *src_diff, *coeff;
   1035 
   1036   ENTROPY_CONTEXT ta[2], tempa[2];
   1037   ENTROPY_CONTEXT tl[2], templ[2];
   1038 
   1039   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   1040   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   1041   int idx, idy;
   1042   uint8_t best_dst[8 * 8];
   1043 
   1044   assert(ib < 4);
   1045 
   1046   vpx_memcpy(ta, a, sizeof(ta));
   1047   vpx_memcpy(tl, l, sizeof(tl));
   1048   xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
   1049 
   1050   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
   1051     int64_t this_rd;
   1052     int ratey = 0;
   1053 
   1054     if (!(cpi->sf.intra_y_mode_mask[TX_4X4] & (1 << mode)))
   1055       continue;
   1056 
   1057     // Only do the oblique modes if the best so far is
   1058     // one of the neighboring directional modes
   1059     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
   1060       if (conditional_skipintra(mode, *best_mode))
   1061           continue;
   1062     }
   1063 
   1064     rate = bmode_costs[mode];
   1065     distortion = 0;
   1066 
   1067     vpx_memcpy(tempa, ta, sizeof(ta));
   1068     vpx_memcpy(templ, tl, sizeof(tl));
   1069 
   1070     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
   1071       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
   1072         int64_t ssz;
   1073         const int16_t *scan;
   1074         const int16_t *nb;
   1075         uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
   1076         uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
   1077         const int block = ib + idy * 2 + idx;
   1078         TX_TYPE tx_type;
   1079         xd->mi_8x8[0]->bmi[block].as_mode = mode;
   1080         src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
   1081         coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
   1082         vp9_predict_intra_block(xd, block, 1,
   1083                                 TX_4X4, mode,
   1084                                 x->skip_encode ? src : dst,
   1085                                 x->skip_encode ? src_stride : dst_stride,
   1086                                 dst, dst_stride);
   1087         vp9_subtract_block(4, 4, src_diff, 8,
   1088                            src, src_stride,
   1089                            dst, dst_stride);
   1090 
   1091         tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
   1092         get_scan_nb_4x4(tx_type, &scan, &nb);
   1093 
   1094         if (tx_type != DCT_DCT)
   1095           vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
   1096         else
   1097           x->fwd_txm4x4(src_diff, coeff, 8);
   1098 
   1099         vp9_regular_quantize_b_4x4(x, 4, block, scan, get_iscan_4x4(tx_type));
   1100 
   1101         ratey += cost_coeffs(x, 0, block,
   1102                              tempa + idx, templ + idy, TX_4X4, scan, nb);
   1103         distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
   1104                                       16, &ssz) >> 2;
   1105         if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
   1106           goto next;
   1107 
   1108         if (tx_type != DCT_DCT)
   1109           vp9_iht4x4_16_add(BLOCK_OFFSET(pd->dqcoeff, block),
   1110                                dst, pd->dst.stride, tx_type);
   1111         else
   1112           xd->itxm_add(BLOCK_OFFSET(pd->dqcoeff, block), dst, pd->dst.stride,
   1113                        16);
   1114       }
   1115     }
   1116 
   1117     rate += ratey;
   1118     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
   1119 
   1120     if (this_rd < best_rd) {
   1121       *bestrate = rate;
   1122       *bestratey = ratey;
   1123       *bestdistortion = distortion;
   1124       best_rd = this_rd;
   1125       *best_mode = mode;
   1126       vpx_memcpy(a, tempa, sizeof(tempa));
   1127       vpx_memcpy(l, templ, sizeof(templ));
   1128       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
   1129         vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
   1130                    num_4x4_blocks_wide * 4);
   1131     }
   1132   next:
   1133     {}
   1134   }
   1135 
   1136   if (best_rd >= rd_thresh || x->skip_encode)
   1137     return best_rd;
   1138 
   1139   for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
   1140     vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
   1141                num_4x4_blocks_wide * 4);
   1142 
   1143   return best_rd;
   1144 }
   1145 
   1146 static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
   1147                                             MACROBLOCK * const mb,
   1148                                             int * const rate,
   1149                                             int * const rate_y,
   1150                                             int64_t * const distortion,
   1151                                             int64_t best_rd) {
   1152   int i, j;
   1153   MACROBLOCKD *const xd = &mb->e_mbd;
   1154   MODE_INFO *const mic = xd->mi_8x8[0];
   1155   const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
   1156   const MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
   1157   const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type;
   1158   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   1159   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   1160   int idx, idy;
   1161   int cost = 0;
   1162   int64_t total_distortion = 0;
   1163   int tot_rate_y = 0;
   1164   int64_t total_rd = 0;
   1165   ENTROPY_CONTEXT t_above[4], t_left[4];
   1166   int *bmode_costs;
   1167 
   1168   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
   1169   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
   1170 
   1171   bmode_costs = mb->mbmode_cost;
   1172 
   1173   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   1174   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
   1175     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
   1176       MB_PREDICTION_MODE best_mode = DC_PRED;
   1177       int r = INT_MAX, ry = INT_MAX;
   1178       int64_t d = INT64_MAX, this_rd = INT64_MAX;
   1179       i = idy * 2 + idx;
   1180       if (cpi->common.frame_type == KEY_FRAME) {
   1181         const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, i);
   1182         const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, i);
   1183 
   1184         bmode_costs  = mb->y_mode_costs[A][L];
   1185       }
   1186 
   1187       this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
   1188                                       t_above + idx, t_left + idy, &r, &ry, &d,
   1189                                       bsize, best_rd - total_rd);
   1190       if (this_rd >= best_rd - total_rd)
   1191         return INT64_MAX;
   1192 
   1193       total_rd += this_rd;
   1194       cost += r;
   1195       total_distortion += d;
   1196       tot_rate_y += ry;
   1197 
   1198       mic->bmi[i].as_mode = best_mode;
   1199       for (j = 1; j < num_4x4_blocks_high; ++j)
   1200         mic->bmi[i + j * 2].as_mode = best_mode;
   1201       for (j = 1; j < num_4x4_blocks_wide; ++j)
   1202         mic->bmi[i + j].as_mode = best_mode;
   1203 
   1204       if (total_rd >= best_rd)
   1205         return INT64_MAX;
   1206     }
   1207   }
   1208 
   1209   *rate = cost;
   1210   *rate_y = tot_rate_y;
   1211   *distortion = total_distortion;
   1212   mic->mbmi.mode = mic->bmi[3].as_mode;
   1213 
   1214   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
   1215 }
   1216 
   1217 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   1218                                       int *rate, int *rate_tokenonly,
   1219                                       int64_t *distortion, int *skippable,
   1220                                       BLOCK_SIZE bsize,
   1221                                       int64_t tx_cache[TX_MODES],
   1222                                       int64_t best_rd) {
   1223   MB_PREDICTION_MODE mode;
   1224   MB_PREDICTION_MODE mode_selected = DC_PRED;
   1225   MACROBLOCKD *const xd = &x->e_mbd;
   1226   MODE_INFO *const mic = xd->mi_8x8[0];
   1227   int this_rate, this_rate_tokenonly, s;
   1228   int64_t this_distortion, this_rd;
   1229   TX_SIZE best_tx = TX_4X4;
   1230   int i;
   1231   int *bmode_costs = x->mbmode_cost;
   1232 
   1233   if (cpi->sf.tx_size_search_method == USE_FULL_RD)
   1234     for (i = 0; i < TX_MODES; i++)
   1235       tx_cache[i] = INT64_MAX;
   1236 
   1237   /* Y Search for intra prediction mode */
   1238   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
   1239     int64_t local_tx_cache[TX_MODES];
   1240     MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
   1241     MODE_INFO *left_mi = xd->left_available ? xd->mi_8x8[-1] : NULL;
   1242 
   1243     if (!(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]] & (1 << mode)))
   1244       continue;
   1245 
   1246     if (cpi->common.frame_type == KEY_FRAME) {
   1247       const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, 0);
   1248       const MB_PREDICTION_MODE L = left_block_mode(mic, left_mi, 0);
   1249 
   1250       bmode_costs = x->y_mode_costs[A][L];
   1251     }
   1252     mic->mbmi.mode = mode;
   1253 
   1254     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
   1255                     bsize, local_tx_cache, best_rd);
   1256 
   1257     if (this_rate_tokenonly == INT_MAX)
   1258       continue;
   1259 
   1260     this_rate = this_rate_tokenonly + bmode_costs[mode];
   1261     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
   1262 
   1263     if (this_rd < best_rd) {
   1264       mode_selected   = mode;
   1265       best_rd         = this_rd;
   1266       best_tx         = mic->mbmi.tx_size;
   1267       *rate           = this_rate;
   1268       *rate_tokenonly = this_rate_tokenonly;
   1269       *distortion     = this_distortion;
   1270       *skippable      = s;
   1271     }
   1272 
   1273     if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
   1274       for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
   1275         const int64_t adj_rd = this_rd + local_tx_cache[i] -
   1276             local_tx_cache[cpi->common.tx_mode];
   1277         if (adj_rd < tx_cache[i]) {
   1278           tx_cache[i] = adj_rd;
   1279         }
   1280       }
   1281     }
   1282   }
   1283 
   1284   mic->mbmi.mode = mode_selected;
   1285   mic->mbmi.tx_size = best_tx;
   1286 
   1287   return best_rd;
   1288 }
   1289 
   1290 static void super_block_uvrd(VP9_COMP *const cpi, MACROBLOCK *x,
   1291                              int *rate, int64_t *distortion, int *skippable,
   1292                              int64_t *sse, BLOCK_SIZE bsize,
   1293                              int64_t ref_best_rd) {
   1294   MACROBLOCKD *const xd = &x->e_mbd;
   1295   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
   1296   TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
   1297   int plane;
   1298   int pnrate = 0, pnskip = 1;
   1299   int64_t pndist = 0, pnsse = 0;
   1300 
   1301   if (ref_best_rd < 0)
   1302     goto term;
   1303 
   1304   if (is_inter_block(mbmi))
   1305     vp9_subtract_sbuv(x, bsize);
   1306 
   1307   *rate = 0;
   1308   *distortion = 0;
   1309   *sse = 0;
   1310   *skippable = 1;
   1311 
   1312   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
   1313     txfm_rd_in_plane(x, &cpi->rdcost_stack, &pnrate, &pndist, &pnskip, &pnsse,
   1314                      ref_best_rd, plane, bsize, uv_txfm_size);
   1315     if (pnrate == INT_MAX)
   1316       goto term;
   1317     *rate += pnrate;
   1318     *distortion += pndist;
   1319     *sse += pnsse;
   1320     *skippable &= pnskip;
   1321   }
   1322   return;
   1323 
   1324   term:
   1325   *rate = INT_MAX;
   1326   *distortion = INT64_MAX;
   1327   *sse = INT64_MAX;
   1328   *skippable = 0;
   1329   return;
   1330 }
   1331 
   1332 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
   1333                                        PICK_MODE_CONTEXT *ctx,
   1334                                        int *rate, int *rate_tokenonly,
   1335                                        int64_t *distortion, int *skippable,
   1336                                        BLOCK_SIZE bsize) {
   1337   MB_PREDICTION_MODE mode;
   1338   MB_PREDICTION_MODE mode_selected = DC_PRED;
   1339   int64_t best_rd = INT64_MAX, this_rd;
   1340   int this_rate_tokenonly, this_rate, s;
   1341   int64_t this_distortion, this_sse;
   1342 
   1343   // int mode_mask = (bsize <= BLOCK_8X8)
   1344   //                ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
   1345 
   1346   for (mode = DC_PRED; mode <= TM_PRED; mode ++) {
   1347     // if (!(mode_mask & (1 << mode)))
   1348     if (!(cpi->sf.intra_uv_mode_mask[max_uv_txsize_lookup[bsize]]
   1349           & (1 << mode)))
   1350       continue;
   1351 
   1352     x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
   1353 
   1354     super_block_uvrd(cpi, x, &this_rate_tokenonly,
   1355                      &this_distortion, &s, &this_sse, bsize, best_rd);
   1356     if (this_rate_tokenonly == INT_MAX)
   1357       continue;
   1358     this_rate = this_rate_tokenonly +
   1359                 x->intra_uv_mode_cost[cpi->common.frame_type][mode];
   1360     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
   1361 
   1362     if (this_rd < best_rd) {
   1363       mode_selected   = mode;
   1364       best_rd         = this_rd;
   1365       *rate           = this_rate;
   1366       *rate_tokenonly = this_rate_tokenonly;
   1367       *distortion     = this_distortion;
   1368       *skippable      = s;
   1369       if (!x->select_txfm_size) {
   1370         int i;
   1371         struct macroblock_plane *const p = x->plane;
   1372         struct macroblockd_plane *const pd = x->e_mbd.plane;
   1373         for (i = 1; i < MAX_MB_PLANE; ++i) {
   1374           p[i].coeff    = ctx->coeff_pbuf[i][2];
   1375           pd[i].qcoeff  = ctx->qcoeff_pbuf[i][2];
   1376           pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
   1377           pd[i].eobs    = ctx->eobs_pbuf[i][2];
   1378 
   1379           ctx->coeff_pbuf[i][2]   = ctx->coeff_pbuf[i][0];
   1380           ctx->qcoeff_pbuf[i][2]  = ctx->qcoeff_pbuf[i][0];
   1381           ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
   1382           ctx->eobs_pbuf[i][2]    = ctx->eobs_pbuf[i][0];
   1383 
   1384           ctx->coeff_pbuf[i][0]   = p[i].coeff;
   1385           ctx->qcoeff_pbuf[i][0]  = pd[i].qcoeff;
   1386           ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
   1387           ctx->eobs_pbuf[i][0]    = pd[i].eobs;
   1388         }
   1389       }
   1390     }
   1391   }
   1392 
   1393   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
   1394 
   1395   return best_rd;
   1396 }
   1397 
   1398 static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
   1399                               int *rate, int *rate_tokenonly,
   1400                               int64_t *distortion, int *skippable,
   1401                               BLOCK_SIZE bsize) {
   1402   int64_t this_rd;
   1403   int64_t this_sse;
   1404 
   1405   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
   1406   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
   1407                    skippable, &this_sse, bsize, INT64_MAX);
   1408   *rate = *rate_tokenonly +
   1409           x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
   1410   this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
   1411 
   1412   return this_rd;
   1413 }
   1414 
   1415 static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
   1416                                  BLOCK_SIZE bsize, int *rate_uv,
   1417                                  int *rate_uv_tokenonly,
   1418                                  int64_t *dist_uv, int *skip_uv,
   1419                                  MB_PREDICTION_MODE *mode_uv) {
   1420   MACROBLOCK *const x = &cpi->mb;
   1421 
   1422   // Use an estimated rd for uv_intra based on DC_PRED if the
   1423   // appropriate speed flag is set.
   1424   if (cpi->sf.use_uv_intra_rd_estimate) {
   1425     rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
   1426                    bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   1427   // Else do a proper rd search for each possible transform size that may
   1428   // be considered in the main rd loop.
   1429   } else {
   1430     rd_pick_intra_sbuv_mode(cpi, x, ctx,
   1431                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
   1432                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   1433   }
   1434   *mode_uv = x->e_mbd.mi_8x8[0]->mbmi.uv_mode;
   1435 }
   1436 
   1437 static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
   1438                        int mode_context) {
   1439   MACROBLOCK *const x = &cpi->mb;
   1440   MACROBLOCKD *const xd = &x->e_mbd;
   1441   const int segment_id = xd->mi_8x8[0]->mbmi.segment_id;
   1442 
   1443   // Don't account for mode here if segment skip is enabled.
   1444   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
   1445     assert(is_inter_mode(mode));
   1446     return x->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
   1447   } else {
   1448     return 0;
   1449   }
   1450 }
   1451 
   1452 void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
   1453   x->e_mbd.mi_8x8[0]->mbmi.mode = mb;
   1454   x->e_mbd.mi_8x8[0]->mbmi.mv[0].as_int = mv->as_int;
   1455 }
   1456 
   1457 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   1458                                 BLOCK_SIZE bsize,
   1459                                 int_mv *frame_mv,
   1460                                 int mi_row, int mi_col,
   1461                                 int_mv single_newmv[MAX_REF_FRAMES],
   1462                                 int *rate_mv);
   1463 
   /* Assigns the motion vector(s) implied by `this_mode` to sub-block `i` of
    * the current block and returns the cost of that choice.
    *
    *   this_mode           NEWMV takes its vectors from seg_mvs[], NEARESTMV and
    *                       NEARMV from frame_mv[][], ZEROMV uses 0. Any other
    *                       value leaves *this_mv / *this_second_mv as passed in.
    *   this_mv             in/out: vector for ref_frame[0].
    *   this_second_mv      in/out: vector for ref_frame[1]; only touched when
    *                       the block has a second reference.
    *   best_ref_mv,
    *   second_best_ref_mv  passed as the reference argument of
    *                       vp9_mv_bit_cost() for the NEWMV case.
    *   mvjcost, mvcost     cost tables handed to vp9_mv_bit_cost().
    *
    * Side effect: mic->bmi[i] receives the vectors and the mode, and is then
    * copied to every bmi entry covered by the block's sb_type.
    *
    * Returns cost_mv_ref() for the mode plus, for NEWMV only, the vector bit
    * cost.
    */
   1464 static int labels2mode(MACROBLOCK *x, int i,
   1465                        MB_PREDICTION_MODE this_mode,
   1466                        int_mv *this_mv, int_mv *this_second_mv,
   1467                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
   1468                        int_mv seg_mvs[MAX_REF_FRAMES],
   1469                        int_mv *best_ref_mv,
   1470                        int_mv *second_best_ref_mv,
   1471                        int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
   1472   MACROBLOCKD *const xd = &x->e_mbd;
   1473   MODE_INFO *const mic = xd->mi_8x8[0];
   1474   MB_MODE_INFO *mbmi = &mic->mbmi;
   1475   int cost = 0, thismvcost = 0;
   1476   int idx, idy;
   1477   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
   1478   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   1479   const int has_second_rf = has_second_ref(mbmi);
   1480 
   1481   /* We have to be careful retrieving previously-encoded motion vectors.
   1482    Ones from this macroblock have to be pulled from the BLOCKD array
   1483    as they have not yet made it to the bmi array in our MB_MODE_INFO. */
          // NOTE(review): no BLOCKD array is referenced in this function; the
          // comment above looks like a leftover and may be stale -- confirm.
   1484   MB_PREDICTION_MODE m;
   1485 
   1486   // the only time we should do costing for new motion vector or mode
   1487   // is when we are on a new label  (jbb May 08, 2007)
   1488   switch (m = this_mode) {
   1489     case NEWMV:
   1490       this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
   1491       thismvcost  = vp9_mv_bit_cost(&this_mv->as_mv, &best_ref_mv->as_mv,
   1492                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
   1493       if (has_second_rf) {
   1494         this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
   1495         thismvcost += vp9_mv_bit_cost(&this_second_mv->as_mv,
   1496                                       &second_best_ref_mv->as_mv,
   1497                                       mvjcost, mvcost, MV_COST_WEIGHT_SUB);
   1498       }
   1499       break;
   1500     case NEARESTMV:
   1501       this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
   1502       if (has_second_rf)
   1503         this_second_mv->as_int =
   1504             frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
   1505       break;
   1506     case NEARMV:
   1507       this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
   1508       if (has_second_rf)
   1509         this_second_mv->as_int =
   1510             frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
   1511       break;
   1512     case ZEROMV:
   1513       this_mv->as_int = 0;
   1514       if (has_second_rf)
   1515         this_second_mv->as_int = 0;
   1516       break;
   1517     default:
   1518       break;
   1519   }
   1520 
          // The mode cost always uses the context of the first reference frame.
   1521   cost = cost_mv_ref(cpi, this_mode,
   1522                      mbmi->mode_context[mbmi->ref_frame[0]]);
   1523 
   1524   mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
   1525   if (has_second_rf)
   1526     mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
   1527 
   1528   mic->bmi[i].as_mode = m;
   1529 
          // Replicate bmi[i] over the 4x4 units this sub-block spans.
          // NOTE(review): the first iteration (idy == 0, idx == 0) copies bmi[i]
          // onto itself; confirm vpx_memcpy tolerates identical source and
          // destination.
   1530   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
   1531     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
   1532       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
   1533                  &mic->bmi[i], sizeof(mic->bmi[i]));
   1534 
   1535   cost += thismvcost;
   1536   return cost;
   1537 }
   1538 
   /* Predicts, transforms and quantizes luma (plane 0) sub-block `i` of the
    * current block using the vectors already stored in mi->bmi[i], and measures
    * its rate and distortion.
    *
    *   best_yrd     budget: as soon as the running cost of the 4x4 units
    *                processed so far reaches it, INT64_MAX is returned and the
    *                output parameters are not written.
    *   labelyrate   out: summed cost_coeffs() result.
    *   distortion   out: summed vp9_block_error() result, shifted right by 2.
    *   sse          out: summed `ssz` from vp9_block_error(), shifted right by 2.
    *   ta, tl       above/left entropy contexts passed on to cost_coeffs().
    *
    * Returns RDCOST of the final rate and distortion, or INT64_MAX on early
    * exit.
    */
   1539 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   1540                                        MACROBLOCK *x,
   1541                                        int64_t best_yrd,
   1542                                        int i,
   1543                                        int *labelyrate,
   1544                                        int64_t *distortion, int64_t *sse,
   1545                                        ENTROPY_CONTEXT *ta,
   1546                                        ENTROPY_CONTEXT *tl) {
   1547   int k;
   1548   MACROBLOCKD *xd = &x->e_mbd;
   1549   struct macroblockd_plane *const pd = &xd->plane[0];
   1550   struct macroblock_plane *const p = &x->plane[0];
   1551   MODE_INFO *const mi = xd->mi_8x8[0];
   1552   const BLOCK_SIZE bsize = mi->mbmi.sb_type;
   1553   const int width = plane_block_width(bsize, pd);
   1554   const int height = plane_block_height(bsize, pd);
   1555   int idx, idy;
   1556 
   1557   uint8_t *const src = raster_block_offset_uint8(BLOCK_8X8, i,
   1558                                                  p->src.buf, p->src.stride);
   1559   uint8_t *const dst = raster_block_offset_uint8(BLOCK_8X8, i,
   1560                                                  pd->dst.buf, pd->dst.stride);
   1561   int64_t thisdistortion = 0, thissse = 0;
   1562   int thisrate = 0, ref;
   1563   const int is_compound = has_second_ref(&mi->mbmi);
          // Build the prediction into dst, once per reference frame in use.
   1564   for (ref = 0; ref < 1 + is_compound; ++ref) {
   1565     const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i,
   1566                                      pd->pre[ref].buf, pd->pre[ref].stride);
   1567     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
   1568                               dst, pd->dst.stride,
   1569                               &mi->bmi[i].as_mv[ref].as_mv,
   1570                               &xd->scale_factor[ref],
   1571                               width, height, ref, &xd->subpix, MV_PRECISION_Q3);
   1572   }
   1573 
          // Residual = source - prediction, written to src_diff with stride 8.
   1574   vp9_subtract_block(height, width,
   1575                      raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8,
   1576                      src, p->src.stride,
   1577                      dst, pd->dst.stride);
   1578 
   1579   k = i;
   1580   for (idy = 0; idy < height / 4; ++idy) {
   1581     for (idx = 0; idx < width / 4; ++idx) {
   1582       int64_t ssz, rd, rd1, rd2;
   1583       int16_t* coeff;
   1584 
              // NOTE(review): k is advanced cumulatively, so it equals
              // i + idy * 2 + idx only while at most two 4x4 units are visited.
              // A full 2x2 walk would index i, i+1, i+3, i+6. Presumably only
              // 4x4, 4x8 and 8x4 sizes reach this function -- confirm.
   1585       k += (idy * 2 + idx);
   1586       coeff = BLOCK_OFFSET(p->coeff, k);
   1587       x->fwd_txm4x4(raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
   1588                     coeff, 8);
   1589       vp9_regular_quantize_b_4x4(x, 4, k, get_scan_4x4(DCT_DCT),
   1590                                  get_iscan_4x4(DCT_DCT));
   1591       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
   1592                                         16, &ssz);
   1593       thissse += ssz;
   1594       thisrate += cost_coeffs(x, 0, k,
   1595                               ta + (k & 1),
   1596                               tl + (k >> 1), TX_4X4,
   1597                               vp9_default_scan_4x4,
   1598                               vp9_default_scan_4x4_neighbors);
              // rd1: cost with the coefficients coded; rd2: cost with zero rate
              // and the accumulated sse as distortion. Give up once the cheaper
              // of the two already reaches the budget.
   1599       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
   1600       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
   1601       rd = MIN(rd1, rd2);
   1602       if (rd >= best_yrd)
   1603         return INT64_MAX;
   1604     }
   1605   }
   1606 
   1607   *distortion = thisdistortion >> 2;
   1608   *labelyrate = thisrate;
   1609   *sse = thissse >> 2;
   1610 
   1611   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
   1612 }
   1613 
   /* Statistics for one sub-block evaluated with one inter mode; filled in by
    * rd_check_segment_txsize().
    */
   1614 typedef struct {
   1615   int eobs;                // copy of the plane-0 eobs[] entry
   1616   int brate;               // labels2mode() cost, later increased by byrate
   1617   int byrate;              // rate output of encode_inter_mb_segment()
   1618   int64_t bdist;           // distortion output of encode_inter_mb_segment()
   1619   int64_t bsse;            // sse output of encode_inter_mb_segment()
   1620   int64_t brdcost;         // rd cost; INT64_MAX when no valid result exists
   1621   int_mv mvs[2];           // vectors used, one per reference frame
   1622   ENTROPY_CONTEXT ta[2];   // above context handed to the coefficient costing
   1623   ENTROPY_CONTEXT tl[2];   // left context handed to the coefficient costing
   1624 } SEG_RDSTAT;
   1625 
   /* Working state and result of one sub-block mode search (one entry per
    * filter index in the bsi_buf arrays used below).
    */
   1626 typedef struct {
          // Reference vectors supplied by the caller of
          // rd_pick_best_mbsegmentation().
   1627   int_mv *ref_mv, *second_ref_mv;
          // Start-point predictor for the NEWMV search; initialised from *ref_mv.
   1628   int_mv mvp;
   1629 
          // On entry to the search: the rd budget. On exit: the summed rd cost of
          // the chosen modes, or INT64_MAX when the search was abandoned.
   1630   int64_t segment_rd;
   1631   int r;                          // summed brate of the chosen modes
   1632   int64_t d;                      // summed bdist of the chosen modes
   1633   int64_t sse;                    // summed bsse of the chosen modes
   1634   int segment_yrate;              // summed byrate of the chosen modes
   1635   MB_PREDICTION_MODE modes[4];    // chosen mode for each bmi[] entry
   1636   SEG_RDSTAT rdstat[4][INTER_MODES];
   1637   int mvthresh;                   // caller threshold for skipping NEWMV search
   1638 } BEST_SEG_INFO;
   1639 
   /* Returns non-zero when `mv`, with each component shifted right by 3, lies
    * outside the [mv_row_min, mv_row_max] x [mv_col_min, mv_col_max] window
    * stored in `x`; returns 0 otherwise.
    */
   1640 static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
   1641   int r = 0;
   1642   r |= (mv->as_mv.row >> 3) < x->mv_row_min;
   1643   r |= (mv->as_mv.row >> 3) > x->mv_row_max;
   1644   r |= (mv->as_mv.col >> 3) < x->mv_col_min;
   1645   r |= (mv->as_mv.col >> 3) > x->mv_col_max;
   1646   return r;
   1647 }
   1648 
   /* Moves the plane-0 source and prediction buffer pointers of `x` to the
    * position of sub-block `i` (raster offset inside a BLOCK_8X8). pre[1] is
    * moved only when the block has a second reference. The previous pointers
    * are not saved here; the caller keeps copies and puts them back with
    * mi_buf_restore().
    */
   1649 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
   1650   MB_MODE_INFO *const mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
   1651   struct macroblock_plane *const p = &x->plane[0];
   1652   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
   1653 
   1654   p->src.buf = raster_block_offset_uint8(BLOCK_8X8, i, p->src.buf,
   1655                                          p->src.stride);
          // pre[0].buf must be 8-byte aligned before it is shifted.
   1656   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
   1657   pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf,
   1658                                              pd->pre[0].stride);
   1659   if (has_second_ref(mbmi))
   1660     pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf,
   1661                                                pd->pre[1].stride);
   1662 }
   1663 
   /* Puts back the plane-0 source buffer and prediction buffer(s) of `x` from
    * the copies in `orig_src` / `orig_pre`. pre[1] is restored only when the
    * block has a second reference.
    */
   1664 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
   1665                                   struct buf_2d orig_pre[2]) {
   1666   MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
   1667   x->plane[0].src = orig_src;
   1668   x->e_mbd.plane[0].pre[0] = orig_pre[0];
   1669   if (has_second_ref(mbmi))
   1670     x->e_mbd.plane[0].pre[1] = orig_pre[1];
   1671 }
   1672 
   /* Walks the current block as a 2x2 grid of sub-blocks (stepping by the
    * block's 4x4 width/height) and, for each one, tries the inter modes
    * NEARESTMV..NEWMV and keeps the one with the lowest rd cost. Results go to
    * bsi_buf[filter_idx].
    *
    *   tile             passed through to vp9_append_sub8x8_mvs_for_idx().
    *   bsi_buf          array of search states; the entry at filter_idx is
    *                    written, entries 0 and 1 may be read to reuse earlier
    *                    results when filter_idx > 0.
    *   seg_mvs          per sub-block, per reference NEWMV vectors. An entry
    *                    equal to INVALID_MV triggers a motion search in the
    *                    single-reference case; search results are stored back.
    *   mi_row, mi_col   passed through to the candidate and joint-search
    *                    helpers.
    *
    * On entry bsi->segment_rd is the rd budget. On success bsi->r, d, sse,
    * segment_yrate, segment_rd and modes[] describe the chosen modes. When no
    * mode is usable for a sub-block, or the running total exceeds the budget,
    * bsi->segment_rd is set to INT64_MAX, the brdcost of all later sub-blocks is
    * set to INT64_MAX, and the function returns early.
    */
   1673 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   1674                                     const TileInfo *const tile,
   1675                                     BEST_SEG_INFO *bsi_buf, int filter_idx,
   1676                                     int_mv seg_mvs[4][MAX_REF_FRAMES],
   1677                                     int mi_row, int mi_col) {
   1678   int i, br = 0, idx, idy;
   1679   int64_t bd = 0, block_sse = 0;
   1680   MB_PREDICTION_MODE this_mode;
   1681   MODE_INFO *mi = x->e_mbd.mi_8x8[0];
   1682   MB_MODE_INFO *const mbmi = &mi->mbmi;
   1683   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
   1684   const int label_count = 4;
   1685   int64_t this_segment_rd = 0;
   1686   int label_mv_thresh;
   1687   int segmentyrate = 0;
   1688   const BLOCK_SIZE bsize = mbmi->sb_type;
   1689   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   1690   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   1691   vp9_variance_fn_ptr_t *v_fn_ptr;
   1692   ENTROPY_CONTEXT t_above[2], t_left[2];
   1693   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
   1694   int mode_idx;
   1695   int subpelmv = 1, have_ref = 0;
   1696   const int has_second_rf = has_second_ref(mbmi);
   1697 
          // Work on local copies of the entropy contexts; they are advanced only
          // with the contexts of the mode finally chosen for each sub-block.
   1698   vpx_memcpy(t_above, pd->above_context, sizeof(t_above));
   1699   vpx_memcpy(t_left, pd->left_context, sizeof(t_left));
   1700 
   1701   v_fn_ptr = &cpi->fn_ptr[bsize];
   1702 
   1703   // 64 makes this threshold really big effectively
   1704   // making it so that we very rarely check mvs on
   1705   // segments.   setting this to 1 would make mv thresh
   1706   // roughly equal to what it is for macroblocks
   1707   label_mv_thresh = 1 * bsi->mvthresh / label_count;
   1708 
   1709   // Segmentation method overheads
   1710   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
   1711     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
   1712       // TODO(jingning,rbultje): rewrite the rate-distortion optimization
   1713       // loop for 4x4/4x8/8x4 block coding. to be replaced with new rd loop
   1714       int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT];
   1715       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   1716       MB_PREDICTION_MODE mode_selected = ZEROMV;
   1717       int64_t best_rd = INT64_MAX;
   1718       i = idy * 2 + idx;
   1719 
              // Candidate vectors for this sub-block: zero plus the
              // nearest/near pair for each reference frame in use.
   1720       frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
   1721       vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
   1722                                     &frame_mv[NEARESTMV][mbmi->ref_frame[0]],
   1723                                     &frame_mv[NEARMV][mbmi->ref_frame[0]],
   1724                                     i, 0, mi_row, mi_col);
   1725       if (has_second_rf) {
   1726         frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
   1727         vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd, tile,
   1728                                       &frame_mv[NEARESTMV][mbmi->ref_frame[1]],
   1729                                       &frame_mv[NEARMV][mbmi->ref_frame[1]],
   1730                                       i, 1, mi_row, mi_col);
   1731       }
   1732       // search for the best motion vector on this segment
   1733       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
   1734         const struct buf_2d orig_src = x->plane[0].src;
   1735         struct buf_2d orig_pre[2];
   1736 
   1737         mode_idx = INTER_OFFSET(this_mode);
   1738         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
   1739 
   1740         // if we're near/nearest and mv == 0,0, compare to zeromv
                // When several of NEAREST/NEAR/ZERO would all yield the zero
                // vector, only the cheapest-to-signal of them is evaluated.
   1741         if ((this_mode == NEARMV || this_mode == NEARESTMV ||
   1742              this_mode == ZEROMV) &&
   1743             frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 &&
   1744             (!has_second_rf ||
   1745              frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) {
   1746           int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
   1747           int c1 = cost_mv_ref(cpi, NEARMV, rfc);
   1748           int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
   1749           int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
   1750 
   1751           if (this_mode == NEARMV) {
   1752             if (c1 > c3)
   1753               continue;
   1754           } else if (this_mode == NEARESTMV) {
   1755             if (c2 > c3)
   1756               continue;
   1757           } else {
   1758             assert(this_mode == ZEROMV);
   1759             if (!has_second_rf) {
   1760               if ((c3 >= c2 &&
   1761                    frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
   1762                   (c3 >= c1 &&
   1763                    frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
   1764                 continue;
   1765             } else {
   1766               if ((c3 >= c2 &&
   1767                    frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
   1768                    frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
   1769                   (c3 >= c1 &&
   1770                    frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
   1771                    frame_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
   1772                 continue;
   1773             }
   1774           }
   1775         }
   1776 
                // Save the prediction buffers (restored after any pointer shift)
                // and give this mode its own copy of the running contexts.
   1777         vpx_memcpy(orig_pre, pd->pre, sizeof(orig_pre));
   1778         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
   1779                    sizeof(bsi->rdstat[i][mode_idx].ta));
   1780         vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
   1781                    sizeof(bsi->rdstat[i][mode_idx].tl));
   1782 
   1783         // motion search for newmv (single predictor case only)
   1784         if (!has_second_rf && this_mode == NEWMV &&
   1785             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
   1786           int step_param = 0;
   1787           int further_steps;
   1788           int thissme, bestsme = INT_MAX;
   1789           int sadpb = x->sadperbit4;
   1790           int_mv mvp_full;
   1791           int max_mv;
   1792 
   1793           /* Is the best so far sufficiently good that we can't justify doing
   1794            * a new motion search. */
   1795           if (best_rd < label_mv_thresh)
   1796             break;
   1797 
   1798           if (cpi->compressor_speed) {
   1799             // use previous block's result as next block's MV predictor.
                    // For i == 2 the bmi[i - 1] choice is immediately replaced by
                    // bmi[0].
   1800             if (i > 0) {
   1801               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
   1802               if (i == 2)
   1803                 bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
   1804             }
   1805           }
   1806           if (i == 0)
   1807             max_mv = x->max_mv_context[mbmi->ref_frame[0]];
   1808           else
   1809             max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
   1810 
   1811           if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
   1812             // Take wtd average of the step_params based on the last frame's
   1813             // max mv magnitude and the best ref mvs of the current block for
   1814             // the given reference.
   1815             step_param = (vp9_init_search_range(cpi, max_mv) +
   1816                           cpi->mv_step_param) >> 1;
   1817           } else {
   1818             step_param = cpi->mv_step_param;
   1819           }
   1820 
   1821           mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
   1822           mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
   1823 
   1824           if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) {
   1825             mvp_full.as_mv.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3;
   1826             mvp_full.as_mv.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3;
   1827             step_param = MAX(step_param, 8);
   1828           }
   1829 
   1830           further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
   1831           // adjust src pointer for this block
   1832           mi_buf_shift(x, i);
   1833           if (cpi->sf.search_method == HEX) {
   1834             bestsme = vp9_hex_search(x, &mvp_full.as_mv,
   1835                                      step_param,
   1836                                      sadpb, 1, v_fn_ptr, 1,
   1837                                      &bsi->ref_mv->as_mv,
   1838                                      &mode_mv[NEWMV].as_mv);
   1839           } else if (cpi->sf.search_method == SQUARE) {
   1840             bestsme = vp9_square_search(x, &mvp_full.as_mv,
   1841                                         step_param,
   1842                                         sadpb, 1, v_fn_ptr, 1,
   1843                                         &bsi->ref_mv->as_mv,
   1844                                         &mode_mv[NEWMV].as_mv);
   1845           } else if (cpi->sf.search_method == BIGDIA) {
   1846             bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
   1847                                         step_param,
   1848                                         sadpb, 1, v_fn_ptr, 1,
   1849                                         &bsi->ref_mv->as_mv,
   1850                                         &mode_mv[NEWMV].as_mv);
   1851           } else {
   1852             bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
   1853                                              sadpb, further_steps, 0, v_fn_ptr,
   1854                                              bsi->ref_mv, &mode_mv[NEWMV]);
   1855           }
   1856 
   1857           // Should we do a full search (best quality only)
   1858           if (cpi->compressor_speed == 0) {
   1859             /* Check if mvp_full is within the range. */
   1860             clamp_mv(&mvp_full.as_mv, x->mv_col_min, x->mv_col_max,
   1861                      x->mv_row_min, x->mv_row_max);
   1862 
   1863             thissme = cpi->full_search_sad(x, &mvp_full,
   1864                                            sadpb, 16, v_fn_ptr,
   1865                                            x->nmvjointcost, x->mvcost,
   1866                                            bsi->ref_mv, i);
   1867 
   1868             if (thissme < bestsme) {
   1869               bestsme = thissme;
   1870               mode_mv[NEWMV].as_int = mi->bmi[i].as_mv[0].as_int;
   1871             } else {
   1872               /* The full search result is actually worse so re-instate the
   1873                * previous best vector */
   1874               mi->bmi[i].as_mv[0].as_int = mode_mv[NEWMV].as_int;
   1875             }
   1876           }
   1877 
   1878           if (bestsme < INT_MAX) {
   1879             int distortion;
   1880             unsigned int sse;
   1881             cpi->find_fractional_mv_step(x,
   1882                                          &mode_mv[NEWMV].as_mv,
   1883                                          &bsi->ref_mv->as_mv,
   1884                                          cpi->common.allow_high_precision_mv,
   1885                                          x->errorperbit, v_fn_ptr,
   1886                                          0, cpi->sf.subpel_iters_per_step,
   1887                                          x->nmvjointcost, x->mvcost,
   1888                                          &distortion, &sse);
   1889 
   1890             // save motion search result for use in compound prediction
   1891             seg_mvs[i][mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
   1892           }
   1893 
   1894           if (cpi->sf.adaptive_motion_search)
   1895             x->pred_mv[mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
   1896 
   1897           // restore src pointers
   1898           mi_buf_restore(x, orig_src, orig_pre);
   1899         }
   1900 
                // Compound prediction needs a stored vector for both references.
                // Note that this check is applied to every mode, not only NEWMV.
   1901         if (has_second_rf) {
   1902           if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
   1903               seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
   1904             continue;
   1905         }
   1906 
   1907         if (has_second_rf && this_mode == NEWMV &&
   1908             mbmi->interp_filter == EIGHTTAP) {
   1909           // adjust src pointers
   1910           mi_buf_shift(x, i);
   1911           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
   1912             int rate_mv;
   1913             joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
   1914                                 mi_row, mi_col, seg_mvs[i],
   1915                                 &rate_mv);
   1916             seg_mvs[i][mbmi->ref_frame[0]].as_int =
   1917                 frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
   1918             seg_mvs[i][mbmi->ref_frame[1]].as_int =
   1919                 frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
   1920           }
   1921           // restore src pointers
   1922           mi_buf_restore(x, orig_src, orig_pre);
   1923         }
   1924 
                // Resolve the vectors for this mode, write them to mi->bmi[] and
                // get the mode + vector signalling cost.
   1925         bsi->rdstat[i][mode_idx].brate =
   1926             labels2mode(x, i, this_mode, &mode_mv[this_mode],
   1927                         &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
   1928                         bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
   1929                         x->mvcost, cpi);
   1930 
   1931 
                // Record the vectors for this entry and for the neighbouring
                // entries (i + 1, i + 2) that the sub-block also covers.
   1932         bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int;
   1933         if (num_4x4_blocks_wide > 1)
   1934           bsi->rdstat[i + 1][mode_idx].mvs[0].as_int =
   1935               mode_mv[this_mode].as_int;
   1936         if (num_4x4_blocks_high > 1)
   1937           bsi->rdstat[i + 2][mode_idx].mvs[0].as_int =
   1938               mode_mv[this_mode].as_int;
   1939         if (has_second_rf) {
   1940           bsi->rdstat[i][mode_idx].mvs[1].as_int =
   1941               second_mode_mv[this_mode].as_int;
   1942           if (num_4x4_blocks_wide > 1)
   1943             bsi->rdstat[i + 1][mode_idx].mvs[1].as_int =
   1944                 second_mode_mv[this_mode].as_int;
   1945           if (num_4x4_blocks_high > 1)
   1946             bsi->rdstat[i + 2][mode_idx].mvs[1].as_int =
   1947                 second_mode_mv[this_mode].as_int;
   1948         }
   1949 
   1950         // Trap vectors that reach beyond the UMV borders
   1951         if (mv_check_bounds(x, &mode_mv[this_mode]))
   1952           continue;
   1953         if (has_second_rf &&
   1954             mv_check_bounds(x, &second_mode_mv[this_mode]))
   1955           continue;
   1956 
                // For filter_idx > 0, try to reuse the statistics of an earlier
                // entry of bsi_buf (0 first, then 1) instead of re-encoding:
                // allowed only when no vector component has any of its low four
                // bits set (subpelmv == 0), the earlier entry used the same
                // vectors, and it holds a finite rd cost.
   1957         if (filter_idx > 0) {
   1958           BEST_SEG_INFO *ref_bsi = bsi_buf;
   1959           subpelmv = (mode_mv[this_mode].as_mv.row & 0x0f) ||
   1960                      (mode_mv[this_mode].as_mv.col & 0x0f);
   1961           have_ref = mode_mv[this_mode].as_int ==
   1962                      ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
   1963           if (has_second_rf) {
   1964             subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) ||
   1965                         (second_mode_mv[this_mode].as_mv.col & 0x0f);
   1966             have_ref  &= second_mode_mv[this_mode].as_int ==
   1967                          ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
   1968           }
   1969 
   1970           if (filter_idx > 1 && !subpelmv && !have_ref) {
   1971             ref_bsi = bsi_buf + 1;
   1972             have_ref = mode_mv[this_mode].as_int ==
   1973                        ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
   1974             if (has_second_rf) {
   1975               have_ref  &= second_mode_mv[this_mode].as_int ==
   1976                            ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
   1977             }
   1978           }
   1979 
   1980           if (!subpelmv && have_ref &&
   1981               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
   1982             vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
   1983                        sizeof(SEG_RDSTAT));
   1984             if (num_4x4_blocks_wide > 1)
   1985               bsi->rdstat[i + 1][mode_idx].eobs =
   1986                   ref_bsi->rdstat[i + 1][mode_idx].eobs;
   1987             if (num_4x4_blocks_high > 1)
   1988               bsi->rdstat[i + 2][mode_idx].eobs =
   1989                   ref_bsi->rdstat[i + 2][mode_idx].eobs;
   1990 
   1991             if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
   1992               mode_selected = this_mode;
   1993               best_rd = bsi->rdstat[i][mode_idx].brdcost;
   1994             }
   1995             continue;
   1996           }
   1997         }
   1998 
                // Encode the sub-block with what is left of the rd budget after
                // the sub-blocks already decided.
   1999         bsi->rdstat[i][mode_idx].brdcost =
   2000             encode_inter_mb_segment(cpi, x,
   2001                                     bsi->segment_rd - this_segment_rd, i,
   2002                                     &bsi->rdstat[i][mode_idx].byrate,
   2003                                     &bsi->rdstat[i][mode_idx].bdist,
   2004                                     &bsi->rdstat[i][mode_idx].bsse,
   2005                                     bsi->rdstat[i][mode_idx].ta,
   2006                                     bsi->rdstat[i][mode_idx].tl);
   2007         if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
                  // Add the signalling cost to the rd cost, then fold the
                  // coefficient rate into brate.
   2008           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
   2009                                             bsi->rdstat[i][mode_idx].brate, 0);
   2010           bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
   2011           bsi->rdstat[i][mode_idx].eobs = pd->eobs[i];
   2012           if (num_4x4_blocks_wide > 1)
   2013             bsi->rdstat[i + 1][mode_idx].eobs = pd->eobs[i + 1];
   2014           if (num_4x4_blocks_high > 1)
   2015             bsi->rdstat[i + 2][mode_idx].eobs = pd->eobs[i + 2];
   2016         }
   2017 
   2018         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
   2019           mode_selected = this_mode;
   2020           best_rd = bsi->rdstat[i][mode_idx].brdcost;
   2021         }
   2022       } /*for each 4x4 mode*/
   2023 
              // No mode gave a finite cost: invalidate the remaining sub-blocks
              // and abandon the search.
   2024       if (best_rd == INT64_MAX) {
   2025         int iy, midx;
   2026         for (iy = i + 1; iy < 4; ++iy)
   2027           for (midx = 0; midx < INTER_MODES; ++midx)
   2028             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
   2029         bsi->segment_rd = INT64_MAX;
   2030         return;
   2031       }
   2032 
              // Commit the winner: adopt its entropy contexts and rewrite
              // mi->bmi[] with its vectors and mode (the return value of
              // labels2mode() is not needed here).
   2033       mode_idx = INTER_OFFSET(mode_selected);
   2034       vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
   2035       vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
   2036 
   2037       labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
   2038                   &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
   2039                   bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
   2040                   x->mvcost, cpi);
   2041 
   2042       br += bsi->rdstat[i][mode_idx].brate;
   2043       bd += bsi->rdstat[i][mode_idx].bdist;
   2044       block_sse += bsi->rdstat[i][mode_idx].bsse;
   2045       segmentyrate += bsi->rdstat[i][mode_idx].byrate;
   2046       this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
   2047 
              // Running total already exceeds the budget: abandon the search.
   2048       if (this_segment_rd > bsi->segment_rd) {
   2049         int iy, midx;
   2050         for (iy = i + 1; iy < 4; ++iy)
   2051           for (midx = 0; midx < INTER_MODES; ++midx)
   2052             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
   2053         bsi->segment_rd = INT64_MAX;
   2054         return;
   2055       }
   2056     }
   2057   } /* for each label */
   2058 
   2059   bsi->r = br;
   2060   bsi->d = bd;
   2061   bsi->segment_yrate = segmentyrate;
   2062   bsi->segment_rd = this_segment_rd;
   2063   bsi->sse = block_sse;
   2064 
   2065   // update the coding decisions
   2066   for (i = 0; i < 4; ++i)
   2067     bsi->modes[i] = mi->bmi[i].as_mode;
   2068 }
   2069 
   /* Runs the sub-block mode search for the current block into
    * bsi_buf[filter_idx] and, when the result does not exceed `best_rd`,
    * commits it to the block's mode info.
    *
    *   best_ref_mv,
    *   second_best_ref_mv  reference vectors stored in the search state.
    *   best_rd             rd budget handed to the search.
    *   returntotrate       out: bsi->r (total rate).
    *   returnyrate         out: bsi->segment_yrate.
    *   returndistortion    out: bsi->d.
    *   skippable           out: result of vp9_is_skippable_in_plane() for
    *                       plane 0.
    *   psse                out: bsi->sse.
    *   mvthresh            stored in the search state for the NEWMV search
    *                       threshold.
    *   seg_mvs, bsi_buf,
    *   filter_idx,
    *   mi_row, mi_col      passed on to rd_check_segment_txsize().
    *
    * Returns bsi->segment_rd on success. Returns INT64_MAX, leaving all output
    * parameters unwritten, when the search result exceeds `best_rd`.
    */
   2070 static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   2071                                            const TileInfo *const tile,
   2072                                            int_mv *best_ref_mv,
   2073                                            int_mv *second_best_ref_mv,
   2074                                            int64_t best_rd,
   2075                                            int *returntotrate,
   2076                                            int *returnyrate,
   2077                                            int64_t *returndistortion,
   2078                                            int *skippable, int64_t *psse,
   2079                                            int mvthresh,
   2080                                            int_mv seg_mvs[4][MAX_REF_FRAMES],
   2081                                            BEST_SEG_INFO *bsi_buf,
   2082                                            int filter_idx,
   2083                                            int mi_row, int mi_col) {
   2084   int i;
   2085   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
   2086   MACROBLOCKD *xd = &x->e_mbd;
   2087   MODE_INFO *mi = xd->mi_8x8[0];
   2088   MB_MODE_INFO *mbmi = &mi->mbmi;
   2089   int mode_idx;
   2090 
          // Start from a cleared search state, then seed it with the budget,
          // reference vectors and threshold.
   2091   vp9_zero(*bsi);
   2092 
   2093   bsi->segment_rd = best_rd;
   2094   bsi->ref_mv = best_ref_mv;
   2095   bsi->second_ref_mv = second_best_ref_mv;
   2096   bsi->mvp.as_int = best_ref_mv->as_int;
   2097   bsi->mvthresh = mvthresh;
   2098 
   2099   for (i = 0; i < 4; i++)
   2100     bsi->modes[i] = ZEROMV;
   2101 
   2102   rd_check_segment_txsize(cpi, x, tile, bsi_buf, filter_idx, seg_mvs,
   2103                           mi_row, mi_col);
   2104 
          // An abandoned search leaves segment_rd at INT64_MAX, which also ends
          // up here.
   2105   if (bsi->segment_rd > best_rd)
   2106     return INT64_MAX;
   2107   /* set it to the best */
   2108   for (i = 0; i < 4; i++) {
   2109     mode_idx = INTER_OFFSET(bsi->modes[i]);
   2110     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
   2111     if (has_second_ref(mbmi))
   2112       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
   2113     xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
   2114     mi->bmi[i].as_mode = bsi->modes[i];
   2115   }
   2116 
   2117   /*
   2118    * used to set mbmi->mv.as_int
   2119    */
          // NOTE(review): nothing below writes mbmi->mv; the comment above looks
          // like a leftover -- confirm. Only mbmi->mode is set, from the last
          // sub-block (modes[3]).
   2120   *returntotrate = bsi->r;
   2121   *returndistortion = bsi->d;
   2122   *returnyrate = bsi->segment_yrate;
   2123   *skippable = vp9_is_skippable_in_plane(&x->e_mbd, BLOCK_8X8, 0);
   2124   *psse = bsi->sse;
   2125   mbmi->mode = bsi->modes[3];
   2126 
   2127   return bsi->segment_rd;
   2128 }
   2129 
   2130 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
   2131                     uint8_t *ref_y_buffer, int ref_y_stride,
   2132                     int ref_frame, BLOCK_SIZE block_size ) {
          // Scores the candidate reference motion vectors for ref_frame by SAD
          // against the source luma block and records two results on x:
          //   x->mv_best_ref_index[ref_frame] - index of the lowest-SAD candidate
          //   x->max_mv_context[ref_frame]    - largest |row| or |col| seen, >> 3
          // ref_y_buffer / ref_y_stride locate the reference luma plane and
          // block_size selects the SAD function from cpi->fn_ptr[].
   2133   MACROBLOCKD *xd = &x->e_mbd;
   2134   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   2135   int_mv this_mv;
   2136   int i;
   2137   int zero_seen = 0;
   2138   int best_index = 0;
   2139   int best_sad = INT_MAX;
   2140   int this_sad = INT_MAX;
   2141   unsigned int max_mv = 0;
   2142 
   2143   uint8_t *src_y_ptr = x->plane[0].src.buf;
   2144   uint8_t *ref_y_ptr;
   2145   int row_offset, col_offset;
          // The parenthesised condition evaluates to 0 or 1: one extra candidate
          // (x->pred_mv[ref_frame], see the loop) is tested only when adaptive
          // motion search is on, the frame is shown and the block is smaller than
          // the configured maximum partition size.
   2146   int num_mv_refs = MAX_MV_REF_CANDIDATES +
   2147                     (cpi->sf.adaptive_motion_search &&
   2148                      cpi->common.show_frame &&
   2149                      block_size < cpi->sf.max_partition_size);
   2150 
   2151   // Get the sad for each candidate reference mv
   2152   for (i = 0; i < num_mv_refs; i++) {
            // Indices below MAX_MV_REF_CANDIDATES come from the block's ref_mvs
            // list; the one index past it (if enabled) is x->pred_mv[ref_frame].
   2153     this_mv.as_int = (i < MAX_MV_REF_CANDIDATES) ?
   2154         mbmi->ref_mvs[ref_frame][i].as_int : x->pred_mv[ref_frame].as_int;
   2155 
            // Track the largest component magnitude over all candidates visited.
   2156     max_mv = MAX(max_mv,
   2157                  MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
   2158     // The list is at an end if we see 0 for a second time.
   2159     if (!this_mv.as_int && zero_seen)
   2160       break;
   2161     zero_seen = zero_seen || !this_mv.as_int;
   2162 
            // Drop the low 3 bits of each component to get the offset applied to
            // the reference plane (presumably 1/8-pel units -> whole pixels;
            // TODO confirm the MV precision).
   2163     row_offset = this_mv.as_mv.row >> 3;
   2164     col_offset = this_mv.as_mv.col >> 3;
   2165     ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
   2166 
   2167     // Find sad for current vector.
            // NOTE(review): the final argument looks like an early-exit bound;
            // 0x7fffffff would then mean "no bound" - confirm against the sdf
            // prototype.
   2168     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
   2169                                            ref_y_ptr, ref_y_stride,
   2170                                            0x7fffffff);
   2171 
   2172     // Note if it is the best so far.
   2173     if (this_sad < best_sad) {
   2174       best_sad = this_sad;
   2175       best_index = i;
   2176     }
   2177   }
   2178 
   2179   // Note the index of the mv that worked best in the reference list.
   2180   x->mv_best_ref_index[ref_frame] = best_index;
   2181   x->max_mv_context[ref_frame] = max_mv;
   2182 }
   2183 
   2184 static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
   2185                                      unsigned int *ref_costs_single,
   2186                                      unsigned int *ref_costs_comp,
   2187                                      vp9_prob *comp_mode_p) {
          // Fills the per-reference-frame signalling costs used by mode selection.
          //   ref_costs_single[] - cost of choosing each frame as a single reference
          //                        (index INTRA_FRAME holds the intra cost)
          //   ref_costs_comp[]   - cost of choosing each frame in compound mode
          //   *comp_mode_p       - probability used for the single/compound choice
          // Both arrays are indexed by reference frame; the zeroing path below
          // writes MAX_REF_FRAMES entries to each of them.
   2188   VP9_COMMON *const cm = &cpi->common;
   2189   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   2190   int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
   2191                                              SEG_LVL_REF_FRAME);
   2192   if (seg_ref_active) {
            // The segment has the reference-frame feature active: all costs are
            // set to zero (presumably because the reference is then implied by
            // the segment and not signalled - TODO confirm).
   2193     vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
   2194     vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
   2195     *comp_mode_p = 128;
   2196   } else {
   2197     vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd);
   2198     vp9_prob comp_inter_p = 128;
   2199 
            // Only hybrid prediction uses a context-derived single/compound
            // probability; otherwise the default of 128 is reported.
   2200     if (cm->comp_pred_mode == HYBRID_PREDICTION) {
   2201       comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd);
   2202       *comp_mode_p = comp_inter_p;
   2203     } else {
   2204       *comp_mode_p = 128;
   2205     }
   2206 
   2207     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
   2208 
   2209     if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
   2210       vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
   2211       vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
              // Every single-reference inter choice pays the "inter" bit, plus the
              // "not compound" bit when the frame is in hybrid mode.
   2212       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
   2213 
   2214       if (cm->comp_pred_mode == HYBRID_PREDICTION)
   2215         base_cost += vp9_cost_bit(comp_inter_p, 0);
   2216 
              // Two-level choice: p1 bit 0 selects LAST; p1 bit 1 followed by
              // p2 bit 0 selects GOLDEN, p2 bit 1 selects ALTREF.
   2217       ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
   2218           ref_costs_single[ALTREF_FRAME] = base_cost;
   2219       ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
   2220       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
   2221       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
   2222       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
   2223       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
   2224     } else {
              // Compound-only frame: single-reference costs get a fixed value.
              // NOTE(review): the meaning of 512 is not evident from this block.
   2225       ref_costs_single[LAST_FRAME]   = 512;
   2226       ref_costs_single[GOLDEN_FRAME] = 512;
   2227       ref_costs_single[ALTREF_FRAME] = 512;
   2228     }
   2229     if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
   2230       vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
   2231       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
   2232 
   2233       if (cm->comp_pred_mode == HYBRID_PREDICTION)
   2234         base_cost += vp9_cost_bit(comp_inter_p, 1);
   2235 
              // Only the LAST_FRAME and GOLDEN_FRAME entries are written on this
              // path; the other entries of ref_costs_comp are left untouched.
   2236       ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
   2237       ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
   2238     } else {
   2239       ref_costs_comp[LAST_FRAME]   = 512;
   2240       ref_costs_comp[GOLDEN_FRAME] = 512;
   2241     }
   2242   }
   2243 }
   2244 
   2245 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   2246                          int mode_index,
   2247                          int_mv *ref_mv,
   2248                          int_mv *second_ref_mv,
   2249                          int64_t comp_pred_diff[NB_PREDICTION_TYPES],
   2250                          int64_t tx_size_diff[TX_MODES],
   2251                          int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS]) {
          // Copies the current mode decision from x into ctx: the skip flag, the
          // chosen mode index, the first mode-info entry, both reference MVs and
          // the three rate-distortion difference tables passed in by the caller.
          // ref_mv and second_ref_mv are dereferenced unconditionally, so both
          // must be non-NULL.
   2252   MACROBLOCKD *const xd = &x->e_mbd;
   2253 
   2254   // Take a snapshot of the coding context so it can be
   2255   // restored if we decide to encode this way
   2256   ctx->skip = x->skip;
   2257   ctx->best_mode_index = mode_index;
          // Struct copy of the whole MODE_INFO entry (by value, not by pointer).
   2258   ctx->mic = *xd->mi_8x8[0];
   2259 
   2260   ctx->best_ref_mv.as_int = ref_mv->as_int;
   2261   ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
   2262 
          // The 64-bit differences are narrowed to int when stored.
   2263   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_PREDICTION_ONLY];
   2264   ctx->comp_pred_diff   = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
   2265   ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
   2266 
          // The first copy is sized by the destination array, the second by
          // element size times SWITCHABLE_FILTER_CONTEXTS.
   2267   vpx_memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
   2268   vpx_memcpy(ctx->best_filter_diff, best_filter_diff,
   2269              sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
   2270 }
   2271 
   2272 static void setup_pred_block(const MACROBLOCKD *xd,
   2273                              struct buf_2d dst[MAX_MB_PLANE],
   2274                              const YV12_BUFFER_CONFIG *src,
   2275                              int mi_row, int mi_col,
   2276                              const struct scale_factors *scale,
   2277                              const struct scale_factors *scale_uv) {
          // Initialises dst[] (one buf_2d per plane) from the frame buffer src and
          // then passes each plane to setup_pred_plane() together with mi_row /
          // mi_col and the plane's subsampling. Plane 0 uses `scale`; every other
          // plane uses `scale_uv`.
   2278   int i;
   2279 
          // Seed buffer pointers and strides; both chroma planes share uv_stride.
   2280   dst[0].buf = src->y_buffer;
   2281   dst[0].stride = src->y_stride;
   2282   dst[1].buf = src->u_buffer;
   2283   dst[2].buf = src->v_buffer;
   2284   dst[1].stride = dst[2].stride = src->uv_stride;
   2285 #if CONFIG_ALPHA
   2286   dst[3].buf = src->alpha_buffer;
   2287   dst[3].stride = src->alpha_stride;
   2288 #endif
   2289 
   2290   // TODO(jkoleszar): Make scale factors per-plane data
   2291   for (i = 0; i < MAX_MB_PLANE; i++) {
            // dst[i] is both the output and the source of the base pointer and
            // stride that were stored just above.
   2292     setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
   2293                      i ? scale_uv : scale,
   2294                      xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
   2295   }
   2296 }
   2297 
   2298 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
   2299                                const TileInfo *const tile,
   2300                                int idx, MV_REFERENCE_FRAME frame_type,
   2301                                BLOCK_SIZE block_size,
   2302                                int mi_row, int mi_col,
   2303                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
   2304                                int_mv frame_near_mv[MAX_REF_FRAMES],
   2305                                struct buf_2d yv12_mb[4][MAX_MB_PLANE],
   2306                                struct scale_factors scale[MAX_REF_FRAMES]) {
          // Prepares everything the mode search needs for one inter reference:
          //   scale[frame_type]            - scale factors for this reference
          //   yv12_mb[frame_type]          - per-plane prediction buffers
          //   mbmi->ref_mvs[frame_type]    - candidate reference MVs
          //   frame_nearest_mv/near_mv[..] - refined nearest / near MVs
          // idx selects the frame buffer through cm->ref_frame_map. frame_type is
          // used as frame_type - 1 to index active_ref_scale, so it must not be
          // the zero-valued entry.
   2307   VP9_COMMON *cm = &cpi->common;
   2308   YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
   2309   MACROBLOCKD *const xd = &x->e_mbd;
   2310   MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
   2311 
   2312   // set up scaling factors
   2313   scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
   2314 
          // Offsets are given here in units of mi_row/mi_col times MI_SIZE.
   2315   scale[frame_type].sfc->set_scaled_offsets(&scale[frame_type],
   2316                                             mi_row * MI_SIZE, mi_col * MI_SIZE);
   2317 
   2318   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   2319   // use the UV scaling factors.
   2320   setup_pred_block(xd, yv12_mb[frame_type], yv12, mi_row, mi_col,
   2321                    &scale[frame_type], &scale[frame_type]);
   2322 
   2323   // Gets an initial list of candidate vectors from neighbours and orders them
   2324   vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0],
   2325                    xd->last_mi,
   2326                    frame_type,
   2327                    mbmi->ref_mvs[frame_type], mi_row, mi_col);
   2328 
   2329   // Candidate refinement carried out at encoder and decoder
   2330   vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv,
   2331                         mbmi->ref_mvs[frame_type],
   2332                         &frame_nearest_mv[frame_type],
   2333                         &frame_near_mv[frame_type]);
   2334 
   2335   // Further refinement that is encode side only to test the top few candidates
   2336   // in full and choose the best as the centre point for subsequent searches.
   2337   // The current implementation doesn't support scaling.
          // Also skipped for blocks smaller than BLOCK_8X8.
   2338   if (!vp9_is_scaled(scale[frame_type].sfc) && block_size >= BLOCK_8X8)
   2339     mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
   2340             frame_type, block_size);
   2341 }
   2342 
   2343 static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
          // Returns the frame buffer at cpi->scaled_ref_idx[] for ref_frame when
          // that index differs from the one in ref_frame_map, and NULL when the
          // two indices are equal (presumably meaning no separate scaled copy
          // exists - TODO confirm).
   2344   YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
   2345   int fb = get_ref_frame_idx(cpi, ref_frame);
   2346   int fb_scale = get_scale_ref_frame_idx(cpi, ref_frame);
   2347   if (cpi->scaled_ref_idx[fb_scale] != cpi->common.ref_frame_map[fb])
   2348     scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb_scale]];
   2349   return scaled_ref_frame;
   2350 }
   2351 
   2352 static INLINE int get_switchable_rate(const MACROBLOCK *x) {
          // Returns the cost of signalling the current block's interp_filter:
          // the entry of x->switchable_interp_costs selected by the prediction
          // context and the filter, multiplied by SWITCHABLE_INTERP_RATE_FACTOR.
   2353   const MACROBLOCKD *const xd = &x->e_mbd;
   2354   const MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
   2355   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   2356   return SWITCHABLE_INTERP_RATE_FACTOR *
   2357              x->switchable_interp_costs[ctx][mbmi->interp_filter];
   2358 }
   2359 
   2360 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   2361                                  const TileInfo *const tile,
   2362                                  BLOCK_SIZE bsize,
   2363                                  int mi_row, int mi_col,
   2364                                  int_mv *tmp_mv, int *rate_mv) {
          // Motion search for a block predicted from a single reference frame
          // (mbmi->ref_frame[0]).
          //   tmp_mv  (out) - motion vector written by the search functions
          //   rate_mv (out) - bit cost of *tmp_mv relative to the reference MV
          // Side effects: x->mv_{col,row}_{min,max} are saved up front and
          // restored after the full-pel search; x->pred_mv[ref] is updated when
          // adaptive motion search is on and the frame is shown; plane pre[0]
          // buffers are temporarily replaced when a scaled reference exists.
          // NOTE(review): `tile` is not referenced anywhere in this function.
   2365   MACROBLOCKD *xd = &x->e_mbd;
   2366   VP9_COMMON *cm = &cpi->common;
   2367   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   2368   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   2369   int bestsme = INT_MAX;
   2370   int further_steps, step_param;
   2371   int sadpb = x->sadperbit16;
   2372   int_mv mvp_full;
   2373   int ref = mbmi->ref_frame[0];
   2374   int_mv ref_mv = mbmi->ref_mvs[ref][0];
   2375   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   2376 
          // Remember the current MV search limits so they can be put back after
          // the search (presumably vp9_clamp_mv_min_max below modifies them).
   2377   int tmp_col_min = x->mv_col_min;
   2378   int tmp_col_max = x->mv_col_max;
   2379   int tmp_row_min = x->mv_row_min;
   2380   int tmp_row_max = x->mv_row_max;
   2381 
   2382   YV12_BUFFER_CONFIG *scaled_ref_frame = get_scaled_ref_frame(cpi, ref);
   2383 
   2384   if (scaled_ref_frame) {
   2385     int i;
   2386     // Swap out the reference frame for a version that's been scaled to
   2387     // match the resolution of the current frame, allowing the existing
   2388     // motion search code to be used without additional modifications.
   2389     for (i = 0; i < MAX_MB_PLANE; i++)
   2390       backup_yv12[i] = xd->plane[i].pre[0];
   2391 
   2392     setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   2393   }
   2394 
   2395   vp9_clamp_mv_min_max(x, &ref_mv.as_mv);
   2396 
   2397   // Adjust search parameters based on small partitions' result.
   2398   if (x->fast_ms) {
   2399     // && abs(mvp_full.as_mv.row - x->pred_mv.as_mv.row) < 24 &&
   2400     // abs(mvp_full.as_mv.col - x->pred_mv.as_mv.col) < 24) {
   2401     // adjust search range
   2402     step_param = 6;
   2403     if (x->fast_ms > 1)
   2404       step_param = 8;
   2405 
   2406     // Get prediction MV.
            // NOTE(review): mvp_full is assigned again unconditionally further
            // down, before any search reads it, so the value computed in this
            // branch (including the sign flip) is never used.
   2407     mvp_full.as_int = x->pred_mv[ref].as_int;
   2408 
   2409     // Adjust MV sign if needed.
   2410     if (cm->ref_frame_sign_bias[ref]) {
   2411       mvp_full.as_mv.col *= -1;
   2412       mvp_full.as_mv.row *= -1;
   2413     }
   2414   } else {
   2415     // Work out the size of the first step in the mv step search.
   2416     // 0 here is maximum length first step. 1 is MAX >> 1 etc.
   2417     if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
   2418       // Take wtd average of the step_params based on the last frame's
   2419       // max mv magnitude and that based on the best ref mvs of the current
   2420       // block for the given reference.
   2421       step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
   2422                     cpi->mv_step_param) >> 1;
   2423     } else {
   2424       step_param = cpi->mv_step_param;
   2425     }
   2426   }
   2427 
          // For blocks below 64x64, raise step_param to at least a value that
          // grows as the block's smaller dimension shrinks.
   2428   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
   2429       cpi->common.show_frame) {
   2430     int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
   2431                                                        b_width_log2(bsize)));
   2432     step_param = MAX(step_param, boffset);
   2433   }
   2434 
          // Search start point: the candidate chosen by mv_pred (index stored in
          // x->mv_best_ref_index[ref]), or x->pred_mv[ref] when that index is
          // past the end of the ref_mvs list.
   2435   mvp_full.as_int = x->mv_best_ref_index[ref] < MAX_MV_REF_CANDIDATES ?
   2436       mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int :
   2437       x->pred_mv[ref].as_int;
   2438 
          // Drop the low 3 bits of each component before the full-pel search.
   2439   mvp_full.as_mv.col >>= 3;
   2440   mvp_full.as_mv.row >>= 3;
   2441 
   2442   // Further step/diamond searches as necessary
   2443   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
   2444 
          // Dispatch on the configured search method; any method other than HEX,
          // SQUARE or BIGDIA falls through to the full-pixel diamond search.
   2445   if (cpi->sf.search_method == HEX) {
   2446     bestsme = vp9_hex_search(x, &mvp_full.as_mv,
   2447                              step_param,
   2448                              sadpb, 1,
   2449                              &cpi->fn_ptr[block_size], 1,
   2450                              &ref_mv.as_mv, &tmp_mv->as_mv);
   2451   } else if (cpi->sf.search_method == SQUARE) {
   2452     bestsme = vp9_square_search(x, &mvp_full.as_mv,
   2453                                 step_param,
   2454                                 sadpb, 1,
   2455                                 &cpi->fn_ptr[block_size], 1,
   2456                                 &ref_mv.as_mv, &tmp_mv->as_mv);
   2457   } else if (cpi->sf.search_method == BIGDIA) {
   2458     bestsme = vp9_bigdia_search(x, &mvp_full.as_mv,
   2459                                 step_param,
   2460                                 sadpb, 1,
   2461                                 &cpi->fn_ptr[block_size], 1,
   2462                                 &ref_mv.as_mv, &tmp_mv->as_mv);
   2463   } else {
   2464     bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
   2465                                      sadpb, further_steps, 1,
   2466                                      &cpi->fn_ptr[block_size],
   2467                                      &ref_mv, tmp_mv);
   2468   }
   2469 
          // Put back the MV limits saved at the top of the function.
   2470   x->mv_col_min = tmp_col_min;
   2471   x->mv_col_max = tmp_col_max;
   2472   x->mv_row_min = tmp_row_min;
   2473   x->mv_row_max = tmp_row_max;
   2474 
          // Fractional refinement runs only if the full-pel search returned
          // something below INT_MAX; its return value is discarded here.
   2475   if (bestsme < INT_MAX) {
   2476     int dis;  /* TODO: use dis in distortion calculation later. */
   2477     unsigned int sse;
   2478     cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv.as_mv,
   2479                                  cm->allow_high_precision_mv,
   2480                                  x->errorperbit,
   2481                                  &cpi->fn_ptr[block_size],
   2482                                  0, cpi->sf.subpel_iters_per_step,
   2483                                  x->nmvjointcost, x->mvcost,
   2484                                  &dis, &sse);
   2485   }
   2486   *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv.as_mv,
   2487                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   2488 
          // Remember the result as the prediction MV for this reference.
   2489   if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
   2490     x->pred_mv[ref].as_int = tmp_mv->as_int;
   2491 
          // Undo the scaled-reference swap performed at the top.
   2492   if (scaled_ref_frame) {
   2493     int i;
   2494     for (i = 0; i < MAX_MB_PLANE; i++)
   2495       xd->plane[i].pre[0] = backup_yv12[i];
   2496   }
   2497 }
   2498 
   2499 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   2500                                 BLOCK_SIZE bsize,
   2501                                 int_mv *frame_mv,
   2502                                 int mi_row, int mi_col,
   2503                                 int_mv single_newmv[MAX_REF_FRAMES],
   2504                                 int *rate_mv) {
          // Iteratively refines the two motion vectors of a compound-predicted
          // block. Starting from the single-reference results in single_newmv[],
          // each iteration holds one reference's MV fixed, builds its predictor
          // into second_pred, and searches the other reference's MV against it.
          //   frame_mv (in/out) - indexed by reference frame; receives the MVs
          //   rate_mv  (out)    - summed bit cost of both final MVs
          // A temporary pw * ph byte buffer is allocated and freed on every call.
   2505   int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
   2506   MACROBLOCKD *xd = &x->e_mbd;
   2507   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
          // A negative second reference is mapped to index 0 so that refs[] can
          // always be used as an array index.
   2508   const int refs[2] = { mbmi->ref_frame[0],
   2509                         mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1] };
   2510   int_mv ref_mv[2];
   2511   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   2512   int ite, ref;
   2513   // Prediction buffer from second frame.
          // NOTE(review): the allocation result is not checked before use.
   2514   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
   2515 
   2516   // Do joint motion search in compound mode to get more accurate mv.
   2517   struct buf_2d backup_yv12[2][MAX_MB_PLANE];
          // NOTE(review): this snapshot of pre[0] is taken before the scaled
          // reference swap in the loop below, yet it is what gets written back
          // to pre[0] after each id == 1 iteration - confirm this is intended.
   2518   struct buf_2d scaled_first_yv12 = xd->plane[0].pre[0];
   2519   int last_besterr[2] = {INT_MAX, INT_MAX};
   2520   YV12_BUFFER_CONFIG *const scaled_ref_frame[2] = {
   2521     get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
   2522     get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
   2523   };
   2524 
   2525   for (ref = 0; ref < 2; ++ref) {
   2526     ref_mv[ref] = mbmi->ref_mvs[refs[ref]][0];
   2527 
   2528     if (scaled_ref_frame[ref]) {
   2529       int i;
   2530       // Swap out the reference frame for a version that's been scaled to
   2531       // match the resolution of the current frame, allowing the existing
   2532       // motion search code to be used without additional modifications.
   2533       for (i = 0; i < MAX_MB_PLANE; i++)
   2534         backup_yv12[ref][i] = xd->plane[i].pre[ref];
   2535       setup_pre_planes(xd, ref, scaled_ref_frame[ref], mi_row, mi_col, NULL);
   2536     }
   2537 
            // NOTE(review): mi_row / mi_col are passed unscaled here, whereas
            // setup_buffer_inter multiplies them by MI_SIZE for the same call -
            // confirm which unit set_scaled_offsets expects.
   2538     xd->scale_factor[ref].sfc->set_scaled_offsets(&xd->scale_factor[ref],
   2539                                                   mi_row, mi_col);
            // Seed the search with the single-reference result.
   2540     frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
   2541   }
   2542 
   2543   // Allow joint search multiple times iteratively for each ref frame
   2544   // and break out the search loop if it couldn't find better mv.
   2545   for (ite = 0; ite < 4; ite++) {
   2546     struct buf_2d ref_yv12[2];
   2547     int bestsme = INT_MAX;
   2548     int sadpb = x->sadperbit16;
   2549     int_mv tmp_mv;
   2550     int search_range = 3;
   2551 
            // Save the MV limits; they are restored right after the full-pel
            // refinement below.
   2552     int tmp_col_min = x->mv_col_min;
   2553     int tmp_col_max = x->mv_col_max;
   2554     int tmp_row_min = x->mv_row_min;
   2555     int tmp_row_max = x->mv_row_max;
            // id alternates 0,1,0,1: the reference whose MV is searched this
            // iteration; !id is the one held fixed.
   2556     int id = ite % 2;
   2557 
   2558     // Initialized here because of compiler problem in Visual Studio.
   2559     ref_yv12[0] = xd->plane[0].pre[0];
   2560     ref_yv12[1] = xd->plane[0].pre[1];
   2561 
   2562     // Get pred block from second frame.
   2563     vp9_build_inter_predictor(ref_yv12[!id].buf,
   2564                               ref_yv12[!id].stride,
   2565                               second_pred, pw,
   2566                               &frame_mv[refs[!id]].as_mv,
   2567                               &xd->scale_factor[!id],
   2568                               pw, ph, 0,
   2569                               &xd->subpix, MV_PRECISION_Q3);
   2570 
   2571     // Compound motion search on first ref frame.
            // When searching reference 1, its buffer is temporarily installed in
            // slot pre[0] (presumably because the search reads pre[0] - confirm).
   2572     if (id)
   2573       xd->plane[0].pre[0] = ref_yv12[id];
   2574     vp9_clamp_mv_min_max(x, &ref_mv[id].as_mv);
   2575 
   2576     // Use mv result from single mode as mvp.
   2577     tmp_mv.as_int = frame_mv[refs[id]].as_int;
   2578 
            // Drop the low 3 bits of each component before the full-pel search.
   2579     tmp_mv.as_mv.col >>= 3;
   2580     tmp_mv.as_mv.row >>= 3;
   2581 
   2582     // Small-range full-pixel motion search
   2583     bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
   2584                                        search_range,
   2585                                        &cpi->fn_ptr[block_size],
   2586                                        x->nmvjointcost, x->mvcost,
   2587                                        &ref_mv[id], second_pred,
   2588                                        pw, ph);
   2589 
   2590     x->mv_col_min = tmp_col_min;
   2591     x->mv_col_max = tmp_col_max;
   2592     x->mv_row_min = tmp_row_min;
   2593     x->mv_row_max = tmp_row_max;
   2594 
            // Fractional refinement against the same second_pred; unlike the
            // single-reference search, its return value replaces bestsme.
   2595     if (bestsme < INT_MAX) {
   2596       int dis; /* TODO: use dis in distortion calculation later. */
   2597       unsigned int sse;
   2598 
   2599       bestsme = cpi->find_fractional_mv_step_comp(
   2600           x, &tmp_mv.as_mv,
   2601           &ref_mv[id].as_mv,
   2602           cpi->common.allow_high_precision_mv,
   2603           x->errorperbit,
   2604           &cpi->fn_ptr[block_size],
   2605           0, cpi->sf.subpel_iters_per_step,
   2606           x->nmvjointcost, x->mvcost,
   2607           &dis, &sse, second_pred,
   2608           pw, ph);
   2609     }
   2610 
   2611     if (id)
   2612       xd->plane[0].pre[0] = scaled_first_yv12;
   2613 
            // Keep the new MV only if it beats this reference's previous best;
            // otherwise stop iterating altogether.
   2614     if (bestsme < last_besterr[id]) {
   2615       frame_mv[refs[id]].as_int = tmp_mv.as_int;
   2616       last_besterr[id] = bestsme;
   2617     } else {
   2618       break;
   2619     }
   2620   }
   2621 
   2622   *rate_mv = 0;
   2623 
   2624   for (ref = 0; ref < 2; ++ref) {
   2625     if (scaled_ref_frame[ref]) {
   2626       // restore the predictor
   2627       int i;
   2628       for (i = 0; i < MAX_MB_PLANE; i++)
   2629         xd->plane[i].pre[ref] = backup_yv12[ref][i];
   2630     }
   2631 
            // Cost is measured against the first ref_mvs entry read directly from
            // mbmi, not the local ref_mv[] copy that went through clamping above.
   2632     *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
   2633                                 &mbmi->ref_mvs[refs[ref]][0].as_mv,
   2634                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   2635   }
   2636 
   2637   vpx_free(second_pred);
   2638 }
   2639 
   2640 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   2641                                  const TileInfo *const tile,
   2642                                  BLOCK_SIZE bsize,
   2643                                  int64_t txfm_cache[],
   2644                                  int *rate2, int64_t *distortion,
   2645                                  int *skippable,
   2646                                  int *rate_y, int64_t *distortion_y,
   2647                                  int *rate_uv, int64_t *distortion_uv,
   2648                                  int *mode_excluded, int *disable_skip,
   2649                                  INTERPOLATION_TYPE *best_filter,
   2650                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
   2651                                  int mi_row, int mi_col,
   2652                                  int_mv single_newmv[MAX_REF_FRAMES],
   2653                                  int64_t *psse,
   2654                                  const int64_t ref_best_rd) {
   2655   VP9_COMMON *cm = &cpi->common;
   2656   MACROBLOCKD *xd = &x->e_mbd;
   2657   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   2658   const int is_comp_pred = has_second_ref(mbmi);
   2659   const int num_refs = is_comp_pred ? 2 : 1;
   2660   const int this_mode = mbmi->mode;
   2661   int_mv *frame_mv = mode_mv[this_mode];
   2662   int i;
   2663   int refs[2] = { mbmi->ref_frame[0],
   2664     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   2665   int_mv cur_mv[2];
   2666   int64_t this_rd = 0;
   2667   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
   2668   int pred_exists = 0;
   2669   int intpel_mv;
   2670   int64_t rd, best_rd = INT64_MAX;
   2671   int best_needs_copy = 0;
   2672   uint8_t *orig_dst[MAX_MB_PLANE];
   2673   int orig_dst_stride[MAX_MB_PLANE];
   2674   int rs = 0;
   2675 
   2676   if (is_comp_pred) {
   2677     if (frame_mv[refs[0]].as_int == INVALID_MV ||
   2678         frame_mv[refs[1]].as_int == INVALID_MV)
   2679       return INT64_MAX;
   2680   }
   2681 
   2682   if (this_mode == NEWMV) {
   2683     int rate_mv;
   2684     if (is_comp_pred) {
   2685       // Initialize mv using single prediction mode result.
   2686       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
   2687       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
   2688 
   2689       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
   2690         joint_motion_search(cpi, x, bsize, frame_mv,
   2691                             mi_row, mi_col, single_newmv, &rate_mv);
   2692       } else {
   2693         rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv,
   2694                                    &mbmi->ref_mvs[refs[0]][0].as_mv,
   2695                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   2696         rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]].as_mv,
   2697                                    &mbmi->ref_mvs[refs[1]][0].as_mv,
   2698                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
   2699       }
   2700       *rate2 += rate_mv;
   2701     } else {
   2702       int_mv tmp_mv;
   2703       single_motion_search(cpi, x, tile, bsize, mi_row, mi_col,
   2704                            &tmp_mv, &rate_mv);
   2705       *rate2 += rate_mv;
   2706       frame_mv[refs[0]].as_int =
   2707           xd->mi_8x8[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
   2708       single_newmv[refs[0]].as_int = tmp_mv.as_int;
   2709     }
   2710   }
   2711 
   2712   // if we're near/nearest and mv == 0,0, compare to zeromv
   2713   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
   2714       frame_mv[refs[0]].as_int == 0 &&
   2715       !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
   2716       (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) {
   2717     int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
   2718     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
   2719     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
   2720     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
   2721 
   2722     if (this_mode == NEARMV) {
   2723       if (c1 > c3)
   2724         return INT64_MAX;
   2725     } else if (this_mode == NEARESTMV) {
   2726       if (c2 > c3)
   2727         return INT64_MAX;
   2728     } else {
   2729       assert(this_mode == ZEROMV);
   2730       if (num_refs == 1) {
   2731         if ((c3 >= c2 &&
   2732              mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
   2733             (c3 >= c1 &&
   2734              mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
   2735           return INT64_MAX;
   2736       } else {
   2737         if ((c3 >= c2 &&
   2738              mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
   2739              mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
   2740             (c3 >= c1 &&
   2741              mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
   2742              mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
   2743           return INT64_MAX;
   2744       }
   2745     }
   2746   }
   2747 
   2748   for (i = 0; i < num_refs; ++i) {
   2749     cur_mv[i] = frame_mv[refs[i]];
   2750     // Clip "next_nearest" so that it does not extend to far out of image
   2751     if (this_mode != NEWMV)
   2752       clamp_mv2(&cur_mv[i].as_mv, xd);
   2753 
   2754     if (mv_check_bounds(x, &cur_mv[i]))
   2755       return INT64_MAX;
   2756     mbmi->mv[i].as_int = cur_mv[i].as_int;
   2757   }
   2758 
   2759   // do first prediction into the destination buffer. Do the next
   2760   // prediction into a temporary buffer. Then keep track of which one
   2761   // of these currently holds the best predictor, and use the other
   2762   // one for future predictions. In the end, copy from tmp_buf to
   2763   // dst if necessary.
   2764   for (i = 0; i < MAX_MB_PLANE; i++) {
   2765     orig_dst[i] = xd->plane[i].dst.buf;
   2766     orig_dst_stride[i] = xd->plane[i].dst.stride;
   2767   }
   2768 
   2769   /* We don't include the cost of the second reference here, because there
   2770    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
   2771    * words if you present them in that order, the second one is always known
   2772    * if the first is known */
   2773   *rate2 += cost_mv_ref(cpi, this_mode,
   2774                         mbmi->mode_context[mbmi->ref_frame[0]]);
   2775 
   2776   if (!(*mode_excluded)) {
   2777     if (is_comp_pred) {
   2778       *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
   2779     } else {
   2780       *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
   2781     }
   2782   }
   2783 
   2784   pred_exists = 0;
   2785   // Are all MVs integer pel for Y and UV
   2786   intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 &&
   2787       (mbmi->mv[0].as_mv.col & 15) == 0;
   2788   if (is_comp_pred)
   2789     intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 &&
   2790         (mbmi->mv[1].as_mv.col & 15) == 0;
   2791   // Search for best switchable filter by checking the variance of
   2792   // pred error irrespective of whether the filter will be used
   2793   if (cm->mcomp_filter_type != BILINEAR) {
   2794     *best_filter = EIGHTTAP;
   2795     if (x->source_variance <
   2796         cpi->sf.disable_filter_search_var_thresh) {
   2797       *best_filter = EIGHTTAP;
   2798       vp9_zero(cpi->rd_filter_cache);
   2799     } else {
   2800       int i, newbest;
   2801       int tmp_rate_sum = 0;
   2802       int64_t tmp_dist_sum = 0;
   2803 
   2804       cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
   2805       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
   2806         int j;
   2807         int64_t rs_rd;
   2808         mbmi->interp_filter = i;
   2809         vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
   2810         rs = get_switchable_rate(x);
   2811         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
   2812 
   2813         if (i > 0 && intpel_mv) {
   2814           cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
   2815                                            tmp_rate_sum, tmp_dist_sum);
   2816           cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
   2817               MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
   2818                   cpi->rd_filter_cache[i] + rs_rd);
   2819           rd = cpi->rd_filter_cache[i];
   2820           if (cm->mcomp_filter_type == SWITCHABLE)
   2821             rd += rs_rd;
   2822         } else {
   2823           int rate_sum = 0;
   2824           int64_t dist_sum = 0;
   2825           if ((cm->mcomp_filter_type == SWITCHABLE &&
   2826                (!i || best_needs_copy)) ||
   2827               (cm->mcomp_filter_type != SWITCHABLE &&
   2828                (cm->mcomp_filter_type == mbmi->interp_filter ||
   2829                 (i == 0 && intpel_mv)))) {
   2830             for (j = 0; j < MAX_MB_PLANE; j++) {
   2831               xd->plane[j].dst.buf = orig_dst[j];
   2832               xd->plane[j].dst.stride = orig_dst_stride[j];
   2833             }
   2834           } else {
   2835             for (j = 0; j < MAX_MB_PLANE; j++) {
   2836               xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
   2837               xd->plane[j].dst.stride = 64;
   2838             }
   2839           }
   2840           vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   2841           model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
   2842           cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
   2843                                            rate_sum, dist_sum);
   2844           cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
   2845               MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
   2846                   cpi->rd_filter_cache[i] + rs_rd);
   2847           rd = cpi->rd_filter_cache[i];
   2848           if (cm->mcomp_filter_type == SWITCHABLE)
   2849             rd += rs_rd;
   2850           if (i == 0 && intpel_mv) {
   2851             tmp_rate_sum = rate_sum;
   2852             tmp_dist_sum = dist_sum;
   2853           }
   2854         }
   2855         if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
   2856           if (rd / 2 > ref_best_rd) {
   2857             for (i = 0; i < MAX_MB_PLANE; i++) {
   2858               xd->plane[i].dst.buf = orig_dst[i];
   2859               xd->plane[i].dst.stride = orig_dst_stride[i];
   2860             }
   2861             return INT64_MAX;
   2862           }
   2863         }
   2864         newbest = i == 0 || rd < best_rd;
   2865 
   2866         if (newbest) {
   2867           best_rd = rd;
   2868           *best_filter = mbmi->interp_filter;
   2869           if (cm->mcomp_filter_type == SWITCHABLE && i && !intpel_mv)
   2870             best_needs_copy = !best_needs_copy;
   2871         }
   2872 
   2873         if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
   2874             (cm->mcomp_filter_type != SWITCHABLE &&
   2875              cm->mcomp_filter_type == mbmi->interp_filter)) {
   2876           pred_exists = 1;
   2877         }
   2878       }
   2879 
   2880       for (i = 0; i < MAX_MB_PLANE; i++) {
   2881         xd->plane[i].dst.buf = orig_dst[i];
   2882         xd->plane[i].dst.stride = orig_dst_stride[i];
   2883       }
   2884     }
   2885   }
   2886   // Set the appropriate filter
   2887   mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
   2888       cm->mcomp_filter_type : *best_filter;
   2889   vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
   2890   rs = cm->mcomp_filter_type == SWITCHABLE ? get_switchable_rate(x) : 0;
   2891 
   2892   if (pred_exists) {
   2893     if (best_needs_copy) {
   2894       // again temporarily set the buffers to local memory to prevent a memcpy
   2895       for (i = 0; i < MAX_MB_PLANE; i++) {
   2896         xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
   2897         xd->plane[i].dst.stride = 64;
   2898       }
   2899     }
   2900   } else {
   2901     // Handles the special case when a filter that is not in the
   2902     // switchable list (ex. bilinear, 6-tap) is indicated at the frame level
   2903     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   2904   }
   2905 
   2906 
   2907   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
   2908     int tmp_rate;
   2909     int64_t tmp_dist;
   2910     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
   2911     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
   2912     // if current pred_error modeled rd is substantially more than the best
   2913     // so far, do not bother doing full rd
   2914     if (rd / 2 > ref_best_rd) {
   2915       for (i = 0; i < MAX_MB_PLANE; i++) {
   2916         xd->plane[i].dst.buf = orig_dst[i];
   2917         xd->plane[i].dst.stride = orig_dst_stride[i];
   2918       }
   2919       return INT64_MAX;
   2920     }
   2921   }
   2922 
   2923   if (cpi->common.mcomp_filter_type == SWITCHABLE)
   2924     *rate2 += get_switchable_rate(x);
   2925 
   2926   if (!is_comp_pred && cpi->enable_encode_breakout) {
   2927     if (cpi->active_map_enabled && x->active_ptr[0] == 0)
   2928       x->skip = 1;
   2929     else if (x->encode_breakout) {
   2930       const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
   2931       const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
   2932       unsigned int var, sse;
   2933       // Skipping threshold for ac.
   2934       unsigned int thresh_ac;
   2935       // The encode_breakout input
   2936       unsigned int encode_breakout = x->encode_breakout << 4;
   2937       unsigned int max_thresh = 36000;
   2938 
   2939       // Use extreme low threshold for static frames to limit skipping.
   2940       if (cpi->enable_encode_breakout == 2)
   2941         max_thresh = 128;
   2942 
   2943       // Calculate threshold according to dequant value.
   2944       thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
   2945 
   2946       // Use encode_breakout input if it is bigger than internal threshold.
   2947       if (thresh_ac < encode_breakout)
   2948         thresh_ac = encode_breakout;
   2949 
   2950       // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
   2951       if (thresh_ac > max_thresh)
   2952         thresh_ac = max_thresh;
   2953 
   2954       var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
   2955                                    xd->plane[0].dst.buf,
   2956                                    xd->plane[0].dst.stride, &sse);
   2957 
   2958       // Adjust threshold according to partition size.
   2959       thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
   2960           b_height_log2_lookup[bsize]);
   2961 
   2962       // Y skipping condition checking
   2963       if (sse < thresh_ac || sse == 0) {
   2964         // Skipping threshold for dc
   2965         unsigned int thresh_dc;
   2966 
   2967         thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
   2968 
   2969         // dc skipping checking
   2970         if ((sse - var) < thresh_dc || sse == var) {
   2971           unsigned int sse_u, sse_v;
   2972           unsigned int var_u, var_v;
   2973 
   2974           var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
   2975                                           x->plane[1].src.stride,
   2976                                           xd->plane[1].dst.buf,
   2977                                           xd->plane[1].dst.stride, &sse_u);
   2978 
   2979           // U skipping condition checking
   2980           if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
   2981               (sse_u - var_u < thresh_dc || sse_u == var_u)) {
   2982             var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
   2983                                             x->plane[2].src.stride,
   2984                                             xd->plane[2].dst.buf,
   2985                                             xd->plane[2].dst.stride, &sse_v);
   2986 
   2987             // V skipping condition checking
   2988             if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
   2989                 (sse_v - var_v < thresh_dc || sse_v == var_v)) {
   2990               x->skip = 1;
   2991 
   2992               // The cost of skip bit needs to be added.
   2993               *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
   2994 
   2995               // Scaling factor for SSE from spatial domain to frequency domain
   2996               // is 16. Adjust distortion accordingly.
   2997               *distortion_uv = (sse_u + sse_v) << 4;
   2998               *distortion = (sse << 4) + *distortion_uv;
   2999 
   3000               *disable_skip = 1;
   3001               this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
   3002             }
   3003           }
   3004         }
   3005       }
   3006     }
   3007   }
   3008 
   3009   if (!x->skip) {
   3010     int skippable_y, skippable_uv;
   3011     int64_t sseuv = INT64_MAX;
   3012     int64_t rdcosty = INT64_MAX;
   3013 
   3014     // Y cost and distortion
   3015     super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
   3016                     bsize, txfm_cache, ref_best_rd);
   3017 
   3018     if (*rate_y == INT_MAX) {
   3019       *rate2 = INT_MAX;
   3020       *distortion = INT64_MAX;
   3021       for (i = 0; i < MAX_MB_PLANE; i++) {
   3022         xd->plane[i].dst.buf = orig_dst[i];
   3023         xd->plane[i].dst.stride = orig_dst_stride[i];
   3024       }
   3025       return INT64_MAX;
   3026     }
   3027 
   3028     *rate2 += *rate_y;
   3029     *distortion += *distortion_y;
   3030 
   3031     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
   3032     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
   3033 
   3034     super_block_uvrd(cpi, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
   3035                      bsize, ref_best_rd - rdcosty);
   3036     if (*rate_uv == INT_MAX) {
   3037       *rate2 = INT_MAX;
   3038       *distortion = INT64_MAX;
   3039       for (i = 0; i < MAX_MB_PLANE; i++) {
   3040         xd->plane[i].dst.buf = orig_dst[i];
   3041         xd->plane[i].dst.stride = orig_dst_stride[i];
   3042       }
   3043       return INT64_MAX;
   3044     }
   3045 
   3046     *psse += sseuv;
   3047     *rate2 += *rate_uv;
   3048     *distortion += *distortion_uv;
   3049     *skippable = skippable_y && skippable_uv;
   3050   }
   3051 
   3052   for (i = 0; i < MAX_MB_PLANE; i++) {
   3053     xd->plane[i].dst.buf = orig_dst[i];
   3054     xd->plane[i].dst.stride = orig_dst_stride[i];
   3055   }
   3056 
   3057   return this_rd;  // if 0, this will be re-calculated by caller
   3058 }
   3059 
   3060 static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   3061                            int max_plane) {
   3062   struct macroblock_plane *const p = x->plane;
   3063   struct macroblockd_plane *const pd = x->e_mbd.plane;
   3064   int i;
   3065 
   3066   for (i = 0; i < max_plane; ++i) {
   3067     p[i].coeff    = ctx->coeff_pbuf[i][1];
   3068     pd[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
   3069     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
   3070     pd[i].eobs    = ctx->eobs_pbuf[i][1];
   3071 
   3072     ctx->coeff_pbuf[i][1]   = ctx->coeff_pbuf[i][0];
   3073     ctx->qcoeff_pbuf[i][1]  = ctx->qcoeff_pbuf[i][0];
   3074     ctx->dqcoeff_pbuf[i][1] = ctx->dqcoeff_pbuf[i][0];
   3075     ctx->eobs_pbuf[i][1]    = ctx->eobs_pbuf[i][0];
   3076 
   3077     ctx->coeff_pbuf[i][0]   = p[i].coeff;
   3078     ctx->qcoeff_pbuf[i][0]  = pd[i].qcoeff;
   3079     ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
   3080     ctx->eobs_pbuf[i][0]    = pd[i].eobs;
   3081   }
   3082 }
   3083 
   3084 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   3085                                int *returnrate, int64_t *returndist,
   3086                                BLOCK_SIZE bsize,
   3087                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
   3088   VP9_COMMON *const cm = &cpi->common;
   3089   MACROBLOCKD *const xd = &x->e_mbd;
   3090   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
   3091   int y_skip = 0, uv_skip = 0;
   3092   int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
   3093   x->skip_encode = 0;
   3094   ctx->skip = 0;
   3095   xd->mi_8x8[0]->mbmi.ref_frame[0] = INTRA_FRAME;
   3096   if (bsize >= BLOCK_8X8) {
   3097     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
   3098                                &dist_y, &y_skip, bsize, tx_cache,
   3099                                best_rd) >= best_rd) {
   3100       *returnrate = INT_MAX;
   3101       return;
   3102     }
   3103     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
   3104                             &dist_uv, &uv_skip, bsize);
   3105   } else {
   3106     y_skip = 0;
   3107     if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
   3108                                      &dist_y, best_rd) >= best_rd) {
   3109       *returnrate = INT_MAX;
   3110       return;
   3111     }
   3112     rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
   3113                             &dist_uv, &uv_skip, BLOCK_8X8);
   3114   }
   3115 
   3116   if (y_skip && uv_skip) {
   3117     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
   3118                   vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
   3119     *returndist = dist_y + dist_uv;
   3120     vp9_zero(ctx->tx_rd_diff);
   3121   } else {
   3122     int i;
   3123     *returnrate = rate_y + rate_uv +
   3124         vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0);
   3125     *returndist = dist_y + dist_uv;
   3126     if (cpi->sf.tx_size_search_method == USE_FULL_RD)
   3127       for (i = 0; i < TX_MODES; i++) {
   3128         if (tx_cache[i] < INT64_MAX && tx_cache[cm->tx_mode] < INT64_MAX)
   3129           ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
   3130         else
   3131           ctx->tx_rd_diff[i] = 0;
   3132       }
   3133   }
   3134 
   3135   ctx->mic = *xd->mi_8x8[0];
   3136 }
   3137 
   3138 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   3139                                   const TileInfo *const tile,
   3140                                   int mi_row, int mi_col,
   3141                                   int *returnrate,
   3142                                   int64_t *returndistortion,
   3143                                   BLOCK_SIZE bsize,
   3144                                   PICK_MODE_CONTEXT *ctx,
   3145                                   int64_t best_rd_so_far) {
   3146   VP9_COMMON *cm = &cpi->common;
   3147   MACROBLOCKD *xd = &x->e_mbd;
   3148   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   3149   const struct segmentation *seg = &cm->seg;
   3150   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   3151   MB_PREDICTION_MODE this_mode;
   3152   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   3153   unsigned char segment_id = mbmi->segment_id;
   3154   int comp_pred, i;
   3155   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   3156   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   3157   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
   3158   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
   3159                                     VP9_ALT_FLAG };
   3160   int idx_list[4] = {0,
   3161                      cpi->lst_fb_idx,
   3162                      cpi->gld_fb_idx,
   3163                      cpi->alt_fb_idx};
   3164   int64_t best_rd = best_rd_so_far;
   3165   int64_t best_tx_rd[TX_MODES];
   3166   int64_t best_tx_diff[TX_MODES];
   3167   int64_t best_pred_diff[NB_PREDICTION_TYPES];
   3168   int64_t best_pred_rd[NB_PREDICTION_TYPES];
   3169   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   3170   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   3171   MB_MODE_INFO best_mbmode = { 0 };
   3172   int j;
   3173   int mode_index, best_mode_index = 0;
   3174   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   3175   vp9_prob comp_mode_p;
   3176   int64_t best_intra_rd = INT64_MAX;
   3177   int64_t best_inter_rd = INT64_MAX;
   3178   MB_PREDICTION_MODE best_intra_mode = DC_PRED;
   3179   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
   3180   INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE;
   3181   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
   3182   int64_t dist_uv[TX_SIZES];
   3183   int skip_uv[TX_SIZES];
   3184   MB_PREDICTION_MODE mode_uv[TX_SIZES];
   3185   struct scale_factors scale_factor[4];
   3186   unsigned int ref_frame_mask = 0;
   3187   unsigned int mode_mask = 0;
   3188   int64_t mode_distortions[MB_MODE_COUNT] = {-1};
   3189   int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
   3190   int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
   3191   const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
   3192   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
   3193   int best_skip2 = 0;
   3194 
   3195   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   3196 
   3197   // Everywhere the flag is set the error is much higher than its neighbors.
   3198   ctx->frames_with_high_error = 0;
   3199   ctx->modes_with_high_error = 0;
   3200 
   3201   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
   3202                            &comp_mode_p);
   3203 
   3204   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
   3205     best_pred_rd[i] = INT64_MAX;
   3206   for (i = 0; i < TX_MODES; i++)
   3207     best_tx_rd[i] = INT64_MAX;
   3208   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
   3209     best_filter_rd[i] = INT64_MAX;
   3210   for (i = 0; i < TX_SIZES; i++)
   3211     rate_uv_intra[i] = INT_MAX;
   3212 
   3213   *returnrate = INT_MAX;
   3214 
   3215   // Create a mask set to 1 for each reference frame used by a smaller
   3216   // resolution.
   3217   if (cpi->sf.use_avoid_tested_higherror) {
   3218     switch (block_size) {
   3219       case BLOCK_64X64:
   3220         for (i = 0; i < 4; i++) {
   3221           for (j = 0; j < 4; j++) {
   3222             ref_frame_mask |= x->mb_context[i][j].frames_with_high_error;
   3223             mode_mask |= x->mb_context[i][j].modes_with_high_error;
   3224           }
   3225         }
   3226         for (i = 0; i < 4; i++) {
   3227           ref_frame_mask |= x->sb32_context[i].frames_with_high_error;
   3228           mode_mask |= x->sb32_context[i].modes_with_high_error;
   3229         }
   3230         break;
   3231       case BLOCK_32X32:
   3232         for (i = 0; i < 4; i++) {
   3233           ref_frame_mask |=
   3234               x->mb_context[x->sb_index][i].frames_with_high_error;
   3235           mode_mask |= x->mb_context[x->sb_index][i].modes_with_high_error;
   3236         }
   3237         break;
   3238       default:
   3239         // Until we handle all block sizes set it to present;
   3240         ref_frame_mask = 0;
   3241         mode_mask = 0;
   3242         break;
   3243     }
   3244     ref_frame_mask = ~ref_frame_mask;
   3245     mode_mask = ~mode_mask;
   3246   }
   3247 
   3248   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
   3249     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
   3250       setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
   3251                          block_size, mi_row, mi_col,
   3252                          frame_mv[NEARESTMV], frame_mv[NEARMV],
   3253                          yv12_mb, scale_factor);
   3254     }
   3255     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   3256     frame_mv[ZEROMV][ref_frame].as_int = 0;
   3257   }
   3258 
   3259   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
   3260     int mode_excluded = 0;
   3261     int64_t this_rd = INT64_MAX;
   3262     int disable_skip = 0;
   3263     int compmode_cost = 0;
   3264     int rate2 = 0, rate_y = 0, rate_uv = 0;
   3265     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
   3266     int skippable = 0;
   3267     int64_t tx_cache[TX_MODES];
   3268     int i;
   3269     int this_skip2 = 0;
   3270     int64_t total_sse = INT_MAX;
   3271     int early_term = 0;
   3272 
   3273     for (i = 0; i < TX_MODES; ++i)
   3274       tx_cache[i] = INT64_MAX;
   3275 
   3276     x->skip = 0;
   3277     this_mode = vp9_mode_order[mode_index].mode;
   3278     ref_frame = vp9_mode_order[mode_index].ref_frame;
   3279     second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
   3280 
   3281     // Look at the reference frame of the best mode so far and set the
   3282     // skip mask to look at a subset of the remaining modes.
   3283     if (mode_index > cpi->sf.mode_skip_start) {
   3284       if (mode_index == (cpi->sf.mode_skip_start + 1)) {
   3285         switch (vp9_mode_order[best_mode_index].ref_frame) {
   3286           case INTRA_FRAME:
   3287             cpi->mode_skip_mask = 0;
   3288             break;
   3289           case LAST_FRAME:
   3290             cpi->mode_skip_mask = LAST_FRAME_MODE_MASK;
   3291             break;
   3292           case GOLDEN_FRAME:
   3293             cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK;
   3294             break;
   3295           case ALTREF_FRAME:
   3296             cpi->mode_skip_mask = ALT_REF_MODE_MASK;
   3297             break;
   3298           case NONE:
   3299           case MAX_REF_FRAMES:
   3300             assert(!"Invalid Reference frame");
   3301         }
   3302       }
   3303       if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
   3304         continue;
   3305     }
   3306 
   3307     // Skip if the current reference frame has been masked off
   3308     if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
   3309         (cpi->ref_frame_mask & (1 << ref_frame)))
   3310       continue;
   3311 
   3312     // Test best rd so far against threshold for trying this mode.
   3313     if ((best_rd < ((int64_t)cpi->rd_threshes[segment_id][bsize][mode_index] *
   3314                      cpi->rd_thresh_freq_fact[bsize][mode_index] >> 5)) ||
   3315         cpi->rd_threshes[segment_id][bsize][mode_index] == INT_MAX)
   3316       continue;
   3317 
   3318     // Do not allow compound prediction if the segment level reference
   3319     // frame feature is in use as in this case there can only be one reference.
   3320     if ((second_ref_frame > INTRA_FRAME) &&
   3321          vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
   3322       continue;
   3323 
   3324     // Skip some checking based on small partitions' result.
   3325     if (x->fast_ms > 1 && !ref_frame)
   3326       continue;
   3327     if (x->fast_ms > 2 && ref_frame != x->subblock_ref)
   3328       continue;
   3329 
   3330     if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_8X8) {
   3331       if (!(ref_frame_mask & (1 << ref_frame))) {
   3332         continue;
   3333       }
   3334       if (!(mode_mask & (1 << this_mode))) {
   3335         continue;
   3336       }
   3337       if (second_ref_frame != NONE
   3338           && !(ref_frame_mask & (1 << second_ref_frame))) {
   3339         continue;
   3340       }
   3341     }
   3342 
   3343     mbmi->ref_frame[0] = ref_frame;
   3344     mbmi->ref_frame[1] = second_ref_frame;
   3345 
   3346     if (!(ref_frame == INTRA_FRAME
   3347         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
   3348       continue;
   3349     }
   3350     if (!(second_ref_frame == NONE
   3351         || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
   3352       continue;
   3353     }
   3354 
   3355     comp_pred = second_ref_frame > INTRA_FRAME;
   3356     if (comp_pred) {
   3357       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
   3358         if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
   3359           continue;
   3360       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
   3361         if (ref_frame != best_inter_ref_frame &&
   3362             second_ref_frame != best_inter_ref_frame)
   3363           continue;
   3364     }
   3365 
   3366     set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
   3367     mbmi->uv_mode = DC_PRED;
   3368 
   3369     // Evaluate all sub-pel filters irrespective of whether we can use
   3370     // them for this frame.
   3371     mbmi->interp_filter = cm->mcomp_filter_type;
   3372     vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
   3373 
   3374     if (comp_pred) {
   3375       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
   3376         continue;
   3377       set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
   3378 
   3379       mode_excluded = mode_excluded
   3380                          ? mode_excluded
   3381                          : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
   3382     } else {
   3383       if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
   3384         mode_excluded =
   3385             mode_excluded ?
   3386                 mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
   3387       }
   3388     }
   3389 
   3390     // Select prediction reference frames.
   3391     for (i = 0; i < MAX_MB_PLANE; i++) {
   3392       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
   3393       if (comp_pred)
   3394         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
   3395     }
   3396 
   3397     // If the segment reference frame feature is enabled....
   3398     // then do nothing if the current ref frame is not allowed..
   3399     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
   3400         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
   3401             (int)ref_frame) {
   3402       continue;
   3403     // If the segment skip feature is enabled....
   3404     // then do nothing if the current mode is not allowed..
   3405     } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
   3406                (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
   3407       continue;
   3408     // Disable this drop out case if the ref frame
   3409     // segment level feature is enabled for this segment. This is to
   3410     // prevent the possibility that we end up unable to pick any mode.
   3411     } else if (!vp9_segfeature_active(seg, segment_id,
   3412                                       SEG_LVL_REF_FRAME)) {
   3413       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
   3414       // unless ARNR filtering is enabled in which case we want
   3415       // an unfiltered alternative. We allow near/nearest as well
   3416       // because they may result in zero-zero MVs but be cheaper.
   3417       if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
   3418         if ((this_mode != ZEROMV &&
   3419              !(this_mode == NEARMV &&
   3420                frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
   3421              !(this_mode == NEARESTMV &&
   3422                frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
   3423             ref_frame != ALTREF_FRAME) {
   3424           continue;
   3425         }
   3426       }
   3427     }
   3428     // TODO(JBB): This is to make up for the fact that we don't have sad
   3429     // functions that work when the block size reads outside the umv.  We
   3430     // should fix this either by making the motion search just work on
   3431     // a representative block in the boundary ( first ) and then implement a
   3432     // function that does sads when inside the border..
   3433     if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
   3434         this_mode == NEWMV) {
   3435       continue;
   3436     }
   3437 
   3438 #ifdef MODE_TEST_HIT_STATS
   3439     // TEST/DEBUG CODE
   3440     // Keep a record of the number of test hits at each size
   3441     cpi->mode_test_hits[bsize]++;
   3442 #endif
   3443 
   3444 
   3445     if (ref_frame == INTRA_FRAME) {
   3446       TX_SIZE uv_tx;
   3447       // Disable intra modes other than DC_PRED for blocks with low variance
   3448       // Threshold for intra skipping based on source variance
   3449       // TODO(debargha): Specialize the threshold for super block sizes
   3450       static const unsigned int skip_intra_var_thresh[BLOCK_SIZES] = {
   3451         64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
   3452       };
   3453       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
   3454           this_mode != DC_PRED &&
   3455           x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
   3456         continue;
   3457       // Only search the oblique modes if the best so far is
   3458       // one of the neighboring directional modes
   3459       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
   3460           (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
   3461         if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
   3462           continue;
   3463       }
   3464       mbmi->mode = this_mode;
   3465       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
   3466         if (conditional_skipintra(mbmi->mode, best_intra_mode))
   3467             continue;
   3468       }
   3469 
   3470       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
   3471                       bsize, tx_cache, best_rd);
   3472 
   3473       if (rate_y == INT_MAX)
   3474         continue;
   3475 
   3476       uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]);
   3477       if (rate_uv_intra[uv_tx] == INT_MAX) {
   3478         choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[uv_tx],
   3479                              &rate_uv_tokenonly[uv_tx],
   3480                              &dist_uv[uv_tx], &skip_uv[uv_tx],
   3481                              &mode_uv[uv_tx]);
   3482       }
   3483 
   3484       rate_uv = rate_uv_tokenonly[uv_tx];
   3485       distortion_uv = dist_uv[uv_tx];
   3486       skippable = skippable && skip_uv[uv_tx];
   3487       mbmi->uv_mode = mode_uv[uv_tx];
   3488 
   3489       rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
   3490       if (this_mode != DC_PRED && this_mode != TM_PRED)
   3491         rate2 += intra_cost_penalty;
   3492       distortion2 = distortion_y + distortion_uv;
   3493     } else {
   3494       mbmi->mode = this_mode;
   3495       compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
   3496       this_rd = handle_inter_mode(cpi, x, tile, bsize,
   3497                                   tx_cache,
   3498                                   &rate2, &distortion2, &skippable,
   3499                                   &rate_y, &distortion_y,
   3500                                   &rate_uv, &distortion_uv,
   3501                                   &mode_excluded, &disable_skip,
   3502                                   &tmp_best_filter, frame_mv,
   3503                                   mi_row, mi_col,
   3504                                   single_newmv, &total_sse, best_rd);
   3505       if (this_rd == INT64_MAX)
   3506         continue;
   3507     }
   3508 
   3509     if (cm->comp_pred_mode == HYBRID_PREDICTION) {
   3510       rate2 += compmode_cost;
   3511     }
   3512 
   3513     // Estimate the reference frame signaling cost and add it
   3514     // to the rolling cost variable.
   3515     if (second_ref_frame > INTRA_FRAME) {
   3516       rate2 += ref_costs_comp[ref_frame];
   3517     } else {
   3518       rate2 += ref_costs_single[ref_frame];
   3519     }
   3520 
   3521     if (!disable_skip) {
   3522       // Test for the condition where skip block will be activated
   3523       // because there are no non zero coefficients and make any
   3524       // necessary adjustment for rate. Ignore if skip is coded at
   3525       // segment level as the cost won't have been added in.
   3526       // Is Mb level skip allowed (i.e. not coded at segment level).
   3527       const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
   3528                                                          SEG_LVL_SKIP);
   3529 
   3530       if (skippable) {
   3531         // Back out the coefficient coding costs
   3532         rate2 -= (rate_y + rate_uv);
   3533         // for best yrd calculation
   3534         rate_uv = 0;
   3535 
   3536         if (mb_skip_allowed) {
   3537           int prob_skip_cost;
   3538 
   3539           // Cost the skip mb case
   3540           vp9_prob skip_prob =
   3541             vp9_get_pred_prob_mbskip(cm, xd);
   3542 
   3543           if (skip_prob) {
   3544             prob_skip_cost = vp9_cost_bit(skip_prob, 1);
   3545             rate2 += prob_skip_cost;
   3546           }
   3547         }
   3548       } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
   3549         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
   3550             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
   3551           // Add in the cost of the no skip flag.
   3552           int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
   3553                                             0);
   3554           rate2 += prob_skip_cost;
   3555         } else {
   3556           // FIXME(rbultje) make this work for splitmv also
   3557           int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
   3558                                             1);
   3559           rate2 += prob_skip_cost;
   3560           distortion2 = total_sse;
   3561           assert(total_sse >= 0);
   3562           rate2 -= (rate_y + rate_uv);
   3563           rate_y = 0;
   3564           rate_uv = 0;
   3565           this_skip2 = 1;
   3566         }
   3567       } else if (mb_skip_allowed) {
   3568         // Add in the cost of the no skip flag.
   3569         int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
   3570                                           0);
   3571         rate2 += prob_skip_cost;
   3572       }
   3573 
   3574       // Calculate the final RD estimate for this mode.
   3575       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   3576     }
   3577 
   3578     // Keep record of best intra rd
   3579     if (!is_inter_block(&xd->mi_8x8[0]->mbmi) &&
   3580         this_rd < best_intra_rd) {
   3581       best_intra_rd = this_rd;
   3582       best_intra_mode = xd->mi_8x8[0]->mbmi.mode;
   3583     }
   3584 
   3585     // Keep record of best inter rd with single reference
   3586     if (is_inter_block(&xd->mi_8x8[0]->mbmi) &&
   3587         !has_second_ref(&xd->mi_8x8[0]->mbmi) &&
   3588         !mode_excluded && this_rd < best_inter_rd) {
   3589       best_inter_rd = this_rd;
   3590       best_inter_ref_frame = ref_frame;
   3591     }
   3592 
   3593     if (!disable_skip && ref_frame == INTRA_FRAME) {
   3594       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
   3595         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
   3596       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
   3597         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
   3598     }
   3599 
   3600     // Store the respective mode distortions for later use.
   3601     if (mode_distortions[this_mode] == -1
   3602         || distortion2 < mode_distortions[this_mode]) {
   3603       mode_distortions[this_mode] = distortion2;
   3604     }
   3605     if (frame_distortions[ref_frame] == -1
   3606         || distortion2 < frame_distortions[ref_frame]) {
   3607       frame_distortions[ref_frame] = distortion2;
   3608     }
   3609 
   3610     // Did this mode help.. i.e. is it the new best mode
   3611     if (this_rd < best_rd || x->skip) {
   3612       int max_plane = MAX_MB_PLANE;
   3613       if (!mode_excluded) {
   3614         // Note index of best mode so far
   3615         best_mode_index = mode_index;
   3616 
   3617         if (ref_frame == INTRA_FRAME) {
   3618           /* required for left and above block mv */
   3619           mbmi->mv[0].as_int = 0;
   3620           max_plane = 1;
   3621         }
   3622 
   3623         *returnrate = rate2;
   3624         *returndistortion = distortion2;
   3625         best_rd = this_rd;
   3626         best_mbmode = *mbmi;
   3627         best_skip2 = this_skip2;
   3628         if (!x->select_txfm_size)
   3629           swap_block_ptr(x, ctx, max_plane);
   3630         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
   3631                    sizeof(uint8_t) * ctx->num_4x4_blk);
   3632 
   3633         // TODO(debargha): enhance this test with a better distortion prediction
   3634         // based on qp, activity mask and history
   3635         if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
   3636             (mode_index > MIN_EARLY_TERM_INDEX)) {
   3637           const int qstep = xd->plane[0].dequant[1];
   3638           // TODO(debargha): Enhance this by specializing for each mode_index
   3639           int scale = 4;
   3640           if (x->source_variance < UINT_MAX) {
   3641             const int var_adjust = (x->source_variance < 16);
   3642             scale -= var_adjust;
   3643           }
   3644           if (ref_frame > INTRA_FRAME &&
   3645               distortion2 * scale < qstep * qstep) {
   3646             early_term = 1;
   3647           }
   3648         }
   3649       }
   3650     }
   3651 
   3652     /* keep record of best compound/single-only prediction */
   3653     if (!disable_skip && ref_frame != INTRA_FRAME) {
   3654       int single_rd, hybrid_rd, single_rate, hybrid_rate;
   3655 
   3656       if (cm->comp_pred_mode == HYBRID_PREDICTION) {
   3657         single_rate = rate2 - compmode_cost;
   3658         hybrid_rate = rate2;
   3659       } else {
   3660         single_rate = rate2;
   3661         hybrid_rate = rate2 + compmode_cost;
   3662       }
   3663 
   3664       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
   3665       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
   3666 
   3667       if (second_ref_frame <= INTRA_FRAME &&
   3668           single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
   3669         best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
   3670       } else if (second_ref_frame > INTRA_FRAME &&
   3671                  single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
   3672         best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
   3673       }
   3674       if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
   3675         best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
   3676     }
   3677 
   3678     /* keep record of best filter type */
   3679     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
   3680         cm->mcomp_filter_type != BILINEAR) {
   3681       int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
   3682                               SWITCHABLE_FILTERS : cm->mcomp_filter_type];
   3683       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
   3684         int64_t adj_rd;
   3685         // In cases of poor prediction, filter_cache[] can contain really big
   3686         // values, which actually are bigger than this_rd itself. This can
   3687         // cause negative best_filter_rd[] values, which is obviously silly.
   3688         // Therefore, if filter_cache < ref, we do an adjusted calculation.
   3689         if (cpi->rd_filter_cache[i] >= ref) {
   3690           adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
   3691         } else {
   3692           // FIXME(rbultje) do this for comp pred also
   3693           //
   3694           // To prevent out-of-range computation in
   3695           //    adj_rd = cpi->rd_filter_cache[i] * this_rd / ref
   3696           // cpi->rd_filter_cache[i] / ref is converted to a 256 based ratio.
   3697           int tmp = cpi->rd_filter_cache[i] * 256 / ref;
   3698           adj_rd = (this_rd * tmp) >> 8;
   3699         }
   3700         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
   3701       }
   3702     }
   3703 
   3704     /* keep record of best txfm size */
   3705     if (bsize < BLOCK_32X32) {
   3706       if (bsize < BLOCK_16X16)
   3707         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
   3708 
   3709       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
   3710     }
   3711     if (!mode_excluded && this_rd != INT64_MAX) {
   3712       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
   3713         int64_t adj_rd = INT64_MAX;
   3714         adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
   3715 
   3716         if (adj_rd < best_tx_rd[i])
   3717           best_tx_rd[i] = adj_rd;
   3718       }
   3719     }
   3720 
   3721     if (early_term)
   3722       break;
   3723 
   3724     if (x->skip && !comp_pred)
   3725       break;
   3726   }
   3727 
   3728   if (best_rd >= best_rd_so_far)
   3729     return INT64_MAX;
   3730 
   3731   // If we used an estimate for the uv intra rd in the loop above...
   3732   if (cpi->sf.use_uv_intra_rd_estimate) {
   3733     // Do Intra UV best rd mode selection if best mode choice above was intra.
   3734     if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
   3735       TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
   3736       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
   3737                               &rate_uv_tokenonly[uv_tx_size],
   3738                               &dist_uv[uv_tx_size],
   3739                               &skip_uv[uv_tx_size],
   3740                               bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   3741     }
   3742   }
   3743 
   3744   // If we are using reference masking and the set mask flag is set then
   3745   // create the reference frame mask.
   3746   if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
   3747     cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame);
   3748 
   3749   // Flag all modes that have a distortion that's > 2x the best we found at
   3750   // this level.
   3751   for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
   3752     if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
   3753       continue;
   3754 
   3755     if (mode_distortions[mode_index] > 2 * *returndistortion) {
   3756       ctx->modes_with_high_error |= (1 << mode_index);
   3757     }
   3758   }
   3759 
   3760   // Flag all ref frames that have a distortion that's > 2x the best we found at
   3761   // this level.
   3762   for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
   3763     if (frame_distortions[ref_frame] > 2 * *returndistortion) {
   3764       ctx->frames_with_high_error |= (1 << ref_frame);
   3765     }
   3766   }
   3767 
   3768   assert((cm->mcomp_filter_type == SWITCHABLE) ||
   3769          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
   3770          (best_mbmode.ref_frame[0] == INTRA_FRAME));
   3771 
   3772   // Updating rd_thresh_freq_fact[] here means that the different
   3773   // partition/block sizes are handled independently based on the best
   3774   // choice for the current partition. It may well be better to keep a scaled
   3775   // best rd so far value and update rd_thresh_freq_fact based on the mode/size
   3776   // combination that wins out.
   3777   if (cpi->sf.adaptive_rd_thresh) {
   3778     for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
   3779       if (mode_index == best_mode_index) {
   3780         cpi->rd_thresh_freq_fact[bsize][mode_index] -=
   3781           (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
   3782       } else {
   3783         cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
   3784         if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
   3785             (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
   3786           cpi->rd_thresh_freq_fact[bsize][mode_index] =
   3787             cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
   3788         }
   3789       }
   3790     }
   3791   }
   3792 
   3793   // macroblock modes
   3794   *mbmi = best_mbmode;
   3795   x->skip |= best_skip2;
   3796 
   3797   for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
   3798     if (best_pred_rd[i] == INT64_MAX)
   3799       best_pred_diff[i] = INT_MIN;
   3800     else
   3801       best_pred_diff[i] = best_rd - best_pred_rd[i];
   3802   }
   3803 
   3804   if (!x->skip) {
   3805     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
   3806       if (best_filter_rd[i] == INT64_MAX)
   3807         best_filter_diff[i] = 0;
   3808       else
   3809         best_filter_diff[i] = best_rd - best_filter_rd[i];
   3810     }
   3811     if (cm->mcomp_filter_type == SWITCHABLE)
   3812       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
   3813   } else {
   3814     vp9_zero(best_filter_diff);
   3815   }
   3816 
   3817   if (!x->skip) {
   3818     for (i = 0; i < TX_MODES; i++) {
   3819       if (best_tx_rd[i] == INT64_MAX)
   3820         best_tx_diff[i] = 0;
   3821       else
   3822         best_tx_diff[i] = best_rd - best_tx_rd[i];
   3823     }
   3824   } else {
   3825     vp9_zero(best_tx_diff);
   3826   }
   3827 
   3828   set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
   3829                     scale_factor);
   3830   store_coding_context(x, ctx, best_mode_index,
   3831                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
   3832                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
   3833                                       mbmi->ref_frame[1]][0],
   3834                        best_pred_diff, best_tx_diff, best_filter_diff);
   3835 
   3836   return best_rd;
   3837 }
   3838 
   3839 
   3840 int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
   3841                                       const TileInfo *const tile,
   3842                                       int mi_row, int mi_col,
   3843                                       int *returnrate,
   3844                                       int64_t *returndistortion,
   3845                                       BLOCK_SIZE bsize,
   3846                                       PICK_MODE_CONTEXT *ctx,
   3847                                       int64_t best_rd_so_far) {
   3848   VP9_COMMON *cm = &cpi->common;
   3849   MACROBLOCKD *xd = &x->e_mbd;
   3850   MB_MODE_INFO *mbmi = &xd->mi_8x8[0]->mbmi;
   3851   const struct segmentation *seg = &cm->seg;
   3852   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   3853   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   3854   unsigned char segment_id = mbmi->segment_id;
   3855   int comp_pred, i;
   3856   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   3857   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   3858   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
   3859                                     VP9_ALT_FLAG };
   3860   int idx_list[4] = {0,
   3861                      cpi->lst_fb_idx,
   3862                      cpi->gld_fb_idx,
   3863                      cpi->alt_fb_idx};
   3864   int64_t best_rd = best_rd_so_far;
   3865   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   3866   int64_t best_tx_rd[TX_MODES];
   3867   int64_t best_tx_diff[TX_MODES];
   3868   int64_t best_pred_diff[NB_PREDICTION_TYPES];
   3869   int64_t best_pred_rd[NB_PREDICTION_TYPES];
   3870   int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
   3871   int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   3872   MB_MODE_INFO best_mbmode = { 0 };
   3873   int mode_index, best_mode_index = 0;
   3874   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   3875   vp9_prob comp_mode_p;
   3876   int64_t best_inter_rd = INT64_MAX;
   3877   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
   3878   INTERPOLATION_TYPE tmp_best_filter = SWITCHABLE;
   3879   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
   3880   int64_t dist_uv[TX_SIZES];
   3881   int skip_uv[TX_SIZES];
   3882   MB_PREDICTION_MODE mode_uv[TX_SIZES] = { 0 };
   3883   struct scale_factors scale_factor[4];
   3884   unsigned int ref_frame_mask = 0;
   3885   unsigned int mode_mask = 0;
   3886   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
   3887                                              cpi->common.y_dc_delta_q);
   3888   int_mv seg_mvs[4][MAX_REF_FRAMES];
   3889   b_mode_info best_bmodes[4];
   3890   int best_skip2 = 0;
   3891 
   3892   x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
   3893   vpx_memset(x->zcoeff_blk[TX_4X4], 0, 4);
   3894 
   3895   for (i = 0; i < 4; i++) {
   3896     int j;
   3897     for (j = 0; j < MAX_REF_FRAMES; j++)
   3898       seg_mvs[i][j].as_int = INVALID_MV;
   3899   }
   3900 
   3901   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
   3902                            &comp_mode_p);
   3903 
   3904   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
   3905     best_pred_rd[i] = INT64_MAX;
   3906   for (i = 0; i < TX_MODES; i++)
   3907     best_tx_rd[i] = INT64_MAX;
   3908   for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
   3909     best_filter_rd[i] = INT64_MAX;
   3910   for (i = 0; i < TX_SIZES; i++)
   3911     rate_uv_intra[i] = INT_MAX;
   3912 
   3913   *returnrate = INT_MAX;
   3914 
   3915   // Create a mask set to 1 for each reference frame used by a smaller
   3916   // resolution.
   3917   if (cpi->sf.use_avoid_tested_higherror) {
   3918     ref_frame_mask = 0;
   3919     mode_mask = 0;
   3920     ref_frame_mask = ~ref_frame_mask;
   3921     mode_mask = ~mode_mask;
   3922   }
   3923 
   3924   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
   3925     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
   3926       setup_buffer_inter(cpi, x, tile, idx_list[ref_frame], ref_frame,
   3927                          block_size, mi_row, mi_col,
   3928                          frame_mv[NEARESTMV], frame_mv[NEARMV],
   3929                          yv12_mb, scale_factor);
   3930     }
   3931     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   3932     frame_mv[ZEROMV][ref_frame].as_int = 0;
   3933   }
   3934 
   3935   for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
   3936     int mode_excluded = 0;
   3937     int64_t this_rd = INT64_MAX;
   3938     int disable_skip = 0;
   3939     int compmode_cost = 0;
   3940     int rate2 = 0, rate_y = 0, rate_uv = 0;
   3941     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
   3942     int skippable = 0;
   3943     int64_t tx_cache[TX_MODES];
   3944     int i;
   3945     int this_skip2 = 0;
   3946     int64_t total_sse = INT_MAX;
   3947     int early_term = 0;
   3948 
   3949     for (i = 0; i < TX_MODES; ++i)
   3950       tx_cache[i] = INT64_MAX;
   3951 
   3952     x->skip = 0;
   3953     ref_frame = vp9_ref_order[mode_index].ref_frame;
   3954     second_ref_frame = vp9_ref_order[mode_index].second_ref_frame;
   3955 
   3956     // Look at the reference frame of the best mode so far and set the
   3957     // skip mask to look at a subset of the remaining modes.
   3958     if (mode_index > 2 && cpi->sf.mode_skip_start < MAX_MODES) {
   3959       if (mode_index == 3) {
   3960         switch (vp9_ref_order[best_mode_index].ref_frame) {
   3961           case INTRA_FRAME:
   3962             cpi->mode_skip_mask = 0;
   3963             break;
   3964           case LAST_FRAME:
   3965             cpi->mode_skip_mask = 0x0010;
   3966             break;
   3967           case GOLDEN_FRAME:
   3968             cpi->mode_skip_mask = 0x0008;
   3969             break;
   3970           case ALTREF_FRAME:
   3971             cpi->mode_skip_mask = 0x0000;
   3972             break;
   3973           case NONE:
   3974           case MAX_REF_FRAMES:
   3975             assert(!"Invalid Reference frame");
   3976         }
   3977       }
   3978       if (cpi->mode_skip_mask & ((int64_t)1 << mode_index))
   3979         continue;
   3980     }
   3981 
   3982     // Skip if the current reference frame has been masked off
   3983     if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
   3984         (cpi->ref_frame_mask & (1 << ref_frame)))
   3985       continue;
   3986 
   3987     // Test best rd so far against threshold for trying this mode.
   3988     if ((best_rd <
   3989          ((int64_t)cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] *
   3990           cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 5)) ||
   3991         cpi->rd_thresh_sub8x8[segment_id][bsize][mode_index] == INT_MAX)
   3992       continue;
   3993 
   3994     // Do not allow compound prediction if the segment level reference
   3995     // frame feature is in use as in this case there can only be one reference.
   3996     if ((second_ref_frame > INTRA_FRAME) &&
   3997          vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
   3998       continue;
   3999 
   4000     mbmi->ref_frame[0] = ref_frame;
   4001     mbmi->ref_frame[1] = second_ref_frame;
   4002 
   4003     if (!(ref_frame == INTRA_FRAME
   4004         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
   4005       continue;
   4006     }
   4007     if (!(second_ref_frame == NONE
   4008         || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
   4009       continue;
   4010     }
   4011 
   4012     comp_pred = second_ref_frame > INTRA_FRAME;
   4013     if (comp_pred) {
   4014       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
   4015         if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME)
   4016           continue;
   4017       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
   4018         if (ref_frame != best_inter_ref_frame &&
   4019             second_ref_frame != best_inter_ref_frame)
   4020           continue;
   4021     }
   4022 
   4023     // TODO(jingning, jkoleszar): scaling reference frame not supported for
   4024     // sub8x8 blocks.
   4025     if (ref_frame > 0 &&
   4026         vp9_is_scaled(scale_factor[ref_frame].sfc))
   4027       continue;
   4028 
   4029     if (second_ref_frame > 0 &&
   4030         vp9_is_scaled(scale_factor[second_ref_frame].sfc))
   4031       continue;
   4032 
   4033     set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
   4034     mbmi->uv_mode = DC_PRED;
   4035 
   4036     // Evaluate all sub-pel filters irrespective of whether we can use
   4037     // them for this frame.
   4038     mbmi->interp_filter = cm->mcomp_filter_type;
   4039     vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
   4040 
   4041     if (comp_pred) {
   4042       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
   4043         continue;
   4044       set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
   4045 
   4046       mode_excluded = mode_excluded
   4047                          ? mode_excluded
   4048                          : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
   4049     } else {
   4050       if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
   4051         mode_excluded =
   4052             mode_excluded ?
   4053                 mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
   4054       }
   4055     }
   4056 
   4057     // Select prediction reference frames.
   4058     for (i = 0; i < MAX_MB_PLANE; i++) {
   4059       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
   4060       if (comp_pred)
   4061         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
   4062     }
   4063 
   4064     // If the segment reference frame feature is enabled....
   4065     // then do nothing if the current ref frame is not allowed..
   4066     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
   4067         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
   4068             (int)ref_frame) {
   4069       continue;
   4070     // If the segment skip feature is enabled....
   4071     // then do nothing if the current mode is not allowed..
   4072     } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
   4073                ref_frame != INTRA_FRAME) {
   4074       continue;
   4075     // Disable this drop out case if the ref frame
   4076     // segment level feature is enabled for this segment. This is to
   4077     // prevent the possibility that we end up unable to pick any mode.
   4078     } else if (!vp9_segfeature_active(seg, segment_id,
   4079                                       SEG_LVL_REF_FRAME)) {
   4080       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
   4081       // unless ARNR filtering is enabled in which case we want
   4082       // an unfiltered alternative. We allow near/nearest as well
   4083       // because they may result in zero-zero MVs but be cheaper.
   4084       if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
   4085         continue;
   4086     }
   4087 
   4088 #ifdef MODE_TEST_HIT_STATS
   4089     // TEST/DEBUG CODE
   4090     // Keep a record of the number of test hits at each size
   4091     cpi->mode_test_hits[bsize]++;
   4092 #endif
   4093 
   4094     if (ref_frame == INTRA_FRAME) {
   4095       int rate;
   4096       mbmi->tx_size = TX_4X4;
   4097       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
   4098                                        &distortion_y, best_rd) >= best_rd)
   4099         continue;
   4100       rate2 += rate;
   4101       rate2 += intra_cost_penalty;
   4102       distortion2 += distortion_y;
   4103 
   4104       if (rate_uv_intra[TX_4X4] == INT_MAX) {
   4105         choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[TX_4X4],
   4106                              &rate_uv_tokenonly[TX_4X4],
   4107                              &dist_uv[TX_4X4], &skip_uv[TX_4X4],
   4108                              &mode_uv[TX_4X4]);
   4109       }
   4110       rate2 += rate_uv_intra[TX_4X4];
   4111       rate_uv = rate_uv_tokenonly[TX_4X4];
   4112       distortion2 += dist_uv[TX_4X4];
   4113       distortion_uv = dist_uv[TX_4X4];
   4114       mbmi->uv_mode = mode_uv[TX_4X4];
   4115       tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   4116       for (i = 0; i < TX_MODES; ++i)
   4117         tx_cache[i] = tx_cache[ONLY_4X4];
   4118     } else {
   4119       int rate;
   4120       int64_t distortion;
   4121       int64_t this_rd_thresh;
   4122       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
   4123       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
   4124       int64_t tmp_best_distortion = INT_MAX, tmp_best_sse, uv_sse;
   4125       int tmp_best_skippable = 0;
   4126       int switchable_filter_index;
   4127       int_mv *second_ref = comp_pred ?
   4128                              &mbmi->ref_mvs[second_ref_frame][0] : NULL;
   4129       b_mode_info tmp_best_bmodes[16];
   4130       MB_MODE_INFO tmp_best_mbmode;
   4131       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
   4132       int pred_exists = 0;
   4133       int uv_skippable;
   4134 
   4135       this_rd_thresh = (ref_frame == LAST_FRAME) ?
   4136           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_LAST] :
   4137           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_ALTR];
   4138       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
   4139           cpi->rd_thresh_sub8x8[segment_id][bsize][THR_GOLD] : this_rd_thresh;
   4140       xd->mi_8x8[0]->mbmi.tx_size = TX_4X4;
   4141 
   4142       cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
   4143       if (cm->mcomp_filter_type != BILINEAR) {
   4144         tmp_best_filter = EIGHTTAP;
   4145         if (x->source_variance <
   4146             cpi->sf.disable_filter_search_var_thresh) {
   4147           tmp_best_filter = EIGHTTAP;
   4148           vp9_zero(cpi->rd_filter_cache);
   4149         } else {
   4150           for (switchable_filter_index = 0;
   4151                switchable_filter_index < SWITCHABLE_FILTERS;
   4152                ++switchable_filter_index) {
   4153             int newbest, rs;
   4154             int64_t rs_rd;
   4155             mbmi->interp_filter = switchable_filter_index;
   4156             vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
   4157 
   4158             tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
   4159                                                  &mbmi->ref_mvs[ref_frame][0],
   4160                                                  second_ref,
   4161                                                  best_yrd,
   4162                                                  &rate, &rate_y, &distortion,
   4163                                                  &skippable, &total_sse,
   4164                                                  (int)this_rd_thresh, seg_mvs,
   4165                                                  bsi, switchable_filter_index,
   4166                                                  mi_row, mi_col);
   4167 
   4168             if (tmp_rd == INT64_MAX)
   4169               continue;
   4170             cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
   4171             rs = get_switchable_rate(x);
   4172             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
   4173             cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
   4174                 MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
   4175                     tmp_rd + rs_rd);
   4176             if (cm->mcomp_filter_type == SWITCHABLE)
   4177               tmp_rd += rs_rd;
   4178 
   4179             newbest = (tmp_rd < tmp_best_rd);
   4180             if (newbest) {
   4181               tmp_best_filter = mbmi->interp_filter;
   4182               tmp_best_rd = tmp_rd;
   4183             }
   4184             if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
   4185                 (mbmi->interp_filter == cm->mcomp_filter_type &&
   4186                  cm->mcomp_filter_type != SWITCHABLE)) {
   4187               tmp_best_rdu = tmp_rd;
   4188               tmp_best_rate = rate;
   4189               tmp_best_ratey = rate_y;
   4190               tmp_best_distortion = distortion;
   4191               tmp_best_sse = total_sse;
   4192               tmp_best_skippable = skippable;
   4193               tmp_best_mbmode = *mbmi;
   4194               for (i = 0; i < 4; i++) {
   4195                 tmp_best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
   4196                 x->zcoeff_blk[TX_4X4][i] = !xd->plane[0].eobs[i];
   4197               }
   4198               pred_exists = 1;
   4199               if (switchable_filter_index == 0 &&
   4200                   cpi->sf.use_rd_breakout &&
   4201                   best_rd < INT64_MAX) {
   4202                 if (tmp_best_rdu / 2 > best_rd) {
   4203                   // skip searching the other filters if the first is
   4204                   // already substantially larger than the best so far
   4205                   tmp_best_filter = mbmi->interp_filter;
   4206                   tmp_best_rdu = INT64_MAX;
   4207                   break;
   4208                 }
   4209               }
   4210             }
   4211           }  // switchable_filter_index loop
   4212         }
   4213       }
   4214 
   4215       if (tmp_best_rdu == INT64_MAX)
   4216         continue;
   4217 
   4218       mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
   4219                              tmp_best_filter : cm->mcomp_filter_type);
   4220       vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
   4221       if (!pred_exists) {
   4222         // Handles the special case when a filter that is not in the
   4223         // switchable list (bilinear, 6-tap) is indicated at the frame level
   4224         tmp_rd = rd_pick_best_mbsegmentation(cpi, x, tile,
   4225                      &mbmi->ref_mvs[ref_frame][0],
   4226                      second_ref,
   4227                      best_yrd,
   4228                      &rate, &rate_y, &distortion,
   4229                      &skippable, &total_sse,
   4230                      (int)this_rd_thresh, seg_mvs,
   4231                      bsi, 0,
   4232                      mi_row, mi_col);
   4233         if (tmp_rd == INT64_MAX)
   4234           continue;
   4235       } else {
   4236         if (cpi->common.mcomp_filter_type == SWITCHABLE) {
   4237           int rs = get_switchable_rate(x);
   4238           tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
   4239         }
   4240         tmp_rd = tmp_best_rdu;
   4241         total_sse = tmp_best_sse;
   4242         rate = tmp_best_rate;
   4243         rate_y = tmp_best_ratey;
   4244         distortion = tmp_best_distortion;
   4245         skippable = tmp_best_skippable;
   4246         *mbmi = tmp_best_mbmode;
   4247         for (i = 0; i < 4; i++)
   4248           xd->mi_8x8[0]->bmi[i] = tmp_best_bmodes[i];
   4249       }
   4250 
   4251       rate2 += rate;
   4252       distortion2 += distortion;
   4253 
   4254       if (cpi->common.mcomp_filter_type == SWITCHABLE)
   4255         rate2 += get_switchable_rate(x);
   4256 
   4257       if (!mode_excluded) {
   4258         if (comp_pred)
   4259           mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
   4260         else
   4261           mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
   4262       }
   4263       compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred);
   4264 
   4265       tmp_best_rdu = best_rd -
   4266           MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
   4267               RDCOST(x->rdmult, x->rddiv, 0, total_sse));
   4268 
   4269       if (tmp_best_rdu > 0) {
   4270         // If even the 'Y' rd value of split is higher than best so far
   4271         // then don't bother looking at UV
   4272         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
   4273                                         BLOCK_8X8);
   4274         super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
   4275                          &uv_sse, BLOCK_8X8, tmp_best_rdu);
   4276         if (rate_uv == INT_MAX)
   4277           continue;
   4278         rate2 += rate_uv;
   4279         distortion2 += distortion_uv;
   4280         skippable = skippable && uv_skippable;
   4281         total_sse += uv_sse;
   4282 
   4283         tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   4284         for (i = 0; i < TX_MODES; ++i)
   4285           tx_cache[i] = tx_cache[ONLY_4X4];
   4286       }
   4287     }
   4288 
   4289     if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
   4290       rate2 += compmode_cost;
   4291     }
   4292 
   4293     // Estimate the reference frame signaling cost and add it
   4294     // to the rolling cost variable.
   4295     if (second_ref_frame > INTRA_FRAME) {
   4296       rate2 += ref_costs_comp[ref_frame];
   4297     } else {
   4298       rate2 += ref_costs_single[ref_frame];
   4299     }
   4300 
   4301     if (!disable_skip) {
   4302       // Test for the condition where skip block will be activated
   4303       // because there are no non-zero coefficients and make any
   4304       // necessary adjustment for rate. Ignore if skip is coded at
   4305       // segment level as the cost won't have been added in.
   4306       // Is Mb level skip allowed (i.e. not coded at segment level).
   4307       const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
   4308                                                          SEG_LVL_SKIP);
   4309 
   4310       if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
   4311         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
   4312             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
   4313           // Add in the cost of the no skip flag.
   4314           int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
   4315                                             0);
   4316           rate2 += prob_skip_cost;
   4317         } else {
   4318           // FIXME(rbultje) make this work for splitmv also
   4319           int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
   4320                                             1);
   4321           rate2 += prob_skip_cost;
   4322           distortion2 = total_sse;
   4323           assert(total_sse >= 0);
   4324           rate2 -= (rate_y + rate_uv);
   4325           rate_y = 0;
   4326           rate_uv = 0;
   4327           this_skip2 = 1;
   4328         }
   4329       } else if (mb_skip_allowed) {
   4330         // Add in the cost of the no skip flag.
   4331         int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
   4332                                           0);
   4333         rate2 += prob_skip_cost;
   4334       }
   4335 
   4336       // Calculate the final RD estimate for this mode.
   4337       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   4338     }
   4339 
   4340     // Keep record of best inter rd with single reference
   4341     if (xd->mi_8x8[0]->mbmi.ref_frame[0] > INTRA_FRAME &&
   4342         xd->mi_8x8[0]->mbmi.ref_frame[1] == NONE &&
   4343         !mode_excluded &&
   4344         this_rd < best_inter_rd) {
   4345       best_inter_rd = this_rd;
   4346       best_inter_ref_frame = ref_frame;
   4347     }
   4348 
   4349     if (!disable_skip && ref_frame == INTRA_FRAME) {
   4350       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
   4351         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
   4352       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
   4353         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
   4354     }
   4355 
   4356     // Did this mode help, i.e. is it the new best mode?
   4357     if (this_rd < best_rd || x->skip) {
   4358       if (!mode_excluded) {
   4359         int max_plane = MAX_MB_PLANE;
   4360         // Note index of best mode so far
   4361         best_mode_index = mode_index;
   4362 
   4363         if (ref_frame == INTRA_FRAME) {
   4364           /* required for left and above block mv */
   4365           mbmi->mv[0].as_int = 0;
   4366           max_plane = 1;
   4367         }
   4368 
   4369         *returnrate = rate2;
   4370         *returndistortion = distortion2;
   4371         best_rd = this_rd;
   4372         best_yrd = best_rd -
   4373                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
   4374         best_mbmode = *mbmi;
   4375         best_skip2 = this_skip2;
   4376         if (!x->select_txfm_size)
   4377           swap_block_ptr(x, ctx, max_plane);
   4378         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
   4379                    sizeof(uint8_t) * ctx->num_4x4_blk);
   4380 
   4381         for (i = 0; i < 4; i++)
   4382           best_bmodes[i] = xd->mi_8x8[0]->bmi[i];
   4383 
   4384         // TODO(debargha): enhance this test with a better distortion prediction
   4385         // based on qp, activity mask and history
   4386         if ((cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) &&
   4387             (mode_index > MIN_EARLY_TERM_INDEX)) {
   4388           const int qstep = xd->plane[0].dequant[1];
   4389           // TODO(debargha): Enhance this by specializing for each mode_index
   4390           int scale = 4;
   4391           if (x->source_variance < UINT_MAX) {
   4392             const int var_adjust = (x->source_variance < 16);
   4393             scale -= var_adjust;
   4394           }
   4395           if (ref_frame > INTRA_FRAME &&
   4396               distortion2 * scale < qstep * qstep) {
   4397             early_term = 1;
   4398           }
   4399         }
   4400       }
   4401     }
   4402 
   4403     /* keep record of best compound/single-only prediction */
   4404     if (!disable_skip && ref_frame != INTRA_FRAME) {
   4405       int single_rd, hybrid_rd, single_rate, hybrid_rate;
   4406 
   4407       if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
   4408         single_rate = rate2 - compmode_cost;
   4409         hybrid_rate = rate2;
   4410       } else {
   4411         single_rate = rate2;
   4412         hybrid_rate = rate2 + compmode_cost;
   4413       }
   4414 
   4415       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
   4416       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
   4417 
   4418       if (second_ref_frame <= INTRA_FRAME &&
   4419           single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
   4420         best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
   4421       } else if (second_ref_frame > INTRA_FRAME &&
   4422                  single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
   4423         best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
   4424       }
   4425       if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
   4426         best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
   4427     }
   4428 
   4429     /* keep record of best filter type */
   4430     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
   4431         cm->mcomp_filter_type != BILINEAR) {
   4432       int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
   4433                               SWITCHABLE_FILTERS : cm->mcomp_filter_type];
   4434       for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
   4435         int64_t adj_rd;
   4436         // In cases of poor prediction, filter_cache[] can contain really big
   4437         // values, which actually are bigger than this_rd itself. This can
   4438         // cause negative best_filter_rd[] values, which is obviously silly.
   4439         // Therefore, if filter_cache < ref, we do an adjusted calculation.
   4440         if (cpi->rd_filter_cache[i] >= ref)
   4441           adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
   4442         else  // FIXME(rbultje) do this for comppred also
   4443           adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref;
   4444         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
   4445       }
   4446     }
   4447 
   4448     /* keep record of best txfm size */
   4449     if (bsize < BLOCK_32X32) {
   4450       if (bsize < BLOCK_16X16) {
   4451         tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
   4452         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
   4453       }
   4454       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
   4455     }
   4456     if (!mode_excluded && this_rd != INT64_MAX) {
   4457       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
   4458         int64_t adj_rd = INT64_MAX;
   4459         if (ref_frame > INTRA_FRAME)
   4460           adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
   4461         else
   4462           adj_rd = this_rd;
   4463 
   4464         if (adj_rd < best_tx_rd[i])
   4465           best_tx_rd[i] = adj_rd;
   4466       }
   4467     }
   4468 
   4469     if (early_term)
   4470       break;
   4471 
   4472     if (x->skip && !comp_pred)
   4473       break;
   4474   }
   4475 
   4476   if (best_rd >= best_rd_so_far)
   4477     return INT64_MAX;
   4478 
   4479   // If we used an estimate for the uv intra rd in the loop above...
   4480   if (cpi->sf.use_uv_intra_rd_estimate) {
   4481     // Do Intra UV best rd mode selection if best mode choice above was intra.
   4482     if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
   4483       TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
   4484       rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
   4485                               &rate_uv_tokenonly[uv_tx_size],
   4486                               &dist_uv[uv_tx_size],
   4487                               &skip_uv[uv_tx_size],
   4488                               BLOCK_8X8);
   4489     }
   4490   }
   4491 
   4492   // If we are using reference masking and the set mask flag is set then
   4493   // create the reference frame mask.
   4494   if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
   4495     cpi->ref_frame_mask = ~(1 << vp9_ref_order[best_mode_index].ref_frame);
   4496 
   4497   if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
   4498     *returnrate = INT_MAX;
   4499     *returndistortion = INT_MAX;
   4500     return best_rd;
   4501   }
   4502 
   4503   assert((cm->mcomp_filter_type == SWITCHABLE) ||
   4504          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
   4505          (best_mbmode.ref_frame[0] == INTRA_FRAME));
   4506 
   4507   // Updating rd_thresh_freq_fact[] here means that the different
   4508   // partition/block sizes are handled independently based on the best
   4509   // choice for the current partition. It may well be better to keep a scaled
   4510   // best rd so far value and update rd_thresh_freq_fact based on the mode/size
   4511   // combination that wins out.
   4512   if (cpi->sf.adaptive_rd_thresh) {
   4513     for (mode_index = 0; mode_index < MAX_REFS; ++mode_index) {
   4514       if (mode_index == best_mode_index) {
   4515         cpi->rd_thresh_freq_sub8x8[bsize][mode_index] -=
   4516           (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >> 3);
   4517       } else {
   4518         cpi->rd_thresh_freq_sub8x8[bsize][mode_index] += RD_THRESH_INC;
   4519         if (cpi->rd_thresh_freq_sub8x8[bsize][mode_index] >
   4520             (cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT)) {
   4521           cpi->rd_thresh_freq_sub8x8[bsize][mode_index] =
   4522             cpi->sf.adaptive_rd_thresh * RD_THRESH_MAX_FACT;
   4523         }
   4524       }
   4525     }
   4526   }
   4527 
   4528   // macroblock modes
   4529   *mbmi = best_mbmode;
   4530   x->skip |= best_skip2;
   4531   if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
   4532     for (i = 0; i < 4; i++)
   4533       xd->mi_8x8[0]->bmi[i].as_mode = best_bmodes[i].as_mode;
   4534   } else {
   4535     for (i = 0; i < 4; ++i)
   4536       vpx_memcpy(&xd->mi_8x8[0]->bmi[i], &best_bmodes[i], sizeof(b_mode_info));
   4537 
   4538     mbmi->mv[0].as_int = xd->mi_8x8[0]->bmi[3].as_mv[0].as_int;
   4539     mbmi->mv[1].as_int = xd->mi_8x8[0]->bmi[3].as_mv[1].as_int;
   4540   }
   4541 
   4542   for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
   4543     if (best_pred_rd[i] == INT64_MAX)
   4544       best_pred_diff[i] = INT_MIN;
   4545     else
   4546       best_pred_diff[i] = best_rd - best_pred_rd[i];
   4547   }
   4548 
   4549   if (!x->skip) {
   4550     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
   4551       if (best_filter_rd[i] == INT64_MAX)
   4552         best_filter_diff[i] = 0;
   4553       else
   4554         best_filter_diff[i] = best_rd - best_filter_rd[i];
   4555     }
   4556     if (cm->mcomp_filter_type == SWITCHABLE)
   4557       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
   4558   } else {
   4559     vp9_zero(best_filter_diff);
   4560   }
   4561 
   4562   if (!x->skip) {
   4563     for (i = 0; i < TX_MODES; i++) {
   4564       if (best_tx_rd[i] == INT64_MAX)
   4565         best_tx_diff[i] = 0;
   4566       else
   4567         best_tx_diff[i] = best_rd - best_tx_rd[i];
   4568     }
   4569   } else {
   4570     vp9_zero(best_tx_diff);
   4571   }
   4572 
   4573   set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
   4574                     scale_factor);
   4575   store_coding_context(x, ctx, best_mode_index,
   4576                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
   4577                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
   4578                                       mbmi->ref_frame[1]][0],
   4579                        best_pred_diff, best_tx_diff, best_filter_diff);
   4580 
   4581   return best_rd;
   4582 }
   4583