/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdio.h>
#include <math.h>
#include <limits.h>
#include <assert.h>

#include "vp9/common/vp9_pragmas.h"
#include "vp9/encoder/vp9_tokenize.h"
#include "vp9/encoder/vp9_treewriter.h"
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/encoder/vp9_modecosts.h"
#include "vp9/encoder/vp9_encodeintra.h"
#include "vp9/common/vp9_entropymode.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
#include "vp9/common/vp9_findnearmv.h"
#include "vp9/common/vp9_quant_common.h"
#include "vp9/encoder/vp9_encodemb.h"
#include "vp9/encoder/vp9_quantize.h"
#include "vp9/encoder/vp9_variance.h"
#include "vp9/encoder/vp9_mcomp.h"
#include "vp9/encoder/vp9_rdopt.h"
#include "vp9/encoder/vp9_ratectrl.h"
#include "vpx_mem/vpx_mem.h"
#include "vp9/common/vp9_systemdependent.h"
#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/common/vp9_seg_common.h"
#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_entropy.h"
#include "vp9_rtcd.h"
#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_common.h"

#define INVALID_MV 0x80008000

/* Factor to weigh the rate for switchable interp filters */
#define SWITCHABLE_INTERP_RATE_FACTOR 1

DECLARE_ALIGNED(16, extern const uint8_t,
                vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);

#define LAST_FRAME_MODE_MASK    0xFFDADCD60
#define GOLDEN_FRAME_MODE_MASK  0xFFB5A3BB0
#define ALT_REF_MODE_MASK       0xFF8C648D0

const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
  {RD_NEARESTMV, LAST_FRAME,   NONE},
  {RD_NEARESTMV, ALTREF_FRAME, NONE},
  {RD_NEARESTMV, GOLDEN_FRAME, NONE},

  {RD_DC_PRED,   INTRA_FRAME,  NONE},

  {RD_NEWMV,     LAST_FRAME,   NONE},
  {RD_NEWMV,     ALTREF_FRAME, NONE},
  {RD_NEWMV,     GOLDEN_FRAME, NONE},

  {RD_NEARMV,    LAST_FRAME,   NONE},
  {RD_NEARMV,    ALTREF_FRAME, NONE},
  {RD_NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
  {RD_NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},

  {RD_TM_PRED,   INTRA_FRAME,  NONE},

  {RD_NEARMV,    LAST_FRAME,   ALTREF_FRAME},
  {RD_NEWMV,     LAST_FRAME,   ALTREF_FRAME},
  {RD_NEARMV,    GOLDEN_FRAME, NONE},
  {RD_NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
  {RD_NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},

  {RD_SPLITMV,   LAST_FRAME,   NONE},
  {RD_SPLITMV,   GOLDEN_FRAME, NONE},
  {RD_SPLITMV,   ALTREF_FRAME, NONE},
  {RD_SPLITMV,   LAST_FRAME,   ALTREF_FRAME},
  {RD_SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME},

  {RD_ZEROMV,    LAST_FRAME,   NONE},
  {RD_ZEROMV,    GOLDEN_FRAME, NONE},
  {RD_ZEROMV,    ALTREF_FRAME, NONE},
  {RD_ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
  {RD_ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},

  {RD_I4X4_PRED, INTRA_FRAME,  NONE},
  {RD_H_PRED,    INTRA_FRAME,  NONE},
  {RD_V_PRED,    INTRA_FRAME,  NONE},
  {RD_D135_PRED, INTRA_FRAME,  NONE},
  {RD_D207_PRED, INTRA_FRAME,  NONE},
  {RD_D153_PRED, INTRA_FRAME,  NONE},
  {RD_D63_PRED,  INTRA_FRAME,  NONE},
  {RD_D117_PRED, INTRA_FRAME,  NONE},
  {RD_D45_PRED,  INTRA_FRAME,  NONE},
};
// The baseline rd thresholds for breaking out of the rd loop for
// certain modes are assumed to be based on 8x8 blocks.
// This table is used to correct for block size.
// The factors here are << 2 (2 = x0.5, 32 = x8, etc.).
static int rd_thresh_block_size_factor[BLOCK_SIZES] =
  {2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32};

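/* Illustrative sketch (not part of the encoder): how the factors above get
 * applied. The helper name is hypothetical; the arithmetic mirrors one of
 * the threshold computations in vp9_initialize_rd_consts() below, where the
 * /4 undoes the <<2 scaling of the table, so BLOCK_8X8 (factor 4) gives
 * x1.0 and BLOCK_64X64 (factor 32) gives x8. */
#if 0
static int example_scaled_thresh(int thresh_mult, int q, BLOCK_SIZE bsize) {
  return thresh_mult * q * rd_thresh_block_size_factor[bsize] / 4;
}
#endif
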
#define MAX_RD_THRESH_FACT 64
#define RD_THRESH_INC 1

static void fill_token_costs(vp9_coeff_cost *c,
                             vp9_coeff_probs_model (*p)[BLOCK_TYPES]) {
  int i, j, k, l;
  TX_SIZE t;
  for (t = TX_4X4; t <= TX_32X32; t++)
    for (i = 0; i < BLOCK_TYPES; i++)
      for (j = 0; j < REF_TYPES; j++)
        for (k = 0; k < COEF_BANDS; k++)
          for (l = 0; l < PREV_COEF_CONTEXTS; l++) {
            vp9_prob probs[ENTROPY_NODES];
            vp9_model_to_full_probs(p[t][i][j][k][l], probs);
            vp9_cost_tokens((int *)c[t][i][j][k][0][l], probs,
                            vp9_coef_tree);
            vp9_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
                                 vp9_coef_tree);
            assert(c[t][i][j][k][0][l][DCT_EOB_TOKEN] ==
                   c[t][i][j][k][1][l][DCT_EOB_TOKEN]);
          }
}

static const int rd_iifactor[32] = {
  4, 4, 3, 2, 1, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
};

// 3* dc_qlookup[Q]*dc_qlookup[Q];

/* values are now correlated to quantizer */
static int sad_per_bit16lut[QINDEX_RANGE];
static int sad_per_bit4lut[QINDEX_RANGE];

void vp9_init_me_luts() {
  int i;

  // Initialize the sad lut tables using a formulaic calculation for now.
  // This is to make it easier to resolve the impact of experimental changes
  // to the quantizer tables.
  for (i = 0; i < QINDEX_RANGE; i++) {
    sad_per_bit16lut[i] =
      (int)((0.0418 * vp9_convert_qindex_to_q(i)) + 2.4107);
    sad_per_bit4lut[i] = (int)(0.063 * vp9_convert_qindex_to_q(i) + 2.742);
  }
}

static int compute_rd_mult(int qindex) {
  const int q = vp9_dc_quant(qindex, 0);
  return (11 * q * q) >> 2;
}
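
/* Worked example for compute_rd_mult() above (illustrative numbers): a dc
 * quantizer step of q = 32 gives (11 * 32 * 32) >> 2 = 2816, i.e. the
 * multiplier grows as roughly 2.75 * q^2. */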

static MB_PREDICTION_MODE rd_mode_to_mode(RD_PREDICTION_MODE rd_mode) {
  if (rd_mode == RD_SPLITMV || rd_mode == RD_I4X4_PRED) {
    assert(!"Invalid rd_mode");
    return MB_MODE_COUNT;
  }
  assert((int)rd_mode < (int)MB_MODE_COUNT);
  return (MB_PREDICTION_MODE)rd_mode;
}

void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
  cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
  cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
}


void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
  int q, i, bsize;

  vp9_clear_system_state();  // __asm emms;

  // Further tests required to see if optimum is different
  // for key frames, golden frames and arf frames.
  // if (cpi->common.refresh_golden_frame ||
  //     cpi->common.refresh_alt_ref_frame)
  qindex = clamp(qindex, 0, MAXQ);

  cpi->RDMULT = compute_rd_mult(qindex);
  if (cpi->pass == 2 && (cpi->common.frame_type != KEY_FRAME)) {
    if (cpi->twopass.next_iiratio > 31)
      cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) >> 4;
    else
      cpi->RDMULT +=
          (cpi->RDMULT * rd_iifactor[cpi->twopass.next_iiratio]) >> 4;
  }
  cpi->mb.errorperbit = cpi->RDMULT >> 6;
  cpi->mb.errorperbit += (cpi->mb.errorperbit == 0);

  vp9_set_speed_features(cpi);

  q = (int)pow(vp9_dc_quant(qindex, 0) >> 2, 1.25);
  q <<= 2;
  if (q < 8)
    q = 8;

  if (cpi->RDMULT > 1000) {
    cpi->RDDIV = 1;
    cpi->RDMULT /= 100;

    for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
      for (i = 0; i < MAX_MODES; ++i) {
        // Thresholds here seem unnecessarily harsh but are fine given the
        // actual range of values used for cpi->sf.thresh_mult[]
        int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);

        // *4 relates to the scaling of rd_thresh_block_size_factor[]
        if ((int64_t)cpi->sf.thresh_mult[i] < thresh_max) {
          cpi->rd_threshes[bsize][i] =
            cpi->sf.thresh_mult[i] * q *
            rd_thresh_block_size_factor[bsize] / (4 * 100);
        } else {
          cpi->rd_threshes[bsize][i] = INT_MAX;
        }
      }
    }
  } else {
    cpi->RDDIV = 100;

    for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) {
      for (i = 0; i < MAX_MODES; i++) {
        // Thresholds here seem unnecessarily harsh but are fine given the
        // actual range of values used for cpi->sf.thresh_mult[]
        int thresh_max = INT_MAX / (q * rd_thresh_block_size_factor[bsize]);

        if (cpi->sf.thresh_mult[i] < thresh_max) {
          cpi->rd_threshes[bsize][i] =
            cpi->sf.thresh_mult[i] * q *
            rd_thresh_block_size_factor[bsize] / 4;
        } else {
          cpi->rd_threshes[bsize][i] = INT_MAX;
        }
      }
    }
  }

  fill_token_costs(cpi->mb.token_costs, cpi->common.fc.coef_probs);

  for (i = 0; i < NUM_PARTITION_CONTEXTS; i++)
    vp9_cost_tokens(cpi->mb.partition_cost[i],
                    cpi->common.fc.partition_prob[cpi->common.frame_type][i],
                    vp9_partition_tree);

  /* rough estimate for costing */
  vp9_init_mode_costs(cpi);

  if (cpi->common.frame_type != KEY_FRAME) {
    vp9_build_nmv_cost_table(
        cpi->mb.nmvjointcost,
        cpi->mb.e_mbd.allow_high_precision_mv ?
        cpi->mb.nmvcost_hp : cpi->mb.nmvcost,
        &cpi->common.fc.nmvc,
        cpi->mb.e_mbd.allow_high_precision_mv, 1, 1);

    for (i = 0; i < INTER_MODE_CONTEXTS; i++) {
      MB_PREDICTION_MODE m;

      for (m = NEARESTMV; m < MB_MODE_COUNT; m++)
        cpi->mb.inter_mode_cost[i][m - NEARESTMV] =
            cost_token(vp9_inter_mode_tree,
                       cpi->common.fc.inter_mode_probs[i],
                       vp9_inter_mode_encodings - NEARESTMV + m);
    }
  }
}

static INLINE void linear_interpolate2(double x, int ntab, int inv_step,
                                       const double *tab1, const double *tab2,
                                       double *v1, double *v2) {
  double y = x * inv_step;
  int d = (int) y;
  if (d >= ntab - 1) {
    *v1 = tab1[ntab - 1];
    *v2 = tab2[ntab - 1];
  } else {
    double a = y - d;
    *v1 = tab1[d] * (1 - a) + tab1[d + 1] * a;
    *v2 = tab2[d] * (1 - a) + tab2[d + 1] * a;
  }
}
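
/* Usage sketch for linear_interpolate2() above (hypothetical tables): both
 * tables share one sampling grid with inv_step entries per unit of x. */
#if 0
static void example_interp(void) {
  static const double t1[4] = { 0.0, 1.0, 2.0, 3.0 };
  static const double t2[4] = { 3.0, 2.0, 1.0, 0.0 };
  double v1, v2;
  /* x = 0.75 with inv_step = 2 gives y = 1.5, so each result is the
   * midpoint of entries 1 and 2: v1 = 1.5, v2 = 1.5. */
  linear_interpolate2(0.75, 4, 2, t1, t2, &v1, &v2);
}
#endif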

static void model_rd_norm(double x, double *R, double *D) {
  static const int inv_tab_step = 8;
  static const int tab_size = 120;
  // NOTE: The tables below must be of the same size
  //
  // Normalized rate
  // This table models the rate for a Laplacian source with given variance
  // when quantized with a uniform quantizer with given stepsize.
  // The closed form expression is:
  // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
  // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance),
  // and H(x) is the binary entropy function.
  static const double rate_tab[] = {
    64.00, 4.944, 3.949, 3.372, 2.966, 2.655, 2.403, 2.194,
    2.014, 1.858, 1.720, 1.596, 1.485, 1.384, 1.291, 1.206,
    1.127, 1.054, 0.986, 0.923, 0.863, 0.808, 0.756, 0.708,
    0.662, 0.619, 0.579, 0.541, 0.506, 0.473, 0.442, 0.412,
    0.385, 0.359, 0.335, 0.313, 0.291, 0.272, 0.253, 0.236,
    0.220, 0.204, 0.190, 0.177, 0.165, 0.153, 0.142, 0.132,
    0.123, 0.114, 0.106, 0.099, 0.091, 0.085, 0.079, 0.073,
    0.068, 0.063, 0.058, 0.054, 0.050, 0.047, 0.043, 0.040,
    0.037, 0.034, 0.032, 0.029, 0.027, 0.025, 0.023, 0.022,
    0.020, 0.019, 0.017, 0.016, 0.015, 0.014, 0.013, 0.012,
    0.011, 0.010, 0.009, 0.008, 0.008, 0.007, 0.007, 0.006,
    0.006, 0.005, 0.005, 0.005, 0.004, 0.004, 0.004, 0.003,
    0.003, 0.003, 0.003, 0.002, 0.002, 0.002, 0.002, 0.002,
    0.002, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001,
    0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.001, 0.000,
  };
  // Normalized distortion
  // This table models the normalized distortion for a Laplacian source
  // with given variance when quantized with a uniform quantizer with
  // given stepsize. The closed form expression is:
  // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
  // where x = qpstep / sqrt(variance)
  // Note the actual distortion is Dn * variance.
  static const double dist_tab[] = {
    0.000, 0.001, 0.005, 0.012, 0.021, 0.032, 0.045, 0.061,
    0.079, 0.098, 0.119, 0.142, 0.166, 0.190, 0.216, 0.242,
    0.269, 0.296, 0.324, 0.351, 0.378, 0.405, 0.432, 0.458,
    0.484, 0.509, 0.534, 0.557, 0.580, 0.603, 0.624, 0.645,
    0.664, 0.683, 0.702, 0.719, 0.735, 0.751, 0.766, 0.780,
    0.794, 0.807, 0.819, 0.830, 0.841, 0.851, 0.861, 0.870,
    0.878, 0.886, 0.894, 0.901, 0.907, 0.913, 0.919, 0.925,
    0.930, 0.935, 0.939, 0.943, 0.947, 0.951, 0.954, 0.957,
    0.960, 0.963, 0.966, 0.968, 0.971, 0.973, 0.975, 0.976,
    0.978, 0.980, 0.981, 0.982, 0.984, 0.985, 0.986, 0.987,
    0.988, 0.989, 0.990, 0.990, 0.991, 0.992, 0.992, 0.993,
    0.993, 0.994, 0.994, 0.995, 0.995, 0.996, 0.996, 0.996,
    0.996, 0.997, 0.997, 0.997, 0.997, 0.998, 0.998, 0.998,
    0.998, 0.998, 0.998, 0.999, 0.999, 0.999, 0.999, 0.999,
    0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 1.000,
  };
  /*
  assert(sizeof(rate_tab) == tab_size * sizeof(rate_tab[0]));
  assert(sizeof(dist_tab) == tab_size * sizeof(dist_tab[0]));
  assert(sizeof(rate_tab) == sizeof(dist_tab));
  */
  assert(x >= 0.0);
  linear_interpolate2(x, tab_size, inv_tab_step,
                      rate_tab, dist_tab, R, D);
}
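
/* Sketch (illustrative, not used by the encoder) of how the tables above
 * follow from the closed-form expressions in the comments. The helper names
 * are hypothetical; exp(), sqrt(), sinh() and log2() come from <math.h>,
 * already included. The x = 0 entries are pinned by hand since the rate
 * expression diverges there (rate_tab[0] = 64.00 is a cap). With
 * step = 1.0 / inv_tab_step this reproduces e.g. rate_tab[1] = 4.944. */
#if 0
static double binary_entropy(double p) {
  return (p <= 0.0 || p >= 1.0) ? 0.0
                                : -p * log2(p) - (1.0 - p) * log2(1.0 - p);
}

static void example_regen_tables(double *rate, double *dist, int n,
                                 double step) {
  int i;
  for (i = 1; i < n; ++i) {
    const double x = i * step;              /* qpstep / sqrt(variance) */
    const double r = exp(-sqrt(2.0) * x);
    rate[i] = binary_entropy(sqrt(r)) +
              sqrt(r) * (1.0 + binary_entropy(r) / (1.0 - r));
    dist[i] = 1.0 - (x / sqrt(2.0)) / sinh(x / sqrt(2.0));
  }
}
#endif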

static void model_rd_from_var_lapndz(int var, int n, int qstep,
                                     int *rate, int64_t *dist) {
  // This function models the rate and distortion for a Laplacian
  // source with given variance when quantized with a uniform quantizer
  // with given stepsize. The closed form expressions are in:
  // Hang and Chen, "Source Model for transform video coder and its
  // application - Part I: Fundamental Theory", IEEE Trans. Circ.
  // Sys. for Video Tech., April 1997.
  vp9_clear_system_state();
  if (var == 0 || n == 0) {
    *rate = 0;
    *dist = 0;
  } else {
    double D, R;
    double s2 = (double) var / n;
    double x = qstep / sqrt(s2);
    model_rd_norm(x, &R, &D);
    *rate = ((n << 8) * R + 0.5);
    *dist = (var * D + 0.5);
  }
  vp9_clear_system_state();
}
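
/* Usage sketch for model_rd_from_var_lapndz() above (hypothetical numbers):
 * 64 pixels with sse = 6400 give s2 = 100, so a quantizer step of 20 maps
 * to x = 20 / sqrt(100) = 2.0 in the normalized tables. Per the code above,
 * *rate comes back as (n << 8) * Rn and *dist as var * Dn. */
#if 0
static void example_model_rd(void) {
  int rate;
  int64_t dist;
  model_rd_from_var_lapndz(6400, 64, 20, &rate, &dist);
}
#endif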

static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
                            MACROBLOCK *x, MACROBLOCKD *xd,
                            int *out_rate_sum, int64_t *out_dist_sum) {
  // Note our transform coeffs are 8 times an orthogonal transform.
  // Hence the quantizer step is also 8 times. To get the effective
  // quantizer we need to divide by 8 before sending to the modeling function.
  int i, rate_sum = 0;
  int64_t dist_sum = 0;  // 64-bit to match the per-plane int64_t distortions

  for (i = 0; i < MAX_MB_PLANE; ++i) {
    struct macroblock_plane *const p = &x->plane[i];
    struct macroblockd_plane *const pd = &xd->plane[i];
    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
    unsigned int sse;
    int rate;
    int64_t dist;
    (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
                              pd->dst.buf, pd->dst.stride, &sse);
    // sse works better than var, since there is no dc prediction used
    model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs],
                             pd->dequant[1] >> 3, &rate, &dist);

    rate_sum += rate;
    dist_sum += dist;
  }

  *out_rate_sum = rate_sum;
  *out_dist_sum = dist_sum << 4;
}

static void model_rd_for_sb_y_tx(VP9_COMP *cpi, BLOCK_SIZE bsize,
                                 TX_SIZE tx_size,
                                 MACROBLOCK *x, MACROBLOCKD *xd,
                                 int *out_rate_sum, int64_t *out_dist_sum,
                                 int *out_skip) {
  int j, k;
  BLOCK_SIZE bs;
  struct macroblock_plane *const p = &x->plane[0];
  struct macroblockd_plane *const pd = &xd->plane[0];
  const int width = 4 << num_4x4_blocks_wide_lookup[bsize];
  const int height = 4 << num_4x4_blocks_high_lookup[bsize];
  int rate_sum = 0;
  int64_t dist_sum = 0;
  const int t = 4 << tx_size;

  if (tx_size == TX_4X4) {
    bs = BLOCK_4X4;
  } else if (tx_size == TX_8X8) {
    bs = BLOCK_8X8;
  } else if (tx_size == TX_16X16) {
    bs = BLOCK_16X16;
  } else if (tx_size == TX_32X32) {
    bs = BLOCK_32X32;
  } else {
    assert(0);
  }

  *out_skip = 1;
  for (j = 0; j < height; j += t) {
    for (k = 0; k < width; k += t) {
      int rate;
      int64_t dist;
      unsigned int sse;
      cpi->fn_ptr[bs].vf(&p->src.buf[j * p->src.stride + k], p->src.stride,
                         &pd->dst.buf[j * pd->dst.stride + k], pd->dst.stride,
                         &sse);
      // sse works better than var, since there is no dc prediction used
      model_rd_from_var_lapndz(sse, t * t, pd->dequant[1] >> 3, &rate, &dist);
      rate_sum += rate;
      dist_sum += dist;
      *out_skip &= (rate < 1024);
    }
  }

  *out_rate_sum = rate_sum;
  *out_dist_sum = dist_sum << 4;
}

int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
                          intptr_t block_size, int64_t *ssz) {
  int i;
  int64_t error = 0, sqcoeff = 0;

  for (i = 0; i < block_size; i++) {
    int this_diff = coeff[i] - dqcoeff[i];
    error += (unsigned)this_diff * this_diff;
    sqcoeff += (unsigned) coeff[i] * coeff[i];
  }

  *ssz = sqcoeff;
  return error;
}
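
/* Worked example for vp9_block_error_c() above (illustrative): with
 * coeff = {4, -3} and dqcoeff = {4, -2}, the return value is
 * 0*0 + (-1)*(-1) = 1 and *ssz = 4*4 + (-3)*(-3) = 25. */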

/* The trailing '0' is a terminator which is used inside cost_coeffs() to
 * decide whether to include the cost of a trailing EOB node or not (i.e. we
 * can skip this if the last coefficient in this transform block, e.g. the
 * 16th coefficient in a 4x4 block or the 64th coefficient in an 8x8 block,
 * is non-zero). */
static const int16_t band_counts[TX_SIZES][8] = {
  { 1, 2, 3, 4,  3,   16 - 13, 0 },
  { 1, 2, 3, 4, 11,   64 - 21, 0 },
  { 1, 2, 3, 4, 11,  256 - 21, 0 },
  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
};
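
/* Sketch (illustrative only) of how cost_coeffs() below consumes a row of
 * band_counts[]: entry [0] is the lone dc band, costed separately, and the
 * trailing 0 makes band_left end at 0 exactly when eob covers the whole
 * block, which is how the trailing EOB cost gets skipped. */
#if 0
static int example_band_walk(TX_SIZE tx_size, int eob) {
  const int16_t *band_count = &band_counts[tx_size][1];
  int band_left = *band_count++;
  int c;
  for (c = 1; c < eob; c++)
    if (!--band_left)
      band_left = *band_count++;
  return band_left != 0;  /* nonzero -> an explicit EOB token is costed */
}
#endif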

static INLINE int cost_coeffs(MACROBLOCK *mb,
                              int plane, int block,
                              ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
                              TX_SIZE tx_size,
                              const int16_t *scan, const int16_t *nb) {
  MACROBLOCKD *const xd = &mb->e_mbd;
  MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
  struct macroblockd_plane *pd = &xd->plane[plane];
  const PLANE_TYPE type = pd->plane_type;
  const int16_t *band_count = &band_counts[tx_size][1];
  const int eob = pd->eobs[block];
  const int16_t *const qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
  const int ref = mbmi->ref_frame[0] != INTRA_FRAME;
  unsigned int (*token_costs)[2][PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] =
                   mb->token_costs[tx_size][type][ref];
  const ENTROPY_CONTEXT above_ec = !!*A, left_ec = !!*L;
  uint8_t token_cache[1024];
  int pt = combine_entropy_contexts(above_ec, left_ec);
  int c, cost;

  // Check for consistency of tx_size with mode info
  assert(type == PLANE_TYPE_Y_WITH_DC ? mbmi->tx_size == tx_size
                                      : get_uv_tx_size(mbmi) == tx_size);

  if (eob == 0) {
    // single eob token
    cost = token_costs[0][0][pt][DCT_EOB_TOKEN];
    c = 0;
  } else {
    int band_left = *band_count++;

    // dc token
    int v = qcoeff_ptr[0];
    int prev_t = vp9_dct_value_tokens_ptr[v].token;
    cost = (*token_costs)[0][pt][prev_t] + vp9_dct_value_cost_ptr[v];
    token_cache[0] = vp9_pt_energy_class[prev_t];
    ++token_costs;

    // ac tokens
    for (c = 1; c < eob; c++) {
      const int rc = scan[c];
      int t;

      v = qcoeff_ptr[rc];
      t = vp9_dct_value_tokens_ptr[v].token;
      pt = get_coef_context(nb, token_cache, c);
      cost += (*token_costs)[!prev_t][pt][t] + vp9_dct_value_cost_ptr[v];
      token_cache[rc] = vp9_pt_energy_class[t];
      prev_t = t;
      if (!--band_left) {
        band_left = *band_count++;
        ++token_costs;
      }
    }

    // eob token
    if (band_left) {
      pt = get_coef_context(nb, token_cache, c);
      cost += (*token_costs)[0][pt][DCT_EOB_TOKEN];
    }
  }

  // Set the contexts to whether this block has coded coefficients,
  // i.e. whether eob is past the first coefficient.
  *A = *L = (c > 0);
    546 
    547   return cost;
    548 }
    549 
    550 struct rdcost_block_args {
    551   MACROBLOCK *x;
    552   ENTROPY_CONTEXT t_above[16];
    553   ENTROPY_CONTEXT t_left[16];
    554   TX_SIZE tx_size;
    555   int bw;
    556   int bh;
    557   int rate;
    558   int64_t dist;
    559   int64_t sse;
    560   int64_t best_rd;
    561   int skip;
    562   const int16_t *scan, *nb;
    563 };
    564 
    565 static void dist_block(int plane, int block, TX_SIZE tx_size, void *arg) {
    566   const int ss_txfrm_size = tx_size << 1;
    567   struct rdcost_block_args* args = arg;
    568   MACROBLOCK* const x = args->x;
    569   MACROBLOCKD* const xd = &x->e_mbd;
    570   struct macroblock_plane *const p = &x->plane[plane];
    571   struct macroblockd_plane *const pd = &xd->plane[plane];
    572   int64_t this_sse;
    573   int shift = args->tx_size == TX_32X32 ? 0 : 2;
    574   int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
    575   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    576   args->dist += vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
    577                                 &this_sse) >> shift;
    578   args->sse += this_sse >> shift;
    579 
    580   if (x->skip_encode &&
    581       xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME) {
    582     // TODO(jingning): tune the model to better capture the distortion.
    583     int64_t p = (pd->dequant[1] * pd->dequant[1] *
    584                     (1 << ss_txfrm_size)) >> shift;
    585     args->dist += p;
    586     args->sse  += p;
    587   }
    588 }
    589 
    590 static void rate_block(int plane, int block, BLOCK_SIZE plane_bsize,
    591                        TX_SIZE tx_size, void *arg) {
    592   struct rdcost_block_args* args = arg;
    593 
    594   int x_idx, y_idx;
    595   txfrm_block_to_raster_xy(plane_bsize, args->tx_size, block, &x_idx, &y_idx);
    596 
    597   args->rate += cost_coeffs(args->x, plane, block,
    598                             args->t_above + x_idx,
    599                             args->t_left + y_idx, args->tx_size,
    600                             args->scan, args->nb);
    601 }
    602 
    603 static void block_yrd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
    604                            TX_SIZE tx_size, void *arg) {
    605   struct rdcost_block_args *args = arg;
    606   MACROBLOCK *const x = args->x;
    607   MACROBLOCKD *const xd = &x->e_mbd;
    608   struct encode_b_args encode_args = {x, NULL};
    609   int64_t rd1, rd2, rd;
    610 
    611   if (args->skip)
    612     return;
    613   rd1 = RDCOST(x->rdmult, x->rddiv, args->rate, args->dist);
    614   rd2 = RDCOST(x->rdmult, x->rddiv, 0, args->sse);
    615   rd = MIN(rd1, rd2);
    616   if (rd > args->best_rd) {
    617     args->skip = 1;
    618     args->rate = INT_MAX;
    619     args->dist = INT64_MAX;
    620     args->sse  = INT64_MAX;
    621     return;
    622   }
    623 
    624   if (!is_inter_block(&xd->this_mi->mbmi))
    625     vp9_encode_block_intra(plane, block, plane_bsize, tx_size, &encode_args);
    626   else
    627     vp9_xform_quant(plane, block, plane_bsize, tx_size, &encode_args);
    628 
    629   dist_block(plane, block, tx_size, args);
    630   rate_block(plane, block, plane_bsize, tx_size, args);
    631 }
    632 
    633 static void txfm_rd_in_plane(MACROBLOCK *x,
    634                              int *rate, int64_t *distortion,
    635                              int *skippable, int64_t *sse,
    636                              int64_t ref_best_rd, int plane,
    637                              BLOCK_SIZE bsize, TX_SIZE tx_size) {
    638   MACROBLOCKD *const xd = &x->e_mbd;
    639   struct macroblockd_plane *const pd = &xd->plane[plane];
    640   const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
    641   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bs];
    642   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bs];
    643   int i;
    644   struct rdcost_block_args args = { x, { 0 }, { 0 }, tx_size,
    645                                     num_4x4_blocks_wide, num_4x4_blocks_high,
    646                                     0, 0, 0, ref_best_rd, 0 };
    647   if (plane == 0)
    648     xd->this_mi->mbmi.tx_size = tx_size;
    649 
    650   switch (tx_size) {
    651     case TX_4X4:
    652       vpx_memcpy(&args.t_above, pd->above_context,
    653                  sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide);
    654       vpx_memcpy(&args.t_left, pd->left_context,
    655                  sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high);
    656       get_scan_nb_4x4(get_tx_type_4x4(pd->plane_type, xd, 0),
    657                       &args.scan, &args.nb);
    658       break;
    659     case TX_8X8:
    660       for (i = 0; i < num_4x4_blocks_wide; i += 2)
    661         args.t_above[i] = !!*(uint16_t *)&pd->above_context[i];
    662       for (i = 0; i < num_4x4_blocks_high; i += 2)
    663         args.t_left[i] = !!*(uint16_t *)&pd->left_context[i];
    664       get_scan_nb_8x8(get_tx_type_8x8(pd->plane_type, xd),
    665                       &args.scan, &args.nb);
    666       break;
    667     case TX_16X16:
    668       for (i = 0; i < num_4x4_blocks_wide; i += 4)
    669         args.t_above[i] = !!*(uint32_t *)&pd->above_context[i];
    670       for (i = 0; i < num_4x4_blocks_high; i += 4)
    671         args.t_left[i] = !!*(uint32_t *)&pd->left_context[i];
    672       get_scan_nb_16x16(get_tx_type_16x16(pd->plane_type, xd),
    673                         &args.scan, &args.nb);
    674       break;
    675     case TX_32X32:
    676       for (i = 0; i < num_4x4_blocks_wide; i += 8)
    677         args.t_above[i] = !!*(uint64_t *)&pd->above_context[i];
    678       for (i = 0; i < num_4x4_blocks_high; i += 8)
    679         args.t_left[i] = !!*(uint64_t *)&pd->left_context[i];
    680       args.scan = vp9_default_scan_32x32;
    681       args.nb = vp9_default_scan_32x32_neighbors;
    682       break;
    683     default:
    684       assert(0);
    685   }
    686 
    687   foreach_transformed_block_in_plane(xd, bsize, plane, block_yrd_txfm, &args);
    688   *distortion = args.dist;
    689   *rate       = args.rate;
    690   *sse        = args.sse;
    691   *skippable  = vp9_is_skippable_in_plane(xd, bsize, plane) && (!args.skip);
    692 }
    693 
    694 static void choose_largest_txfm_size(VP9_COMP *cpi, MACROBLOCK *x,
    695                                      int *rate, int64_t *distortion,
    696                                      int *skip, int64_t *sse,
    697                                      int64_t ref_best_rd,
    698                                      BLOCK_SIZE bs) {
    699   const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
    700   VP9_COMMON *const cm = &cpi->common;
    701   MACROBLOCKD *const xd = &x->e_mbd;
    702   MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
    703   if (max_txfm_size == TX_32X32 &&
    704       (cm->tx_mode == ALLOW_32X32 ||
    705        cm->tx_mode == TX_MODE_SELECT)) {
    706     mbmi->tx_size = TX_32X32;
    707   } else if (max_txfm_size >= TX_16X16 &&
    708              (cm->tx_mode == ALLOW_16X16 ||
    709               cm->tx_mode == ALLOW_32X32 ||
    710               cm->tx_mode == TX_MODE_SELECT)) {
    711     mbmi->tx_size = TX_16X16;
    712   } else if (cm->tx_mode != ONLY_4X4) {
    713     mbmi->tx_size = TX_8X8;
    714   } else {
    715     mbmi->tx_size = TX_4X4;
    716   }
    717   txfm_rd_in_plane(x, rate, distortion, skip,
    718                    &sse[mbmi->tx_size], ref_best_rd, 0, bs,
    719                    mbmi->tx_size);
    720   cpi->txfm_stepdown_count[0]++;
    721 }
    722 
    723 static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
    724                                      int (*r)[2], int *rate,
    725                                      int64_t *d, int64_t *distortion,
    726                                      int *s, int *skip,
    727                                      int64_t tx_cache[TX_MODES],
    728                                      BLOCK_SIZE bs) {
    729   const TX_SIZE max_tx_size = max_txsize_lookup[bs];
    730   VP9_COMMON *const cm = &cpi->common;
    731   MACROBLOCKD *const xd = &x->e_mbd;
    732   MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
    733   vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
    734   int64_t rd[TX_SIZES][2];
    735   int n, m;
    736   int s0, s1;
    737 
    738   const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs, xd->this_mi);
    739 
    740   for (n = TX_4X4; n <= max_tx_size; n++) {
    741     r[n][1] = r[n][0];
    742     if (r[n][0] == INT_MAX)
    743       continue;
    744     for (m = 0; m <= n - (n == max_tx_size); m++) {
    745       if (m == n)
    746         r[n][1] += vp9_cost_zero(tx_probs[m]);
    747       else
    748         r[n][1] += vp9_cost_one(tx_probs[m]);
    749     }
    750   }
    751 
    752   assert(skip_prob > 0);
    753   s0 = vp9_cost_bit(skip_prob, 0);
    754   s1 = vp9_cost_bit(skip_prob, 1);
    755 
    756   for (n = TX_4X4; n <= max_tx_size; n++) {
    757     if (d[n] == INT64_MAX) {
    758       rd[n][0] = rd[n][1] = INT64_MAX;
    759       continue;
    760     }
    761     if (s[n]) {
    762       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
    763     } else {
    764       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
    765       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
    766     }
    767   }
    768 
    769   if (max_tx_size == TX_32X32 &&
    770       (cm->tx_mode == ALLOW_32X32 ||
    771        (cm->tx_mode == TX_MODE_SELECT &&
    772         rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
    773         rd[TX_32X32][1] < rd[TX_4X4][1]))) {
    774     mbmi->tx_size = TX_32X32;
    775   } else if (max_tx_size >= TX_16X16 &&
    776              (cm->tx_mode == ALLOW_16X16 ||
    777               cm->tx_mode == ALLOW_32X32 ||
    778               (cm->tx_mode == TX_MODE_SELECT &&
    779                rd[TX_16X16][1] < rd[TX_8X8][1] &&
    780                rd[TX_16X16][1] < rd[TX_4X4][1]))) {
    781     mbmi->tx_size = TX_16X16;
    782   } else if (cm->tx_mode == ALLOW_8X8 ||
    783              cm->tx_mode == ALLOW_16X16 ||
    784              cm->tx_mode == ALLOW_32X32 ||
    785            (cm->tx_mode == TX_MODE_SELECT && rd[TX_8X8][1] < rd[TX_4X4][1])) {
    786     mbmi->tx_size = TX_8X8;
    787   } else {
    788     mbmi->tx_size = TX_4X4;
    789   }
    790 
    791   *distortion = d[mbmi->tx_size];
    792   *rate       = r[mbmi->tx_size][cm->tx_mode == TX_MODE_SELECT];
    793   *skip       = s[mbmi->tx_size];
    794 
    795   tx_cache[ONLY_4X4] = rd[TX_4X4][0];
    796   tx_cache[ALLOW_8X8] = rd[TX_8X8][0];
    797   tx_cache[ALLOW_16X16] = rd[MIN(max_tx_size, TX_16X16)][0];
    798   tx_cache[ALLOW_32X32] = rd[MIN(max_tx_size, TX_32X32)][0];
    799   if (max_tx_size == TX_32X32 &&
    800       rd[TX_32X32][1] < rd[TX_16X16][1] && rd[TX_32X32][1] < rd[TX_8X8][1] &&
    801       rd[TX_32X32][1] < rd[TX_4X4][1])
    802     tx_cache[TX_MODE_SELECT] = rd[TX_32X32][1];
    803   else if (max_tx_size >= TX_16X16 &&
    804            rd[TX_16X16][1] < rd[TX_8X8][1] && rd[TX_16X16][1] < rd[TX_4X4][1])
    805     tx_cache[TX_MODE_SELECT] = rd[TX_16X16][1];
    806   else
    807     tx_cache[TX_MODE_SELECT] = rd[TX_4X4][1] < rd[TX_8X8][1] ?
    808                                  rd[TX_4X4][1] : rd[TX_8X8][1];
    809 
    810   if (max_tx_size == TX_32X32 &&
    811       rd[TX_32X32][1] < rd[TX_16X16][1] &&
    812       rd[TX_32X32][1] < rd[TX_8X8][1] &&
    813       rd[TX_32X32][1] < rd[TX_4X4][1]) {
    814     cpi->txfm_stepdown_count[0]++;
    815   } else if (max_tx_size >= TX_16X16 &&
    816              rd[TX_16X16][1] < rd[TX_8X8][1] &&
    817              rd[TX_16X16][1] < rd[TX_4X4][1]) {
    818     cpi->txfm_stepdown_count[max_tx_size - TX_16X16]++;
    819   } else if (rd[TX_8X8][1] < rd[TX_4X4][1]) {
    820     cpi->txfm_stepdown_count[max_tx_size - TX_8X8]++;
    821   } else {
    822     cpi->txfm_stepdown_count[max_tx_size - TX_4X4]++;
    823   }
    824 }
    825 
    826 static void choose_txfm_size_from_modelrd(VP9_COMP *cpi, MACROBLOCK *x,
    827                                           int (*r)[2], int *rate,
    828                                           int64_t *d, int64_t *distortion,
    829                                           int *s, int *skip, int64_t *sse,
    830                                           int64_t ref_best_rd,
    831                                           BLOCK_SIZE bs) {
    832   const TX_SIZE max_txfm_size = max_txsize_lookup[bs];
    833   VP9_COMMON *const cm = &cpi->common;
    834   MACROBLOCKD *const xd = &x->e_mbd;
    835   MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
    836   vp9_prob skip_prob = vp9_get_pred_prob_mbskip(cm, xd);
    837   int64_t rd[TX_SIZES][2];
    838   int n, m;
    839   int s0, s1;
    840   double scale_rd[TX_SIZES] = {1.73, 1.44, 1.20, 1.00};
    841   // double scale_r[TX_SIZES] = {2.82, 2.00, 1.41, 1.00};
    842 
    843   const vp9_prob *tx_probs = get_tx_probs2(xd, &cm->fc.tx_probs,  xd->this_mi);
    844 
    845   // for (n = TX_4X4; n <= max_txfm_size; n++)
    846   //   r[n][0] = (r[n][0] * scale_r[n]);
    847 
    848   for (n = TX_4X4; n <= max_txfm_size; n++) {
    849     r[n][1] = r[n][0];
    850     for (m = 0; m <= n - (n == max_txfm_size); m++) {
    851       if (m == n)
    852         r[n][1] += vp9_cost_zero(tx_probs[m]);
    853       else
    854         r[n][1] += vp9_cost_one(tx_probs[m]);
    855     }
    856   }
    857 
    858   assert(skip_prob > 0);
    859   s0 = vp9_cost_bit(skip_prob, 0);
    860   s1 = vp9_cost_bit(skip_prob, 1);
    861 
    862   for (n = TX_4X4; n <= max_txfm_size; n++) {
    863     if (s[n]) {
    864       rd[n][0] = rd[n][1] = RDCOST(x->rdmult, x->rddiv, s1, d[n]);
    865     } else {
    866       rd[n][0] = RDCOST(x->rdmult, x->rddiv, r[n][0] + s0, d[n]);
    867       rd[n][1] = RDCOST(x->rdmult, x->rddiv, r[n][1] + s0, d[n]);
    868     }
    869   }
    870   for (n = TX_4X4; n <= max_txfm_size; n++) {
    871     rd[n][0] = (scale_rd[n] * rd[n][0]);
    872     rd[n][1] = (scale_rd[n] * rd[n][1]);
    873   }
    874 
    875   if (max_txfm_size == TX_32X32 &&
    876       (cm->tx_mode == ALLOW_32X32 ||
    877        (cm->tx_mode == TX_MODE_SELECT &&
    878         rd[TX_32X32][1] <= rd[TX_16X16][1] &&
    879         rd[TX_32X32][1] <= rd[TX_8X8][1] &&
    880         rd[TX_32X32][1] <= rd[TX_4X4][1]))) {
    881     mbmi->tx_size = TX_32X32;
    882   } else if (max_txfm_size >= TX_16X16 &&
    883              (cm->tx_mode == ALLOW_16X16 ||
    884               cm->tx_mode == ALLOW_32X32 ||
    885               (cm->tx_mode == TX_MODE_SELECT &&
    886                rd[TX_16X16][1] <= rd[TX_8X8][1] &&
    887                rd[TX_16X16][1] <= rd[TX_4X4][1]))) {
    888     mbmi->tx_size = TX_16X16;
    889   } else if (cm->tx_mode == ALLOW_8X8 ||
    890              cm->tx_mode == ALLOW_16X16 ||
    891              cm->tx_mode == ALLOW_32X32 ||
    892            (cm->tx_mode == TX_MODE_SELECT &&
    893             rd[TX_8X8][1] <= rd[TX_4X4][1])) {
    894     mbmi->tx_size = TX_8X8;
    895   } else {
    896     mbmi->tx_size = TX_4X4;
    897   }
    898 
    899   // Actually encode using the chosen mode if a model was used, but do not
    900   // update the r, d costs
    901   txfm_rd_in_plane(x, rate, distortion, skip, &sse[mbmi->tx_size],
    902                    ref_best_rd, 0, bs, mbmi->tx_size);
    903 
    904   if (max_txfm_size == TX_32X32 &&
    905       rd[TX_32X32][1] <= rd[TX_16X16][1] &&
    906       rd[TX_32X32][1] <= rd[TX_8X8][1] &&
    907       rd[TX_32X32][1] <= rd[TX_4X4][1]) {
    908     cpi->txfm_stepdown_count[0]++;
    909   } else if (max_txfm_size >= TX_16X16 &&
    910              rd[TX_16X16][1] <= rd[TX_8X8][1] &&
    911              rd[TX_16X16][1] <= rd[TX_4X4][1]) {
    912     cpi->txfm_stepdown_count[max_txfm_size - TX_16X16]++;
    913   } else if (rd[TX_8X8][1] <= rd[TX_4X4][1]) {
    914     cpi->txfm_stepdown_count[max_txfm_size - TX_8X8]++;
    915   } else {
    916     cpi->txfm_stepdown_count[max_txfm_size - TX_4X4]++;
    917   }
    918 }
    919 
    920 static void super_block_yrd(VP9_COMP *cpi,
    921                             MACROBLOCK *x, int *rate, int64_t *distortion,
    922                             int *skip, int64_t *psse, BLOCK_SIZE bs,
    923                             int64_t txfm_cache[TX_MODES],
    924                             int64_t ref_best_rd) {
    925   int r[TX_SIZES][2], s[TX_SIZES];
    926   int64_t d[TX_SIZES], sse[TX_SIZES];
    927   MACROBLOCKD *xd = &x->e_mbd;
    928   MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
    929 
    930   assert(bs == mbmi->sb_type);
    931   if (mbmi->ref_frame[0] > INTRA_FRAME)
    932     vp9_subtract_sby(x, bs);
    933 
    934   if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
    935       (cpi->sf.tx_size_search_method != USE_FULL_RD &&
    936        mbmi->ref_frame[0] == INTRA_FRAME)) {
    937     vpx_memset(txfm_cache, 0, TX_MODES * sizeof(int64_t));
    938     choose_largest_txfm_size(cpi, x, rate, distortion, skip, sse,
    939                              ref_best_rd, bs);
    940     if (psse)
    941       *psse = sse[mbmi->tx_size];
    942     return;
    943   }
    944 
    945   if (cpi->sf.tx_size_search_method == USE_LARGESTINTRA_MODELINTER &&
    946       mbmi->ref_frame[0] > INTRA_FRAME) {
    947     if (bs >= BLOCK_32X32)
    948       model_rd_for_sb_y_tx(cpi, bs, TX_32X32, x, xd,
    949                            &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]);
    950     if (bs >= BLOCK_16X16)
    951       model_rd_for_sb_y_tx(cpi, bs, TX_16X16, x, xd,
    952                            &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]);
    953 
    954     model_rd_for_sb_y_tx(cpi, bs, TX_8X8, x, xd,
    955                          &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]);
    956 
    957     model_rd_for_sb_y_tx(cpi, bs, TX_4X4, x, xd,
    958                          &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]);
    959 
    960     choose_txfm_size_from_modelrd(cpi, x, r, rate, d, distortion, s,
    961                                   skip, sse, ref_best_rd, bs);
    962   } else {
    963     if (bs >= BLOCK_32X32)
    964       txfm_rd_in_plane(x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32],
    965                        &sse[TX_32X32], ref_best_rd, 0, bs, TX_32X32);
    966     if (bs >= BLOCK_16X16)
    967       txfm_rd_in_plane(x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16],
    968                        &sse[TX_16X16], ref_best_rd, 0, bs, TX_16X16);
    969     txfm_rd_in_plane(x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8],
    970                      &sse[TX_8X8], ref_best_rd, 0, bs, TX_8X8);
    971     txfm_rd_in_plane(x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4],
    972                      &sse[TX_4X4], ref_best_rd, 0, bs, TX_4X4);
    973     choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s,
    974                              skip, txfm_cache, bs);
    975   }
    976   if (psse)
    977     *psse = sse[mbmi->tx_size];
    978 }
    979 
    980 static int conditional_skipintra(MB_PREDICTION_MODE mode,
    981                                  MB_PREDICTION_MODE best_intra_mode) {
    982   if (mode == D117_PRED &&
    983       best_intra_mode != V_PRED &&
    984       best_intra_mode != D135_PRED)
    985     return 1;
    986   if (mode == D63_PRED &&
    987       best_intra_mode != V_PRED &&
    988       best_intra_mode != D45_PRED)
    989     return 1;
    990   if (mode == D207_PRED &&
    991       best_intra_mode != H_PRED &&
    992       best_intra_mode != D45_PRED)
    993     return 1;
    994   if (mode == D153_PRED &&
    995       best_intra_mode != H_PRED &&
    996       best_intra_mode != D135_PRED)
    997     return 1;
    998   return 0;
    999 }
   1000 
   1001 static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   1002                                      MB_PREDICTION_MODE *best_mode,
   1003                                      int *bmode_costs,
   1004                                      ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
   1005                                      int *bestrate, int *bestratey,
   1006                                      int64_t *bestdistortion,
   1007                                      BLOCK_SIZE bsize, int64_t rd_thresh) {
   1008   MB_PREDICTION_MODE mode;
   1009   MACROBLOCKD *xd = &x->e_mbd;
   1010   int64_t best_rd = rd_thresh;
   1011   int rate = 0;
   1012   int64_t distortion;
   1013   struct macroblock_plane *p = &x->plane[0];
   1014   struct macroblockd_plane *pd = &xd->plane[0];
   1015   const int src_stride = p->src.stride;
   1016   const int dst_stride = pd->dst.stride;
   1017   uint8_t *src_init = raster_block_offset_uint8(BLOCK_8X8, ib,
   1018                                                 p->src.buf, src_stride);
   1019   uint8_t *dst_init = raster_block_offset_uint8(BLOCK_8X8, ib,
   1020                                                 pd->dst.buf, dst_stride);
   1021   int16_t *src_diff, *coeff;
   1022 
   1023   ENTROPY_CONTEXT ta[2], tempa[2];
   1024   ENTROPY_CONTEXT tl[2], templ[2];
   1025   TX_TYPE tx_type = DCT_DCT;
   1026   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   1027   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   1028   int idx, idy, block;
   1029   uint8_t best_dst[8 * 8];
   1030 
   1031   assert(ib < 4);
   1032 
   1033   vpx_memcpy(ta, a, sizeof(ta));
   1034   vpx_memcpy(tl, l, sizeof(tl));
   1035   xd->this_mi->mbmi.tx_size = TX_4X4;
   1036 
   1037   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
   1038     int64_t this_rd;
   1039     int ratey = 0;
   1040 
   1041     if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
   1042       continue;
   1043 
   1044     // Only do the oblique modes if the best so far is
   1045     // one of the neighboring directional modes
   1046     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
   1047       if (conditional_skipintra(mode, *best_mode))
   1048           continue;
   1049     }
   1050 
   1051     rate = bmode_costs[mode];
   1052     distortion = 0;
   1053 
   1054     vpx_memcpy(tempa, ta, sizeof(ta));
   1055     vpx_memcpy(templ, tl, sizeof(tl));
   1056 
   1057     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
   1058       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
   1059         int64_t ssz;
   1060         const int16_t *scan;
   1061         uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
   1062         uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
   1063 
   1064         block = ib + idy * 2 + idx;
   1065         xd->this_mi->bmi[block].as_mode = mode;
   1066         src_diff = raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
   1067         coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
   1068         vp9_predict_intra_block(xd, block, 1,
   1069                                 TX_4X4, mode,
   1070                                 x->skip_encode ? src : dst,
   1071                                 x->skip_encode ? src_stride : dst_stride,
   1072                                 dst, dst_stride);
   1073         vp9_subtract_block(4, 4, src_diff, 8,
   1074                            src, src_stride,
   1075                            dst, dst_stride);
   1076 
   1077         tx_type = get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block);
   1078         if (tx_type != DCT_DCT) {
   1079           vp9_short_fht4x4(src_diff, coeff, 8, tx_type);
   1080           x->quantize_b_4x4(x, block, tx_type, 16);
   1081         } else {
   1082           x->fwd_txm4x4(src_diff, coeff, 16);
   1083           x->quantize_b_4x4(x, block, tx_type, 16);
   1084         }
   1085 
   1086         scan = get_scan_4x4(get_tx_type_4x4(PLANE_TYPE_Y_WITH_DC, xd, block));
   1087         ratey += cost_coeffs(x, 0, block,
   1088                              tempa + idx, templ + idy, TX_4X4, scan,
   1089                              vp9_get_coef_neighbors_handle(scan));
   1090         distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
   1091                                       16, &ssz) >> 2;
   1092         if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
   1093           goto next;
   1094 
   1095         if (tx_type != DCT_DCT)
   1096           vp9_short_iht4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
   1097                                dst, pd->dst.stride, tx_type);
   1098         else
   1099           xd->inv_txm4x4_add(BLOCK_OFFSET(pd->dqcoeff, block),
   1100                              dst, pd->dst.stride);
   1101       }
   1102     }
   1103 
   1104     rate += ratey;
   1105     this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
   1106 
   1107     if (this_rd < best_rd) {
   1108       *bestrate = rate;
   1109       *bestratey = ratey;
   1110       *bestdistortion = distortion;
   1111       best_rd = this_rd;
   1112       *best_mode = mode;
   1113       vpx_memcpy(a, tempa, sizeof(tempa));
   1114       vpx_memcpy(l, templ, sizeof(templ));
   1115       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
   1116         vpx_memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
   1117                    num_4x4_blocks_wide * 4);
   1118     }
   1119   next:
   1120     {}
   1121   }
   1122 
   1123   if (best_rd >= rd_thresh || x->skip_encode)
   1124     return best_rd;
   1125 
   1126   for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
   1127     vpx_memcpy(dst_init + idy * dst_stride, best_dst + idy * 8,
   1128                num_4x4_blocks_wide * 4);
   1129 
   1130   return best_rd;
   1131 }
   1132 
   1133 static int64_t rd_pick_intra_sub_8x8_y_mode(VP9_COMP * const cpi,
   1134                                             MACROBLOCK * const mb,
   1135                                             int * const rate,
   1136                                             int * const rate_y,
   1137                                             int64_t * const distortion,
   1138                                             int64_t best_rd) {
   1139   int i, j;
   1140   MACROBLOCKD *const xd = &mb->e_mbd;
   1141   MODE_INFO *const mic = xd->this_mi;
   1142   const MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
   1143   const MODE_INFO *left_mi = xd->mi_8x8[-1];
   1144   const BLOCK_SIZE bsize = xd->this_mi->mbmi.sb_type;
   1145   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   1146   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   1147   int idx, idy;
   1148   int cost = 0;
   1149   int64_t total_distortion = 0;
   1150   int tot_rate_y = 0;
   1151   int64_t total_rd = 0;
   1152   ENTROPY_CONTEXT t_above[4], t_left[4];
   1153   int *bmode_costs;
   1154 
   1155   vpx_memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
   1156   vpx_memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
   1157 
   1158   bmode_costs = mb->mbmode_cost;
   1159 
   1160   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   1161   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
   1162     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
   1163       MB_PREDICTION_MODE best_mode = DC_PRED;
   1164       int r = INT_MAX, ry = INT_MAX;
   1165       int64_t d = INT64_MAX, this_rd = INT64_MAX;
   1166       i = idy * 2 + idx;
   1167       if (cpi->common.frame_type == KEY_FRAME) {
   1168         const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, i);
   1169         const MB_PREDICTION_MODE L = (xd->left_available || idx) ?
   1170                                      left_block_mode(mic, left_mi, i) :
   1171                                      DC_PRED;
   1172 
   1173         bmode_costs  = mb->y_mode_costs[A][L];
   1174       }
   1175 
   1176       this_rd = rd_pick_intra4x4block(cpi, mb, i, &best_mode, bmode_costs,
   1177                                       t_above + idx, t_left + idy, &r, &ry, &d,
   1178                                       bsize, best_rd - total_rd);
   1179       if (this_rd >= best_rd - total_rd)
   1180         return INT64_MAX;
   1181 
   1182       total_rd += this_rd;
   1183       cost += r;
   1184       total_distortion += d;
   1185       tot_rate_y += ry;
   1186 
   1187       mic->bmi[i].as_mode = best_mode;
   1188       for (j = 1; j < num_4x4_blocks_high; ++j)
   1189         mic->bmi[i + j * 2].as_mode = best_mode;
   1190       for (j = 1; j < num_4x4_blocks_wide; ++j)
   1191         mic->bmi[i + j].as_mode = best_mode;
   1192 
   1193       if (total_rd >= best_rd)
   1194         return INT64_MAX;
   1195     }
   1196   }
   1197 
   1198   *rate = cost;
   1199   *rate_y = tot_rate_y;
   1200   *distortion = total_distortion;
   1201   mic->mbmi.mode = mic->bmi[3].as_mode;
   1202 
   1203   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
   1204 }
   1205 
   1206 static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
   1207                                       int *rate, int *rate_tokenonly,
   1208                                       int64_t *distortion, int *skippable,
   1209                                       BLOCK_SIZE bsize,
   1210                                       int64_t tx_cache[TX_MODES],
   1211                                       int64_t best_rd) {
   1212   MB_PREDICTION_MODE mode;
   1213   MB_PREDICTION_MODE mode_selected = DC_PRED;
   1214   MACROBLOCKD *const xd = &x->e_mbd;
   1215   MODE_INFO *const mic = xd->this_mi;
   1216   int this_rate, this_rate_tokenonly, s;
   1217   int64_t this_distortion, this_rd;
   1218   TX_SIZE best_tx = TX_4X4;
   1219   int i;
   1220   int *bmode_costs = x->mbmode_cost;
   1221 
   1222   if (cpi->sf.tx_size_search_method == USE_FULL_RD)
   1223     for (i = 0; i < TX_MODES; i++)
   1224       tx_cache[i] = INT64_MAX;
   1225 
   1226   /* Y Search for intra prediction mode */
   1227   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
   1228     int64_t local_tx_cache[TX_MODES];
   1229     MODE_INFO *above_mi = xd->mi_8x8[-xd->mode_info_stride];
   1230     MODE_INFO *left_mi = xd->mi_8x8[-1];
   1231 
   1232     if (!(cpi->sf.intra_y_mode_mask & (1 << mode)))
   1233       continue;
   1234 
   1235     if (cpi->common.frame_type == KEY_FRAME) {
   1236       const MB_PREDICTION_MODE A = above_block_mode(mic, above_mi, 0);
   1237       const MB_PREDICTION_MODE L = xd->left_available ?
   1238                                    left_block_mode(mic, left_mi, 0) : DC_PRED;
   1239 
   1240       bmode_costs = x->y_mode_costs[A][L];
   1241     }
   1242     mic->mbmi.mode = mode;
   1243 
   1244     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL,
   1245                     bsize, local_tx_cache, best_rd);
   1246 
   1247     if (this_rate_tokenonly == INT_MAX)
   1248       continue;
   1249 
   1250     this_rate = this_rate_tokenonly + bmode_costs[mode];
   1251     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
   1252 
   1253     if (this_rd < best_rd) {
   1254       mode_selected   = mode;
   1255       best_rd         = this_rd;
   1256       best_tx         = mic->mbmi.tx_size;
   1257       *rate           = this_rate;
   1258       *rate_tokenonly = this_rate_tokenonly;
   1259       *distortion     = this_distortion;
   1260       *skippable      = s;
   1261     }
   1262 
   1263     if (cpi->sf.tx_size_search_method == USE_FULL_RD && this_rd < INT64_MAX) {
   1264       for (i = 0; i < TX_MODES && local_tx_cache[i] < INT64_MAX; i++) {
   1265         const int64_t adj_rd = this_rd + local_tx_cache[i] -
   1266             local_tx_cache[cpi->common.tx_mode];
   1267         if (adj_rd < tx_cache[i]) {
   1268           tx_cache[i] = adj_rd;
   1269         }
   1270       }
   1271     }
   1272   }
   1273 
   1274   mic->mbmi.mode = mode_selected;
   1275   mic->mbmi.tx_size = best_tx;
   1276 
   1277   return best_rd;
   1278 }
   1279 
   1280 static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
   1281                              int *rate, int64_t *distortion, int *skippable,
   1282                              int64_t *sse, BLOCK_SIZE bsize,
   1283                              int64_t ref_best_rd) {
   1284   MACROBLOCKD *const xd = &x->e_mbd;
   1285   MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
   1286   TX_SIZE uv_txfm_size = get_uv_tx_size(mbmi);
   1287   int plane;
   1288   int pnrate = 0, pnskip = 1;
   1289   int64_t pndist = 0, pnsse = 0;
   1290 
   1291   if (ref_best_rd < 0)
   1292     goto term;
   1293 
   1294   if (is_inter_block(mbmi))
   1295     vp9_subtract_sbuv(x, bsize);
   1296 
   1297   *rate = 0;
   1298   *distortion = 0;
   1299   *sse = 0;
   1300   *skippable = 1;
   1301 
   1302   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
   1303     txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
   1304                      ref_best_rd, plane, bsize, uv_txfm_size);
   1305     if (pnrate == INT_MAX)
   1306       goto term;
   1307     *rate += pnrate;
   1308     *distortion += pndist;
   1309     *sse += pnsse;
   1310     *skippable &= pnskip;
   1311   }
   1312   return;
   1313 
   1314   term:
   1315   *rate = INT_MAX;
   1316   *distortion = INT64_MAX;
   1317   *sse = INT64_MAX;
   1318   *skippable = 0;
   1319   return;
   1320 }
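/* super_block_uvrd signals "prune this candidate" through sentinel outputs
 * rather than a return value: when ref_best_rd is negative, or a plane's
 * rate search cannot beat ref_best_rd, it jumps to term and writes
 * rate = INT_MAX, distortion/sse = INT64_MAX, skippable = 0. Callers are
 * expected to test the rate, as in the mode loop below:
 *
 *   super_block_uvrd(cm, x, &r, &d, &s, &sse, bsize, best_rd);
 *   if (r == INT_MAX)
 *     continue;  // cannot beat best_rd; skip this mode
 */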
   1321 
   1322 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
   1323                                        int *rate, int *rate_tokenonly,
   1324                                        int64_t *distortion, int *skippable,
   1325                                        BLOCK_SIZE bsize) {
   1326   MB_PREDICTION_MODE mode;
   1327   MB_PREDICTION_MODE mode_selected = DC_PRED;
   1328   int64_t best_rd = INT64_MAX, this_rd;
   1329   int this_rate_tokenonly, this_rate, s;
   1330   int64_t this_distortion, this_sse;
   1331 
   1332   // int mode_mask = (bsize <= BLOCK_8X8)
   1333   //                ? ALL_INTRA_MODES : cpi->sf.intra_uv_mode_mask;
   1334 
   1335   for (mode = DC_PRED; mode <= TM_PRED; mode++) {
   1336     // if (!(mode_mask & (1 << mode)))
   1337     if (!(cpi->sf.intra_uv_mode_mask & (1 << mode)))
   1338       continue;
   1339 
   1340     x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode;
   1341 
   1342     super_block_uvrd(&cpi->common, x, &this_rate_tokenonly,
   1343                      &this_distortion, &s, &this_sse, bsize, best_rd);
   1344     if (this_rate_tokenonly == INT_MAX)
   1345       continue;
   1346     this_rate = this_rate_tokenonly +
   1347                 x->intra_uv_mode_cost[cpi->common.frame_type][mode];
   1348     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
   1349 
   1350     if (this_rd < best_rd) {
   1351       mode_selected   = mode;
   1352       best_rd         = this_rd;
   1353       *rate           = this_rate;
   1354       *rate_tokenonly = this_rate_tokenonly;
   1355       *distortion     = this_distortion;
   1356       *skippable      = s;
   1357     }
   1358   }
   1359 
   1360   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = mode_selected;
   1361 
   1362   return best_rd;
   1363 }
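/* A minimal sketch of the tradeoff evaluated by RDCOST above, assuming the
 * usual vp9 definition (rate weighted by rdmult with 8 fractional bits,
 * distortion shifted up by rddiv):
 *
 *   #define RDCOST(RM, DM, R, D) \
 *       (((128 + ((int64_t)(R)) * (RM)) >> 8) + ((D) << (DM)))
 *
 * A mode therefore wins only if its extra rate is paid for by a large
 * enough drop in distortion at the current lambda (rdmult).
 */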
   1364 
   1365 static int64_t rd_sbuv_dcpred(VP9_COMP *cpi, MACROBLOCK *x,
   1366                               int *rate, int *rate_tokenonly,
   1367                               int64_t *distortion, int *skippable,
   1368                               BLOCK_SIZE bsize) {
   1369   int64_t this_rd;
   1370   int64_t this_sse;
   1371 
   1372   x->e_mbd.mi_8x8[0]->mbmi.uv_mode = DC_PRED;
   1373   super_block_uvrd(&cpi->common, x, rate_tokenonly,
   1374                    distortion, skippable, &this_sse, bsize, INT64_MAX);
   1375   *rate = *rate_tokenonly +
   1376           x->intra_uv_mode_cost[cpi->common.frame_type][DC_PRED];
   1377   this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
   1378 
   1379   return this_rd;
   1380 }
   1381 
   1382 static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
   1383                                  int *rate_uv, int *rate_uv_tokenonly,
   1384                                  int64_t *dist_uv, int *skip_uv,
   1385                                  MB_PREDICTION_MODE *mode_uv) {
   1386   MACROBLOCK *const x = &cpi->mb;
   1387 
   1388   // Use an estimated rd for uv_intra based on DC_PRED if the
   1389   // appropriate speed flag is set.
   1390   if (cpi->sf.use_uv_intra_rd_estimate) {
   1391     rd_sbuv_dcpred(cpi, x, rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
   1392                    bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   1393   // Else do a proper rd search for each possible transform size that may
   1394   // be considered in the main rd loop.
   1395   } else {
   1396     rd_pick_intra_sbuv_mode(cpi, x,
   1397                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
   1398                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   1399   }
   1400   *mode_uv = x->e_mbd.mi_8x8[0]->mbmi.uv_mode;
   1401 }
   1402 
   1403 static int cost_mv_ref(VP9_COMP *cpi, MB_PREDICTION_MODE mode,
   1404                        int mode_context) {
   1405   MACROBLOCK *const x = &cpi->mb;
   1406   MACROBLOCKD *const xd = &x->e_mbd;
   1407   const int segment_id = xd->this_mi->mbmi.segment_id;
   1408 
   1409   // Don't account for mode here if segment skip is enabled.
   1410   if (!vp9_segfeature_active(&cpi->common.seg, segment_id, SEG_LVL_SKIP)) {
   1411     assert(is_inter_mode(mode));
   1412     return x->inter_mode_cost[mode_context][mode - NEARESTMV];
   1413   } else {
   1414     return 0;
   1415   }
   1416 }
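/* inter_mode_cost is indexed with (mode - NEARESTMV), mapping the inter
 * modes NEARESTMV, NEARMV, ZEROMV, NEWMV onto 0..3; the is_inter_mode()
 * assertion above guards against an intra mode ever reaching this lookup.
 */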
   1417 
   1418 void vp9_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, int_mv *mv) {
   1419   x->e_mbd.mi_8x8[0]->mbmi.mode = mb;
   1420   x->e_mbd.mi_8x8[0]->mbmi.mv[0].as_int = mv->as_int;
   1421 }
   1422 
   1423 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   1424                                 BLOCK_SIZE bsize,
   1425                                 int_mv *frame_mv,
   1426                                 int mi_row, int mi_col,
   1427                                 int_mv single_newmv[MAX_REF_FRAMES],
   1428                                 int *rate_mv);
   1429 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   1430                                  BLOCK_SIZE bsize,
   1431                                  int mi_row, int mi_col,
   1432                                  int_mv *tmp_mv, int *rate_mv);
   1433 
   1434 static int labels2mode(MACROBLOCK *x, int i,
   1435                        MB_PREDICTION_MODE this_mode,
   1436                        int_mv *this_mv, int_mv *this_second_mv,
   1437                        int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
   1438                        int_mv seg_mvs[MAX_REF_FRAMES],
   1439                        int_mv *best_ref_mv,
   1440                        int_mv *second_best_ref_mv,
   1441                        int *mvjcost, int *mvcost[2], VP9_COMP *cpi) {
   1442   MACROBLOCKD *const xd = &x->e_mbd;
   1443   MODE_INFO *const mic = xd->this_mi;
   1444   MB_MODE_INFO *mbmi = &mic->mbmi;
   1445   int cost = 0, thismvcost = 0;
   1446   int idx, idy;
   1447   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
   1448   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   1449 
   1450   /* We have to be careful retrieving previously-encoded motion vectors.
   1451    Ones from this macroblock have to be pulled from the BLOCKD array
   1452    as they have not yet made it to the bmi array in our MB_MODE_INFO. */
   1453   MB_PREDICTION_MODE m;
   1454 
   1455   // The only time we should do costing for a new motion vector or mode
   1456   // is when we are on a new label (jbb May 08, 2007).
   1457   switch (m = this_mode) {
   1458     case NEWMV:
   1459       this_mv->as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
   1460       thismvcost  = vp9_mv_bit_cost(this_mv, best_ref_mv, mvjcost, mvcost,
   1461                                     102);
   1462       if (mbmi->ref_frame[1] > 0) {
   1463         this_second_mv->as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
   1464         thismvcost += vp9_mv_bit_cost(this_second_mv, second_best_ref_mv,
   1465                                       mvjcost, mvcost, 102);
   1466       }
   1467       break;
   1468     case NEARESTMV:
   1469       this_mv->as_int = frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int;
   1470       if (mbmi->ref_frame[1] > 0)
   1471         this_second_mv->as_int =
   1472             frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int;
   1473       break;
   1474     case NEARMV:
   1475       this_mv->as_int = frame_mv[NEARMV][mbmi->ref_frame[0]].as_int;
   1476       if (mbmi->ref_frame[1] > 0)
   1477         this_second_mv->as_int =
   1478             frame_mv[NEARMV][mbmi->ref_frame[1]].as_int;
   1479       break;
   1480     case ZEROMV:
   1481       this_mv->as_int = 0;
   1482       if (mbmi->ref_frame[1] > 0)
   1483         this_second_mv->as_int = 0;
   1484       break;
   1485     default:
   1486       break;
   1487   }
   1488 
   1489   cost = cost_mv_ref(cpi, this_mode,
   1490                      mbmi->mode_context[mbmi->ref_frame[0]]);
   1491 
   1492   mic->bmi[i].as_mv[0].as_int = this_mv->as_int;
   1493   if (mbmi->ref_frame[1] > 0)
   1494     mic->bmi[i].as_mv[1].as_int = this_second_mv->as_int;
   1495 
   1496   x->partition_info->bmi[i].mode = m;
   1497   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
   1498     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
   1499       vpx_memcpy(&mic->bmi[i + idy * 2 + idx],
   1500                  &mic->bmi[i], sizeof(mic->bmi[i]));
   1501 
   1502   cost += thismvcost;
   1503   return cost;
   1504 }
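/* The replication loop above copies bmi[i] across every 4x4 position the
 * label covers: sub-8x8 blocks form a 2x2 raster of 4x4 units, so position
 * (idx, idy) within label i lives at index i + idy * 2 + idx. For example,
 * an 8x4 partition (num_4x4_blocks_wide = 2, num_4x4_blocks_high = 1) with
 * i = 0 copies bmi[0] into bmi[1] as well.
 */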
   1505 
   1506 static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
   1507                                        MACROBLOCK *x,
   1508                                        int64_t best_yrd,
   1509                                        int i,
   1510                                        int *labelyrate,
   1511                                        int64_t *distortion, int64_t *sse,
   1512                                        ENTROPY_CONTEXT *ta,
   1513                                        ENTROPY_CONTEXT *tl) {
   1514   int k;
   1515   MACROBLOCKD *xd = &x->e_mbd;
   1516   struct macroblockd_plane *const pd = &xd->plane[0];
   1517   MODE_INFO *const mi = xd->this_mi;
   1518   const BLOCK_SIZE bsize = mi->mbmi.sb_type;
   1519   const int width = plane_block_width(bsize, pd);
   1520   const int height = plane_block_height(bsize, pd);
   1521   int idx, idy;
   1522   const int src_stride = x->plane[0].src.stride;
   1523   uint8_t* const src = raster_block_offset_uint8(BLOCK_8X8, i,
   1524                                                  x->plane[0].src.buf,
   1525                                                  src_stride);
   1526   int16_t* src_diff = raster_block_offset_int16(BLOCK_8X8, i,
   1527                                                 x->plane[0].src_diff);
   1528   int16_t* coeff = BLOCK_OFFSET(x->plane[0].coeff, i);
   1529   uint8_t* const dst = raster_block_offset_uint8(BLOCK_8X8, i,
   1530                                                  pd->dst.buf, pd->dst.stride);
   1531   int64_t thisdistortion = 0, thissse = 0;
   1532   int thisrate = 0;
   1533   int ref, second_ref = has_second_ref(&mi->mbmi);
   1534 
   1535   for (ref = 0; ref < 1 + second_ref; ++ref) {
   1536     const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i,
   1537                                      pd->pre[ref].buf, pd->pre[ref].stride);
   1538     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
   1539                               dst, pd->dst.stride,
   1540                               &mi->bmi[i].as_mv[ref].as_mv,
   1541                               &xd->scale_factor[ref],
   1542                               width, height, ref, &xd->subpix, MV_PRECISION_Q3);
   1543   }
   1544 
   1545   vp9_subtract_block(height, width, src_diff, 8, src, src_stride,
   1546                      dst, pd->dst.stride);
   1547 
   1548   k = i;
   1549   for (idy = 0; idy < height / 4; ++idy) {
   1550     for (idx = 0; idx < width / 4; ++idx) {
   1551       int64_t ssz, rd, rd1, rd2;
   1552 
   1553       k += (idy * 2 + idx);
   1554       src_diff = raster_block_offset_int16(BLOCK_8X8, k,
   1555                                            x->plane[0].src_diff);
   1556       coeff = BLOCK_OFFSET(x->plane[0].coeff, k);
   1557       x->fwd_txm4x4(src_diff, coeff, 16);
   1558       x->quantize_b_4x4(x, k, DCT_DCT, 16);
   1559       thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
   1560                                         16, &ssz);
   1561       thissse += ssz;
   1562       thisrate += cost_coeffs(x, 0, k,
   1563                               ta + (k & 1),
   1564                               tl + (k >> 1), TX_4X4,
   1565                               vp9_default_scan_4x4,
   1566                               vp9_default_scan_4x4_neighbors);
   1567       rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
   1568       rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
   1569       rd = MIN(rd1, rd2);
   1570       if (rd >= best_yrd)
   1571         return INT64_MAX;
   1572     }
   1573   }
   1574   *distortion = thisdistortion >> 2;
   1575   *labelyrate = thisrate;
   1576   *sse = thissse >> 2;
   1577 
   1578   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
   1579 }
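/* Distortion and sse above are accumulated with vp9_block_error() in the
 * transform domain and scaled down by >> 2 (compensating the forward
 * transform scaling) before being returned. The early exit takes
 * MIN(rd1, rd2) -- the cost of coding the residual vs. the cost of skipping
 * it (rate 0, distortion = sse) -- so a label is abandoned as soon as even
 * its cheaper option cannot beat best_yrd.
 */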
   1580 
   1581 typedef struct {
   1582   int eobs;
   1583   int brate;
   1584   int byrate;
   1585   int64_t bdist;
   1586   int64_t bsse;
   1587   int64_t brdcost;
   1588   int_mv mvs[2];
   1589   ENTROPY_CONTEXT ta[2];
   1590   ENTROPY_CONTEXT tl[2];
   1591 } SEG_RDSTAT;
   1592 
   1593 typedef struct {
   1594   int_mv *ref_mv, *second_ref_mv;
   1595   int_mv mvp;
   1596 
   1597   int64_t segment_rd;
   1598   int r;
   1599   int64_t d;
   1600   int64_t sse;
   1601   int segment_yrate;
   1602   MB_PREDICTION_MODE modes[4];
   1603   SEG_RDSTAT rdstat[4][INTER_MODES];
   1604   int mvthresh;
   1605 } BEST_SEG_INFO;
   1606 
   1607 static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
   1608   int r = 0;
   1609   r |= (mv->as_mv.row >> 3) < x->mv_row_min;
   1610   r |= (mv->as_mv.row >> 3) > x->mv_row_max;
   1611   r |= (mv->as_mv.col >> 3) < x->mv_col_min;
   1612   r |= (mv->as_mv.col >> 3) > x->mv_col_max;
   1613   return r;
   1614 }
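/* Motion vectors are stored in 1/8-pel units, so the >> 3 above converts
 * to full-pel before comparing against the allowed row/col range. A
 * nonzero return means the vector reaches beyond the UMV border:
 *
 *   if (mv_check_bounds(x, &mode_mv[this_mode]))
 *     continue;  // out of range; reject this candidate
 */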
   1615 
   1616 static INLINE void mi_buf_shift(MACROBLOCK *x, int i) {
   1617   MB_MODE_INFO *const mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
   1618   struct macroblock_plane *const p = &x->plane[0];
   1619   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
   1620 
   1621   p->src.buf = raster_block_offset_uint8(BLOCK_8X8, i, p->src.buf,
   1622                                          p->src.stride);
   1623   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
   1624   pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf,
   1625                                              pd->pre[0].stride);
   1626   if (mbmi->ref_frame[1])
   1627     pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf,
   1628                                                pd->pre[1].stride);
   1629 }
   1630 
   1631 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
   1632                                   struct buf_2d orig_pre[2]) {
   1633   MB_MODE_INFO *mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
   1634   x->plane[0].src = orig_src;
   1635   x->e_mbd.plane[0].pre[0] = orig_pre[0];
   1636   if (mbmi->ref_frame[1])
   1637     x->e_mbd.plane[0].pre[1] = orig_pre[1];
   1638 }
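/* mi_buf_shift()/mi_buf_restore() bracket per-label work: shift advances
 * the src and pre[] pointers to the raster offset of 4x4 label i inside
 * the 8x8 block, and restore puts the saved buf_2d structs back. Typical
 * use, as in the segment search below:
 *
 *   const struct buf_2d orig_src = x->plane[0].src;
 *   struct buf_2d orig_pre[2];
 *   vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
 *   mi_buf_shift(x, i);
 *   ...                                  // search on shifted buffers
 *   mi_buf_restore(x, orig_src, orig_pre);
 */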
   1639 
   1640 static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
   1641                                     BEST_SEG_INFO *bsi_buf, int filter_idx,
   1642                                     int_mv seg_mvs[4][MAX_REF_FRAMES],
   1643                                     int mi_row, int mi_col) {
   1644   int i, j, br = 0, idx, idy;
   1645   int64_t bd = 0, block_sse = 0;
   1646   MB_PREDICTION_MODE this_mode;
   1647   MODE_INFO *mi = x->e_mbd.mi_8x8[0];
   1648   MB_MODE_INFO *const mbmi = &mi->mbmi;
   1649   const int label_count = 4;
   1650   int64_t this_segment_rd = 0;
   1651   int label_mv_thresh;
   1652   int segmentyrate = 0;
   1653   const BLOCK_SIZE bsize = mbmi->sb_type;
   1654   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   1655   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
   1656   vp9_variance_fn_ptr_t *v_fn_ptr;
   1657   ENTROPY_CONTEXT t_above[2], t_left[2];
   1658   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
   1659   int mode_idx;
   1660   int subpelmv = 1, have_ref = 0;
   1661 
   1662   vpx_memcpy(t_above, x->e_mbd.plane[0].above_context, sizeof(t_above));
   1663   vpx_memcpy(t_left, x->e_mbd.plane[0].left_context, sizeof(t_left));
   1664 
   1665   v_fn_ptr = &cpi->fn_ptr[bsize];
   1666 
   1667   // 64 makes this threshold very large, effectively
   1668   // ensuring that we rarely check mvs on segments.
   1669   // Setting this to 1 would make the mv threshold
   1670   // roughly equal to what it is for macroblocks.
   1671   label_mv_thresh = 1 * bsi->mvthresh / label_count;
   1672 
   1673   // Segmentation method overheads
   1674   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
   1675     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
   1676       // TODO(jingning,rbultje): rewrite the rate-distortion optimization
   1677       // loop for 4x4/4x8/8x4 block coding; to be replaced with a new rd loop.
   1678       int_mv mode_mv[MB_MODE_COUNT], second_mode_mv[MB_MODE_COUNT];
   1679       int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   1680       MB_PREDICTION_MODE mode_selected = ZEROMV;
   1681       int64_t best_rd = INT64_MAX;
   1682       i = idy * 2 + idx;
   1683 
   1684       frame_mv[ZEROMV][mbmi->ref_frame[0]].as_int = 0;
   1685       frame_mv[ZEROMV][mbmi->ref_frame[1]].as_int = 0;
   1686       vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
   1687                                     &frame_mv[NEARESTMV][mbmi->ref_frame[0]],
   1688                                     &frame_mv[NEARMV][mbmi->ref_frame[0]],
   1689                                     i, 0, mi_row, mi_col);
   1690       if (mbmi->ref_frame[1] > 0)
   1691         vp9_append_sub8x8_mvs_for_idx(&cpi->common, &x->e_mbd,
   1692                                    &frame_mv[NEARESTMV][mbmi->ref_frame[1]],
   1693                                    &frame_mv[NEARMV][mbmi->ref_frame[1]],
   1694                                    i, 1, mi_row, mi_col);
   1695 
   1696       // search for the best motion vector on this segment
   1697       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
   1698         const struct buf_2d orig_src = x->plane[0].src;
   1699         struct buf_2d orig_pre[2];
   1700 
   1701         mode_idx = inter_mode_offset(this_mode);
   1702         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
   1703 
   1704         // If near/nearest/zero and mv == (0,0), compare cost to ZEROMV.
   1705         if ((this_mode == NEARMV || this_mode == NEARESTMV ||
   1706              this_mode == ZEROMV) &&
   1707             frame_mv[this_mode][mbmi->ref_frame[0]].as_int == 0 &&
   1708             (mbmi->ref_frame[1] <= 0 ||
   1709              frame_mv[this_mode][mbmi->ref_frame[1]].as_int == 0)) {
   1710           int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
   1711           int c1 = cost_mv_ref(cpi, NEARMV, rfc);
   1712           int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
   1713           int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
   1714 
   1715           if (this_mode == NEARMV) {
   1716             if (c1 > c3)
   1717               continue;
   1718           } else if (this_mode == NEARESTMV) {
   1719             if (c2 > c3)
   1720               continue;
   1721           } else {
   1722             assert(this_mode == ZEROMV);
   1723             if (mbmi->ref_frame[1] <= 0) {
   1724               if ((c3 >= c2 &&
   1725                    frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
   1726                   (c3 >= c1 &&
   1727                    frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
   1728                 continue;
   1729             } else {
   1730               if ((c3 >= c2 &&
   1731                    frame_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
   1732                    frame_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
   1733                   (c3 >= c1 &&
   1734                    frame_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
   1735                    frame_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
   1736                 continue;
   1737             }
   1738           }
   1739         }
   1740 
   1741         vpx_memcpy(orig_pre, x->e_mbd.plane[0].pre, sizeof(orig_pre));
   1742         vpx_memcpy(bsi->rdstat[i][mode_idx].ta, t_above,
   1743                    sizeof(bsi->rdstat[i][mode_idx].ta));
   1744         vpx_memcpy(bsi->rdstat[i][mode_idx].tl, t_left,
   1745                    sizeof(bsi->rdstat[i][mode_idx].tl));
   1746 
   1747         // motion search for newmv (single predictor case only)
   1748         if (mbmi->ref_frame[1] <= 0 && this_mode == NEWMV &&
   1749             seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
   1750           int step_param = 0;
   1751           int further_steps;
   1752           int thissme, bestsme = INT_MAX;
   1753           int sadpb = x->sadperbit4;
   1754           int_mv mvp_full;
   1755           int max_mv;
   1756 
   1757           /* Is the best so far sufficiently good that we can't justify
   1758            * doing a new motion search? */
   1759           if (best_rd < label_mv_thresh)
   1760             break;
   1761 
   1762           if (cpi->compressor_speed) {
   1763             // use previous block's result as next block's MV predictor.
   1764             if (i > 0) {
   1765               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
   1766               if (i == 2)
   1767                 bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
   1768             }
   1769           }
   1770           if (i == 0)
   1771             max_mv = x->max_mv_context[mbmi->ref_frame[0]];
   1772           else
   1773             max_mv = MAX(abs(bsi->mvp.as_mv.row), abs(bsi->mvp.as_mv.col)) >> 3;
   1774 
   1775           if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
   1776             // Take a weighted average of the step_params based on the last frame's
   1777             // max mv magnitude and the best ref mvs of the current block for
   1778             // the given reference.
   1779             step_param = (vp9_init_search_range(cpi, max_mv) +
   1780                           cpi->mv_step_param) >> 1;
   1781           } else {
   1782             step_param = cpi->mv_step_param;
   1783           }
   1784 
   1785           mvp_full.as_mv.row = bsi->mvp.as_mv.row >> 3;
   1786           mvp_full.as_mv.col = bsi->mvp.as_mv.col >> 3;
   1787 
   1788           if (cpi->sf.adaptive_motion_search && cpi->common.show_frame) {
   1789             mvp_full.as_mv.row = x->pred_mv[mbmi->ref_frame[0]].as_mv.row >> 3;
   1790             mvp_full.as_mv.col = x->pred_mv[mbmi->ref_frame[0]].as_mv.col >> 3;
   1791             step_param = MAX(step_param, 8);
   1792           }
   1793 
   1794           further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
   1795           // adjust src pointer for this block
   1796           mi_buf_shift(x, i);
   1797           if (cpi->sf.search_method == HEX) {
   1798             bestsme = vp9_hex_search(x, &mvp_full,
   1799                                      step_param,
   1800                                      sadpb, 1, v_fn_ptr, 1,
   1801                                      bsi->ref_mv, &mode_mv[NEWMV]);
   1802           } else if (cpi->sf.search_method == SQUARE) {
   1803             bestsme = vp9_square_search(x, &mvp_full,
   1804                                         step_param,
   1805                                         sadpb, 1, v_fn_ptr, 1,
   1806                                         bsi->ref_mv, &mode_mv[NEWMV]);
   1807           } else if (cpi->sf.search_method == BIGDIA) {
   1808             bestsme = vp9_bigdia_search(x, &mvp_full,
   1809                                         step_param,
   1810                                         sadpb, 1, v_fn_ptr, 1,
   1811                                         bsi->ref_mv, &mode_mv[NEWMV]);
   1812           } else {
   1813             bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
   1814                                              sadpb, further_steps, 0, v_fn_ptr,
   1815                                              bsi->ref_mv, &mode_mv[NEWMV]);
   1816           }
   1817 
   1818           // Should we do a full search (best quality only)
   1819           if (cpi->compressor_speed == 0) {
   1820             /* Check if mvp_full is within the range. */
   1821             clamp_mv(&mvp_full.as_mv, x->mv_col_min, x->mv_col_max,
   1822                      x->mv_row_min, x->mv_row_max);
   1823 
   1824             thissme = cpi->full_search_sad(x, &mvp_full,
   1825                                            sadpb, 16, v_fn_ptr,
   1826                                            x->nmvjointcost, x->mvcost,
   1827                                            bsi->ref_mv, i);
   1828 
   1829             if (thissme < bestsme) {
   1830               bestsme = thissme;
   1831               mode_mv[NEWMV].as_int = mi->bmi[i].as_mv[0].as_int;
   1832             } else {
   1833               /* The full search result is actually worse so re-instate the
   1834                * previous best vector */
   1835               mi->bmi[i].as_mv[0].as_int = mode_mv[NEWMV].as_int;
   1836             }
   1837           }
   1838 
   1839           if (bestsme < INT_MAX) {
   1840             int distortion;
   1841             unsigned int sse;
   1842             cpi->find_fractional_mv_step(x, &mode_mv[NEWMV],
   1843                                          bsi->ref_mv, x->errorperbit, v_fn_ptr,
   1844                                          0, cpi->sf.subpel_iters_per_step,
   1845                                          x->nmvjointcost, x->mvcost,
   1846                                          &distortion, &sse);
   1847 
   1848             // save motion search result for use in compound prediction
   1849             seg_mvs[i][mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
   1850           }
   1851 
   1852           if (cpi->sf.adaptive_motion_search)
   1853             x->pred_mv[mbmi->ref_frame[0]].as_int = mode_mv[NEWMV].as_int;
   1854 
   1855           // restore src pointers
   1856           mi_buf_restore(x, orig_src, orig_pre);
   1857         }
   1858 
   1859         if (mbmi->ref_frame[1] > 0 && this_mode == NEWMV &&
   1860             mbmi->interp_filter == EIGHTTAP) {
   1861           if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
   1862               seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
   1863             continue;
   1864 
   1865           // adjust src pointers
   1866           mi_buf_shift(x, i);
   1867           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
   1868             int rate_mv;
   1869             joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
   1870                                 mi_row, mi_col, seg_mvs[i],
   1871                                 &rate_mv);
   1872             seg_mvs[i][mbmi->ref_frame[0]].as_int =
   1873                 frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
   1874             seg_mvs[i][mbmi->ref_frame[1]].as_int =
   1875                 frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
   1876           }
   1877           // restore src pointers
   1878           mi_buf_restore(x, orig_src, orig_pre);
   1879         }
   1880 
   1881         bsi->rdstat[i][mode_idx].brate =
   1882             labels2mode(x, i, this_mode, &mode_mv[this_mode],
   1883                         &second_mode_mv[this_mode], frame_mv, seg_mvs[i],
   1884                         bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
   1885                         x->mvcost, cpi);
   1886 
   1887         bsi->rdstat[i][mode_idx].mvs[0].as_int = mode_mv[this_mode].as_int;
   1888         if (num_4x4_blocks_wide > 1)
   1889           bsi->rdstat[i + 1][mode_idx].mvs[0].as_int =
   1890               mode_mv[this_mode].as_int;
   1891         if (num_4x4_blocks_high > 1)
   1892           bsi->rdstat[i + 2][mode_idx].mvs[0].as_int =
   1893               mode_mv[this_mode].as_int;
   1894         if (mbmi->ref_frame[1] > 0) {
   1895           bsi->rdstat[i][mode_idx].mvs[1].as_int =
   1896               second_mode_mv[this_mode].as_int;
   1897           if (num_4x4_blocks_wide > 1)
   1898             bsi->rdstat[i + 1][mode_idx].mvs[1].as_int =
   1899                 second_mode_mv[this_mode].as_int;
   1900           if (num_4x4_blocks_high > 1)
   1901             bsi->rdstat[i + 2][mode_idx].mvs[1].as_int =
   1902                 second_mode_mv[this_mode].as_int;
   1903         }
   1904 
   1905         // Trap vectors that reach beyond the UMV borders
   1906         if (mv_check_bounds(x, &mode_mv[this_mode]))
   1907           continue;
   1908         if (mbmi->ref_frame[1] > 0 &&
   1909             mv_check_bounds(x, &second_mode_mv[this_mode]))
   1910           continue;
   1911 
   1912         if (filter_idx > 0) {
   1913           BEST_SEG_INFO *ref_bsi = bsi_buf;
   1914           subpelmv = (mode_mv[this_mode].as_mv.row & 0x0f) ||
   1915                      (mode_mv[this_mode].as_mv.col & 0x0f);
   1916           have_ref = mode_mv[this_mode].as_int ==
   1917                      ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
   1918           if (mbmi->ref_frame[1] > 0) {
   1919             subpelmv |= (second_mode_mv[this_mode].as_mv.row & 0x0f) ||
   1920                         (second_mode_mv[this_mode].as_mv.col & 0x0f);
   1921             have_ref  &= second_mode_mv[this_mode].as_int ==
   1922                          ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
   1923           }
   1924 
   1925           if (filter_idx > 1 && !subpelmv && !have_ref) {
   1926             ref_bsi = bsi_buf + 1;
   1927             have_ref = mode_mv[this_mode].as_int ==
   1928                        ref_bsi->rdstat[i][mode_idx].mvs[0].as_int;
   1929             if (mbmi->ref_frame[1] > 0) {
   1930               have_ref  &= second_mode_mv[this_mode].as_int ==
   1931                            ref_bsi->rdstat[i][mode_idx].mvs[1].as_int;
   1932             }
   1933           }
   1934 
   1935           if (!subpelmv && have_ref &&
   1936               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
   1937             vpx_memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
   1938                        sizeof(SEG_RDSTAT));
   1939             if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
   1940               mode_selected = this_mode;
   1941               best_rd = bsi->rdstat[i][mode_idx].brdcost;
   1942             }
   1943             continue;
   1944           }
   1945         }
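        /* Reaching this point means the mv is subpel, or differs from what
         * the previously tried filter chose. For filter_idx > 0 the block
         * above reuses rd stats from bsi_buf[0] (then bsi_buf[1]) whenever
         * the candidate mv is full-pel and identical to that filter's
         * result, since interpolation filters can only change the
         * prediction -- and hence the rd cost -- at subpel positions.
         */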
   1946 
   1947         bsi->rdstat[i][mode_idx].brdcost =
   1948             encode_inter_mb_segment(cpi, x,
   1949                                     bsi->segment_rd - this_segment_rd, i,
   1950                                     &bsi->rdstat[i][mode_idx].byrate,
   1951                                     &bsi->rdstat[i][mode_idx].bdist,
   1952                                     &bsi->rdstat[i][mode_idx].bsse,
   1953                                     bsi->rdstat[i][mode_idx].ta,
   1954                                     bsi->rdstat[i][mode_idx].tl);
   1955         if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
   1956           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
   1957                                             bsi->rdstat[i][mode_idx].brate, 0);
   1958           bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate;
   1959           bsi->rdstat[i][mode_idx].eobs = x->e_mbd.plane[0].eobs[i];
   1960         }
   1961 
   1962         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
   1963           mode_selected = this_mode;
   1964           best_rd = bsi->rdstat[i][mode_idx].brdcost;
   1965         }
   1966       } /*for each 4x4 mode*/
   1967 
   1968       if (best_rd == INT64_MAX) {
   1969         int iy, midx;
   1970         for (iy = i + 1; iy < 4; ++iy)
   1971           for (midx = 0; midx < INTER_MODES; ++midx)
   1972             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
   1973         bsi->segment_rd = INT64_MAX;
   1974         return;
   1975       }
   1976 
   1977       mode_idx = inter_mode_offset(mode_selected);
   1978       vpx_memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
   1979       vpx_memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
   1980 
   1981       labels2mode(x, i, mode_selected, &mode_mv[mode_selected],
   1982                   &second_mode_mv[mode_selected], frame_mv, seg_mvs[i],
   1983                   bsi->ref_mv, bsi->second_ref_mv, x->nmvjointcost,
   1984                   x->mvcost, cpi);
   1985 
   1986       br += bsi->rdstat[i][mode_idx].brate;
   1987       bd += bsi->rdstat[i][mode_idx].bdist;
   1988       block_sse += bsi->rdstat[i][mode_idx].bsse;
   1989       segmentyrate += bsi->rdstat[i][mode_idx].byrate;
   1990       this_segment_rd += bsi->rdstat[i][mode_idx].brdcost;
   1991 
   1992       if (this_segment_rd > bsi->segment_rd) {
   1993         int iy, midx;
   1994         for (iy = i + 1; iy < 4; ++iy)
   1995           for (midx = 0; midx < INTER_MODES; ++midx)
   1996             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
   1997         bsi->segment_rd = INT64_MAX;
   1998         return;
   1999       }
   2000 
   2001       for (j = 1; j < num_4x4_blocks_high; ++j)
   2002         vpx_memcpy(&x->partition_info->bmi[i + j * 2],
   2003                    &x->partition_info->bmi[i],
   2004                    sizeof(x->partition_info->bmi[i]));
   2005       for (j = 1; j < num_4x4_blocks_wide; ++j)
   2006         vpx_memcpy(&x->partition_info->bmi[i + j],
   2007                    &x->partition_info->bmi[i],
   2008                    sizeof(x->partition_info->bmi[i]));
   2009     }
   2010   } /* for each label */
   2011 
   2012   bsi->r = br;
   2013   bsi->d = bd;
   2014   bsi->segment_yrate = segmentyrate;
   2015   bsi->segment_rd = this_segment_rd;
   2016   bsi->sse = block_sse;
   2017 
   2018   // update the coding decisions
   2019   for (i = 0; i < 4; ++i)
   2020     bsi->modes[i] = x->partition_info->bmi[i].mode;
   2021 }
   2022 
   2023 static int64_t rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
   2024                                            int_mv *best_ref_mv,
   2025                                            int_mv *second_best_ref_mv,
   2026                                            int64_t best_rd,
   2027                                            int *returntotrate,
   2028                                            int *returnyrate,
   2029                                            int64_t *returndistortion,
   2030                                            int *skippable, int64_t *psse,
   2031                                            int mvthresh,
   2032                                            int_mv seg_mvs[4][MAX_REF_FRAMES],
   2033                                            BEST_SEG_INFO *bsi_buf,
   2034                                            int filter_idx,
   2035                                            int mi_row, int mi_col) {
   2036   int i;
   2037   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
   2038   MACROBLOCKD *xd = &x->e_mbd;
   2039   MODE_INFO *mi = xd->this_mi;
   2040   MB_MODE_INFO *mbmi = &mi->mbmi;
   2041   int mode_idx;
   2042 
   2043   vp9_zero(*bsi);
   2044 
   2045   bsi->segment_rd = best_rd;
   2046   bsi->ref_mv = best_ref_mv;
   2047   bsi->second_ref_mv = second_best_ref_mv;
   2048   bsi->mvp.as_int = best_ref_mv->as_int;
   2049   bsi->mvthresh = mvthresh;
   2050 
   2051   for (i = 0; i < 4; i++)
   2052     bsi->modes[i] = ZEROMV;
   2053 
   2054   rd_check_segment_txsize(cpi, x, bsi_buf, filter_idx, seg_mvs, mi_row, mi_col);
   2055 
   2056   if (bsi->segment_rd > best_rd)
   2057     return INT64_MAX;
   2058   /* Set the sub-block mvs and modes to the best segmentation found. */
   2059   for (i = 0; i < 4; i++) {
   2060     mode_idx = inter_mode_offset(bsi->modes[i]);
   2061     mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
   2062     if (mbmi->ref_frame[1] > 0)
   2063       mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
   2064     xd->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
   2065     x->partition_info->bmi[i].mode = bsi->modes[i];
   2066   }
   2067 
   2068   /*
   2069    * Copy out the results (this block formerly also set mbmi->mv.as_int).
   2070    */
   2071   *returntotrate = bsi->r;
   2072   *returndistortion = bsi->d;
   2073   *returnyrate = bsi->segment_yrate;
   2074   *skippable = vp9_is_skippable_in_plane(&x->e_mbd, BLOCK_8X8, 0);
   2075   *psse = bsi->sse;
   2076   mbmi->mode = bsi->modes[3];
   2077 
   2078   return bsi->segment_rd;
   2079 }
   2080 
   2081 static void mv_pred(VP9_COMP *cpi, MACROBLOCK *x,
   2082                     uint8_t *ref_y_buffer, int ref_y_stride,
   2083                     int ref_frame, BLOCK_SIZE block_size ) {
   2084   MACROBLOCKD *xd = &x->e_mbd;
   2085   MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
   2086   int_mv this_mv;
   2087   int i;
   2088   int zero_seen = 0;
   2089   int best_index = 0;
   2090   int best_sad = INT_MAX;
   2091   int this_sad = INT_MAX;
   2092   unsigned int max_mv = 0;
   2093 
   2094   uint8_t *src_y_ptr = x->plane[0].src.buf;
   2095   uint8_t *ref_y_ptr;
   2096   int row_offset, col_offset;
   2097   int num_mv_refs = MAX_MV_REF_CANDIDATES +
   2098                     (cpi->sf.adaptive_motion_search &&
   2099                      cpi->common.show_frame &&
   2100                      block_size < cpi->sf.max_partition_size);
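  /* The boolean sum above appends one extra candidate -- the running
   * pred_mv for this reference -- after the MAX_MV_REF_CANDIDATES list
   * entries, but only for shown frames with adaptive motion search enabled
   * and a partition below the speed feature's maximum partition size.
   */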
   2101 
   2102   // Get the SAD for each candidate reference mv.
   2103   for (i = 0; i < num_mv_refs; i++) {
   2104     this_mv.as_int = (i < MAX_MV_REF_CANDIDATES) ?
   2105         mbmi->ref_mvs[ref_frame][i].as_int : x->pred_mv[ref_frame].as_int;
   2106 
   2107     max_mv = MAX(max_mv,
   2108                  MAX(abs(this_mv.as_mv.row), abs(this_mv.as_mv.col)) >> 3);
   2109     // The list is at an end if we see 0 for a second time.
   2110     if (!this_mv.as_int && zero_seen)
   2111       break;
   2112     zero_seen = zero_seen || !this_mv.as_int;
   2113 
   2114     row_offset = this_mv.as_mv.row >> 3;
   2115     col_offset = this_mv.as_mv.col >> 3;
   2116     ref_y_ptr = ref_y_buffer + (ref_y_stride * row_offset) + col_offset;
   2117 
   2118     // Find sad for current vector.
   2119     this_sad = cpi->fn_ptr[block_size].sdf(src_y_ptr, x->plane[0].src.stride,
   2120                                            ref_y_ptr, ref_y_stride,
   2121                                            0x7fffffff);
   2122 
   2123     // Note if it is the best so far.
   2124     if (this_sad < best_sad) {
   2125       best_sad = this_sad;
   2126       best_index = i;
   2127     }
   2128   }
   2129 
   2130   // Note the index of the mv that worked best in the reference list.
   2131   x->mv_best_ref_index[ref_frame] = best_index;
   2132   x->max_mv_context[ref_frame] = max_mv;
   2133 }
   2134 
   2135 static void estimate_ref_frame_costs(VP9_COMP *cpi, int segment_id,
   2136                                      unsigned int *ref_costs_single,
   2137                                      unsigned int *ref_costs_comp,
   2138                                      vp9_prob *comp_mode_p) {
   2139   VP9_COMMON *const cm = &cpi->common;
   2140   MACROBLOCKD *const xd = &cpi->mb.e_mbd;
   2141   int seg_ref_active = vp9_segfeature_active(&cm->seg, segment_id,
   2142                                              SEG_LVL_REF_FRAME);
   2143   if (seg_ref_active) {
   2144     vpx_memset(ref_costs_single, 0, MAX_REF_FRAMES * sizeof(*ref_costs_single));
   2145     vpx_memset(ref_costs_comp,   0, MAX_REF_FRAMES * sizeof(*ref_costs_comp));
   2146     *comp_mode_p = 128;
   2147   } else {
   2148     vp9_prob intra_inter_p = vp9_get_pred_prob_intra_inter(cm, xd);
   2149     vp9_prob comp_inter_p = 128;
   2150 
   2151     if (cm->comp_pred_mode == HYBRID_PREDICTION) {
   2152       comp_inter_p = vp9_get_pred_prob_comp_inter_inter(cm, xd);
   2153       *comp_mode_p = comp_inter_p;
   2154     } else {
   2155       *comp_mode_p = 128;
   2156     }
   2157 
   2158     ref_costs_single[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
   2159 
   2160     if (cm->comp_pred_mode != COMP_PREDICTION_ONLY) {
   2161       vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
   2162       vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
   2163       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
   2164 
   2165       if (cm->comp_pred_mode == HYBRID_PREDICTION)
   2166         base_cost += vp9_cost_bit(comp_inter_p, 0);
   2167 
   2168       ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
   2169           ref_costs_single[ALTREF_FRAME] = base_cost;
   2170       ref_costs_single[LAST_FRAME]   += vp9_cost_bit(ref_single_p1, 0);
   2171       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
   2172       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
   2173       ref_costs_single[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
   2174       ref_costs_single[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
   2175     } else {
   2176       ref_costs_single[LAST_FRAME]   = 512;
   2177       ref_costs_single[GOLDEN_FRAME] = 512;
   2178       ref_costs_single[ALTREF_FRAME] = 512;
   2179     }
   2180     if (cm->comp_pred_mode != SINGLE_PREDICTION_ONLY) {
   2181       vp9_prob ref_comp_p = vp9_get_pred_prob_comp_ref_p(cm, xd);
   2182       unsigned int base_cost = vp9_cost_bit(intra_inter_p, 1);
   2183 
   2184       if (cm->comp_pred_mode == HYBRID_PREDICTION)
   2185         base_cost += vp9_cost_bit(comp_inter_p, 1);
   2186 
   2187       ref_costs_comp[LAST_FRAME]   = base_cost + vp9_cost_bit(ref_comp_p, 0);
   2188       ref_costs_comp[GOLDEN_FRAME] = base_cost + vp9_cost_bit(ref_comp_p, 1);
   2189     } else {
   2190       ref_costs_comp[LAST_FRAME]   = 512;
   2191       ref_costs_comp[GOLDEN_FRAME] = 512;
   2192     }
   2193   }
   2194 }
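/* A note on units: the values produced here are in the fixed-point cost
 * units used by vp9_cost_bit (roughly 256 per bit, assuming the usual vp9
 * cost tables), so the hard-wired 512 charged when a prediction type is
 * disabled corresponds to about two bits per reference frame.
 */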
   2195 
   2196 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
   2197                          int mode_index,
   2198                          PARTITION_INFO *partition,
   2199                          int_mv *ref_mv,
   2200                          int_mv *second_ref_mv,
   2201                          int64_t comp_pred_diff[NB_PREDICTION_TYPES],
   2202                          int64_t tx_size_diff[TX_MODES],
   2203                          int64_t best_filter_diff[SWITCHABLE_FILTERS + 1]) {
   2204   MACROBLOCKD *const xd = &x->e_mbd;
   2205 
   2206   // Take a snapshot of the coding context so it can be
   2207   // restored if we decide to encode this way
   2208   ctx->skip = x->skip;
   2209   ctx->best_mode_index = mode_index;
   2210   ctx->mic = *xd->this_mi;
   2211 
   2212   if (partition)
   2213     ctx->partition_info = *partition;
   2214 
   2215   ctx->best_ref_mv.as_int = ref_mv->as_int;
   2216   ctx->second_best_ref_mv.as_int = second_ref_mv->as_int;
   2217 
   2218   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_PREDICTION_ONLY];
   2219   ctx->comp_pred_diff   = (int)comp_pred_diff[COMP_PREDICTION_ONLY];
   2220   ctx->hybrid_pred_diff = (int)comp_pred_diff[HYBRID_PREDICTION];
   2221 
   2222   // FIXME(rbultje) does this memcpy the whole array? I believe sizeof()
   2223   // doesn't actually work this way
   2224   memcpy(ctx->tx_rd_diff, tx_size_diff, sizeof(ctx->tx_rd_diff));
   2225   memcpy(ctx->best_filter_diff, best_filter_diff,
   2226          sizeof(*best_filter_diff) * (SWITCHABLE_FILTERS + 1));
   2227 }
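/* Re the FIXME above: assuming tx_rd_diff is declared as an array member
 * of PICK_MODE_CONTEXT (e.g. int64_t tx_rd_diff[TX_MODES]), sizeof applied
 * to an array member does yield the full array size, unlike sizeof on a
 * pointer, so the first memcpy copies the whole array.
 */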
   2228 
   2229 static void setup_pred_block(const MACROBLOCKD *xd,
   2230                              struct buf_2d dst[MAX_MB_PLANE],
   2231                              const YV12_BUFFER_CONFIG *src,
   2232                              int mi_row, int mi_col,
   2233                              const struct scale_factors *scale,
   2234                              const struct scale_factors *scale_uv) {
   2235   int i;
   2236 
   2237   dst[0].buf = src->y_buffer;
   2238   dst[0].stride = src->y_stride;
   2239   dst[1].buf = src->u_buffer;
   2240   dst[2].buf = src->v_buffer;
   2241   dst[1].stride = dst[2].stride = src->uv_stride;
   2242 #if CONFIG_ALPHA
   2243   dst[3].buf = src->alpha_buffer;
   2244   dst[3].stride = src->alpha_stride;
   2245 #endif
   2246 
   2247   // TODO(jkoleszar): Make scale factors per-plane data
   2248   for (i = 0; i < MAX_MB_PLANE; i++) {
   2249     setup_pred_plane(dst + i, dst[i].buf, dst[i].stride, mi_row, mi_col,
   2250                      i ? scale_uv : scale,
   2251                      xd->plane[i].subsampling_x, xd->plane[i].subsampling_y);
   2252   }
   2253 }
   2254 
   2255 static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
   2256                                int idx, MV_REFERENCE_FRAME frame_type,
   2257                                BLOCK_SIZE block_size,
   2258                                int mi_row, int mi_col,
   2259                                int_mv frame_nearest_mv[MAX_REF_FRAMES],
   2260                                int_mv frame_near_mv[MAX_REF_FRAMES],
   2261                                struct buf_2d yv12_mb[4][MAX_MB_PLANE],
   2262                                struct scale_factors scale[MAX_REF_FRAMES]) {
   2263   VP9_COMMON *cm = &cpi->common;
   2264   YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]];
   2265   MACROBLOCKD *const xd = &x->e_mbd;
   2266   MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
   2267 
   2268   // set up scaling factors
   2269   scale[frame_type] = cpi->common.active_ref_scale[frame_type - 1];
   2270 
   2271   scale[frame_type].x_offset_q4 =
   2272       ROUND_POWER_OF_TWO(mi_col * MI_SIZE * scale[frame_type].x_scale_fp,
   2273        REF_SCALE_SHIFT) & 0xf;
   2274   scale[frame_type].y_offset_q4 =
   2275       ROUND_POWER_OF_TWO(mi_row * MI_SIZE * scale[frame_type].y_scale_fp,
   2276        REF_SCALE_SHIFT) & 0xf;
   2277 
   2278   // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this
   2279   // use the UV scaling factors.
   2280   setup_pred_block(xd, yv12_mb[frame_type], yv12, mi_row, mi_col,
   2281                    &scale[frame_type], &scale[frame_type]);
   2282 
   2283   // Gets an initial list of candidate vectors from neighbours and orders them
   2284   vp9_find_mv_refs(&cpi->common, xd, xd->this_mi,
   2285                    xd->last_mi,
   2286                    frame_type,
   2287                    mbmi->ref_mvs[frame_type], mi_row, mi_col);
   2288 
   2289   // Candidate refinement carried out at encoder and decoder
   2290   vp9_find_best_ref_mvs(xd,
   2291                         mbmi->ref_mvs[frame_type],
   2292                         &frame_nearest_mv[frame_type],
   2293                         &frame_near_mv[frame_type]);
   2294 
   2295   // Further refinement that is encode side only to test the top few candidates
   2296   // in full and choose the best as the centre point for subsequent searches.
   2297   // The current implementation doesn't support scaling.
   2298   if (!vp9_is_scaled(&scale[frame_type]))
   2299     mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride,
   2300             frame_type, block_size);
   2301 }
   2302 
   2303 static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
   2304   YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
   2305   int fb = get_ref_frame_idx(cpi, ref_frame);
   2306   int fb_scale = get_scale_ref_frame_idx(cpi, ref_frame);
   2307   if (cpi->scaled_ref_idx[fb_scale] != cpi->common.ref_frame_map[fb])
   2308     scaled_ref_frame = &cpi->common.yv12_fb[cpi->scaled_ref_idx[fb_scale]];
   2309   return scaled_ref_frame;
   2310 }
   2311 
   2312 static INLINE int get_switchable_rate(const MACROBLOCK *x) {
   2313   const MACROBLOCKD *const xd = &x->e_mbd;
   2314   const MB_MODE_INFO *const mbmi = &xd->this_mi->mbmi;
   2315   const int ctx = vp9_get_pred_context_switchable_interp(xd);
   2316   return SWITCHABLE_INTERP_RATE_FACTOR *
   2317              x->switchable_interp_costs[ctx][mbmi->interp_filter];
   2318 }
   2319 
   2320 static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   2321                                  BLOCK_SIZE bsize,
   2322                                  int mi_row, int mi_col,
   2323                                  int_mv *tmp_mv, int *rate_mv) {
   2324   MACROBLOCKD *xd = &x->e_mbd;
   2325   VP9_COMMON *cm = &cpi->common;
   2326   MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
   2327   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   2328   int bestsme = INT_MAX;
   2329   int further_steps, step_param;
   2330   int sadpb = x->sadperbit16;
   2331   int_mv mvp_full;
   2332   int ref = mbmi->ref_frame[0];
   2333   int_mv ref_mv = mbmi->ref_mvs[ref][0];
   2334   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   2335 
   2336   int tmp_col_min = x->mv_col_min;
   2337   int tmp_col_max = x->mv_col_max;
   2338   int tmp_row_min = x->mv_row_min;
   2339   int tmp_row_max = x->mv_row_max;
   2340 
   2341   YV12_BUFFER_CONFIG *scaled_ref_frame = get_scaled_ref_frame(cpi, ref);
   2342 
   2343   if (scaled_ref_frame) {
   2344     int i;
   2345     // Swap out the reference frame for a version that's been scaled to
   2346     // match the resolution of the current frame, allowing the existing
   2347     // motion search code to be used without additional modifications.
   2348     for (i = 0; i < MAX_MB_PLANE; i++)
   2349       backup_yv12[i] = xd->plane[i].pre[0];
   2350 
   2351     setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
   2352   }
   2353 
   2354   vp9_clamp_mv_min_max(x, &ref_mv.as_mv);
   2355 
   2356   // Adjust search parameters based on small partitions' result.
   2357   if (x->fast_ms) {
   2358     // && abs(mvp_full.as_mv.row - x->pred_mv.as_mv.row) < 24 &&
   2359     // abs(mvp_full.as_mv.col - x->pred_mv.as_mv.col) < 24) {
   2360     // adjust search range
   2361     step_param = 6;
   2362     if (x->fast_ms > 1)
   2363       step_param = 8;
   2364 
   2365     // Get prediction MV.
   2366     mvp_full.as_int = x->pred_mv[ref].as_int;
   2367 
   2368     // Adjust MV sign if needed.
   2369     if (cm->ref_frame_sign_bias[ref]) {
   2370       mvp_full.as_mv.col *= -1;
   2371       mvp_full.as_mv.row *= -1;
   2372     }
   2373   } else {
   2374     // Work out the size of the first step in the mv step search.
   2375     // 0 here means the maximum-length first step; 1 is MAX >> 1, etc.
   2376     if (cpi->sf.auto_mv_step_size && cpi->common.show_frame) {
   2377       // Take a weighted average of the step_params based on the last frame's
   2378       // max mv magnitude and that based on the best ref mvs of the current
   2379       // block for the given reference.
   2380       step_param = (vp9_init_search_range(cpi, x->max_mv_context[ref]) +
   2381                     cpi->mv_step_param) >> 1;
   2382     } else {
   2383       step_param = cpi->mv_step_param;
   2384     }
   2385   }
   2386 
   2387   if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64 &&
   2388       cpi->common.show_frame) {
   2389     int boffset = 2 * (b_width_log2(BLOCK_64X64) - MIN(b_height_log2(bsize),
   2390                                                        b_width_log2(bsize)));
   2391     step_param = MAX(step_param, boffset);
   2392   }
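  /* Worked example for the boffset bias above, assuming b_width_log2()
   * counts 4-sample units (BLOCK_64X64 -> 4, BLOCK_16X16 -> 2): a 16x16
   * partition gets boffset = 2 * (4 - 2) = 4, raising step_param and thus
   * shrinking the first diamond step relative to a 64x64 search.
   */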
   2393 
   2394   mvp_full.as_int = x->mv_best_ref_index[ref] < MAX_MV_REF_CANDIDATES ?
   2395       mbmi->ref_mvs[ref][x->mv_best_ref_index[ref]].as_int :
   2396       x->pred_mv[ref].as_int;
   2397 
   2398   mvp_full.as_mv.col >>= 3;
   2399   mvp_full.as_mv.row >>= 3;
   2400 
   2401   // Further step/diamond searches as necessary
   2402   further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
   2403 
   2404   if (cpi->sf.search_method == HEX) {
   2405     bestsme = vp9_hex_search(x, &mvp_full,
   2406                              step_param,
   2407                              sadpb, 1,
   2408                              &cpi->fn_ptr[block_size], 1,
   2409                              &ref_mv, tmp_mv);
   2410   } else if (cpi->sf.search_method == SQUARE) {
   2411     bestsme = vp9_square_search(x, &mvp_full,
   2412                                 step_param,
   2413                                 sadpb, 1,
   2414                                 &cpi->fn_ptr[block_size], 1,
   2415                                 &ref_mv, tmp_mv);
   2416   } else if (cpi->sf.search_method == BIGDIA) {
   2417     bestsme = vp9_bigdia_search(x, &mvp_full,
   2418                                 step_param,
   2419                                 sadpb, 1,
   2420                                 &cpi->fn_ptr[block_size], 1,
   2421                                 &ref_mv, tmp_mv);
   2422   } else {
   2423     bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param,
   2424                                      sadpb, further_steps, 1,
   2425                                      &cpi->fn_ptr[block_size],
   2426                                      &ref_mv, tmp_mv);
   2427   }
   2428 
   2429   x->mv_col_min = tmp_col_min;
   2430   x->mv_col_max = tmp_col_max;
   2431   x->mv_row_min = tmp_row_min;
   2432   x->mv_row_max = tmp_row_max;
   2433 
   2434   if (bestsme < INT_MAX) {
   2435     int dis;  /* TODO: use dis in distortion calculation later. */
   2436     unsigned int sse;
   2437     cpi->find_fractional_mv_step(x, tmp_mv, &ref_mv,
   2438                                  x->errorperbit,
   2439                                  &cpi->fn_ptr[block_size],
   2440                                  0, cpi->sf.subpel_iters_per_step,
   2441                                  x->nmvjointcost, x->mvcost,
   2442                                  &dis, &sse);
   2443   }
   2444   *rate_mv = vp9_mv_bit_cost(tmp_mv, &ref_mv,
   2445                              x->nmvjointcost, x->mvcost,
   2446                              96);
   2447 
   2448   if (cpi->sf.adaptive_motion_search && cpi->common.show_frame)
   2449     x->pred_mv[ref].as_int = tmp_mv->as_int;
   2450 
   2451   if (scaled_ref_frame) {
   2452     int i;
   2453     for (i = 0; i < MAX_MB_PLANE; i++)
   2454       xd->plane[i].pre[0] = backup_yv12[i];
   2455   }
   2456 }
   2457 
   2458 static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
   2459                                 BLOCK_SIZE bsize,
   2460                                 int_mv *frame_mv,
   2461                                 int mi_row, int mi_col,
   2462                                 int_mv single_newmv[MAX_REF_FRAMES],
   2463                                 int *rate_mv) {
   2464   int pw = 4 << b_width_log2(bsize), ph = 4 << b_height_log2(bsize);
   2465   MACROBLOCKD *xd = &x->e_mbd;
   2466   MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
   2467   int refs[2] = { mbmi->ref_frame[0],
   2468     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   2469   int_mv ref_mv[2];
   2470   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   2471   int ite;
   2472   // Prediction buffer from second frame.
   2473   uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
   2474 
   2475   // Do joint motion search in compound mode to get more accurate mv.
   2476   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
   2477   struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
   2478   struct buf_2d scaled_first_yv12;
   2479   int last_besterr[2] = {INT_MAX, INT_MAX};
   2480   YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
   2481   scaled_ref_frame[0] = get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
   2482   scaled_ref_frame[1] = get_scaled_ref_frame(cpi, mbmi->ref_frame[1]);
   2483 
   2484   ref_mv[0] = mbmi->ref_mvs[refs[0]][0];
   2485   ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
   2486 
   2487   if (scaled_ref_frame[0]) {
   2488     int i;
   2489     // Swap out the reference frame for a version that's been scaled to
   2490     // match the resolution of the current frame, allowing the existing
   2491     // motion search code to be used without additional modifications.
   2492     for (i = 0; i < MAX_MB_PLANE; i++)
   2493       backup_yv12[i] = xd->plane[i].pre[0];
   2494     setup_pre_planes(xd, 0, scaled_ref_frame[0], mi_row, mi_col, NULL);
   2495   }
   2496 
   2497   if (scaled_ref_frame[1]) {
   2498     int i;
   2499     for (i = 0; i < MAX_MB_PLANE; i++)
   2500       backup_second_yv12[i] = xd->plane[i].pre[1];
   2501 
   2502     setup_pre_planes(xd, 0, scaled_ref_frame[1], mi_row, mi_col, NULL);
   2503   }
   2504 
   2505   xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
   2506                                          mi_row, mi_col);
   2507   xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
   2508                                          mi_row, mi_col);
   2509   scaled_first_yv12 = xd->plane[0].pre[0];
   2510 
   2511   // Initialize mv using single prediction mode result.
   2512   frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
   2513   frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
   2514 
    2515   // Run the joint search iteratively, alternating between the two ref
    2516   // frames; break out of the loop once an iteration finds no better mv.
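           // Iteration pattern (id = ite % 2): ite 0 refines the mv for
           // refs[0] against a fixed prediction built from refs[1]'s current
           // mv, ite 1 does the reverse with the newly refined refs[0], and
           // so on, for at most 4 passes or until last_besterr[id] stops
           // improving.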
   2517   for (ite = 0; ite < 4; ite++) {
   2518     struct buf_2d ref_yv12[2];
   2519     int bestsme = INT_MAX;
   2520     int sadpb = x->sadperbit16;
   2521     int_mv tmp_mv;
   2522     int search_range = 3;
   2523 
   2524     int tmp_col_min = x->mv_col_min;
   2525     int tmp_col_max = x->mv_col_max;
   2526     int tmp_row_min = x->mv_row_min;
   2527     int tmp_row_max = x->mv_row_max;
   2528     int id = ite % 2;
   2529 
    2530     // Initialized here because of a compiler problem in Visual Studio.
   2531     ref_yv12[0] = xd->plane[0].pre[0];
   2532     ref_yv12[1] = xd->plane[0].pre[1];
   2533 
   2534     // Get pred block from second frame.
   2535     vp9_build_inter_predictor(ref_yv12[!id].buf,
   2536                               ref_yv12[!id].stride,
   2537                               second_pred, pw,
   2538                               &frame_mv[refs[!id]].as_mv,
   2539                               &xd->scale_factor[!id],
   2540                               pw, ph, 0,
   2541                               &xd->subpix, MV_PRECISION_Q3);
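             // The prediction for the other (!id) reference is built once per
             // iteration at 1/8-pel precision (MV_PRECISION_Q3) and then held
             // fixed in second_pred while this reference's mv is refined.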
   2542 
    2543     // Compound motion search on the ref frame selected by id.
   2544     if (id)
   2545       xd->plane[0].pre[0] = ref_yv12[id];
   2546     vp9_clamp_mv_min_max(x, &ref_mv[id].as_mv);
   2547 
   2548     // Use mv result from single mode as mvp.
   2549     tmp_mv.as_int = frame_mv[refs[id]].as_int;
   2550 
   2551     tmp_mv.as_mv.col >>= 3;
   2552     tmp_mv.as_mv.row >>= 3;
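             // The >> 3 converts the mv from 1/8-pel units to full-pel units
             // for the full-pixel search below, e.g. a 1/8-pel mv of (16, -24)
             // becomes a full-pel (2, -3).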
   2553 
   2554     // Small-range full-pixel motion search
   2555     bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
   2556                                        search_range,
   2557                                        &cpi->fn_ptr[block_size],
   2558                                        x->nmvjointcost, x->mvcost,
   2559                                        &ref_mv[id], second_pred,
   2560                                        pw, ph);
   2561 
   2562     x->mv_col_min = tmp_col_min;
   2563     x->mv_col_max = tmp_col_max;
   2564     x->mv_row_min = tmp_row_min;
   2565     x->mv_row_max = tmp_row_max;
   2566 
   2567     if (bestsme < INT_MAX) {
   2568       int dis; /* TODO: use dis in distortion calculation later. */
   2569       unsigned int sse;
   2570 
   2571       bestsme = cpi->find_fractional_mv_step_comp(
   2572           x, &tmp_mv,
   2573           &ref_mv[id],
   2574           x->errorperbit,
   2575           &cpi->fn_ptr[block_size],
   2576           0, cpi->sf.subpel_iters_per_step,
   2577           x->nmvjointcost, x->mvcost,
   2578           &dis, &sse, second_pred,
   2579           pw, ph);
   2580     }
   2581 
   2582     if (id)
   2583       xd->plane[0].pre[0] = scaled_first_yv12;
   2584 
   2585     if (bestsme < last_besterr[id]) {
   2586       frame_mv[refs[id]].as_int = tmp_mv.as_int;
   2587       last_besterr[id] = bestsme;
   2588     } else {
   2589       break;
   2590     }
   2591   }
   2592 
   2593   // restore the predictor
   2594   if (scaled_ref_frame[0]) {
   2595     int i;
   2596     for (i = 0; i < MAX_MB_PLANE; i++)
   2597       xd->plane[i].pre[0] = backup_yv12[i];
   2598   }
   2599 
   2600   if (scaled_ref_frame[1]) {
   2601     int i;
   2602     for (i = 0; i < MAX_MB_PLANE; i++)
   2603       xd->plane[i].pre[1] = backup_second_yv12[i];
   2604   }
   2605   *rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]],
   2606                               &mbmi->ref_mvs[refs[0]][0],
   2607                               x->nmvjointcost, x->mvcost, 96);
   2608   *rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]],
   2609                               &mbmi->ref_mvs[refs[1]][0],
   2610                               x->nmvjointcost, x->mvcost, 96);
   2611 
   2612   vpx_free(second_pred);
   2613 }
   2614 
   2615 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   2616                                  BLOCK_SIZE bsize,
   2617                                  int64_t txfm_cache[],
   2618                                  int *rate2, int64_t *distortion,
   2619                                  int *skippable,
   2620                                  int *rate_y, int64_t *distortion_y,
   2621                                  int *rate_uv, int64_t *distortion_uv,
   2622                                  int *mode_excluded, int *disable_skip,
   2623                                  INTERPOLATIONFILTERTYPE *best_filter,
   2624                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
   2625                                  int mi_row, int mi_col,
   2626                                  int_mv single_newmv[MAX_REF_FRAMES],
   2627                                  int64_t *psse,
   2628                                  const int64_t ref_best_rd) {
   2629   VP9_COMMON *cm = &cpi->common;
   2630   MACROBLOCKD *xd = &x->e_mbd;
   2631   MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
   2632   const int is_comp_pred = (mbmi->ref_frame[1] > 0);
   2633   const int num_refs = is_comp_pred ? 2 : 1;
   2634   const int this_mode = mbmi->mode;
   2635   int_mv *frame_mv = mode_mv[this_mode];
   2636   int i;
   2637   int refs[2] = { mbmi->ref_frame[0],
   2638     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   2639   int_mv cur_mv[2];
   2640   int64_t this_rd = 0;
   2641   DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
   2642   int pred_exists = 0;
   2643   int intpel_mv;
   2644   int64_t rd, best_rd = INT64_MAX;
   2645   int best_needs_copy = 0;
   2646   uint8_t *orig_dst[MAX_MB_PLANE];
   2647   int orig_dst_stride[MAX_MB_PLANE];
   2648   int rs = 0;
   2649 
   2650   if (this_mode == NEWMV) {
   2651     int rate_mv;
   2652     if (is_comp_pred) {
   2653       // Initialize mv using single prediction mode result.
   2654       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
   2655       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
   2656 
   2657       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
   2658         joint_motion_search(cpi, x, bsize, frame_mv,
   2659                             mi_row, mi_col, single_newmv, &rate_mv);
   2660       } else {
   2661         rate_mv  = vp9_mv_bit_cost(&frame_mv[refs[0]],
   2662                                    &mbmi->ref_mvs[refs[0]][0],
   2663                                    x->nmvjointcost, x->mvcost, 96);
   2664         rate_mv += vp9_mv_bit_cost(&frame_mv[refs[1]],
   2665                                    &mbmi->ref_mvs[refs[1]][0],
   2666                                    x->nmvjointcost, x->mvcost, 96);
   2667       }
   2668       if (frame_mv[refs[0]].as_int == INVALID_MV ||
   2669           frame_mv[refs[1]].as_int == INVALID_MV)
   2670         return INT64_MAX;
   2671       *rate2 += rate_mv;
   2672     } else {
   2673       int_mv tmp_mv;
   2674       single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv);
   2675       *rate2 += rate_mv;
   2676       frame_mv[refs[0]].as_int =
   2677           xd->this_mi->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
   2678       single_newmv[refs[0]].as_int = tmp_mv.as_int;
   2679     }
   2680   }
   2681 
    2682   // If the mode is near/nearest and the mv is (0,0), compare to zeromv
   2683   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
   2684       frame_mv[refs[0]].as_int == 0 &&
   2685       !vp9_segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
   2686       (num_refs == 1 || frame_mv[refs[1]].as_int == 0)) {
   2687     int rfc = mbmi->mode_context[mbmi->ref_frame[0]];
   2688     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
   2689     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
   2690     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
   2691 
   2692     if (this_mode == NEARMV) {
   2693       if (c1 > c3)
   2694         return INT64_MAX;
   2695     } else if (this_mode == NEARESTMV) {
   2696       if (c2 > c3)
   2697         return INT64_MAX;
   2698     } else {
   2699       assert(this_mode == ZEROMV);
   2700       if (num_refs == 1) {
   2701         if ((c3 >= c2 &&
   2702              mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0) ||
   2703             (c3 >= c1 &&
   2704              mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0))
   2705           return INT64_MAX;
   2706       } else {
   2707         if ((c3 >= c2 &&
   2708              mode_mv[NEARESTMV][mbmi->ref_frame[0]].as_int == 0 &&
   2709              mode_mv[NEARESTMV][mbmi->ref_frame[1]].as_int == 0) ||
   2710             (c3 >= c1 &&
   2711              mode_mv[NEARMV][mbmi->ref_frame[0]].as_int == 0 &&
   2712              mode_mv[NEARMV][mbmi->ref_frame[1]].as_int == 0))
   2713           return INT64_MAX;
   2714       }
   2715     }
   2716   }
   2717 
   2718   for (i = 0; i < num_refs; ++i) {
   2719     cur_mv[i] = frame_mv[refs[i]];
    2720     // Clip "next_nearest" so that it does not extend too far out of the image
   2721     if (this_mode != NEWMV)
   2722       clamp_mv2(&cur_mv[i].as_mv, xd);
   2723 
   2724     if (mv_check_bounds(x, &cur_mv[i]))
   2725       return INT64_MAX;
   2726     mbmi->mv[i].as_int = cur_mv[i].as_int;
   2727   }
   2728 
    2729   // Do the first prediction into the destination buffer, and the next
    2730   // prediction into a temporary buffer. Then keep track of which one
    2731   // of these currently holds the best predictor, and use the other
    2732   // one for future predictions. In the end, copy from tmp_buf to
    2733   // dst if necessary.
   2734   for (i = 0; i < MAX_MB_PLANE; i++) {
   2735     orig_dst[i] = xd->plane[i].dst.buf;
   2736     orig_dst_stride[i] = xd->plane[i].dst.stride;
   2737   }
   2738 
   2739   /* We don't include the cost of the second reference here, because there
   2740    * are only three options: Last/Golden, ARF/Last or Golden/ARF, or in other
   2741    * words if you present them in that order, the second one is always known
   2742    * if the first is known */
   2743   *rate2 += cost_mv_ref(cpi, this_mode,
   2744                         mbmi->mode_context[mbmi->ref_frame[0]]);
   2745 
   2746   if (!(*mode_excluded)) {
   2747     if (is_comp_pred) {
   2748       *mode_excluded = (cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY);
   2749     } else {
   2750       *mode_excluded = (cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY);
   2751     }
   2752   }
   2753 
   2754   pred_exists = 0;
   2755   // Are all MVs integer pel for Y and UV
   2756   intpel_mv = (mbmi->mv[0].as_mv.row & 15) == 0 &&
   2757       (mbmi->mv[0].as_mv.col & 15) == 0;
   2758   if (is_comp_pred)
   2759     intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 &&
   2760         (mbmi->mv[1].as_mv.col & 15) == 0;
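           // MVs are stored in 1/8-pel units, so (v & 15) == 0 means the
           // component is a multiple of 16 eighth-pels, i.e. an even number of
           // full pels; with 4:2:0 subsampling this guarantees an integer-pel
           // mv in the half-resolution chroma planes as well as in luma.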
    2761   // Search for the best switchable filter by checking the variance of the
    2762   // prediction error, irrespective of whether the filter will actually be used
   2763   if (cm->mcomp_filter_type != BILINEAR) {
   2764     *best_filter = EIGHTTAP;
   2765     if (x->source_variance <
   2766         cpi->sf.disable_filter_search_var_thresh) {
   2767       *best_filter = EIGHTTAP;
   2768       vp9_zero(cpi->rd_filter_cache);
   2769     } else {
   2770       int i, newbest;
   2771       int tmp_rate_sum = 0;
   2772       int64_t tmp_dist_sum = 0;
   2773 
   2774       cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
   2775       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
   2776         int j;
   2777         int64_t rs_rd;
   2778         mbmi->interp_filter = i;
   2779         vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
   2780         rs = get_switchable_rate(x);
   2781         rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
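                 // RDCOST folds rate and distortion into one Lagrangian cost,
                 // roughly D + lambda * R with lambda derived from x->rdmult
                 // and x->rddiv, so rs_rd is the cost of signaling the filter
                 // choice alone (rate rs, zero distortion).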
   2782 
   2783         if (i > 0 && intpel_mv) {
   2784           cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
   2785                                            tmp_rate_sum, tmp_dist_sum);
   2786           cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
   2787               MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
   2788                   cpi->rd_filter_cache[i] + rs_rd);
   2789           rd = cpi->rd_filter_cache[i];
   2790           if (cm->mcomp_filter_type == SWITCHABLE)
   2791             rd += rs_rd;
   2792         } else {
   2793           int rate_sum = 0;
   2794           int64_t dist_sum = 0;
   2795           if ((cm->mcomp_filter_type == SWITCHABLE &&
   2796                (!i || best_needs_copy)) ||
   2797               (cm->mcomp_filter_type != SWITCHABLE &&
   2798                (cm->mcomp_filter_type == mbmi->interp_filter ||
   2799                 (i == 0 && intpel_mv)))) {
   2800             for (j = 0; j < MAX_MB_PLANE; j++) {
   2801               xd->plane[j].dst.buf = orig_dst[j];
   2802               xd->plane[j].dst.stride = orig_dst_stride[j];
   2803             }
   2804           } else {
   2805             for (j = 0; j < MAX_MB_PLANE; j++) {
   2806               xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
   2807               xd->plane[j].dst.stride = 64;
   2808             }
   2809           }
   2810           vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   2811           model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
   2812           cpi->rd_filter_cache[i] = RDCOST(x->rdmult, x->rddiv,
   2813                                            rate_sum, dist_sum);
   2814           cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
   2815               MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
   2816                   cpi->rd_filter_cache[i] + rs_rd);
   2817           rd = cpi->rd_filter_cache[i];
   2818           if (cm->mcomp_filter_type == SWITCHABLE)
   2819             rd += rs_rd;
   2820           if (i == 0 && intpel_mv) {
   2821             tmp_rate_sum = rate_sum;
   2822             tmp_dist_sum = dist_sum;
   2823           }
   2824         }
   2825         if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
   2826           if (rd / 2 > ref_best_rd) {
   2827             for (i = 0; i < MAX_MB_PLANE; i++) {
   2828               xd->plane[i].dst.buf = orig_dst[i];
   2829               xd->plane[i].dst.stride = orig_dst_stride[i];
   2830             }
   2831             return INT64_MAX;
   2832           }
   2833         }
   2834         newbest = i == 0 || rd < best_rd;
   2835 
   2836         if (newbest) {
   2837           best_rd = rd;
   2838           *best_filter = mbmi->interp_filter;
   2839           if (cm->mcomp_filter_type == SWITCHABLE && i && !intpel_mv)
   2840             best_needs_copy = !best_needs_copy;
   2841         }
   2842 
   2843         if ((cm->mcomp_filter_type == SWITCHABLE && newbest) ||
   2844             (cm->mcomp_filter_type != SWITCHABLE &&
   2845              cm->mcomp_filter_type == mbmi->interp_filter)) {
   2846           pred_exists = 1;
   2847         }
   2848       }
   2849 
   2850       for (i = 0; i < MAX_MB_PLANE; i++) {
   2851         xd->plane[i].dst.buf = orig_dst[i];
   2852         xd->plane[i].dst.stride = orig_dst_stride[i];
   2853       }
   2854     }
   2855   }
   2856   // Set the appropriate filter
   2857   mbmi->interp_filter = cm->mcomp_filter_type != SWITCHABLE ?
   2858       cm->mcomp_filter_type : *best_filter;
   2859   vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
   2860   rs = cm->mcomp_filter_type == SWITCHABLE ? get_switchable_rate(x) : 0;
   2861 
   2862   if (pred_exists) {
   2863     if (best_needs_copy) {
   2864       // again temporarily set the buffers to local memory to prevent a memcpy
   2865       for (i = 0; i < MAX_MB_PLANE; i++) {
   2866         xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
   2867         xd->plane[i].dst.stride = 64;
   2868       }
   2869     }
   2870   } else {
    2871     // Handles the special case when a filter that is not in the
    2872     // switchable list (e.g. bilinear, 6-tap) is indicated at the frame level
   2873     vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
   2874   }
   2875 
   2876 
   2877   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
   2878     int tmp_rate;
   2879     int64_t tmp_dist;
   2880     model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist);
   2881     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
    2882     // If the rd modeled from the current prediction error is substantially
    2883     // higher than the best rd so far, do not bother doing the full rd search
   2884     if (rd / 2 > ref_best_rd) {
   2885       for (i = 0; i < MAX_MB_PLANE; i++) {
   2886         xd->plane[i].dst.buf = orig_dst[i];
   2887         xd->plane[i].dst.stride = orig_dst_stride[i];
   2888       }
   2889       return INT64_MAX;
   2890     }
   2891   }
   2892 
   2893   if (cpi->common.mcomp_filter_type == SWITCHABLE)
   2894     *rate2 += get_switchable_rate(x);
   2895 
   2896   if (!is_comp_pred && cpi->enable_encode_breakout) {
   2897     if (cpi->active_map_enabled && x->active_ptr[0] == 0)
   2898       x->skip = 1;
   2899     else if (x->encode_breakout) {
   2900       const BLOCK_SIZE y_size = get_plane_block_size(bsize, &xd->plane[0]);
   2901       const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]);
   2902       unsigned int var, sse;
   2903       // Skipping threshold for ac.
   2904       unsigned int thresh_ac;
   2905       // The encode_breakout input
   2906       unsigned int encode_breakout = x->encode_breakout << 4;
   2907       int max_thresh = 36000;
   2908 
    2909       // Use an extremely low threshold for static frames to limit skipping.
   2910       if (cpi->enable_encode_breakout == 2)
   2911         max_thresh = 128;
   2912 
   2913       // Calculate threshold according to dequant value.
   2914       thresh_ac = (xd->plane[0].dequant[1] * xd->plane[0].dequant[1]) / 9;
   2915 
   2916       // Use encode_breakout input if it is bigger than internal threshold.
   2917       if (thresh_ac < encode_breakout)
   2918         thresh_ac = encode_breakout;
   2919 
   2920       // Set a maximum for threshold to avoid big PSNR loss in low bitrate case.
   2921       if (thresh_ac > max_thresh)
   2922         thresh_ac = max_thresh;
   2923 
   2924       var = cpi->fn_ptr[y_size].vf(x->plane[0].src.buf, x->plane[0].src.stride,
   2925                                    xd->plane[0].dst.buf,
   2926                                    xd->plane[0].dst.stride, &sse);
   2927 
   2928       // Adjust threshold according to partition size.
   2929       thresh_ac >>= 8 - (b_width_log2_lookup[bsize] +
   2930           b_height_log2_lookup[bsize]);
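               // Net effect, assuming an AC dequant step of q: thresh_ac is
               // roughly max(q * q / 9, encode_breakout << 4), capped at
               // max_thresh and then scaled down by partition area (shift 0
               // for 64x64 blocks down to shift 6 for 8x8).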
   2931 
   2932       // Y skipping condition checking
   2933       if (sse < thresh_ac || sse == 0) {
   2934         // Skipping threshold for dc
   2935         unsigned int thresh_dc;
   2936 
   2937         thresh_dc = (xd->plane[0].dequant[0] * xd->plane[0].dequant[0] >> 6);
   2938 
   2939         // dc skipping checking
   2940         if ((sse - var) < thresh_dc || sse == var) {
   2941           unsigned int sse_u, sse_v;
   2942           unsigned int var_u, var_v;
   2943 
   2944           var_u = cpi->fn_ptr[uv_size].vf(x->plane[1].src.buf,
   2945                                           x->plane[1].src.stride,
   2946                                           xd->plane[1].dst.buf,
   2947                                           xd->plane[1].dst.stride, &sse_u);
   2948 
   2949           // U skipping condition checking
   2950           if ((sse_u * 4 < thresh_ac || sse_u == 0) &&
   2951               (sse_u - var_u < thresh_dc || sse_u == var_u)) {
   2952             var_v = cpi->fn_ptr[uv_size].vf(x->plane[2].src.buf,
   2953                                             x->plane[2].src.stride,
   2954                                             xd->plane[2].dst.buf,
   2955                                             xd->plane[2].dst.stride, &sse_v);
   2956 
   2957             // V skipping condition checking
   2958             if ((sse_v * 4 < thresh_ac || sse_v == 0) &&
   2959                 (sse_v - var_v < thresh_dc || sse_v == var_v)) {
   2960               x->skip = 1;
   2961 
   2962               // The cost of skip bit needs to be added.
   2963               *rate2 += vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
   2964 
   2965               // Scaling factor for SSE from spatial domain to frequency domain
   2966               // is 16. Adjust distortion accordingly.
   2967               *distortion_uv = (sse_u + sse_v) << 4;
   2968               *distortion = (sse << 4) + *distortion_uv;
   2969 
   2970               *disable_skip = 1;
   2971               this_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
   2972             }
   2973           }
   2974         }
   2975       }
   2976     }
   2977   }
   2978 
   2979   if (!x->skip) {
   2980     int skippable_y, skippable_uv;
   2981     int64_t sseuv = INT64_MAX;
   2982     int64_t rdcosty = INT64_MAX;
   2983 
   2984     // Y cost and distortion
   2985     super_block_yrd(cpi, x, rate_y, distortion_y, &skippable_y, psse,
   2986                     bsize, txfm_cache, ref_best_rd);
   2987 
   2988     if (*rate_y == INT_MAX) {
   2989       *rate2 = INT_MAX;
   2990       *distortion = INT64_MAX;
   2991       for (i = 0; i < MAX_MB_PLANE; i++) {
   2992         xd->plane[i].dst.buf = orig_dst[i];
   2993         xd->plane[i].dst.stride = orig_dst_stride[i];
   2994       }
   2995       return INT64_MAX;
   2996     }
   2997 
   2998     *rate2 += *rate_y;
   2999     *distortion += *distortion_y;
   3000 
   3001     rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
   3002     rdcosty = MIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
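             // rdcosty is the cheaper of the rd cost accumulated so far and a
             // skip-style cost (zero rate, distortion = sse); the UV search
             // below gets ref_best_rd - rdcosty as its budget so that it can
             // bail out as soon as UV alone would push past the best rd.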
   3003 
   3004     super_block_uvrd(cm, x, rate_uv, distortion_uv, &skippable_uv, &sseuv,
   3005                      bsize, ref_best_rd - rdcosty);
   3006     if (*rate_uv == INT_MAX) {
   3007       *rate2 = INT_MAX;
   3008       *distortion = INT64_MAX;
   3009       for (i = 0; i < MAX_MB_PLANE; i++) {
   3010         xd->plane[i].dst.buf = orig_dst[i];
   3011         xd->plane[i].dst.stride = orig_dst_stride[i];
   3012       }
   3013       return INT64_MAX;
   3014     }
   3015 
   3016     *psse += sseuv;
   3017     *rate2 += *rate_uv;
   3018     *distortion += *distortion_uv;
   3019     *skippable = skippable_y && skippable_uv;
   3020   }
   3021 
   3022   for (i = 0; i < MAX_MB_PLANE; i++) {
   3023     xd->plane[i].dst.buf = orig_dst[i];
   3024     xd->plane[i].dst.stride = orig_dst_stride[i];
   3025   }
   3026 
    3027   return this_rd;  // if 0, this will be re-calculated by the caller
   3028 }
   3029 
   3030 void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   3031                                int *returnrate, int64_t *returndist,
   3032                                BLOCK_SIZE bsize,
   3033                                PICK_MODE_CONTEXT *ctx, int64_t best_rd) {
   3034   VP9_COMMON *const cm = &cpi->common;
   3035   MACROBLOCKD *const xd = &x->e_mbd;
   3036   int rate_y = 0, rate_uv = 0, rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
   3037   int y_skip = 0, uv_skip = 0;
   3038   int64_t dist_y = 0, dist_uv = 0, tx_cache[TX_MODES] = { 0 };
   3039   x->skip_encode = 0;
   3040   ctx->skip = 0;
   3041   xd->this_mi->mbmi.ref_frame[0] = INTRA_FRAME;
   3042   if (bsize >= BLOCK_8X8) {
   3043     if (rd_pick_intra_sby_mode(cpi, x, &rate_y, &rate_y_tokenonly,
   3044                                &dist_y, &y_skip, bsize, tx_cache,
   3045                                best_rd) >= best_rd) {
   3046       *returnrate = INT_MAX;
   3047       return;
   3048     }
   3049     rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
   3050                             &dist_uv, &uv_skip, bsize);
   3051   } else {
   3052     y_skip = 0;
   3053     if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate_y, &rate_y_tokenonly,
   3054                                      &dist_y, best_rd) >= best_rd) {
   3055       *returnrate = INT_MAX;
   3056       return;
   3057     }
   3058     rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
   3059                             &dist_uv, &uv_skip, BLOCK_8X8);
   3060   }
   3061 
   3062   if (y_skip && uv_skip) {
   3063     *returnrate = rate_y + rate_uv - rate_y_tokenonly - rate_uv_tokenonly +
   3064                   vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 1);
   3065     *returndist = dist_y + dist_uv;
   3066     vp9_zero(ctx->tx_rd_diff);
   3067   } else {
   3068     int i;
   3069     *returnrate = rate_y + rate_uv +
   3070         vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd), 0);
   3071     *returndist = dist_y + dist_uv;
   3072     if (cpi->sf.tx_size_search_method == USE_FULL_RD)
   3073       for (i = 0; i < TX_MODES; i++)
   3074         ctx->tx_rd_diff[i] = tx_cache[i] - tx_cache[cm->tx_mode];
   3075   }
   3076 
   3077   ctx->mic = *xd->this_mi;
   3078 }
   3079 
   3080 int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
   3081                                   int mi_row, int mi_col,
   3082                                   int *returnrate,
   3083                                   int64_t *returndistortion,
   3084                                   BLOCK_SIZE bsize,
   3085                                   PICK_MODE_CONTEXT *ctx,
   3086                                   int64_t best_rd_so_far) {
   3087   VP9_COMMON *cm = &cpi->common;
   3088   MACROBLOCKD *xd = &x->e_mbd;
   3089   MB_MODE_INFO *mbmi = &xd->this_mi->mbmi;
   3090   const struct segmentation *seg = &cm->seg;
   3091   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
   3092   RD_PREDICTION_MODE this_mode;
   3093   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   3094   unsigned char segment_id = mbmi->segment_id;
   3095   int comp_pred, i;
   3096   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   3097   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
   3098   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
   3099   static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
   3100                                     VP9_ALT_FLAG };
   3101   int idx_list[4] = {0,
   3102                      cpi->lst_fb_idx,
   3103                      cpi->gld_fb_idx,
   3104                      cpi->alt_fb_idx};
   3105   int64_t best_rd = best_rd_so_far;
   3106   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   3107   int64_t best_tx_rd[TX_MODES];
   3108   int64_t best_tx_diff[TX_MODES];
   3109   int64_t best_pred_diff[NB_PREDICTION_TYPES];
   3110   int64_t best_pred_rd[NB_PREDICTION_TYPES];
   3111   int64_t best_filter_rd[SWITCHABLE_FILTERS + 1];
   3112   int64_t best_filter_diff[SWITCHABLE_FILTERS + 1];
   3113   MB_MODE_INFO best_mbmode = { 0 };
   3114   int j;
   3115   int mode_index, best_mode_index = 0;
   3116   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   3117   vp9_prob comp_mode_p;
   3118   int64_t best_intra_rd = INT64_MAX;
   3119   int64_t best_inter_rd = INT64_MAX;
   3120   MB_PREDICTION_MODE best_intra_mode = DC_PRED;
   3121   // MB_PREDICTION_MODE best_inter_mode = ZEROMV;
   3122   MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
   3123   INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
   3124   int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
   3125   int64_t dist_uv[TX_SIZES];
   3126   int skip_uv[TX_SIZES];
   3127   MB_PREDICTION_MODE mode_uv[TX_SIZES];
   3128   struct scale_factors scale_factor[4];
   3129   unsigned int ref_frame_mask = 0;
   3130   unsigned int mode_mask = 0;
   3131   int64_t mode_distortions[MB_MODE_COUNT] = {-1};
   3132   int64_t frame_distortions[MAX_REF_FRAMES] = {-1};
   3133   int intra_cost_penalty = 20 * vp9_dc_quant(cpi->common.base_qindex,
   3134                                              cpi->common.y_dc_delta_q);
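           // The intra penalty scales with the DC dequant step, handicapping
           // intra modes more at higher q; e.g. a dc quant step of 8 yields a
           // penalty of 20 * 8 = 160 rate units.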
   3135   int_mv seg_mvs[4][MAX_REF_FRAMES];
   3136   union b_mode_info best_bmodes[4];
   3137   PARTITION_INFO best_partition;
   3138   const int bws = num_8x8_blocks_wide_lookup[bsize] / 2;
   3139   const int bhs = num_8x8_blocks_high_lookup[bsize] / 2;
   3140   int best_skip2 = 0;
   3141 
   3142   x->skip_encode = cpi->sf.skip_encode_frame && xd->q_index < QIDX_SKIP_THRESH;
   3143 
   3144   for (i = 0; i < 4; i++) {
   3145     int j;
   3146     for (j = 0; j < MAX_REF_FRAMES; j++)
   3147       seg_mvs[i][j].as_int = INVALID_MV;
   3148   }
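           // The {-1} brace initializers above set only element 0 to -1 and
           // zero-fill the rest, while the tests further down treat -1 as
           // "not yet set"; fill both arrays explicitly so every entry starts
           // out at -1.
           for (i = 0; i < MB_MODE_COUNT; ++i)
             mode_distortions[i] = -1;
           for (i = 0; i < MAX_REF_FRAMES; ++i)
             frame_distortions[i] = -1;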
    3149   // Wherever a flag bit is set, the error is much higher than at its neighbors.
   3150   ctx->frames_with_high_error = 0;
   3151   ctx->modes_with_high_error = 0;
   3152 
   3153   estimate_ref_frame_costs(cpi, segment_id, ref_costs_single, ref_costs_comp,
   3154                            &comp_mode_p);
   3155 
   3156   for (i = 0; i < NB_PREDICTION_TYPES; ++i)
   3157     best_pred_rd[i] = INT64_MAX;
   3158   for (i = 0; i < TX_MODES; i++)
   3159     best_tx_rd[i] = INT64_MAX;
   3160   for (i = 0; i <= SWITCHABLE_FILTERS; i++)
   3161     best_filter_rd[i] = INT64_MAX;
   3162   for (i = 0; i < TX_SIZES; i++)
   3163     rate_uv_intra[i] = INT_MAX;
   3164 
   3165   *returnrate = INT_MAX;
   3166 
    3167   // Create masks of the ref frames and modes that produced high error at
    3168   // the smaller partition sizes; inverted below, a set bit = worth testing.
   3169   if (cpi->sf.use_avoid_tested_higherror) {
   3170     switch (block_size) {
   3171       case BLOCK_64X64:
   3172         for (i = 0; i < 4; i++) {
   3173           for (j = 0; j < 4; j++) {
   3174             ref_frame_mask |= x->mb_context[i][j].frames_with_high_error;
   3175             mode_mask |= x->mb_context[i][j].modes_with_high_error;
   3176           }
   3177         }
   3178         for (i = 0; i < 4; i++) {
   3179           ref_frame_mask |= x->sb32_context[i].frames_with_high_error;
   3180           mode_mask |= x->sb32_context[i].modes_with_high_error;
   3181         }
   3182         break;
   3183       case BLOCK_32X32:
   3184         for (i = 0; i < 4; i++) {
   3185           ref_frame_mask |=
   3186               x->mb_context[xd->sb_index][i].frames_with_high_error;
   3187           mode_mask |= x->mb_context[xd->sb_index][i].modes_with_high_error;
   3188         }
   3189         break;
   3190       default:
    3191         // Until all block sizes are handled, mark everything as present.
   3192         ref_frame_mask = 0;
   3193         mode_mask = 0;
   3194         break;
   3195     }
   3196     ref_frame_mask = ~ref_frame_mask;
   3197     mode_mask = ~mode_mask;
   3198   }
   3199 
   3200   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
   3201     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
   3202       setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
   3203                          mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
   3204                          yv12_mb, scale_factor);
   3205     }
   3206     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   3207     frame_mv[ZEROMV][ref_frame].as_int = 0;
   3208   }
   3209 
   3210   for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
   3211     int mode_excluded = 0;
   3212     int64_t this_rd = INT64_MAX;
   3213     int disable_skip = 0;
   3214     int compmode_cost = 0;
   3215     int rate2 = 0, rate_y = 0, rate_uv = 0;
   3216     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
   3217     int skippable = 0;
   3218     int64_t tx_cache[TX_MODES];
   3219     int i;
   3220     int this_skip2 = 0;
    3221     int64_t total_sse = INT64_MAX;
   3222     int early_term = 0;
   3223 
   3224     for (i = 0; i < TX_MODES; ++i)
   3225       tx_cache[i] = INT64_MAX;
   3226 
   3227     x->skip = 0;
   3228     this_mode = vp9_mode_order[mode_index].mode;
   3229     ref_frame = vp9_mode_order[mode_index].ref_frame;
   3230     second_ref_frame = vp9_mode_order[mode_index].second_ref_frame;
   3231 
   3232     // Look at the reference frame of the best mode so far and set the
   3233     // skip mask to look at a subset of the remaining modes.
   3234     if (mode_index > cpi->sf.mode_skip_start) {
   3235       if (mode_index == (cpi->sf.mode_skip_start + 1)) {
   3236         switch (vp9_mode_order[best_mode_index].ref_frame) {
   3237           case INTRA_FRAME:
   3238             cpi->mode_skip_mask = 0;
   3239             break;
   3240           case LAST_FRAME:
   3241             cpi->mode_skip_mask = LAST_FRAME_MODE_MASK;
   3242             break;
   3243           case GOLDEN_FRAME:
   3244             cpi->mode_skip_mask = GOLDEN_FRAME_MODE_MASK;
   3245             break;
   3246           case ALTREF_FRAME:
   3247             cpi->mode_skip_mask = ALT_REF_MODE_MASK;
   3248             break;
   3249           case NONE:
   3250           case MAX_REF_FRAMES:
   3251             assert(!"Invalid Reference frame");
   3252         }
   3253       }
   3254       if (cpi->mode_skip_mask & (1 << mode_index))
   3255         continue;
   3256     }
   3257 
   3258     // Skip if the current reference frame has been masked off
   3259     if (cpi->sf.reference_masking && !cpi->set_ref_frame_mask &&
   3260         (cpi->ref_frame_mask & (1 << ref_frame)))
   3261       continue;
   3262 
   3263     // Test best rd so far against threshold for trying this mode.
   3264     if ((best_rd < ((cpi->rd_threshes[bsize][mode_index] *
   3265                      cpi->rd_thresh_freq_fact[bsize][mode_index]) >> 5)) ||
   3266         cpi->rd_threshes[bsize][mode_index] == INT_MAX)
   3267       continue;
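             // rd_thresh_freq_fact adapts the threshold to how often the mode
             // has been picked recently: after the >> 5, a factor of 32 leaves
             // the nominal rd_threshes value unchanged, and larger factors
             // make the mode progressively easier to skip.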
   3268 
    3269     // Do not allow compound prediction if the segment-level reference frame
    3270     // feature is in use, as in that case there can only be one reference.
   3271     if ((second_ref_frame > INTRA_FRAME) &&
   3272          vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
   3273       continue;
   3274 
    3275     // Skip some checks based on the small partitions' results.
   3276     if (x->fast_ms > 1 && !ref_frame)
   3277       continue;
   3278     if (x->fast_ms > 2 && ref_frame != x->subblock_ref)
   3279       continue;
   3280 
   3281     if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_8X8) {
   3282       if (!(ref_frame_mask & (1 << ref_frame))) {
   3283         continue;
   3284       }
   3285       if (!(mode_mask & (1 << this_mode))) {
   3286         continue;
   3287       }
   3288       if (second_ref_frame != NONE
   3289           && !(ref_frame_mask & (1 << second_ref_frame))) {
   3290         continue;
   3291       }
   3292     }
   3293 
   3294     mbmi->ref_frame[0] = ref_frame;
   3295     mbmi->ref_frame[1] = second_ref_frame;
   3296 
   3297     if (!(ref_frame == INTRA_FRAME
   3298         || (cpi->ref_frame_flags & flag_list[ref_frame]))) {
   3299       continue;
   3300     }
   3301     if (!(second_ref_frame == NONE
   3302         || (cpi->ref_frame_flags & flag_list[second_ref_frame]))) {
   3303       continue;
   3304     }
   3305 
   3306     comp_pred = second_ref_frame > INTRA_FRAME;
   3307     if (comp_pred) {
   3308       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
   3309         if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
   3310           continue;
   3311       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
   3312         if (ref_frame != best_inter_ref_frame &&
   3313             second_ref_frame != best_inter_ref_frame)
   3314           continue;
   3315     }
   3316     // TODO(jingning, jkoleszar): scaling reference frame not supported for
   3317     // SPLITMV.
   3318     if (ref_frame > 0 &&
   3319         vp9_is_scaled(&scale_factor[ref_frame]) &&
   3320         this_mode == RD_SPLITMV)
   3321       continue;
   3322 
   3323     if (second_ref_frame > 0 &&
   3324         vp9_is_scaled(&scale_factor[second_ref_frame]) &&
   3325         this_mode == RD_SPLITMV)
   3326       continue;
   3327 
   3328     if (bsize >= BLOCK_8X8 &&
   3329         (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
   3330       continue;
   3331 
   3332     if (bsize < BLOCK_8X8 &&
   3333         !(this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
   3334       continue;
   3335 
   3336     set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
   3337     mbmi->uv_mode = DC_PRED;
   3338 
   3339     // Evaluate all sub-pel filters irrespective of whether we can use
   3340     // them for this frame.
   3341     mbmi->interp_filter = cm->mcomp_filter_type;
   3342     vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
   3343 
   3344     if (comp_pred) {
   3345       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
   3346         continue;
   3347       set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
   3348 
   3349       mode_excluded = mode_excluded
   3350                          ? mode_excluded
   3351                          : cm->comp_pred_mode == SINGLE_PREDICTION_ONLY;
   3352     } else {
   3353       if (ref_frame != INTRA_FRAME && second_ref_frame != INTRA_FRAME) {
   3354         mode_excluded =
   3355             mode_excluded ?
   3356                 mode_excluded : cm->comp_pred_mode == COMP_PREDICTION_ONLY;
   3357       }
   3358     }
   3359 
   3360     // Select prediction reference frames.
   3361     for (i = 0; i < MAX_MB_PLANE; i++) {
   3362       xd->plane[i].pre[0] = yv12_mb[ref_frame][i];
   3363       if (comp_pred)
   3364         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
   3365     }
   3366 
    3367     // If the segment-level reference frame feature is enabled, skip this
    3368     // mode when the current ref frame is not the allowed one.
   3369     if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
   3370         vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
   3371             (int)ref_frame) {
   3372       continue;
    3373     // If the segment-level skip feature is enabled, only ZEROMV and
    3374     // intra modes are worth considering.
   3375     } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
   3376                (this_mode != RD_ZEROMV && ref_frame != INTRA_FRAME)) {
   3377       continue;
   3378     // Disable this drop out case if the ref frame
   3379     // segment level feature is enabled for this segment. This is to
   3380     // prevent the possibility that we end up unable to pick any mode.
   3381     } else if (!vp9_segfeature_active(seg, segment_id,
   3382                                       SEG_LVL_REF_FRAME)) {
   3383       // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
   3384       // unless ARNR filtering is enabled in which case we want
   3385       // an unfiltered alternative. We allow near/nearest as well
   3386       // because they may result in zero-zero MVs but be cheaper.
   3387       if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
   3388         if ((this_mode != RD_ZEROMV &&
   3389              !(this_mode == RD_NEARMV &&
   3390                frame_mv[RD_NEARMV][ALTREF_FRAME].as_int == 0) &&
   3391              !(this_mode == RD_NEARESTMV &&
   3392                frame_mv[RD_NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
   3393             ref_frame != ALTREF_FRAME) {
   3394           continue;
   3395         }
   3396       }
   3397     }
    3398     // TODO(JBB): This makes up for the fact that we don't have sad
    3399     // functions that work when the block size reads outside the umv.  We
    3400     // should fix this, first by making the motion search work on a
    3401     // representative block at the boundary, and then by implementing a
    3402     // sad function that works when inside the border.
   3403     if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
   3404         this_mode == RD_NEWMV) {
   3405       continue;
   3406     }
   3407 
   3408 #ifdef MODE_TEST_HIT_STATS
   3409     // TEST/DEBUG CODE
    3410     // Keep a record of the number of test hits at each size
   3411     cpi->mode_test_hits[bsize]++;
   3412 #endif
   3413 
   3414     if (this_mode == RD_I4X4_PRED) {
   3415       int rate;
   3416 
   3417       /*
   3418       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
   3419           (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME))
   3420         continue;
   3421         */
   3422 
   3423       // RD_I4X4_PRED is only considered for block sizes less than 8x8.
   3424       mbmi->tx_size = TX_4X4;
   3425       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
   3426                                        &distortion_y, best_rd) >= best_rd)
   3427         continue;
   3428       rate2 += rate;
   3429       rate2 += intra_cost_penalty;
   3430       distortion2 += distortion_y;
   3431 
   3432       if (rate_uv_intra[TX_4X4] == INT_MAX) {
   3433         choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
   3434                              &rate_uv_tokenonly[TX_4X4],
   3435                              &dist_uv[TX_4X4], &skip_uv[TX_4X4],
   3436                              &mode_uv[TX_4X4]);
   3437       }
   3438       rate2 += rate_uv_intra[TX_4X4];
   3439       rate_uv = rate_uv_tokenonly[TX_4X4];
   3440       distortion2 += dist_uv[TX_4X4];
   3441       distortion_uv = dist_uv[TX_4X4];
   3442       mbmi->uv_mode = mode_uv[TX_4X4];
   3443       tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   3444       for (i = 0; i < TX_MODES; ++i)
   3445         tx_cache[i] = tx_cache[ONLY_4X4];
   3446     } else if (ref_frame == INTRA_FRAME) {
   3447       TX_SIZE uv_tx;
   3448       // Disable intra modes other than DC_PRED for blocks with low variance
   3449       // Threshold for intra skipping based on source variance
   3450       // TODO(debargha): Specialize the threshold for super block sizes
   3451       static const int skip_intra_var_thresh[BLOCK_SIZES] = {
   3452         64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
   3453       };
   3454       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
   3455           this_mode != RD_DC_PRED &&
   3456           x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
   3457         continue;
   3458       // Only search the oblique modes if the best so far is
   3459       // one of the neighboring directional modes
   3460       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
   3461           (this_mode >= RD_D45_PRED && this_mode <= RD_TM_PRED)) {
   3462         if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
   3463           continue;
   3464       }
   3465       mbmi->mode = rd_mode_to_mode(this_mode);
   3466       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
   3467         if (conditional_skipintra(mbmi->mode, best_intra_mode))
    3468           continue;
   3469       }
   3470 
   3471       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
   3472                       bsize, tx_cache, best_rd);
   3473 
   3474       if (rate_y == INT_MAX)
   3475         continue;
   3476 
   3477       uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]);
   3478       if (rate_uv_intra[uv_tx] == INT_MAX) {
   3479         choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
   3480                              &rate_uv_tokenonly[uv_tx],
   3481                              &dist_uv[uv_tx], &skip_uv[uv_tx],
   3482                              &mode_uv[uv_tx]);
   3483       }
   3484 
   3485       rate_uv = rate_uv_tokenonly[uv_tx];
   3486       distortion_uv = dist_uv[uv_tx];
   3487       skippable = skippable && skip_uv[uv_tx];
   3488       mbmi->uv_mode = mode_uv[uv_tx];
   3489 
   3490       rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
   3491       if (this_mode != RD_DC_PRED && this_mode != RD_TM_PRED)
   3492         rate2 += intra_cost_penalty;
   3493       distortion2 = distortion_y + distortion_uv;
   3494     } else if (this_mode == RD_SPLITMV) {
   3495       const int is_comp_pred = second_ref_frame > 0;
   3496       int rate;
   3497       int64_t distortion;
   3498       int64_t this_rd_thresh;
   3499       int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
   3500       int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
    3501       int64_t tmp_best_distortion = INT64_MAX, tmp_best_sse, uv_sse;
   3502       int tmp_best_skippable = 0;
   3503       int switchable_filter_index;
   3504       int_mv *second_ref = is_comp_pred ?
   3505           &mbmi->ref_mvs[second_ref_frame][0] : NULL;
   3506       union b_mode_info tmp_best_bmodes[16];
   3507       MB_MODE_INFO tmp_best_mbmode;
   3508       PARTITION_INFO tmp_best_partition;
   3509       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
   3510       int pred_exists = 0;
   3511       int uv_skippable;
   3512       if (is_comp_pred) {
   3513         if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_BESTINTRA)
   3514           if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME)
   3515             continue;
   3516         if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_COMP_REFMISMATCH)
   3517           if (ref_frame != best_inter_ref_frame &&
   3518               second_ref_frame != best_inter_ref_frame)
   3519             continue;
   3520       }
   3521 
   3522       this_rd_thresh = (ref_frame == LAST_FRAME) ?
   3523           cpi->rd_threshes[bsize][THR_NEWMV] :
   3524           cpi->rd_threshes[bsize][THR_NEWA];
   3525       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
   3526           cpi->rd_threshes[bsize][THR_NEWG] : this_rd_thresh;
   3527       xd->this_mi->mbmi.tx_size = TX_4X4;
   3528 
   3529       cpi->rd_filter_cache[SWITCHABLE_FILTERS] = INT64_MAX;
   3530       if (cm->mcomp_filter_type != BILINEAR) {
   3531         tmp_best_filter = EIGHTTAP;
   3532         if (x->source_variance <
   3533             cpi->sf.disable_filter_search_var_thresh) {
   3534           tmp_best_filter = EIGHTTAP;
   3535           vp9_zero(cpi->rd_filter_cache);
   3536         } else {
   3537           for (switchable_filter_index = 0;
   3538                switchable_filter_index < SWITCHABLE_FILTERS;
   3539                ++switchable_filter_index) {
   3540             int newbest, rs;
   3541             int64_t rs_rd;
   3542             mbmi->interp_filter = switchable_filter_index;
   3543             vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
   3544 
   3545             tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
   3546                                                  &mbmi->ref_mvs[ref_frame][0],
   3547                                                  second_ref,
   3548                                                  best_yrd,
   3549                                                  &rate, &rate_y, &distortion,
   3550                                                  &skippable, &total_sse,
   3551                                                  (int)this_rd_thresh, seg_mvs,
   3552                                                  bsi, switchable_filter_index,
   3553                                                  mi_row, mi_col);
   3554 
   3555             if (tmp_rd == INT64_MAX)
   3556               continue;
   3557             cpi->rd_filter_cache[switchable_filter_index] = tmp_rd;
   3558             rs = get_switchable_rate(x);
   3559             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
   3560             cpi->rd_filter_cache[SWITCHABLE_FILTERS] =
   3561                 MIN(cpi->rd_filter_cache[SWITCHABLE_FILTERS],
   3562                     tmp_rd + rs_rd);
   3563             if (cm->mcomp_filter_type == SWITCHABLE)
   3564               tmp_rd += rs_rd;
   3565 
   3566             newbest = (tmp_rd < tmp_best_rd);
   3567             if (newbest) {
   3568               tmp_best_filter = mbmi->interp_filter;
   3569               tmp_best_rd = tmp_rd;
   3570             }
   3571             if ((newbest && cm->mcomp_filter_type == SWITCHABLE) ||
   3572                 (mbmi->interp_filter == cm->mcomp_filter_type &&
   3573                  cm->mcomp_filter_type != SWITCHABLE)) {
   3574               tmp_best_rdu = tmp_rd;
   3575               tmp_best_rate = rate;
   3576               tmp_best_ratey = rate_y;
   3577               tmp_best_distortion = distortion;
   3578               tmp_best_sse = total_sse;
   3579               tmp_best_skippable = skippable;
   3580               tmp_best_mbmode = *mbmi;
   3581               tmp_best_partition = *x->partition_info;
   3582               for (i = 0; i < 4; i++)
   3583                 tmp_best_bmodes[i] = xd->this_mi->bmi[i];
   3584               pred_exists = 1;
   3585               if (switchable_filter_index == 0 &&
   3586                   cpi->sf.use_rd_breakout &&
   3587                   best_rd < INT64_MAX) {
   3588                 if (tmp_best_rdu / 2 > best_rd) {
   3589                   // skip searching the other filters if the first is
   3590                   // already substantially larger than the best so far
   3591                   tmp_best_filter = mbmi->interp_filter;
   3592                   tmp_best_rdu = INT64_MAX;
   3593                   break;
   3594                 }
   3595               }
   3596             }
   3597           }  // switchable_filter_index loop
   3598         }
   3599       }
   3600 
   3601       if (tmp_best_rdu == INT64_MAX)
   3602         continue;
   3603 
   3604       mbmi->interp_filter = (cm->mcomp_filter_type == SWITCHABLE ?
   3605                              tmp_best_filter : cm->mcomp_filter_type);
   3606       vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
   3607       if (!pred_exists) {
   3608         // Handles the special case when a filter that is not in the
   3609         // switchable list (bilinear, 6-tap) is indicated at the frame level
   3610         tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
   3611                      &mbmi->ref_mvs[ref_frame][0],
   3612                      second_ref,
   3613                      best_yrd,
   3614                      &rate, &rate_y, &distortion,
   3615                      &skippable, &total_sse,
   3616                      (int)this_rd_thresh, seg_mvs,
   3617                      bsi, 0,
   3618                      mi_row, mi_col);
   3619         if (tmp_rd == INT64_MAX)
   3620           continue;
   3621       } else {
   3622         if (cpi->common.mcomp_filter_type == SWITCHABLE) {
   3623           int rs = get_switchable_rate(x);
   3624           tmp_best_rdu -= RDCOST(x->rdmult, x->rddiv, rs, 0);
   3625         }
   3626         tmp_rd = tmp_best_rdu;
   3627         total_sse = tmp_best_sse;
   3628         rate = tmp_best_rate;
   3629         rate_y = tmp_best_ratey;
   3630         distortion = tmp_best_distortion;
   3631         skippable = tmp_best_skippable;
   3632         *mbmi = tmp_best_mbmode;
   3633         *x->partition_info = tmp_best_partition;
   3634         for (i = 0; i < 4; i++)
   3635           xd->this_mi->bmi[i] = tmp_best_bmodes[i];
   3636       }
   3637 
   3638       rate2 += rate;
   3639       distortion2 += distortion;
   3640 
   3641       if (cpi->common.mcomp_filter_type == SWITCHABLE)
   3642         rate2 += get_switchable_rate(x);
   3643 
   3644       if (!mode_excluded) {
   3645         if (is_comp_pred)
   3646           mode_excluded = cpi->common.comp_pred_mode == SINGLE_PREDICTION_ONLY;
   3647         else
   3648           mode_excluded = cpi->common.comp_pred_mode == COMP_PREDICTION_ONLY;
   3649       }
   3650       compmode_cost = vp9_cost_bit(comp_mode_p, is_comp_pred);
   3651 
   3652       tmp_best_rdu = best_rd -
   3653           MIN(RDCOST(x->rdmult, x->rddiv, rate2, distortion2),
   3654               RDCOST(x->rdmult, x->rddiv, 0, total_sse));
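               // tmp_best_rdu is the rd headroom left for the UV search:
               // best_rd minus the cheaper of coding the Y data or skipping it
               // outright (zero rate, distortion total_sse). If nothing is
               // left, the UV search is skipped via the > 0 test below.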
   3655 
   3656       if (tmp_best_rdu > 0) {
    3657         // If even the 'Y' rd value of split is higher than the best so
    3658         // far, then don't bother looking at UV
   3659         vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
   3660                                         BLOCK_8X8);
   3661         super_block_uvrd(cm, x, &rate_uv, &distortion_uv, &uv_skippable,
   3662                          &uv_sse, BLOCK_8X8, tmp_best_rdu);
   3663         if (rate_uv == INT_MAX)
   3664           continue;
   3665         rate2 += rate_uv;
   3666         distortion2 += distortion_uv;
   3667         skippable = skippable && uv_skippable;
   3668         total_sse += uv_sse;
   3669 
   3670         tx_cache[ONLY_4X4] = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   3671         for (i = 0; i < TX_MODES; ++i)
   3672           tx_cache[i] = tx_cache[ONLY_4X4];
   3673       }
   3674     } else {
   3675       mbmi->mode = rd_mode_to_mode(this_mode);
   3676       compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
   3677       this_rd = handle_inter_mode(cpi, x, bsize,
   3678                                   tx_cache,
   3679                                   &rate2, &distortion2, &skippable,
   3680                                   &rate_y, &distortion_y,
   3681                                   &rate_uv, &distortion_uv,
   3682                                   &mode_excluded, &disable_skip,
   3683                                   &tmp_best_filter, frame_mv,
   3684                                   mi_row, mi_col,
   3685                                   single_newmv, &total_sse, best_rd);
   3686       if (this_rd == INT64_MAX)
   3687         continue;
   3688     }
   3689 
   3690     if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
   3691       rate2 += compmode_cost;
   3692     }
   3693 
   3694     // Estimate the reference frame signaling cost and add it
   3695     // to the rolling cost variable.
   3696     if (second_ref_frame > INTRA_FRAME) {
   3697       rate2 += ref_costs_comp[ref_frame];
   3698     } else {
   3699       rate2 += ref_costs_single[ref_frame];
   3700     }
   3701 
   3702     if (!disable_skip) {
    3703       // Test for the condition where the skip block will be activated
    3704       // because there are no non-zero coefficients, and make any
    3705       // necessary adjustment for rate. Ignore if skip is coded at the
    3706       // segment level as the cost won't have been added in.
    3707       // Is MB-level skip allowed (i.e. not coded at segment level)?
   3708       const int mb_skip_allowed = !vp9_segfeature_active(seg, segment_id,
   3709                                                          SEG_LVL_SKIP);
   3710 
   3711       if (skippable && bsize >= BLOCK_8X8) {
   3712         // Back out the coefficient coding costs
   3713         rate2 -= (rate_y + rate_uv);
   3714         // for best yrd calculation
   3715         rate_uv = 0;
   3716 
   3717         if (mb_skip_allowed) {
   3718           int prob_skip_cost;
   3719 
   3720           // Cost the skip mb case
   3721           vp9_prob skip_prob =
   3722             vp9_get_pred_prob_mbskip(cm, xd);
   3723 
   3724           if (skip_prob) {
   3725             prob_skip_cost = vp9_cost_bit(skip_prob, 1);
   3726             rate2 += prob_skip_cost;
   3727           }
   3728         }
   3729       } else if (mb_skip_allowed && ref_frame != INTRA_FRAME && !xd->lossless) {
   3730         if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
   3731             RDCOST(x->rdmult, x->rddiv, 0, total_sse)) {
    3732           // Add in the cost of the no-skip flag.
   3733           int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
   3734                                             0);
   3735           rate2 += prob_skip_cost;
   3736         } else {
   3737           // FIXME(rbultje) make this work for splitmv also
   3738           int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
   3739                                             1);
   3740           rate2 += prob_skip_cost;
   3741           distortion2 = total_sse;
   3742           assert(total_sse >= 0);
   3743           rate2 -= (rate_y + rate_uv);
   3744           rate_y = 0;
   3745           rate_uv = 0;
   3746           this_skip2 = 1;
   3747         }
   3748       } else if (mb_skip_allowed) {
    3749         // Add in the cost of the no-skip flag.
   3750         int prob_skip_cost = vp9_cost_bit(vp9_get_pred_prob_mbskip(cm, xd),
   3751                                           0);
   3752         rate2 += prob_skip_cost;
   3753       }
   3754 
   3755       // Calculate the final RD estimate for this mode.
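               // RDCOST() forms a Lagrangian-style cost, roughly
               // lambda * rate + distortion with lambda derived from
               // x->rdmult and x->rddiv; the smallest value wins.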
   3756       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
   3757     }
   3758 
   3759     // Keep record of best intra rd
   3760     if (xd->this_mi->mbmi.ref_frame[0] == INTRA_FRAME &&
   3761         is_intra_mode(xd->this_mi->mbmi.mode) &&
   3762         this_rd < best_intra_rd) {
   3763       best_intra_rd = this_rd;
   3764       best_intra_mode = xd->this_mi->mbmi.mode;
   3765     }
   3766     // Keep record of best inter rd with single reference
   3767     if (xd->this_mi->mbmi.ref_frame[0] > INTRA_FRAME &&
   3768         xd->this_mi->mbmi.ref_frame[1] == NONE &&
   3769         !mode_excluded &&
   3770         this_rd < best_inter_rd) {
   3771       best_inter_rd = this_rd;
   3772       best_inter_ref_frame = ref_frame;
   3773       // best_inter_mode = xd->this_mi->mbmi.mode;
   3774     }
   3775 
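             // An intra mode costs the same under every prediction scheme and
             // interpolation filter, so its rd bounds all of those categories
             // at once.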
   3776     if (!disable_skip && ref_frame == INTRA_FRAME) {
   3777       for (i = 0; i < NB_PREDICTION_TYPES; ++i)
   3778         best_pred_rd[i] = MIN(best_pred_rd[i], this_rd);
   3779       for (i = 0; i <= SWITCHABLE_FILTERS; i++)
   3780         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
   3781     }
   3782 
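             // Track the lowest distortion seen for each mode and reference
             // frame; after the loop these feed the masks flagging modes and
             // frames whose distortion exceeds 2x the best found here.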
   3783     if (this_mode != RD_I4X4_PRED && this_mode != RD_SPLITMV) {
   3784       // Store the respective mode distortions for later use.
   3785       if (mode_distortions[this_mode] == -1
   3786           || distortion2 < mode_distortions[this_mode]) {
   3787         mode_distortions[this_mode] = distortion2;
   3788       }
   3789       if (frame_distortions[ref_frame] == -1
   3790           || distortion2 < frame_distortions[ref_frame]) {
   3791         frame_distortions[ref_frame] = distortion2;
   3792       }
   3793     }
   3794 
    3795     // Did this mode help, i.e. is it the new best mode so far?
   3796     if (this_rd < best_rd || x->skip) {
   3797       if (!mode_excluded) {
   3798         // Note index of best mode so far
   3799         best_mode_index = mode_index;
   3800 
   3801         if (ref_frame == INTRA_FRAME) {
   3802           /* required for left and above block mv */
   3803           mbmi->mv[0].as_int = 0;
   3804         }
   3805 
   3806         *returnrate = rate2;
   3807         *returndistortion = distortion2;
   3808         best_rd = this_rd;
   3809         best_yrd = best_rd -
   3810                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
   3811         best_mbmode = *mbmi;
   3812         best_skip2 = this_skip2;
   3813         best_partition = *x->partition_info;
   3814 
   3815         if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)
   3816           for (i = 0; i < 4; i++)
   3817             best_bmodes[i] = xd->this_mi->bmi[i];
   3818 
   3819         // TODO(debargha): enhance this test with a better distortion prediction
   3820         // based on qp, activity mask and history
   3821         if (cpi->sf.mode_search_skip_flags & FLAG_EARLY_TERMINATE) {
   3822           const int qstep = xd->plane[0].dequant[1];
   3823           // TODO(debargha): Enhance this by specializing for each mode_index
   3824           int scale = 4;
   3825           if (x->source_variance < UINT_MAX) {
   3826             const int var_adjust = (x->source_variance < 16);
   3827             scale -= var_adjust;
   3828           }
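                   // Terminate early once the scaled distortion falls below
                   // one quantizer step squared, i.e. the residual is already
                   // near the quantizer's resolution and trying further modes
                   // is unlikely to help.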
   3829           if (ref_frame > INTRA_FRAME &&
   3830               distortion2 * scale < qstep * qstep) {
   3831             early_term = 1;
   3832           }
   3833         }
   3834       }
   3835     }
   3836 
   3837     /* keep record of best compound/single-only prediction */
   3838     if (!disable_skip && ref_frame != INTRA_FRAME) {
   3839       int single_rd, hybrid_rd, single_rate, hybrid_rate;
   3840 
   3841       if (cpi->common.comp_pred_mode == HYBRID_PREDICTION) {
   3842         single_rate = rate2 - compmode_cost;
   3843         hybrid_rate = rate2;
   3844       } else {
   3845         single_rate = rate2;
   3846         hybrid_rate = rate2 + compmode_cost;
   3847       }
   3848 
   3849       single_rd = RDCOST(x->rdmult, x->rddiv, single_rate, distortion2);
   3850       hybrid_rd = RDCOST(x->rdmult, x->rddiv, hybrid_rate, distortion2);
   3851 
   3852       if (second_ref_frame <= INTRA_FRAME &&
   3853           single_rd < best_pred_rd[SINGLE_PREDICTION_ONLY]) {
   3854         best_pred_rd[SINGLE_PREDICTION_ONLY] = single_rd;
   3855       } else if (second_ref_frame > INTRA_FRAME &&
   3856                  single_rd < best_pred_rd[COMP_PREDICTION_ONLY]) {
   3857         best_pred_rd[COMP_PREDICTION_ONLY] = single_rd;
   3858       }
   3859       if (hybrid_rd < best_pred_rd[HYBRID_PREDICTION])
   3860         best_pred_rd[HYBRID_PREDICTION] = hybrid_rd;
   3861     }
   3862 
   3863     /* keep record of best filter type */
   3864     if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
   3865         cm->mcomp_filter_type != BILINEAR) {
   3866       int64_t ref = cpi->rd_filter_cache[cm->mcomp_filter_type == SWITCHABLE ?
   3867                               SWITCHABLE_FILTERS : cm->mcomp_filter_type];
   3868       for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
   3869         int64_t adj_rd;
   3870         // In cases of poor prediction, filter_cache[] can contain really big
   3871         // values, which actually are bigger than this_rd itself. This can
   3872         // cause negative best_filter_rd[] values, which is obviously silly.
   3873         // Therefore, if filter_cache < ref, we do an adjusted calculation.
   3874         if (cpi->rd_filter_cache[i] >= ref)
   3875           adj_rd = this_rd + cpi->rd_filter_cache[i] - ref;
   3876         else  // FIXME(rbultje) do this for comppred also
   3877           adj_rd = this_rd - (ref - cpi->rd_filter_cache[i]) * this_rd / ref;
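                 // (This branch is approximately adj_rd =
                 // this_rd * rd_filter_cache[i] / ref, i.e. this_rd scaled
                 // down in proportion to the cached filter cost.)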
   3878         best_filter_rd[i] = MIN(best_filter_rd[i], adj_rd);
   3879       }
   3880     }
   3881 
   3882     /* keep record of best txfm size */
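             // A block cannot use a transform larger than itself, so the
             // entries for unavailable larger-transform modes inherit the
             // cost of the largest usable size.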
   3883     if (bsize < BLOCK_32X32) {
   3884       if (bsize < BLOCK_16X16) {
   3885         if (this_mode == RD_SPLITMV || this_mode == RD_I4X4_PRED)
   3886           tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
   3887         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
   3888       }
   3889       tx_cache[ALLOW_32X32] = tx_cache[ALLOW_16X16];
   3890     }
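             // Re-express this mode's rd under each alternative tx_mode by
             // swapping the cached cost of that tx_mode for the one actually
             // used (cm->tx_mode); RD_I4X4_PRED is unaffected by tx_mode, so
             // its rd carries over unchanged.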
   3891     if (!mode_excluded && this_rd != INT64_MAX) {
   3892       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
   3893         int64_t adj_rd = INT64_MAX;
   3894         if (this_mode != RD_I4X4_PRED) {
   3895           adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
   3896         } else {
   3897           adj_rd = this_rd;
   3898         }
   3899 
   3900         if (adj_rd < best_tx_rd[i])
   3901           best_tx_rd[i] = adj_rd;
   3902       }
   3903     }
   3904 
   3905     if (early_term)
   3906       break;
   3907 
   3908     if (x->skip && !comp_pred)
   3909       break;
   3910   }
   3911 
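           // Nothing examined here beat the rd bound supplied by the caller,
           // so report failure.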
   3912   if (best_rd >= best_rd_so_far)
   3913     return INT64_MAX;
   3914 
   3915   // If we used an estimate for the uv intra rd in the loop above...
   3916   if (cpi->sf.use_uv_intra_rd_estimate) {
   3917     // Do Intra UV best rd mode selection if best mode choice above was intra.
   3918     if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
   3919       TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
   3920       rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
   3921                               &rate_uv_tokenonly[uv_tx_size],
   3922                               &dist_uv[uv_tx_size],
   3923                               &skip_uv[uv_tx_size],
   3924                               bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   3925     }
   3926   }
   3927 
    3928   // If reference masking is in use and the set-mask flag is set,
    3929   // create the reference frame mask.
   3930   if (cpi->sf.reference_masking && cpi->set_ref_frame_mask)
   3931     cpi->ref_frame_mask = ~(1 << vp9_mode_order[best_mode_index].ref_frame);
   3932 
    3933   // Flag all modes that have a distortion that's more than 2x the best
    3934   // we found at this level.
   3935   for (mode_index = 0; mode_index < MB_MODE_COUNT; ++mode_index) {
   3936     if (mode_index == NEARESTMV || mode_index == NEARMV || mode_index == NEWMV)
   3937       continue;
   3938 
   3939     if (mode_distortions[mode_index] > 2 * *returndistortion) {
   3940       ctx->modes_with_high_error |= (1 << mode_index);
   3941     }
   3942   }
    3943   // Flag all ref frames that have a distortion that's more than 2x the
    3944   // best we found at this level.
   3945   // this level.
   3946   for (ref_frame = INTRA_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
   3947     if (frame_distortions[ref_frame] > 2 * *returndistortion) {
   3948       ctx->frames_with_high_error |= (1 << ref_frame);
   3949     }
   3950   }
   3951 
   3952   if (best_rd == INT64_MAX && bsize < BLOCK_8X8) {
   3953     *returnrate = INT_MAX;
   3954     *returndistortion = INT_MAX;
   3955     return best_rd;
   3956   }
   3957 
   3958   assert((cm->mcomp_filter_type == SWITCHABLE) ||
   3959          (cm->mcomp_filter_type == best_mbmode.interp_filter) ||
   3960          (best_mbmode.ref_frame[0] == INTRA_FRAME));
   3961 
   3962   // Updating rd_thresh_freq_fact[] here means that the different
   3963   // partition/block sizes are handled independently based on the best
   3964   // choice for the current partition. It may well be better to keep a scaled
   3965   // best rd so far value and update rd_thresh_freq_fact based on the mode/size
   3966   // combination that wins out.
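           // The winning mode's factor decays by 1/8 each time (e.g. 64
           // becomes 64 - (64 >> 3) = 56) while every other mode's factor
           // grows by RD_THRESH_INC up to a cap of
           // sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT, making modes that
           // keep losing progressively more expensive to re-examine.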
   3967   if (cpi->sf.adaptive_rd_thresh) {
   3968     for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
   3969       if (mode_index == best_mode_index) {
   3970         cpi->rd_thresh_freq_fact[bsize][mode_index] -=
   3971           (cpi->rd_thresh_freq_fact[bsize][mode_index] >> 3);
   3972       } else {
   3973         cpi->rd_thresh_freq_fact[bsize][mode_index] += RD_THRESH_INC;
   3974         if (cpi->rd_thresh_freq_fact[bsize][mode_index] >
   3975             (cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT)) {
   3976           cpi->rd_thresh_freq_fact[bsize][mode_index] =
   3977             cpi->sf.adaptive_rd_thresh * MAX_RD_THRESH_FACT;
   3978         }
   3979       }
   3980     }
   3981   }
   3982 
    3983   // Copy back the winning macroblock mode info.
   3984   *mbmi = best_mbmode;
   3985   x->skip |= best_skip2;
   3986   if (best_mbmode.ref_frame[0] == INTRA_FRAME &&
   3987       best_mbmode.sb_type < BLOCK_8X8) {
   3988     for (i = 0; i < 4; i++)
   3989       xd->this_mi->bmi[i].as_mode = best_bmodes[i].as_mode;
   3990   }
   3991 
   3992   if (best_mbmode.ref_frame[0] != INTRA_FRAME &&
   3993       best_mbmode.sb_type < BLOCK_8X8) {
   3994     for (i = 0; i < 4; i++)
   3995       xd->this_mi->bmi[i].as_mv[0].as_int =
   3996           best_bmodes[i].as_mv[0].as_int;
   3997 
   3998     if (mbmi->ref_frame[1] > 0)
   3999       for (i = 0; i < 4; i++)
   4000         xd->this_mi->bmi[i].as_mv[1].as_int =
   4001             best_bmodes[i].as_mv[1].as_int;
   4002 
   4003     *x->partition_info = best_partition;
   4004 
   4005     mbmi->mv[0].as_int = xd->this_mi->bmi[3].as_mv[0].as_int;
   4006     mbmi->mv[1].as_int = xd->this_mi->bmi[3].as_mv[1].as_int;
   4007   }
   4008 
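           // Convert the per-prediction-type bests into differences from the
           // overall best rd (INT_MIN marks a type that was never measured);
           // the diffs are handed to store_coding_context() below.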
   4009   for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
   4010     if (best_pred_rd[i] == INT64_MAX)
   4011       best_pred_diff[i] = INT_MIN;
   4012     else
   4013       best_pred_diff[i] = best_rd - best_pred_rd[i];
   4014   }
   4015 
   4016   if (!x->skip) {
   4017     for (i = 0; i <= SWITCHABLE_FILTERS; i++) {
   4018       if (best_filter_rd[i] == INT64_MAX)
   4019         best_filter_diff[i] = 0;
   4020       else
   4021         best_filter_diff[i] = best_rd - best_filter_rd[i];
   4022     }
   4023     if (cm->mcomp_filter_type == SWITCHABLE)
   4024       assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
   4025   } else {
   4026     vpx_memset(best_filter_diff, 0, sizeof(best_filter_diff));
   4027   }
   4028 
   4029   if (!x->skip) {
   4030     for (i = 0; i < TX_MODES; i++) {
   4031       if (best_tx_rd[i] == INT64_MAX)
   4032         best_tx_diff[i] = 0;
   4033       else
   4034         best_tx_diff[i] = best_rd - best_tx_rd[i];
   4035     }
   4036   } else {
   4037     vpx_memset(best_tx_diff, 0, sizeof(best_tx_diff));
   4038   }
   4039 
   4040   set_scale_factors(xd, mbmi->ref_frame[0], mbmi->ref_frame[1],
   4041                     scale_factor);
   4042   store_coding_context(x, ctx, best_mode_index,
   4043                        &best_partition,
   4044                        &mbmi->ref_mvs[mbmi->ref_frame[0]][0],
   4045                        &mbmi->ref_mvs[mbmi->ref_frame[1] < 0 ? 0 :
   4046                                       mbmi->ref_frame[1]][0],
   4047                        best_pred_diff, best_tx_diff, best_filter_diff);
   4048 
   4049   return best_rd;
   4050 }
   4051