Home | History | Annotate | Download | only in encoder
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include "./vp9_rtcd.h"
     12 
     13 #include "vpx_ports/mem.h"
     14 #include "vpx/vpx_integer.h"
     15 
     16 #include "vp9/common/vp9_common.h"
     17 #include "vp9/common/vp9_filter.h"
     18 
     19 #include "vp9/encoder/vp9_variance.h"
     20 
     21 void variance(const uint8_t *src_ptr,
     22               int  source_stride,
     23               const uint8_t *ref_ptr,
     24               int  recon_stride,
     25               int  w,
     26               int  h,
     27               unsigned int *sse,
     28               int *sum) {
     29   int i, j;
     30   int diff;
     31 
     32   *sum = 0;
     33   *sse = 0;
     34 
     35   for (i = 0; i < h; i++) {
     36     for (j = 0; j < w; j++) {
     37       diff = src_ptr[j] - ref_ptr[j];
     38       *sum += diff;
     39       *sse += diff * diff;
     40     }
     41 
     42     src_ptr += source_stride;
     43     ref_ptr += recon_stride;
     44   }
     45 }
     46 
     47 /****************************************************************************
     48  *
     49  *  ROUTINE       : filter_block2d_bil_first_pass
     50  *
     51  *  INPUTS        : uint8_t  *src_ptr          : Pointer to source block.
     52  *                  uint32_t src_pixels_per_line : Stride of input block.
     53  *                  uint32_t pixel_step        : Offset between filter input
     54  *                                               samples (see notes).
     55  *                  uint32_t output_height     : Input block height.
     56  *                  uint32_t output_width      : Input block width.
     57  *                  int32_t  *vp9_filter       : Array of 2 bi-linear filter
     58  *                                               taps.
     59  *
     60  *  OUTPUTS       : int32_t *output_ptr        : Pointer to filtered block.
     61  *
     62  *  RETURNS       : void
     63  *
     64  *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
     65  *                  either horizontal or vertical direction to produce the
     66  *                  filtered output block. Used to implement first-pass
     67  *                  of 2-D separable filter.
     68  *
     69  *  SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
     70  *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
     71  *                  pixel_step defines whether the filter is applied
     72  *                  horizontally (pixel_step=1) or vertically (pixel_step=
     73  *                  stride).
     74  *                  It defines the offset required to move from one input
     75  *                  to the next.
     76  *
     77  ****************************************************************************/
     78 static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
     79                                               uint16_t *output_ptr,
     80                                               unsigned int src_pixels_per_line,
     81                                               int pixel_step,
     82                                               unsigned int output_height,
     83                                               unsigned int output_width,
     84                                               const int16_t *vp9_filter) {
     85   unsigned int i, j;
     86 
     87   for (i = 0; i < output_height; i++) {
     88     for (j = 0; j < output_width; j++) {
     89       output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
     90                           (int)src_ptr[pixel_step] * vp9_filter[1],
     91                           FILTER_BITS);
     92 
     93       src_ptr++;
     94     }
     95 
     96     // Next row...
     97     src_ptr    += src_pixels_per_line - output_width;
     98     output_ptr += output_width;
     99   }
    100 }
    101 
    102 /****************************************************************************
    103  *
    104  *  ROUTINE       : filter_block2d_bil_second_pass
    105  *
    106  *  INPUTS        : int32_t  *src_ptr          : Pointer to source block.
    107  *                  uint32_t src_pixels_per_line : Stride of input block.
    108  *                  uint32_t pixel_step        : Offset between filter input
    109  *                                               samples (see notes).
    110  *                  uint32_t output_height     : Input block height.
    111  *                  uint32_t output_width      : Input block width.
    112  *                  int32_t  *vp9_filter       : Array of 2 bi-linear filter
    113  *                                               taps.
    114  *
    115  *  OUTPUTS       : uint16_t *output_ptr       : Pointer to filtered block.
    116  *
    117  *  RETURNS       : void
    118  *
    119  *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
    120  *                  either horizontal or vertical direction to produce the
    121  *                  filtered output block. Used to implement second-pass
    122  *                  of 2-D separable filter.
    123  *
    124  *  SPECIAL NOTES : Requires 32-bit input as produced by
    125  *                  filter_block2d_bil_first_pass.
    126  *                  Two filter taps should sum to VP9_FILTER_WEIGHT.
    127  *                  pixel_step defines whether the filter is applied
    128  *                  horizontally (pixel_step=1) or vertically (pixel_step=
    129  *                  stride).
    130  *                  It defines the offset required to move from one input
    131  *                  to the next.
    132  *
    133  ****************************************************************************/
    134 static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
    135                                                uint8_t *output_ptr,
    136                                                unsigned int src_pixels_per_line,
    137                                                unsigned int pixel_step,
    138                                                unsigned int output_height,
    139                                                unsigned int output_width,
    140                                                const int16_t *vp9_filter) {
    141   unsigned int  i, j;
    142 
    143   for (i = 0; i < output_height; i++) {
    144     for (j = 0; j < output_width; j++) {
    145       output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
    146                           (int)src_ptr[pixel_step] * vp9_filter[1],
    147                           FILTER_BITS);
    148       src_ptr++;
    149     }
    150 
    151     src_ptr += src_pixels_per_line - output_width;
    152     output_ptr += output_width;
    153   }
    154 }
    155 
    156 unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
    157   unsigned int i, sum = 0;
    158 
    159   for (i = 0; i < 256; i++) {
    160     sum += (src_ptr[i] * src_ptr[i]);
    161   }
    162 
    163   return sum;
    164 }
    165 
    166 unsigned int vp9_variance64x32_c(const uint8_t *src_ptr,
    167                                  int  source_stride,
    168                                  const uint8_t *ref_ptr,
    169                                  int  recon_stride,
    170                                  unsigned int *sse) {
    171   unsigned int var;
    172   int avg;
    173 
    174   variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32, &var, &avg);
    175   *sse = var;
    176   return (var - (((int64_t)avg * avg) >> 11));
    177 }
    178 
    179 unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr,
    180                                            int  src_pixels_per_line,
    181                                            int  xoffset,
    182                                            int  yoffset,
    183                                            const uint8_t *dst_ptr,
    184                                            int dst_pixels_per_line,
    185                                            unsigned int *sse) {
    186   uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
    187   uint8_t temp2[68 * 64];
    188   const int16_t *hfilter, *vfilter;
    189 
    190   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    191   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    192 
    193   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    194                                     1, 33, 64, hfilter);
    195   var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
    196 
    197   return vp9_variance64x32(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
    198 }
    199 
    200 unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
    201                                                int  src_pixels_per_line,
    202                                                int  xoffset,
    203                                                int  yoffset,
    204                                                const uint8_t *dst_ptr,
    205                                                int dst_pixels_per_line,
    206                                                unsigned int *sse,
    207                                                const uint8_t *second_pred) {
    208   uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
    209   uint8_t temp2[68 * 64];
    210   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
    211   const int16_t *hfilter, *vfilter;
    212 
    213   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    214   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    215 
    216   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    217                                     1, 33, 64, hfilter);
    218   var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
    219   vp9_comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
    220   return vp9_variance64x32(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
    221 }
    222 
    223 unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
    224                                  int  source_stride,
    225                                  const uint8_t *ref_ptr,
    226                                  int  recon_stride,
    227                                  unsigned int *sse) {
    228   unsigned int var;
    229   int avg;
    230 
    231   variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 64, &var, &avg);
    232   *sse = var;
    233   return (var - (((int64_t)avg * avg) >> 11));
    234 }
    235 
    236 unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr,
    237                                            int  src_pixels_per_line,
    238                                            int  xoffset,
    239                                            int  yoffset,
    240                                            const uint8_t *dst_ptr,
    241                                            int dst_pixels_per_line,
    242                                            unsigned int *sse) {
    243   uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
    244   uint8_t temp2[68 * 64];
    245   const int16_t *hfilter, *vfilter;
    246 
    247   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    248   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    249 
    250   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    251                                     1, 65, 32, hfilter);
    252   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
    253 
    254   return vp9_variance32x64(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
    255 }
    256 
    257 unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
    258                                                int  src_pixels_per_line,
    259                                                int  xoffset,
    260                                                int  yoffset,
    261                                                const uint8_t *dst_ptr,
    262                                                int dst_pixels_per_line,
    263                                                unsigned int *sse,
    264                                                const uint8_t *second_pred) {
    265   uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
    266   uint8_t temp2[68 * 64];
    267   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64);  // compound pred buffer
    268   const int16_t *hfilter, *vfilter;
    269 
    270   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    271   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    272 
    273   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    274                                     1, 65, 32, hfilter);
    275   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
    276   vp9_comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
    277   return vp9_variance32x64(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
    278 }
    279 
    280 unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
    281                                  int  source_stride,
    282                                  const uint8_t *ref_ptr,
    283                                  int  recon_stride,
    284                                  unsigned int *sse) {
    285   unsigned int var;
    286   int avg;
    287 
    288   variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16, &var, &avg);
    289   *sse = var;
    290   return (var - (((int64_t)avg * avg) >> 9));
    291 }
    292 
    293 unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr,
    294                                            int  src_pixels_per_line,
    295                                            int  xoffset,
    296                                            int  yoffset,
    297                                            const uint8_t *dst_ptr,
    298                                            int dst_pixels_per_line,
    299                                            unsigned int *sse) {
    300   uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
    301   uint8_t temp2[36 * 32];
    302   const int16_t *hfilter, *vfilter;
    303 
    304   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    305   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    306 
    307   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    308                                     1, 17, 32, hfilter);
    309   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
    310 
    311   return vp9_variance32x16(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
    312 }
    313 
    314 unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
    315                                                int  src_pixels_per_line,
    316                                                int  xoffset,
    317                                                int  yoffset,
    318                                                const uint8_t *dst_ptr,
    319                                                int dst_pixels_per_line,
    320                                                unsigned int *sse,
    321                                                const uint8_t *second_pred) {
    322   uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
    323   uint8_t temp2[36 * 32];
    324   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16);  // compound pred buffer
    325   const int16_t *hfilter, *vfilter;
    326 
    327   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    328   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    329 
    330   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    331                                     1, 17, 32, hfilter);
    332   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
    333   vp9_comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
    334   return vp9_variance32x16(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
    335 }
    336 
    337 unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
    338                                  int  source_stride,
    339                                  const uint8_t *ref_ptr,
    340                                  int  recon_stride,
    341                                  unsigned int *sse) {
    342   unsigned int var;
    343   int avg;
    344 
    345   variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 32, &var, &avg);
    346   *sse = var;
    347   return (var - (((int64_t)avg * avg) >> 9));
    348 }
    349 
    350 unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr,
    351                                            int  src_pixels_per_line,
    352                                            int  xoffset,
    353                                            int  yoffset,
    354                                            const uint8_t *dst_ptr,
    355                                            int dst_pixels_per_line,
    356                                            unsigned int *sse) {
    357   uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
    358   uint8_t temp2[36 * 32];
    359   const int16_t *hfilter, *vfilter;
    360 
    361   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    362   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    363 
    364   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    365                                     1, 33, 16, hfilter);
    366   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
    367 
    368   return vp9_variance16x32(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
    369 }
    370 
    371 unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
    372                                                int  src_pixels_per_line,
    373                                                int  xoffset,
    374                                                int  yoffset,
    375                                                const uint8_t *dst_ptr,
    376                                                int dst_pixels_per_line,
    377                                                unsigned int *sse,
    378                                                const uint8_t *second_pred) {
    379   uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
    380   uint8_t temp2[36 * 32];
    381   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32);  // compound pred buffer
    382   const int16_t *hfilter, *vfilter;
    383 
    384   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    385   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    386 
    387   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    388                                     1, 33, 16, hfilter);
    389   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
    390   vp9_comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
    391   return vp9_variance16x32(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
    392 }
    393 
    394 unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
    395                                  int  source_stride,
    396                                  const uint8_t *ref_ptr,
    397                                  int  recon_stride,
    398                                  unsigned int *sse) {
    399   unsigned int var;
    400   int avg;
    401 
    402   variance(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64, &var, &avg);
    403   *sse = var;
    404   return (var - (((int64_t)avg * avg) >> 12));
    405 }
    406 
    407 unsigned int vp9_variance32x32_c(const uint8_t *src_ptr,
    408                                  int  source_stride,
    409                                  const uint8_t *ref_ptr,
    410                                  int  recon_stride,
    411                                  unsigned int *sse) {
    412   unsigned int var;
    413   int avg;
    414 
    415   variance(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var, &avg);
    416   *sse = var;
    417   return (var - (((int64_t)avg * avg) >> 10));
    418 }
    419 
    420 unsigned int vp9_variance16x16_c(const uint8_t *src_ptr,
    421                                  int  source_stride,
    422                                  const uint8_t *ref_ptr,
    423                                  int  recon_stride,
    424                                  unsigned int *sse) {
    425   unsigned int var;
    426   int avg;
    427 
    428   variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
    429   *sse = var;
    430   return (var - (((unsigned int)avg * avg) >> 8));
    431 }
    432 
    433 unsigned int vp9_variance8x16_c(const uint8_t *src_ptr,
    434                                 int  source_stride,
    435                                 const uint8_t *ref_ptr,
    436                                 int  recon_stride,
    437                                 unsigned int *sse) {
    438   unsigned int var;
    439   int avg;
    440 
    441   variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
    442   *sse = var;
    443   return (var - (((unsigned int)avg * avg) >> 7));
    444 }
    445 
    446 unsigned int vp9_variance16x8_c(const uint8_t *src_ptr,
    447                                 int  source_stride,
    448                                 const uint8_t *ref_ptr,
    449                                 int  recon_stride,
    450                                 unsigned int *sse) {
    451   unsigned int var;
    452   int avg;
    453 
    454   variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
    455   *sse = var;
    456   return (var - (((unsigned int)avg * avg) >> 7));
    457 }
    458 
    459 void vp9_get_sse_sum_8x8_c(const uint8_t *src_ptr, int source_stride,
    460                        const uint8_t *ref_ptr, int ref_stride,
    461                        unsigned int *sse, int *sum) {
    462   variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
    463 }
    464 
    465 unsigned int vp9_variance8x8_c(const uint8_t *src_ptr,
    466                                int  source_stride,
    467                                const uint8_t *ref_ptr,
    468                                int  recon_stride,
    469                                unsigned int *sse) {
    470   unsigned int var;
    471   int avg;
    472 
    473   variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
    474   *sse = var;
    475   return (var - (((unsigned int)avg * avg) >> 6));
    476 }
    477 
    478 unsigned int vp9_variance8x4_c(const uint8_t *src_ptr,
    479                                int  source_stride,
    480                                const uint8_t *ref_ptr,
    481                                int  recon_stride,
    482                                unsigned int *sse) {
    483   unsigned int var;
    484   int avg;
    485 
    486   variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 4, &var, &avg);
    487   *sse = var;
    488   return (var - (((unsigned int)avg * avg) >> 5));
    489 }
    490 
    491 unsigned int vp9_variance4x8_c(const uint8_t *src_ptr,
    492                                int  source_stride,
    493                                const uint8_t *ref_ptr,
    494                                int  recon_stride,
    495                                unsigned int *sse) {
    496   unsigned int var;
    497   int avg;
    498 
    499   variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 8, &var, &avg);
    500   *sse = var;
    501   return (var - (((unsigned int)avg * avg) >> 5));
    502 }
    503 
    504 unsigned int vp9_variance4x4_c(const uint8_t *src_ptr,
    505                                int  source_stride,
    506                                const uint8_t *ref_ptr,
    507                                int  recon_stride,
    508                                unsigned int *sse) {
    509   unsigned int var;
    510   int avg;
    511 
    512   variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
    513   *sse = var;
    514   return (var - (((unsigned int)avg * avg) >> 4));
    515 }
    516 
    517 
    518 unsigned int vp9_mse16x16_c(const uint8_t *src_ptr,
    519                             int  source_stride,
    520                             const uint8_t *ref_ptr,
    521                             int  recon_stride,
    522                             unsigned int *sse) {
    523   unsigned int var;
    524   int avg;
    525 
    526   variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
    527   *sse = var;
    528   return var;
    529 }
    530 
    531 unsigned int vp9_mse16x8_c(const uint8_t *src_ptr,
    532                            int  source_stride,
    533                            const uint8_t *ref_ptr,
    534                            int  recon_stride,
    535                            unsigned int *sse) {
    536   unsigned int var;
    537   int avg;
    538 
    539   variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
    540   *sse = var;
    541   return var;
    542 }
    543 
    544 unsigned int vp9_mse8x16_c(const uint8_t *src_ptr,
    545                            int  source_stride,
    546                            const uint8_t *ref_ptr,
    547                            int  recon_stride,
    548                            unsigned int *sse) {
    549   unsigned int var;
    550   int avg;
    551 
    552   variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
    553   *sse = var;
    554   return var;
    555 }
    556 
    557 unsigned int vp9_mse8x8_c(const uint8_t *src_ptr,
    558                           int  source_stride,
    559                           const uint8_t *ref_ptr,
    560                           int  recon_stride,
    561                           unsigned int *sse) {
    562   unsigned int var;
    563   int avg;
    564 
    565   variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
    566   *sse = var;
    567   return var;
    568 }
    569 
    570 
    571 unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
    572                                          int  src_pixels_per_line,
    573                                          int  xoffset,
    574                                          int  yoffset,
    575                                          const uint8_t *dst_ptr,
    576                                          int dst_pixels_per_line,
    577                                          unsigned int *sse) {
    578   uint8_t temp2[20 * 16];
    579   const int16_t *hfilter, *vfilter;
    580   uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
    581 
    582   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    583   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    584 
    585   // First filter 1d Horizontal
    586   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    587                                     1, 5, 4, hfilter);
    588 
    589   // Now filter Verticaly
    590   var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
    591 
    592   return vp9_variance4x4(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
    593 }
    594 
    595 unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,
    596                                              int  src_pixels_per_line,
    597                                              int  xoffset,
    598                                              int  yoffset,
    599                                              const uint8_t *dst_ptr,
    600                                              int dst_pixels_per_line,
    601                                              unsigned int *sse,
    602                                              const uint8_t *second_pred) {
    603   uint8_t temp2[20 * 16];
    604   const int16_t *hfilter, *vfilter;
    605   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4);  // compound pred buffer
    606   uint16_t fdata3[5 * 4];  // Temp data buffer used in filtering
    607 
    608   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    609   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    610 
    611   // First filter 1d Horizontal
    612   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    613                                     1, 5, 4, hfilter);
    614 
    615   // Now filter Verticaly
    616   var_filter_block2d_bil_second_pass(fdata3, temp2, 4,  4,  4,  4, vfilter);
    617   vp9_comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);
    618   return vp9_variance4x4(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
    619 }
    620 
    621 unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
    622                                          int  src_pixels_per_line,
    623                                          int  xoffset,
    624                                          int  yoffset,
    625                                          const uint8_t *dst_ptr,
    626                                          int dst_pixels_per_line,
    627                                          unsigned int *sse) {
    628   uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
    629   uint8_t temp2[20 * 16];
    630   const int16_t *hfilter, *vfilter;
    631 
    632   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    633   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    634 
    635   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    636                                     1, 9, 8, hfilter);
    637   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
    638 
    639   return vp9_variance8x8(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
    640 }
    641 
    642 unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,
    643                                              int  src_pixels_per_line,
    644                                              int  xoffset,
    645                                              int  yoffset,
    646                                              const uint8_t *dst_ptr,
    647                                              int dst_pixels_per_line,
    648                                              unsigned int *sse,
    649                                              const uint8_t *second_pred) {
    650   uint16_t fdata3[9 * 8];  // Temp data buffer used in filtering
    651   uint8_t temp2[20 * 16];
    652   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8);  // compound pred buffer
    653   const int16_t *hfilter, *vfilter;
    654 
    655   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    656   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    657 
    658   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    659                                     1, 9, 8, hfilter);
    660   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
    661   vp9_comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);
    662   return vp9_variance8x8(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
    663 }
    664 
    665 unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
    666                                            int  src_pixels_per_line,
    667                                            int  xoffset,
    668                                            int  yoffset,
    669                                            const uint8_t *dst_ptr,
    670                                            int dst_pixels_per_line,
    671                                            unsigned int *sse) {
    672   uint16_t fdata3[17 * 16];  // Temp data buffer used in filtering
    673   uint8_t temp2[20 * 16];
    674   const int16_t *hfilter, *vfilter;
    675 
    676   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    677   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    678 
    679   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    680                                     1, 17, 16, hfilter);
    681   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
    682 
    683   return vp9_variance16x16(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
    684 }
    685 
    686 unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,
    687                                                int  src_pixels_per_line,
    688                                                int  xoffset,
    689                                                int  yoffset,
    690                                                const uint8_t *dst_ptr,
    691                                                int dst_pixels_per_line,
    692                                                unsigned int *sse,
    693                                                const uint8_t *second_pred) {
    694   uint16_t fdata3[17 * 16];
    695   uint8_t temp2[20 * 16];
    696   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16);  // compound pred buffer
    697   const int16_t *hfilter, *vfilter;
    698 
    699   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    700   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    701 
    702   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    703                                     1, 17, 16, hfilter);
    704   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
    705 
    706   vp9_comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);
    707   return vp9_variance16x16(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
    708 }
    709 
    710 unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
    711                                            int  src_pixels_per_line,
    712                                            int  xoffset,
    713                                            int  yoffset,
    714                                            const uint8_t *dst_ptr,
    715                                            int dst_pixels_per_line,
    716                                            unsigned int *sse) {
    717   uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
    718   uint8_t temp2[68 * 64];
    719   const int16_t *hfilter, *vfilter;
    720 
    721   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    722   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    723 
    724   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    725                                     1, 65, 64, hfilter);
    726   var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
    727 
    728   return vp9_variance64x64(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
    729 }
    730 
    731 unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,
    732                                                int  src_pixels_per_line,
    733                                                int  xoffset,
    734                                                int  yoffset,
    735                                                const uint8_t *dst_ptr,
    736                                                int dst_pixels_per_line,
    737                                                unsigned int *sse,
    738                                                const uint8_t *second_pred) {
    739   uint16_t fdata3[65 * 64];  // Temp data buffer used in filtering
    740   uint8_t temp2[68 * 64];
    741   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64);  // compound pred buffer
    742   const int16_t *hfilter, *vfilter;
    743 
    744   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    745   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    746 
    747   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    748                                     1, 65, 64, hfilter);
    749   var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
    750   vp9_comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);
    751   return vp9_variance64x64(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
    752 }
    753 
    754 unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
    755                                            int  src_pixels_per_line,
    756                                            int  xoffset,
    757                                            int  yoffset,
    758                                            const uint8_t *dst_ptr,
    759                                            int dst_pixels_per_line,
    760                                            unsigned int *sse) {
    761   uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
    762   uint8_t temp2[36 * 32];
    763   const int16_t *hfilter, *vfilter;
    764 
    765   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    766   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    767 
    768   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    769                                     1, 33, 32, hfilter);
    770   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
    771 
    772   return vp9_variance32x32(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
    773 }
    774 
    775 unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,
    776                                                int  src_pixels_per_line,
    777                                                int  xoffset,
    778                                                int  yoffset,
    779                                                const uint8_t *dst_ptr,
    780                                                int dst_pixels_per_line,
    781                                                unsigned int *sse,
    782                                                const uint8_t *second_pred) {
    783   uint16_t fdata3[33 * 32];  // Temp data buffer used in filtering
    784   uint8_t temp2[36 * 32];
    785   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32);  // compound pred buffer
    786   const int16_t *hfilter, *vfilter;
    787 
    788   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    789   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    790 
    791   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    792                                     1, 33, 32, hfilter);
    793   var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
    794   vp9_comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);
    795   return vp9_variance32x32(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
    796 }
    797 
    798 unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
    799                                               int  source_stride,
    800                                               const uint8_t *ref_ptr,
    801                                               int  recon_stride,
    802                                               unsigned int *sse) {
    803   return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
    804                                        ref_ptr, recon_stride, sse);
    805 }
    806 
    807 unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr,
    808                                               int  source_stride,
    809                                               const uint8_t *ref_ptr,
    810                                               int  recon_stride,
    811                                               unsigned int *sse) {
    812   return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
    813                                        ref_ptr, recon_stride, sse);
    814 }
    815 
    816 unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr,
    817                                               int  source_stride,
    818                                               const uint8_t *ref_ptr,
    819                                               int  recon_stride,
    820                                               unsigned int *sse) {
    821   return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 0,
    822                                        ref_ptr, recon_stride, sse);
    823 }
    824 
    825 unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr,
    826                                               int  source_stride,
    827                                               const uint8_t *ref_ptr,
    828                                               int  recon_stride,
    829                                               unsigned int *sse) {
    830   return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
    831                                        ref_ptr, recon_stride, sse);
    832 }
    833 
    834 unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr,
    835                                               int  source_stride,
    836                                               const uint8_t *ref_ptr,
    837                                               int  recon_stride,
    838                                               unsigned int *sse) {
    839   return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
    840                                        ref_ptr, recon_stride, sse);
    841 }
    842 
    843 unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr,
    844                                               int  source_stride,
    845                                               const uint8_t *ref_ptr,
    846                                               int  recon_stride,
    847                                               unsigned int *sse) {
    848   return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 0, 8,
    849                                        ref_ptr, recon_stride, sse);
    850 }
    851 
    852 unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr,
    853                                                int  source_stride,
    854                                                const uint8_t *ref_ptr,
    855                                                int  recon_stride,
    856                                                unsigned int *sse) {
    857   return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
    858                                        ref_ptr, recon_stride, sse);
    859 }
    860 
    861 unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr,
    862                                                int  source_stride,
    863                                                const uint8_t *ref_ptr,
    864                                                int  recon_stride,
    865                                                unsigned int *sse) {
    866   return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
    867                                        ref_ptr, recon_stride, sse);
    868 }
    869 
    870 unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr,
    871                                                int  source_stride,
    872                                                const uint8_t *ref_ptr,
    873                                                int  recon_stride,
    874                                                unsigned int *sse) {
    875   return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 8,
    876                                        ref_ptr, recon_stride, sse);
    877 }
    878 
    879 unsigned int vp9_sub_pixel_mse16x16_c(const uint8_t *src_ptr,
    880                                       int  src_pixels_per_line,
    881                                       int  xoffset,
    882                                       int  yoffset,
    883                                       const uint8_t *dst_ptr,
    884                                       int dst_pixels_per_line,
    885                                       unsigned int *sse) {
    886   vp9_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line,
    887                                 xoffset, yoffset, dst_ptr,
    888                                 dst_pixels_per_line, sse);
    889   return *sse;
    890 }
    891 
    892 unsigned int vp9_sub_pixel_mse32x32_c(const uint8_t *src_ptr,
    893                                       int  src_pixels_per_line,
    894                                       int  xoffset,
    895                                       int  yoffset,
    896                                       const uint8_t *dst_ptr,
    897                                       int dst_pixels_per_line,
    898                                       unsigned int *sse) {
    899   vp9_sub_pixel_variance32x32_c(src_ptr, src_pixels_per_line,
    900                                 xoffset, yoffset, dst_ptr,
    901                                 dst_pixels_per_line, sse);
    902   return *sse;
    903 }
    904 
    905 unsigned int vp9_sub_pixel_mse64x64_c(const uint8_t *src_ptr,
    906                                       int  src_pixels_per_line,
    907                                       int  xoffset,
    908                                       int  yoffset,
    909                                       const uint8_t *dst_ptr,
    910                                       int dst_pixels_per_line,
    911                                       unsigned int *sse) {
    912   vp9_sub_pixel_variance64x64_c(src_ptr, src_pixels_per_line,
    913                                 xoffset, yoffset, dst_ptr,
    914                                 dst_pixels_per_line, sse);
    915   return *sse;
    916 }
    917 
    918 unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
    919                                           int  src_pixels_per_line,
    920                                           int  xoffset,
    921                                           int  yoffset,
    922                                           const uint8_t *dst_ptr,
    923                                           int dst_pixels_per_line,
    924                                           unsigned int *sse) {
    925   uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
    926   uint8_t temp2[20 * 16];
    927   const int16_t *hfilter, *vfilter;
    928 
    929   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    930   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    931 
    932   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    933                                     1, 9, 16, hfilter);
    934   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
    935 
    936   return vp9_variance16x8(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
    937 }
    938 
    939 unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,
    940                                               int  src_pixels_per_line,
    941                                               int  xoffset,
    942                                               int  yoffset,
    943                                               const uint8_t *dst_ptr,
    944                                               int dst_pixels_per_line,
    945                                               unsigned int *sse,
    946                                               const uint8_t *second_pred) {
    947   uint16_t fdata3[16 * 9];  // Temp data buffer used in filtering
    948   uint8_t temp2[20 * 16];
    949   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8);  // compound pred buffer
    950   const int16_t *hfilter, *vfilter;
    951 
    952   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    953   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    954 
    955   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    956                                     1, 9, 16, hfilter);
    957   var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
    958   vp9_comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);
    959   return vp9_variance16x8(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
    960 }
    961 
    962 unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
    963                                           int  src_pixels_per_line,
    964                                           int  xoffset,
    965                                           int  yoffset,
    966                                           const uint8_t *dst_ptr,
    967                                           int dst_pixels_per_line,
    968                                           unsigned int *sse) {
    969   uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
    970   uint8_t temp2[20 * 16];
    971   const int16_t *hfilter, *vfilter;
    972 
    973   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    974   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    975 
    976   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
    977                                     1, 17, 8, hfilter);
    978   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
    979 
    980   return vp9_variance8x16(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
    981 }
    982 
    983 unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,
    984                                               int  src_pixels_per_line,
    985                                               int  xoffset,
    986                                               int  yoffset,
    987                                               const uint8_t *dst_ptr,
    988                                               int dst_pixels_per_line,
    989                                               unsigned int *sse,
    990                                               const uint8_t *second_pred) {
    991   uint16_t fdata3[9 * 16];  // Temp data buffer used in filtering
    992   uint8_t temp2[20 * 16];
    993   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16);  // compound pred buffer
    994   const int16_t *hfilter, *vfilter;
    995 
    996   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
    997   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
    998 
    999   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1000                                     1, 17, 8, hfilter);
   1001   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
   1002   vp9_comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);
   1003   return vp9_variance8x16(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
   1004 }
   1005 
   1006 unsigned int vp9_sub_pixel_variance8x4_c(const uint8_t *src_ptr,
   1007                                          int  src_pixels_per_line,
   1008                                          int  xoffset,
   1009                                          int  yoffset,
   1010                                          const uint8_t *dst_ptr,
   1011                                          int dst_pixels_per_line,
   1012                                          unsigned int *sse) {
   1013   uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
   1014   uint8_t temp2[20 * 16];
   1015   const int16_t *hfilter, *vfilter;
   1016 
   1017   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1018   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1019 
   1020   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1021                                     1, 5, 8, hfilter);
   1022   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
   1023 
   1024   return vp9_variance8x4(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
   1025 }
   1026 
   1027 unsigned int vp9_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr,
   1028                                              int  src_pixels_per_line,
   1029                                              int  xoffset,
   1030                                              int  yoffset,
   1031                                              const uint8_t *dst_ptr,
   1032                                              int dst_pixels_per_line,
   1033                                              unsigned int *sse,
   1034                                              const uint8_t *second_pred) {
   1035   uint16_t fdata3[8 * 5];  // Temp data buffer used in filtering
   1036   uint8_t temp2[20 * 16];
   1037   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 4);  // compound pred buffer
   1038   const int16_t *hfilter, *vfilter;
   1039 
   1040   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1041   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1042 
   1043   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1044                                     1, 5, 8, hfilter);
   1045   var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 4, 8, vfilter);
   1046   vp9_comp_avg_pred(temp3, second_pred, 8, 4, temp2, 8);
   1047   return vp9_variance8x4(temp3, 8, dst_ptr, dst_pixels_per_line, sse);
   1048 }
   1049 
   1050 unsigned int vp9_sub_pixel_variance4x8_c(const uint8_t *src_ptr,
   1051                                          int  src_pixels_per_line,
   1052                                          int  xoffset,
   1053                                          int  yoffset,
   1054                                          const uint8_t *dst_ptr,
   1055                                          int dst_pixels_per_line,
   1056                                          unsigned int *sse) {
   1057   uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
   1058   // FIXME(jingning,rbultje): this temp2 buffer probably doesn't need to be
   1059   // of this big? same issue appears in all other block size settings.
   1060   uint8_t temp2[20 * 16];
   1061   const int16_t *hfilter, *vfilter;
   1062 
   1063   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1064   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1065 
   1066   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1067                                     1, 9, 4, hfilter);
   1068   var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
   1069 
   1070   return vp9_variance4x8(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
   1071 }
   1072 
   1073 unsigned int vp9_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr,
   1074                                              int  src_pixels_per_line,
   1075                                              int  xoffset,
   1076                                              int  yoffset,
   1077                                              const uint8_t *dst_ptr,
   1078                                              int dst_pixels_per_line,
   1079                                              unsigned int *sse,
   1080                                              const uint8_t *second_pred) {
   1081   uint16_t fdata3[5 * 8];  // Temp data buffer used in filtering
   1082   uint8_t temp2[20 * 16];
   1083   DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 8);  // compound pred buffer
   1084   const int16_t *hfilter, *vfilter;
   1085 
   1086   hfilter = BILINEAR_FILTERS_2TAP(xoffset);
   1087   vfilter = BILINEAR_FILTERS_2TAP(yoffset);
   1088 
   1089   var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
   1090                                     1, 9, 4, hfilter);
   1091   var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 8, 4, vfilter);
   1092   vp9_comp_avg_pred(temp3, second_pred, 4, 8, temp2, 4);
   1093   return vp9_variance4x8(temp3, 4, dst_ptr, dst_pixels_per_line, sse);
   1094 }
   1095 
   1096 
   1097 void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
   1098                        int height, const uint8_t *ref, int ref_stride) {
   1099   int i, j;
   1100 
   1101   for (i = 0; i < height; i++) {
   1102     for (j = 0; j < width; j++) {
   1103       int tmp;
   1104       tmp = pred[j] + ref[j];
   1105       comp_pred[j] = (tmp + 1) >> 1;
   1106     }
   1107     comp_pred += width;
   1108     pred += width;
   1109     ref += ref_stride;
   1110   }
   1111 }
   1112