Home | History | Annotate | Download | only in common
      1 /*
      2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  */
     11 
     12 #include <assert.h>
     13 #include <string.h>
     14 
     15 #include "config/aom_dsp_rtcd.h"
     16 #include "config/av1_rtcd.h"
     17 
     18 #include "av1/common/blockd.h"
     19 #include "av1/common/convolve.h"
     20 #include "av1/common/filter.h"
     21 #include "av1/common/onyxc_int.h"
     22 #include "av1/common/resize.h"
     23 #include "aom_dsp/aom_dsp_common.h"
     24 #include "aom_ports/mem.h"
     25 
     26 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
     27                              int dst_stride, int w, int h,
     28                              const int16_t *x_filters, int x0_qn,
     29                              int x_step_qn) {
     30   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
     31   for (int y = 0; y < h; ++y) {
     32     int x_qn = x0_qn;
     33     for (int x = 0; x < w; ++x) {
     34       const uint8_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
     35       const int x_filter_idx =
     36           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
     37       assert(x_filter_idx <= RS_SUBPEL_MASK);
     38       const int16_t *const x_filter =
     39           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
     40       int sum = 0;
     41       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
     42         sum += src_x[k] * x_filter[k];
     43       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
     44       x_qn += x_step_qn;
     45     }
     46     src += src_stride;
     47     dst += dst_stride;
     48   }
     49 }
     50 
     51 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
     52                                     uint16_t *dst, int dst_stride, int w, int h,
     53                                     const int16_t *x_filters, int x0_qn,
     54                                     int x_step_qn, int bd) {
     55   src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
     56   for (int y = 0; y < h; ++y) {
     57     int x_qn = x0_qn;
     58     for (int x = 0; x < w; ++x) {
     59       const uint16_t *const src_x = &src[x_qn >> RS_SCALE_SUBPEL_BITS];
     60       const int x_filter_idx =
     61           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
     62       assert(x_filter_idx <= RS_SUBPEL_MASK);
     63       const int16_t *const x_filter =
     64           &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
     65       int sum = 0;
     66       for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
     67         sum += src_x[k] * x_filter[k];
     68       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
     69       x_qn += x_step_qn;
     70     }
     71     src += src_stride;
     72     dst += dst_stride;
     73   }
     74 }
     75 
     76 void av1_convolve_2d_sobel_y_c(const uint8_t *src, int src_stride, double *dst,
     77                                int dst_stride, int w, int h, int dir,
     78                                double norm) {
     79   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
     80   DECLARE_ALIGNED(256, static const int16_t, sobel_a[3]) = { 1, 0, -1 };
     81   DECLARE_ALIGNED(256, static const int16_t, sobel_b[3]) = { 1, 2, 1 };
     82   const int taps = 3;
     83   int im_h = h + taps - 1;
     84   int im_stride = w;
     85   const int fo_vert = 1;
     86   const int fo_horiz = 1;
     87 
     88   // horizontal filter
     89   const uint8_t *src_horiz = src - fo_vert * src_stride;
     90   const int16_t *x_filter = dir ? sobel_a : sobel_b;
     91   for (int y = 0; y < im_h; ++y) {
     92     for (int x = 0; x < w; ++x) {
     93       int16_t sum = 0;
     94       for (int k = 0; k < taps; ++k) {
     95         sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
     96       }
     97       im_block[y * im_stride + x] = sum;
     98     }
     99   }
    100 
    101   // vertical filter
    102   int16_t *src_vert = im_block + fo_vert * im_stride;
    103   const int16_t *y_filter = dir ? sobel_b : sobel_a;
    104   for (int y = 0; y < h; ++y) {
    105     for (int x = 0; x < w; ++x) {
    106       int16_t sum = 0;
    107       for (int k = 0; k < taps; ++k) {
    108         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
    109       }
    110       dst[y * dst_stride + x] = sum * norm;
    111     }
    112   }
    113 }
    114 
    115 void av1_convolve_2d_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
    116                           int dst_stride, int w, int h,
    117                           const InterpFilterParams *filter_params_x,
    118                           const InterpFilterParams *filter_params_y,
    119                           const int subpel_x_q4, const int subpel_y_q4,
    120                           ConvolveParams *conv_params) {
    121   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    122   int im_h = h + filter_params_y->taps - 1;
    123   int im_stride = w;
    124   const int fo_vert = filter_params_y->taps / 2 - 1;
    125   const int fo_horiz = filter_params_x->taps / 2 - 1;
    126   const int bd = 8;
    127   const int bits =
    128       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    129 
    130   // horizontal filter
    131   const uint8_t *src_horiz = src - fo_vert * src_stride;
    132   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    133       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    134   for (int y = 0; y < im_h; ++y) {
    135     for (int x = 0; x < w; ++x) {
    136       int32_t sum = (1 << (bd + FILTER_BITS - 1));
    137       for (int k = 0; k < filter_params_x->taps; ++k) {
    138         sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
    139       }
    140       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
    141       im_block[y * im_stride + x] =
    142           (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    143     }
    144   }
    145 
    146   // vertical filter
    147   int16_t *src_vert = im_block + fo_vert * im_stride;
    148   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    149       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    150   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    151   for (int y = 0; y < h; ++y) {
    152     for (int x = 0; x < w; ++x) {
    153       int32_t sum = 1 << offset_bits;
    154       for (int k = 0; k < filter_params_y->taps; ++k) {
    155         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
    156       }
    157       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
    158       int16_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
    159                     ((1 << (offset_bits - conv_params->round_1)) +
    160                      (1 << (offset_bits - conv_params->round_1 - 1)));
    161       dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    162     }
    163   }
    164 }
    165 
    166 void av1_convolve_y_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
    167                          int dst_stride, int w, int h,
    168                          const InterpFilterParams *filter_params_x,
    169                          const InterpFilterParams *filter_params_y,
    170                          const int subpel_x_q4, const int subpel_y_q4,
    171                          ConvolveParams *conv_params) {
    172   const int fo_vert = filter_params_y->taps / 2 - 1;
    173   (void)filter_params_x;
    174   (void)subpel_x_q4;
    175   (void)conv_params;
    176 
    177   assert(conv_params->round_0 <= FILTER_BITS);
    178   assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
    179          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
    180 
    181   // vertical filter
    182   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    183       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    184   for (int y = 0; y < h; ++y) {
    185     for (int x = 0; x < w; ++x) {
    186       int32_t res = 0;
    187       for (int k = 0; k < filter_params_y->taps; ++k) {
    188         res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
    189       }
    190       dst[y * dst_stride + x] =
    191           clip_pixel(ROUND_POWER_OF_TWO(res, FILTER_BITS));
    192     }
    193   }
    194 }
    195 
    196 void av1_convolve_x_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
    197                          int dst_stride, int w, int h,
    198                          const InterpFilterParams *filter_params_x,
    199                          const InterpFilterParams *filter_params_y,
    200                          const int subpel_x_q4, const int subpel_y_q4,
    201                          ConvolveParams *conv_params) {
    202   const int fo_horiz = filter_params_x->taps / 2 - 1;
    203   const int bits = FILTER_BITS - conv_params->round_0;
    204   (void)filter_params_y;
    205   (void)subpel_y_q4;
    206   (void)conv_params;
    207 
    208   assert(bits >= 0);
    209   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
    210          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
    211 
    212   // horizontal filter
    213   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    214       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    215 
    216   for (int y = 0; y < h; ++y) {
    217     for (int x = 0; x < w; ++x) {
    218       int32_t res = 0;
    219       for (int k = 0; k < filter_params_x->taps; ++k) {
    220         res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
    221       }
    222       res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
    223       dst[y * dst_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(res, bits));
    224     }
    225   }
    226 }
    227 
    228 void av1_convolve_2d_copy_sr_c(const uint8_t *src, int src_stride, uint8_t *dst,
    229                                int dst_stride, int w, int h,
    230                                const InterpFilterParams *filter_params_x,
    231                                const InterpFilterParams *filter_params_y,
    232                                const int subpel_x_q4, const int subpel_y_q4,
    233                                ConvolveParams *conv_params) {
    234   (void)filter_params_x;
    235   (void)filter_params_y;
    236   (void)subpel_x_q4;
    237   (void)subpel_y_q4;
    238   (void)conv_params;
    239 
    240   for (int y = 0; y < h; ++y) {
    241     memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
    242   }
    243 }
    244 
    245 void av1_dist_wtd_convolve_2d_c(const uint8_t *src, int src_stride,
    246                                 uint8_t *dst8, int dst8_stride, int w, int h,
    247                                 const InterpFilterParams *filter_params_x,
    248                                 const InterpFilterParams *filter_params_y,
    249                                 const int subpel_x_q4, const int subpel_y_q4,
    250                                 ConvolveParams *conv_params) {
    251   CONV_BUF_TYPE *dst = conv_params->dst;
    252   int dst_stride = conv_params->dst_stride;
    253   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    254   int im_h = h + filter_params_y->taps - 1;
    255   int im_stride = w;
    256   const int fo_vert = filter_params_y->taps / 2 - 1;
    257   const int fo_horiz = filter_params_x->taps / 2 - 1;
    258   const int bd = 8;
    259   const int round_bits =
    260       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    261 
    262   // horizontal filter
    263   const uint8_t *src_horiz = src - fo_vert * src_stride;
    264   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    265       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    266   for (int y = 0; y < im_h; ++y) {
    267     for (int x = 0; x < w; ++x) {
    268       int32_t sum = (1 << (bd + FILTER_BITS - 1));
    269       for (int k = 0; k < filter_params_x->taps; ++k) {
    270         sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
    271       }
    272       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
    273       im_block[y * im_stride + x] =
    274           (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    275     }
    276   }
    277 
    278   // vertical filter
    279   int16_t *src_vert = im_block + fo_vert * im_stride;
    280   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    281       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    282   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    283   for (int y = 0; y < h; ++y) {
    284     for (int x = 0; x < w; ++x) {
    285       int32_t sum = 1 << offset_bits;
    286       for (int k = 0; k < filter_params_y->taps; ++k) {
    287         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
    288       }
    289       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
    290       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
    291       if (conv_params->do_average) {
    292         int32_t tmp = dst[y * dst_stride + x];
    293         if (conv_params->use_dist_wtd_comp_avg) {
    294           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    295           tmp = tmp >> DIST_PRECISION_BITS;
    296         } else {
    297           tmp += res;
    298           tmp = tmp >> 1;
    299         }
    300         tmp -= (1 << (offset_bits - conv_params->round_1)) +
    301                (1 << (offset_bits - conv_params->round_1 - 1));
    302         dst8[y * dst8_stride + x] =
    303             clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
    304       } else {
    305         dst[y * dst_stride + x] = res;
    306       }
    307     }
    308   }
    309 }
    310 
    311 void av1_dist_wtd_convolve_y_c(const uint8_t *src, int src_stride,
    312                                uint8_t *dst8, int dst8_stride, int w, int h,
    313                                const InterpFilterParams *filter_params_x,
    314                                const InterpFilterParams *filter_params_y,
    315                                const int subpel_x_q4, const int subpel_y_q4,
    316                                ConvolveParams *conv_params) {
    317   CONV_BUF_TYPE *dst = conv_params->dst;
    318   int dst_stride = conv_params->dst_stride;
    319   const int fo_vert = filter_params_y->taps / 2 - 1;
    320   const int bits = FILTER_BITS - conv_params->round_0;
    321   const int bd = 8;
    322   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    323   const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
    324                            (1 << (offset_bits - conv_params->round_1 - 1));
    325   const int round_bits =
    326       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    327   (void)filter_params_x;
    328   (void)subpel_x_q4;
    329 
    330   // vertical filter
    331   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    332       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    333   for (int y = 0; y < h; ++y) {
    334     for (int x = 0; x < w; ++x) {
    335       int32_t res = 0;
    336       for (int k = 0; k < filter_params_y->taps; ++k) {
    337         res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
    338       }
    339       res *= (1 << bits);
    340       res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
    341 
    342       if (conv_params->do_average) {
    343         int32_t tmp = dst[y * dst_stride + x];
    344         if (conv_params->use_dist_wtd_comp_avg) {
    345           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    346           tmp = tmp >> DIST_PRECISION_BITS;
    347         } else {
    348           tmp += res;
    349           tmp = tmp >> 1;
    350         }
    351         tmp -= round_offset;
    352         dst8[y * dst8_stride + x] =
    353             clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
    354       } else {
    355         dst[y * dst_stride + x] = res;
    356       }
    357     }
    358   }
    359 }
    360 
    361 void av1_dist_wtd_convolve_x_c(const uint8_t *src, int src_stride,
    362                                uint8_t *dst8, int dst8_stride, int w, int h,
    363                                const InterpFilterParams *filter_params_x,
    364                                const InterpFilterParams *filter_params_y,
    365                                const int subpel_x_q4, const int subpel_y_q4,
    366                                ConvolveParams *conv_params) {
    367   CONV_BUF_TYPE *dst = conv_params->dst;
    368   int dst_stride = conv_params->dst_stride;
    369   const int fo_horiz = filter_params_x->taps / 2 - 1;
    370   const int bits = FILTER_BITS - conv_params->round_1;
    371   const int bd = 8;
    372   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    373   const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
    374                            (1 << (offset_bits - conv_params->round_1 - 1));
    375   const int round_bits =
    376       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    377   (void)filter_params_y;
    378   (void)subpel_y_q4;
    379 
    380   // horizontal filter
    381   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    382       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    383   for (int y = 0; y < h; ++y) {
    384     for (int x = 0; x < w; ++x) {
    385       int32_t res = 0;
    386       for (int k = 0; k < filter_params_x->taps; ++k) {
    387         res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
    388       }
    389       res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
    390       res += round_offset;
    391 
    392       if (conv_params->do_average) {
    393         int32_t tmp = dst[y * dst_stride + x];
    394         if (conv_params->use_dist_wtd_comp_avg) {
    395           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    396           tmp = tmp >> DIST_PRECISION_BITS;
    397         } else {
    398           tmp += res;
    399           tmp = tmp >> 1;
    400         }
    401         tmp -= round_offset;
    402         dst8[y * dst8_stride + x] =
    403             clip_pixel(ROUND_POWER_OF_TWO(tmp, round_bits));
    404       } else {
    405         dst[y * dst_stride + x] = res;
    406       }
    407     }
    408   }
    409 }
    410 
    411 void av1_dist_wtd_convolve_2d_copy_c(
    412     const uint8_t *src, int src_stride, uint8_t *dst8, int dst8_stride, int w,
    413     int h, const InterpFilterParams *filter_params_x,
    414     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    415     const int subpel_y_q4, ConvolveParams *conv_params) {
    416   CONV_BUF_TYPE *dst = conv_params->dst;
    417   int dst_stride = conv_params->dst_stride;
    418   const int bits =
    419       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
    420   const int bd = 8;
    421   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    422   const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
    423                            (1 << (offset_bits - conv_params->round_1 - 1));
    424   (void)filter_params_x;
    425   (void)filter_params_y;
    426   (void)subpel_x_q4;
    427   (void)subpel_y_q4;
    428 
    429   for (int y = 0; y < h; ++y) {
    430     for (int x = 0; x < w; ++x) {
    431       CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
    432       res += round_offset;
    433 
    434       if (conv_params->do_average) {
    435         int32_t tmp = dst[y * dst_stride + x];
    436         if (conv_params->use_dist_wtd_comp_avg) {
    437           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    438           tmp = tmp >> DIST_PRECISION_BITS;
    439         } else {
    440           tmp += res;
    441           tmp = tmp >> 1;
    442         }
    443         tmp -= round_offset;
    444         dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
    445       } else {
    446         dst[y * dst_stride + x] = res;
    447       }
    448     }
    449   }
    450 }
    451 
    452 void av1_convolve_2d_scale_c(const uint8_t *src, int src_stride, uint8_t *dst8,
    453                              int dst8_stride, int w, int h,
    454                              const InterpFilterParams *filter_params_x,
    455                              const InterpFilterParams *filter_params_y,
    456                              const int subpel_x_qn, const int x_step_qn,
    457                              const int subpel_y_qn, const int y_step_qn,
    458                              ConvolveParams *conv_params) {
    459   int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
    460   int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
    461              filter_params_y->taps;
    462   CONV_BUF_TYPE *dst16 = conv_params->dst;
    463   const int dst16_stride = conv_params->dst_stride;
    464   const int bits =
    465       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    466   assert(bits >= 0);
    467   int im_stride = w;
    468   const int fo_vert = filter_params_y->taps / 2 - 1;
    469   const int fo_horiz = filter_params_x->taps / 2 - 1;
    470   const int bd = 8;
    471 
    472   // horizontal filter
    473   const uint8_t *src_horiz = src - fo_vert * src_stride;
    474   for (int y = 0; y < im_h; ++y) {
    475     int x_qn = subpel_x_qn;
    476     for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
    477       const uint8_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
    478       const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    479       assert(x_filter_idx < SUBPEL_SHIFTS);
    480       const int16_t *x_filter =
    481           av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
    482       int32_t sum = (1 << (bd + FILTER_BITS - 1));
    483       for (int k = 0; k < filter_params_x->taps; ++k) {
    484         sum += x_filter[k] * src_x[k - fo_horiz];
    485       }
    486       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
    487       im_block[y * im_stride + x] =
    488           (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    489     }
    490     src_horiz += src_stride;
    491   }
    492 
    493   // vertical filter
    494   int16_t *src_vert = im_block + fo_vert * im_stride;
    495   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    496   for (int x = 0; x < w; ++x) {
    497     int y_qn = subpel_y_qn;
    498     for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
    499       const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
    500       const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    501       assert(y_filter_idx < SUBPEL_SHIFTS);
    502       const int16_t *y_filter =
    503           av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
    504       int32_t sum = 1 << offset_bits;
    505       for (int k = 0; k < filter_params_y->taps; ++k) {
    506         sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
    507       }
    508       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
    509       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
    510       if (conv_params->is_compound) {
    511         if (conv_params->do_average) {
    512           int32_t tmp = dst16[y * dst16_stride + x];
    513           if (conv_params->use_dist_wtd_comp_avg) {
    514             tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    515             tmp = tmp >> DIST_PRECISION_BITS;
    516           } else {
    517             tmp += res;
    518             tmp = tmp >> 1;
    519           }
    520           /* Subtract round offset and convolve round */
    521           tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
    522                        (1 << (offset_bits - conv_params->round_1 - 1)));
    523           dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
    524         } else {
    525           dst16[y * dst16_stride + x] = res;
    526         }
    527       } else {
    528         /* Subtract round offset and convolve round */
    529         int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
    530                              (1 << (offset_bits - conv_params->round_1 - 1)));
    531         dst8[y * dst8_stride + x] = clip_pixel(ROUND_POWER_OF_TWO(tmp, bits));
    532       }
    533     }
    534     src_vert++;
    535   }
    536 }
    537 
    538 static void convolve_2d_scale_wrapper(
    539     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
    540     int h, const InterpFilterParams *filter_params_x,
    541     const InterpFilterParams *filter_params_y, const int subpel_x_qn,
    542     const int x_step_qn, const int subpel_y_qn, const int y_step_qn,
    543     ConvolveParams *conv_params) {
    544   if (conv_params->is_compound) {
    545     assert(conv_params->dst != NULL);
    546   }
    547   av1_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h, filter_params_x,
    548                         filter_params_y, subpel_x_qn, x_step_qn, subpel_y_qn,
    549                         y_step_qn, conv_params);
    550 }
    551 
// TODO(huisu@google.com): bilinear filtering only needs 2 taps in general. So
// we may create optimized code to do 2-tap filtering for all bilinear filtering
// usages, not just IntraBC.
    555 static void convolve_2d_for_intrabc(const uint8_t *src, int src_stride,
    556                                     uint8_t *dst, int dst_stride, int w, int h,
    557                                     int subpel_x_q4, int subpel_y_q4,
    558                                     ConvolveParams *conv_params) {
    559   const InterpFilterParams *filter_params_x =
    560       subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
    561   const InterpFilterParams *filter_params_y =
    562       subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
    563   if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
    564     av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
    565                          filter_params_x, filter_params_y, 0, 0, conv_params);
    566   } else if (subpel_x_q4 != 0) {
    567     av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
    568                         filter_params_y, 0, 0, conv_params);
    569   } else {
    570     av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
    571                         filter_params_y, 0, 0, conv_params);
    572   }
    573 }
    574 
    575 void av1_convolve_2d_facade(const uint8_t *src, int src_stride, uint8_t *dst,
    576                             int dst_stride, int w, int h,
    577                             InterpFilters interp_filters, const int subpel_x_q4,
    578                             int x_step_q4, const int subpel_y_q4, int y_step_q4,
    579                             int scaled, ConvolveParams *conv_params,
    580                             const struct scale_factors *sf, int is_intrabc) {
    581   assert(IMPLIES(is_intrabc, !scaled));
    582   (void)x_step_q4;
    583   (void)y_step_q4;
    584   (void)dst;
    585   (void)dst_stride;
    586 
    587   if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
    588     convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h, subpel_x_q4,
    589                             subpel_y_q4, conv_params);
    590     return;
    591   }
    592 
    593   InterpFilter filter_x = 0;
    594   InterpFilter filter_y = 0;
    595   const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
    596   const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
    597   if (need_filter_params_x)
    598     filter_x = av1_extract_interp_filter(interp_filters, 1);
    599   if (need_filter_params_y)
    600     filter_y = av1_extract_interp_filter(interp_filters, 0);
    601   const InterpFilterParams *filter_params_x =
    602       need_filter_params_x
    603           ? av1_get_interp_filter_params_with_block_size(filter_x, w)
    604           : NULL;
    605   const InterpFilterParams *filter_params_y =
    606       need_filter_params_y
    607           ? av1_get_interp_filter_params_with_block_size(filter_y, h)
    608           : NULL;
    609 
    610   if (scaled) {
    611     convolve_2d_scale_wrapper(src, src_stride, dst, dst_stride, w, h,
    612                               filter_params_x, filter_params_y, subpel_x_q4,
    613                               x_step_q4, subpel_y_q4, y_step_q4, conv_params);
    614   } else {
    615     sf->convolve[subpel_x_q4 != 0][subpel_y_q4 != 0][conv_params->is_compound](
    616         src, src_stride, dst, dst_stride, w, h, filter_params_x,
    617         filter_params_y, subpel_x_q4, subpel_y_q4, conv_params);
    618   }
    619 }
    620 
    621 void av1_highbd_convolve_2d_copy_sr_c(
    622     const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w,
    623     int h, const InterpFilterParams *filter_params_x,
    624     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    625     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
    626   (void)filter_params_x;
    627   (void)filter_params_y;
    628   (void)subpel_x_q4;
    629   (void)subpel_y_q4;
    630   (void)conv_params;
    631   (void)bd;
    632 
    633   for (int y = 0; y < h; ++y) {
    634     memmove(dst + y * dst_stride, src + y * src_stride, w * sizeof(src[0]));
    635   }
    636 }
    637 
    638 void av1_highbd_convolve_x_sr_c(const uint16_t *src, int src_stride,
    639                                 uint16_t *dst, int dst_stride, int w, int h,
    640                                 const InterpFilterParams *filter_params_x,
    641                                 const InterpFilterParams *filter_params_y,
    642                                 const int subpel_x_q4, const int subpel_y_q4,
    643                                 ConvolveParams *conv_params, int bd) {
    644   const int fo_horiz = filter_params_x->taps / 2 - 1;
    645   const int bits = FILTER_BITS - conv_params->round_0;
    646   (void)filter_params_y;
    647   (void)subpel_y_q4;
    648 
    649   assert(bits >= 0);
    650   assert((FILTER_BITS - conv_params->round_1) >= 0 ||
    651          ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
    652 
    653   // horizontal filter
    654   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    655       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    656   for (int y = 0; y < h; ++y) {
    657     for (int x = 0; x < w; ++x) {
    658       int32_t res = 0;
    659       for (int k = 0; k < filter_params_x->taps; ++k) {
    660         res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
    661       }
    662       res = ROUND_POWER_OF_TWO(res, conv_params->round_0);
    663       dst[y * dst_stride + x] =
    664           clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    665     }
    666   }
    667 }
    668 
    669 void av1_highbd_convolve_y_sr_c(const uint16_t *src, int src_stride,
    670                                 uint16_t *dst, int dst_stride, int w, int h,
    671                                 const InterpFilterParams *filter_params_x,
    672                                 const InterpFilterParams *filter_params_y,
    673                                 const int subpel_x_q4, const int subpel_y_q4,
    674                                 ConvolveParams *conv_params, int bd) {
    675   const int fo_vert = filter_params_y->taps / 2 - 1;
    676   (void)filter_params_x;
    677   (void)subpel_x_q4;
    678   (void)conv_params;
    679 
    680   assert(conv_params->round_0 <= FILTER_BITS);
    681   assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
    682          ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
    683   // vertical filter
    684   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    685       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    686   for (int y = 0; y < h; ++y) {
    687     for (int x = 0; x < w; ++x) {
    688       int32_t res = 0;
    689       for (int k = 0; k < filter_params_y->taps; ++k) {
    690         res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
    691       }
    692       dst[y * dst_stride + x] =
    693           clip_pixel_highbd(ROUND_POWER_OF_TWO(res, FILTER_BITS), bd);
    694     }
    695   }
    696 }
    697 
    698 void av1_highbd_convolve_2d_sr_c(const uint16_t *src, int src_stride,
    699                                  uint16_t *dst, int dst_stride, int w, int h,
    700                                  const InterpFilterParams *filter_params_x,
    701                                  const InterpFilterParams *filter_params_y,
    702                                  const int subpel_x_q4, const int subpel_y_q4,
    703                                  ConvolveParams *conv_params, int bd) {
    704   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    705   int im_h = h + filter_params_y->taps - 1;
    706   int im_stride = w;
    707   const int fo_vert = filter_params_y->taps / 2 - 1;
    708   const int fo_horiz = filter_params_x->taps / 2 - 1;
    709   const int bits =
    710       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    711   assert(bits >= 0);
    712 
    713   // horizontal filter
    714   const uint16_t *src_horiz = src - fo_vert * src_stride;
    715   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    716       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    717   for (int y = 0; y < im_h; ++y) {
    718     for (int x = 0; x < w; ++x) {
    719       int32_t sum = (1 << (bd + FILTER_BITS - 1));
    720       for (int k = 0; k < filter_params_x->taps; ++k) {
    721         sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
    722       }
    723       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
    724       im_block[y * im_stride + x] =
    725           ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    726     }
    727   }
    728 
    729   // vertical filter
    730   int16_t *src_vert = im_block + fo_vert * im_stride;
    731   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    732       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    733   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    734   for (int y = 0; y < h; ++y) {
    735     for (int x = 0; x < w; ++x) {
    736       int32_t sum = 1 << offset_bits;
    737       for (int k = 0; k < filter_params_y->taps; ++k) {
    738         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
    739       }
    740       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
    741       int32_t res = ROUND_POWER_OF_TWO(sum, conv_params->round_1) -
    742                     ((1 << (offset_bits - conv_params->round_1)) +
    743                      (1 << (offset_bits - conv_params->round_1 - 1)));
    744       dst[y * dst_stride + x] =
    745           clip_pixel_highbd(ROUND_POWER_OF_TWO(res, bits), bd);
    746     }
    747   }
    748 }
    749 
    750 void av1_highbd_dist_wtd_convolve_2d_c(
    751     const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    752     int w, int h, const InterpFilterParams *filter_params_x,
    753     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    754     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
    755   int x, y, k;
    756   int16_t im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
    757   CONV_BUF_TYPE *dst = conv_params->dst;
    758   int dst_stride = conv_params->dst_stride;
    759   int im_h = h + filter_params_y->taps - 1;
    760   int im_stride = w;
    761   const int fo_vert = filter_params_y->taps / 2 - 1;
    762   const int fo_horiz = filter_params_x->taps / 2 - 1;
    763   const int round_bits =
    764       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    765   assert(round_bits >= 0);
    766 
    767   // horizontal filter
    768   const uint16_t *src_horiz = src - fo_vert * src_stride;
    769   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    770       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    771   for (y = 0; y < im_h; ++y) {
    772     for (x = 0; x < w; ++x) {
    773       int32_t sum = (1 << (bd + FILTER_BITS - 1));
    774       for (k = 0; k < filter_params_x->taps; ++k) {
    775         sum += x_filter[k] * src_horiz[y * src_stride + x - fo_horiz + k];
    776       }
    777       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
    778       (void)bd;
    779       im_block[y * im_stride + x] =
    780           (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    781     }
    782   }
    783 
    784   // vertical filter
    785   int16_t *src_vert = im_block + fo_vert * im_stride;
    786   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    787   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    788       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    789   for (y = 0; y < h; ++y) {
    790     for (x = 0; x < w; ++x) {
    791       int32_t sum = 1 << offset_bits;
    792       for (k = 0; k < filter_params_y->taps; ++k) {
    793         sum += y_filter[k] * src_vert[(y - fo_vert + k) * im_stride + x];
    794       }
    795       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
    796       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
    797       if (conv_params->do_average) {
    798         int32_t tmp = dst[y * dst_stride + x];
    799         if (conv_params->use_dist_wtd_comp_avg) {
    800           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    801           tmp = tmp >> DIST_PRECISION_BITS;
    802         } else {
    803           tmp += res;
    804           tmp = tmp >> 1;
    805         }
    806         tmp -= (1 << (offset_bits - conv_params->round_1)) +
    807                (1 << (offset_bits - conv_params->round_1 - 1));
    808         dst16[y * dst16_stride + x] =
    809             clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    810       } else {
    811         dst[y * dst_stride + x] = res;
    812       }
    813     }
    814   }
    815 }
    816 
    817 void av1_highbd_dist_wtd_convolve_x_c(
    818     const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    819     int w, int h, const InterpFilterParams *filter_params_x,
    820     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    821     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
    822   CONV_BUF_TYPE *dst = conv_params->dst;
    823   int dst_stride = conv_params->dst_stride;
    824   const int fo_horiz = filter_params_x->taps / 2 - 1;
    825   const int bits = FILTER_BITS - conv_params->round_1;
    826   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    827   const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
    828                            (1 << (offset_bits - conv_params->round_1 - 1));
    829   const int round_bits =
    830       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    831   assert(round_bits >= 0);
    832   (void)filter_params_y;
    833   (void)subpel_y_q4;
    834   assert(bits >= 0);
    835   // horizontal filter
    836   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
    837       filter_params_x, subpel_x_q4 & SUBPEL_MASK);
    838   for (int y = 0; y < h; ++y) {
    839     for (int x = 0; x < w; ++x) {
    840       int32_t res = 0;
    841       for (int k = 0; k < filter_params_x->taps; ++k) {
    842         res += x_filter[k] * src[y * src_stride + x - fo_horiz + k];
    843       }
    844       res = (1 << bits) * ROUND_POWER_OF_TWO(res, conv_params->round_0);
    845       res += round_offset;
    846 
    847       if (conv_params->do_average) {
    848         int32_t tmp = dst[y * dst_stride + x];
    849         if (conv_params->use_dist_wtd_comp_avg) {
    850           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    851           tmp = tmp >> DIST_PRECISION_BITS;
    852         } else {
    853           tmp += res;
    854           tmp = tmp >> 1;
    855         }
    856         tmp -= round_offset;
    857         dst16[y * dst16_stride + x] =
    858             clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    859       } else {
    860         dst[y * dst_stride + x] = res;
    861       }
    862     }
    863   }
    864 }
    865 
    866 void av1_highbd_dist_wtd_convolve_y_c(
    867     const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    868     int w, int h, const InterpFilterParams *filter_params_x,
    869     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    870     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
    871   CONV_BUF_TYPE *dst = conv_params->dst;
    872   int dst_stride = conv_params->dst_stride;
    873   const int fo_vert = filter_params_y->taps / 2 - 1;
    874   const int bits = FILTER_BITS - conv_params->round_0;
    875   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    876   const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
    877                            (1 << (offset_bits - conv_params->round_1 - 1));
    878   const int round_bits =
    879       2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    880   assert(round_bits >= 0);
    881   (void)filter_params_x;
    882   (void)subpel_x_q4;
    883   assert(bits >= 0);
    884   // vertical filter
    885   const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
    886       filter_params_y, subpel_y_q4 & SUBPEL_MASK);
    887   for (int y = 0; y < h; ++y) {
    888     for (int x = 0; x < w; ++x) {
    889       int32_t res = 0;
    890       for (int k = 0; k < filter_params_y->taps; ++k) {
    891         res += y_filter[k] * src[(y - fo_vert + k) * src_stride + x];
    892       }
    893       res *= (1 << bits);
    894       res = ROUND_POWER_OF_TWO(res, conv_params->round_1) + round_offset;
    895 
    896       if (conv_params->do_average) {
    897         int32_t tmp = dst[y * dst_stride + x];
    898         if (conv_params->use_dist_wtd_comp_avg) {
    899           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    900           tmp = tmp >> DIST_PRECISION_BITS;
    901         } else {
    902           tmp += res;
    903           tmp = tmp >> 1;
    904         }
    905         tmp -= round_offset;
    906         dst16[y * dst16_stride + x] =
    907             clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, round_bits), bd);
    908       } else {
    909         dst[y * dst_stride + x] = res;
    910       }
    911     }
    912   }
    913 }
    914 
    915 void av1_highbd_dist_wtd_convolve_2d_copy_c(
    916     const uint16_t *src, int src_stride, uint16_t *dst16, int dst16_stride,
    917     int w, int h, const InterpFilterParams *filter_params_x,
    918     const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    919     const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
    920   CONV_BUF_TYPE *dst = conv_params->dst;
    921   int dst_stride = conv_params->dst_stride;
    922   const int bits =
    923       FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
    924   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    925   const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
    926                            (1 << (offset_bits - conv_params->round_1 - 1));
    927   assert(bits >= 0);
    928   (void)filter_params_x;
    929   (void)filter_params_y;
    930   (void)subpel_x_q4;
    931   (void)subpel_y_q4;
    932 
    933   for (int y = 0; y < h; ++y) {
    934     for (int x = 0; x < w; ++x) {
    935       CONV_BUF_TYPE res = src[y * src_stride + x] << bits;
    936       res += round_offset;
    937       if (conv_params->do_average) {
    938         int32_t tmp = dst[y * dst_stride + x];
    939         if (conv_params->use_dist_wtd_comp_avg) {
    940           tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
    941           tmp = tmp >> DIST_PRECISION_BITS;
    942         } else {
    943           tmp += res;
    944           tmp = tmp >> 1;
    945         }
    946         tmp -= round_offset;
    947         dst16[y * dst16_stride + x] =
    948             clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
    949       } else {
    950         dst[y * dst_stride + x] = res;
    951       }
    952     }
    953   }
    954 }
    955 
    956 void av1_highbd_convolve_2d_scale_c(const uint16_t *src, int src_stride,
    957                                     uint16_t *dst, int dst_stride, int w, int h,
    958                                     const InterpFilterParams *filter_params_x,
    959                                     const InterpFilterParams *filter_params_y,
    960                                     const int subpel_x_qn, const int x_step_qn,
    961                                     const int subpel_y_qn, const int y_step_qn,
    962                                     ConvolveParams *conv_params, int bd) {
    963   int16_t im_block[(2 * MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE];
    964   int im_h = (((h - 1) * y_step_qn + subpel_y_qn) >> SCALE_SUBPEL_BITS) +
    965              filter_params_y->taps;
    966   int im_stride = w;
    967   const int fo_vert = filter_params_y->taps / 2 - 1;
    968   const int fo_horiz = filter_params_x->taps / 2 - 1;
    969   CONV_BUF_TYPE *dst16 = conv_params->dst;
    970   const int dst16_stride = conv_params->dst_stride;
    971   const int bits =
    972       FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
    973   assert(bits >= 0);
    974   // horizontal filter
    975   const uint16_t *src_horiz = src - fo_vert * src_stride;
    976   for (int y = 0; y < im_h; ++y) {
    977     int x_qn = subpel_x_qn;
    978     for (int x = 0; x < w; ++x, x_qn += x_step_qn) {
    979       const uint16_t *const src_x = &src_horiz[(x_qn >> SCALE_SUBPEL_BITS)];
    980       const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
    981       assert(x_filter_idx < SUBPEL_SHIFTS);
    982       const int16_t *x_filter =
    983           av1_get_interp_filter_subpel_kernel(filter_params_x, x_filter_idx);
    984       int32_t sum = (1 << (bd + FILTER_BITS - 1));
    985       for (int k = 0; k < filter_params_x->taps; ++k) {
    986         sum += x_filter[k] * src_x[k - fo_horiz];
    987       }
    988       assert(0 <= sum && sum < (1 << (bd + FILTER_BITS + 1)));
    989       im_block[y * im_stride + x] =
    990           (int16_t)ROUND_POWER_OF_TWO(sum, conv_params->round_0);
    991     }
    992     src_horiz += src_stride;
    993   }
    994 
    995   // vertical filter
    996   int16_t *src_vert = im_block + fo_vert * im_stride;
    997   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    998   for (int x = 0; x < w; ++x) {
    999     int y_qn = subpel_y_qn;
   1000     for (int y = 0; y < h; ++y, y_qn += y_step_qn) {
   1001       const int16_t *src_y = &src_vert[(y_qn >> SCALE_SUBPEL_BITS) * im_stride];
   1002       const int y_filter_idx = (y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
   1003       assert(y_filter_idx < SUBPEL_SHIFTS);
   1004       const int16_t *y_filter =
   1005           av1_get_interp_filter_subpel_kernel(filter_params_y, y_filter_idx);
   1006       int32_t sum = 1 << offset_bits;
   1007       for (int k = 0; k < filter_params_y->taps; ++k) {
   1008         sum += y_filter[k] * src_y[(k - fo_vert) * im_stride];
   1009       }
   1010       assert(0 <= sum && sum < (1 << (offset_bits + 2)));
   1011       CONV_BUF_TYPE res = ROUND_POWER_OF_TWO(sum, conv_params->round_1);
   1012       if (conv_params->is_compound) {
   1013         if (conv_params->do_average) {
   1014           int32_t tmp = dst16[y * dst16_stride + x];
   1015           if (conv_params->use_dist_wtd_comp_avg) {
   1016             tmp = tmp * conv_params->fwd_offset + res * conv_params->bck_offset;
   1017             tmp = tmp >> DIST_PRECISION_BITS;
   1018           } else {
   1019             tmp += res;
   1020             tmp = tmp >> 1;
   1021           }
   1022           /* Subtract round offset and convolve round */
   1023           tmp = tmp - ((1 << (offset_bits - conv_params->round_1)) +
   1024                        (1 << (offset_bits - conv_params->round_1 - 1)));
   1025           dst[y * dst_stride + x] =
   1026               clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
   1027         } else {
   1028           dst16[y * dst16_stride + x] = res;
   1029         }
   1030       } else {
   1031         /* Subtract round offset and convolve round */
   1032         int32_t tmp = res - ((1 << (offset_bits - conv_params->round_1)) +
   1033                              (1 << (offset_bits - conv_params->round_1 - 1)));
   1034         dst[y * dst_stride + x] =
   1035             clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp, bits), bd);
   1036       }
   1037     }
   1038     src_vert++;
   1039   }
   1040 }
   1041 
   1042 static void highbd_convolve_2d_for_intrabc(const uint16_t *src, int src_stride,
   1043                                            uint16_t *dst, int dst_stride, int w,
   1044                                            int h, int subpel_x_q4,
   1045                                            int subpel_y_q4,
   1046                                            ConvolveParams *conv_params,
   1047                                            int bd) {
   1048   const InterpFilterParams *filter_params_x =
   1049       subpel_x_q4 ? &av1_intrabc_filter_params : NULL;
   1050   const InterpFilterParams *filter_params_y =
   1051       subpel_y_q4 ? &av1_intrabc_filter_params : NULL;
   1052   if (subpel_x_q4 != 0 && subpel_y_q4 != 0) {
   1053     av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
   1054                                 filter_params_x, filter_params_y, 0, 0,
   1055                                 conv_params, bd);
   1056   } else if (subpel_x_q4 != 0) {
   1057     av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h,
   1058                                filter_params_x, filter_params_y, 0, 0,
   1059                                conv_params, bd);
   1060   } else {
   1061     av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
   1062                                filter_params_x, filter_params_y, 0, 0,
   1063                                conv_params, bd);
   1064   }
   1065 }
   1066 
   1067 void av1_highbd_convolve_2d_facade(const uint8_t *src8, int src_stride,
   1068                                    uint8_t *dst8, int dst_stride, int w, int h,
   1069                                    InterpFilters interp_filters,
   1070                                    const int subpel_x_q4, int x_step_q4,
   1071                                    const int subpel_y_q4, int y_step_q4,
   1072                                    int scaled, ConvolveParams *conv_params,
   1073                                    const struct scale_factors *sf,
   1074                                    int is_intrabc, int bd) {
   1075   assert(IMPLIES(is_intrabc, !scaled));
   1076   (void)x_step_q4;
   1077   (void)y_step_q4;
   1078   (void)dst_stride;
   1079   const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   1080 
   1081   if (is_intrabc && (subpel_x_q4 != 0 || subpel_y_q4 != 0)) {
   1082     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   1083     highbd_convolve_2d_for_intrabc(src, src_stride, dst, dst_stride, w, h,
   1084                                    subpel_x_q4, subpel_y_q4, conv_params, bd);
   1085     return;
   1086   }
   1087 
   1088   InterpFilter filter_x = 0;
   1089   InterpFilter filter_y = 0;
   1090   const int need_filter_params_x = (subpel_x_q4 != 0) | scaled;
   1091   const int need_filter_params_y = (subpel_y_q4 != 0) | scaled;
   1092   if (need_filter_params_x)
   1093     filter_x = av1_extract_interp_filter(interp_filters, 1);
   1094   if (need_filter_params_y)
   1095     filter_y = av1_extract_interp_filter(interp_filters, 0);
   1096   const InterpFilterParams *filter_params_x =
   1097       need_filter_params_x
   1098           ? av1_get_interp_filter_params_with_block_size(filter_x, w)
   1099           : NULL;
   1100   const InterpFilterParams *filter_params_y =
   1101       need_filter_params_y
   1102           ? av1_get_interp_filter_params_with_block_size(filter_y, h)
   1103           : NULL;
   1104 
   1105   if (scaled) {
   1106     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   1107     if (conv_params->is_compound) {
   1108       assert(conv_params->dst != NULL);
   1109     }
   1110     av1_highbd_convolve_2d_scale(src, src_stride, dst, dst_stride, w, h,
   1111                                  filter_params_x, filter_params_y, subpel_x_q4,
   1112                                  x_step_q4, subpel_y_q4, y_step_q4, conv_params,
   1113                                  bd);
   1114   } else {
   1115     uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   1116 
   1117     sf->highbd_convolve[subpel_x_q4 != 0][subpel_y_q4 !=
   1118                                           0][conv_params->is_compound](
   1119         src, src_stride, dst, dst_stride, w, h, filter_params_x,
   1120         filter_params_y, subpel_x_q4, subpel_y_q4, conv_params, bd);
   1121   }
   1122 }
   1123 
   1124 // Note: Fixed size intermediate buffers, place limits on parameters
   1125 // of some functions. 2d filtering proceeds in 2 steps:
   1126 //   (1) Interpolate horizontally into an intermediate buffer, temp.
   1127 //   (2) Interpolate temp vertically to derive the sub-pixel result.
   1128 // Deriving the maximum number of rows in the temp buffer (263):
   1129 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
   1130 // --Largest block size is 128x128 pixels.
   1131 // --128 rows in the downscaled frame span a distance of (128 - 1) * 32 in the
   1132 //   original frame (in 1/16th pixel units).
   1133 // --Must round-up because block may be located at sub-pixel position.
   1134 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   1135 // --((128 - 1) * 32 + 15) >> 4 + 8 = 263.
   1136 #define WIENER_MAX_EXT_SIZE 263
   1137 
   1138 static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
   1139   int sum = 0;
   1140   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
   1141   return sum;
   1142 }
   1143 
   1144 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
   1145                                              const int16_t *b) {
   1146   int sum = 0;
   1147   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
   1148   return sum;
   1149 }
   1150 
   1151 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
   1152                                              ptrdiff_t a_stride,
   1153                                              const int16_t *b) {
   1154   int sum = 0;
   1155   for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
   1156   return sum;
   1157 }
   1158 
   1159 static const InterpKernel *get_filter_base(const int16_t *filter) {
   1160   // NOTE: This assumes that the filter table is 256-byte aligned.
   1161   // TODO(agrange) Modify to make independent of table alignment.
   1162   return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
   1163 }
   1164 
   1165 static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
   1166   return (int)((const InterpKernel *)(intptr_t)f - base);
   1167 }
   1168 
   1169 static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
   1170                                        uint16_t *dst, ptrdiff_t dst_stride,
   1171                                        const InterpKernel *x_filters, int x0_q4,
   1172                                        int x_step_q4, int w, int h,
   1173                                        int round0_bits) {
   1174   const int bd = 8;
   1175   src -= SUBPEL_TAPS / 2 - 1;
   1176   for (int y = 0; y < h; ++y) {
   1177     int x_q4 = x0_q4;
   1178     for (int x = 0; x < w; ++x) {
   1179       const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
   1180       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
   1181       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
   1182                            (1 << (bd + FILTER_BITS - 1));
   1183       const int sum = horz_scalar_product(src_x, x_filter) + rounding;
   1184       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
   1185                                WIENER_CLAMP_LIMIT(round0_bits, bd) - 1);
   1186       x_q4 += x_step_q4;
   1187     }
   1188     src += src_stride;
   1189     dst += dst_stride;
   1190   }
   1191 }
   1192 
   1193 static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
   1194                                       uint8_t *dst, ptrdiff_t dst_stride,
   1195                                       const InterpKernel *y_filters, int y0_q4,
   1196                                       int y_step_q4, int w, int h,
   1197                                       int round1_bits) {
   1198   const int bd = 8;
   1199   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
   1200 
   1201   for (int x = 0; x < w; ++x) {
   1202     int y_q4 = y0_q4;
   1203     for (int y = 0; y < h; ++y) {
   1204       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
   1205       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
   1206       const int rounding =
   1207           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
   1208           (1 << (bd + round1_bits - 1));
   1209       const int sum =
   1210           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
   1211       dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, round1_bits));
   1212       y_q4 += y_step_q4;
   1213     }
   1214     ++src;
   1215     ++dst;
   1216   }
   1217 }
   1218 
   1219 void av1_wiener_convolve_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
   1220                                    uint8_t *dst, ptrdiff_t dst_stride,
   1221                                    const int16_t *filter_x, int x_step_q4,
   1222                                    const int16_t *filter_y, int y_step_q4,
   1223                                    int w, int h,
   1224                                    const ConvolveParams *conv_params) {
   1225   const InterpKernel *const filters_x = get_filter_base(filter_x);
   1226   const int x0_q4 = get_filter_offset(filter_x, filters_x);
   1227 
   1228   const InterpKernel *const filters_y = get_filter_base(filter_y);
   1229   const int y0_q4 = get_filter_offset(filter_y, filters_y);
   1230 
   1231   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
   1232   const int intermediate_height =
   1233       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS - 1;
   1234   memset(temp + (intermediate_height * MAX_SB_SIZE), 0, MAX_SB_SIZE);
   1235 
   1236   assert(w <= MAX_SB_SIZE);
   1237   assert(h <= MAX_SB_SIZE);
   1238   assert(y_step_q4 <= 32);
   1239   assert(x_step_q4 <= 32);
   1240 
   1241   convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
   1242                              src_stride, temp, MAX_SB_SIZE, filters_x, x0_q4,
   1243                              x_step_q4, w, intermediate_height,
   1244                              conv_params->round_0);
   1245   convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
   1246                             MAX_SB_SIZE, dst, dst_stride, filters_y, y0_q4,
   1247                             y_step_q4, w, h, conv_params->round_1);
   1248 }
   1249 
   1250 static void highbd_convolve_add_src_horiz_hip(
   1251     const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
   1252     ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
   1253     int x_step_q4, int w, int h, int round0_bits, int bd) {
   1254   const int extraprec_clamp_limit = WIENER_CLAMP_LIMIT(round0_bits, bd);
   1255   uint16_t *src = CONVERT_TO_SHORTPTR(src8);
   1256   src -= SUBPEL_TAPS / 2 - 1;
   1257   for (int y = 0; y < h; ++y) {
   1258     int x_q4 = x0_q4;
   1259     for (int x = 0; x < w; ++x) {
   1260       const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
   1261       const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
   1262       const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
   1263                            (1 << (bd + FILTER_BITS - 1));
   1264       const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
   1265       dst[x] = (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, round0_bits), 0,
   1266                                extraprec_clamp_limit - 1);
   1267       x_q4 += x_step_q4;
   1268     }
   1269     src += src_stride;
   1270     dst += dst_stride;
   1271   }
   1272 }
   1273 
   1274 static void highbd_convolve_add_src_vert_hip(
   1275     const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
   1276     ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
   1277     int y_step_q4, int w, int h, int round1_bits, int bd) {
   1278   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   1279   src -= src_stride * (SUBPEL_TAPS / 2 - 1);
   1280   for (int x = 0; x < w; ++x) {
   1281     int y_q4 = y0_q4;
   1282     for (int y = 0; y < h; ++y) {
   1283       const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
   1284       const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
   1285       const int rounding =
   1286           ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
   1287           (1 << (bd + round1_bits - 1));
   1288       const int sum =
   1289           highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
   1290       dst[y * dst_stride] =
   1291           clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, round1_bits), bd);
   1292       y_q4 += y_step_q4;
   1293     }
   1294     ++src;
   1295     ++dst;
   1296   }
   1297 }
   1298 
   1299 void av1_highbd_wiener_convolve_add_src_c(
   1300     const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
   1301     ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
   1302     const int16_t *filter_y, int y_step_q4, int w, int h,
   1303     const ConvolveParams *conv_params, int bd) {
   1304   const InterpKernel *const filters_x = get_filter_base(filter_x);
   1305   const int x0_q4 = get_filter_offset(filter_x, filters_x);
   1306 
   1307   const InterpKernel *const filters_y = get_filter_base(filter_y);
   1308   const int y0_q4 = get_filter_offset(filter_y, filters_y);
   1309 
   1310   uint16_t temp[WIENER_MAX_EXT_SIZE * MAX_SB_SIZE];
   1311   const int intermediate_height =
   1312       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
   1313 
   1314   assert(w <= MAX_SB_SIZE);
   1315   assert(h <= MAX_SB_SIZE);
   1316   assert(y_step_q4 <= 32);
   1317   assert(x_step_q4 <= 32);
   1318   assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
   1319 
   1320   highbd_convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
   1321                                     src_stride, temp, MAX_SB_SIZE, filters_x,
   1322                                     x0_q4, x_step_q4, w, intermediate_height,
   1323                                     conv_params->round_0, bd);
   1324   highbd_convolve_add_src_vert_hip(
   1325       temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE, dst, dst_stride,
   1326       filters_y, y0_q4, y_step_q4, w, h, conv_params->round_1, bd);
   1327 }
   1328