Home | History | Annotate | Download | only in aom_dsp
      1 /*
      2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  */
     11 
     12 #include <assert.h>
     13 #include <math.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/aom_dsp_rtcd.h"
     17 
     18 #include "aom_dsp/aom_dsp_common.h"
     19 #include "aom_dsp/intrapred_common.h"
     20 #include "aom_mem/aom_mem.h"
     21 #include "aom_ports/bitops.h"
     22 
     23 static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
     24                                const uint8_t *above, const uint8_t *left) {
     25   int r;
     26   (void)left;
     27 
     28   for (r = 0; r < bh; r++) {
     29     memcpy(dst, above, bw);
     30     dst += stride;
     31   }
     32 }
     33 
     34 static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
     35                                const uint8_t *above, const uint8_t *left) {
     36   int r;
     37   (void)above;
     38 
     39   for (r = 0; r < bh; r++) {
     40     memset(dst, left[r], bw);
     41     dst += stride;
     42   }
     43 }
     44 
     45 static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
     46 
     47 static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
     48                                               uint16_t top_left) {
     49   const int base = top + left - top_left;
     50   const int p_left = abs_diff(base, left);
     51   const int p_top = abs_diff(base, top);
     52   const int p_top_left = abs_diff(base, top_left);
     53 
     54   // Return nearest to base of left, top and top_left.
     55   return (p_left <= p_top && p_left <= p_top_left)
     56              ? left
     57              : (p_top <= p_top_left) ? top : top_left;
     58 }
     59 
     60 static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
     61                                    int bh, const uint8_t *above,
     62                                    const uint8_t *left) {
     63   int r, c;
     64   const uint8_t ytop_left = above[-1];
     65 
     66   for (r = 0; r < bh; r++) {
     67     for (c = 0; c < bw; c++)
     68       dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
     69     dst += stride;
     70   }
     71 }
     72 
     73 // Some basic checks on weights for smooth predictor.
     74 #define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
     75                                  pred_scale)                          \
     76   assert(weights_w[0] < weights_scale);                               \
     77   assert(weights_h[0] < weights_scale);                               \
     78   assert(weights_scale - weights_w[bw - 1] < weights_scale);          \
     79   assert(weights_scale - weights_h[bh - 1] < weights_scale);          \
     80   assert(pred_scale < 31)  // ensures no overflow when calculating predictor.
     81 
// Rounding right shift: adds 1 << (bits - 1) (half of 2^bits) to `value` and
// then shifts the sum right by `bits`. `bits` must be at least 1, because the
// rounding term shifts by (bits - 1). `bits` is expanded more than once, so it
// should not have side effects.
#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))
     83 
     84 static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
     85                                     int bh, const uint8_t *above,
     86                                     const uint8_t *left) {
     87   const uint8_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
     88   const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
     89   const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
     90   const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
     91   // scale = 2 * 2^sm_weight_log2_scale
     92   const int log2_scale = 1 + sm_weight_log2_scale;
     93   const uint16_t scale = (1 << sm_weight_log2_scale);
     94   sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
     95                            log2_scale + sizeof(*dst));
     96   int r;
     97   for (r = 0; r < bh; ++r) {
     98     int c;
     99     for (c = 0; c < bw; ++c) {
    100       const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred };
    101       const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
    102                                   sm_weights_w[c], scale - sm_weights_w[c] };
    103       uint32_t this_pred = 0;
    104       int i;
    105       assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
    106       for (i = 0; i < 4; ++i) {
    107         this_pred += weights[i] * pixels[i];
    108       }
    109       dst[c] = divide_round(this_pred, log2_scale);
    110     }
    111     dst += stride;
    112   }
    113 }
    114 
    115 static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
    116                                       int bh, const uint8_t *above,
    117                                       const uint8_t *left) {
    118   const uint8_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
    119   const uint8_t *const sm_weights = sm_weight_arrays + bh;
    120   // scale = 2^sm_weight_log2_scale
    121   const int log2_scale = sm_weight_log2_scale;
    122   const uint16_t scale = (1 << sm_weight_log2_scale);
    123   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
    124                            log2_scale + sizeof(*dst));
    125 
    126   int r;
    127   for (r = 0; r < bh; r++) {
    128     int c;
    129     for (c = 0; c < bw; ++c) {
    130       const uint8_t pixels[] = { above[c], below_pred };
    131       const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
    132       uint32_t this_pred = 0;
    133       assert(scale >= sm_weights[r]);
    134       int i;
    135       for (i = 0; i < 2; ++i) {
    136         this_pred += weights[i] * pixels[i];
    137       }
    138       dst[c] = divide_round(this_pred, log2_scale);
    139     }
    140     dst += stride;
    141   }
    142 }
    143 
    144 static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
    145                                       int bh, const uint8_t *above,
    146                                       const uint8_t *left) {
    147   const uint8_t right_pred = above[bw - 1];  // estimated by top-right pixel
    148   const uint8_t *const sm_weights = sm_weight_arrays + bw;
    149   // scale = 2^sm_weight_log2_scale
    150   const int log2_scale = sm_weight_log2_scale;
    151   const uint16_t scale = (1 << sm_weight_log2_scale);
    152   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
    153                            log2_scale + sizeof(*dst));
    154 
    155   int r;
    156   for (r = 0; r < bh; r++) {
    157     int c;
    158     for (c = 0; c < bw; ++c) {
    159       const uint8_t pixels[] = { left[r], right_pred };
    160       const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
    161       uint32_t this_pred = 0;
    162       assert(scale >= sm_weights[c]);
    163       int i;
    164       for (i = 0; i < 2; ++i) {
    165         this_pred += weights[i] * pixels[i];
    166       }
    167       dst[c] = divide_round(this_pred, log2_scale);
    168     }
    169     dst += stride;
    170   }
    171 }
    172 
    173 static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
    174                                     int bh, const uint8_t *above,
    175                                     const uint8_t *left) {
    176   int r;
    177   (void)above;
    178   (void)left;
    179 
    180   for (r = 0; r < bh; r++) {
    181     memset(dst, 128, bw);
    182     dst += stride;
    183   }
    184 }
    185 
    186 static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
    187                                      int bh, const uint8_t *above,
    188                                      const uint8_t *left) {
    189   int i, r, expected_dc, sum = 0;
    190   (void)above;
    191 
    192   for (i = 0; i < bh; i++) sum += left[i];
    193   expected_dc = (sum + (bh >> 1)) / bh;
    194 
    195   for (r = 0; r < bh; r++) {
    196     memset(dst, expected_dc, bw);
    197     dst += stride;
    198   }
    199 }
    200 
    201 static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
    202                                     int bh, const uint8_t *above,
    203                                     const uint8_t *left) {
    204   int i, r, expected_dc, sum = 0;
    205   (void)left;
    206 
    207   for (i = 0; i < bw; i++) sum += above[i];
    208   expected_dc = (sum + (bw >> 1)) / bw;
    209 
    210   for (r = 0; r < bh; r++) {
    211     memset(dst, expected_dc, bw);
    212     dst += stride;
    213   }
    214 }
    215 
    216 static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
    217                                 const uint8_t *above, const uint8_t *left) {
    218   int i, r, expected_dc, sum = 0;
    219   const int count = bw + bh;
    220 
    221   for (i = 0; i < bw; i++) {
    222     sum += above[i];
    223   }
    224   for (i = 0; i < bh; i++) {
    225     sum += left[i];
    226   }
    227 
    228   expected_dc = (sum + (count >> 1)) / count;
    229 
    230   for (r = 0; r < bh; r++) {
    231     memset(dst, expected_dc, bw);
    232     dst += stride;
    233   }
    234 }
    235 
    236 static INLINE int divide_using_multiply_shift(int num, int shift1,
    237                                               int multiplier, int shift2) {
    238   const int interm = num >> shift1;
    239   return interm * multiplier >> shift2;
    240 }
    241 
// Division-free averaging for rectangular blocks.
//
// The constants (multiplier and shifts) for a given block size are obtained
// as follows:
// - Let sum_w_h = block width + block height.
// - Shift 'sum_w_h' right until we reach an odd number. Let the number of
//   shifts for that block size be called 'shift1' (see the parameter in
//   dc_predictor_rect() function), and let the odd number be 'd'. [d has only
//   2 possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect
//   block].
// - Find multipliers for (i) dividing by 3, and (ii) dividing by 5,
//   using the "Algorithm 1" in:
//   http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
//   by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
//   shift will be 16, regardless of the block size.

// Note: For low bitdepth, assembly code may be optimized by using smaller
// constants for smaller block sizes, where the range of the 'sum' is
// restricted to fewer bits.

// 0x5556 == ceil(2^16 / 3): multiplier for d = 3 (1:2 and 2:1 blocks).
#define DC_MULTIPLIER_1X2 0x5556
// 0x3334 == ceil(2^16 / 5): multiplier for d = 5 (1:4 and 4:1 blocks).
#define DC_MULTIPLIER_1X4 0x3334

// Second shift, applied to the product with the multiplier.
#define DC_SHIFT2 16
    264 
    265 static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
    266                                      int bh, const uint8_t *above,
    267                                      const uint8_t *left, int shift1,
    268                                      int multiplier) {
    269   int sum = 0;
    270 
    271   for (int i = 0; i < bw; i++) {
    272     sum += above[i];
    273   }
    274   for (int i = 0; i < bh; i++) {
    275     sum += left[i];
    276   }
    277 
    278   const int expected_dc = divide_using_multiply_shift(
    279       sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
    280   assert(expected_dc < (1 << 8));
    281 
    282   for (int r = 0; r < bh; r++) {
    283     memset(dst, expected_dc, bw);
    284     dst += stride;
    285   }
    286 }
    287 
    288 #undef DC_SHIFT2
    289 
// Public C entry points for DC prediction of rectangular 8-bit blocks. Each
// one forwards to dc_predictor_rect() with the block's width and height, the
// 'shift1' value (number of trailing zero bits of width + height) and the
// multiplier matching the remaining odd factor (3 -> DC_MULTIPLIER_1X2,
// 5 -> DC_MULTIPLIER_1X4).

// width + height = 12 = 3 << 2.
void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);
}

// width + height = 12 = 3 << 2.
void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);
}

// width + height = 20 = 5 << 2.
void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);
}

// width + height = 20 = 5 << 2.
void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);
}

// width + height = 24 = 3 << 3.
void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);
}

// width + height = 24 = 3 << 3.
void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);
}

// width + height = 40 = 5 << 3.
void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);
}

// width + height = 40 = 5 << 3.
void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);
}

// width + height = 48 = 3 << 4.
void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);
}

// width + height = 48 = 3 << 4.
void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);
}

// width + height = 80 = 5 << 4.
void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);
}

// width + height = 80 = 5 << 4.
void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);
}

// width + height = 96 = 3 << 5.
void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);
}

// width + height = 96 = 3 << 5.
void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);
}

// The multipliers are only needed by the wrappers above.
#undef DC_MULTIPLIER_1X2
#undef DC_MULTIPLIER_1X4
    362 
    363 static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
    364                                       int bh, const uint16_t *above,
    365                                       const uint16_t *left, int bd) {
    366   int r;
    367   (void)left;
    368   (void)bd;
    369   for (r = 0; r < bh; r++) {
    370     memcpy(dst, above, bw * sizeof(uint16_t));
    371     dst += stride;
    372   }
    373 }
    374 
    375 static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
    376                                       int bh, const uint16_t *above,
    377                                       const uint16_t *left, int bd) {
    378   int r;
    379   (void)above;
    380   (void)bd;
    381   for (r = 0; r < bh; r++) {
    382     aom_memset16(dst, left[r], bw);
    383     dst += stride;
    384   }
    385 }
    386 
    387 static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
    388                                           int bw, int bh, const uint16_t *above,
    389                                           const uint16_t *left, int bd) {
    390   int r, c;
    391   const uint16_t ytop_left = above[-1];
    392   (void)bd;
    393 
    394   for (r = 0; r < bh; r++) {
    395     for (c = 0; c < bw; c++)
    396       dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
    397     dst += stride;
    398   }
    399 }
    400 
    401 static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
    402                                            int bw, int bh,
    403                                            const uint16_t *above,
    404                                            const uint16_t *left, int bd) {
    405   (void)bd;
    406   const uint16_t below_pred = left[bh - 1];   // estimated by bottom-left pixel
    407   const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
    408   const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
    409   const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
    410   // scale = 2 * 2^sm_weight_log2_scale
    411   const int log2_scale = 1 + sm_weight_log2_scale;
    412   const uint16_t scale = (1 << sm_weight_log2_scale);
    413   sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
    414                            log2_scale + sizeof(*dst));
    415   int r;
    416   for (r = 0; r < bh; ++r) {
    417     int c;
    418     for (c = 0; c < bw; ++c) {
    419       const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
    420       const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
    421                                   sm_weights_w[c], scale - sm_weights_w[c] };
    422       uint32_t this_pred = 0;
    423       int i;
    424       assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
    425       for (i = 0; i < 4; ++i) {
    426         this_pred += weights[i] * pixels[i];
    427       }
    428       dst[c] = divide_round(this_pred, log2_scale);
    429     }
    430     dst += stride;
    431   }
    432 }
    433 
    434 static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
    435                                              int bw, int bh,
    436                                              const uint16_t *above,
    437                                              const uint16_t *left, int bd) {
    438   (void)bd;
    439   const uint16_t below_pred = left[bh - 1];  // estimated by bottom-left pixel
    440   const uint8_t *const sm_weights = sm_weight_arrays + bh;
    441   // scale = 2^sm_weight_log2_scale
    442   const int log2_scale = sm_weight_log2_scale;
    443   const uint16_t scale = (1 << sm_weight_log2_scale);
    444   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
    445                            log2_scale + sizeof(*dst));
    446 
    447   int r;
    448   for (r = 0; r < bh; r++) {
    449     int c;
    450     for (c = 0; c < bw; ++c) {
    451       const uint16_t pixels[] = { above[c], below_pred };
    452       const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
    453       uint32_t this_pred = 0;
    454       assert(scale >= sm_weights[r]);
    455       int i;
    456       for (i = 0; i < 2; ++i) {
    457         this_pred += weights[i] * pixels[i];
    458       }
    459       dst[c] = divide_round(this_pred, log2_scale);
    460     }
    461     dst += stride;
    462   }
    463 }
    464 
    465 static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
    466                                              int bw, int bh,
    467                                              const uint16_t *above,
    468                                              const uint16_t *left, int bd) {
    469   (void)bd;
    470   const uint16_t right_pred = above[bw - 1];  // estimated by top-right pixel
    471   const uint8_t *const sm_weights = sm_weight_arrays + bw;
    472   // scale = 2^sm_weight_log2_scale
    473   const int log2_scale = sm_weight_log2_scale;
    474   const uint16_t scale = (1 << sm_weight_log2_scale);
    475   sm_weights_sanity_checks(sm_weights, sm_weights, scale,
    476                            log2_scale + sizeof(*dst));
    477 
    478   int r;
    479   for (r = 0; r < bh; r++) {
    480     int c;
    481     for (c = 0; c < bw; ++c) {
    482       const uint16_t pixels[] = { left[r], right_pred };
    483       const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
    484       uint32_t this_pred = 0;
    485       assert(scale >= sm_weights[c]);
    486       int i;
    487       for (i = 0; i < 2; ++i) {
    488         this_pred += weights[i] * pixels[i];
    489       }
    490       dst[c] = divide_round(this_pred, log2_scale);
    491     }
    492     dst += stride;
    493   }
    494 }
    495 
    496 static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
    497                                            int bw, int bh,
    498                                            const uint16_t *above,
    499                                            const uint16_t *left, int bd) {
    500   int r;
    501   (void)above;
    502   (void)left;
    503 
    504   for (r = 0; r < bh; r++) {
    505     aom_memset16(dst, 128 << (bd - 8), bw);
    506     dst += stride;
    507   }
    508 }
    509 
    510 static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
    511                                             int bw, int bh,
    512                                             const uint16_t *above,
    513                                             const uint16_t *left, int bd) {
    514   int i, r, expected_dc, sum = 0;
    515   (void)above;
    516   (void)bd;
    517 
    518   for (i = 0; i < bh; i++) sum += left[i];
    519   expected_dc = (sum + (bh >> 1)) / bh;
    520 
    521   for (r = 0; r < bh; r++) {
    522     aom_memset16(dst, expected_dc, bw);
    523     dst += stride;
    524   }
    525 }
    526 
    527 static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
    528                                            int bw, int bh,
    529                                            const uint16_t *above,
    530                                            const uint16_t *left, int bd) {
    531   int i, r, expected_dc, sum = 0;
    532   (void)left;
    533   (void)bd;
    534 
    535   for (i = 0; i < bw; i++) sum += above[i];
    536   expected_dc = (sum + (bw >> 1)) / bw;
    537 
    538   for (r = 0; r < bh; r++) {
    539     aom_memset16(dst, expected_dc, bw);
    540     dst += stride;
    541   }
    542 }
    543 
    544 static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
    545                                        int bh, const uint16_t *above,
    546                                        const uint16_t *left, int bd) {
    547   int i, r, expected_dc, sum = 0;
    548   const int count = bw + bh;
    549   (void)bd;
    550 
    551   for (i = 0; i < bw; i++) {
    552     sum += above[i];
    553   }
    554   for (i = 0; i < bh; i++) {
    555     sum += left[i];
    556   }
    557 
    558   expected_dc = (sum + (count >> 1)) / count;
    559 
    560   for (r = 0; r < bh; r++) {
    561     aom_memset16(dst, expected_dc, bw);
    562     dst += stride;
    563   }
    564 }
    565 
// Obtained similarly as DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but
// assume 2nd shift of 17 bits instead of 16.
// Note: Strictly speaking, 2nd shift needs to be 17 only when:
// - bit depth == 12, and
// - bw + bh is divisible by 5 (as opposed to divisible by 3).
// All other cases can use half the multipliers with a shift of 16 instead.
// This special optimization can be used when writing assembly code.

// 0xAAAB == ceil(2^17 / 3): multiplier for 1:2 and 2:1 blocks.
#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
// 0x6667 == ceil(2^17 / 5): multiplier for 1:4 and 4:1 blocks.
// Note: This constant is odd, but a smaller even constant (0x199a) with the
// appropriate shift should work for neon in 8/10-bit.
#define HIGHBD_DC_MULTIPLIER_1X4 0x6667

// Second shift, applied to the product with the multiplier.
#define HIGHBD_DC_SHIFT2 17
    579 
    580 static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride,
    581                                             int bw, int bh,
    582                                             const uint16_t *above,
    583                                             const uint16_t *left, int bd,
    584                                             int shift1, uint32_t multiplier) {
    585   int sum = 0;
    586   (void)bd;
    587 
    588   for (int i = 0; i < bw; i++) {
    589     sum += above[i];
    590   }
    591   for (int i = 0; i < bh; i++) {
    592     sum += left[i];
    593   }
    594 
    595   const int expected_dc = divide_using_multiply_shift(
    596       sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
    597   assert(expected_dc < (1 << bd));
    598 
    599   for (int r = 0; r < bh; r++) {
    600     aom_memset16(dst, expected_dc, bw);
    601     dst += stride;
    602   }
    603 }
    604 
    605 #undef HIGHBD_DC_SHIFT2
    606 
// Public C entry points for DC prediction of rectangular high bit-depth
// blocks. Each one forwards to highbd_dc_predictor_rect() with the block's
// width and height, the bit depth, the 'shift1' value (number of trailing zero
// bits of width + height) and the multiplier matching the remaining odd factor
// (3 -> HIGHBD_DC_MULTIPLIER_1X2, 5 -> HIGHBD_DC_MULTIPLIER_1X4).

// width + height = 12 = 3 << 2.
void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride,
                                   const uint16_t *above, const uint16_t *left,
                                   int bd) {
  highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

// width + height = 12 = 3 << 2.
void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride,
                                   const uint16_t *above, const uint16_t *left,
                                   int bd) {
  highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

// width + height = 20 = 5 << 2.
void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
                           HIGHBD_DC_MULTIPLIER_1X4);
}

// width + height = 20 = 5 << 2.
void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
                           HIGHBD_DC_MULTIPLIER_1X4);
}

// width + height = 24 = 3 << 3.
void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

// width + height = 24 = 3 << 3.
void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

// width + height = 40 = 5 << 3.
void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
                           HIGHBD_DC_MULTIPLIER_1X4);
}

// width + height = 40 = 5 << 3.
void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *above, const uint16_t *left,
                                    int bd) {
  highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
                           HIGHBD_DC_MULTIPLIER_1X4);
}

// width + height = 48 = 3 << 4.
void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

// width + height = 48 = 3 << 4.
void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

// width + height = 80 = 5 << 4.
void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
                           HIGHBD_DC_MULTIPLIER_1X4);
}

// width + height = 80 = 5 << 4.
void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
                           HIGHBD_DC_MULTIPLIER_1X4);
}

// width + height = 96 = 3 << 5.
void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

// width + height = 96 = 3 << 5.
void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
                           HIGHBD_DC_MULTIPLIER_1X2);
}

// The multipliers are only needed by the wrappers above.
#undef HIGHBD_DC_MULTIPLIER_1X2
#undef HIGHBD_DC_MULTIPLIER_1X4
    707 
// This serves as a wrapper function, so that all the prediction functions
// can be unified and accessed as a pointer array. Note that the boundary
// above and left are not necessarily used all the time.
//
// intra_pred_sized(type, width, height) defines the public function
// aom_<type>_predictor_<width>x<height>_c(), which calls the generic
// <type>_predictor() helper with the block dimensions as constants.
#define intra_pred_sized(type, width, height)                  \
  void aom_##type##_predictor_##width##x##height##_c(          \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *above,    \
      const uint8_t *left) {                                   \
    type##_predictor(dst, stride, width, height, above, left); \
  }

// High bit-depth counterpart: defines
// aom_highbd_<type>_predictor_<width>x<height>_c(), which calls
// highbd_<type>_predictor() and additionally forwards the bit depth `bd`.
#define intra_pred_highbd_sized(type, width, height)                        \
  void aom_highbd_##type##_predictor_##width##x##height##_c(                \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,               \
      const uint16_t *left, int bd) {                                       \
    highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
  }
    724 
/* clang-format off */
// Instantiates the 14 rectangular block sizes (1:2, 2:1, 1:4 and 4:1 aspect
// ratios) for both the 8-bit and the high bit-depth variant of `type`.
#define intra_pred_rectangular(type) \
  intra_pred_sized(type, 4, 8) \
  intra_pred_sized(type, 8, 4) \
  intra_pred_sized(type, 8, 16) \
  intra_pred_sized(type, 16, 8) \
  intra_pred_sized(type, 16, 32) \
  intra_pred_sized(type, 32, 16) \
  intra_pred_sized(type, 32, 64) \
  intra_pred_sized(type, 64, 32) \
  intra_pred_sized(type, 4, 16) \
  intra_pred_sized(type, 16, 4) \
  intra_pred_sized(type, 8, 32) \
  intra_pred_sized(type, 32, 8) \
  intra_pred_sized(type, 16, 64) \
  intra_pred_sized(type, 64, 16) \
  intra_pred_highbd_sized(type, 4, 8) \
  intra_pred_highbd_sized(type, 8, 4) \
  intra_pred_highbd_sized(type, 8, 16) \
  intra_pred_highbd_sized(type, 16, 8) \
  intra_pred_highbd_sized(type, 16, 32) \
  intra_pred_highbd_sized(type, 32, 16) \
  intra_pred_highbd_sized(type, 32, 64) \
  intra_pred_highbd_sized(type, 64, 32) \
  intra_pred_highbd_sized(type, 4, 16) \
  intra_pred_highbd_sized(type, 16, 4) \
  intra_pred_highbd_sized(type, 8, 32) \
  intra_pred_highbd_sized(type, 32, 8) \
  intra_pred_highbd_sized(type, 16, 64) \
  intra_pred_highbd_sized(type, 64, 16)
// Instantiates every size except the 8-bit 4x4: 8-bit squares from 8x8 to
// 64x64, high bit-depth squares from 4x4 to 64x64, and all rectangular sizes.
#define intra_pred_above_4x4(type) \
  intra_pred_sized(type, 8, 8) \
  intra_pred_sized(type, 16, 16) \
  intra_pred_sized(type, 32, 32) \
  intra_pred_sized(type, 64, 64) \
  intra_pred_highbd_sized(type, 4, 4) \
  intra_pred_highbd_sized(type, 8, 8) \
  intra_pred_highbd_sized(type, 16, 16) \
  intra_pred_highbd_sized(type, 32, 32) \
  intra_pred_highbd_sized(type, 64, 64) \
  intra_pred_rectangular(type)
// Instantiates every size: the 8-bit 4x4 plus everything in
// intra_pred_above_4x4().
#define intra_pred_allsizes(type) \
  intra_pred_sized(type, 4, 4) \
  intra_pred_above_4x4(type)
// Instantiates only the square sizes (4x4 to 64x64) for both the 8-bit and
// the high bit-depth variant of `type`.
#define intra_pred_square(type) \
  intra_pred_sized(type, 4, 4) \
  intra_pred_sized(type, 8, 8) \
  intra_pred_sized(type, 16, 16) \
  intra_pred_sized(type, 32, 32) \
  intra_pred_sized(type, 64, 64) \
  intra_pred_highbd_sized(type, 4, 4) \
  intra_pred_highbd_sized(type, 8, 8) \
  intra_pred_highbd_sized(type, 16, 16) \
  intra_pred_highbd_sized(type, 32, 32) \
  intra_pred_highbd_sized(type, 64, 64)

// Public entry points generated from the generic predictors. `dc` is only
// instantiated for square sizes here; its rectangular entry points are the
// hand-written aom_dc_predictor_WxH_c() / aom_highbd_dc_predictor_WxH_c()
// functions that use the multiply-and-shift division.
intra_pred_allsizes(v)
intra_pred_allsizes(h)
intra_pred_allsizes(smooth)
intra_pred_allsizes(smooth_v)
intra_pred_allsizes(smooth_h)
intra_pred_allsizes(paeth)
intra_pred_allsizes(dc_128)
intra_pred_allsizes(dc_left)
intra_pred_allsizes(dc_top)
intra_pred_square(dc)
/* clang-format on */
#undef intra_pred_allsizes
    792 #undef intra_pred_allsizes
    793