/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

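// Each 32-byte row below holds one bilinear filter phase: a pair of 8-bit
// taps summing to 16, repeated 16 times so a row can be fed directly to
// _mm256_maddubs_epi16. A row is selected with (offset << 5).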
/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,
  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,  16, 0,
  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,
  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,
  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,
  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,
  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,
  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,
  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
  6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10,
  6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10, 6,  10,
  4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12,
  4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12, 4,  12,
  2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14,
  2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14, 2,  14,
};
/* clang-format on */

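// Computes the sum of differences (*Sum) and the sum of squared differences
// (*SSE) between a 16x16 source block and a 16x16 reference block.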
void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i, src_2strides, ref_2strides;
  __m256i zero_reg = _mm256_set1_epi16(0);
  __m256i sum_ref_src = _mm256_set1_epi16(0);
  __m256i madd_ref_src = _mm256_set1_epi16(0);

  // Process two rows per 256-bit register, halving the number of loop
  // iterations compared to the SSE2 code.
  src_2strides = source_stride << 1;
  ref_2strides = recon_stride << 1;
  for (i = 0; i < 8; i++) {
    src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr)));
    src = _mm256_inserti128_si256(
        src, _mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1);

    ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr)));
    ref = _mm256_inserti128_si256(
        ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1);

    // expand each byte to 16 bits
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // madd low (src - ref)
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

    // add high to low
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

    // madd high (src - ref)
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // add high to low
    madd_ref_src =
        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));

    src_ptr += src_2strides;
    ref_ptr += ref_2strides;
  }

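  // Reduce the per-lane accumulators to the scalar Sum and SSE outputs.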
  {
    __m128i sum_res, madd_res;
    __m128i expand_sum_low, expand_sum_high, expand_sum;
    __m128i expand_madd_low, expand_madd_high, expand_madd;
    __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // extract the low lane and add it to the high lane
    sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src),
                            _mm256_extractf128_si256(sum_ref_src, 1));

    madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src),
                             _mm256_extractf128_si256(madd_ref_src, 1));

    // place each 16-bit sum in the high half of a 32-bit lane
    expand_sum_low =
        _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res);
    expand_sum_high =
        _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res);

    // arithmetic shift right by 16 to sign-extend to 32 bits
    expand_sum_low = _mm_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high);

    // expand each 32 bits of the madd result to 64 bits
    expand_madd_low =
        _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg));
    expand_madd_high =
        _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg));

    expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low =
        _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));
    ex_expand_sum_high =
        _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg));

    ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes
    madd_res = _mm_srli_si128(expand_madd, 8);
    sum_res = _mm_srli_si128(ex_expand_sum, 8);

    madd_res = _mm_add_epi32(madd_res, expand_madd);
    sum_res = _mm_add_epi32(sum_res, ex_expand_sum);

    *((int *)SSE) = _mm_cvtsi128_si32(madd_res);

    *((int *)Sum) = _mm_cvtsi128_si32(sum_res);
  }
}

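// Accumulates the sum of differences (*Sum) and the sum of squared
// differences (*SSE) between 32-pixel-wide source and reference rows; the
// loop below covers 16 rows per call.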
void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *SSE, int *Sum) {
  __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low;
  __m256i ref_expand_high, madd_low, madd_high;
  unsigned int i;
  __m256i zero_reg = _mm256_set1_epi16(0);
  __m256i sum_ref_src = _mm256_set1_epi16(0);
  __m256i madd_ref_src = _mm256_set1_epi16(0);

  // process 32 elements (one full row) per iteration
  for (i = 0; i < 16; i++) {
    src = _mm256_loadu_si256((__m256i const *)(src_ptr));

    ref = _mm256_loadu_si256((__m256i const *)(ref_ptr));

    // expand each byte to 16 bits
    src_expand_low = _mm256_unpacklo_epi8(src, zero_reg);
    src_expand_high = _mm256_unpackhi_epi8(src, zero_reg);

    ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg);
    ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg);

    // src - ref
    src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low);
    src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high);

    // madd low (src - ref)
    madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low);

    // add high to low
    src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high);

    // madd high (src - ref)
    madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high);

    sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low);

    // add high to low
    madd_ref_src =
        _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high));

    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }

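  // Reduce the 256-bit accumulators to the scalar Sum and SSE outputs.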
  {
    __m256i expand_sum_low, expand_sum_high, expand_sum;
    __m256i expand_madd_low, expand_madd_high, expand_madd;
    __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum;

    // place each 16-bit sum in the high half of a 32-bit lane
    expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src);
    expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src);

    // arithmetic shift right by 16 to sign-extend to 32 bits
    expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16);
    expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16);

    expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high);

    // expand each 32 bits of the madd result to 64 bits
    expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg);
    expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg);

    expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high);

    ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg);
    ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg);

    ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high);

    // shift right by 8 bytes
    madd_ref_src = _mm256_srli_si256(expand_madd, 8);
    sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8);

    madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd);
    sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum);

    // extract the low lane and the high lane and add the results
    *((int *)SSE) =
        _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) +
        _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1));

    *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) +
                    _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
  }
}

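// Applies the 2-tap bilinear filter in |filter| to the interleaved pixel
// pairs in exp_src_lo/exp_src_hi with rounding: (a * f0 + b * f1 + 8) >> 4.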
#define FILTER_SRC(filter)                               \
  /* filter the source */                                \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
                                                         \
  /* add 8 to source */                                  \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
                                                         \
  /* divide source by 16 */                              \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

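// Interleaves the bytes of src_reg and reg into 16-bit units, writing the
// low and high halves of each 128-bit lane to exp_src_lo and exp_src_hi.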
#define MERGE_WITH_SRC(src_reg, reg)               \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST                                    \
  /* load source and destination */                     \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average the current source with the source size_stride bytes ahead */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

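// Accumulates the row of differences (src - dst) into the 16-bit lanes of
// sum_reg, and the squared differences (via _mm256_madd_epi16) into the
// 32-bit lanes of sse_reg.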
#define CALC_SUM_SSE_INSIDE_LOOP                          \
  /* expand each byte to 2 bytes */                       \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
  /* source - dest */                                     \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
  /* calculate sum */                                     \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */                                     \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final calculation to sum and sse
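// Sign-extends the 16-bit row sums to 32 bits, folds the two 128-bit lanes
// together, and horizontally adds both accumulators; the SSE is stored
// through *sse and the signed sum is left in the local variable sum.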
#define CALC_SUM_AND_SSE                                                   \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
                                                                           \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
                                                                           \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));

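// Sub-pixel variance core for 32-pixel-wide blocks of the given height.
// x_offset/y_offset select the horizontal/vertical sub-pixel position:
// 0 uses the pixel as-is, 8 averages two neighboring pixels, and any other
// value applies the corresponding bilinear filter. Returns the signed sum
// of differences and writes the sum of squared differences through *sse.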
unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8  and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8  and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8  and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}

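// Same as vpx_sub_pixel_variance32xh_avx2, except that the sub-pixel
// prediction is first averaged with the second prediction in sec before
// the sum and SSE are accumulated.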
unsigned int vpx_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8  and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8  and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8  and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save the current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // interleave the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter vertically
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}
    709