/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  // AVX2
#include "vpx_ports/mem.h"
#include "vp9/encoder/vp9_variance.h"

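// Each of the 16 rows below holds one bilinear filter, stored as 16
// interleaved (16 - offset, offset) byte pairs so a row can be fed directly
// to _mm256_maddubs_epi16.  A row is 32 bytes, which is why the offsets are
// scaled by << 5 before indexing the table.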
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
  1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
};

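// FILTER_SRC applies the bilinear filter selected above and rounds back to
// 8-bit precision: out = (a * (16 - offset) + b * offset + 8) >> 4.
// For example, with offset = 5 and pixels a = 100, b = 60:
// (100 * 11 + 60 * 5 + 8) >> 4 = 1408 >> 4 = 88.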
#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

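// LOAD_SRC_DST reads one 32-pixel row from each buffer.  The source pointer
// is not assumed to be aligned, while the destination is read with an
// aligned load and must therefore be 32-byte aligned.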
#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
  dst_reg = _mm256_load_si256((__m256i const *) (dst));

#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                   (src + size_stride)); \
  /* average the current source with the one size_stride bytes ahead */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                                   (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

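// CALC_SUM_SSE_INSIDE_LOOP subtracts the destination row from the (already
// expanded) source row, then accumulates the 16-bit differences into sum_reg
// and their squares, via _mm256_madd_epi16, into the 32-bit lanes of sse_reg.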
#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  /* calculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

// final calculation to sum and sse
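// The cmpgt mask supplies the sign bits needed to widen the 16-bit row sums
// to 32 bits; both accumulators are then reduced horizontally within and
// across the two 128-bit lanes, writing the results to *sse and sum.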
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int*)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                 _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));

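// Computes the sum of differences and the sum of squared differences between
// a 32-pixel-wide, `height`-row block of `src` (interpolated according to
// x_offset/y_offset) and `dst`.  An offset of 0 means no filtering in that
// direction, 8 means a simple average of two adjacent pixels, and any other
// value selects the (16 - offset, offset) bilinear filter above.  The SSE is
// written to *sse and the signed sum is returned; callers typically combine
// the two into a variance as sse - sum * sum / (32 * height).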
unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8  and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8  and y_offset = 8
    } else if (y_offset == 8) {
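      // Half-pel in both directions: each output pixel is the average of the
      // current row's horizontal pair average and the next row's, effectively
      // a 2x2 average (computed with two rounded pavg steps).  The previous
      // row's horizontal average is carried across iterations in src_avg.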
      __m256i src_next_reg, src_avg;
      // load the source and the source starting at the next byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous horizontal average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8  and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source starting at the next byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
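      // Full bilinear case: every row is filtered horizontally once and its
      // packed result is kept in src_pack, so the vertical filter can combine
      // the previous row with the current one without refiltering it.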
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source starting at the next byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source vertically
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}

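// Same as vp9_sub_pixel_variance32xh_avx2, but before computing the sum and
// SSE each interpolated source row is averaged with the corresponding row of
// `sec`, typically the second (compound) prediction.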
unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
                                             int src_stride,
                                             int x_offset,
                                             int y_offset,
                                             const uint8_t *dst,
                                             int dst_stride,
                                             const uint8_t *sec,
                                             int sec_stride,
                                             int height,
                                             unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
  // x_offset = 8  and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = 8  and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and the source starting at the next byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous horizontal average with the current one
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = 8  and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source starting at the next byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256((__m256i const *)
               (bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256((__m256i const *)
                (bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and the source starting at the next byte
      src_reg = _mm256_loadu_si256((__m256i const *) (src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits within each lane
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source vertically
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_load_si256((__m256i const *) (sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  return sum;
}