Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
#include <assert.h>
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/x86/convolve.h"
     15 
     16 // -----------------------------------------------------------------------------
     17 // Copy and average
     18 
     19 void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
     20                                    uint16_t *dst, ptrdiff_t dst_stride,
     21                                    const InterpKernel *filter, int x0_q4,
     22                                    int x_step_q4, int y0_q4, int y_step_q4,
     23                                    int width, int h, int bd) {
     24   (void)filter;
     25   (void)x0_q4;
     26   (void)x_step_q4;
     27   (void)y0_q4;
     28   (void)y_step_q4;
     29   (void)bd;
     30 
     31   assert(width % 4 == 0);
     32   if (width > 32) {  // width = 64
     33     do {
     34       const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
     35       const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
     36       const __m256i p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
     37       const __m256i p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
     38       src += src_stride;
     39       _mm256_storeu_si256((__m256i *)dst, p0);
     40       _mm256_storeu_si256((__m256i *)(dst + 16), p1);
     41       _mm256_storeu_si256((__m256i *)(dst + 32), p2);
     42       _mm256_storeu_si256((__m256i *)(dst + 48), p3);
     43       dst += dst_stride;
     44       h--;
     45     } while (h > 0);
     46   } else if (width > 16) {  // width = 32
     47     do {
     48       const __m256i p0 = _mm256_loadu_si256((const __m256i *)src);
     49       const __m256i p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
     50       src += src_stride;
     51       _mm256_storeu_si256((__m256i *)dst, p0);
     52       _mm256_storeu_si256((__m256i *)(dst + 16), p1);
     53       dst += dst_stride;
     54       h--;
     55     } while (h > 0);
     56   } else if (width > 8) {  // width = 16
     57     __m256i p0, p1;
     58     do {
     59       p0 = _mm256_loadu_si256((const __m256i *)src);
     60       src += src_stride;
     61       p1 = _mm256_loadu_si256((const __m256i *)src);
     62       src += src_stride;
     63 
     64       _mm256_storeu_si256((__m256i *)dst, p0);
     65       dst += dst_stride;
     66       _mm256_storeu_si256((__m256i *)dst, p1);
     67       dst += dst_stride;
     68       h -= 2;
     69     } while (h > 0);
     70   } else if (width > 4) {  // width = 8
     71     __m128i p0, p1;
     72     do {
     73       p0 = _mm_loadu_si128((const __m128i *)src);
     74       src += src_stride;
     75       p1 = _mm_loadu_si128((const __m128i *)src);
     76       src += src_stride;
     77 
     78       _mm_storeu_si128((__m128i *)dst, p0);
     79       dst += dst_stride;
     80       _mm_storeu_si128((__m128i *)dst, p1);
     81       dst += dst_stride;
     82       h -= 2;
     83     } while (h > 0);
     84   } else {  // width = 4
     85     __m128i p0, p1;
     86     do {
     87       p0 = _mm_loadl_epi64((const __m128i *)src);
     88       src += src_stride;
     89       p1 = _mm_loadl_epi64((const __m128i *)src);
     90       src += src_stride;
     91 
     92       _mm_storel_epi64((__m128i *)dst, p0);
     93       dst += dst_stride;
     94       _mm_storel_epi64((__m128i *)dst, p1);
     95       dst += dst_stride;
     96       h -= 2;
     97     } while (h > 0);
     98   }
     99 }
    100 
    101 void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride,
    102                                   uint16_t *dst, ptrdiff_t dst_stride,
    103                                   const InterpKernel *filter, int x0_q4,
    104                                   int x_step_q4, int y0_q4, int y_step_q4,
    105                                   int width, int h, int bd) {
    106   (void)filter;
    107   (void)x0_q4;
    108   (void)x_step_q4;
    109   (void)y0_q4;
    110   (void)y_step_q4;
    111   (void)bd;
    112 
    113   assert(width % 4 == 0);
    114   if (width > 32) {  // width = 64
    115     __m256i p0, p1, p2, p3, u0, u1, u2, u3;
    116     do {
    117       p0 = _mm256_loadu_si256((const __m256i *)src);
    118       p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
    119       p2 = _mm256_loadu_si256((const __m256i *)(src + 32));
    120       p3 = _mm256_loadu_si256((const __m256i *)(src + 48));
    121       src += src_stride;
    122       u0 = _mm256_loadu_si256((const __m256i *)dst);
    123       u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
    124       u2 = _mm256_loadu_si256((const __m256i *)(dst + 32));
    125       u3 = _mm256_loadu_si256((const __m256i *)(dst + 48));
    126       _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
    127       _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
    128       _mm256_storeu_si256((__m256i *)(dst + 32), _mm256_avg_epu16(p2, u2));
    129       _mm256_storeu_si256((__m256i *)(dst + 48), _mm256_avg_epu16(p3, u3));
    130       dst += dst_stride;
    131       h--;
    132     } while (h > 0);
    133   } else if (width > 16) {  // width = 32
    134     __m256i p0, p1, u0, u1;
    135     do {
    136       p0 = _mm256_loadu_si256((const __m256i *)src);
    137       p1 = _mm256_loadu_si256((const __m256i *)(src + 16));
    138       src += src_stride;
    139       u0 = _mm256_loadu_si256((const __m256i *)dst);
    140       u1 = _mm256_loadu_si256((const __m256i *)(dst + 16));
    141       _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
    142       _mm256_storeu_si256((__m256i *)(dst + 16), _mm256_avg_epu16(p1, u1));
    143       dst += dst_stride;
    144       h--;
    145     } while (h > 0);
    146   } else if (width > 8) {  // width = 16
    147     __m256i p0, p1, u0, u1;
    148     do {
    149       p0 = _mm256_loadu_si256((const __m256i *)src);
    150       p1 = _mm256_loadu_si256((const __m256i *)(src + src_stride));
    151       src += src_stride << 1;
    152       u0 = _mm256_loadu_si256((const __m256i *)dst);
    153       u1 = _mm256_loadu_si256((const __m256i *)(dst + dst_stride));
    154 
    155       _mm256_storeu_si256((__m256i *)dst, _mm256_avg_epu16(p0, u0));
    156       _mm256_storeu_si256((__m256i *)(dst + dst_stride),
    157                           _mm256_avg_epu16(p1, u1));
    158       dst += dst_stride << 1;
    159       h -= 2;
    160     } while (h > 0);
    161   } else if (width > 4) {  // width = 8
    162     __m128i p0, p1, u0, u1;
    163     do {
    164       p0 = _mm_loadu_si128((const __m128i *)src);
    165       p1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
    166       src += src_stride << 1;
    167       u0 = _mm_loadu_si128((const __m128i *)dst);
    168       u1 = _mm_loadu_si128((const __m128i *)(dst + dst_stride));
    169 
    170       _mm_storeu_si128((__m128i *)dst, _mm_avg_epu16(p0, u0));
    171       _mm_storeu_si128((__m128i *)(dst + dst_stride), _mm_avg_epu16(p1, u1));
    172       dst += dst_stride << 1;
    173       h -= 2;
    174     } while (h > 0);
    175   } else {  // width = 4
    176     __m128i p0, p1, u0, u1;
    177     do {
    178       p0 = _mm_loadl_epi64((const __m128i *)src);
    179       p1 = _mm_loadl_epi64((const __m128i *)(src + src_stride));
    180       src += src_stride << 1;
    181       u0 = _mm_loadl_epi64((const __m128i *)dst);
    182       u1 = _mm_loadl_epi64((const __m128i *)(dst + dst_stride));
    183 
    184       _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(u0, p0));
    185       _mm_storel_epi64((__m128i *)(dst + dst_stride), _mm_avg_epu16(u1, p1));
    186       dst += dst_stride << 1;
    187       h -= 2;
    188     } while (h > 0);
    189   }
    190 }
    191 
    192 // -----------------------------------------------------------------------------
    193 // Horizontal and vertical filtering
    194 
// Number of precision bits removed when rounding an 8-tap convolution sum
// back to pixel range.
#define CONV8_ROUNDING_BITS (7)

// The byte-shuffle patterns below operate on 16-bit (2-byte) pixels and are
// identical in both 128-bit lanes.  Expressed as 16-bit word indices they
// select overlapping pixel pairs for _mm256_madd_epi16:
//   signal_pattern_0 -> words 0 1 | 1 2 | 2 3 | 3 4
static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
                                              7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
                                              4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };

//   signal_pattern_1 -> words 2 3 | 3 4 | 4 5 | 5 6
static const uint8_t signal_pattern_1[32] = { 4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13,
                                              4, 5, 6,  7,  6,  7,  8,  9,
                                              8, 9, 10, 11, 10, 11, 12, 13 };

//   signal_pattern_2 -> words 3 4 | 4 5 | 5 6 | 6 7
static const uint8_t signal_pattern_2[32] = { 6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15,
                                              6,  7,  8,  9,  8,  9,  10, 11,
                                              10, 11, 12, 13, 12, 13, 14, 15 };

// Dword permutation for _mm256_permutevar8x32_epi32: replicates 32-bit
// elements 2..5 (16-bit pixels 4..11) into both 128-bit lanes.
static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
    212 
    213 // -----------------------------------------------------------------------------
    214 // Horizontal Filtering
    215 
// Shuffle one row of 16-bit pixels into the four overlapping pixel-pair
// layouts consumed by filter_8x1_pixels().  The trailing comments are the
// original author's shorthand for which pixel offsets each register covers.
static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
  const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
  // Replicate pixels 4..11 into both lanes so the +4/+5 pairs are reachable
  // by the in-lane byte shuffles below.
  const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);

  p[0] = _mm256_shuffle_epi8(*s, sf0);  // x0x6
  p[1] = _mm256_shuffle_epi8(*s, sf1);  // x1x7
  p[2] = _mm256_shuffle_epi8(c, sf0);   // x2x4
  p[3] = _mm256_shuffle_epi8(c, sf1);   // x3x5
}
    227 
// Note:
//  Shared by 8x2 and 16x1 block
// Packs two sources (two rows, or two 8-pixel halves of one row) so that
// each output register carries the s0-derived data in its low 128-bit lane
// and the s1-derived data in its high lane (0x20 = low halves, 0x31 = high
// halves of the intermediate pack_pixels() results).
static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
                                  __m256i *x /*x[8]*/) {
  __m256i pp[8];
  pack_pixels(s0, pp);
  pack_pixels(s1, &pp[4]);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
  // x[4]/x[5] reuse x[2]/x[3]: the same registers serve both 8-result
  // batches in filter_8x1_pixels().
  x[4] = x[2];
  x[5] = x[3];
  x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
  x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
}
    244 
// Pack a single 8-wide row for the 8-tap horizontal filter (odd trailing
// row of an 8xN block).  0x30 keeps the low lane of the first operand and
// zeroes the high lane contribution pattern used by the paired-row path.
static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
  __m256i pp[8];
  __m256i s0;
  s0 = _mm256_loadu_si256((const __m256i *)src);
  pack_pixels(&s0, pp);
  x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
  x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
  x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
  x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
}
    255 
    256 static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
    257                                    __m256i *x) {
    258   __m256i s0, s1;
    259   s0 = _mm256_loadu_si256((const __m256i *)src);
    260   s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
    261   pack_16_pixels(&s0, &s1, x);
    262 }
    263 
    264 static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
    265   __m256i s0, s1;
    266   s0 = _mm256_loadu_si256((const __m256i *)src);
    267   s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
    268   pack_16_pixels(&s0, &s1, x);
    269 }
    270 
// Note:
//  Shared by horizontal and vertical filtering
// Duplicates the eight 16-bit taps into both 128-bit lanes, then broadcasts
// each adjacent tap pair across a whole register:
//   f[0] = taps (0,1), f[1] = taps (2,3), f[2] = taps (4,5), f[3] = taps (6,7)
// Every 32-bit element then lines up with one pixel pair for
// _mm256_madd_epi16.
static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  const __m256i p0 = _mm256_set1_epi32(0x03020100);
  const __m256i p1 = _mm256_set1_epi32(0x07060504);
  const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
  const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
  f[0] = _mm256_shuffle_epi8(hh, p0);
  f[1] = _mm256_shuffle_epi8(hh, p1);
  f[2] = _mm256_shuffle_epi8(hh, p2);
  f[3] = _mm256_shuffle_epi8(hh, p3);
}
    285 
// Apply the four broadcast tap pairs (from pack_filters) to the four packed
// pixel-pair registers and produce eight rounded 32-bit results,
// right-shifted by CONV8_ROUNDING_BITS.
static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
                                     const __m256i *fil /*fil[4]*/,
                                     __m256i *y) {
  __m256i a, a0, a1;

  // Partial sums for the outer tap pairs (0,1) and (6,7).
  a0 = _mm256_madd_epi16(fil[0], sig[0]);
  a1 = _mm256_madd_epi16(fil[3], sig[3]);
  a = _mm256_add_epi32(a0, a1);

  // Partial sums for the inner tap pairs (2,3) and (4,5).
  a0 = _mm256_madd_epi16(fil[1], sig[1]);
  a1 = _mm256_madd_epi16(fil[2], sig[2]);

  // min + max == a0 + a1.  NOTE(review): adding the smaller of the two
  // inner partial sums first presumably keeps the 32-bit intermediate in
  // range for extreme tap/pixel combinations -- confirm before simplifying
  // to a plain a0 + a1.
  {
    const __m256i min = _mm256_min_epi32(a0, a1);
    a = _mm256_add_epi32(a, min);
  }
  {
    const __m256i max = _mm256_max_epi32(a0, a1);
    a = _mm256_add_epi32(a, max);
  }
  // Round to nearest, then drop the filter precision bits.
  {
    const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    a = _mm256_add_epi32(a, rounding);
    *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
  }
}
    312 
    313 static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
    314                                     uint16_t *dst) {
    315   const __m128i a0 = _mm256_castsi256_si128(*y);
    316   const __m128i a1 = _mm256_extractf128_si256(*y, 1);
    317   __m128i res = _mm_packus_epi32(a0, a1);
    318   res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
    319   _mm_storeu_si128((__m128i *)dst, res);
    320 }
    321 
    322 static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
    323                                     const __m256i *mask, uint16_t *dst,
    324                                     ptrdiff_t pitch) {
    325   __m256i a = _mm256_packus_epi32(*y0, *y1);
    326   a = _mm256_min_epi16(a, *mask);
    327   _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
    328   _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
    329 }
    330 
    331 static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
    332                                      const __m256i *mask, uint16_t *dst) {
    333   __m256i a = _mm256_packus_epi32(*y0, *y1);
    334   a = _mm256_min_epi16(a, *mask);
    335   _mm256_storeu_si256((__m256i *)dst, a);
    336 }
    337 
    338 static void vpx_highbd_filter_block1d8_h8_avx2(
    339     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    340     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    341   __m256i signal[8], res0, res1;
    342   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    343 
    344   __m256i ff[4];
    345   pack_filters(filter, ff);
    346 
    347   src_ptr -= 3;
    348   do {
    349     pack_8x2_pixels(src_ptr, src_pitch, signal);
    350     filter_8x1_pixels(signal, ff, &res0);
    351     filter_8x1_pixels(&signal[4], ff, &res1);
    352     store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    353     height -= 2;
    354     src_ptr += src_pitch << 1;
    355     dst_ptr += dst_pitch << 1;
    356   } while (height > 1);
    357 
    358   if (height > 0) {
    359     pack_8x1_pixels(src_ptr, signal);
    360     filter_8x1_pixels(signal, ff, &res0);
    361     store_8x1_pixels(&res0, &max, dst_ptr);
    362   }
    363 }
    364 
    365 static void vpx_highbd_filter_block1d16_h8_avx2(
    366     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    367     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    368   __m256i signal[8], res0, res1;
    369   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    370 
    371   __m256i ff[4];
    372   pack_filters(filter, ff);
    373 
    374   src_ptr -= 3;
    375   do {
    376     pack_16x1_pixels(src_ptr, signal);
    377     filter_8x1_pixels(signal, ff, &res0);
    378     filter_8x1_pixels(&signal[4], ff, &res1);
    379     store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    380     height -= 1;
    381     src_ptr += src_pitch;
    382     dst_ptr += dst_pitch;
    383   } while (height > 0);
    384 }
    385 
    386 // -----------------------------------------------------------------------------
    387 // 2-tap horizontal filtering
    388 
// Broadcast the two center taps (filter[3], filter[4]) as a 16-bit pair into
// every 32-bit element of *f, for use with _mm256_madd_epi16.
static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
  const __m128i h = _mm_loadu_si128((const __m128i *)filter);
  const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
  // 0x09080706 selects bytes 6..9, i.e. 16-bit words 3 and 4.
  const __m256i p = _mm256_set1_epi32(0x09080706);
  f[0] = _mm256_shuffle_epi8(hh, p);
}
    395 
// can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels()
// the difference is s0/s1 specifies first and second rows or,
// first 16 samples and 8-sample shifted 16 samples
static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
                                     __m256i *sig) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  // Pixel pairs starting at word 3 of each lane (signal_pattern_2).
  __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
  __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
  // Replicate pixels 4..11 into both lanes, then take the same pair
  // pattern, yielding pairs starting at word 7.
  __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
  __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  r1 = _mm256_shuffle_epi8(r1, sf2);
  // Combine: low lanes from s0, high lanes from s1.
  sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
  sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
}
    412 
    413 static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
    414                                       const ptrdiff_t pitch, __m256i *sig) {
    415   const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
    416   const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
    417   pack_16_2t_pixels(&r0, &r1, sig);
    418 }
    419 
    420 static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
    421                                        __m256i *sig /*sig[2]*/) {
    422   const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
    423   const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
    424   pack_16_2t_pixels(&r0, &r1, sig);
    425 }
    426 
// 2-tap packing for a single 8-wide row (odd trailing row); only sig[0] is
// produced.
static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
                                      __m256i *sig /*sig[2]*/) {
  const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
  const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
  __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
  // Pixel pairs starting at word 3 (signal_pattern_2).
  __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
  // Replicate pixels 4..11 into both lanes, then take the same pair pattern.
  r0 = _mm256_permutevar8x32_epi32(r0, idx);
  r0 = _mm256_shuffle_epi8(r0, sf2);
  sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
}
    437 
    438 // can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
    439 static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
    440                                        __m256i *y0, __m256i *y1) {
    441   const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    442   __m256i x0 = _mm256_madd_epi16(sig[0], *f);
    443   __m256i x1 = _mm256_madd_epi16(sig[1], *f);
    444   x0 = _mm256_add_epi32(x0, rounding);
    445   x1 = _mm256_add_epi32(x1, rounding);
    446   *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
    447   *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
    448 }
    449 
    450 static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
    451                                         __m256i *y0) {
    452   const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    453   __m256i x0 = _mm256_madd_epi16(sig[0], *f);
    454   x0 = _mm256_add_epi32(x0, rounding);
    455   *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
    456 }
    457 
    458 static void vpx_highbd_filter_block1d8_h2_avx2(
    459     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    460     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    461   __m256i signal[2], res0, res1;
    462   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    463 
    464   __m256i ff;
    465   pack_2t_filter(filter, &ff);
    466 
    467   src_ptr -= 3;
    468   do {
    469     pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    470     filter_16_2t_pixels(signal, &ff, &res0, &res1);
    471     store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    472     height -= 2;
    473     src_ptr += src_pitch << 1;
    474     dst_ptr += dst_pitch << 1;
    475   } while (height > 1);
    476 
    477   if (height > 0) {
    478     pack_8x1_2t_pixels(src_ptr, signal);
    479     filter_8x1_2t_pixels(signal, &ff, &res0);
    480     store_8x1_pixels(&res0, &max, dst_ptr);
    481   }
    482 }
    483 
    484 static void vpx_highbd_filter_block1d16_h2_avx2(
    485     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    486     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    487   __m256i signal[2], res0, res1;
    488   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    489 
    490   __m256i ff;
    491   pack_2t_filter(filter, &ff);
    492 
    493   src_ptr -= 3;
    494   do {
    495     pack_16x1_2t_pixels(src_ptr, signal);
    496     filter_16_2t_pixels(signal, &ff, &res0, &res1);
    497     store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    498     height -= 1;
    499     src_ptr += src_pitch;
    500     dst_ptr += dst_pitch;
    501   } while (height > 0);
    502 }
    503 
    504 // -----------------------------------------------------------------------------
    505 // Vertical Filtering
    506 
// Vertical 8-tap, 8-wide: preload rows 0..6 and interleave consecutive row
// pairs.  After this call:
//   sig[0..2] : unpacklo of row pairs (0,1), (2,3), (4,5) in the low lane,
//               with the shifted pairs (1,2), (3,4), (5,6) in the high lane
//   sig[4..6] : the matching unpackhi halves
//   sig[8]    : raw row 6, carried over for pack_8x9_pixels()
static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
  __m256i s1 =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
  __m256i s2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
  __m256i s3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
  __m256i s4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
  __m256i s5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
  __m256i s6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));

  // Place row i in the low lane and row i+1 in the high lane of s{i}.
  s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
  s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
  s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
  s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
  s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);

  sig[0] = _mm256_unpacklo_epi16(s0, s1);
  sig[4] = _mm256_unpackhi_epi16(s0, s1);
  sig[1] = _mm256_unpacklo_epi16(s2, s3);
  sig[5] = _mm256_unpackhi_epi16(s2, s3);
  sig[2] = _mm256_unpacklo_epi16(s4, s5);
  sig[6] = _mm256_unpackhi_epi16(s4, s5);
  sig[8] = s6;
}
    537 
// Extend the vertical window with rows 7 and 8:
//   sig[3]/sig[7] : interleaved row pair (6,7) in the low lane and (7,8) in
//                   the high lane (unpacklo/unpackhi halves)
//   sig[8]        : raw row 8, carried over to the next iteration
static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                                   __m256i *sig) {
  // base + 7th row
  __m256i s0 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
  // base + 8th row
  __m256i s1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
  // s2 = (row 6, row 7), s3 = (row 7, row 8).
  __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
  __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
  sig[3] = _mm256_unpacklo_epi16(s2, s3);
  sig[7] = _mm256_unpackhi_epi16(s2, s3);
  sig[8] = s1;
}
    552 
    553 static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
    554                                      __m256i *y0, __m256i *y1) {
    555   filter_8x1_pixels(sig, f, y0);
    556   filter_8x1_pixels(&sig[4], f, y1);
    557 }
    558 
    559 static INLINE void update_pixels(__m256i *sig) {
    560   int i;
    561   for (i = 0; i < 3; ++i) {
    562     sig[i] = sig[i + 1];
    563     sig[i + 4] = sig[i + 5];
    564   }
    565 }
    566 
    567 static void vpx_highbd_filter_block1d8_v8_avx2(
    568     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    569     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    570   __m256i signal[9], res0, res1;
    571   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    572 
    573   __m256i ff[4];
    574   pack_filters(filter, ff);
    575 
    576   pack_8x9_init(src_ptr, src_pitch, signal);
    577 
    578   do {
    579     pack_8x9_pixels(src_ptr, src_pitch, signal);
    580 
    581     filter_8x9_pixels(signal, ff, &res0, &res1);
    582     store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    583     update_pixels(signal);
    584 
    585     src_ptr += src_pitch << 1;
    586     dst_ptr += dst_pitch << 1;
    587     height -= 2;
    588   } while (height > 0);
    589 }
    590 
// Vertical 8-tap, 16-wide: preload rows 0..6.  The 16-wide case is handled
// as two 8-wide halves: sig[0..3]/sig[4..7] hold the interleaved row pairs
// for the low 8 pixels (unpacklo/unpackhi), sig[8..11]/sig[12..15] the same
// for the high 8 pixels, and sig[16] carries raw row 6 to
// pack_16x9_pixels().
static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
  __m256i u0, u1, u2, u3;
  // load 0-6 rows
  const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
  const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
  const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
  const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
  const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
  const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
  const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));

  u0 = _mm256_permute2x128_si256(s0, s1, 0x20);  // 0, 1 low
  u1 = _mm256_permute2x128_si256(s0, s1, 0x31);  // 0, 1 high

  u2 = _mm256_permute2x128_si256(s1, s2, 0x20);  // 1, 2 low
  u3 = _mm256_permute2x128_si256(s1, s2, 0x31);  // 1, 2 high

  // Rows (0,1)/(1,2), low 8 pixels ...
  sig[0] = _mm256_unpacklo_epi16(u0, u2);
  sig[4] = _mm256_unpackhi_epi16(u0, u2);

  // ... and high 8 pixels.
  sig[8] = _mm256_unpacklo_epi16(u1, u3);
  sig[12] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
  u1 = _mm256_permute2x128_si256(s2, s3, 0x31);

  u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
  u3 = _mm256_permute2x128_si256(s3, s4, 0x31);

  // Rows (2,3)/(3,4).
  sig[1] = _mm256_unpacklo_epi16(u0, u2);
  sig[5] = _mm256_unpackhi_epi16(u0, u2);

  sig[9] = _mm256_unpacklo_epi16(u1, u3);
  sig[13] = _mm256_unpackhi_epi16(u1, u3);

  u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
  u1 = _mm256_permute2x128_si256(s4, s5, 0x31);

  u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
  u3 = _mm256_permute2x128_si256(s5, s6, 0x31);

  // Rows (4,5)/(5,6).
  sig[2] = _mm256_unpacklo_epi16(u0, u2);
  sig[6] = _mm256_unpackhi_epi16(u0, u2);

  sig[10] = _mm256_unpacklo_epi16(u1, u3);
  sig[14] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s6;
}
    640 
// Extend the 16-wide vertical window with rows 7 and 8:
//   sig[3]/sig[7]   : row pairs (6,7)/(7,8), low 8 pixels
//   sig[11]/sig[15] : row pairs (6,7)/(7,8), high 8 pixels
//   sig[16]         : raw row 8, carried to the next iteration
static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
                             __m256i *sig) {
  // base + 7th row
  const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
  // base + 8th row
  const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));

  __m256i u0, u1, u2, u3;
  u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
  u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);

  u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
  u3 = _mm256_permute2x128_si256(s7, s8, 0x31);

  sig[3] = _mm256_unpacklo_epi16(u0, u2);
  sig[7] = _mm256_unpackhi_epi16(u0, u2);

  sig[11] = _mm256_unpacklo_epi16(u1, u3);
  sig[15] = _mm256_unpackhi_epi16(u1, u3);

  sig[16] = s8;
}
    663 
// Filter two 16-wide output rows.  The four register groups sig[0..3],
// sig[4..7], sig[8..11], sig[12..15] each yield eight 32-bit results; the
// packus/permute pair below narrows them to 16 bits and re-interleaves the
// four batches into whole rows (0x20 = row 0, 0x31 = row 1).  Note the
// outputs are already 16-bit packed, unlike the horizontal path.
static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
                                      __m256i *y0, __m256i *y1) {
  __m256i res[4];
  int i;
  for (i = 0; i < 4; ++i) {
    filter_8x1_pixels(&sig[i << 2], f, &res[i]);
  }

  {
    const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
    const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
    *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
    *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
  }
}
    679 
    680 static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
    681                                      const __m256i *mask, uint16_t *dst,
    682                                      ptrdiff_t pitch) {
    683   __m256i p = _mm256_min_epi16(*y0, *mask);
    684   _mm256_storeu_si256((__m256i *)dst, p);
    685   p = _mm256_min_epi16(*y1, *mask);
    686   _mm256_storeu_si256((__m256i *)(dst + pitch), p);
    687 }
    688 
    689 static void update_16x9_pixels(__m256i *sig) {
    690   update_pixels(&sig[0]);
    691   update_pixels(&sig[8]);
    692 }
    693 
    694 static void vpx_highbd_filter_block1d16_v8_avx2(
    695     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    696     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    697   __m256i signal[17], res0, res1;
    698   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    699 
    700   __m256i ff[4];
    701   pack_filters(filter, ff);
    702 
    703   pack_16x9_init(src_ptr, src_pitch, signal);
    704 
    705   do {
    706     pack_16x9_pixels(src_ptr, src_pitch, signal);
    707     filter_16x9_pixels(signal, ff, &res0, &res1);
    708     store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    709     update_16x9_pixels(signal);
    710 
    711     src_ptr += src_pitch << 1;
    712     dst_ptr += dst_pitch << 1;
    713     height -= 2;
    714   } while (height > 0);
    715 }
    716 
    717 // -----------------------------------------------------------------------------
    718 // 2-tap vertical filtering
    719 
    720 static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
    721   sig[2] = _mm256_loadu_si256((const __m256i *)src);
    722 }
    723 
    724 static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
    725                                        __m256i *sig) {
    726   // load the next row
    727   const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
    728   sig[0] = _mm256_unpacklo_epi16(sig[2], u);
    729   sig[1] = _mm256_unpackhi_epi16(sig[2], u);
    730   sig[2] = u;
    731 }
    732 
    733 static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
    734                                          __m256i *y0, __m256i *y1) {
    735   filter_16_2t_pixels(sig, f, y0, y1);
    736 }
    737 
    738 static void vpx_highbd_filter_block1d16_v2_avx2(
    739     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    740     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    741   __m256i signal[3], res0, res1;
    742   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    743   __m256i ff;
    744 
    745   pack_2t_filter(filter, &ff);
    746   pack_16x2_init(src_ptr, signal);
    747 
    748   do {
    749     pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
    750     filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
    751     store_16x1_pixels(&res0, &res1, &max, dst_ptr);
    752 
    753     src_ptr += src_pitch;
    754     dst_ptr += dst_pitch;
    755     height -= 1;
    756   } while (height > 0);
    757 }
    758 
    759 static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
    760   const __m128i h = _mm_loadu_si128((const __m128i *)filter);
    761   const __m128i p = _mm_set1_epi32(0x09080706);
    762   f[0] = _mm_shuffle_epi8(h, p);
    763 }
    764 
    765 static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
    766   sig[2] = _mm_loadu_si128((const __m128i *)src);
    767 }
    768 
    769 static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
    770                                           __m128i *sig) {
    771   // load the next row
    772   const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
    773   sig[0] = _mm_unpacklo_epi16(sig[2], u);
    774   sig[1] = _mm_unpackhi_epi16(sig[2], u);
    775   sig[2] = u;
    776 }
    777 
    778 static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
    779                                       __m128i *y0, __m128i *y1) {
    780   const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
    781   __m128i x0 = _mm_madd_epi16(sig[0], *f);
    782   __m128i x1 = _mm_madd_epi16(sig[1], *f);
    783   x0 = _mm_add_epi32(x0, rounding);
    784   x1 = _mm_add_epi32(x1, rounding);
    785   *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
    786   *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
    787 }
    788 
    789 static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
    790                                            const __m128i *mask, uint16_t *dst) {
    791   __m128i res = _mm_packus_epi32(*y0, *y1);
    792   res = _mm_min_epi16(res, *mask);
    793   _mm_storeu_si128((__m128i *)dst, res);
    794 }
    795 
    796 static void vpx_highbd_filter_block1d8_v2_avx2(
    797     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    798     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    799   __m128i signal[3], res0, res1;
    800   const __m128i max = _mm_set1_epi16((1 << bd) - 1);
    801   __m128i ff;
    802 
    803   pack_8x1_2t_filter(filter, &ff);
    804   pack_8x2_init(src_ptr, signal);
    805 
    806   do {
    807     pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
    808     filter_8_2t_pixels(signal, &ff, &res0, &res1);
    809     store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
    810 
    811     src_ptr += src_pitch;
    812     dst_ptr += dst_pitch;
    813     height -= 1;
    814   } while (height > 0);
    815 }
    816 
    817 // Calculation with averaging the input pixels
    818 
    819 static INLINE void store_8x1_avg_pixels(const __m256i *y0, const __m256i *mask,
    820                                         uint16_t *dst) {
    821   const __m128i a0 = _mm256_castsi256_si128(*y0);
    822   const __m128i a1 = _mm256_extractf128_si256(*y0, 1);
    823   __m128i res = _mm_packus_epi32(a0, a1);
    824   const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
    825   res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
    826   res = _mm_avg_epu16(res, pix);
    827   _mm_storeu_si128((__m128i *)dst, res);
    828 }
    829 
    830 static INLINE void store_8x2_avg_pixels(const __m256i *y0, const __m256i *y1,
    831                                         const __m256i *mask, uint16_t *dst,
    832                                         ptrdiff_t pitch) {
    833   __m256i a = _mm256_packus_epi32(*y0, *y1);
    834   const __m128i pix0 = _mm_loadu_si128((const __m128i *)dst);
    835   const __m128i pix1 = _mm_loadu_si128((const __m128i *)(dst + pitch));
    836   const __m256i pix =
    837       _mm256_insertf128_si256(_mm256_castsi128_si256(pix0), pix1, 1);
    838   a = _mm256_min_epi16(a, *mask);
    839   a = _mm256_avg_epu16(a, pix);
    840   _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
    841   _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
    842 }
    843 
    844 static INLINE void store_16x1_avg_pixels(const __m256i *y0, const __m256i *y1,
    845                                          const __m256i *mask, uint16_t *dst) {
    846   __m256i a = _mm256_packus_epi32(*y0, *y1);
    847   const __m256i pix = _mm256_loadu_si256((const __m256i *)dst);
    848   a = _mm256_min_epi16(a, *mask);
    849   a = _mm256_avg_epu16(a, pix);
    850   _mm256_storeu_si256((__m256i *)dst, a);
    851 }
    852 
    853 static INLINE void store_16x2_avg_pixels(const __m256i *y0, const __m256i *y1,
    854                                          const __m256i *mask, uint16_t *dst,
    855                                          ptrdiff_t pitch) {
    856   const __m256i pix0 = _mm256_loadu_si256((const __m256i *)dst);
    857   const __m256i pix1 = _mm256_loadu_si256((const __m256i *)(dst + pitch));
    858   __m256i p = _mm256_min_epi16(*y0, *mask);
    859   p = _mm256_avg_epu16(p, pix0);
    860   _mm256_storeu_si256((__m256i *)dst, p);
    861 
    862   p = _mm256_min_epi16(*y1, *mask);
    863   p = _mm256_avg_epu16(p, pix1);
    864   _mm256_storeu_si256((__m256i *)(dst + pitch), p);
    865 }
    866 
    867 static INLINE void store_8x1_2t_avg_pixels_ver(const __m128i *y0,
    868                                                const __m128i *y1,
    869                                                const __m128i *mask,
    870                                                uint16_t *dst) {
    871   __m128i res = _mm_packus_epi32(*y0, *y1);
    872   const __m128i pix = _mm_loadu_si128((const __m128i *)dst);
    873   res = _mm_min_epi16(res, *mask);
    874   res = _mm_avg_epu16(res, pix);
    875   _mm_storeu_si128((__m128i *)dst, res);
    876 }
    877 
    878 static void vpx_highbd_filter_block1d8_h8_avg_avx2(
    879     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    880     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    881   __m256i signal[8], res0, res1;
    882   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    883 
    884   __m256i ff[4];
    885   pack_filters(filter, ff);
    886 
    887   src_ptr -= 3;
    888   do {
    889     pack_8x2_pixels(src_ptr, src_pitch, signal);
    890     filter_8x1_pixels(signal, ff, &res0);
    891     filter_8x1_pixels(&signal[4], ff, &res1);
    892     store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    893     height -= 2;
    894     src_ptr += src_pitch << 1;
    895     dst_ptr += dst_pitch << 1;
    896   } while (height > 1);
    897 
    898   if (height > 0) {
    899     pack_8x1_pixels(src_ptr, signal);
    900     filter_8x1_pixels(signal, ff, &res0);
    901     store_8x1_avg_pixels(&res0, &max, dst_ptr);
    902   }
    903 }
    904 
    905 static void vpx_highbd_filter_block1d16_h8_avg_avx2(
    906     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    907     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    908   __m256i signal[8], res0, res1;
    909   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    910 
    911   __m256i ff[4];
    912   pack_filters(filter, ff);
    913 
    914   src_ptr -= 3;
    915   do {
    916     pack_16x1_pixels(src_ptr, signal);
    917     filter_8x1_pixels(signal, ff, &res0);
    918     filter_8x1_pixels(&signal[4], ff, &res1);
    919     store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
    920     height -= 1;
    921     src_ptr += src_pitch;
    922     dst_ptr += dst_pitch;
    923   } while (height > 0);
    924 }
    925 
    926 static void vpx_highbd_filter_block1d8_v8_avg_avx2(
    927     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    928     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    929   __m256i signal[9], res0, res1;
    930   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    931 
    932   __m256i ff[4];
    933   pack_filters(filter, ff);
    934 
    935   pack_8x9_init(src_ptr, src_pitch, signal);
    936 
    937   do {
    938     pack_8x9_pixels(src_ptr, src_pitch, signal);
    939 
    940     filter_8x9_pixels(signal, ff, &res0, &res1);
    941     store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    942     update_pixels(signal);
    943 
    944     src_ptr += src_pitch << 1;
    945     dst_ptr += dst_pitch << 1;
    946     height -= 2;
    947   } while (height > 0);
    948 }
    949 
    950 static void vpx_highbd_filter_block1d16_v8_avg_avx2(
    951     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    952     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    953   __m256i signal[17], res0, res1;
    954   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    955 
    956   __m256i ff[4];
    957   pack_filters(filter, ff);
    958 
    959   pack_16x9_init(src_ptr, src_pitch, signal);
    960 
    961   do {
    962     pack_16x9_pixels(src_ptr, src_pitch, signal);
    963     filter_16x9_pixels(signal, ff, &res0, &res1);
    964     store_16x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    965     update_16x9_pixels(signal);
    966 
    967     src_ptr += src_pitch << 1;
    968     dst_ptr += dst_pitch << 1;
    969     height -= 2;
    970   } while (height > 0);
    971 }
    972 
    973 static void vpx_highbd_filter_block1d8_h2_avg_avx2(
    974     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
    975     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
    976   __m256i signal[2], res0, res1;
    977   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
    978 
    979   __m256i ff;
    980   pack_2t_filter(filter, &ff);
    981 
    982   src_ptr -= 3;
    983   do {
    984     pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
    985     filter_16_2t_pixels(signal, &ff, &res0, &res1);
    986     store_8x2_avg_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
    987     height -= 2;
    988     src_ptr += src_pitch << 1;
    989     dst_ptr += dst_pitch << 1;
    990   } while (height > 1);
    991 
    992   if (height > 0) {
    993     pack_8x1_2t_pixels(src_ptr, signal);
    994     filter_8x1_2t_pixels(signal, &ff, &res0);
    995     store_8x1_avg_pixels(&res0, &max, dst_ptr);
    996   }
    997 }
    998 
    999 static void vpx_highbd_filter_block1d16_h2_avg_avx2(
   1000     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
   1001     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   1002   __m256i signal[2], res0, res1;
   1003   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
   1004 
   1005   __m256i ff;
   1006   pack_2t_filter(filter, &ff);
   1007 
   1008   src_ptr -= 3;
   1009   do {
   1010     pack_16x1_2t_pixels(src_ptr, signal);
   1011     filter_16_2t_pixels(signal, &ff, &res0, &res1);
   1012     store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
   1013     height -= 1;
   1014     src_ptr += src_pitch;
   1015     dst_ptr += dst_pitch;
   1016   } while (height > 0);
   1017 }
   1018 
   1019 static void vpx_highbd_filter_block1d16_v2_avg_avx2(
   1020     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
   1021     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   1022   __m256i signal[3], res0, res1;
   1023   const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
   1024   __m256i ff;
   1025 
   1026   pack_2t_filter(filter, &ff);
   1027   pack_16x2_init(src_ptr, signal);
   1028 
   1029   do {
   1030     pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
   1031     filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
   1032     store_16x1_avg_pixels(&res0, &res1, &max, dst_ptr);
   1033 
   1034     src_ptr += src_pitch;
   1035     dst_ptr += dst_pitch;
   1036     height -= 1;
   1037   } while (height > 0);
   1038 }
   1039 
   1040 static void vpx_highbd_filter_block1d8_v2_avg_avx2(
   1041     const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
   1042     ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
   1043   __m128i signal[3], res0, res1;
   1044   const __m128i max = _mm_set1_epi16((1 << bd) - 1);
   1045   __m128i ff;
   1046 
   1047   pack_8x1_2t_filter(filter, &ff);
   1048   pack_8x2_init(src_ptr, signal);
   1049 
   1050   do {
   1051     pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
   1052     filter_8_2t_pixels(signal, &ff, &res0, &res1);
   1053     store_8x1_2t_avg_pixels_ver(&res0, &res1, &max, dst_ptr);
   1054 
   1055     src_ptr += src_pitch;
   1056     dst_ptr += dst_pitch;
   1057     height -= 1;
   1058   } while (height > 0);
   1059 }
   1060 
   1061 void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
   1062                                         ptrdiff_t, uint32_t, const int16_t *,
   1063                                         int);
   1064 void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
   1065                                         ptrdiff_t, uint32_t, const int16_t *,
   1066                                         int);
   1067 void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
   1068                                         ptrdiff_t, uint32_t, const int16_t *,
   1069                                         int);
   1070 void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
   1071                                         ptrdiff_t, uint32_t, const int16_t *,
   1072                                         int);
   1073 #define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2
   1074 #define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2
   1075 #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2
   1076 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2
   1077 
   1078 HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2);
   1079 HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2);
   1080 HIGH_FUN_CONV_2D(, avx2);
   1081 
   1082 void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t,
   1083                                             uint16_t *, ptrdiff_t, uint32_t,
   1084                                             const int16_t *, int);
   1085 void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t,
   1086                                             uint16_t *, ptrdiff_t, uint32_t,
   1087                                             const int16_t *, int);
   1088 void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t,
   1089                                             uint16_t *, ptrdiff_t, uint32_t,
   1090                                             const int16_t *, int);
   1091 void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t,
   1092                                             uint16_t *, ptrdiff_t, uint32_t,
   1093                                             const int16_t *, int);
   1094 #define vpx_highbd_filter_block1d4_h8_avg_avx2 \
   1095   vpx_highbd_filter_block1d4_h8_avg_sse2
   1096 #define vpx_highbd_filter_block1d4_h2_avg_avx2 \
   1097   vpx_highbd_filter_block1d4_h2_avg_sse2
   1098 #define vpx_highbd_filter_block1d4_v8_avg_avx2 \
   1099   vpx_highbd_filter_block1d4_v8_avg_sse2
   1100 #define vpx_highbd_filter_block1d4_v2_avg_avx2 \
   1101   vpx_highbd_filter_block1d4_v2_avg_sse2
   1102 
   1103 HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2);
   1104 HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_,
   1105                  avx2);
   1106 HIGH_FUN_CONV_2D(avg_, avx2);
   1107 
   1108 #undef HIGHBD_FUNC
   1109