      1 /*
      2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
      3  *
      4  * This source code is subject to the terms of the BSD 2 Clause License and
      5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6  * was not distributed with this source code in the LICENSE file, you can
      7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8  * Media Patent License 1.0 was not distributed with this source code in the
      9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10  */
     11 
     12 #include <immintrin.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 #include "aom_dsp/x86/lpf_common_sse2.h"
     16 
     17 static INLINE __m256i dc_sum_64(const uint8_t *ref) {
     18   const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
     19   const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
     20   const __m256i zero = _mm256_setzero_si256();
     21   __m256i y0 = _mm256_sad_epu8(x0, zero);
     22   __m256i y1 = _mm256_sad_epu8(x1, zero);
     23   y0 = _mm256_add_epi64(y0, y1);
     24   __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
     25   y0 = _mm256_add_epi64(u0, y0);
     26   u0 = _mm256_unpackhi_epi64(y0, y0);
     27   return _mm256_add_epi16(y0, u0);
     28 }
     29 
     30 static INLINE __m256i dc_sum_32(const uint8_t *ref) {
     31   const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
     32   const __m256i zero = _mm256_setzero_si256();
     33   __m256i y = _mm256_sad_epu8(x, zero);
     34   __m256i u = _mm256_permute2x128_si256(y, y, 1);
     35   y = _mm256_add_epi64(u, y);
     36   u = _mm256_unpackhi_epi64(y, y);
     37   return _mm256_add_epi16(y, u);
     38 }
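
// A scalar sketch of what dc_sum_32() computes, kept only as documentation of
// the SAD-against-zero trick: _mm256_sad_epu8 with a zero operand yields four
// 64-bit partial byte sums, and the permute/unpack folds above add them into
// a single total. Illustrative only; not used by the SIMD paths below.
static INLINE int dc_sum_32_scalar_sketch(const uint8_t *ref) {
  int sum = 0;
  for (int i = 0; i < 32; ++i) sum += ref[i];
  return sum;
}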
     39 
     40 static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
     41                                   ptrdiff_t stride) {
     42   for (int i = 0; i < height; ++i) {
     43     _mm256_storeu_si256((__m256i *)dst, *r);
     44     dst += stride;
     45   }
     46 }
     47 
     48 static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
     49                                     int height, uint8_t *dst,
     50                                     ptrdiff_t stride) {
     51   for (int i = 0; i < height; ++i) {
     52     _mm256_storeu_si256((__m256i *)dst, *r0);
     53     _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
     54     dst += stride;
     55   }
     56 }
     57 
     58 static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
     59                                   ptrdiff_t stride) {
     60   for (int i = 0; i < height; ++i) {
     61     _mm256_storeu_si256((__m256i *)dst, *r);
     62     _mm256_storeu_si256((__m256i *)(dst + 32), *r);
     63     dst += stride;
     64   }
     65 }
     66 
     67 static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
     68   __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
     69 
     70   r0 = _mm_unpacklo_epi16(x[0], x[1]);
     71   r1 = _mm_unpacklo_epi16(x[2], x[3]);
     72   r2 = _mm_unpacklo_epi16(x[4], x[5]);
     73   r3 = _mm_unpacklo_epi16(x[6], x[7]);
     74 
     75   r4 = _mm_unpacklo_epi16(x[8], x[9]);
     76   r5 = _mm_unpacklo_epi16(x[10], x[11]);
     77   r6 = _mm_unpacklo_epi16(x[12], x[13]);
     78   r7 = _mm_unpacklo_epi16(x[14], x[15]);
     79 
     80   r8 = _mm_unpacklo_epi32(r0, r1);
     81   r9 = _mm_unpackhi_epi32(r0, r1);
     82   r10 = _mm_unpacklo_epi32(r2, r3);
     83   r11 = _mm_unpackhi_epi32(r2, r3);
     84 
     85   r12 = _mm_unpacklo_epi32(r4, r5);
     86   r13 = _mm_unpackhi_epi32(r4, r5);
     87   r14 = _mm_unpacklo_epi32(r6, r7);
     88   r15 = _mm_unpackhi_epi32(r6, r7);
     89 
     90   r0 = _mm_unpacklo_epi64(r8, r9);
     91   r1 = _mm_unpackhi_epi64(r8, r9);
     92   r2 = _mm_unpacklo_epi64(r10, r11);
     93   r3 = _mm_unpackhi_epi64(r10, r11);
     94 
     95   r4 = _mm_unpacklo_epi64(r12, r13);
     96   r5 = _mm_unpackhi_epi64(r12, r13);
     97   r6 = _mm_unpacklo_epi64(r14, r15);
     98   r7 = _mm_unpackhi_epi64(r14, r15);
     99 
    100   d[0] = _mm_unpacklo_epi64(r0, r2);
    101   d[1] = _mm_unpacklo_epi64(r4, r6);
    102   d[2] = _mm_unpacklo_epi64(r1, r3);
    103   d[3] = _mm_unpacklo_epi64(r5, r7);
    104 
    105   d[4] = _mm_unpackhi_epi64(r0, r2);
    106   d[5] = _mm_unpackhi_epi64(r4, r6);
    107   d[6] = _mm_unpackhi_epi64(r1, r3);
    108   d[7] = _mm_unpackhi_epi64(r5, r7);
    109 }
    110 
    111 static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
    112   __m256i w0, w1, w2, w3, ww0, ww1;
    113 
    114   w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
    115   w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
    116   w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
    117   w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73
    118 
    119   ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
    120   ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
    121 
    122   d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
    123   d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
    124 
    125   ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
    126   ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
    127 
    128   d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
    129   d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
    130 }
    131 
    132 static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
    133   __m256i w0, w1, w2, w3, ww0, ww1;
    134 
    135   w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
    136   w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
    137   w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
    138   w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73
    139 
    140   ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
    141   ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
    142 
    143   d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
    144   d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
    145 
    146   ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
    147   ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
    148 
    149   d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
    150   d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
    151 
    152   w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
    153   w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
    154   w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
    155   w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77
    156 
    157   ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
    158   ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
    159 
    160   d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
    161   d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
    162 
    163   ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
    164   ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
    165 
    166   d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
    167   d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
    168 }
    169 
    170 static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
    171   __m256i w0, w1, w2, w3, ww0, ww1;
    172   __m256i dd[16];
    173   w0 = _mm256_unpacklo_epi16(x[0], x[1]);
    174   w1 = _mm256_unpacklo_epi16(x[2], x[3]);
    175   w2 = _mm256_unpacklo_epi16(x[4], x[5]);
    176   w3 = _mm256_unpacklo_epi16(x[6], x[7]);
    177 
    178   ww0 = _mm256_unpacklo_epi32(w0, w1);  //
    179   ww1 = _mm256_unpacklo_epi32(w2, w3);  //
    180 
    181   dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
    182   dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
    183 
    184   ww0 = _mm256_unpackhi_epi32(w0, w1);  //
    185   ww1 = _mm256_unpackhi_epi32(w2, w3);  //
    186 
    187   dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
    188   dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
    189 
    190   w0 = _mm256_unpackhi_epi16(x[0], x[1]);
    191   w1 = _mm256_unpackhi_epi16(x[2], x[3]);
    192   w2 = _mm256_unpackhi_epi16(x[4], x[5]);
    193   w3 = _mm256_unpackhi_epi16(x[6], x[7]);
    194 
    195   ww0 = _mm256_unpacklo_epi32(w0, w1);  //
    196   ww1 = _mm256_unpacklo_epi32(w2, w3);  //
    197 
    198   dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
    199   dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
    200 
    201   ww0 = _mm256_unpackhi_epi32(w0, w1);  //
    202   ww1 = _mm256_unpackhi_epi32(w2, w3);  //
    203 
    204   dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
    205   dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
    206 
    207   w0 = _mm256_unpacklo_epi16(x[8], x[9]);
    208   w1 = _mm256_unpacklo_epi16(x[10], x[11]);
    209   w2 = _mm256_unpacklo_epi16(x[12], x[13]);
    210   w3 = _mm256_unpacklo_epi16(x[14], x[15]);
    211 
    212   ww0 = _mm256_unpacklo_epi32(w0, w1);
    213   ww1 = _mm256_unpacklo_epi32(w2, w3);
    214 
    215   dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
    216   dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
    217 
    218   ww0 = _mm256_unpackhi_epi32(w0, w1);
    219   ww1 = _mm256_unpackhi_epi32(w2, w3);
    220 
    221   dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
    222   dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
    223 
    224   w0 = _mm256_unpackhi_epi16(x[8], x[9]);
    225   w1 = _mm256_unpackhi_epi16(x[10], x[11]);
    226   w2 = _mm256_unpackhi_epi16(x[12], x[13]);
    227   w3 = _mm256_unpackhi_epi16(x[14], x[15]);
    228 
    229   ww0 = _mm256_unpacklo_epi32(w0, w1);
    230   ww1 = _mm256_unpacklo_epi32(w2, w3);
    231 
    232   dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
    233   dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
    234 
    235   ww0 = _mm256_unpackhi_epi32(w0, w1);
    236   ww1 = _mm256_unpackhi_epi32(w2, w3);
    237 
    238   dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
    239   dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
    240 
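  // dd[i] (i < 8) holds column i of input rows 0..7 in its low 128-bit lane
  // and column i + 8 of rows 0..7 in its high lane; dd[i + 8] holds the same
  // columns for rows 8..15. The insert/extract below regroups the lanes so
  // that d[i] is full output row i and d[i + 8] is output row i + 8.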
    241   for (int i = 0; i < 8; i++) {
    242     d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
    243     d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
    244                                        _mm256_extracti128_si256(dd[i], 1), 0);
    245   }
    246 }
    247 
    248 void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
    249                                  const uint8_t *above, const uint8_t *left) {
    250   const __m256i sum_above = dc_sum_32(above);
    251   __m256i sum_left = dc_sum_32(left);
    252   sum_left = _mm256_add_epi16(sum_left, sum_above);
    253   const __m256i thirtytwo = _mm256_set1_epi16(32);
    254   sum_left = _mm256_add_epi16(sum_left, thirtytwo);
    255   sum_left = _mm256_srai_epi16(sum_left, 6);
    256   const __m256i zero = _mm256_setzero_si256();
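  // Broadcast the 8-bit DC value: after the shift the average sits in the low
  // word of each 128-bit lane (both lanes hold the same value thanks to the
  // lane fold in dc_sum_32), and _mm256_shuffle_epi8 with an all-zero mask
  // replicates byte 0 of each lane across all 32 bytes.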
    257   __m256i row = _mm256_shuffle_epi8(sum_left, zero);
    258   row_store_32xh(&row, 32, dst, stride);
    259 }
    260 
    261 void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
    262                                      const uint8_t *above,
    263                                      const uint8_t *left) {
    264   __m256i sum = dc_sum_32(above);
    265   (void)left;
    266 
    267   const __m256i sixteen = _mm256_set1_epi16(16);
    268   sum = _mm256_add_epi16(sum, sixteen);
    269   sum = _mm256_srai_epi16(sum, 5);
    270   const __m256i zero = _mm256_setzero_si256();
    271   __m256i row = _mm256_shuffle_epi8(sum, zero);
    272   row_store_32xh(&row, 32, dst, stride);
    273 }
    274 
    275 void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
    276                                       const uint8_t *above,
    277                                       const uint8_t *left) {
    278   __m256i sum = dc_sum_32(left);
    279   (void)above;
    280 
    281   const __m256i sixteen = _mm256_set1_epi16(16);
    282   sum = _mm256_add_epi16(sum, sixteen);
    283   sum = _mm256_srai_epi16(sum, 5);
    284   const __m256i zero = _mm256_setzero_si256();
    285   __m256i row = _mm256_shuffle_epi8(sum, zero);
    286   row_store_32xh(&row, 32, dst, stride);
    287 }
    288 
    289 void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
    290                                      const uint8_t *above,
    291                                      const uint8_t *left) {
    292   (void)above;
    293   (void)left;
    294   const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
    295   row_store_32xh(&row, 32, dst, stride);
    296 }
    297 
    298 void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
    299                                 const uint8_t *above, const uint8_t *left) {
    300   const __m256i row = _mm256_loadu_si256((const __m256i *)above);
    301   (void)left;
    302   row_store_32xh(&row, 32, dst, stride);
    303 }
    304 
    305 // There are 32 rows together. This function does lines
    306 // 0,1,2,3 and 16,17,18,19. The next call does lines
    307 // 4,5,6,7 and 20,21,22,23, so four calls in total
    308 // cover all 32 rows.
    309 static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
    310                                         ptrdiff_t stride) {
    311   __m256i t[4];
    312   __m256i m = _mm256_setzero_si256();
    313   const __m256i inc = _mm256_set1_epi8(4);
    314   int i;
    315 
    316   for (i = 0; i < 4; i++) {
    317     t[i] = _mm256_shuffle_epi8(*row, m);
    318     __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
    319     __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
    320     _mm256_storeu_si256((__m256i *)dst, r0);
    321     _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
    322     dst += stride;
    323     m = _mm256_add_epi8(m, inc);
    324   }
    325 }
    326 
    327 void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
    328                                 const uint8_t *above, const uint8_t *left) {
    329   (void)above;
    330   const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
    331 
    332   __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
    333 
    334   __m256i v = _mm256_unpacklo_epi8(u, u);
    335   h_predictor_32x8line(&v, dst, stride);
    336   dst += stride << 2;
    337 
    338   v = _mm256_unpackhi_epi8(u, u);
    339   h_predictor_32x8line(&v, dst, stride);
    340   dst += stride << 2;
    341 
    342   u = _mm256_unpackhi_epi8(left_col, left_col);
    343 
    344   v = _mm256_unpacklo_epi8(u, u);
    345   h_predictor_32x8line(&v, dst, stride);
    346   dst += stride << 2;
    347 
    348   v = _mm256_unpackhi_epi8(u, u);
    349   h_predictor_32x8line(&v, dst, stride);
    350 }
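
// A scalar sketch of the H predictor above (illustrative only, not used):
// every pixel in output row r is simply a copy of left[r].
static INLINE void h_predictor_32x32_scalar_sketch(uint8_t *dst,
                                                   ptrdiff_t stride,
                                                   const uint8_t *left) {
  for (int r = 0; r < 32; ++r) {
    for (int c = 0; c < 32; ++c) dst[r * stride + c] = left[r];
  }
}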
    351 
    352 // -----------------------------------------------------------------------------
    353 // Rectangle
    354 
    355 // TODO(luoyi) The following two functions are shared with intrapred_sse2.c.
    356 // Use a header file, intrapred_common_x86.h
    357 static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
    358   __m128i x = _mm_load_si128((__m128i const *)ref);
    359   const __m128i zero = _mm_setzero_si128();
    360   x = _mm_sad_epu8(x, zero);
    361   const __m128i high = _mm_unpackhi_epi64(x, x);
    362   return _mm_add_epi16(x, high);
    363 }
    364 
    365 static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
    366   __m128i x0 = _mm_load_si128((__m128i const *)ref);
    367   __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
    368   const __m128i zero = _mm_setzero_si128();
    369   x0 = _mm_sad_epu8(x0, zero);
    370   x1 = _mm_sad_epu8(x1, zero);
    371   x0 = _mm_add_epi16(x0, x1);
    372   const __m128i high = _mm_unpackhi_epi64(x0, x0);
    373   return _mm_add_epi16(x0, high);
    374 }
    375 
    376 void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
    377                                  const uint8_t *above, const uint8_t *left) {
    378   const __m128i top_sum = dc_sum_32_sse2(above);
    379   __m128i left_sum = dc_sum_16_sse2(left);
    380   left_sum = _mm_add_epi16(top_sum, left_sum);
    381   uint16_t sum = _mm_cvtsi128_si32(left_sum);
    382   sum += 24;
    383   sum /= 48;
    384   const __m256i row = _mm256_set1_epi8((uint8_t)sum);
    385   row_store_32xh(&row, 16, dst, stride);
    386 }
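
// The rectangular DC predictors above and below divide by the number of
// boundary pixels (here 32 above + 16 left = 48) with rounding: adding half
// the divisor before dividing implements round-to-nearest. For example, if
// every boundary pixel is 100, sum = 4800 and (4800 + 24) / 48 = 100.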
    387 
    388 void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
    389                                  const uint8_t *above, const uint8_t *left) {
    390   const __m256i sum_above = dc_sum_32(above);
    391   __m256i sum_left = dc_sum_64(left);
    392   sum_left = _mm256_add_epi16(sum_left, sum_above);
    393   uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
    394   sum += 48;
    395   sum /= 96;
    396   const __m256i row = _mm256_set1_epi8((uint8_t)sum);
    397   row_store_32xh(&row, 64, dst, stride);
    398 }
    399 
    400 void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
    401                                  const uint8_t *above, const uint8_t *left) {
    402   const __m256i sum_above = dc_sum_64(above);
    403   __m256i sum_left = dc_sum_64(left);
    404   sum_left = _mm256_add_epi16(sum_left, sum_above);
    405   uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
    406   sum += 64;
    407   sum /= 128;
    408   const __m256i row = _mm256_set1_epi8((uint8_t)sum);
    409   row_store_64xh(&row, 64, dst, stride);
    410 }
    411 
    412 void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
    413                                  const uint8_t *above, const uint8_t *left) {
    414   const __m256i sum_above = dc_sum_64(above);
    415   __m256i sum_left = dc_sum_32(left);
    416   sum_left = _mm256_add_epi16(sum_left, sum_above);
    417   uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
    418   sum += 48;
    419   sum /= 96;
    420   const __m256i row = _mm256_set1_epi8((uint8_t)sum);
    421   row_store_64xh(&row, 32, dst, stride);
    422 }
    423 
    424 void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
    425                                  const uint8_t *above, const uint8_t *left) {
    426   const __m256i sum_above = dc_sum_64(above);
    427   __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
    428   sum_left = _mm256_add_epi16(sum_left, sum_above);
    429   uint16_t sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
    430   sum += 40;
    431   sum /= 80;
    432   const __m256i row = _mm256_set1_epi8((uint8_t)sum);
    433   row_store_64xh(&row, 16, dst, stride);
    434 }
    435 
    436 void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
    437                                      const uint8_t *above,
    438                                      const uint8_t *left) {
    439   __m256i sum = dc_sum_32(above);
    440   (void)left;
    441 
    442   const __m256i sixteen = _mm256_set1_epi16(16);
    443   sum = _mm256_add_epi16(sum, sixteen);
    444   sum = _mm256_srai_epi16(sum, 5);
    445   const __m256i zero = _mm256_setzero_si256();
    446   __m256i row = _mm256_shuffle_epi8(sum, zero);
    447   row_store_32xh(&row, 16, dst, stride);
    448 }
    449 
    450 void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
    451                                      const uint8_t *above,
    452                                      const uint8_t *left) {
    453   __m256i sum = dc_sum_32(above);
    454   (void)left;
    455 
    456   const __m256i sixteen = _mm256_set1_epi16(16);
    457   sum = _mm256_add_epi16(sum, sixteen);
    458   sum = _mm256_srai_epi16(sum, 5);
    459   const __m256i zero = _mm256_setzero_si256();
    460   __m256i row = _mm256_shuffle_epi8(sum, zero);
    461   row_store_32xh(&row, 64, dst, stride);
    462 }
    463 
    464 void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
    465                                      const uint8_t *above,
    466                                      const uint8_t *left) {
    467   __m256i sum = dc_sum_64(above);
    468   (void)left;
    469 
    470   const __m256i thirtytwo = _mm256_set1_epi16(32);
    471   sum = _mm256_add_epi16(sum, thirtytwo);
    472   sum = _mm256_srai_epi16(sum, 6);
    473   const __m256i zero = _mm256_setzero_si256();
    474   __m256i row = _mm256_shuffle_epi8(sum, zero);
    475   row_store_64xh(&row, 64, dst, stride);
    476 }
    477 
    478 void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
    479                                      const uint8_t *above,
    480                                      const uint8_t *left) {
    481   __m256i sum = dc_sum_64(above);
    482   (void)left;
    483 
    484   const __m256i thirtytwo = _mm256_set1_epi16(32);
    485   sum = _mm256_add_epi16(sum, thirtytwo);
    486   sum = _mm256_srai_epi16(sum, 6);
    487   const __m256i zero = _mm256_setzero_si256();
    488   __m256i row = _mm256_shuffle_epi8(sum, zero);
    489   row_store_64xh(&row, 32, dst, stride);
    490 }
    491 
    492 void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
    493                                      const uint8_t *above,
    494                                      const uint8_t *left) {
    495   __m256i sum = dc_sum_64(above);
    496   (void)left;
    497 
    498   const __m256i thirtytwo = _mm256_set1_epi16(32);
    499   sum = _mm256_add_epi16(sum, thirtytwo);
    500   sum = _mm256_srai_epi16(sum, 6);
    501   const __m256i zero = _mm256_setzero_si256();
    502   __m256i row = _mm256_shuffle_epi8(sum, zero);
    503   row_store_64xh(&row, 16, dst, stride);
    504 }
    505 
    506 void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
    507                                       const uint8_t *above,
    508                                       const uint8_t *left) {
    509   __m128i sum = dc_sum_16_sse2(left);
    510   (void)above;
    511 
    512   const __m128i eight = _mm_set1_epi16(8);
    513   sum = _mm_add_epi16(sum, eight);
    514   sum = _mm_srai_epi16(sum, 4);
    515   const __m128i zero = _mm_setzero_si128();
    516   const __m128i r = _mm_shuffle_epi8(sum, zero);
    517   const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
    518   row_store_32xh(&row, 16, dst, stride);
    519 }
    520 
    521 void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
    522                                       const uint8_t *above,
    523                                       const uint8_t *left) {
    524   __m256i sum = dc_sum_64(left);
    525   (void)above;
    526 
    527   const __m256i thirtytwo = _mm256_set1_epi16(32);
    528   sum = _mm256_add_epi16(sum, thirtytwo);
    529   sum = _mm256_srai_epi16(sum, 6);
    530   const __m256i zero = _mm256_setzero_si256();
    531   __m256i row = _mm256_shuffle_epi8(sum, zero);
    532   row_store_32xh(&row, 64, dst, stride);
    533 }
    534 
    535 void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
    536                                       const uint8_t *above,
    537                                       const uint8_t *left) {
    538   __m256i sum = dc_sum_64(left);
    539   (void)above;
    540 
    541   const __m256i thirtytwo = _mm256_set1_epi16(32);
    542   sum = _mm256_add_epi16(sum, thirtytwo);
    543   sum = _mm256_srai_epi16(sum, 6);
    544   const __m256i zero = _mm256_setzero_si256();
    545   __m256i row = _mm256_shuffle_epi8(sum, zero);
    546   row_store_64xh(&row, 64, dst, stride);
    547 }
    548 
    549 void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
    550                                       const uint8_t *above,
    551                                       const uint8_t *left) {
    552   __m256i sum = dc_sum_32(left);
    553   (void)above;
    554 
    555   const __m256i sixteen = _mm256_set1_epi16(16);
    556   sum = _mm256_add_epi16(sum, sixteen);
    557   sum = _mm256_srai_epi16(sum, 5);
    558   const __m256i zero = _mm256_setzero_si256();
    559   __m256i row = _mm256_shuffle_epi8(sum, zero);
    560   row_store_64xh(&row, 32, dst, stride);
    561 }
    562 
    563 void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
    564                                       const uint8_t *above,
    565                                       const uint8_t *left) {
    566   __m128i sum = dc_sum_16_sse2(left);
    567   (void)above;
    568 
    569   const __m128i eight = _mm_set1_epi16(8);
    570   sum = _mm_add_epi16(sum, eight);
    571   sum = _mm_srai_epi16(sum, 4);
    572   const __m128i zero = _mm_setzero_si128();
    573   const __m128i r = _mm_shuffle_epi8(sum, zero);
    574   const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
    575   row_store_64xh(&row, 16, dst, stride);
    576 }
    577 
    578 void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
    579                                      const uint8_t *above,
    580                                      const uint8_t *left) {
    581   (void)above;
    582   (void)left;
    583   const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
    584   row_store_32xh(&row, 16, dst, stride);
    585 }
    586 
    587 void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
    588                                      const uint8_t *above,
    589                                      const uint8_t *left) {
    590   (void)above;
    591   (void)left;
    592   const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
    593   row_store_32xh(&row, 64, dst, stride);
    594 }
    595 
    596 void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
    597                                      const uint8_t *above,
    598                                      const uint8_t *left) {
    599   (void)above;
    600   (void)left;
    601   const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
    602   row_store_64xh(&row, 64, dst, stride);
    603 }
    604 
    605 void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
    606                                      const uint8_t *above,
    607                                      const uint8_t *left) {
    608   (void)above;
    609   (void)left;
    610   const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
    611   row_store_64xh(&row, 32, dst, stride);
    612 }
    613 
    614 void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
    615                                      const uint8_t *above,
    616                                      const uint8_t *left) {
    617   (void)above;
    618   (void)left;
    619   const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
    620   row_store_64xh(&row, 16, dst, stride);
    621 }
    622 
    623 void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
    624                                 const uint8_t *above, const uint8_t *left) {
    625   const __m256i row = _mm256_loadu_si256((const __m256i *)above);
    626   (void)left;
    627   row_store_32xh(&row, 16, dst, stride);
    628 }
    629 
    630 void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
    631                                 const uint8_t *above, const uint8_t *left) {
    632   const __m256i row = _mm256_loadu_si256((const __m256i *)above);
    633   (void)left;
    634   row_store_32xh(&row, 64, dst, stride);
    635 }
    636 
    637 void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
    638                                 const uint8_t *above, const uint8_t *left) {
    639   const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
    640   const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
    641   (void)left;
    642   row_store_32x2xh(&row0, &row1, 64, dst, stride);
    643 }
    644 
    645 void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
    646                                 const uint8_t *above, const uint8_t *left) {
    647   const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
    648   const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
    649   (void)left;
    650   row_store_32x2xh(&row0, &row1, 32, dst, stride);
    651 }
    652 
    653 void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
    654                                 const uint8_t *above, const uint8_t *left) {
    655   const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
    656   const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
    657   (void)left;
    658   row_store_32x2xh(&row0, &row1, 16, dst, stride);
    659 }
    660 
    661 // -----------------------------------------------------------------------------
    662 // PAETH_PRED
    663 
    664 // Return 16 16-bit pixels in one row (__m256i)
    665 static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
    666                                  const __m256i *topleft) {
    667   const __m256i base =
    668       _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
    669 
    670   __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
    671   __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
    672   __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
    673 
    674   __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
    675   mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
    676   __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
    677 
    678   pl = _mm256_andnot_si256(mask1, *left);
    679 
    680   ptl = _mm256_and_si256(mask2, *topleft);
    681   pt = _mm256_andnot_si256(mask2, *top);
    682   pt = _mm256_or_si256(pt, ptl);
    683   pt = _mm256_and_si256(mask1, pt);
    684 
    685   return _mm256_or_si256(pt, pl);
    686 }
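
// A scalar sketch of the branchless Paeth selection implemented above
// (illustrative only, not used): pick the neighbour closest to
// left + top - topleft, preferring left, then top, then topleft on ties.
static INLINE uint16_t paeth_pred_scalar_sketch(uint16_t left, uint16_t top,
                                                uint16_t topleft) {
  const int base = (int)left + (int)top - (int)topleft;
  int pl = base - (int)left, pt = base - (int)top, ptl = base - (int)topleft;
  if (pl < 0) pl = -pl;
  if (pt < 0) pt = -pt;
  if (ptl < 0) ptl = -ptl;
  if (pl <= pt && pl <= ptl) return left;
  return (pt <= ptl) ? top : topleft;
}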
    687 
    688 // Return 16 8-bit pixels in one row (__m128i)
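// The permute below copies the upper 128-bit half of p0 next to its lower
// half so that the lane-wise _mm256_packus_epi16 packs all 16 words into the
// low 128 bits of the result.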
    689 static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
    690                                       const __m256i *topleft) {
    691   const __m256i p0 = paeth_pred(left, top, topleft);
    692   const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
    693   const __m256i p = _mm256_packus_epi16(p0, p1);
    694   return _mm256_castsi256_si128(p);
    695 }
    696 
    697 static INLINE __m256i get_top_vector(const uint8_t *above) {
    698   const __m128i x = _mm_load_si128((const __m128i *)above);
    699   const __m128i zero = _mm_setzero_si128();
    700   const __m128i t0 = _mm_unpacklo_epi8(x, zero);
    701   const __m128i t1 = _mm_unpackhi_epi8(x, zero);
    702   return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
    703 }
    704 
    705 void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
    706                                    const uint8_t *above, const uint8_t *left) {
    707   __m128i x = _mm_loadl_epi64((const __m128i *)left);
    708   const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
    709   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
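  // rep is a shuffle control: every 16-bit word is 0x80NN. The 0x80 byte has
  // its high bit set, so _mm256_shuffle_epi8 writes 0 there, while byte NN
  // selects left[NN]. Incrementing rep each iteration therefore broadcasts
  // left[row], zero-extended to 16 bits, across the whole register.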
    710   __m256i rep = _mm256_set1_epi16(0x8000);
    711   const __m256i one = _mm256_set1_epi16(1);
    712   const __m256i top = get_top_vector(above);
    713 
    714   int i;
    715   for (i = 0; i < 8; ++i) {
    716     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    717     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
    718 
    719     _mm_store_si128((__m128i *)dst, row);
    720     dst += stride;
    721     rep = _mm256_add_epi16(rep, one);
    722   }
    723 }
    724 
    725 static INLINE __m256i get_left_vector(const uint8_t *left) {
    726   const __m128i x = _mm_load_si128((const __m128i *)left);
    727   return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
    728 }
    729 
    730 void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
    731                                     const uint8_t *above, const uint8_t *left) {
    732   const __m256i l = get_left_vector(left);
    733   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
    734   __m256i rep = _mm256_set1_epi16(0x8000);
    735   const __m256i one = _mm256_set1_epi16(1);
    736   const __m256i top = get_top_vector(above);
    737 
    738   int i;
    739   for (i = 0; i < 16; ++i) {
    740     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    741     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
    742 
    743     _mm_store_si128((__m128i *)dst, row);
    744     dst += stride;
    745     rep = _mm256_add_epi16(rep, one);
    746   }
    747 }
    748 
    749 void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
    750                                     const uint8_t *above, const uint8_t *left) {
    751   __m256i l = get_left_vector(left);
    752   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
    753   __m256i rep = _mm256_set1_epi16(0x8000);
    754   const __m256i one = _mm256_set1_epi16(1);
    755   const __m256i top = get_top_vector(above);
    756 
    757   int i;
    758   for (i = 0; i < 16; ++i) {
    759     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    760     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
    761 
    762     _mm_store_si128((__m128i *)dst, row);
    763     dst += stride;
    764     rep = _mm256_add_epi16(rep, one);
    765   }
    766 
    767   l = get_left_vector(left + 16);
    768   rep = _mm256_set1_epi16(0x8000);
    769   for (i = 0; i < 16; ++i) {
    770     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    771     const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
    772 
    773     _mm_store_si128((__m128i *)dst, row);
    774     dst += stride;
    775     rep = _mm256_add_epi16(rep, one);
    776   }
    777 }
    778 
    779 void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
    780                                     const uint8_t *above, const uint8_t *left) {
    781   const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);
    782   const __m256i one = _mm256_set1_epi16(1);
    783   const __m256i top = get_top_vector(above);
    784 
    785   for (int j = 0; j < 4; ++j) {
    786     const __m256i l = get_left_vector(left + j * 16);
    787     __m256i rep = _mm256_set1_epi16(0x8000);
    788     for (int i = 0; i < 16; ++i) {
    789       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    790       const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
    791 
    792       _mm_store_si128((__m128i *)dst, row);
    793       dst += stride;
    794       rep = _mm256_add_epi16(rep, one);
    795     }
    796   }
    797 }
    798 
    799 // Return 32 8-bit pixels in one row (__m256i)
    800 static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
    801                                       const __m256i *top1,
    802                                       const __m256i *topleft) {
    803   __m256i p0 = paeth_pred(left, top0, topleft);
    804   __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
    805   const __m256i x0 = _mm256_packus_epi16(p0, p1);
    806 
    807   p0 = paeth_pred(left, top1, topleft);
    808   p1 = _mm256_permute4x64_epi64(p0, 0xe);
    809   const __m256i x1 = _mm256_packus_epi16(p0, p1);
    810 
    811   return _mm256_permute2x128_si256(x0, x1, 0x20);
    812 }
    813 
    814 void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
    815                                     const uint8_t *above, const uint8_t *left) {
    816   const __m256i l = get_left_vector(left);
    817   const __m256i t0 = get_top_vector(above);
    818   const __m256i t1 = get_top_vector(above + 16);
    819   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    820   __m256i rep = _mm256_set1_epi16(0x8000);
    821   const __m256i one = _mm256_set1_epi16(1);
    822 
    823   int i;
    824   for (i = 0; i < 16; ++i) {
    825     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    826 
    827     const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
    828 
    829     _mm256_storeu_si256((__m256i *)dst, r);
    830 
    831     dst += stride;
    832     rep = _mm256_add_epi16(rep, one);
    833   }
    834 }
    835 
    836 void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
    837                                     const uint8_t *above, const uint8_t *left) {
    838   __m256i l = get_left_vector(left);
    839   const __m256i t0 = get_top_vector(above);
    840   const __m256i t1 = get_top_vector(above + 16);
    841   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    842   __m256i rep = _mm256_set1_epi16(0x8000);
    843   const __m256i one = _mm256_set1_epi16(1);
    844 
    845   int i;
    846   for (i = 0; i < 16; ++i) {
    847     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    848 
    849     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    850     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    851 
    852     _mm_store_si128((__m128i *)dst, r0);
    853     _mm_store_si128((__m128i *)(dst + 16), r1);
    854 
    855     dst += stride;
    856     rep = _mm256_add_epi16(rep, one);
    857   }
    858 
    859   l = get_left_vector(left + 16);
    860   rep = _mm256_set1_epi16(0x8000);
    861   for (i = 0; i < 16; ++i) {
    862     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    863 
    864     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    865     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    866 
    867     _mm_store_si128((__m128i *)dst, r0);
    868     _mm_store_si128((__m128i *)(dst + 16), r1);
    869 
    870     dst += stride;
    871     rep = _mm256_add_epi16(rep, one);
    872   }
    873 }
    874 
    875 void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
    876                                     const uint8_t *above, const uint8_t *left) {
    877   const __m256i t0 = get_top_vector(above);
    878   const __m256i t1 = get_top_vector(above + 16);
    879   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    880   const __m256i one = _mm256_set1_epi16(1);
    881 
    882   int i, j;
    883   for (j = 0; j < 4; ++j) {
    884     const __m256i l = get_left_vector(left + j * 16);
    885     __m256i rep = _mm256_set1_epi16(0x8000);
    886     for (i = 0; i < 16; ++i) {
    887       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    888 
    889       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    890       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    891 
    892       _mm_store_si128((__m128i *)dst, r0);
    893       _mm_store_si128((__m128i *)(dst + 16), r1);
    894 
    895       dst += stride;
    896       rep = _mm256_add_epi16(rep, one);
    897     }
    898   }
    899 }
    900 
    901 void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
    902                                     const uint8_t *above, const uint8_t *left) {
    903   const __m256i t0 = get_top_vector(above);
    904   const __m256i t1 = get_top_vector(above + 16);
    905   const __m256i t2 = get_top_vector(above + 32);
    906   const __m256i t3 = get_top_vector(above + 48);
    907   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    908   const __m256i one = _mm256_set1_epi16(1);
    909 
    910   int i, j;
    911   for (j = 0; j < 2; ++j) {
    912     const __m256i l = get_left_vector(left + j * 16);
    913     __m256i rep = _mm256_set1_epi16(0x8000);
    914     for (i = 0; i < 16; ++i) {
    915       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    916 
    917       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    918       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    919       const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
    920       const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
    921 
    922       _mm_store_si128((__m128i *)dst, r0);
    923       _mm_store_si128((__m128i *)(dst + 16), r1);
    924       _mm_store_si128((__m128i *)(dst + 32), r2);
    925       _mm_store_si128((__m128i *)(dst + 48), r3);
    926 
    927       dst += stride;
    928       rep = _mm256_add_epi16(rep, one);
    929     }
    930   }
    931 }
    932 
    933 void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
    934                                     const uint8_t *above, const uint8_t *left) {
    935   const __m256i t0 = get_top_vector(above);
    936   const __m256i t1 = get_top_vector(above + 16);
    937   const __m256i t2 = get_top_vector(above + 32);
    938   const __m256i t3 = get_top_vector(above + 48);
    939   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    940   const __m256i one = _mm256_set1_epi16(1);
    941 
    942   int i, j;
    943   for (j = 0; j < 4; ++j) {
    944     const __m256i l = get_left_vector(left + j * 16);
    945     __m256i rep = _mm256_set1_epi16(0x8000);
    946     for (i = 0; i < 16; ++i) {
    947       const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    948 
    949       const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    950       const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    951       const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
    952       const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
    953 
    954       _mm_store_si128((__m128i *)dst, r0);
    955       _mm_store_si128((__m128i *)(dst + 16), r1);
    956       _mm_store_si128((__m128i *)(dst + 32), r2);
    957       _mm_store_si128((__m128i *)(dst + 48), r3);
    958 
    959       dst += stride;
    960       rep = _mm256_add_epi16(rep, one);
    961     }
    962   }
    963 }
    964 
    965 void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
    966                                     const uint8_t *above, const uint8_t *left) {
    967   const __m256i t0 = get_top_vector(above);
    968   const __m256i t1 = get_top_vector(above + 16);
    969   const __m256i t2 = get_top_vector(above + 32);
    970   const __m256i t3 = get_top_vector(above + 48);
    971   const __m256i tl = _mm256_set1_epi16((uint16_t)above[-1]);
    972   const __m256i one = _mm256_set1_epi16(1);
    973 
    974   int i;
    975   const __m256i l = get_left_vector(left);
    976   __m256i rep = _mm256_set1_epi16(0x8000);
    977   for (i = 0; i < 16; ++i) {
    978     const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    979 
    980     const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    981     const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    982     const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
    983     const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
    984 
    985     _mm_store_si128((__m128i *)dst, r0);
    986     _mm_store_si128((__m128i *)(dst + 16), r1);
    987     _mm_store_si128((__m128i *)(dst + 32), r2);
    988     _mm_store_si128((__m128i *)(dst + 48), r3);
    989 
    990     dst += stride;
    991     rep = _mm256_add_epi16(rep, one);
    992   }
    993 }
    994 
    995 #define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6)
    996 #define PERM2x128(c0, c1) c0 + (c1 << 4)
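
// For example, PERM4x64(0, 2, 1, 3) evaluates to 0xd8, the immediate that
// keeps 64-bit lanes 0 and 3 in place while swapping lanes 1 and 2.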
    997 
    998 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
    999     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
   1000   const int frac_bits = 6 - upsample_above;
   1001   const int max_base_x = ((N + 4) - 1) << upsample_above;
   1002   int x;
   1003   // assert(dx > 0);
   1004   // pre-filter above pixels
   1005   // store in temp buffers:
   1006   //   above[x] * 32 + 16
   1007   //   above[x+1] - above[x]
   1008   // final pixels will be calculated as:
   1009   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
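  // Worked example (illustrative): with above[x] = 10, above[x+1] = 18 and
  // shift = 8 (i.e. 8/32 of the way to the next sample), the result is
  // (10 * 32 + 16 + 8 * 8) >> 5 = 400 >> 5 = 12, the rounded interpolation
  // 10 + (18 - 10) * 8 / 32.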
   1010   __m256i a0, a1, a32, a16;
   1011   __m256i diff;
   1012   __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
   1013 
   1014   a16 = _mm256_set1_epi32(16);
   1015   a_mbase_x = _mm_set1_epi16(above[max_base_x]);
   1016   max_base_x128 = _mm_set1_epi32(max_base_x);
   1017 
   1018   x = dx;
   1019   for (int r = 0; r < N; r++) {
   1020     __m256i b, res, shift;
   1021     __m128i res1;
   1022 
   1023     int base = x >> frac_bits;
   1024     if (base >= max_base_x) {
   1025       for (int i = r; i < N; ++i) {
   1026         dst[i] = a_mbase_x;  // save 4 values
   1027       }
   1028       return;
   1029     }
   1030 
   1031     a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
   1032     a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
   1033 
   1034     if (upsample_above) {
   1035       a0 = _mm256_permutevar8x32_epi32(
   1036           a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
   1037       a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
   1038       base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
   1039       shift = _mm256_srli_epi32(
   1040           _mm256_and_si256(
   1041               _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
   1042               _mm256_set1_epi32(0x3f)),
   1043           1);
   1044     } else {
   1045       base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
   1046       shift = _mm256_srli_epi32(
   1047           _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
   1048     }
   1049 
   1050     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   1051     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   1052     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   1053 
   1054     b = _mm256_mullo_epi32(diff, shift);
   1055     res = _mm256_add_epi32(a32, b);
   1056     res = _mm256_srli_epi32(res, 5);
   1057 
   1058     res1 = _mm256_castsi256_si128(res);
   1059     res1 = _mm_packus_epi32(res1, res1);
   1060 
   1061     mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
   1062     mask128 = _mm_packs_epi32(mask128, mask128);  // goto 16 bit
   1063     dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
   1064     x += dx;
   1065   }
   1066 }
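
// In scalar terms, the loop above computes output row r (illustrative
// summary) as: x = (r + 1) * dx, base = x >> (6 - upsample_above),
// shift = ((x << upsample_above) & 0x3f) >> 1, and, without upsampling,
// pixel c of the row is
//   (above[base + c] * 32 + 16 +
//    (above[base + c + 1] - above[base + c]) * shift) >> 5,
// with any position at or beyond max_base_x clamped to above[max_base_x].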
   1067 
   1068 static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
   1069                                              ptrdiff_t stride,
   1070                                              const uint16_t *above,
   1071                                              int upsample_above, int dx) {
   1072   __m128i dstvec[16];
   1073 
   1074   highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
   1075                                             dx);
   1076   for (int i = 0; i < N; i++) {
   1077     _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
   1078   }
   1079 }
   1080 
   1081 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
   1082     int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
   1083   const int frac_bits = 6 - upsample_above;
   1084   const int max_base_x = ((8 + N) - 1) << upsample_above;
   1085 
   1086   int x;
   1087   // assert(dx > 0);
   1088   // pre-filter above pixels
   1089   // store in temp buffers:
   1090   //   above[x] * 32 + 16
   1091   //   above[x+1] - above[x]
   1092   // final pixels will be calculated as:
   1093   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1094   __m256i a0, a1, a0_1, a1_1, a32, a16;
   1095   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1096 
   1097   a16 = _mm256_set1_epi32(16);
   1098   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1099   max_base_x256 = _mm256_set1_epi32(max_base_x);
   1100 
   1101   x = dx;
   1102   for (int r = 0; r < N; r++) {
   1103     __m256i b, res, res1, shift;
   1104 
   1105     int base = x >> frac_bits;
   1106     if (base >= max_base_x) {
   1107       for (int i = r; i < N; ++i) {
   1108         dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
   1109       }
   1110       return;
   1111     }
   1112 
   1113     a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
   1114     a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
   1115 
   1116     if (upsample_above) {
   1117       a0 = _mm256_permutevar8x32_epi32(
   1118           a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
   1119       a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
   1120 
   1121       a0_1 =
   1122           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
   1123       a0_1 = _mm256_permutevar8x32_epi32(
   1124           a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
   1125       a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
   1126 
   1127       a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
   1128       a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
   1129       base_inc256 =
   1130           _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
   1131                             base + 10, base + 12, base + 14);
   1132       shift = _mm256_srli_epi32(
   1133           _mm256_and_si256(
   1134               _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
   1135               _mm256_set1_epi32(0x3f)),
   1136           1);
   1137     } else {
   1138       base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
   1139                                       base + 4, base + 5, base + 6, base + 7);
   1140       shift = _mm256_srli_epi32(
   1141           _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
   1142     }
   1143 
   1144     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   1145     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   1146     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   1147 
   1148     b = _mm256_mullo_epi32(diff, shift);
   1149     res = _mm256_add_epi32(a32, b);
   1150     res = _mm256_srli_epi32(res, 5);
   1151 
   1152     res1 = _mm256_packus_epi32(
   1153         res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
   1154 
   1155     mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
   1156     mask256 = _mm256_packs_epi32(
   1157         mask256, _mm256_castsi128_si256(
   1158                      _mm256_extracti128_si256(mask256, 1)));  // goto 16 bit
   1159     res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
   1160     dst[r] = _mm256_castsi256_si128(res1);
   1161     x += dx;
   1162   }
   1163 }
   1164 
   1165 static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
   1166                                              ptrdiff_t stride,
   1167                                              const uint16_t *above,
   1168                                              int upsample_above, int dx) {
   1169   __m128i dstvec[32];
   1170 
   1171   highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
   1172                                             dx);
   1173   for (int i = 0; i < N; i++) {
   1174     _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
   1175   }
   1176 }
   1177 
   1178 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
   1179     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
   1180   int x;
   1181   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   1182   (void)upsample_above;
   1183   const int frac_bits = 6;
   1184   const int max_base_x = ((16 + N) - 1);
   1185 
   1186   // pre-filter above pixels
   1187   // store in temp buffers:
   1188   //   above[x] * 32 + 16
   1189   //   above[x+1] - above[x]
   1190   // final pixels will be calculated as:
   1191   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1192   __m256i a0, a0_1, a1, a1_1, a32, a16;
   1193   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1194 
   1195   a16 = _mm256_set1_epi32(16);
   1196   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1197   max_base_x256 = _mm256_set1_epi16(max_base_x);
   1198 
   1199   x = dx;
   1200   for (int r = 0; r < N; r++) {
   1201     __m256i b, res[2], res1;
   1202 
   1203     int base = x >> frac_bits;
   1204     if (base >= max_base_x) {
   1205       for (int i = r; i < N; ++i) {
   1206         dstvec[i] = a_mbase_x;  // save 16 values
   1207       }
   1208       return;
   1209     }
   1210     __m256i shift = _mm256_srli_epi32(
   1211         _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
   1212 
   1213     a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
   1214     a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
   1215 
   1216     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   1217     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   1218     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   1219     b = _mm256_mullo_epi32(diff, shift);
   1220 
   1221     res[0] = _mm256_add_epi32(a32, b);
   1222     res[0] = _mm256_srli_epi32(res[0], 5);
   1223     res[0] = _mm256_packus_epi32(
   1224         res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
   1225 
   1226     int mdif = max_base_x - base;
   1227     if (mdif > 8) {
   1228       a0_1 =
   1229           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
   1230       a1_1 =
   1231           _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
   1232 
   1233       diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
   1234       a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
   1235       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   1236       b = _mm256_mullo_epi32(diff, shift);
   1237 
   1238       res[1] = _mm256_add_epi32(a32, b);
   1239       res[1] = _mm256_srli_epi32(res[1], 5);
   1240       res[1] = _mm256_packus_epi32(
   1241           res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
   1242     } else {
   1243       res[1] = a_mbase_x;
   1244     }
   1245     res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
   1246                                    1);  // 16 16bit values
   1247 
   1248     base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
   1249                                     base + 4, base + 5, base + 6, base + 7,
   1250                                     base + 8, base + 9, base + 10, base + 11,
   1251                                     base + 12, base + 13, base + 14, base + 15);
   1252     mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
   1253     dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
   1254     x += dx;
   1255   }
   1256 }
   1257 
   1258 static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
   1259                                               ptrdiff_t stride,
   1260                                               const uint16_t *above,
   1261                                               int upsample_above, int dx) {
   1262   __m256i dstvec[64];
   1263   highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
   1264                                              dx);
   1265   for (int i = 0; i < N; i++) {
   1266     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
   1267   }
   1268 }
   1269 
   1270 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
   1271     int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
   1272   int x;
   1273   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   1274   (void)upsample_above;
   1275   const int frac_bits = 6;
   1276   const int max_base_x = ((32 + N) - 1);
   1277 
   1278   // pre-filter above pixels
   1279   // store in temp buffers:
   1280   //   above[x] * 32 + 16
   1281   //   above[x+1] - above[x]
   1282   // final pixels will be calculated as:
   1283   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1284   __m256i a0, a0_1, a1, a1_1, a32, a16;
   1285   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1286 
   1287   a16 = _mm256_set1_epi32(16);
   1288   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1289   max_base_x256 = _mm256_set1_epi16(max_base_x);
   1290 
   1291   x = dx;
   1292   for (int r = 0; r < N; r++) {
   1293     __m256i b, res[2], res1;
   1294 
   1295     int base = x >> frac_bits;
   1296     if (base >= max_base_x) {
   1297       for (int i = r; i < N; ++i) {
   1298         dstvec[i] = a_mbase_x;  // save 32 values
   1299         dstvec[i + N] = a_mbase_x;
   1300       }
   1301       return;
   1302     }
   1303 
   1304     __m256i shift = _mm256_srli_epi32(
   1305         _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
   1306 
   1307     for (int j = 0; j < 32; j += 16) {
   1308       int mdif = max_base_x - (base + j);
   1309       if (mdif <= 0) {
   1310         res1 = a_mbase_x;
   1311       } else {
   1312         a0 = _mm256_cvtepu16_epi32(
   1313             _mm_loadu_si128((__m128i *)(above + base + j)));
   1314         a1 = _mm256_cvtepu16_epi32(
   1315             _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
   1316 
   1317         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   1318         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   1319         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   1320         b = _mm256_mullo_epi32(diff, shift);
   1321 
   1322         res[0] = _mm256_add_epi32(a32, b);
   1323         res[0] = _mm256_srli_epi32(res[0], 5);
   1324         res[0] = _mm256_packus_epi32(
   1325             res[0],
   1326             _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
   1327         if (mdif > 8) {
   1328           a0_1 = _mm256_cvtepu16_epi32(
   1329               _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
   1330           a1_1 = _mm256_cvtepu16_epi32(
   1331               _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
   1332 
   1333           diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
   1334           a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
   1335           a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   1336           b = _mm256_mullo_epi32(diff, shift);
   1337 
   1338           res[1] = _mm256_add_epi32(a32, b);
   1339           res[1] = _mm256_srli_epi32(res[1], 5);
   1340           res[1] = _mm256_packus_epi32(
   1341               res[1],
   1342               _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
   1343         } else {
   1344           res[1] = a_mbase_x;
   1345         }
   1346         res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
   1347                                        1);  // 16 16bit values
   1348         base_inc256 = _mm256_setr_epi16(
   1349             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
   1350             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
   1351             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
   1352             base + j + 13, base + j + 14, base + j + 15);
   1353 
   1354         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
   1355         res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
   1356       }
   1357       if (!j)
   1358         dstvec[r] = res1;
   1359       else
   1360         dstvec[r + N] = res1;
   1361     }
   1362     x += dx;
   1363   }
   1364 }
   1365 
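         // Zone 1, 32-wide: store the two 16-pixel halves of row i from dstvec[i]
         // and dstvec[i + N].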
   1366 static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
   1367                                               ptrdiff_t stride,
   1368                                               const uint16_t *above,
   1369                                               int upsample_above, int dx) {
   1370   __m256i dstvec[128];
   1371 
   1372   highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
   1373                                              dx);
   1374   for (int i = 0; i < N; i++) {
   1375     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
   1376     _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
   1377   }
   1378 }
   1379 
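         // Zone 1, 64-wide: no intermediate buffer; each 16-pixel segment of a row
         // is computed and written directly to dst.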
   1380 static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
   1381                                               ptrdiff_t stride,
   1382                                               const uint16_t *above,
   1383                                               int upsample_above, int dx) {
   1384   int x;
   1385 
   1386   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   1387   (void)upsample_above;
   1388   const int frac_bits = 6;
   1389   const int max_base_x = ((64 + N) - 1);
   1390 
   1391   // pre-filter above pixels
   1392   // store in temp buffers:
   1393   //   above[x] * 32 + 16
   1394   //   above[x+1] - above[x]
    1395   // final pixels will be calculated as:
   1396   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1397   __m256i a0, a0_1, a1, a1_1, a32, a16;
   1398   __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1399 
   1400   a16 = _mm256_set1_epi32(16);
   1401   a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1402   max_base_x256 = _mm256_set1_epi16(max_base_x);
   1403 
   1404   x = dx;
   1405   for (int r = 0; r < N; r++, dst += stride) {
   1406     __m256i b, res[2], res1;
   1407 
   1408     int base = x >> frac_bits;
   1409     if (base >= max_base_x) {
   1410       for (int i = r; i < N; ++i) {
    1411         _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
   1412         _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
   1413         _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
   1414         _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
   1415         dst += stride;
   1416       }
   1417       return;
   1418     }
   1419 
   1420     __m256i shift = _mm256_srli_epi32(
   1421         _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
   1422 
   1423     __m128i a0_128, a0_1_128, a1_128, a1_1_128;
   1424     for (int j = 0; j < 64; j += 16) {
   1425       int mdif = max_base_x - (base + j);
   1426       if (mdif <= 0) {
   1427         _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
   1428       } else {
   1429         a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
   1430         a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
   1431         a0 = _mm256_cvtepu16_epi32(a0_128);
   1432         a1 = _mm256_cvtepu16_epi32(a1_128);
   1433 
   1434         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   1435         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   1436         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   1437         b = _mm256_mullo_epi32(diff, shift);
   1438 
   1439         res[0] = _mm256_add_epi32(a32, b);
   1440         res[0] = _mm256_srli_epi32(res[0], 5);
   1441         res[0] = _mm256_packus_epi32(
   1442             res[0],
   1443             _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
   1444         if (mdif > 8) {
   1445           a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
   1446           a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
   1447           a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
   1448           a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
   1449 
   1450           diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
   1451           a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
   1452           a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   1453           b = _mm256_mullo_epi32(diff, shift);
   1454 
   1455           res[1] = _mm256_add_epi32(a32, b);
   1456           res[1] = _mm256_srli_epi32(res[1], 5);
   1457           res[1] = _mm256_packus_epi32(
   1458               res[1],
   1459               _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
   1460         } else {
   1461           res[1] = a_mbase_x;
   1462         }
   1463         res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
   1464                                        1);  // 16 16bit values
   1465         base_inc256 = _mm256_setr_epi16(
   1466             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
   1467             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
   1468             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
   1469             base + j + 13, base + j + 14, base + j + 15);
   1470 
   1471         mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
   1472         res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
   1473         _mm256_storeu_si256((__m256i *)(dst + j), res1);
   1474       }
   1475     }
   1476     x += dx;
   1477   }
   1478 }
   1479 
   1480 // Directional prediction, zone 1: 0 < angle < 90
   1481 void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
   1482                                       int bh, const uint16_t *above,
   1483                                       const uint16_t *left, int upsample_above,
   1484                                       int dx, int dy, int bd) {
   1485   (void)left;
   1486   (void)dy;
   1487   (void)bd;
   1488 
   1489   switch (bw) {
   1490     case 4:
   1491       highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
   1492                                        dx);
   1493       break;
   1494     case 8:
   1495       highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
   1496                                        dx);
   1497       break;
   1498     case 16:
   1499       highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
   1500                                         dx);
   1501       break;
   1502     case 32:
   1503       highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
   1504                                         dx);
   1505       break;
   1506     case 64:
   1507       highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above,
   1508                                         dx);
   1509       break;
   1510     default: break;
   1511   }
   1512   return;
   1513 }
   1514 
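         // Transpose one 8x8 tile of 16-bit pixels from src (row pitch pitchSrc) to
         // dst (row pitch pitchDst) using 16/32/64-bit unpack interleaves.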
   1515 static void highbd_transpose_TX_8X8(const uint16_t *src, ptrdiff_t pitchSrc,
   1516                                     uint16_t *dst, ptrdiff_t pitchDst) {
   1517   __m128i r0, r1, r2, r3, r4, r5, r6, r7, r0_Lo, r1_Lo, r2_Lo, r3_Lo, r4_Lo,
   1518       r5_Lo, r6_Lo;
   1519   r0 = _mm_load_si128(
   1520       (__m128i *)(src + 0 * pitchSrc));  // 07,06,05,04,03,02,01,00
   1521   r1 = _mm_load_si128(
   1522       (__m128i *)(src + 1 * pitchSrc));  // 17,16,15,14,13,12,11,10
   1523   r2 = _mm_load_si128(
   1524       (__m128i *)(src + 2 * pitchSrc));  // 27,26,25,24,23,22,21,20
   1525   r3 = _mm_load_si128(
   1526       (__m128i *)(src + 3 * pitchSrc));  // 37,36,35,34,33,32,31,30
   1527   r4 = _mm_load_si128(
   1528       (__m128i *)(src + 4 * pitchSrc));  // 47,46,45,44,43,42,41,40
   1529   r5 = _mm_load_si128(
   1530       (__m128i *)(src + 5 * pitchSrc));  // 57,56,55,54,53,52,51,50
   1531   r6 = _mm_load_si128(
   1532       (__m128i *)(src + 6 * pitchSrc));  // 67,66,65,64,63,62,61,60
   1533   r7 = _mm_load_si128(
   1534       (__m128i *)(src + 7 * pitchSrc));  // 77,76,75,74,73,72,71,70
   1535 
   1536   r0_Lo = _mm_unpacklo_epi16(r0, r1);
   1537   r2_Lo = _mm_unpacklo_epi16(r2, r3);
   1538   r4_Lo = _mm_unpacklo_epi16(r4, r5);
   1539   r6_Lo = _mm_unpacklo_epi16(r6, r7);
   1540 
   1541   r1_Lo = r0_Lo;
   1542   r0_Lo = _mm_unpacklo_epi32(r0_Lo, r2_Lo);
   1543   r1_Lo = _mm_unpackhi_epi32(r1_Lo, r2_Lo);
   1544   r5_Lo = r4_Lo;
   1545   r4_Lo = _mm_unpacklo_epi32(r4_Lo, r6_Lo);
   1546   r5_Lo = _mm_unpackhi_epi32(r5_Lo, r6_Lo);
   1547   r2_Lo = r0_Lo;
   1548   r0_Lo = _mm_unpacklo_epi64(r0_Lo, r4_Lo);  // 64
   1549   r2_Lo = _mm_unpackhi_epi64(r2_Lo, r4_Lo);
   1550   r3_Lo = r1_Lo;
   1551   r1_Lo = _mm_unpacklo_epi64(r1_Lo, r5_Lo);
   1552   r3_Lo = _mm_unpackhi_epi64(r3_Lo, r5_Lo);
   1553 
   1554   _mm_storeu_si128((__m128i *)(dst + 0 * pitchDst), r0_Lo);
   1555   _mm_storeu_si128((__m128i *)(dst + 1 * pitchDst), r2_Lo);
   1556   _mm_storeu_si128((__m128i *)(dst + 2 * pitchDst), r1_Lo);
   1557   _mm_storeu_si128((__m128i *)(dst + 3 * pitchDst), r3_Lo);
   1558 
   1559   r0 = _mm_unpackhi_epi16(r0, r1);
   1560   r2 = _mm_unpackhi_epi16(r2, r3);
   1561   r4 = _mm_unpackhi_epi16(r4, r5);
   1562   r6 = _mm_unpackhi_epi16(r6, r7);
   1563 
   1564   r1 = r0;
   1565   r0 = _mm_unpacklo_epi32(r0, r2);
   1566   r1 = _mm_unpackhi_epi32(r1, r2);
   1567   r5 = r4;
   1568   r4 = _mm_unpacklo_epi32(r4, r6);
   1569   r5 = _mm_unpackhi_epi32(r5, r6);
   1570   r2 = r0;
   1571   r0 = _mm_unpacklo_epi64(r0, r4);
   1572   r2 = _mm_unpackhi_epi64(r2, r4);
   1573   r3 = r1;
   1574   r1 = _mm_unpacklo_epi64(r1, r5);
   1575   r3 = _mm_unpackhi_epi64(r3, r5);
   1576 
   1577   _mm_storeu_si128((__m128i *)(dst + 4 * pitchDst), r0);
   1578   _mm_storeu_si128((__m128i *)(dst + 5 * pitchDst), r2);
   1579   _mm_storeu_si128((__m128i *)(dst + 6 * pitchDst), r1);
   1580   _mm_storeu_si128((__m128i *)(dst + 7 * pitchDst), r3);
   1581 }
   1582 
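         // Shuffle masks for _mm_shuffle_epi8 on 16-bit pixels: row k shifts the 8
         // loaded pixels up by k lanes and replicates the first pixel into the
         // vacated low lanes, clamping reads that start before the valid edge.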
   1583 static uint8_t HighbdLoadMaskx[8][16] = {
   1584   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   1585   { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
   1586   { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
   1587   { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
   1588   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
   1589   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
   1590   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
   1591   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
   1592 };
   1593 
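         // Shuffle masks for the upsampled above array in the 4-wide zone 2 path:
         // even 16-bit elements (the a[x] samples) are gathered into the low 64 bits
         // and odd elements (a[x+1]) into the high 64 bits, adjusted for base_shift.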
   1594 static uint8_t HighbdEvenOddMaskx4[8][16] = {
   1595   { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14,
   1596     15 },  // 0=0,1, 1=2,3, 2=4,5, 3=6,7, 4=8,9, 5=10,11, 6=12,13, 7=14,15,
   1597            // >7=0,1
   1598   { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
   1599   { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
   1600   { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 },
   1601   { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 0, 1, 0, 1 },
   1602   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 0, 1 },
   1603   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 0, 1 },
   1604   { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15 }
   1605 };
   1606 
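         // Element offsets into the upsampled above array for the 8-wide zone 2
         // path: even offsets, with entries before base_shift clamped to the first
         // valid offset.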
   1607 static uint16_t HighbdEvenOddMaskx8_2[8][16] = {
   1608   { 0, 2, 4, 6, 8, 10, 12, 14 },      { 2, 2, 4, 6, 8, 10, 12, 14 },
   1609   { 4, 4, 4, 6, 8, 10, 12, 14 },      { 6, 6, 6, 6, 8, 10, 12, 14 },
   1610   { 8, 8, 8, 8, 8, 10, 12, 14 },      { 10, 10, 10, 10, 10, 10, 12, 14 },
   1611   { 12, 12, 12, 12, 12, 12, 12, 14 }, { 14, 14, 14, 14, 14, 14, 14, 14 },
   1612 };
   1613 
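         // Blend masks: row n has its first n 16-bit lanes set, so blendv takes the
         // left-derived value for the first base_min_diff pixels of a segment and
         // the above-derived value for the rest.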
   1614 static uint16_t HighbdBaseMask[17][16] = {
    1615   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   1633   { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   1634   { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   1635   { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   1636   { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   1637   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   1638   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   1639     0 },
   1640   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
   1641     0, 0 },
   1642   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
   1643     0, 0, 0, 0 },
   1644   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
   1645     0, 0, 0, 0, 0, 0 },
   1646   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
   1647     0xffff, 0, 0, 0, 0, 0, 0 },
   1648   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
   1649     0xffff, 0xffff, 0, 0, 0, 0, 0 },
   1650   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
   1651     0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
   1652   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
   1653     0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
   1654   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
   1655     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
   1656   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
   1657     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
   1658   { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
   1659     0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
   1660 };
   1661 
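         // Zone 2, 4-wide: for each row, blend pixels interpolated from the above
         // row (x branch) with pixels interpolated from the left column (y branch),
         // using 32-bit intermediates.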
   1662 static void highbd_dr_prediction_z2_Nx4_avx2(
   1663     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   1664     const uint16_t *left, int upsample_above, int upsample_left, int dx,
   1665     int dy) {
   1666   const int min_base_x = -(1 << upsample_above);
   1667   const int min_base_y = -(1 << upsample_left);
   1668   const int frac_bits_x = 6 - upsample_above;
   1669   const int frac_bits_y = 6 - upsample_left;
   1670 
    1671   // assert(dx > 0);
   1672   // pre-filter above pixels
   1673   // store in temp buffers:
   1674   //   above[x] * 32 + 16
   1675   //   above[x+1] - above[x]
    1676   // final pixels will be calculated as:
   1677   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1678   __m256i a0_x, a1_x, a32, a16;
   1679   __m256i diff;
   1680   __m128i c3f, min_base_y128;
   1681 
   1682   a16 = _mm256_set1_epi32(16);
   1683   c3f = _mm_set1_epi32(0x3f);
   1684   min_base_y128 = _mm_set1_epi32(min_base_y);
   1685 
   1686   for (int r = 0; r < N; r++) {
   1687     __m256i b, res, shift;
   1688     __m128i resx, resy, resxy;
   1689     __m128i a0_x128, a1_x128;
   1690     int y = r + 1;
   1691     int base_x = (-y * dx) >> frac_bits_x;
   1692     int base_shift = 0;
   1693     if (base_x < (min_base_x - 1)) {
   1694       base_shift = (min_base_x - base_x - 1) >> upsample_above;
   1695     }
   1696     int base_min_diff =
   1697         (min_base_x - base_x + upsample_above) >> upsample_above;
   1698     if (base_min_diff > 4) {
   1699       base_min_diff = 4;
   1700     } else {
   1701       if (base_min_diff < 0) base_min_diff = 0;
   1702     }
   1703 
   1704     if (base_shift > 3) {
   1705       a0_x = _mm256_setzero_si256();
   1706       a1_x = _mm256_setzero_si256();
   1707       shift = _mm256_setzero_si256();
   1708     } else {
   1709       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   1710       if (upsample_above) {
   1711         a0_x128 = _mm_shuffle_epi8(a0_x128,
   1712                                    *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
   1713         a1_x128 = _mm_srli_si128(a0_x128, 8);
   1714 
   1715         shift = _mm256_castsi128_si256(_mm_srli_epi32(
   1716             _mm_and_si128(
   1717                 _mm_slli_epi32(
   1718                     _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
   1719                                    (2 << 6) - y * dx, (3 << 6) - y * dx),
   1720                     upsample_above),
   1721                 c3f),
   1722             1));
   1723       } else {
   1724         a0_x128 =
   1725             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   1726         a1_x128 = _mm_srli_si128(a0_x128, 2);
   1727 
   1728         shift = _mm256_castsi128_si256(_mm_srli_epi32(
   1729             _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
   1730                                          (2 << 6) - y * dx, (3 << 6) - y * dx),
   1731                           c3f),
   1732             1));
   1733       }
   1734       a0_x = _mm256_cvtepu16_epi32(a0_x128);
   1735       a1_x = _mm256_cvtepu16_epi32(a1_x128);
   1736     }
   1737     // y calc
   1738     __m128i a0_y, a1_y, shifty;
   1739     if (base_x < min_base_x) {
   1740       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
   1741       DECLARE_ALIGNED(32, int, base_y_c[4]);
   1742       r6 = _mm_set1_epi32(r << 6);
   1743       dy128 = _mm_set1_epi32(dy);
   1744       c1234 = _mm_setr_epi32(1, 2, 3, 4);
   1745       y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
   1746       base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
   1747       mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
   1748       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
   1749       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   1750 
   1751       a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
   1752                             left[base_y_c[2]], left[base_y_c[3]]);
   1753       a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
   1754                             left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
   1755 
   1756       if (upsample_left) {
   1757         shifty = _mm_srli_epi32(
   1758             _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
   1759       } else {
   1760         shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
   1761       }
   1762       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
   1763       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
   1764       shift = _mm256_inserti128_si256(shift, shifty, 1);
   1765     }
   1766 
   1767     diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
   1768     a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
   1769     a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   1770 
   1771     b = _mm256_mullo_epi32(diff, shift);
   1772     res = _mm256_add_epi32(a32, b);
   1773     res = _mm256_srli_epi32(res, 5);
   1774 
   1775     resx = _mm256_castsi256_si128(res);
   1776     resx = _mm_packus_epi32(resx, resx);
   1777 
   1778     resy = _mm256_extracti128_si256(res, 1);
   1779     resy = _mm_packus_epi32(resy, resy);
   1780 
   1781     resxy =
   1782         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
   1783     _mm_storel_epi64((__m128i *)(dst), resxy);
   1784     dst += stride;
   1785   }
   1786 }
   1787 
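         // Zone 2, 8-wide, 32-bit-intermediate variant; selected by the dispatcher
         // when bd == 12 (16-bit intermediates are only used for bd < 12).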
   1788 static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
   1789     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   1790     const uint16_t *left, int upsample_above, int upsample_left, int dx,
   1791     int dy) {
   1792   const int min_base_x = -(1 << upsample_above);
   1793   const int min_base_y = -(1 << upsample_left);
   1794   const int frac_bits_x = 6 - upsample_above;
   1795   const int frac_bits_y = 6 - upsample_left;
   1796 
   1797   // pre-filter above pixels
   1798   // store in temp buffers:
   1799   //   above[x] * 32 + 16
   1800   //   above[x+1] - above[x]
    1801   // final pixels will be calculated as:
   1802   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1803   __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
   1804   __m256i diff;
   1805   __m128i a0_x128, a1_x128;
   1806 
   1807   a16 = _mm256_set1_epi32(16);
   1808   c3f = _mm256_set1_epi32(0x3f);
   1809   min_base_y256 = _mm256_set1_epi32(min_base_y);
   1810 
   1811   for (int r = 0; r < N; r++) {
   1812     __m256i b, res, shift;
   1813     __m128i resx, resy, resxy;
   1814     int y = r + 1;
   1815     int base_x = (-y * dx) >> frac_bits_x;
   1816     int base_shift = 0;
   1817     if (base_x < (min_base_x - 1)) {
   1818       base_shift = (min_base_x - base_x - 1) >> upsample_above;
   1819     }
   1820     int base_min_diff =
   1821         (min_base_x - base_x + upsample_above) >> upsample_above;
   1822     if (base_min_diff > 8) {
   1823       base_min_diff = 8;
   1824     } else {
   1825       if (base_min_diff < 0) base_min_diff = 0;
   1826     }
   1827 
   1828     if (base_shift > 7) {
   1829       resx = _mm_setzero_si128();
   1830     } else {
   1831       if (upsample_above) {
   1832         a0_x128 = _mm_setr_epi16(
   1833             above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]],
   1834             above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]],
   1835             above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]],
   1836             above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]],
   1837             above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]],
   1838             above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]],
   1839             above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]],
   1840             above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]);
   1841         a1_x128 = _mm_setr_epi16(
   1842             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]],
   1843             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]],
   1844             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]],
   1845             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]],
   1846             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]],
   1847             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]],
   1848             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]],
   1849             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]);
   1850         shift = _mm256_srli_epi32(
   1851             _mm256_and_si256(
   1852                 _mm256_slli_epi32(
   1853                     _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
   1854                                       (2 << 6) - y * dx, (3 << 6) - y * dx,
   1855                                       (4 << 6) - y * dx, (5 << 6) - y * dx,
   1856                                       (6 << 6) - y * dx, (7 << 6) - y * dx),
   1857                     upsample_above),
   1858                 c3f),
   1859             1);
   1860       } else {
   1861         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   1862         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
   1863         a0_x128 =
   1864             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   1865         a1_x128 =
   1866             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   1867 
   1868         shift = _mm256_srli_epi32(
   1869             _mm256_and_si256(
   1870                 _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
   1871                                   (3 << 6) - y * dx, (4 << 6) - y * dx,
   1872                                   (5 << 6) - y * dx, (6 << 6) - y * dx,
   1873                                   (7 << 6) - y * dx),
   1874                 c3f),
   1875             1);
   1876       }
   1877 
   1878       a0_x = _mm256_cvtepu16_epi32(a0_x128);
   1879       a1_x = _mm256_cvtepu16_epi32(a1_x128);
   1880 
   1881       diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
   1882       a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
   1883       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   1884 
   1885       b = _mm256_mullo_epi32(diff, shift);
   1886       res = _mm256_add_epi32(a32, b);
   1887       res = _mm256_srli_epi32(res, 5);
   1888 
   1889       resx = _mm256_castsi256_si128(_mm256_packus_epi32(
   1890           res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
   1891     }
   1892     // y calc
   1893     if (base_x < min_base_x) {
   1894       DECLARE_ALIGNED(32, int, base_y_c[8]);
   1895       __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
   1896       r6 = _mm256_set1_epi32(r << 6);
   1897       dy256 = _mm256_set1_epi32(dy);
   1898       c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
   1899       y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
   1900       base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
   1901       mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
   1902       base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
   1903       _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
   1904 
   1905       a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
   1906           left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
   1907           left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
   1908           left[base_y_c[6]], left[base_y_c[7]]));
   1909       a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
   1910           left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
   1911           left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
   1912           left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
   1913 
   1914       if (upsample_left) {
   1915         shift = _mm256_srli_epi32(
   1916             _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
   1917             1);
   1918       } else {
   1919         shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
   1920       }
   1921       diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
   1922       a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
   1923       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   1924 
   1925       b = _mm256_mullo_epi32(diff, shift);
   1926       res = _mm256_add_epi32(a32, b);
   1927       res = _mm256_srli_epi32(res, 5);
   1928 
   1929       resy = _mm256_castsi256_si128(_mm256_packus_epi32(
   1930           res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
   1931     } else {
   1932       resy = resx;
   1933     }
   1934     resxy =
   1935         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
   1936     _mm_storeu_si128((__m128i *)(dst), resxy);
   1937     dst += stride;
   1938   }
   1939 }
   1940 
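         // Zone 2, 8-wide, 16-bit-intermediate variant; selected when bd < 12.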
   1941 static void highbd_dr_prediction_z2_Nx8_avx2(
   1942     int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   1943     const uint16_t *left, int upsample_above, int upsample_left, int dx,
   1944     int dy) {
   1945   const int min_base_x = -(1 << upsample_above);
   1946   const int min_base_y = -(1 << upsample_left);
   1947   const int frac_bits_x = 6 - upsample_above;
   1948   const int frac_bits_y = 6 - upsample_left;
   1949 
   1950   // pre-filter above pixels
   1951   // store in temp buffers:
   1952   //   above[x] * 32 + 16
   1953   //   above[x+1] - above[x]
    1954   // final pixels will be calculated as:
   1955   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1956   __m128i c3f, min_base_y128;
   1957   __m256i a0_x, a1_x, diff, a32, a16;
   1958   __m128i a0_x128, a1_x128;
   1959 
   1960   a16 = _mm256_set1_epi16(16);
   1961   c3f = _mm_set1_epi16(0x3f);
   1962   min_base_y128 = _mm_set1_epi16(min_base_y);
   1963 
   1964   for (int r = 0; r < N; r++) {
   1965     __m256i b, res, shift;
   1966     __m128i resx, resy, resxy;
   1967     int y = r + 1;
   1968     int base_x = (-y * dx) >> frac_bits_x;
   1969     int base_shift = 0;
   1970     if (base_x < (min_base_x - 1)) {
   1971       base_shift = (min_base_x - base_x - 1) >> upsample_above;
   1972     }
   1973     int base_min_diff =
   1974         (min_base_x - base_x + upsample_above) >> upsample_above;
   1975     if (base_min_diff > 8) {
   1976       base_min_diff = 8;
   1977     } else {
   1978       if (base_min_diff < 0) base_min_diff = 0;
   1979     }
   1980 
   1981     if (base_shift > 7) {
   1982       a0_x = _mm256_setzero_si256();
   1983       a1_x = _mm256_setzero_si256();
   1984       shift = _mm256_setzero_si256();
   1985     } else {
   1986       if (upsample_above) {
   1987         a0_x128 = _mm_setr_epi16(
   1988             above[base_x + HighbdEvenOddMaskx8_2[base_shift][0]],
   1989             above[base_x + HighbdEvenOddMaskx8_2[base_shift][1]],
   1990             above[base_x + HighbdEvenOddMaskx8_2[base_shift][2]],
   1991             above[base_x + HighbdEvenOddMaskx8_2[base_shift][3]],
   1992             above[base_x + HighbdEvenOddMaskx8_2[base_shift][4]],
   1993             above[base_x + HighbdEvenOddMaskx8_2[base_shift][5]],
   1994             above[base_x + HighbdEvenOddMaskx8_2[base_shift][6]],
   1995             above[base_x + HighbdEvenOddMaskx8_2[base_shift][7]]);
   1996         a1_x128 = _mm_setr_epi16(
   1997             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][0]],
   1998             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][1]],
   1999             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][2]],
   2000             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][3]],
   2001             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][4]],
   2002             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][5]],
   2003             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][6]],
   2004             above[base_x + 1 + HighbdEvenOddMaskx8_2[base_shift][7]]);
   2005         shift = _mm256_castsi128_si256(_mm_srli_epi16(
   2006             _mm_and_si128(
   2007                 _mm_slli_epi16(
   2008                     _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
   2009                                    (2 << 6) - y * dx, (3 << 6) - y * dx,
   2010                                    (4 << 6) - y * dx, (5 << 6) - y * dx,
   2011                                    (6 << 6) - y * dx, (7 << 6) - y * dx),
   2012                     upsample_above),
   2013                 c3f),
   2014             1));
   2015       } else {
   2016         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   2017         a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
   2018         a0_x128 =
   2019             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2020         a1_x128 =
   2021             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2022 
   2023         shift = _mm256_castsi128_si256(_mm_srli_epi16(
   2024             _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
   2025                                          (2 << 6) - y * dx, (3 << 6) - y * dx,
   2026                                          (4 << 6) - y * dx, (5 << 6) - y * dx,
   2027                                          (6 << 6) - y * dx, (7 << 6) - y * dx),
   2028                           c3f),
   2029             1));
   2030       }
   2031       a0_x = _mm256_castsi128_si256(a0_x128);
   2032       a1_x = _mm256_castsi128_si256(a1_x128);
   2033     }
   2034 
   2035     // y calc
   2036     __m128i a0_y, a1_y, shifty;
   2037     if (base_x < min_base_x) {
   2038       DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
   2039       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
   2040       r6 = _mm_set1_epi16(r << 6);
   2041       dy128 = _mm_set1_epi16(dy);
   2042       c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
   2043       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
   2044       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
   2045       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
   2046       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
   2047       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   2048 
   2049       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
   2050                             left[base_y_c[2]], left[base_y_c[3]],
   2051                             left[base_y_c[4]], left[base_y_c[5]],
   2052                             left[base_y_c[6]], left[base_y_c[7]]);
   2053       a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
   2054                             left[base_y_c[2] + 1], left[base_y_c[3] + 1],
   2055                             left[base_y_c[4] + 1], left[base_y_c[5] + 1],
   2056                             left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
   2057 
   2058       if (upsample_left) {
   2059         shifty = _mm_srli_epi16(
   2060             _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
   2061       } else {
   2062         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
   2063       }
   2064       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
   2065       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
   2066       shift = _mm256_inserti128_si256(shift, shifty, 1);
   2067     }
   2068 
   2069     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
   2070     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
   2071     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   2072 
   2073     b = _mm256_mullo_epi16(diff, shift);
   2074     res = _mm256_add_epi16(a32, b);
   2075     res = _mm256_srli_epi16(res, 5);
   2076 
   2077     resx = _mm256_castsi256_si128(res);
   2078     resy = _mm256_extracti128_si256(res, 1);
   2079 
   2080     resxy =
   2081         _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
   2082     _mm_storeu_si128((__m128i *)(dst), resxy);
   2083     dst += stride;
   2084   }
   2085 }
   2086 
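         // Zone 2, general HxW (W >= 16), 32-bit-intermediate variant (bd == 12);
         // each row is processed in 16-pixel segments.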
   2087 static void highbd_dr_prediction_32bit_z2_HxW_avx2(
   2088     int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   2089     const uint16_t *left, int upsample_above, int upsample_left, int dx,
   2090     int dy) {
   2091   // here upsample_above and upsample_left are 0 by design of
   2092   // av1_use_intra_edge_upsample
   2093   const int min_base_x = -1;
   2094   const int min_base_y = -1;
   2095   (void)upsample_above;
   2096   (void)upsample_left;
   2097   const int frac_bits_x = 6;
   2098   const int frac_bits_y = 6;
   2099 
   2100   // pre-filter above pixels
   2101   // store in temp buffers:
   2102   //   above[x] * 32 + 16
   2103   //   above[x+1] - above[x]
    2104   // final pixels will be calculated as:
   2105   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   2106   __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16;
   2107   __m256i diff, min_base_y256, c3f;
   2108   __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
   2109 
   2110   a16 = _mm256_set1_epi32(16);
    2111   min_base_y256 = _mm256_set1_epi32(min_base_y);
   2112   c3f = _mm256_set1_epi32(0x3f);
   2113 
   2114   for (int r = 0; r < H; r++) {
   2115     __m256i b, res, shift;
   2116     __m256i resx[2], resy[2];
   2117     __m256i resxy;
   2118     for (int j = 0; j < W; j += 16) {
   2119       int y = r + 1;
   2120       int base_x = (-y * dx) >> frac_bits_x;
   2121       int base_shift = 0;
   2122       if ((base_x + j) < (min_base_x - 1)) {
   2123         base_shift = (min_base_x - (base_x + j) - 1);
   2124       }
   2125       int base_min_diff = (min_base_x - base_x - j);
   2126       if (base_min_diff > 16) {
   2127         base_min_diff = 16;
   2128       } else {
   2129         if (base_min_diff < 0) base_min_diff = 0;
   2130       }
   2131 
   2132       if (base_shift > 7) {
   2133         resx[0] = _mm256_setzero_si256();
   2134       } else {
   2135         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
   2136         a1_x128 =
   2137             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
   2138         a0_x128 =
   2139             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2140         a1_x128 =
   2141             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2142 
   2143         a0_x = _mm256_cvtepu16_epi32(a0_x128);
   2144         a1_x = _mm256_cvtepu16_epi32(a1_x128);
   2145 
   2146         shift = _mm256_srli_epi32(
   2147             _mm256_and_si256(
   2148                 _mm256_setr_epi32(
   2149                     ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
   2150                     ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
   2151                     ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
   2152                     ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
   2153                 c3f),
   2154             1);
   2155 
   2156         diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
   2157         a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
   2158         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   2159 
   2160         b = _mm256_mullo_epi32(diff, shift);
   2161         res = _mm256_add_epi32(a32, b);
   2162         res = _mm256_srli_epi32(res, 5);
   2163 
   2164         resx[0] = _mm256_packus_epi32(
   2165             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
   2166       }
   2167       int base_shift8 = 0;
   2168       if ((base_x + j + 8) < (min_base_x - 1)) {
   2169         base_shift8 = (min_base_x - (base_x + j + 8) - 1);
   2170       }
   2171       if (base_shift8 > 7) {
   2172         resx[1] = _mm256_setzero_si256();
   2173       } else {
   2174         a0_1_x128 =
   2175             _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8 + j));
   2176         a1_1_x128 =
   2177             _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9 + j));
   2178         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
   2179                                      *(__m128i *)HighbdLoadMaskx[base_shift8]);
   2180         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
   2181                                      *(__m128i *)HighbdLoadMaskx[base_shift8]);
   2182 
   2183         a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
   2184         a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
   2185 
   2186         shift = _mm256_srli_epi32(
   2187             _mm256_and_si256(
   2188                 _mm256_setr_epi32(
   2189                     ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
   2190                     ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
   2191                     ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
   2192                     ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
   2193                 c3f),
   2194             1);
   2195 
   2196         diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
   2197         a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
   2198         a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
   2199         b = _mm256_mullo_epi32(diff, shift);
   2200 
   2201         resx[1] = _mm256_add_epi32(a32, b);
   2202         resx[1] = _mm256_srli_epi32(resx[1], 5);
   2203         resx[1] = _mm256_packus_epi32(
   2204             resx[1],
   2205             _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
   2206       }
   2207       resx[0] =
   2208           _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
   2209                                   1);  // 16 16bit values
   2210 
   2211       // y calc
   2212       if ((base_x < min_base_x)) {
   2213         DECLARE_ALIGNED(32, int, base_y_c[16]);
   2214         __m256i r6, c256, dy256, y_c256, y_c_1_256, base_y_c256, mask256;
   2215         r6 = _mm256_set1_epi32(r << 6);
   2216         dy256 = _mm256_set1_epi32(dy);
   2217         c256 = _mm256_setr_epi32(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
   2218                                  7 + j, 8 + j);
   2219         y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
   2220         base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
   2221         mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
   2222         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
   2223         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
   2224         c256 = _mm256_setr_epi32(9 + j, 10 + j, 11 + j, 12 + j, 13 + j, 14 + j,
   2225                                  15 + j, 16 + j);
   2226         y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
   2227         base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
   2228         mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
   2229         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
   2230         _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
   2231 
   2232         a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
   2233             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
   2234             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
   2235             left[base_y_c[6]], left[base_y_c[7]]));
   2236         a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
   2237             left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
   2238             left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
   2239             left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
   2240 
   2241         shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
   2242 
   2243         diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
   2244         a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
   2245         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   2246 
   2247         b = _mm256_mullo_epi32(diff, shift);
   2248         res = _mm256_add_epi32(a32, b);
   2249         res = _mm256_srli_epi32(res, 5);
   2250 
   2251         resy[0] = _mm256_packus_epi32(
   2252             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
   2253 
   2254         a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
   2255             left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
   2256             left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
   2257             left[base_y_c[14]], left[base_y_c[15]]));
   2258         a1_y = _mm256_cvtepu16_epi32(
   2259             _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
   2260                            left[base_y_c[10] + 1], left[base_y_c[11] + 1],
   2261                            left[base_y_c[12] + 1], left[base_y_c[13] + 1],
   2262                            left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
   2263         shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
   2264 
   2265         diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
   2266         a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
   2267         a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   2268 
   2269         b = _mm256_mullo_epi32(diff, shift);
   2270         res = _mm256_add_epi32(a32, b);
   2271         res = _mm256_srli_epi32(res, 5);
   2272 
   2273         resy[1] = _mm256_packus_epi32(
   2274             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
   2275 
   2276         resy[0] =
   2277             _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
   2278                                     1);  // 16 16bit values
   2279       } else {
   2280         resy[0] = resx[0];
   2281       }
   2282       resxy = _mm256_blendv_epi8(resx[0], resy[0],
   2283                                  *(__m256i *)HighbdBaseMask[base_min_diff]);
   2284       _mm256_storeu_si256((__m256i *)(dst + j), resxy);
   2285     }  // for j
   2286     dst += stride;
   2287   }
   2288 }
   2289 
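         // Zone 2, general HxW (W >= 16), 16-bit-intermediate variant (bd < 12);
         // each row is processed in 16-pixel segments.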
   2290 static void highbd_dr_prediction_z2_HxW_avx2(
   2291     int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   2292     const uint16_t *left, int upsample_above, int upsample_left, int dx,
   2293     int dy) {
   2294   // here upsample_above and upsample_left are 0 by design of
   2295   // av1_use_intra_edge_upsample
   2296   const int min_base_x = -1;
   2297   const int min_base_y = -1;
   2298   (void)upsample_above;
   2299   (void)upsample_left;
   2300   const int frac_bits_x = 6;
   2301   const int frac_bits_y = 6;
   2302 
   2303   // pre-filter above pixels
   2304   // store in temp buffers:
   2305   //   above[x] * 32 + 16
   2306   //   above[x+1] - above[x]
    2307   // final pixels will be calculated as:
   2308   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   2309   __m256i a0_x, a1_x, a32, a16, c3f;
   2310   __m256i diff, min_base_y256;
   2311 
   2312   a16 = _mm256_set1_epi16(16);
   2313   min_base_y256 = _mm256_set1_epi16(min_base_y);
   2314   c3f = _mm256_set1_epi16(0x3f);
   2315 
   2316   for (int r = 0; r < H; r++) {
   2317     __m256i b, res, shift;
   2318     __m256i resx, resy;
   2319     __m256i resxy;
   2320     __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, shiftx;
   2321 
   2322     for (int j = 0; j < W; j += 16) {
   2323       int y = r + 1;
   2324       int base_x = (-y * dx) >> frac_bits_x;
   2325       int base_shift = 0;
   2326       if ((base_x + j) < (min_base_x - 1)) {
   2327         base_shift = (min_base_x - (base_x + j) - 1);
   2328       }
   2329       int base_min_diff = (min_base_x - base_x - j);
   2330       if (base_min_diff > 16) {
   2331         base_min_diff = 16;
   2332       } else {
   2333         if (base_min_diff < 0) base_min_diff = 0;
   2334       }
   2335 
   2336       if (base_shift > 7) {
   2337         a0_x = _mm256_setzero_si256();
   2338         a1_x = _mm256_setzero_si256();
   2339         shift = _mm256_setzero_si256();
   2340       } else {
   2341         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
   2342         a1_x128 =
   2343             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
   2344         a0_x128 =
   2345             _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2346         a1_x128 =
   2347             _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2348 
   2349         a0_x = _mm256_castsi128_si256(a0_x128);
   2350         a1_x = _mm256_castsi128_si256(a1_x128);
   2351 
   2352         shift = _mm256_castsi128_si256(_mm_srli_epi16(
   2353             _mm_and_si128(_mm_setr_epi16(
   2354                               ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
   2355                               ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
   2356                               ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
   2357                               ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
   2358                           _mm256_castsi256_si128(c3f)),
   2359             1));
   2360       }
   2361 
   2362       base_shift = 0;
   2363       if ((base_x + j + 8) < (min_base_x - 1)) {
   2364         base_shift = (min_base_x - (base_x + j + 8) - 1);
   2365       }
   2366       if (base_shift <= 7) {
   2367         a0_1_x128 =
   2368             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
   2369         a1_1_x128 =
   2370             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
   2371         a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
   2372                                      *(__m128i *)HighbdLoadMaskx[base_shift]);
   2373         a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
   2374                                      *(__m128i *)HighbdLoadMaskx[base_shift]);
   2375 
   2376         shiftx = _mm_srli_epi16(
   2377             _mm_and_si128(
   2378                 _mm_setr_epi16(
   2379                     ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
   2380                     ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
   2381                     ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
   2382                     ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
   2383                 _mm256_castsi256_si128(c3f)),
   2384             1);
   2385 
   2386         a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
   2387         a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
   2388         shift = _mm256_inserti128_si256(shift, shiftx, 1);
   2389       }
   2390 
   2391       diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
   2392       a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
   2393       a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   2394 
   2395       b = _mm256_mullo_epi16(diff, shift);
   2396       res = _mm256_add_epi16(a32, b);
   2397       resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
   2398 
   2399       // y calc
   2400       __m256i a0_y, a1_y, shifty;
   2401       if ((base_x < min_base_x)) {
   2402         DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
   2403         __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16;
   2404         r6 = _mm256_set1_epi16(r << 6);
   2405         dy256 = _mm256_set1_epi16(dy);
   2406         c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
   2407                                  7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j,
   2408                                  13 + j, 14 + j, 15 + j, 16 + j);
   2409         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
   2410                                  _mm256_srli_epi16(min_base_y256, 1));
   2411         y_c256 = _mm256_sub_epi16(r6, mul16);
   2412         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
   2413         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
   2414         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
   2415         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
   2416 
   2417         a0_y = _mm256_setr_epi16(
   2418             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
   2419             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
   2420             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
   2421             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
   2422             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
   2423             left[base_y_c[15]]);
   2424         a1_y = _mm256_setr_epi16(
   2425             left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
   2426             left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
   2427             left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1],
   2428             left[base_y_c[9] + 1], left[base_y_c[10] + 1],
   2429             left[base_y_c[11] + 1], left[base_y_c[12] + 1],
   2430             left[base_y_c[13] + 1], left[base_y_c[14] + 1],
   2431             left[base_y_c[15] + 1]);
   2432 
   2433         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
   2434 
   2435         diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
   2436         a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
   2437         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   2438 
   2439         b = _mm256_mullo_epi16(diff, shifty);
   2440         res = _mm256_add_epi16(a32, b);
   2441         resy = _mm256_srli_epi16(res, 5);
   2442       } else {
   2443         resy = _mm256_setzero_si256();
   2444       }
   2445 
   2446       resxy = _mm256_blendv_epi8(resx, resy,
   2447                                  *(__m256i *)HighbdBaseMask[base_min_diff]);
   2448       _mm256_storeu_si256((__m256i *)(dst + j), resxy);
   2449     }  // for j
   2450     dst += stride;
   2451   }
   2452 }
   2453 
   2454 // Directional prediction, zone 2: 90 < angle < 180
   2455 void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
   2456                                       int bh, const uint16_t *above,
   2457                                       const uint16_t *left, int upsample_above,
   2458                                       int upsample_left, int dx, int dy,
   2459                                       int bd) {
   2460   (void)bd;
   2461   assert(dx > 0);
   2462   assert(dy > 0);
   2463   switch (bw) {
   2464     case 4:
   2465       highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
   2466                                        upsample_above, upsample_left, dx, dy);
   2467       break;
   2468     case 8:
   2469       if (bd < 12) {
   2470         highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
   2471                                          upsample_above, upsample_left, dx, dy);
   2472       } else {
   2473         highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
   2474                                                upsample_above, upsample_left,
   2475                                                dx, dy);
   2476       }
   2477       break;
   2478     default:
   2479       if (bd < 12) {
   2480         highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
   2481                                          upsample_above, upsample_left, dx, dy);
   2482       } else {
   2483         highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
   2484                                                upsample_above, upsample_left,
   2485                                                dx, dy);
   2486       }
   2487       break;
   2488   }
   2489 }
   2490 
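         // Transposes a width x height block of uint16_t in 8x8 tiles; both
         // dimensions are assumed to be multiples of 8, which holds for all callers.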
   2491 static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
   2492                              uint16_t *dst, ptrdiff_t pitchDst, int width,
   2493                              int height) {
   2494   for (int j = 0; j < height; j += 8)
   2495     for (int i = 0; i < width; i += 8)
   2496       highbd_transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc,
   2497                               dst + j * pitchDst + i, pitchDst);
   2498 }
   2499 
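         // Zone 3 blocks are computed by running the zone-1 predictor along the
         // left edge and transposing the result into the destination.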
   2500 static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
   2501                                              const uint16_t *left,
   2502                                              int upsample_left, int dy) {
   2503   __m128i dstvec[4], d[4];
   2504 
   2505   highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy);
   2506   highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
   2507                                    &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
   2508   _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
   2509   _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
   2510   _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
   2511   _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
   2513 }
   2514 
   2515 static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
   2516                                              const uint16_t *left,
   2517                                              int upsample_left, int dy) {
   2518   __m128i dstvec[8], d[8];
   2519 
   2520   highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy);
   2521   highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
   2522                            &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
   2523                            &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
   2524                            &d[7]);
   2525   for (int i = 0; i < 8; i++) {
   2526     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   2527   }
   2528 }
   2529 
   2530 static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
   2531                                              const uint16_t *left,
   2532                                              int upsample_left, int dy) {
   2533   __m128i dstvec[4], d[8];
   2534 
   2535   highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy);
   2536   highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
   2537                                &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
   2538                                &d[7]);
   2539   for (int i = 0; i < 8; i++) {
   2540     _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
   2541   }
   2542 }
   2543 
   2544 static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
   2545                                              const uint16_t *left,
   2546                                              int upsample_left, int dy) {
   2547   __m128i dstvec[8], d[4];
   2548 
   2549   highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy);
   2550   highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
   2551                                &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
   2552                                &d[0], &d[1], &d[2], &d[3]);
   2553   _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
   2554   _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
   2555   _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
   2556   _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
   2557 }
   2558 
   2559 static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
   2560                                               const uint16_t *left,
   2561                                               int upsample_left, int dy) {
   2562   __m256i dstvec[8], d[8];
   2563 
   2564   highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
   2565                                              dy);
   2566   highbd_transpose8x16_16x8_avx2(dstvec, d);
   2567   for (int i = 0; i < 8; i++) {
   2568     _mm_storeu_si128((__m128i *)(dst + i * stride),
   2569                      _mm256_castsi256_si128(d[i]));
   2570   }
   2571   for (int i = 8; i < 16; i++) {
   2572     _mm_storeu_si128((__m128i *)(dst + i * stride),
   2573                      _mm256_extracti128_si256(d[i - 8], 1));
   2574   }
   2575 }
   2576 
   2577 static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
   2578                                               const uint16_t *left,
   2579                                               int upsample_left, int dy) {
   2580   __m128i dstvec[16], d[16];
   2581 
   2582   highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
   2583                                             dy);
   2584   for (int i = 0; i < 16; i += 8) {
   2585     highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
   2586                              &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
   2587                              &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
   2588                              &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
   2589                              &d[5 + i], &d[6 + i], &d[7 + i]);
   2590   }
   2591   for (int i = 0; i < 8; i++) {
   2592     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   2593     _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
   2594   }
   2595 }
   2596 
   2597 static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
   2598                                               const uint16_t *left,
   2599                                               int upsample_left, int dy) {
   2600   __m256i dstvec[4], d[4], d1;
   2601 
   2602   highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
   2603                                              dy);
   2604   highbd_transpose4x16_avx2(dstvec, d);
   2605   for (int i = 0; i < 4; i++) {
   2606     _mm_storel_epi64((__m128i *)(dst + i * stride),
   2607                      _mm256_castsi256_si128(d[i]));
   2608     d1 = _mm256_bsrli_epi128(d[i], 8);
   2609     _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
   2610                      _mm256_castsi256_si128(d1));
   2611     _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
   2612                      _mm256_extracti128_si256(d[i], 1));
   2613     _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
   2614                      _mm256_extracti128_si256(d1, 1));
   2615   }
   2616 }
   2617 
   2618 static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
   2619                                               const uint16_t *left,
   2620                                               int upsample_left, int dy) {
   2621   __m128i dstvec[16], d[8];
   2622 
   2623   highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
   2624                                             dy);
   2625   highbd_transpose16x4_8x8_sse2(dstvec, d);
   2626 
   2627   _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
   2628   _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
   2629   _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
   2630   _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
   2631   _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
   2632   _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
   2633   _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
   2634   _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
   2635 }
   2636 
   2637 static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
   2638                                               const uint16_t *left,
   2639                                               int upsample_left, int dy) {
   2640   __m256i dstvec[16], d[16];
   2641 
   2642   highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
   2643                                              dy);
   2644   for (int i = 0; i < 16; i += 8) {
   2645     highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
   2646   }
   2647 
   2648   for (int i = 0; i < 8; i++) {
   2649     _mm_storeu_si128((__m128i *)(dst + i * stride),
   2650                      _mm256_castsi256_si128(d[i]));
   2651   }
   2652   for (int i = 0; i < 8; i++) {
   2653     _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
   2654                      _mm256_extracti128_si256(d[i], 1));
   2655   }
   2656   for (int i = 8; i < 16; i++) {
   2657     _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
   2658                      _mm256_castsi256_si128(d[i]));
   2659   }
   2660   for (int i = 8; i < 16; i++) {
   2661     _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
   2662                      _mm256_extracti128_si256(d[i], 1));
   2663   }
   2664 }
   2665 
   2666 static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
   2667                                               const uint16_t *left,
   2668                                               int upsample_left, int dy) {
   2669   __m128i dstvec[32], d[32];
   2670 
   2671   highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
   2672                                             dy);
   2673   for (int i = 0; i < 32; i += 8) {
   2674     highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
   2675                              &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
   2676                              &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
   2677                              &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
   2678                              &d[5 + i], &d[6 + i], &d[7 + i]);
   2679   }
   2680   for (int i = 0; i < 8; i++) {
   2681     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   2682     _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
   2683     _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
   2684     _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
   2685   }
   2686 }
   2687 
   2688 static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
   2689                                                const uint16_t *left,
   2690                                                int upsample_left, int dy) {
   2691   __m256i dstvec[16], d[16];
   2692 
   2693   highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
   2694                                              dy);
   2695   highbd_transpose16x16_avx2(dstvec, d);
   2696 
   2697   for (int i = 0; i < 16; i++) {
   2698     _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
   2699   }
   2700 }
   2701 
   2702 static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
   2703                                                const uint16_t *left,
   2704                                                int upsample_left, int dy) {
   2705   __m256i dstvec[64], d[16];
   2706 
   2707   highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
   2708                                              dy);
   2709 
   2710   highbd_transpose16x16_avx2(dstvec, d);
   2711   for (int j = 0; j < 16; j++) {
   2712     _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
   2713   }
   2714   highbd_transpose16x16_avx2(dstvec + 16, d);
   2715   for (int j = 0; j < 16; j++) {
   2716     _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
   2717   }
   2718   highbd_transpose16x16_avx2(dstvec + 32, d);
   2719   for (int j = 0; j < 16; j++) {
   2720     _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
   2721   }
   2722   highbd_transpose16x16_avx2(dstvec + 48, d);
   2723   for (int j = 0; j < 16; j++) {
   2724     _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
   2725   }
   2726 }
   2727 
   2728 static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
   2729                                                const uint16_t *left,
   2730                                                int upsample_left, int dy) {
   2731   DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
   2732   highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
   2733   highbd_transpose(dstT, 64, dst, stride, 64, 64);
   2734 }
   2735 
   2736 static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
   2737                                                const uint16_t *left,
   2738                                                int upsample_left, int dy) {
   2739   __m256i dstvec[32], d[32];
   2740 
   2741   highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
   2742                                              dy);
   2743   for (int i = 0; i < 32; i += 8) {
   2744     highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
   2745   }
   2746   // store
   2747   for (int j = 0; j < 32; j += 16) {
   2748     for (int i = 0; i < 8; i++) {
   2749       _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
   2750                        _mm256_castsi256_si128(d[(i + j)]));
   2751     }
   2752     for (int i = 0; i < 8; i++) {
   2753       _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
   2754                        _mm256_castsi256_si128(d[(i + j) + 8]));
   2755     }
   2756     for (int i = 8; i < 16; i++) {
   2757       _mm256_storeu_si256(
   2758           (__m256i *)(dst + (i + j) * stride),
   2759           _mm256_inserti128_si256(
   2760               d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
   2761     }
   2762   }
   2763 }
   2764 
   2765 static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
   2766                                                const uint16_t *left,
   2767                                                int upsample_left, int dy) {
   2768   __m256i dstvec[32], d[16];
   2769 
   2770   highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
   2771                                              dy);
   2772   for (int i = 0; i < 32; i += 16) {
   2773     highbd_transpose16x16_avx2((dstvec + i), d);
   2774     for (int j = 0; j < 16; j++) {
   2775       _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
   2776     }
   2777   }
   2778 }
   2779 
   2780 static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
   2781                                                const uint16_t *left,
   2782                                                int upsample_left, int dy) {
   2783   uint16_t dstT[64 * 32];
   2784   highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
   2785   highbd_transpose(dstT, 64, dst, stride, 32, 64);
   2786 }
   2787 
   2788 static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
   2789                                                const uint16_t *left,
   2790                                                int upsample_left, int dy) {
   2791   DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
   2792   highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
   2793   highbd_transpose(dstT, 32, dst, stride, 64, 32);
   2795 }
   2796 
   2797 static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
   2798                                                const uint16_t *left,
   2799                                                int upsample_left, int dy) {
   2800   DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
   2801   highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
   2802   highbd_transpose(dstT, 64, dst, stride, 16, 64);
   2803 }
   2804 
   2805 static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
   2806                                                const uint16_t *left,
   2807                                                int upsample_left, int dy) {
   2808   __m256i dstvec[64], d[16];
   2809 
   2810   highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
   2811                                              dy);
   2812   for (int i = 0; i < 64; i += 16) {
   2813     highbd_transpose16x16_avx2((dstvec + i), d);
   2814     for (int j = 0; j < 16; j++) {
   2815       _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
   2816     }
   2817   }
   2818 }
   2819 
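         // Directional prediction, zone 3: 180 < angle < 270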
   2820 void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
   2821                                       int bh, const uint16_t *above,
   2822                                       const uint16_t *left, int upsample_left,
   2823                                       int dx, int dy, int bd) {
   2824   (void)above;
   2825   (void)dx;
   2826   (void)bd;
   2827   assert(dx == 1);
   2828   assert(dy > 0);
   2829   if (bw == bh) {
   2830     switch (bw) {
   2831       case 4:
   2832         highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
   2833         break;
   2834       case 8:
   2835         highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
   2836         break;
   2837       case 16:
   2838         highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left,
   2839                                            dy);
   2840         break;
   2841       case 32:
   2842         highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left,
   2843                                            dy);
   2844         break;
   2845       case 64:
   2846         highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left,
   2847                                            dy);
   2848         break;
   2849     }
   2850   } else {
   2851     if (bw < bh) {
   2852       if (bw + bw == bh) {
   2853         switch (bw) {
   2854           case 4:
   2855             highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
   2856                                              dy);
   2857             break;
   2858           case 8:
   2859             highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
   2860                                               dy);
   2861             break;
   2862           case 16:
   2863             highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
   2864                                                dy);
   2865             break;
   2866           case 32:
   2867             highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
   2868                                                dy);
   2869             break;
   2870         }
   2871       } else {
   2872         switch (bw) {
   2873           case 4:
   2874             highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
   2875                                               dy);
   2876             break;
   2877           case 8:
   2878             highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
   2879                                               dy);
   2880             break;
   2881           case 16:
   2882             highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
   2883                                                dy);
   2884             break;
   2885         }
   2886       }
   2887     } else {
   2888       if (bh + bh == bw) {
   2889         switch (bh) {
   2890           case 4:
   2891             highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
   2892                                              dy);
   2893             break;
   2894           case 8:
   2895             highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
   2896                                               dy);
   2897             break;
   2898           case 16:
   2899             highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
   2900                                                dy);
   2901             break;
   2902           case 32:
   2903             highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
   2904                                                dy);
   2905             break;
   2906         }
   2907       } else {
   2908         switch (bh) {
   2909           case 4:
   2910             highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
   2911                                               dy);
   2912             break;
   2913           case 8:
   2914             highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
   2915                                               dy);
   2916             break;
   2917           case 16:
   2918             highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
   2919                                                dy);
   2920             break;
   2921         }
   2922       }
   2923     }
   2924   }
   2926 }
   2927 
   2928 // Low bit depth functions
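         // BaseMask[i] has its first i bytes set to 0xff and the rest to 0; used as
         // a blendv mask, it selects the first i lanes from one source and the
         // remaining lanes from the other.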
   2929 static uint8_t BaseMask[33][32] = {
   2930   { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2931     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   2932   { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2933     0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   2934   { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2935     0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   2936   { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2937     0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   2938   { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2939     0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   2940   { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2941     0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   2942   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2943     0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   2944   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2945     0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
   2946   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
   2947     0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
   2948   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
   2949     0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
   2950   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
   2951     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
   2952     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2953   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2954     0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
   2955     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2956   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2957     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
   2958     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2959   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2960     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
   2961     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2962   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2963     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
   2964     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2965   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2966     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
   2967     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2968   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2969     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
   2970     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2971   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2972     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
   2973     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2974   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2975     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
   2976     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2977   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2978     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
   2979     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2980   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2981     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
   2982     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2983   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2984     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
   2985     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2986   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2987     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2988     0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2989   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2990     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2991     0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
   2992   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2993     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2994     0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
   2995   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2996     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2997     0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
   2998   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   2999     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3000     0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
   3001   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3002     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3003     0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
   3004   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3005     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3006     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
   3007   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3008     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3009     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
   3010   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3011     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3012     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
   3013   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3014     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3015     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
   3016   { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3017     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3018     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
   3019 };
   3020 
   3021 static AOM_FORCE_INLINE void dr_prediction_z1_4xN_internal_avx2(
   3022     int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) {
   3023   const int frac_bits = 6 - upsample_above;
   3024   const int max_base_x = ((N + 4) - 1) << upsample_above;
   3025   int x;
    3026   // assert(dx > 0);
   3027   // pre-filter above pixels
   3028   // store in temp buffers:
   3029   //   above[x] * 32 + 16
   3030   //   above[x+1] - above[x]
    3031   // final pixels will be calculated as:
   3032   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
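           // Here shift is the fractional position in 1/32 units ((x & 0x3f) >> 1
           // when there is no upsampling). For example, with hypothetical samples
           // above[x] = 100, above[x+1] = 108 and shift = 16 (half a pixel):
           //   (100 * 32 + 16 + (108 - 100) * 16) >> 5 = 3344 >> 5 = 104,
           // the rounded midpoint of the two samples.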
   3033   __m256i a0, a1, a32, a16;
   3034   __m256i diff, c3f;
   3035   __m128i a_mbase_x;
   3036 
   3037   a16 = _mm256_set1_epi16(16);
   3038   a_mbase_x = _mm_set1_epi8(above[max_base_x]);
   3039   c3f = _mm256_set1_epi16(0x3f);
   3040 
   3041   x = dx;
   3042   for (int r = 0; r < N; r++) {
   3043     __m256i b, res, shift;
   3044     __m128i res1, a0_128, a1_128;
   3045 
   3046     int base = x >> frac_bits;
   3047     int base_max_diff = (max_base_x - base) >> upsample_above;
   3048     if (base_max_diff <= 0) {
   3049       for (int i = r; i < N; ++i) {
   3050         dst[i] = a_mbase_x;  // save 4 values
   3051       }
   3052       return;
   3053     }
   3054     if (base_max_diff > 4) base_max_diff = 4;
   3055     a0_128 = _mm_loadu_si128((__m128i *)(above + base));
   3056     a1_128 = _mm_srli_si128(a0_128, 1);
   3057 
   3058     if (upsample_above) {
   3059       a0_128 = _mm_shuffle_epi8(
   3060           a0_128,
   3061           _mm_setr_epi8(0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15));
   3062       a1_128 = _mm_srli_si128(a0_128, 4);
   3063 
   3064       shift = _mm256_srli_epi16(
   3065           _mm256_and_si256(
   3066               _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
   3067           1);
   3068     } else {
   3069       shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
   3070     }
   3071     a0 = _mm256_cvtepu8_epi16(a0_128);
   3072     a1 = _mm256_cvtepu8_epi16(a1_128);
   3073 
   3074     diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
   3075     a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
   3076     a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
   3077 
   3078     b = _mm256_mullo_epi16(diff, shift);
   3079     res = _mm256_add_epi16(a32, b);
   3080     res = _mm256_srli_epi16(res, 5);
   3081 
   3082     res1 = _mm256_castsi256_si128(res);
   3083     res1 = _mm_packus_epi16(res1, res1);
   3084 
   3085     dst[r] =
   3086         _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
   3087     x += dx;
   3088   }
   3089 }
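
         // A scalar sketch (illustrative only, not part of the build) of what
         // dr_prediction_z1_4xN_internal_avx2 computes when upsample_above == 0;
         // dst4 stands for the 4-pixel rows packed into dstvec:
         //   for (int r = 0, x = dx; r < N; ++r, x += dx) {
         //     const int base = x >> frac_bits;
         //     const int shift = (x & 0x3f) >> 1;
         //     for (int c = 0; c < 4; ++c) {
         //       dst4[r][c] = (base + c >= max_base_x)
         //                        ? above[max_base_x]
         //                        : (above[base + c] * 32 + 16 +
         //                           (above[base + c + 1] - above[base + c]) * shift) >> 5;
         //     }
         //   }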
   3090 
   3091 static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3092                                       const uint8_t *above, int upsample_above,
   3093                                       int dx) {
   3094   __m128i dstvec[16];
   3095 
   3096   dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, dx);
   3097   for (int i = 0; i < N; i++) {
   3098     *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
   3099   }
   3100 }
   3101 
   3102 static AOM_FORCE_INLINE void dr_prediction_z1_8xN_internal_avx2(
   3103     int N, __m128i *dst, const uint8_t *above, int upsample_above, int dx) {
   3104   const int frac_bits = 6 - upsample_above;
   3105   const int max_base_x = ((8 + N) - 1) << upsample_above;
   3106 
   3107   int x;
   3108   // pre-filter above pixels
   3109   // store in temp buffers:
   3110   //   above[x] * 32 + 16
   3111   //   above[x+1] - above[x]
    3112   // final pixels will be calculated as:
   3113   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   3114   __m256i a0, a1, a0_1, a1_1, a32, a16, diff, c3f;
   3115   __m128i a_mbase_x;
   3116 
   3117   a16 = _mm256_set1_epi32(16);
   3118   a_mbase_x = _mm_set1_epi8(above[max_base_x]);
   3119   c3f = _mm256_set1_epi32(0x3f);
   3120 
   3121   x = dx;
   3122   for (int r = 0; r < N; r++) {
   3123     __m256i b, res, res1, shift;
   3124     __m128i res128;
   3125 
   3126     int base = x >> frac_bits;
   3127     int base_max_diff = (max_base_x - base) >> upsample_above;
   3128     if (base_max_diff <= 0) {
   3129       for (int i = r; i < N; ++i) {
    3130         dst[i] = a_mbase_x;  // save 16 values, 8 to be used further
   3131       }
   3132       return;
   3133     }
   3134     if (base_max_diff > 8) base_max_diff = 8;
   3135 
   3136     a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
   3137     a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
   3138 
   3139     if (upsample_above) {
   3140       a0 = _mm256_permutevar8x32_epi32(
   3141           a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
   3142       a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
   3143 
   3144       a0_1 =
   3145           _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
   3146       a0_1 = _mm256_permutevar8x32_epi32(
   3147           a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
   3148       a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
   3149 
   3150       a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
   3151       a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
   3152 
   3153       shift = _mm256_srli_epi32(
   3154           _mm256_and_si256(
   3155               _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), c3f),
   3156           1);
   3157     } else {
   3158       shift = _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
   3159     }
   3160 
   3161     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   3162     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   3163     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   3164 
   3165     b = _mm256_mullo_epi32(diff, shift);
   3166     res = _mm256_add_epi32(a32, b);
   3167     res = _mm256_srli_epi32(res, 5);
   3168 
   3169     res1 = _mm256_packus_epi32(
   3170         res, _mm256_castsi128_si256(
   3171                  _mm256_extracti128_si256(res, 1)));  // goto 16 bit
   3172 
   3173     res128 = _mm_packus_epi16(_mm256_castsi256_si128(res1),
   3174                               _mm256_castsi256_si128(res1));  // goto 8 bit
   3175 
   3176     res128 =
   3177         _mm_blendv_epi8(a_mbase_x, res128, *(__m128i *)BaseMask[base_max_diff]);
   3178     dst[r] = res128;
   3179     x += dx;
   3180   }
   3181 }
   3182 
   3183 static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3184                                       const uint8_t *above, int upsample_above,
   3185                                       int dx) {
   3186   __m128i dstvec[32];
   3187 
   3188   dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, dx);
   3189   for (int i = 0; i < N; i++) {
   3190     _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
   3191   }
   3192 }
   3193 
   3194 static AOM_FORCE_INLINE void dr_prediction_z1_16xN_internal_avx2(
   3195     int N, __m128i *dstvec, const uint8_t *above, int upsample_above, int dx) {
   3196   int x;
   3197   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   3198   (void)upsample_above;
   3199   const int frac_bits = 6;
   3200   const int max_base_x = ((16 + N) - 1);
   3201 
   3202   // pre-filter above pixels
   3203   // store in temp buffers:
   3204   //   above[x] * 32 + 16
   3205   //   above[x+1] - above[x]
    3206   // final pixels will be calculated as:
   3207   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   3208   __m256i a0, a0_1, a1, a1_1, diff, a32, a16, c3f;
   3209   __m128i a_mbase_x;
   3210 
   3211   a16 = _mm256_set1_epi32(16);
   3212   a_mbase_x = _mm_set1_epi8((uint8_t)above[max_base_x]);
   3213   c3f = _mm256_set1_epi32(0x3f);
   3214 
   3215   x = dx;
   3216   for (int r = 0; r < N; r++) {
   3217     __m256i b, res[2];
   3218     __m128i res128[2];
   3219     int base = x >> frac_bits;
   3220     int base_max_diff = (max_base_x - base);
   3221     if (base_max_diff <= 0) {
   3222       for (int i = r; i < N; ++i) {
   3223         dstvec[i] = a_mbase_x;  // save 16 values
   3224       }
   3225       return;
   3226     }
   3227     __m256i shift =
   3228         _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
   3229 
   3230     a0 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base)));
   3231     a1 = _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
   3232 
   3233     diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   3234     a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   3235     a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   3236     b = _mm256_mullo_epi32(diff, shift);
   3237 
   3238     res[0] = _mm256_add_epi32(a32, b);
   3239     res[0] = _mm256_srli_epi32(res[0], 5);
   3240     res[0] = _mm256_packus_epi32(
   3241         res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
   3242     res128[0] = _mm_packus_epi16(_mm256_castsi256_si128(res[0]),
   3243                                  _mm256_castsi256_si128(res[0]));  // goto 8 bit
   3244 
   3245     if (base_max_diff > 8) {
   3246       if (base_max_diff > 16) base_max_diff = 16;
   3247       a0_1 =
   3248           _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
   3249       a1_1 =
   3250           _mm256_cvtepu8_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
   3251 
   3252       diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
   3253       a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
   3254       a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   3255       b = _mm256_mullo_epi32(diff, shift);
   3256 
   3257       res[1] = _mm256_add_epi32(a32, b);
   3258       res[1] = _mm256_srli_epi32(res[1], 5);
   3259       res[1] = _mm256_packus_epi32(
   3260           res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
   3261       res128[1] =
   3262           _mm_packus_epi16(_mm256_castsi256_si128(res[1]),
   3263                            _mm256_castsi256_si128(res[1]));  // goto 8 bit
   3264 
   3265     } else {
   3266       res128[1] = a_mbase_x;
   3267     }
   3268     res128[0] = _mm_unpacklo_epi64(res128[0], res128[1]);  // 16 8bit values
   3269 
   3270     dstvec[r] = _mm_blendv_epi8(a_mbase_x, res128[0],
   3271                                 *(__m128i *)BaseMask[base_max_diff]);
   3272     x += dx;
   3273   }
   3274 }
   3275 static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3276                                        const uint8_t *above, int upsample_above,
   3277                                        int dx) {
   3278   __m128i dstvec[64];
   3279 
   3280   dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, dx);
   3281   for (int i = 0; i < N; i++) {
   3282     _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
   3283   }
   3284 }
   3285 
   3286 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
   3287     int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
   3288   int x;
   3289   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   3290   (void)upsample_above;
   3291   const int frac_bits = 6;
   3292   const int max_base_x = ((32 + N) - 1);
   3293 
   3294   // pre-filter above pixels
   3295   // store in temp buffers:
   3296   //   above[x] * 32 + 16
   3297   //   above[x+1] - above[x]
    3298   // final pixels will be calculated as:
   3299   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   3300   __m256i a0, a0_1, a1, a1_1, a32, a16;
   3301   __m256i a_mbase_x, diff, c3f;
   3302 
   3303   a16 = _mm256_set1_epi32(16);
   3304   a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
   3305   c3f = _mm256_set1_epi32(0x3f);
   3306 
   3307   x = dx;
   3308   for (int r = 0; r < N; r++) {
   3309     __m256i b, res[2], res16[2];
   3310 
   3311     int base = x >> frac_bits;
   3312     int base_max_diff = (max_base_x - base);
   3313     if (base_max_diff <= 0) {
   3314       for (int i = r; i < N; ++i) {
   3315         dstvec[i] = a_mbase_x;  // save 32 values
   3316       }
   3317       return;
   3318     }
   3319     if (base_max_diff > 32) base_max_diff = 32;
   3320     __m256i shift =
   3321         _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
   3322 
   3323     for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
   3324       int mdiff = base_max_diff - j;
   3325       if (mdiff <= 0) {
   3326         res16[jj] = a_mbase_x;
   3327       } else {
   3328         a0 = _mm256_cvtepu8_epi32(
   3329             _mm_loadu_si128((__m128i *)(above + base + j)));
   3330         a1 = _mm256_cvtepu8_epi32(
   3331             _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
   3332 
   3333         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   3334         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   3335         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   3336         b = _mm256_mullo_epi32(diff, shift);
   3337 
   3338         res[0] = _mm256_add_epi32(a32, b);
   3339         res[0] = _mm256_srli_epi32(res[0], 5);
   3340         res[0] = _mm256_packus_epi32(
   3341             res[0],
   3342             _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
   3343 
   3344         // goto 8 bit
   3345         res[0] = _mm256_packus_epi16(res[0], res[0]);
   3346 
   3347         if (mdiff > 8) {
   3348           a0_1 = _mm256_cvtepu8_epi32(
   3349               _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
   3350           a1_1 = _mm256_cvtepu8_epi32(
   3351               _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
   3352 
   3353           diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
   3354           a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
   3355           a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   3356           b = _mm256_mullo_epi32(diff, shift);
   3357 
   3358           res[1] = _mm256_add_epi32(a32, b);
   3359           res[1] = _mm256_srli_epi32(res[1], 5);
   3360           res[1] = _mm256_packus_epi32(
   3361               res[1],
   3362               _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
   3363           res[1] = _mm256_packus_epi16(res[1], res[1]);
   3364           // goto 8 bit
   3365         } else {
   3366           res[1] = a_mbase_x;
   3367         }
   3368         res16[jj] = _mm256_unpacklo_epi64(res[0], res[1]);  // 16 8bit values
   3369       }
   3370     }
   3371     res16[1] =
   3372         _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
   3373                                 1);  // 32 8bit values
   3374 
   3375     dstvec[r] = _mm256_blendv_epi8(
   3376         a_mbase_x, res16[1],
   3377         *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
   3378     x += dx;
   3379   }
   3380 }
   3381 
   3382 static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3383                                        const uint8_t *above, int upsample_above,
   3384                                        int dx) {
   3385   __m256i dstvec[64];
   3386   dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
   3387   for (int i = 0; i < N; i++) {
   3388     _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
   3389   }
   3390 }
   3391 
   3392 static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3393                                        const uint8_t *above, int upsample_above,
   3394                                        int dx) {
   3395   int x;
   3396   // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   3397   (void)upsample_above;
   3398   const int frac_bits = 6;
   3399   const int max_base_x = ((64 + N) - 1);
   3400 
   3401   // pre-filter above pixels
   3402   // store in temp buffers:
   3403   //   above[x] * 32 + 16
   3404   //   above[x+1] - above[x]
    3405   // final pixels will be calculated as:
   3406   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   3407   __m256i a0, a0_1, a1, a1_1, a32, a16;
   3408   __m256i a_mbase_x, diff, c3f;
   3409   __m128i max_base_x128, base_inc128, mask128;
   3410 
   3411   a16 = _mm256_set1_epi32(16);
   3412   a_mbase_x = _mm256_set1_epi8(above[max_base_x]);
   3413   max_base_x128 = _mm_set1_epi8(max_base_x);
   3414   c3f = _mm256_set1_epi32(0x3f);
   3415 
   3416   x = dx;
   3417   for (int r = 0; r < N; r++, dst += stride) {
   3418     __m256i b, res[2];
   3419     __m128i res1;
   3420 
   3421     int base = x >> frac_bits;
   3422     if (base >= max_base_x) {
   3423       for (int i = r; i < N; ++i) {
   3424         _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
   3425         _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
   3426         dst += stride;
   3427       }
   3428       return;
   3429     }
   3430 
   3431     __m256i shift =
   3432         _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
   3433 
   3434     __m128i a0_128, a0_1_128, a1_128, a1_1_128;
   3435     for (int j = 0; j < 64; j += 16) {
   3436       int mdif = max_base_x - (base + j);
   3437       if (mdif <= 0) {
   3438         _mm_storeu_si128((__m128i *)(dst + j),
   3439                          _mm256_castsi256_si128(a_mbase_x));
   3440       } else {
   3441         a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
   3442         a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
   3443         a0 = _mm256_cvtepu8_epi32(a0_128);
   3444         a1 = _mm256_cvtepu8_epi32(a1_128);
   3445 
   3446         diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   3447         a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   3448         a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   3449         b = _mm256_mullo_epi32(diff, shift);
   3450 
   3451         res[0] = _mm256_add_epi32(a32, b);
   3452         res[0] = _mm256_srli_epi32(res[0], 5);
   3453         res[0] = _mm256_packus_epi32(
   3454             res[0],
   3455             _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
   3456         // goto 8 bit
   3457         res[0] = _mm256_packus_epi16(res[0], res[0]);
   3458 
   3459         if (mdif > 8) {
   3460           a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
   3461           a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
   3462           a0_1 = _mm256_cvtepu8_epi32(a0_1_128);
   3463           a1_1 = _mm256_cvtepu8_epi32(a1_1_128);
   3464 
   3465           diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
   3466           a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
   3467           a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   3468           b = _mm256_mullo_epi32(diff, shift);
   3469 
   3470           res[1] = _mm256_add_epi32(a32, b);
   3471           res[1] = _mm256_srli_epi32(res[1], 5);
   3472           res[1] = _mm256_packus_epi32(
   3473               res[1],
   3474               _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
   3475           res[1] = _mm256_packus_epi16(res[1], res[1]);
   3476 
   3477         } else {
   3478           res[1] = a_mbase_x;
   3479         }
   3480         res1 = _mm_unpacklo_epi64(
   3481             _mm256_castsi256_si128(res[0]),
   3482             _mm256_castsi256_si128(res[1]));  // 16 8bit values
   3483 
   3484         base_inc128 = _mm_setr_epi8(
   3485             base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
   3486             base + j + 5, base + j + 6, base + j + 7, base + j + 8,
   3487             base + j + 9, base + j + 10, base + j + 11, base + j + 12,
   3488             base + j + 13, base + j + 14, base + j + 15);
   3489 
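                 // Keep computed pixels only in lanes whose source index is still
                 // below max_base_x; the rest fall back to the replicated a_mbase_x.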
   3490         mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
   3491                                  _mm_setzero_si128());
   3492         res1 =
   3493             _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x), res1, mask128);
   3494         _mm_storeu_si128((__m128i *)(dst + j), res1);
   3495       }
   3496     }
   3497     x += dx;
   3498   }
   3499 }
   3500 
   3501 // Directional prediction, zone 1: 0 < angle < 90
   3502 void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   3503                                const uint8_t *above, const uint8_t *left,
   3504                                int upsample_above, int dx, int dy) {
   3505   (void)left;
   3506   (void)dy;
   3507   switch (bw) {
   3508     case 4:
   3509       dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
   3510       break;
   3511     case 8:
   3512       dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
   3513       break;
   3514     case 16:
   3515       dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
   3516       break;
   3517     case 32:
   3518       dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
   3519       break;
   3520     case 64:
   3521       dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
   3522       break;
   3523     default: break;
   3524   }
   3526 }
   3527 
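         // LoadMaskx[s]: pshufb control that moves the 16 loaded bytes up by s
         // lanes (the low s lanes become copies of byte 0), realigning a load that
         // had to start s bytes past base_x; those low lanes are later replaced by
         // left-edge pixels, so their contents do not matter.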
   3528 static uint8_t LoadMaskx[8][16] = {
   3529   { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
   3530   { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
   3531   { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
   3532   { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
   3533   { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
   3534   { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
   3535   { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
   3536   { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
   3537 };
   3538 
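         // EvenOddMaskx4[s] / EvenOddMaskx[s]: pshufb controls that, after skipping
         // the first s bytes, gather the even-indexed bytes followed by the
         // odd-indexed ones, de-interleaving an upsampled edge into the a[x] and
         // a[x+1] operands.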
   3539 static uint8_t EvenOddMaskx4[8][16] = {
   3540   { 0, 2, 4, 6, 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0 },
   3541   { 0, 1, 3, 5, 7, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0 },
   3542   { 0, 0, 2, 4, 6, 8, 3, 5, 7, 9, 0, 0, 0, 0, 0, 0 },
   3543   { 0, 0, 0, 3, 5, 7, 9, 4, 6, 8, 10, 0, 0, 0, 0, 0 },
   3544   { 0, 0, 0, 0, 4, 6, 8, 10, 5, 7, 9, 11, 0, 0, 0, 0 },
   3545   { 0, 0, 0, 0, 0, 5, 7, 9, 11, 6, 8, 10, 12, 0, 0, 0 },
   3546   { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 7, 9, 11, 13, 0, 0 },
   3547   { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 8, 10, 12, 14, 0 }
   3548 };
   3549 
   3550 static uint8_t EvenOddMaskx[8][16] = {
   3551   { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 0, 0, 0, 0 },
   3552   { 0, 1, 3, 5, 7, 9, 11, 13, 15, 2, 4, 6, 8, 0, 0, 0 },
   3553   { 0, 0, 2, 4, 6, 8, 10, 12, 14, 3, 5, 7, 9, 0, 0, 0 },
    3554   { 0, 0, 0, 3, 5, 7, 9, 11, 13, 15, 4, 6, 8, 10, 0, 0 },
   3555   { 0, 0, 0, 0, 4, 6, 8, 10, 12, 14, 5, 7, 9, 11, 0, 0 },
   3556   { 0, 0, 0, 0, 0, 5, 7, 9, 11, 13, 15, 6, 8, 10, 12, 0 },
   3557   { 0, 0, 0, 0, 0, 0, 6, 8, 10, 12, 14, 7, 9, 11, 13, 0 },
   3558   { 0, 0, 0, 0, 0, 0, 0, 7, 9, 11, 13, 15, 8, 10, 12, 14 }
   3559 };
   3560 
   3561 static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3562                                       const uint8_t *above, const uint8_t *left,
   3563                                       int upsample_above, int upsample_left,
   3564                                       int dx, int dy) {
   3565   const int min_base_x = -(1 << upsample_above);
   3566   const int min_base_y = -(1 << upsample_left);
   3567   const int frac_bits_x = 6 - upsample_above;
   3568   const int frac_bits_y = 6 - upsample_left;
   3569 
    3570   // assert(dx > 0);
   3571   // pre-filter above pixels
   3572   // store in temp buffers:
   3573   //   above[x] * 32 + 16
   3574   //   above[x+1] - above[x]
    3575   // final pixels will be calculated as:
   3576   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
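           // For each row, the first base_min_diff pixels project past the left end
           // of above[] and are therefore interpolated from left[] (the "y" path);
           // the remaining pixels come from above[] (the "x" path). The two results
           // are merged with BaseMask[base_min_diff].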
   3577   __m256i a0_x, a1_x, a32, a16, diff;
   3578   __m128i c3f, min_base_y128;
   3579 
   3580   a16 = _mm256_set1_epi32(16);
   3581   c3f = _mm_set1_epi32(0x3f);
   3582   min_base_y128 = _mm_set1_epi32(min_base_y);
   3583 
   3584   for (int r = 0; r < N; r++) {
   3585     __m256i b, res, shift;
   3586     __m128i resx, resy, resxy;
   3587     __m128i a0_x128, a1_x128;
   3588     int y = r + 1;
   3589     int base_x = (-y * dx) >> frac_bits_x;
   3590     int base_shift = 0;
   3591     if (base_x < (min_base_x - 1)) {
   3592       base_shift = (min_base_x - base_x - 1) >> upsample_above;
   3593     }
   3594     int base_min_diff =
   3595         (min_base_x - base_x + upsample_above) >> upsample_above;
   3596     if (base_min_diff > 4) {
   3597       base_min_diff = 4;
   3598     } else {
   3599       if (base_min_diff < 0) base_min_diff = 0;
   3600     }
   3601 
   3602     if (base_shift > 3) {
   3603       a0_x = _mm256_setzero_si256();
   3604       a1_x = _mm256_setzero_si256();
   3605       shift = _mm256_setzero_si256();
   3606     } else {
   3607       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   3608       if (upsample_above) {
   3609         a0_x128 =
   3610             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx4[base_shift]);
   3611         a1_x128 = _mm_srli_si128(a0_x128, 4);
   3612 
   3613         shift = _mm256_castsi128_si256(_mm_srli_epi32(
   3614             _mm_and_si128(
   3615                 _mm_slli_epi32(
   3616                     _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
   3617                                    (2 << 6) - y * dx, (3 << 6) - y * dx),
   3618                     upsample_above),
   3619                 c3f),
   3620             1));
   3621       } else {
   3622         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
   3623         a1_x128 = _mm_srli_si128(a0_x128, 1);
   3624 
   3625         shift = _mm256_castsi128_si256(_mm_srli_epi32(
   3626             _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
   3627                                          (2 << 6) - y * dx, (3 << 6) - y * dx),
   3628                           c3f),
   3629             1));
   3630       }
   3631       a0_x = _mm256_cvtepu8_epi32(a0_x128);
   3632       a1_x = _mm256_cvtepu8_epi32(a1_x128);
   3633     }
    3634     // y calc: left-edge path for the pixels that project left of above[]
   3635     __m128i a0_y, a1_y, shifty;
   3636     if (base_x < min_base_x) {
   3637       DECLARE_ALIGNED(32, int, base_y_c[4]);
   3638       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
   3639       r6 = _mm_set1_epi32(r << 6);
   3640       dy128 = _mm_set1_epi32(dy);
   3641       c1234 = _mm_setr_epi32(1, 2, 3, 4);
   3642       y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
   3643       base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
   3644       mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
   3645       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
   3646       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   3647 
   3648       a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
   3649                             left[base_y_c[2]], left[base_y_c[3]]);
   3650       a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
   3651                             left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
   3652 
   3653       if (upsample_left) {
   3654         shifty = _mm_srli_epi32(
   3655             _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
   3656       } else {
   3657         shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
   3658       }
   3659       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
   3660       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
   3661       shift = _mm256_inserti128_si256(shift, shifty, 1);
   3662     }
   3663 
   3664     diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
   3665     a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
   3666     a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   3667 
   3668     b = _mm256_mullo_epi32(diff, shift);
   3669     res = _mm256_add_epi32(a32, b);
   3670     res = _mm256_srli_epi32(res, 5);
   3671 
   3672     resx = _mm256_castsi256_si128(res);
   3673     resx = _mm_packus_epi32(resx, resx);
   3674     resx = _mm_packus_epi16(resx, resx);
   3675 
   3676     resy = _mm256_extracti128_si256(res, 1);
   3677     resy = _mm_packus_epi32(resy, resy);
   3678     resy = _mm_packus_epi16(resy, resy);
   3679 
   3680     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
   3681     *(uint32_t *)(dst) = _mm_cvtsi128_si32(resxy);
   3682     dst += stride;
   3683   }
   3684 }
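
// Roughly what the kernel above computes per output pixel, rendered as plain
// scalar C (an illustrative sketch, not the library's reference code, and not
// part of the build; assumes upsample_above == upsample_left == 0, so
// frac_bits_x == frac_bits_y == 6 and min_base_x == min_base_y == -1):
//
//   for (int r = 0; r < N; ++r) {
//     for (int c = 0; c < 4; ++c) {
//       const int y = r + 1;
//       const int x = (c << 6) - y * dx;
//       int base, shift, p0, p1;
//       if ((x >> 6) >= -1) {  // sample from the above row
//         base = x >> 6;
//         shift = (x & 0x3f) >> 1;
//         p0 = above[base]; p1 = above[base + 1];
//       } else {  // sample from the left column
//         const int yy = (r << 6) - (c + 1) * dy;
//         base = yy >> 6;
//         shift = (yy & 0x3f) >> 1;
//         p0 = left[base]; p1 = left[base + 1];
//       }
//       dst[r * stride + c] = (p0 * 32 + 16 + (p1 - p0) * shift) >> 5;
//     }
//   }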
   3685 
   3686 static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3687                                       const uint8_t *above, const uint8_t *left,
   3688                                       int upsample_above, int upsample_left,
   3689                                       int dx, int dy) {
   3690   const int min_base_x = -(1 << upsample_above);
   3691   const int min_base_y = -(1 << upsample_left);
   3692   const int frac_bits_x = 6 - upsample_above;
   3693   const int frac_bits_y = 6 - upsample_left;
   3694 
   3695   // pre-filter above pixels
   3696   // store in temp buffers:
   3697   //   above[x] * 32 + 16
   3698   //   above[x+1] - above[x]
   3699   // final pixels will be calculated as:
   3700   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
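  // For example, with illustrative values above[x] = 100, above[x+1] = 108
  // and shift = 16 (i.e. half-way between the two samples):
  //   (100 * 32 + 16 + (108 - 100) * 16) >> 5 = 3344 >> 5 = 104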
   3701   __m256i diff, a32, a16;
   3702   __m256i a0_x, a1_x;
   3703   __m128i a0_x128, a1_x128, min_base_y128, c3f;
   3704 
   3705   a16 = _mm256_set1_epi16(16);
   3706   c3f = _mm_set1_epi16(0x3f);
   3707   min_base_y128 = _mm_set1_epi16(min_base_y);
   3708 
   3709   for (int r = 0; r < N; r++) {
   3710     __m256i b, res, shift;
   3711     __m128i resx, resy, resxy;
   3712 
   3713     int y = r + 1;
   3714     int base_x = (-y * dx) >> frac_bits_x;
   3715     int base_shift = 0;
   3716     if (base_x < (min_base_x - 1)) {
   3717       base_shift = (min_base_x - base_x - 1) >> upsample_above;
   3718     }
   3719     int base_min_diff =
   3720         (min_base_x - base_x + upsample_above) >> upsample_above;
   3721     if (base_min_diff > 8) {
   3722       base_min_diff = 8;
   3723     } else {
   3724       if (base_min_diff < 0) base_min_diff = 0;
   3725     }
   3726 
   3727     if (base_shift > 7) {
   3728       a0_x = _mm256_setzero_si256();
   3729       a1_x = _mm256_setzero_si256();
   3730       shift = _mm256_setzero_si256();
   3731     } else {
   3732       a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   3733       a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
   3734       if (upsample_above) {
   3735         a0_x128 =
   3736             _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
   3737         a1_x128 =
   3738             _mm_shuffle_epi8(a1_x128, *(__m128i *)EvenOddMaskx[base_shift]);
   3739 
   3740         shift = _mm256_castsi128_si256(_mm_srli_epi16(
   3741             _mm_and_si128(
   3742                 _mm_slli_epi16(
   3743                     _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
   3744                                    (2 << 6) - y * dx, (3 << 6) - y * dx,
   3745                                    (4 << 6) - y * dx, (5 << 6) - y * dx,
   3746                                    (6 << 6) - y * dx, (7 << 6) - y * dx),
   3747                     upsample_above),
   3748                 c3f),
   3749             1));
   3750       } else {
   3751         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
   3752         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
   3753 
   3754         shift = _mm256_castsi128_si256(_mm_srli_epi16(
   3755             _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
   3756                                          (2 << 6) - y * dx, (3 << 6) - y * dx,
   3757                                          (4 << 6) - y * dx, (5 << 6) - y * dx,
   3758                                          (6 << 6) - y * dx, (7 << 6) - y * dx),
   3759                           c3f),
   3760             1));
   3761       }
   3762       a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
   3763       a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
   3764     }
   3765 
   3766     // y calc: left-edge samples for columns left of min_base_x
   3767     __m128i a0_y, a1_y, shifty;
   3768     if (base_x < min_base_x) {
   3769       DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
   3770       __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
   3771       r6 = _mm_set1_epi16(r << 6);
   3772       dy128 = _mm_set1_epi16(dy);
   3773       c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
   3774       y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
   3775       base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
   3776       mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
   3777       base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
   3778       _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   3779 
   3780       a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
   3781                             left[base_y_c[2]], left[base_y_c[3]],
   3782                             left[base_y_c[4]], left[base_y_c[5]],
   3783                             left[base_y_c[6]], left[base_y_c[7]]);
   3784       a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
   3785                             left[base_y_c[2] + 1], left[base_y_c[3] + 1],
   3786                             left[base_y_c[4] + 1], left[base_y_c[5] + 1],
   3787                             left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
   3788 
   3789       if (upsample_left) {
   3790         shifty = _mm_srli_epi16(
   3791             _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
   3792       } else {
   3793         shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
   3794       }
   3795 
   3796       a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
   3797       a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
   3798       shift = _mm256_inserti128_si256(shift, shifty, 1);
   3799     }
   3800 
   3801     diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
   3802     a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
   3803     a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   3804 
   3805     b = _mm256_mullo_epi16(diff, shift);
   3806     res = _mm256_add_epi16(a32, b);
   3807     res = _mm256_srli_epi16(res, 5);
   3808 
   3809     resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
   3810                             _mm256_castsi256_si128(res));
   3811     resy = _mm256_extracti128_si256(res, 1);
   3812     resy = _mm_packus_epi16(resy, resy);
   3813 
   3814     resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
   3815     _mm_storel_epi64((__m128i *)(dst), resxy);
   3816     dst += stride;
   3817   }
   3818 }
   3819 
   3820 static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
   3821                                       ptrdiff_t stride, const uint8_t *above,
   3822                                       const uint8_t *left, int upsample_above,
   3823                                       int upsample_left, int dx, int dy) {
   3824   // here upsample_above and upsample_left are 0 by design of
   3825   // av1_use_intra_edge_upsample
   3826   const int min_base_x = -1;
   3827   const int min_base_y = -1;
   3828   (void)upsample_above;
   3829   (void)upsample_left;
   3830   const int frac_bits_x = 6;
   3831   const int frac_bits_y = 6;
   3832 
   3833   // pre-filter above pixels
   3834   // store in temp buffers:
   3835   //   above[x] * 32 + 16
   3836   //   above[x+1] - above[x]
   3837   // final pixels will be calculated as:
   3838   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
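  // Each row is produced 16 columns at a time: resx holds the above-edge
  // result, resy the left-edge result, and BaseMask[base_min_diff] (assumed
  // here to set the first base_min_diff bytes) picks resy for the leading
  // columns that fall to the left of min_base_x.  With illustrative values
  // dx = 128 and r = 2 (so y = 3): base_x = (-3 * 128) >> 6 = -6 and
  // base_min_diff = -1 - (-6) = 5, so the first 5 columns of that row are
  // taken from the left edge.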
   3839   __m256i a0_x, a1_x, a0_y, a1_y, a32, a16;
   3840   __m256i diff, min_base_y256, c3f, shifty;
   3841   __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128, a0_1_x, a1_1_x, shiftx;
   3842 
   3843   a16 = _mm256_set1_epi16(16);
   3844   min_base_y256 = _mm256_set1_epi16(min_base_y);
   3845   c3f = _mm256_set1_epi16(0x3f);
   3846 
   3847   for (int r = 0; r < H; r++) {
   3848     __m256i b, res, shift;
   3849     __m128i resx, resy;
   3850     __m128i resxy;
   3851     for (int j = 0; j < W; j += 16) {
   3852       int y = r + 1;
   3853       int base_x = (-y * dx) >> frac_bits_x;
   3854 
   3855       int base_shift = 0;
   3856       if ((base_x + j) < (min_base_x - 1)) {
   3857         base_shift = (min_base_x - (base_x + j) - 1);
   3858       }
   3859       int base_min_diff = (min_base_x - base_x - j);
   3860       if (base_min_diff > 16) {
   3861         base_min_diff = 16;
   3862       } else {
   3863         if (base_min_diff < 0) base_min_diff = 0;
   3864       }
   3865       if (base_shift > 7) {
   3866         a0_x = _mm256_setzero_si256();
   3867         a1_x = _mm256_setzero_si256();
   3868         shift = _mm256_setzero_si256();
   3869       } else {
   3870         a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
   3871         a1_x128 =
   3872             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
   3873         a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
   3874         a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
   3875 
   3876         a0_x = _mm256_cvtepu8_epi16(a0_x128);
   3877         a1_x = _mm256_cvtepu8_epi16(a1_x128);
   3878 
   3879         shift = _mm256_castsi128_si256(_mm_srli_epi16(
   3880             _mm_and_si128(_mm_setr_epi16(
   3881                               ((0 + j) << 6) - y * dx, ((1 + j) << 6) - y * dx,
   3882                               ((2 + j) << 6) - y * dx, ((3 + j) << 6) - y * dx,
   3883                               ((4 + j) << 6) - y * dx, ((5 + j) << 6) - y * dx,
   3884                               ((6 + j) << 6) - y * dx, ((7 + j) << 6) - y * dx),
   3885                           _mm256_castsi256_si128(c3f)),
   3886             1));
   3887       }
   3888 
   3889       base_shift = 0;
   3890       if ((base_x + j + 8) < (min_base_x - 1)) {
   3891         base_shift = (min_base_x - (base_x + j + 8) - 1);
   3892       }
   3893       if (base_shift <= 7) {
   3894         a0_1_x128 =
   3895             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 8 + j));
   3896         a1_1_x128 =
   3897             _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 9 + j));
   3898         a0_1_x128 =
   3899             _mm_shuffle_epi8(a0_1_x128, *(__m128i *)LoadMaskx[base_shift]);
   3900         a1_1_x128 =
   3901             _mm_shuffle_epi8(a1_1_x128, *(__m128i *)LoadMaskx[base_shift]);
   3902 
   3903         a0_1_x = _mm_cvtepu8_epi16(a0_1_x128);
   3904         a1_1_x = _mm_cvtepu8_epi16(a1_1_x128);
   3905 
   3906         shiftx = _mm_srli_epi16(
   3907             _mm_and_si128(
   3908                 _mm_setr_epi16(
   3909                     ((8 + j) << 6) - y * dx, ((9 + j) << 6) - y * dx,
   3910                     ((10 + j) << 6) - y * dx, ((11 + j) << 6) - y * dx,
   3911                     ((12 + j) << 6) - y * dx, ((13 + j) << 6) - y * dx,
   3912                     ((14 + j) << 6) - y * dx, ((15 + j) << 6) - y * dx),
   3913                 _mm256_castsi256_si128(c3f)),
   3914             1);
   3915 
   3916         a0_x = _mm256_inserti128_si256(a0_x, a0_1_x, 1);
   3917         a1_x = _mm256_inserti128_si256(a1_x, a1_1_x, 1);
   3918         shift = _mm256_inserti128_si256(shift, shiftx, 1);
   3919       }
   3920 
   3921       diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
   3922       a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
   3923       a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   3924 
   3925       b = _mm256_mullo_epi16(diff, shift);
   3926       res = _mm256_add_epi16(a32, b);
   3927       res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
   3928       resx = _mm256_castsi256_si128(_mm256_packus_epi16(
   3929           res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
   3930 
   3931       // y calc: left-edge samples for columns left of min_base_x
   3932       if (base_x < min_base_x) {
   3933         DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
   3934         __m256i r6, c256, dy256, y_c256, base_y_c256, mask256, mul16;
   3935         r6 = _mm256_set1_epi16(r << 6);
   3936         dy256 = _mm256_set1_epi16(dy);
   3937         c256 = _mm256_setr_epi16(1 + j, 2 + j, 3 + j, 4 + j, 5 + j, 6 + j,
   3938                                  7 + j, 8 + j, 9 + j, 10 + j, 11 + j, 12 + j,
   3939                                  13 + j, 14 + j, 15 + j, 16 + j);
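        // min_base_y256 is all ones, so a logical >> 1 gives 0x7fff per
        // lane; clamping c * dy to that keeps the product from looking
        // negative in the signed 16-bit subtraction below.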
   3940         mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
   3941                                  _mm256_srli_epi16(min_base_y256, 1));
   3942         y_c256 = _mm256_sub_epi16(r6, mul16);
   3943 
   3944         base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
   3945         mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
   3946         base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
   3947         _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
   3948 
   3949         a0_y = _mm256_setr_epi16(
   3950             left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
   3951             left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
   3952             left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
   3953             left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
   3954             left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
   3955             left[base_y_c[15]]);
   3956         a1_y = _mm256_setr_epi16(
   3957             left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
   3958             left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
   3959             left[base_y_c[6] + 1], left[base_y_c[7] + 1], left[base_y_c[8] + 1],
   3960             left[base_y_c[9] + 1], left[base_y_c[10] + 1],
   3961             left[base_y_c[11] + 1], left[base_y_c[12] + 1],
   3962             left[base_y_c[13] + 1], left[base_y_c[14] + 1],
   3963             left[base_y_c[15] + 1]);
   3964 
   3965         shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
   3966 
   3967         diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
   3968         a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
   3969         a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   3970 
   3971         b = _mm256_mullo_epi16(diff, shifty);
   3972         res = _mm256_add_epi16(a32, b);
   3973         res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
   3974         resy = _mm256_castsi256_si128(_mm256_packus_epi16(
   3975             res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
   3976 
   3977       } else {
   3978         resy = _mm_setzero_si128();
   3979       }
   3980       resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
   3981       _mm_storeu_si128((__m128i *)(dst + j), resxy);
   3982     }  // for j
   3983     dst += stride;
   3984   }
   3985 }
   3986 
   3987 // Directional prediction, zone 2: 90 < angle < 180
   3988 void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   3989                                const uint8_t *above, const uint8_t *left,
   3990                                int upsample_above, int upsample_left, int dx,
   3991                                int dy) {
   3992   assert(dx > 0);
   3993   assert(dy > 0);
   3994   switch (bw) {
   3995     case 4:
   3996       dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
   3997                                 upsample_left, dx, dy);
   3998       break;
   3999     case 8:
   4000       dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
   4001                                 upsample_left, dx, dy);
   4002       break;
   4003     default:
   4004       dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
   4005                                 upsample_above, upsample_left, dx, dy);
   4006       break;
   4007   }
   4009 }
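
// The dispatch above keys on the block width only: bw == 4 and bw == 8 use
// the fixed-width Nx4/Nx8 kernels with N = bh rows, and all other sizes go
// through the general HxW kernel.  A hypothetical call for an 8x32 block
// (values chosen purely for illustration):
//   av1_dr_prediction_z2_avx2(dst, stride, 8, 32, above, left,
//                             0 /*upsample_above*/, 0 /*upsample_left*/,
//                             dx, dy);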
   4010 
   4011 // z3 functions
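// Zone 3 (180 < angle < 270) predicts from the left edge only.  The kernels
// below reuse the zone-1 (above-edge) predictors: the prediction is generated
// in transposed form (either in row registers or in a temporary buffer) and
// then transposed into dst, e.g. in dr_prediction_z3_64x64_avx2 below:
//   dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
//   transpose(dstT, 64, dst, stride, 64, 64);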
   4012 static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
   4013   __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
   4014   w0 = _mm_unpacklo_epi8(x[0], x[1]);
   4015   w1 = _mm_unpacklo_epi8(x[2], x[3]);
   4016   w2 = _mm_unpackhi_epi8(x[0], x[1]);
   4017   w3 = _mm_unpackhi_epi8(x[2], x[3]);
   4018 
   4019   ww0 = _mm_unpacklo_epi16(w0, w1);
   4020   ww1 = _mm_unpacklo_epi16(w2, w3);
   4021   ww2 = _mm_unpackhi_epi16(w0, w1);
   4022   ww3 = _mm_unpackhi_epi16(w2, w3);
   4023 
   4024   w0 = _mm_unpacklo_epi32(ww0, ww1);
   4025   w2 = _mm_unpacklo_epi32(ww2, ww3);
   4026   w1 = _mm_unpackhi_epi32(ww0, ww1);
   4027   w3 = _mm_unpackhi_epi32(ww2, ww3);
   4028 
   4029   d[0] = _mm_unpacklo_epi64(w0, w2);
   4030   d[1] = _mm_unpackhi_epi64(w0, w2);
   4031   d[2] = _mm_unpacklo_epi64(w1, w3);
   4032   d[3] = _mm_unpackhi_epi64(w1, w3);
   4033 
   4034   d[4] = _mm_srli_si128(d[0], 8);
   4035   d[5] = _mm_srli_si128(d[1], 8);
   4036   d[6] = _mm_srli_si128(d[2], 8);
   4037   d[7] = _mm_srli_si128(d[3], 8);
   4038 
   4039   d[8] = _mm_srli_si128(d[0], 4);
   4040   d[9] = _mm_srli_si128(d[1], 4);
   4041   d[10] = _mm_srli_si128(d[2], 4);
   4042   d[11] = _mm_srli_si128(d[3], 4);
   4043 
   4044   d[12] = _mm_srli_si128(d[0], 12);
   4045   d[13] = _mm_srli_si128(d[1], 12);
   4046   d[14] = _mm_srli_si128(d[2], 12);
   4047   d[15] = _mm_srli_si128(d[3], 12);
   4048 }
   4049 
   4050 static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
   4051   __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
   4052   __m256i w10, w11, w12, w13, w14, w15;
   4053 
   4054   w0 = _mm256_unpacklo_epi8(x[0], x[1]);
   4055   w1 = _mm256_unpacklo_epi8(x[2], x[3]);
   4056   w2 = _mm256_unpacklo_epi8(x[4], x[5]);
   4057   w3 = _mm256_unpacklo_epi8(x[6], x[7]);
   4058 
   4059   w8 = _mm256_unpacklo_epi8(x[8], x[9]);
   4060   w9 = _mm256_unpacklo_epi8(x[10], x[11]);
   4061   w10 = _mm256_unpacklo_epi8(x[12], x[13]);
   4062   w11 = _mm256_unpacklo_epi8(x[14], x[15]);
   4063 
   4064   w4 = _mm256_unpacklo_epi16(w0, w1);
   4065   w5 = _mm256_unpacklo_epi16(w2, w3);
   4066   w12 = _mm256_unpacklo_epi16(w8, w9);
   4067   w13 = _mm256_unpacklo_epi16(w10, w11);
   4068 
   4069   w6 = _mm256_unpacklo_epi32(w4, w5);
   4070   w7 = _mm256_unpackhi_epi32(w4, w5);
   4071   w14 = _mm256_unpacklo_epi32(w12, w13);
   4072   w15 = _mm256_unpackhi_epi32(w12, w13);
   4073 
   4074   // Store first 4-line result
   4075   d[0] = _mm256_unpacklo_epi64(w6, w14);
   4076   d[1] = _mm256_unpackhi_epi64(w6, w14);
   4077   d[2] = _mm256_unpacklo_epi64(w7, w15);
   4078   d[3] = _mm256_unpackhi_epi64(w7, w15);
   4079 
   4080   w4 = _mm256_unpackhi_epi16(w0, w1);
   4081   w5 = _mm256_unpackhi_epi16(w2, w3);
   4082   w12 = _mm256_unpackhi_epi16(w8, w9);
   4083   w13 = _mm256_unpackhi_epi16(w10, w11);
   4084 
   4085   w6 = _mm256_unpacklo_epi32(w4, w5);
   4086   w7 = _mm256_unpackhi_epi32(w4, w5);
   4087   w14 = _mm256_unpacklo_epi32(w12, w13);
   4088   w15 = _mm256_unpackhi_epi32(w12, w13);
   4089 
   4090   // Store second 4-line result
   4091   d[4] = _mm256_unpacklo_epi64(w6, w14);
   4092   d[5] = _mm256_unpackhi_epi64(w6, w14);
   4093   d[6] = _mm256_unpacklo_epi64(w7, w15);
   4094   d[7] = _mm256_unpackhi_epi64(w7, w15);
   4095 
   4096   // upper half
   4097   w0 = _mm256_unpackhi_epi8(x[0], x[1]);
   4098   w1 = _mm256_unpackhi_epi8(x[2], x[3]);
   4099   w2 = _mm256_unpackhi_epi8(x[4], x[5]);
   4100   w3 = _mm256_unpackhi_epi8(x[6], x[7]);
   4101 
   4102   w8 = _mm256_unpackhi_epi8(x[8], x[9]);
   4103   w9 = _mm256_unpackhi_epi8(x[10], x[11]);
   4104   w10 = _mm256_unpackhi_epi8(x[12], x[13]);
   4105   w11 = _mm256_unpackhi_epi8(x[14], x[15]);
   4106 
   4107   w4 = _mm256_unpacklo_epi16(w0, w1);
   4108   w5 = _mm256_unpacklo_epi16(w2, w3);
   4109   w12 = _mm256_unpacklo_epi16(w8, w9);
   4110   w13 = _mm256_unpacklo_epi16(w10, w11);
   4111 
   4112   w6 = _mm256_unpacklo_epi32(w4, w5);
   4113   w7 = _mm256_unpackhi_epi32(w4, w5);
   4114   w14 = _mm256_unpacklo_epi32(w12, w13);
   4115   w15 = _mm256_unpackhi_epi32(w12, w13);
   4116 
   4117   // Store first 4-line result
   4118   d[8] = _mm256_unpacklo_epi64(w6, w14);
   4119   d[9] = _mm256_unpackhi_epi64(w6, w14);
   4120   d[10] = _mm256_unpacklo_epi64(w7, w15);
   4121   d[11] = _mm256_unpackhi_epi64(w7, w15);
   4122 
   4123   w4 = _mm256_unpackhi_epi16(w0, w1);
   4124   w5 = _mm256_unpackhi_epi16(w2, w3);
   4125   w12 = _mm256_unpackhi_epi16(w8, w9);
   4126   w13 = _mm256_unpackhi_epi16(w10, w11);
   4127 
   4128   w6 = _mm256_unpacklo_epi32(w4, w5);
   4129   w7 = _mm256_unpackhi_epi32(w4, w5);
   4130   w14 = _mm256_unpacklo_epi32(w12, w13);
   4131   w15 = _mm256_unpackhi_epi32(w12, w13);
   4132 
   4133   // Store second 4-line result
   4134   d[12] = _mm256_unpacklo_epi64(w6, w14);
   4135   d[13] = _mm256_unpackhi_epi64(w6, w14);
   4136   d[14] = _mm256_unpacklo_epi64(w7, w15);
   4137   d[15] = _mm256_unpackhi_epi64(w7, w15);
   4138 }
   4139 
   4140 static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
   4141   __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
   4142   __m128i w10, w11, w12, w13, w14, w15;
   4143 
   4144   w0 = _mm_unpacklo_epi8(x[0], x[1]);
   4145   w1 = _mm_unpacklo_epi8(x[2], x[3]);
   4146   w2 = _mm_unpacklo_epi8(x[4], x[5]);
   4147   w3 = _mm_unpacklo_epi8(x[6], x[7]);
   4148 
   4149   w8 = _mm_unpacklo_epi8(x[8], x[9]);
   4150   w9 = _mm_unpacklo_epi8(x[10], x[11]);
   4151   w10 = _mm_unpacklo_epi8(x[12], x[13]);
   4152   w11 = _mm_unpacklo_epi8(x[14], x[15]);
   4153 
   4154   w4 = _mm_unpacklo_epi16(w0, w1);
   4155   w5 = _mm_unpacklo_epi16(w2, w3);
   4156   w12 = _mm_unpacklo_epi16(w8, w9);
   4157   w13 = _mm_unpacklo_epi16(w10, w11);
   4158 
   4159   w6 = _mm_unpacklo_epi32(w4, w5);
   4160   w7 = _mm_unpackhi_epi32(w4, w5);
   4161   w14 = _mm_unpacklo_epi32(w12, w13);
   4162   w15 = _mm_unpackhi_epi32(w12, w13);
   4163 
   4164   // Store first 4-line result
   4165   d[0] = _mm_unpacklo_epi64(w6, w14);
   4166   d[1] = _mm_unpackhi_epi64(w6, w14);
   4167   d[2] = _mm_unpacklo_epi64(w7, w15);
   4168   d[3] = _mm_unpackhi_epi64(w7, w15);
   4169 
   4170   w4 = _mm_unpackhi_epi16(w0, w1);
   4171   w5 = _mm_unpackhi_epi16(w2, w3);
   4172   w12 = _mm_unpackhi_epi16(w8, w9);
   4173   w13 = _mm_unpackhi_epi16(w10, w11);
   4174 
   4175   w6 = _mm_unpacklo_epi32(w4, w5);
   4176   w7 = _mm_unpackhi_epi32(w4, w5);
   4177   w14 = _mm_unpacklo_epi32(w12, w13);
   4178   w15 = _mm_unpackhi_epi32(w12, w13);
   4179 
   4180   // Store second 4-line result
   4181   d[4] = _mm_unpacklo_epi64(w6, w14);
   4182   d[5] = _mm_unpackhi_epi64(w6, w14);
   4183   d[6] = _mm_unpacklo_epi64(w7, w15);
   4184   d[7] = _mm_unpackhi_epi64(w7, w15);
   4185 
   4186   // upper half
   4187   w0 = _mm_unpackhi_epi8(x[0], x[1]);
   4188   w1 = _mm_unpackhi_epi8(x[2], x[3]);
   4189   w2 = _mm_unpackhi_epi8(x[4], x[5]);
   4190   w3 = _mm_unpackhi_epi8(x[6], x[7]);
   4191 
   4192   w8 = _mm_unpackhi_epi8(x[8], x[9]);
   4193   w9 = _mm_unpackhi_epi8(x[10], x[11]);
   4194   w10 = _mm_unpackhi_epi8(x[12], x[13]);
   4195   w11 = _mm_unpackhi_epi8(x[14], x[15]);
   4196 
   4197   w4 = _mm_unpacklo_epi16(w0, w1);
   4198   w5 = _mm_unpacklo_epi16(w2, w3);
   4199   w12 = _mm_unpacklo_epi16(w8, w9);
   4200   w13 = _mm_unpacklo_epi16(w10, w11);
   4201 
   4202   w6 = _mm_unpacklo_epi32(w4, w5);
   4203   w7 = _mm_unpackhi_epi32(w4, w5);
   4204   w14 = _mm_unpacklo_epi32(w12, w13);
   4205   w15 = _mm_unpackhi_epi32(w12, w13);
   4206 
   4207   // Store first 4-line result
   4208   d[8] = _mm_unpacklo_epi64(w6, w14);
   4209   d[9] = _mm_unpackhi_epi64(w6, w14);
   4210   d[10] = _mm_unpacklo_epi64(w7, w15);
   4211   d[11] = _mm_unpackhi_epi64(w7, w15);
   4212 
   4213   w4 = _mm_unpackhi_epi16(w0, w1);
   4214   w5 = _mm_unpackhi_epi16(w2, w3);
   4215   w12 = _mm_unpackhi_epi16(w8, w9);
   4216   w13 = _mm_unpackhi_epi16(w10, w11);
   4217 
   4218   w6 = _mm_unpacklo_epi32(w4, w5);
   4219   w7 = _mm_unpackhi_epi32(w4, w5);
   4220   w14 = _mm_unpacklo_epi32(w12, w13);
   4221   w15 = _mm_unpackhi_epi32(w12, w13);
   4222 
   4223   // Store second 4-line result
   4224   d[12] = _mm_unpacklo_epi64(w6, w14);
   4225   d[13] = _mm_unpackhi_epi64(w6, w14);
   4226   d[14] = _mm_unpacklo_epi64(w7, w15);
   4227   d[15] = _mm_unpackhi_epi64(w7, w15);
   4228 }
   4229 
   4230 static void transpose_TX_8X8(const uint8_t *src, ptrdiff_t pitchSrc,
   4231                              uint8_t *dst, ptrdiff_t pitchDst) {
   4232   __m128i r0, r1, r2, r3, r4, r5, r6, r7;
   4233   __m128i d0d1, d2d3, d4d5, d6d7;
   4234   r0 = _mm_loadl_epi64((__m128i *)(src + 0 * pitchSrc));
   4235   r1 = _mm_loadl_epi64((__m128i *)(src + 1 * pitchSrc));
   4236   r2 = _mm_loadl_epi64((__m128i *)(src + 2 * pitchSrc));
   4237   r3 = _mm_loadl_epi64((__m128i *)(src + 3 * pitchSrc));
   4238   r4 = _mm_loadl_epi64((__m128i *)(src + 4 * pitchSrc));
   4239   r5 = _mm_loadl_epi64((__m128i *)(src + 5 * pitchSrc));
   4240   r6 = _mm_loadl_epi64((__m128i *)(src + 6 * pitchSrc));
   4241   r7 = _mm_loadl_epi64((__m128i *)(src + 7 * pitchSrc));
   4242 
   4243   transpose8x8_sse2(&r0, &r1, &r2, &r3, &r4, &r5, &r6, &r7, &d0d1, &d2d3, &d4d5,
   4244                     &d6d7);
   4245 
   4246   _mm_storel_epi64((__m128i *)(dst + 0 * pitchDst), d0d1);
   4247   _mm_storel_epi64((__m128i *)(dst + 1 * pitchDst), _mm_srli_si128(d0d1, 8));
   4248   _mm_storel_epi64((__m128i *)(dst + 2 * pitchDst), d2d3);
   4249   _mm_storel_epi64((__m128i *)(dst + 3 * pitchDst), _mm_srli_si128(d2d3, 8));
   4250   _mm_storel_epi64((__m128i *)(dst + 4 * pitchDst), d4d5);
   4251   _mm_storel_epi64((__m128i *)(dst + 5 * pitchDst), _mm_srli_si128(d4d5, 8));
   4252   _mm_storel_epi64((__m128i *)(dst + 6 * pitchDst), d6d7);
   4253   _mm_storel_epi64((__m128i *)(dst + 7 * pitchDst), _mm_srli_si128(d6d7, 8));
   4254 }
   4255 
   4256 static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
   4257                       ptrdiff_t pitchDst, int width, int height) {
   4258   for (int j = 0; j < height; j += 8)
   4259     for (int i = 0; i < width; i += 8)
   4260       transpose_TX_8X8(src + i * pitchSrc + j, pitchSrc, dst + j * pitchDst + i,
   4261                        pitchDst);
   4262 }
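// For reference, transpose() above is the 8x8-tiled equivalent of the scalar
// copy below (a sketch, not part of the build):
//   for (int r = 0; r < height; ++r)
//     for (int c = 0; c < width; ++c)
//       dst[r * pitchDst + c] = src[c * pitchSrc + r];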
   4263 
   4264 static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
   4265                                       const uint8_t *left, int upsample_left,
   4266                                       int dy) {
   4267   __m128i dstvec[4], d[4];
   4268 
   4269   dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left, dy);
   4270   transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
   4271                             &d[0], &d[1], &d[2], &d[3]);
   4272 
   4273   *(uint32_t *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
   4274   *(uint32_t *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
   4275   *(uint32_t *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
   4276   *(uint32_t *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
   4278 }
   4279 
   4280 static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
   4281                                       const uint8_t *left, int upsample_left,
   4282                                       int dy) {
   4283   __m128i dstvec[8], d[8];
   4284 
   4285   dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left, dy);
   4286   transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
   4287                     &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
   4288                     &d[3]);
   4289 
   4290   _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
   4291   _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
   4292   _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
   4293   _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
   4294   _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
   4295   _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
   4296   _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
   4297   _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
   4298 }
   4299 
   4300 static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
   4301                                       const uint8_t *left, int upsample_left,
   4302                                       int dy) {
   4303   __m128i dstvec[4], d[8];
   4304 
   4305   dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left, dy);
   4306   transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
   4307                         &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
   4308   for (int i = 0; i < 8; i++) {
   4309     *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
   4310   }
   4311 }
   4312 
   4313 static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
   4314                                       const uint8_t *left, int upsample_left,
   4315                                       int dy) {
   4316   __m128i dstvec[8], d[4];
   4317 
   4318   dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left, dy);
   4319   transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
   4320                         &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
   4321                         &d[1], &d[2], &d[3]);
   4322   _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
   4323   _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
   4324   _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
   4325   _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
   4326 }
   4327 
   4328 static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
   4329                                        const uint8_t *left, int upsample_left,
   4330                                        int dy) {
   4331   __m128i dstvec[8], d[8];
   4332 
   4333   dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, dy);
   4334   transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
   4335                           dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
   4336                           d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
   4337   for (int i = 0; i < 8; i++) {
   4338     _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
   4339     _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
   4340                      _mm_srli_si128(d[i], 8));
   4341   }
   4342 }
   4343 
   4344 static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
   4345                                        const uint8_t *left, int upsample_left,
   4346                                        int dy) {
   4347   __m128i dstvec[16], d[16];
   4348 
   4349   dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, dy);
   4350   transpose16x8_8x16_sse2(
   4351       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
   4352       &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
   4353       &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
   4354       &d[3], &d[4], &d[5], &d[6], &d[7]);
   4355 
   4356   for (int i = 0; i < 8; i++) {
   4357     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   4358   }
   4359 }
   4360 
   4361 static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
   4362                                        const uint8_t *left, int upsample_left,
   4363                                        int dy) {
   4364   __m128i dstvec[4], d[16];
   4365 
   4366   dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, dy);
   4367   transpose4x16_sse2(dstvec, d);
   4368   for (int i = 0; i < 16; i++) {
   4369     *(uint32_t *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
   4370   }
   4371 }
   4372 
   4373 static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
   4374                                        const uint8_t *left, int upsample_left,
   4375                                        int dy) {
   4376   __m128i dstvec[16], d[8];
   4377 
   4378   dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left, dy);
   4379   for (int i = 4; i < 8; i++) {
   4380     d[i] = _mm_setzero_si128();
   4381   }
   4382   transpose16x8_8x16_sse2(
   4383       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
   4384       &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
   4385       &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
   4386       &d[3], &d[4], &d[5], &d[6], &d[7]);
   4387 
   4388   for (int i = 0; i < 4; i++) {
   4389     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   4390   }
   4391 }
   4392 
   4393 static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
   4394                                        const uint8_t *left, int upsample_left,
   4395                                        int dy) {
   4396   __m256i dstvec[16], d[16];
   4397 
   4398   dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
   4399   for (int i = 8; i < 16; i++) {
   4400     dstvec[i] = _mm256_setzero_si256();
   4401   }
   4402   transpose16x32_avx2(dstvec, d);
   4403 
   4404   for (int i = 0; i < 16; i++) {
   4405     _mm_storel_epi64((__m128i *)(dst + i * stride),
   4406                      _mm256_castsi256_si128(d[i]));
   4407   }
   4408   for (int i = 0; i < 16; i++) {
   4409     _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
   4410                      _mm256_extracti128_si256(d[i], 1));
   4411   }
   4412 }
   4413 
   4414 static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
   4415                                        const uint8_t *left, int upsample_left,
   4416                                        int dy) {
   4417   __m128i dstvec[32], d[16];
   4418 
   4419   dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, dy);
   4420 
   4421   transpose16x8_8x16_sse2(
   4422       &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
   4423       &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
   4424       &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
   4425       &d[3], &d[4], &d[5], &d[6], &d[7]);
   4426   transpose16x8_8x16_sse2(
   4427       &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
   4428       &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
   4429       &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
   4430       &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
   4431       &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
   4432       &d[6 + 8], &d[7 + 8]);
   4433 
   4434   for (int i = 0; i < 8; i++) {
   4435     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   4436     _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
   4437   }
   4438 }
   4439 
   4440 static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
   4441                                         const uint8_t *left, int upsample_left,
   4442                                         int dy) {
   4443   __m128i dstvec[16], d[16];
   4444 
   4445   dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left, dy);
   4446   transpose16x16_sse2(dstvec, d);
   4447 
   4448   for (int i = 0; i < 16; i++) {
   4449     _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   4450   }
   4451 }
   4452 
   4453 static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
   4454                                         const uint8_t *left, int upsample_left,
   4455                                         int dy) {
   4456   __m256i dstvec[32], d[32];
   4457 
   4458   dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
   4459   transpose16x32_avx2(dstvec, d);
   4460   transpose16x32_avx2(dstvec + 16, d + 16);
   4461   for (int j = 0; j < 16; j++) {
   4462     _mm_storeu_si128((__m128i *)(dst + j * stride),
   4463                      _mm256_castsi256_si128(d[j]));
   4464     _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
   4465                      _mm256_castsi256_si128(d[j + 16]));
   4466   }
   4467   for (int j = 0; j < 16; j++) {
   4468     _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
   4469                      _mm256_extracti128_si256(d[j], 1));
   4470     _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
   4471                      _mm256_extracti128_si256(d[j + 16], 1));
   4472   }
   4473 }
   4474 
   4475 static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
   4476                                         const uint8_t *left, int upsample_left,
   4477                                         int dy) {
   4478   DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
   4479   dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
   4480   transpose(dstT, 64, dst, stride, 64, 64);
   4481 }
   4482 
   4483 static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
   4484                                         const uint8_t *left, int upsample_left,
   4485                                         int dy) {
   4486   __m256i dstvec[16], d[16];
   4487 
   4488   dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
   4489   transpose16x32_avx2(dstvec, d);
   4490   // store
   4491   for (int j = 0; j < 16; j++) {
   4492     _mm_storeu_si128((__m128i *)(dst + j * stride),
   4493                      _mm256_castsi256_si128(d[j]));
   4494     _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
   4495                      _mm256_extracti128_si256(d[j], 1));
   4496   }
   4497 }
   4498 
   4499 static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   4500                                         const uint8_t *left, int upsample_left,
   4501                                         int dy) {
   4502   __m128i dstvec[32], d[16];
   4503 
   4504   dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, dy);
   4505   for (int i = 0; i < 32; i += 16) {
   4506     transpose16x16_sse2((dstvec + i), d);
   4507     for (int j = 0; j < 16; j++) {
   4508       _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
   4509     }
   4510   }
   4511 }
   4512 
   4513 static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
   4514                                         const uint8_t *left, int upsample_left,
   4515                                         int dy) {
   4516   uint8_t dstT[64 * 32];
   4517   dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
   4518   transpose(dstT, 64, dst, stride, 32, 64);
   4519 }
   4520 
   4521 static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
   4522                                         const uint8_t *left, int upsample_left,
   4523                                         int dy) {
   4524   uint8_t dstT[32 * 64];
   4525   dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
   4526   transpose(dstT, 32, dst, stride, 64, 32);
   4528 }
   4529 
   4530 static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
   4531                                         const uint8_t *left, int upsample_left,
   4532                                         int dy) {
   4533   uint8_t dstT[64 * 16];
   4534   dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
   4535   transpose(dstT, 64, dst, stride, 16, 64);
   4536 }
   4537 
   4538 static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
   4539                                         const uint8_t *left, int upsample_left,
   4540                                         int dy) {
   4541   __m128i dstvec[64], d[16];
   4542 
   4543   dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left, dy);
   4544   for (int i = 0; i < 64; i += 16) {
   4545     transpose16x16_sse2((dstvec + i), d);
   4546     for (int j = 0; j < 16; j++) {
   4547       _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
   4548     }
   4549   }
   4550 }
   4551 
   4552 void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   4553                                const uint8_t *above, const uint8_t *left,
   4554                                int upsample_left, int dx, int dy) {
   4555   (void)above;
   4556   (void)dx;
   4557   assert(dx == 1);
   4558   assert(dy > 0);
   4559 
   4560   if (bw == bh) {
   4561     switch (bw) {
   4562       case 4:
   4563         dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
   4564         break;
   4565       case 8:
   4566         dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
   4567         break;
   4568       case 16:
   4569         dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
   4570         break;
   4571       case 32:
   4572         dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
   4573         break;
   4574       case 64:
   4575         dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
   4576         break;
   4577     }
   4578   } else {
   4579     if (bw < bh) {
   4580       if (bw + bw == bh) {  // bh == 2 * bw (1:2 blocks)
   4581         switch (bw) {
   4582           case 4:
   4583             dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
   4584             break;
   4585           case 8:
   4586             dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
   4587             break;
   4588           case 16:
   4589             dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
   4590             break;
   4591           case 32:
   4592             dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
   4593             break;
   4594         }
   4595       } else {  // bh == 4 * bw (1:4 blocks)
   4596         switch (bw) {
   4597           case 4:
   4598             dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
   4599             break;
   4600           case 8:
   4601             dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
   4602             break;
   4603           case 16:
   4604             dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
   4605             break;
   4606         }
   4607       }
   4608     } else {
   4609       if (bh + bh == bw) {  // bw == 2 * bh (2:1 blocks)
   4610         switch (bh) {
   4611           case 4:
   4612             dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
   4613             break;
   4614           case 8:
   4615             dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
   4616             break;
   4617           case 16:
   4618             dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
   4619             break;
   4620           case 32:
   4621             dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
   4622             break;
   4623         }
   4624       } else {  // bw == 4 * bh (4:1 blocks)
   4625         switch (bh) {
   4626           case 4:
   4627             dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
   4628             break;
   4629           case 8:
   4630             dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
   4631             break;
   4632           case 16:
   4633             dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
   4634             break;
   4635         }
   4636       }
   4637     }
   4638   }
   4639 }
   4640