// SSE2 high-bitdepth intra predictors (x86).
      1 /*
      2  *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <emmintrin.h>  // SSE2
     12 
     13 #include "./vpx_config.h"
     14 #include "./vpx_dsp_rtcd.h"
     15 #include "vpx/vpx_integer.h"
     16 
     17 // -----------------------------------------------------------------------------
     18 
     19 void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
     20                                      const uint16_t *above,
     21                                      const uint16_t *left, int bd) {
     22   const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
     23   const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
     24   const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
     25   const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
     26   const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
     27   (void)above;
     28   (void)bd;
     29   _mm_storel_epi64((__m128i *)dst, row0);
     30   dst += stride;
     31   _mm_storel_epi64((__m128i *)dst, row1);
     32   dst += stride;
     33   _mm_storel_epi64((__m128i *)dst, row2);
     34   dst += stride;
     35   _mm_storel_epi64((__m128i *)dst, row3);
     36 }
     37 
     38 void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
     39                                      const uint16_t *above,
     40                                      const uint16_t *left, int bd) {
     41   const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
     42   const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
     43   const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
     44   const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
     45   const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
     46   const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
     47   const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
     48   const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
     49   const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
     50   (void)above;
     51   (void)bd;
     52   _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
     53   dst += stride;
     54   _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
     55   dst += stride;
     56   _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
     57   dst += stride;
     58   _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
     59   dst += stride;
     60   _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
     61   dst += stride;
     62   _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
     63   dst += stride;
     64   _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
     65   dst += stride;
     66   _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
     67 }
     68 
     69 static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
     70                                        const __m128i *row) {
     71   const __m128i val = _mm_unpacklo_epi64(*row, *row);
     72   _mm_store_si128((__m128i *)*dst, val);
     73   _mm_store_si128((__m128i *)(*dst + 8), val);
     74   *dst += stride;
     75 }
     76 
     77 static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
     78                                        const __m128i *row) {
     79   const __m128i val = _mm_unpackhi_epi64(*row, *row);
     80   _mm_store_si128((__m128i *)(*dst), val);
     81   _mm_store_si128((__m128i *)(*dst + 8), val);
     82   *dst += stride;
     83 }
     84 
     85 void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
     86                                        const uint16_t *above,
     87                                        const uint16_t *left, int bd) {
     88   int i;
     89   (void)above;
     90   (void)bd;
     91 
     92   for (i = 0; i < 2; i++, left += 8) {
     93     const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
     94     const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
     95     const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
     96     const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
     97     const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
     98     const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
     99     const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
    100     const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
    101     const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
    102     h_store_16_unpacklo(&dst, stride, &row0);
    103     h_store_16_unpacklo(&dst, stride, &row1);
    104     h_store_16_unpacklo(&dst, stride, &row2);
    105     h_store_16_unpacklo(&dst, stride, &row3);
    106     h_store_16_unpackhi(&dst, stride, &row4);
    107     h_store_16_unpackhi(&dst, stride, &row5);
    108     h_store_16_unpackhi(&dst, stride, &row6);
    109     h_store_16_unpackhi(&dst, stride, &row7);
    110   }
    111 }
    112 
    113 static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
    114                                        const __m128i *row) {
    115   const __m128i val = _mm_unpacklo_epi64(*row, *row);
    116   _mm_store_si128((__m128i *)(*dst), val);
    117   _mm_store_si128((__m128i *)(*dst + 8), val);
    118   _mm_store_si128((__m128i *)(*dst + 16), val);
    119   _mm_store_si128((__m128i *)(*dst + 24), val);
    120   *dst += stride;
    121 }
    122 
    123 static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
    124                                        const __m128i *row) {
    125   const __m128i val = _mm_unpackhi_epi64(*row, *row);
    126   _mm_store_si128((__m128i *)(*dst), val);
    127   _mm_store_si128((__m128i *)(*dst + 8), val);
    128   _mm_store_si128((__m128i *)(*dst + 16), val);
    129   _mm_store_si128((__m128i *)(*dst + 24), val);
    130   *dst += stride;
    131 }
    132 
    133 void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
    134                                        const uint16_t *above,
    135                                        const uint16_t *left, int bd) {
    136   int i;
    137   (void)above;
    138   (void)bd;
    139 
    140   for (i = 0; i < 4; i++, left += 8) {
    141     const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
    142     const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
    143     const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
    144     const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
    145     const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
    146     const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
    147     const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
    148     const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
    149     const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
    150     h_store_32_unpacklo(&dst, stride, &row0);
    151     h_store_32_unpacklo(&dst, stride, &row1);
    152     h_store_32_unpacklo(&dst, stride, &row2);
    153     h_store_32_unpacklo(&dst, stride, &row3);
    154     h_store_32_unpackhi(&dst, stride, &row4);
    155     h_store_32_unpackhi(&dst, stride, &row5);
    156     h_store_32_unpackhi(&dst, stride, &row6);
    157     h_store_32_unpackhi(&dst, stride, &row7);
    158   }
    159 }
    160 
    161 //------------------------------------------------------------------------------
    162 // DC 4x4
    163 
    164 static INLINE __m128i dc_sum_4(const uint16_t *ref) {
    165   const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
    166   const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
    167   const __m128i a = _mm_add_epi16(_dcba, _xxdc);
    168   return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
    169 }
    170 
    171 static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
    172                                 const __m128i *dc) {
    173   const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
    174   int i;
    175   for (i = 0; i < 4; ++i, dst += stride) {
    176     _mm_storel_epi64((__m128i *)dst, dc_dup);
    177   }
    178 }
    179 
    180 void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
    181                                            const uint16_t *above,
    182                                            const uint16_t *left, int bd) {
    183   const __m128i two = _mm_cvtsi32_si128(2);
    184   const __m128i sum = dc_sum_4(left);
    185   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
    186   (void)above;
    187   (void)bd;
    188   dc_store_4x4(dst, stride, &dc);
    189 }
    190 
    191 void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
    192                                           const uint16_t *above,
    193                                           const uint16_t *left, int bd) {
    194   const __m128i two = _mm_cvtsi32_si128(2);
    195   const __m128i sum = dc_sum_4(above);
    196   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
    197   (void)left;
    198   (void)bd;
    199   dc_store_4x4(dst, stride, &dc);
    200 }
    201 
    202 void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
    203                                           const uint16_t *above,
    204                                           const uint16_t *left, int bd) {
    205   const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    206   const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    207   (void)above;
    208   (void)left;
    209   dc_store_4x4(dst, stride, &dc_dup);
    210 }
    211 
    212 //------------------------------------------------------------------------------
    213 // DC 8x8
    214 
    215 static INLINE __m128i dc_sum_8(const uint16_t *ref) {
    216   const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
    217   const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
    218   const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
    219   const __m128i a = _mm_add_epi16(_dcba, _xxdc);
    220 
    221   return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
    222 }
    223 
    224 static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
    225                                 const __m128i *dc) {
    226   const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
    227   const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
    228   int i;
    229   for (i = 0; i < 8; ++i, dst += stride) {
    230     _mm_store_si128((__m128i *)dst, dc_dup);
    231   }
    232 }
    233 
    234 void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
    235                                            const uint16_t *above,
    236                                            const uint16_t *left, int bd) {
    237   const __m128i four = _mm_cvtsi32_si128(4);
    238   const __m128i sum = dc_sum_8(left);
    239   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
    240   (void)above;
    241   (void)bd;
    242   dc_store_8x8(dst, stride, &dc);
    243 }
    244 
    245 void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
    246                                           const uint16_t *above,
    247                                           const uint16_t *left, int bd) {
    248   const __m128i four = _mm_cvtsi32_si128(4);
    249   const __m128i sum = dc_sum_8(above);
    250   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
    251   (void)left;
    252   (void)bd;
    253   dc_store_8x8(dst, stride, &dc);
    254 }
    255 
    256 void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
    257                                           const uint16_t *above,
    258                                           const uint16_t *left, int bd) {
    259   const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    260   const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    261   (void)above;
    262   (void)left;
    263   dc_store_8x8(dst, stride, &dc_dup);
    264 }
    265 
    266 //------------------------------------------------------------------------------
    267 // DC 16x16
    268 
    269 static INLINE __m128i dc_sum_16(const uint16_t *ref) {
    270   const __m128i sum_lo = dc_sum_8(ref);
    271   const __m128i sum_hi = dc_sum_8(ref + 8);
    272   return _mm_add_epi16(sum_lo, sum_hi);
    273 }
    274 
    275 static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
    276                                   const __m128i *dc) {
    277   const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
    278   const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
    279   int i;
    280   for (i = 0; i < 16; ++i, dst += stride) {
    281     _mm_store_si128((__m128i *)dst, dc_dup);
    282     _mm_store_si128((__m128i *)(dst + 8), dc_dup);
    283   }
    284 }
    285 
    286 void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
    287                                              const uint16_t *above,
    288                                              const uint16_t *left, int bd) {
    289   const __m128i eight = _mm_cvtsi32_si128(8);
    290   const __m128i sum = dc_sum_16(left);
    291   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
    292   (void)above;
    293   (void)bd;
    294   dc_store_16x16(dst, stride, &dc);
    295 }
    296 
    297 void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
    298                                             const uint16_t *above,
    299                                             const uint16_t *left, int bd) {
    300   const __m128i eight = _mm_cvtsi32_si128(8);
    301   const __m128i sum = dc_sum_16(above);
    302   const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
    303   (void)left;
    304   (void)bd;
    305   dc_store_16x16(dst, stride, &dc);
    306 }
    307 
    308 void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
    309                                             const uint16_t *above,
    310                                             const uint16_t *left, int bd) {
    311   const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    312   const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    313   (void)above;
    314   (void)left;
    315   dc_store_16x16(dst, stride, &dc_dup);
    316 }
    317 
    318 //------------------------------------------------------------------------------
    319 // DC 32x32
    320 
    321 static INLINE __m128i dc_sum_32(const uint16_t *ref) {
    322   const __m128i zero = _mm_setzero_si128();
    323   const __m128i sum_a = dc_sum_16(ref);
    324   const __m128i sum_b = dc_sum_16(ref + 16);
    325   // 12 bit bd will outrange, so expand to 32 bit before adding final total
    326   return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
    327                        _mm_unpacklo_epi16(sum_b, zero));
    328 }
    329 
    330 static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
    331                                   const __m128i *dc) {
    332   const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
    333   const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
    334   int i;
    335   for (i = 0; i < 32; ++i, dst += stride) {
    336     _mm_store_si128((__m128i *)dst, dc_dup);
    337     _mm_store_si128((__m128i *)(dst + 8), dc_dup);
    338     _mm_store_si128((__m128i *)(dst + 16), dc_dup);
    339     _mm_store_si128((__m128i *)(dst + 24), dc_dup);
    340   }
    341 }
    342 
    343 void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
    344                                              const uint16_t *above,
    345                                              const uint16_t *left, int bd) {
    346   const __m128i sixteen = _mm_cvtsi32_si128(16);
    347   const __m128i sum = dc_sum_32(left);
    348   const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
    349   (void)above;
    350   (void)bd;
    351   dc_store_32x32(dst, stride, &dc);
    352 }
    353 
    354 void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
    355                                             const uint16_t *above,
    356                                             const uint16_t *left, int bd) {
    357   const __m128i sixteen = _mm_cvtsi32_si128(16);
    358   const __m128i sum = dc_sum_32(above);
    359   const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
    360   (void)left;
    361   (void)bd;
    362   dc_store_32x32(dst, stride, &dc);
    363 }
    364 
    365 void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
    366                                             const uint16_t *above,
    367                                             const uint16_t *left, int bd) {
    368   const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
    369   const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
    370   (void)above;
    371   (void)left;
    372   dc_store_32x32(dst, stride, &dc_dup);
    373 }
    374 
    375 // -----------------------------------------------------------------------------
    376 /*
    377 ; ------------------------------------------
    378 ; input: x, y, z, result
    379 ;
    380 ; trick from pascal
    381 ; (x+2y+z+2)>>2 can be calculated as:
    382 ; result = avg(x,z)
    383 ; result -= xor(x,z) & 1
    384 ; result = avg(result,y)
    385 ; ------------------------------------------
    386 */
    387 static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
    388                                  const __m128i *z) {
    389   const __m128i one = _mm_set1_epi16(1);
    390   const __m128i a = _mm_avg_epu16(*x, *z);
    391   const __m128i b =
    392       _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
    393   return _mm_avg_epu16(b, *y);
    394 }
    395 
void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // D117 4x4 directional prediction, using both the above row and the
  // left column.
  const int I = left[0];
  const int J = left[1];
  const int K = left[2];
  // Build K J I X A B C D (word 0..7), where X = above[-1] and A..D are
  // above[0..3]: load from above-4 and patch the three low lanes.
  const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
  const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
  const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
  const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
  // One- and two-word shifts of the neighbor vector feed the filters.
  const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
  const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4);
  const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0);            // 2-tap
  const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00);  // 3-tap
  // Even rows come from avg2, odd rows from avg3, shifted one word per pair.
  const __m128i row0 = _mm_srli_si128(avg2, 6);
  const __m128i row1 = _mm_srli_si128(avg3, 4);
  const __m128i row2 = _mm_srli_si128(avg2, 4);
  const __m128i row3 = _mm_srli_si128(avg3, 2);
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, row0);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);

  // Patch the left edge of rows 2 and 3 with 3-tap values from the left
  // column (avg3 words 1 and 0); dst currently points at row 3.
  dst -= stride;
  dst[0] = _mm_extract_epi16(avg3, 1);
  dst[stride] = _mm_extract_epi16(avg3, 0);
}
    427 
    428 void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
    429                                         const uint16_t *above,
    430                                         const uint16_t *left, int bd) {
    431   const int I = left[0];
    432   const int J = left[1];
    433   const int K = left[2];
    434   const int L = left[3];
    435   const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4));
    436   const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0);
    437   const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1);
    438   const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2);
    439   const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2);
    440   const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0);
    441   const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC);
    442   const __m128i row0 = _mm_srli_si128(avg3, 6);
    443   const __m128i row1 = _mm_srli_si128(avg3, 4);
    444   const __m128i row2 = _mm_srli_si128(avg3, 2);
    445   const __m128i row3 = avg3;
    446   (void)bd;
    447   _mm_storel_epi64((__m128i *)dst, row0);
    448   dst += stride;
    449   _mm_storel_epi64((__m128i *)dst, row1);
    450   dst += stride;
    451   _mm_storel_epi64((__m128i *)dst, row2);
    452   dst += stride;
    453   _mm_storel_epi64((__m128i *)dst, row3);
    454 }
    455 
void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  // D153 4x4 directional prediction, interleaving 2-tap and 3-tap filtered
  // neighbors from the left column and above row.
  const int I = left[0];
  const int J = left[1];
  const int K = left[2];
  const int L = left[3];
  // Build L K J I X A B C (word 0..7), where X = above[-1] and A..C are
  // above[0..2]: load from above-5 and patch the four low lanes.
  const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5));
  const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0);
  const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1);
  const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2);
  const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3);
  const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2);
  const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4);
  const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00);  // 3-tap
  const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0);            // 2-tap
  // Interleaving avg2/avg3 yields row 3; rows 2 and 1 are right shifts of
  // it, and row 0 comes from avg3 alone.
  const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3);
  const __m128i row2 = _mm_srli_si128(row3, 4);
  const __m128i row1 = _mm_srli_si128(row3, 8);
  const __m128i row0 = _mm_srli_si128(avg3, 4);
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, row0);
  // Overwrite pixel (0,0) with the 2-tap average of I and X (avg2 word 3).
  dst[0] = _mm_extract_epi16(avg2, 3);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row1);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row2);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, row3);
}
    486 
    487 void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
    488                                         const uint16_t *above,
    489                                         const uint16_t *left, int bd) {
    490   const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left);
    491   const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff);
    492   const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000);
    493   const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2);
    494   const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4);
    495   const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00);
    496   const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0);
    497   const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3);
    498   const __m128i row1 = _mm_srli_si128(row0, 4);
    499   const __m128i row2 = _mm_srli_si128(row0, 8);
    500   const __m128i row3 = LLLL0000;
    501   (void)above;
    502   (void)bd;
    503   _mm_storel_epi64((__m128i *)dst, row0);
    504   dst += stride;
    505   _mm_storel_epi64((__m128i *)dst, row1);
    506   dst += stride;
    507   _mm_storel_epi64((__m128i *)dst, row2);
    508   dst += stride;
    509   _mm_storel_epi64((__m128i *)dst, row3);
    510 }
    511 
    512 void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
    513                                        const uint16_t *above,
    514                                        const uint16_t *left, int bd) {
    515   const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
    516   const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
    517   const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
    518   const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
    519   const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0);
    520   const __m128i row0 = avg2;
    521   const __m128i row1 = avg3;
    522   const __m128i row2 = _mm_srli_si128(avg2, 2);
    523   const __m128i row3 = _mm_srli_si128(avg3, 2);
    524   (void)left;
    525   (void)bd;
    526   _mm_storel_epi64((__m128i *)dst, row0);
    527   dst += stride;
    528   _mm_storel_epi64((__m128i *)dst, row1);
    529   dst += stride;
    530   _mm_storel_epi64((__m128i *)dst, row2);
    531   dst += stride;
    532   _mm_storel_epi64((__m128i *)dst, row3);
    533 }
    534