/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <tmmintrin.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"

// -----------------------------------------------------------------------------
/*
; ------------------------------------------
; input: x, y, z, result
;
; trick from pascal
; (x+2y+z+2)>>2 can be calculated as:
; result = avg(x,z)
; result -= xor(x,z) & 1
; result = avg(result,y)
; ------------------------------------------
*/
static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
                                 const __m128i *z) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a = _mm_avg_epu16(*x, *z);
  const __m128i b =
      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
  return _mm_avg_epu16(b, *y);
}
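
/*
 * Illustration only (avg3_scalar is not part of the library): a scalar sketch
 * of the trick above for a single 16-bit lane. _mm_avg_epu16() computes
 * (a + b + 1) >> 1, so subtracting the bit lost when x and z have different
 * parity leaves (x + z) >> 1, and a second average with y gives the exact
 * three-tap result.
 */
#if 0
static uint16_t avg3_scalar(uint16_t x, uint16_t y, uint16_t z) {
  const int a = (x + z + 1) >> 1;       // avg(x, z), rounds up on odd sums
  const int b = a - ((x ^ z) & 1);      // == (x + z) >> 1
  return (uint16_t)((b + y + 1) >> 1);  // == (x + 2 * y + z + 2) >> 2
}
#endif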

void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
  (void)left;
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, avg3);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
  dst[3] = above[7];  // aka H
}
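
/*
 * For reference (derived from the code above), with above[] = A..H and
 * avg3(x, y, z) = (x + 2 * y + z + 2) >> 2, the 4x4 block written above is:
 *   row 0: avg3(A,B,C) avg3(B,C,D) avg3(C,D,E) avg3(D,E,F)
 *   row 1: avg3(B,C,D) avg3(C,D,E) avg3(D,E,F) avg3(E,F,G)
 *   row 2: avg3(C,D,E) avg3(D,E,F) avg3(E,F,G) avg3(F,G,H)
 *   row 3: avg3(D,E,F) avg3(E,F,G) avg3(F,G,H) H
 */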

// Helper for the wider D45 predictors: drop lane 0 of *row, shift in lane 0
// of *ar (the replicated right-most above sample), store the updated row and
// advance *dst by one line.
static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride,
                               __m128i *row, const __m128i *ar) {
  *row = _mm_alignr_epi8(*ar, *row, 2);
  _mm_store_si128((__m128i *)*dst, *row);
  *dst += stride;
}

void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3);
  dst += stride;
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
  d45_store_8(&dst, stride, &avg3, &HHHHHHHH);
}
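
/*
 * The wider D45 predictors stream the three-tap average of the above row:
 * each output row advances one pixel to the right, and the replicated
 * right-most above sample (HHHHHHHH / AR) fills the tail.
 */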

// 16-wide variant of d45_store_8(): the two row registers shift together,
// with lane 0 of *row_1 feeding the top of *row_0 and *ar topping up *row_1.
static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride,
                                __m128i *row_0, __m128i *row_1,
                                const __m128i *ar) {
  *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2);
  *row_1 = _mm_alignr_epi8(*ar, *row_1, 2);
  _mm_store_si128((__m128i *)*dst, *row_0);
  _mm_store_si128((__m128i *)(*dst + 8), *row_1);
  *dst += stride;
}

void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  dst += stride;
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
}

void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  int i;
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
  dst += stride;
  for (i = 1; i < 32; ++i) {
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
    dst += stride;
  }
}

DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = {
  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1
};

static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
  *a = _mm_shuffle_epi8(*a, *rotrw);
  return *a;
}
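
/*
 * rotr_epu16() rotates the eight 16-bit lanes of *a by one position (lane i
 * takes lane i + 1, lane 7 takes lane 0) and updates *a in place. Successive
 * calls therefore present lanes 0, 1, 2, ... of the original vector in lane 7,
 * where _mm_alignr_epi8(row, rotated, 14) below can prepend them to a row of
 * the prediction.
 */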

void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i IXABCDEF =
      _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF);
  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2);
  __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0);
  __m128i rowa = avg2;
  __m128i rowb = avg3;
  int i;
  (void)bd;
  for (i = 0; i < 8; i += 2) {
    _mm_store_si128((__m128i *)dst, rowa);
    dst += stride;
    _mm_store_si128((__m128i *)dst, rowb);
    dst += stride;
    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
    rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14);
  }
}
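
/*
 * In the D117 predictors the top two rows are the two-tap and three-tap
 * averages of the row above the block; every later row repeats the row two
 * above it shifted right by one pixel, with a new left-edge three-tap average
 * rotated in through rotr_epu16(). The 16x16 and 32x32 versions below apply
 * the same scheme across two and four registers per row.
 */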

void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
  const __m128i L1_ = _mm_srli_si128(L1, 2);
  __m128i rowa_0 = avg2_0;
  __m128i rowa_1 = avg2_1;
  __m128i rowb_0 = avg3_0;
  __m128i rowb_1 = avg3_1;
  __m128i avg3_left[2];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
  for (i = 0; i < 2; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; j += 2) {
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      dst += stride;
      _mm_store_si128((__m128i *)dst, rowb_0);
      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
      dst += stride;
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
    }
  }
}

void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14);
  const __m128i C1 = _mm_alignr_epi8(B1, B0, 14);
  const __m128i C2 = _mm_alignr_epi8(B2, B1, 14);
  const __m128i C3 = _mm_alignr_epi8(B3, B2, 14);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2);
  const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2);
  const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2);
  const __m128i L3_ = _mm_srli_si128(L3, 2);
  __m128i rowa_0 = avg2_0;
  __m128i rowa_1 = avg2_1;
  __m128i rowa_2 = avg2_2;
  __m128i rowa_3 = avg2_3;
  __m128i rowb_0 = avg3_0;
  __m128i rowb_1 = avg3_1;
  __m128i rowb_2 = avg3_2;
  __m128i rowb_3 = avg3_3;
  __m128i avg3_left[4];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_);
  avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_);
  avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_);
  avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_);
  for (i = 0; i < 4; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; j += 2) {
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
      dst += stride;
      _mm_store_si128((__m128i *)dst, rowb_0);
      _mm_store_si128((__m128i *)(dst + 8), rowb_1);
      _mm_store_si128((__m128i *)(dst + 16), rowb_2);
      _mm_store_si128((__m128i *)(dst + 24), rowb_3);
      dst += stride;
      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14);
      rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14);
      rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14);
      rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14);
    }
  }
}

void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i AXIJKLMN =
      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14);
  const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0);
  __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
  __m128i rowa = avg3;
  int i;
  (void)bd;
  for (i = 0; i < 8; ++i) {
    rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14);
    _mm_store_si128((__m128i *)dst, rowa);
    dst += stride;
  }
}
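
/*
 * The D135 predictors start from the three-tap average of the row above the
 * block and, for each output row, shift it right by one pixel while
 * prepending the next three-tap average taken down the left edge (beginning
 * at the top-left corner) via rotr_epu16().
 */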

void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i B0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
  const __m128i C1 = _mm_srli_si128(B1, 2);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
  __m128i rowa_0 = avg3_0;
  __m128i rowa_1 = avg3_1;
  __m128i avg3_left[2];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
  for (i = 0; i < 2; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; ++j) {
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      dst += stride;
    }
  }
}

void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16);
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i B0 = _mm_load_si128((const __m128i *)above);
  const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i C0 = _mm_alignr_epi8(B1, B0, 2);
  const __m128i C1 = _mm_alignr_epi8(B2, B1, 2);
  const __m128i C2 = _mm_alignr_epi8(B3, B2, 2);
  const __m128i C3 = _mm_srli_si128(B3, 2);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14);
  const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14);
  const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14);
  const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14);
  __m128i rowa_0 = avg3_0;
  __m128i rowa_1 = avg3_1;
  __m128i rowa_2 = avg3_2;
  __m128i rowa_3 = avg3_3;
  __m128i avg3_left[4];
  int i, j;
  (void)bd;
  avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_);
  avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_);
  avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_);
  avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_);
  for (i = 0; i < 4; ++i) {
    __m128i avg_left = avg3_left[i];
    for (j = 0; j < 8; ++j) {
      rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14);
      rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14);
      rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14);
      rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14);
      _mm_store_si128((__m128i *)dst, rowa_0);
      _mm_store_si128((__m128i *)(dst + 8), rowa_1);
      _mm_store_si128((__m128i *)(dst + 16), rowa_2);
      _mm_store_si128((__m128i *)(dst + 24), rowa_3);
      dst += stride;
    }
  }
}

void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2);
  const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4);
  const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG);
  const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left);
  const __m128i XIJKLMNO =
      _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14);
  const __m128i AXIJKLMN =
      _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14);
  const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN);
  const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO);
  const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left);
  const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left);
  const __m128i row0 =
      _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12);
  const __m128i row1 =
      _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12);
  const __m128i row2 =
      _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12);
  const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12);
  const __m128i row4 =
      _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12);
  const __m128i row5 =
      _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12);
  const __m128i row6 =
      _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12);
  const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12);
  (void)bd;
  _mm_store_si128((__m128i *)dst, row0);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row1);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row2);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row3);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row4);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row5);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row6);
  dst += stride;
  _mm_store_si128((__m128i *)dst, row7);
}
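
/*
 * The D153 predictors shift each new row right by two pixels relative to the
 * row above it. The pairs of pixels that enter on the left come from
 * interleaving the two-tap and three-tap averages taken down the left edge
 * (avg2_left / avg3_left above); the right-hand side of row 0 is the
 * three-tap average of the row above the block.
 */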

void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_srli_si128(A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_srli_si128(A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i avg2_avg3_left[2][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);

  for (j = 0; j < 2; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
    }
  }
}

void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15));
  const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_srli_si128(A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_srli_si128(A3, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14);
  const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12);
  const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2);
  const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2);
  const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i row_2 = avg3_2;
  __m128i row_3 = avg3_3;
  __m128i avg2_avg3_left[4][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2);
  avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2);
  avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3);
  avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3);

  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
      row_3 = _mm_alignr_epi8(row_3, row_2, 12);
      row_2 = _mm_alignr_epi8(row_2, row_1, 12);
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      _mm_store_si128((__m128i *)(dst + 16), row_2);
      _mm_store_si128((__m128i *)(dst + 24), row_3);
      dst += stride;
    }
  }
}

// Writes four rows of an 8-wide D207 block: each row is the previous one
// advanced by two pixels (4 bytes) along the interleaved avg2/avg3 sequence
// held in *a and *b.
static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                  const __m128i *a, const __m128i *b) {
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  *dst += stride;
}

void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
  const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3);
  const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3);
  (void)above;
  (void)bd;
  d207_store_4x8(&dst, stride, &out_a, &out_b);
  d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH);
}
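
/*
 * The D207 predictors use only the left column: the two-tap and three-tap
 * averages taken down that column are interleaved to form the first row, each
 * later row starts one pair (two pixels) further down, and the tail is padded
 * with the replicated bottom-left pixel (HHHHHHHH / LR).
 */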

// 16-wide counterpart of d207_store_4x8(), advancing two pixels per row
// across the three consecutive registers *a, *b and *c.
static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride,
                                   const __m128i *a, const __m128i *b,
                                   const __m128i *c) {
  _mm_store_si128((__m128i *)*dst, *a);
  _mm_store_si128((__m128i *)(*dst + 8), *b);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
  *dst += stride;
}

void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  (void)above;
  (void)bd;
  d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
  d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
  d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
  d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
}

// 32-wide counterpart of d207_store_4x8(), advancing two pixels per row
// across the five consecutive registers *a .. *e.
static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride,
                                   const __m128i *a, const __m128i *b,
                                   const __m128i *c, const __m128i *d,
                                   const __m128i *e) {
  _mm_store_si128((__m128i *)*dst, *a);
  _mm_store_si128((__m128i *)(*dst + 8), *b);
  _mm_store_si128((__m128i *)(*dst + 16), *c);
  _mm_store_si128((__m128i *)(*dst + 24), *d);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8));
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12));
  _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12));
  _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12));
  _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12));
  *dst += stride;
}

void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(LR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(LR, A3, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2);
  const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2);
  const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3);
  const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3);
  (void)above;
  (void)bd;
  d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e);
  d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f);
  d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g);
  d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h);
  d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR);
  d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR);
  d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR);
  d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR);
}

// Writes four rows of an 8-wide D63 block: stores the avg2 row *a and the
// avg3 row *b, advances both by one pixel toward *ar (the replicated
// right-most above sample), stores them again, then advances once more.
static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride,
                                 __m128i *a, __m128i *b, const __m128i *ar) {
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, *b);
  *dst += stride;
  *a = _mm_alignr_epi8(*ar, *a, 2);
  *b = _mm_alignr_epi8(*ar, *b, 2);
  _mm_store_si128((__m128i *)*dst, *a);
  *dst += stride;
  _mm_store_si128((__m128i *)*dst, *b);
  *dst += stride;
  *a = _mm_alignr_epi8(*ar, *a, 2);
  *b = _mm_alignr_epi8(*ar, *b, 2);
}

void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
  (void)left;
  (void)bd;
  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
}
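
/*
 * The D63 predictors use only the row above the block: even rows hold the
 * two-tap averages and odd rows the three-tap averages, and every pair of
 * rows advances one pixel to the right, padding with the replicated
 * right-most above sample (HHHHHHHH / AR).
 */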

void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  int i;
  (void)left;
  (void)bd;
  for (i = 0; i < 14; i += 2) {
    _mm_store_si128((__m128i *)dst, avg2_0);
    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
    dst += stride;
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    dst += stride;
    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
    avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2);
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2);
  }
  _mm_store_si128((__m128i *)dst, avg2_0);
  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
  dst += stride;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
}

void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  int i;
  (void)left;
  (void)bd;
  for (i = 0; i < 30; i += 2) {
    _mm_store_si128((__m128i *)dst, avg2_0);
    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
    _mm_store_si128((__m128i *)(dst + 16), avg2_2);
    _mm_store_si128((__m128i *)(dst + 24), avg2_3);
    dst += stride;
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
    dst += stride;
    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
    avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2);
    avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2);
    avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2);
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
  }
  _mm_store_si128((__m128i *)dst, avg2_0);
  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
  _mm_store_si128((__m128i *)(dst + 16), avg2_2);
  _mm_store_si128((__m128i *)(dst + 24), avg2_3);
  dst += stride;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
}