Home | History | Annotate | Download | only in x86
      1 /*
      2  *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
      3  *
      4  *  Use of this source code is governed by a BSD-style license
      5  *  that can be found in the LICENSE file in the root of the source
      6  *  tree. An additional intellectual property rights grant can be found
      7  *  in the file PATENTS.  All contributing project authors may
      8  *  be found in the AUTHORS file in the root of the source tree.
      9  */
     10 
     11 #include <emmintrin.h>  // SSE2
     12 
     13 #include "./vpx_dsp_rtcd.h"
     14 #include "vpx_ports/mem.h"
     15 #include "vpx_ports/emmintrin_compat.h"
     16 
     17 static INLINE __m128i abs_diff(__m128i a, __m128i b) {
     18   return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
     19 }
     20 
     21 // filter_mask and hev_mask
     22 #define FILTER_HEV_MASK                                                       \
     23   do {                                                                        \
     24     /* (abs(q1 - q0), abs(p1 - p0) */                                         \
     25     __m128i flat = abs_diff(q1p1, q0p0);                                      \
     26     /* abs(p1 - q1), abs(p0 - q0) */                                          \
     27     const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);                        \
     28     __m128i abs_p0q0, abs_p1q1, work;                                         \
     29                                                                               \
     30     /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */       \
     31     hev =                                                                     \
     32         _mm_unpacklo_epi8(_mm_max_epu8(flat, _mm_srli_si128(flat, 8)), zero); \
     33     hev = _mm_cmpgt_epi16(hev, thresh);                                       \
     34     hev = _mm_packs_epi16(hev, hev);                                          \
     35                                                                               \
     36     /* const int8_t mask = filter_mask(*limit, *blimit, */                    \
     37     /*                                 p3, p2, p1, p0, q0, q1, q2, q3); */    \
     38     abs_p0q0 =                                                                \
     39         _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */     \
     40     abs_p1q1 =                                                                \
     41         _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */     \
     42     abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);                                   \
     43     abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */    \
     44     /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */                                 \
     45     mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);                                 \
     46     /* abs(p3 - p2), abs(p2 - p1) */                                          \
     47     work = abs_diff(p3p2, p2p1);                                              \
     48     flat = _mm_max_epu8(work, flat);                                          \
     49     /* abs(q3 - q2), abs(q2 - q1) */                                          \
     50     work = abs_diff(q3q2, q2q1);                                              \
     51     flat = _mm_max_epu8(work, flat);                                          \
     52     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));                       \
     53     mask = _mm_unpacklo_epi64(mask, flat);                                    \
     54     mask = _mm_subs_epu8(mask, limit);                                        \
     55     mask = _mm_cmpeq_epi8(mask, zero);                                        \
     56     mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));                      \
     57   } while (0)
     58 
     59 #define FILTER4                                                             \
     60   do {                                                                      \
     61     const __m128i t3t4 =                                                    \
     62         _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);       \
     63     const __m128i t80 = _mm_set1_epi8(0x80);                                \
     64     __m128i filter, filter2filter1, work;                                   \
     65                                                                             \
     66     ps1ps0 = _mm_xor_si128(p1p0, t80); /* ^ 0x80 */                         \
     67     qs1qs0 = _mm_xor_si128(q1q0, t80);                                      \
     68                                                                             \
     69     /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */               \
     70     work = _mm_subs_epi8(ps1ps0, qs1qs0);                                   \
     71     filter = _mm_and_si128(_mm_srli_si128(work, 8), hev);                   \
     72     /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */      \
     73     filter = _mm_subs_epi8(filter, work);                                   \
     74     filter = _mm_subs_epi8(filter, work);                                   \
     75     filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */           \
     76     filter = _mm_and_si128(filter, mask); /* & mask */                      \
     77     filter = _mm_unpacklo_epi64(filter, filter);                            \
     78                                                                             \
     79     /* filter1 = signed_char_clamp(filter + 4) >> 3; */                     \
     80     /* filter2 = signed_char_clamp(filter + 3) >> 3; */                     \
     81     filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */   \
     82     filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);             \
     83     filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);     \
     84     filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */         \
     85     filter = _mm_srai_epi16(filter, 11);                 /* >> 3 */         \
     86     filter2filter1 = _mm_packs_epi16(filter2filter1, filter);               \
     87                                                                             \
     88     /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */                   \
     89     filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */                   \
     90     filter = _mm_unpacklo_epi8(filter, filter);                             \
     91     filter = _mm_srai_epi16(filter, 9); /* round */                         \
     92     filter = _mm_packs_epi16(filter, filter);                               \
     93     filter = _mm_andnot_si128(hev, filter);                                 \
     94                                                                             \
     95     hev = _mm_unpackhi_epi64(filter2filter1, filter);                       \
     96     filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);            \
     97                                                                             \
     98     /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */ \
     99     qs1qs0 = _mm_subs_epi8(qs1qs0, filter2filter1);                         \
    100     /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */ \
    101     ps1ps0 = _mm_adds_epi8(ps1ps0, hev);                                    \
    102     qs1qs0 = _mm_xor_si128(qs1qs0, t80); /* ^ 0x80 */                       \
    103     ps1ps0 = _mm_xor_si128(ps1ps0, t80); /* ^ 0x80 */                       \
    104   } while (0)
    105 
    106 void vpx_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
    107                                const uint8_t *_blimit, const uint8_t *_limit,
    108                                const uint8_t *_thresh) {
    109   const __m128i zero = _mm_set1_epi16(0);
    110   const __m128i limit =
    111       _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
    112                          _mm_loadl_epi64((const __m128i *)_limit));
    113   const __m128i thresh =
    114       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
    115   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    116   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
    117   __m128i mask, hev;
    118 
    119   p3p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
    120                             _mm_loadl_epi64((__m128i *)(s - 4 * p)));
    121   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
    122                             _mm_loadl_epi64((__m128i *)(s + 1 * p)));
    123   q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
    124                             _mm_loadl_epi64((__m128i *)(s + 0 * p)));
    125   q3q2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s + 2 * p)),
    126                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));
    127   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
    128   p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
    129   q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
    130   q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
    131 
    132   FILTER_HEV_MASK;
    133   FILTER4;
    134 
    135   _mm_storeh_pi((__m64 *)(s - 2 * p), _mm_castsi128_ps(ps1ps0));  // *op1
    136   _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);               // *op0
    137   _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);               // *oq0
    138   _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(qs1qs0));  // *oq1
    139 }
    140 
    141 void vpx_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
    142                              const uint8_t *_blimit, const uint8_t *_limit,
    143                              const uint8_t *_thresh) {
    144   const __m128i zero = _mm_set1_epi16(0);
    145   const __m128i limit =
    146       _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)_blimit),
    147                          _mm_loadl_epi64((const __m128i *)_limit));
    148   const __m128i thresh =
    149       _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
    150   const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    151   __m128i x0, x1, x2, x3;
    152   __m128i q1p1, q0p0, p3p2, p2p1, p1p0, q3q2, q2q1, q1q0, ps1ps0, qs1qs0;
    153   __m128i mask, hev;
    154 
    155   // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    156   q1q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 0 * p - 4)),
    157                            _mm_loadl_epi64((__m128i *)(s + 1 * p - 4)));
    158 
    159   // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    160   x1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 2 * p - 4)),
    161                          _mm_loadl_epi64((__m128i *)(s + 3 * p - 4)));
    162 
    163   // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    164   x2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 4 * p - 4)),
    165                          _mm_loadl_epi64((__m128i *)(s + 5 * p - 4)));
    166 
    167   // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    168   x3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(s + 6 * p - 4)),
    169                          _mm_loadl_epi64((__m128i *)(s + 7 * p - 4)));
    170 
    171   // Transpose 8x8
    172   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
    173   p1p0 = _mm_unpacklo_epi16(q1q0, x1);
    174   // 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
    175   x0 = _mm_unpacklo_epi16(x2, x3);
    176   // 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
    177   p3p2 = _mm_unpacklo_epi32(p1p0, x0);
    178   // 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
    179   p1p0 = _mm_unpackhi_epi32(p1p0, x0);
    180   p3p2 = _mm_unpackhi_epi64(p3p2, _mm_slli_si128(p3p2, 8));  // swap lo and high
    181   p1p0 = _mm_unpackhi_epi64(p1p0, _mm_slli_si128(p1p0, 8));  // swap lo and high
    182 
    183   // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
    184   q1q0 = _mm_unpackhi_epi16(q1q0, x1);
    185   // 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
    186   x2 = _mm_unpackhi_epi16(x2, x3);
    187   // 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
    188   q3q2 = _mm_unpackhi_epi32(q1q0, x2);
    189   // 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
    190   q1q0 = _mm_unpacklo_epi32(q1q0, x2);
    191 
    192   q0p0 = _mm_unpacklo_epi64(p1p0, q1q0);
    193   q1p1 = _mm_unpackhi_epi64(p1p0, q1q0);
    194   p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
    195   p2p1 = _mm_unpacklo_epi64(q1p1, p3p2);
    196   q2q1 = _mm_unpacklo_epi64(_mm_srli_si128(q1p1, 8), q3q2);
    197 
    198   FILTER_HEV_MASK;
    199   FILTER4;
    200 
    201   // Transpose 8x4 to 4x8
    202   // qs1qs0: 20 21 22 23 24 25 26 27  30 31 32 33 34 34 36 37
    203   // ps1ps0: 10 11 12 13 14 15 16 17  00 01 02 03 04 05 06 07
    204   // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
    205   ps1ps0 = _mm_unpackhi_epi64(ps1ps0, _mm_slli_si128(ps1ps0, 8));
    206   // 10 30 11 31 12 32 13 33  14 34 15 35 16 36 17 37
    207   x0 = _mm_unpackhi_epi8(ps1ps0, qs1qs0);
    208   // 00 20 01 21 02 22 03 23  04 24 05 25 06 26 07 27
    209   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, qs1qs0);
    210   // 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
    211   qs1qs0 = _mm_unpackhi_epi8(ps1ps0, x0);
    212   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
    213   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
    214 
    215   *(int *)(s + 0 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
    216   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
    217   *(int *)(s + 1 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
    218   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
    219   *(int *)(s + 2 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
    220   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
    221   *(int *)(s + 3 * p - 2) = _mm_cvtsi128_si32(ps1ps0);
    222 
    223   *(int *)(s + 4 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
    224   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
    225   *(int *)(s + 5 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
    226   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
    227   *(int *)(s + 6 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
    228   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
    229   *(int *)(s + 7 * p - 2) = _mm_cvtsi128_si32(qs1qs0);
    230 }
    231 
    232 void vpx_lpf_horizontal_16_sse2(unsigned char *s, int p,
    233                                 const unsigned char *_blimit,
    234                                 const unsigned char *_limit,
    235                                 const unsigned char *_thresh) {
    236   const __m128i zero = _mm_set1_epi16(0);
    237   const __m128i one = _mm_set1_epi8(1);
    238   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
    239   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
    240   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
    241   __m128i mask, hev, flat, flat2;
    242   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
    243   __m128i abs_p1p0;
    244 
    245   q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
    246   q4p4 = _mm_castps_si128(
    247       _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
    248   q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
    249   q3p3 = _mm_castps_si128(
    250       _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
    251   q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
    252   q2p2 = _mm_castps_si128(
    253       _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
    254   q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
    255   q1p1 = _mm_castps_si128(
    256       _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
    257   p1q1 = _mm_shuffle_epi32(q1p1, 78);
    258   q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
    259   q0p0 = _mm_castps_si128(
    260       _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
    261   p0q0 = _mm_shuffle_epi32(q0p0, 78);
    262 
    263   {
    264     __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    265     abs_p1p0 = abs_diff(q1p1, q0p0);
    266     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    267     fe = _mm_set1_epi8(0xfe);
    268     ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    269     abs_p0q0 = abs_diff(q0p0, p0q0);
    270     abs_p1q1 = abs_diff(q1p1, p1q1);
    271     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    272     hev = _mm_subs_epu8(flat, thresh);
    273     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    274 
    275     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    276     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    277     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    278     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    279     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    280     mask = _mm_max_epu8(abs_p1p0, mask);
    281     // mask |= (abs(p1 - p0) > limit) * -1;
    282     // mask |= (abs(q1 - q0) > limit) * -1;
    283 
    284     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    285     mask = _mm_max_epu8(work, mask);
    286     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    287     mask = _mm_subs_epu8(mask, limit);
    288     mask = _mm_cmpeq_epi8(mask, zero);
    289   }
    290 
    291   // lp filter
    292   {
    293     const __m128i t4 = _mm_set1_epi8(4);
    294     const __m128i t3 = _mm_set1_epi8(3);
    295     const __m128i t80 = _mm_set1_epi8(0x80);
    296     const __m128i t1 = _mm_set1_epi16(0x1);
    297     __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    298     __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    299     __m128i qs0 = _mm_xor_si128(p0q0, t80);
    300     __m128i qs1 = _mm_xor_si128(p1q1, t80);
    301     __m128i filt;
    302     __m128i work_a;
    303     __m128i filter1, filter2;
    304     __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    305     __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
    306 
    307     filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    308     work_a = _mm_subs_epi8(qs0, qs0ps0);
    309     filt = _mm_adds_epi8(filt, work_a);
    310     filt = _mm_adds_epi8(filt, work_a);
    311     filt = _mm_adds_epi8(filt, work_a);
    312     // (vpx_filter + 3 * (qs0 - ps0)) & mask
    313     filt = _mm_and_si128(filt, mask);
    314 
    315     filter1 = _mm_adds_epi8(filt, t4);
    316     filter2 = _mm_adds_epi8(filt, t3);
    317 
    318     filter1 = _mm_unpacklo_epi8(zero, filter1);
    319     filter1 = _mm_srai_epi16(filter1, 0xB);
    320     filter2 = _mm_unpacklo_epi8(zero, filter2);
    321     filter2 = _mm_srai_epi16(filter2, 0xB);
    322 
    323     // Filter1 >> 3
    324     filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    325     qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);
    326 
    327     // filt >> 1
    328     filt = _mm_adds_epi16(filter1, t1);
    329     filt = _mm_srai_epi16(filt, 1);
    330     filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
    331                             filt);
    332     filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    333     qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    334     // loopfilter done
    335 
    336     {
    337       __m128i work;
    338       flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
    339       flat = _mm_max_epu8(abs_p1p0, flat);
    340       flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
    341       flat = _mm_subs_epu8(flat, one);
    342       flat = _mm_cmpeq_epi8(flat, zero);
    343       flat = _mm_and_si128(flat, mask);
    344 
    345       q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
    346       q5p5 = _mm_castps_si128(
    347           _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));
    348 
    349       q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
    350       q6p6 = _mm_castps_si128(
    351           _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));
    352       flat2 = _mm_max_epu8(abs_diff(q4p4, q0p0), abs_diff(q5p5, q0p0));
    353 
    354       q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
    355       q7p7 = _mm_castps_si128(
    356           _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));
    357       work = _mm_max_epu8(abs_diff(q6p6, q0p0), abs_diff(q7p7, q0p0));
    358       flat2 = _mm_max_epu8(work, flat2);
    359       flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
    360       flat2 = _mm_subs_epu8(flat2, one);
    361       flat2 = _mm_cmpeq_epi8(flat2, zero);
    362       flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    363     }
    364 
    365     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    366     // flat and wide flat calculations
    367     {
    368       const __m128i eight = _mm_set1_epi16(8);
    369       const __m128i four = _mm_set1_epi16(4);
    370       __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
    371       __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
    372       __m128i pixelFilter_p, pixelFilter_q;
    373       __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
    374       __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
    375 
    376       p7_16 = _mm_unpacklo_epi8(q7p7, zero);
    377       p6_16 = _mm_unpacklo_epi8(q6p6, zero);
    378       p5_16 = _mm_unpacklo_epi8(q5p5, zero);
    379       p4_16 = _mm_unpacklo_epi8(q4p4, zero);
    380       p3_16 = _mm_unpacklo_epi8(q3p3, zero);
    381       p2_16 = _mm_unpacklo_epi8(q2p2, zero);
    382       p1_16 = _mm_unpacklo_epi8(q1p1, zero);
    383       p0_16 = _mm_unpacklo_epi8(q0p0, zero);
    384       q0_16 = _mm_unpackhi_epi8(q0p0, zero);
    385       q1_16 = _mm_unpackhi_epi8(q1p1, zero);
    386       q2_16 = _mm_unpackhi_epi8(q2p2, zero);
    387       q3_16 = _mm_unpackhi_epi8(q3p3, zero);
    388       q4_16 = _mm_unpackhi_epi8(q4p4, zero);
    389       q5_16 = _mm_unpackhi_epi8(q5p5, zero);
    390       q6_16 = _mm_unpackhi_epi8(q6p6, zero);
    391       q7_16 = _mm_unpackhi_epi8(q7p7, zero);
    392 
    393       pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
    394                                     _mm_add_epi16(p4_16, p3_16));
    395       pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
    396                                     _mm_add_epi16(q4_16, q3_16));
    397 
    398       pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
    399       pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
    400 
    401       pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
    402       pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
    403       pixelFilter_p =
    404           _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
    405       pixetFilter_p2p1p0 = _mm_add_epi16(
    406           four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
    407       res_p = _mm_srli_epi16(
    408           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
    409       res_q = _mm_srli_epi16(
    410           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
    411       flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
    412       res_p = _mm_srli_epi16(
    413           _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
    414       res_q = _mm_srli_epi16(
    415           _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
    416 
    417       flat_q0p0 = _mm_packus_epi16(res_p, res_q);
    418 
    419       sum_p7 = _mm_add_epi16(p7_16, p7_16);
    420       sum_q7 = _mm_add_epi16(q7_16, q7_16);
    421       sum_p3 = _mm_add_epi16(p3_16, p3_16);
    422       sum_q3 = _mm_add_epi16(q3_16, q3_16);
    423 
    424       pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
    425       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
    426       res_p = _mm_srli_epi16(
    427           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
    428       res_q = _mm_srli_epi16(
    429           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
    430       flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
    431 
    432       pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
    433       pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
    434       res_p = _mm_srli_epi16(
    435           _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
    436       res_q = _mm_srli_epi16(
    437           _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
    438       flat_q1p1 = _mm_packus_epi16(res_p, res_q);
    439 
    440       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
    441       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
    442       sum_p3 = _mm_add_epi16(sum_p3, p3_16);
    443       sum_q3 = _mm_add_epi16(sum_q3, q3_16);
    444 
    445       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
    446       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
    447       res_p = _mm_srli_epi16(
    448           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
    449       res_q = _mm_srli_epi16(
    450           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
    451       flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
    452 
    453       pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
    454       pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
    455 
    456       res_p = _mm_srli_epi16(
    457           _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
    458       res_q = _mm_srli_epi16(
    459           _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
    460       flat_q2p2 = _mm_packus_epi16(res_p, res_q);
    461 
    462       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
    463       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
    464       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
    465       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
    466       res_p = _mm_srli_epi16(
    467           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
    468       res_q = _mm_srli_epi16(
    469           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
    470       flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
    471 
    472       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
    473       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
    474       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
    475       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
    476       res_p = _mm_srli_epi16(
    477           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
    478       res_q = _mm_srli_epi16(
    479           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
    480       flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
    481 
    482       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
    483       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
    484       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
    485       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
    486       res_p = _mm_srli_epi16(
    487           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
    488       res_q = _mm_srli_epi16(
    489           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
    490       flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
    491 
    492       sum_p7 = _mm_add_epi16(sum_p7, p7_16);
    493       sum_q7 = _mm_add_epi16(sum_q7, q7_16);
    494       pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
    495       pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
    496       res_p = _mm_srli_epi16(
    497           _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
    498       res_q = _mm_srli_epi16(
    499           _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
    500       flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    501     }
    502     // wide flat
    503     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    504 
    505     flat = _mm_shuffle_epi32(flat, 68);
    506     flat2 = _mm_shuffle_epi32(flat2, 68);
    507 
    508     q2p2 = _mm_andnot_si128(flat, q2p2);
    509     flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    510     q2p2 = _mm_or_si128(q2p2, flat_q2p2);
    511 
    512     qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    513     flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    514     q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
    515 
    516     qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    517     flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    518     q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
    519 
    520     q6p6 = _mm_andnot_si128(flat2, q6p6);
    521     flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    522     q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    523     _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    524     _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));
    525 
    526     q5p5 = _mm_andnot_si128(flat2, q5p5);
    527     flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    528     q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    529     _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    530     _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));
    531 
    532     q4p4 = _mm_andnot_si128(flat2, q4p4);
    533     flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    534     q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    535     _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    536     _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));
    537 
    538     q3p3 = _mm_andnot_si128(flat2, q3p3);
    539     flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    540     q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    541     _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    542     _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));
    543 
    544     q2p2 = _mm_andnot_si128(flat2, q2p2);
    545     flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    546     q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    547     _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    548     _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));
    549 
    550     q1p1 = _mm_andnot_si128(flat2, q1p1);
    551     flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    552     q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    553     _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    554     _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));
    555 
    556     q0p0 = _mm_andnot_si128(flat2, q0p0);
    557     flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    558     q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    559     _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    560     _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
    561   }
    562 }
    563 
    564 static INLINE __m128i filter_add2_sub2(const __m128i *const total,
    565                                        const __m128i *const a1,
    566                                        const __m128i *const a2,
    567                                        const __m128i *const s1,
    568                                        const __m128i *const s2) {
    569   __m128i x = _mm_add_epi16(*a1, *total);
    570   x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
    571   return x;
    572 }
    573 
    574 static INLINE __m128i filter8_mask(const __m128i *const flat,
    575                                    const __m128i *const other_filt,
    576                                    const __m128i *const f8_lo,
    577                                    const __m128i *const f8_hi) {
    578   const __m128i f8 =
    579       _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
    580   const __m128i result = _mm_and_si128(*flat, f8);
    581   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
    582 }
    583 
    584 static INLINE __m128i filter16_mask(const __m128i *const flat,
    585                                     const __m128i *const other_filt,
    586                                     const __m128i *const f_lo,
    587                                     const __m128i *const f_hi) {
    588   const __m128i f =
    589       _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
    590   const __m128i result = _mm_and_si128(*flat, f);
    591   return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
    592 }
    593 
    594 void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int p,
    595                                      const unsigned char *_blimit,
    596                                      const unsigned char *_limit,
    597                                      const unsigned char *_thresh) {
    598   const __m128i zero = _mm_set1_epi16(0);
    599   const __m128i one = _mm_set1_epi8(1);
    600   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
    601   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
    602   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
    603   __m128i mask, hev, flat, flat2;
    604   __m128i p7, p6, p5;
    605   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
    606   __m128i q5, q6, q7;
    607 
    608   __m128i op2, op1, op0, oq0, oq1, oq2;
    609 
    610   __m128i max_abs_p1p0q1q0;
    611 
    612   p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
    613   p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
    614   p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
    615   p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
    616   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
    617   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
    618   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
    619   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
    620   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
    621   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
    622   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
    623   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
    624   q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
    625   q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
    626   q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
    627   q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
    628 
    629   {
    630     const __m128i abs_p1p0 = abs_diff(p1, p0);
    631     const __m128i abs_q1q0 = abs_diff(q1, q0);
    632     const __m128i fe = _mm_set1_epi8(0xfe);
    633     const __m128i ff = _mm_cmpeq_epi8(zero, zero);
    634     __m128i abs_p0q0 = abs_diff(p0, q0);
    635     __m128i abs_p1q1 = abs_diff(p1, q1);
    636     __m128i work;
    637     max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
    638 
    639     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    640     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    641     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    642     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    643     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    644     mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
    645     // mask |= (abs(p1 - p0) > limit) * -1;
    646     // mask |= (abs(q1 - q0) > limit) * -1;
    647     work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
    648     mask = _mm_max_epu8(work, mask);
    649     work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
    650     mask = _mm_max_epu8(work, mask);
    651     mask = _mm_subs_epu8(mask, limit);
    652     mask = _mm_cmpeq_epi8(mask, zero);
    653   }
    654 
    655   {
    656     __m128i work;
    657     work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
    658     flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
    659     work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
    660     flat = _mm_max_epu8(work, flat);
    661     work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
    662     flat = _mm_subs_epu8(flat, one);
    663     flat = _mm_cmpeq_epi8(flat, zero);
    664     flat = _mm_and_si128(flat, mask);
    665     flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
    666     flat2 = _mm_max_epu8(work, flat2);
    667     work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
    668     flat2 = _mm_max_epu8(work, flat2);
    669     work = _mm_max_epu8(abs_diff(p7, p0), abs_diff(q7, q0));
    670     flat2 = _mm_max_epu8(work, flat2);
    671     flat2 = _mm_subs_epu8(flat2, one);
    672     flat2 = _mm_cmpeq_epi8(flat2, zero);
    673     flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    674   }
    675 
    676   // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    677   // filter4
    678   {
    679     const __m128i t4 = _mm_set1_epi8(4);
    680     const __m128i t3 = _mm_set1_epi8(3);
    681     const __m128i t80 = _mm_set1_epi8(0x80);
    682     const __m128i te0 = _mm_set1_epi8(0xe0);
    683     const __m128i t1f = _mm_set1_epi8(0x1f);
    684     const __m128i t1 = _mm_set1_epi8(0x1);
    685     const __m128i t7f = _mm_set1_epi8(0x7f);
    686     const __m128i ff = _mm_cmpeq_epi8(t4, t4);
    687 
    688     __m128i filt;
    689     __m128i work_a;
    690     __m128i filter1, filter2;
    691 
    692     op1 = _mm_xor_si128(p1, t80);
    693     op0 = _mm_xor_si128(p0, t80);
    694     oq0 = _mm_xor_si128(q0, t80);
    695     oq1 = _mm_xor_si128(q1, t80);
    696 
    697     hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh);
    698     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    699     filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
    700 
    701     work_a = _mm_subs_epi8(oq0, op0);
    702     filt = _mm_adds_epi8(filt, work_a);
    703     filt = _mm_adds_epi8(filt, work_a);
    704     filt = _mm_adds_epi8(filt, work_a);
    705     // (vpx_filter + 3 * (qs0 - ps0)) & mask
    706     filt = _mm_and_si128(filt, mask);
    707     filter1 = _mm_adds_epi8(filt, t4);
    708     filter2 = _mm_adds_epi8(filt, t3);
    709 
    710     // Filter1 >> 3
    711     work_a = _mm_cmpgt_epi8(zero, filter1);
    712     filter1 = _mm_srli_epi16(filter1, 3);
    713     work_a = _mm_and_si128(work_a, te0);
    714     filter1 = _mm_and_si128(filter1, t1f);
    715     filter1 = _mm_or_si128(filter1, work_a);
    716     oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
    717 
    718     // Filter2 >> 3
    719     work_a = _mm_cmpgt_epi8(zero, filter2);
    720     filter2 = _mm_srli_epi16(filter2, 3);
    721     work_a = _mm_and_si128(work_a, te0);
    722     filter2 = _mm_and_si128(filter2, t1f);
    723     filter2 = _mm_or_si128(filter2, work_a);
    724     op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
    725 
    726     // filt >> 1
    727     filt = _mm_adds_epi8(filter1, t1);
    728     work_a = _mm_cmpgt_epi8(zero, filt);
    729     filt = _mm_srli_epi16(filt, 1);
    730     work_a = _mm_and_si128(work_a, t80);
    731     filt = _mm_and_si128(filt, t7f);
    732     filt = _mm_or_si128(filt, work_a);
    733     filt = _mm_andnot_si128(hev, filt);
    734     op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
    735     oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
    736     // loopfilter done
    737 
    738     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    739     // filter8
    740     {
    741       const __m128i four = _mm_set1_epi16(4);
    742       const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
    743       const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
    744       const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
    745       const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
    746       const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
    747       const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
    748       const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
    749       const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
    750 
    751       const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
    752       const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
    753       const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
    754       const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
    755       const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
    756       const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
    757       const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
    758       const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
    759       __m128i f8_lo, f8_hi;
    760 
    761       f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
    762                             _mm_add_epi16(p3_lo, p2_lo));
    763       f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
    764                             _mm_add_epi16(p2_lo, p1_lo));
    765       f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
    766 
    767       f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
    768                             _mm_add_epi16(p3_hi, p2_hi));
    769       f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
    770                             _mm_add_epi16(p2_hi, p1_hi));
    771       f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
    772 
    773       op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
    774 
    775       f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
    776       f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
    777       op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
    778 
    779       f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
    780       f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
    781       op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
    782 
    783       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
    784       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
    785       oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
    786 
    787       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
    788       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
    789       oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
    790 
    791       f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
    792       f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
    793       oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
    794     }
    795 
    796     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    797     // wide flat calculations
    798     {
    799       const __m128i eight = _mm_set1_epi16(8);
    800       const __m128i p7_lo = _mm_unpacklo_epi8(p7, zero);
    801       const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
    802       const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
    803       const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
    804       const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
    805       const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
    806       const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
    807       const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
    808       const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
    809       const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
    810       const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
    811       const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
    812       const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
    813       const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
    814       const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
    815       const __m128i q7_lo = _mm_unpacklo_epi8(q7, zero);
    816 
    817       const __m128i p7_hi = _mm_unpackhi_epi8(p7, zero);
    818       const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
    819       const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
    820       const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
    821       const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
    822       const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
    823       const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
    824       const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
    825       const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
    826       const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
    827       const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
    828       const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
    829       const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
    830       const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
    831       const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
    832       const __m128i q7_hi = _mm_unpackhi_epi8(q7, zero);
    833 
    834       __m128i f_lo;
    835       __m128i f_hi;
    836 
    837       f_lo = _mm_sub_epi16(_mm_slli_epi16(p7_lo, 3), p7_lo);  // p7 * 7
    838       f_lo =
    839           _mm_add_epi16(_mm_slli_epi16(p6_lo, 1), _mm_add_epi16(p4_lo, f_lo));
    840       f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
    841                            _mm_add_epi16(p2_lo, p1_lo));
    842       f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
    843       f_lo = _mm_add_epi16(_mm_add_epi16(p5_lo, eight), f_lo);
    844 
    845       f_hi = _mm_sub_epi16(_mm_slli_epi16(p7_hi, 3), p7_hi);  // p7 * 7
    846       f_hi =
    847           _mm_add_epi16(_mm_slli_epi16(p6_hi, 1), _mm_add_epi16(p4_hi, f_hi));
    848       f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
    849                            _mm_add_epi16(p2_hi, p1_hi));
    850       f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
    851       f_hi = _mm_add_epi16(_mm_add_epi16(p5_hi, eight), f_hi);
    852 
    853       p6 = filter16_mask(&flat2, &p6, &f_lo, &f_hi);
    854       _mm_storeu_si128((__m128i *)(s - 7 * p), p6);
    855 
    856       f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p5_lo, &p6_lo, &p7_lo);
    857       f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p5_hi, &p6_hi, &p7_hi);
    858       p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
    859       _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
    860 
    861       f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p4_lo, &p5_lo, &p7_lo);
    862       f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p4_hi, &p5_hi, &p7_hi);
    863       p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
    864       _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
    865 
    866       f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p3_lo, &p4_lo, &p7_lo);
    867       f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p3_hi, &p4_hi, &p7_hi);
    868       p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
    869       _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
    870 
    871       f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p2_lo, &p3_lo, &p7_lo);
    872       f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p2_hi, &p3_hi, &p7_hi);
    873       op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
    874       _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
    875 
    876       f_lo = filter_add2_sub2(&f_lo, &q5_lo, &p1_lo, &p2_lo, &p7_lo);
    877       f_hi = filter_add2_sub2(&f_hi, &q5_hi, &p1_hi, &p2_hi, &p7_hi);
    878       op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
    879       _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
    880 
    881       f_lo = filter_add2_sub2(&f_lo, &q6_lo, &p0_lo, &p1_lo, &p7_lo);
    882       f_hi = filter_add2_sub2(&f_hi, &q6_hi, &p0_hi, &p1_hi, &p7_hi);
    883       op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
    884       _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
    885 
    886       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q0_lo, &p0_lo, &p7_lo);
    887       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q0_hi, &p0_hi, &p7_hi);
    888       oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
    889       _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
    890 
    891       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q1_lo, &p6_lo, &q0_lo);
    892       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q1_hi, &p6_hi, &q0_hi);
    893       oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
    894       _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
    895 
    896       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q2_lo, &p5_lo, &q1_lo);
    897       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q2_hi, &p5_hi, &q1_hi);
    898       oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
    899       _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
    900 
    901       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q3_lo, &p4_lo, &q2_lo);
    902       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q3_hi, &p4_hi, &q2_hi);
    903       q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
    904       _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
    905 
    906       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q4_lo, &p3_lo, &q3_lo);
    907       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q4_hi, &p3_hi, &q3_hi);
    908       q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
    909       _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
    910 
    911       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q5_lo, &p2_lo, &q4_lo);
    912       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q5_hi, &p2_hi, &q4_hi);
    913       q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
    914       _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
    915 
    916       f_lo = filter_add2_sub2(&f_lo, &q7_lo, &q6_lo, &p1_lo, &q5_lo);
    917       f_hi = filter_add2_sub2(&f_hi, &q7_hi, &q6_hi, &p1_hi, &q5_hi);
    918       q6 = filter16_mask(&flat2, &q6, &f_lo, &f_hi);
    919       _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
    920     }
    921     // wide flat
    922     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    923   }
    924 }
    925 
    926 void vpx_lpf_horizontal_8_sse2(unsigned char *s, int p,
    927                                const unsigned char *_blimit,
    928                                const unsigned char *_limit,
    929                                const unsigned char *_thresh) {
    930   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
    931   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
    932   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
    933   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
    934   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
    935   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
    936   const __m128i zero = _mm_set1_epi16(0);
    937   const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
    938   const __m128i limit = _mm_load_si128((const __m128i *)_limit);
    939   const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
    940   __m128i mask, hev, flat;
    941   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
    942   __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
    943 
    944   q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
    945                             _mm_loadl_epi64((__m128i *)(s + 3 * p)));
    946   q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
    947                             _mm_loadl_epi64((__m128i *)(s + 2 * p)));
    948   q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
    949                             _mm_loadl_epi64((__m128i *)(s + 1 * p)));
    950   q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
    951                             _mm_loadl_epi64((__m128i *)(s - 0 * p)));
    952   p1q1 = _mm_shuffle_epi32(q1p1, 78);
    953   p0q0 = _mm_shuffle_epi32(q0p0, 78);
    954 
    955   {
    956     // filter_mask and hev_mask
    957     const __m128i one = _mm_set1_epi8(1);
    958     const __m128i fe = _mm_set1_epi8(0xfe);
    959     const __m128i ff = _mm_cmpeq_epi8(fe, fe);
    960     __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
    961     abs_p1p0 = abs_diff(q1p1, q0p0);
    962     abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    963 
    964     abs_p0q0 = abs_diff(q0p0, p0q0);
    965     abs_p1q1 = abs_diff(q1p1, p1q1);
    966     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    967     hev = _mm_subs_epu8(flat, thresh);
    968     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
    969 
    970     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    971     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    972     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    973     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    974     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    975     mask = _mm_max_epu8(abs_p1p0, mask);
    976     // mask |= (abs(p1 - p0) > limit) * -1;
    977     // mask |= (abs(q1 - q0) > limit) * -1;
    978 
    979     work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
    980     mask = _mm_max_epu8(work, mask);
    981     mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    982     mask = _mm_subs_epu8(mask, limit);
    983     mask = _mm_cmpeq_epi8(mask, zero);
    984 
    985     // flat_mask4
    986 
    987     flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
    988     flat = _mm_max_epu8(abs_p1p0, flat);
    989     flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
    990     flat = _mm_subs_epu8(flat, one);
    991     flat = _mm_cmpeq_epi8(flat, zero);
    992     flat = _mm_and_si128(flat, mask);
    993   }
    994 
    995   {
    996     const __m128i four = _mm_set1_epi16(4);
    997     unsigned char *src = s;
    998     {
    999       __m128i workp_a, workp_b, workp_shft;
   1000       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
   1001       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
   1002       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
   1003       p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
   1004       q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
   1005       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
   1006       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
   1007       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
   1008 
   1009       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
   1010       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
   1011       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
   1012       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1013       _mm_storel_epi64((__m128i *)&flat_op2[0],
   1014                        _mm_packus_epi16(workp_shft, workp_shft));
   1015 
   1016       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
   1017       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1018       _mm_storel_epi64((__m128i *)&flat_op1[0],
   1019                        _mm_packus_epi16(workp_shft, workp_shft));
   1020 
   1021       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
   1022       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
   1023       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1024       _mm_storel_epi64((__m128i *)&flat_op0[0],
   1025                        _mm_packus_epi16(workp_shft, workp_shft));
   1026 
   1027       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
   1028       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
   1029       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1030       _mm_storel_epi64((__m128i *)&flat_oq0[0],
   1031                        _mm_packus_epi16(workp_shft, workp_shft));
   1032 
   1033       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
   1034       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
   1035       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1036       _mm_storel_epi64((__m128i *)&flat_oq1[0],
   1037                        _mm_packus_epi16(workp_shft, workp_shft));
   1038 
   1039       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
   1040       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
   1041       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1042       _mm_storel_epi64((__m128i *)&flat_oq2[0],
   1043                        _mm_packus_epi16(workp_shft, workp_shft));
   1044     }
   1045   }
   1046   // lp filter
   1047   {
   1048     const __m128i t4 = _mm_set1_epi8(4);
   1049     const __m128i t3 = _mm_set1_epi8(3);
   1050     const __m128i t80 = _mm_set1_epi8(0x80);
   1051     const __m128i t1 = _mm_set1_epi8(0x1);
   1052     const __m128i ps1 =
   1053         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), t80);
   1054     const __m128i ps0 =
   1055         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), t80);
   1056     const __m128i qs0 =
   1057         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), t80);
   1058     const __m128i qs1 =
   1059         _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), t80);
   1060     __m128i filt;
   1061     __m128i work_a;
   1062     __m128i filter1, filter2;
   1063 
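             // Vector form of the reference filter4() (pixels biased to the
             // signed range via ^ 0x80):
             //   filter  = clamp(ps1 - qs1) & hev;
             //   filter  = clamp(filter + 3 * (qs0 - ps0)) & mask;
             //   Filter1 = clamp(filter + 4) >> 3;  oq0 = clamp(qs0 - Filter1);
             //   Filter2 = clamp(filter + 3) >> 3;  op0 = clamp(ps0 + Filter2);
             //   filter  = ((Filter1 + 1) >> 1) & ~hev;
             //   oq1 = clamp(qs1 - filter);  op1 = clamp(ps1 + filter);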
   1064     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
   1065     work_a = _mm_subs_epi8(qs0, ps0);
   1066     filt = _mm_adds_epi8(filt, work_a);
   1067     filt = _mm_adds_epi8(filt, work_a);
   1068     filt = _mm_adds_epi8(filt, work_a);
   1069     // (vpx_filter + 3 * (qs0 - ps0)) & mask
   1070     filt = _mm_and_si128(filt, mask);
   1071 
   1072     filter1 = _mm_adds_epi8(filt, t4);
   1073     filter2 = _mm_adds_epi8(filt, t3);
   1074 
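             // SSE2 has no 8-bit arithmetic shift, so each signed byte is
             // placed in the high byte of a 16-bit lane (zero in the low
             // byte), shifted right arithmetically by 8 + 3, and packed back
             // down with signed saturation.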
   1075     // Filter1 >> 3
   1076     filter1 = _mm_unpacklo_epi8(zero, filter1);
   1077     filter1 = _mm_srai_epi16(filter1, 11);
   1078     filter1 = _mm_packs_epi16(filter1, filter1);
   1079 
   1080     // Filter2 >> 3
   1081     filter2 = _mm_unpacklo_epi8(zero, filter2);
   1082     filter2 = _mm_srai_epi16(filter2, 11);
   1083     filter2 = _mm_packs_epi16(filter2, zero);
   1084 
   1085     // filt >> 1
   1086     filt = _mm_adds_epi8(filter1, t1);
   1087     filt = _mm_unpacklo_epi8(zero, filt);
   1088     filt = _mm_srai_epi16(filt, 9);
   1089     filt = _mm_packs_epi16(filt, zero);
   1090 
   1091     filt = _mm_andnot_si128(hev, filt);
   1092 
   1093     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
   1094     q0 = _mm_loadl_epi64((__m128i *)flat_oq0);
   1095     work_a = _mm_andnot_si128(flat, work_a);
   1096     q0 = _mm_and_si128(flat, q0);
   1097     q0 = _mm_or_si128(work_a, q0);
   1098 
   1099     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
   1100     q1 = _mm_loadl_epi64((__m128i *)flat_oq1);
   1101     work_a = _mm_andnot_si128(flat, work_a);
   1102     q1 = _mm_and_si128(flat, q1);
   1103     q1 = _mm_or_si128(work_a, q1);
   1104 
   1105     work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
   1106     q2 = _mm_loadl_epi64((__m128i *)flat_oq2);
   1107     work_a = _mm_andnot_si128(flat, work_a);
   1108     q2 = _mm_and_si128(flat, q2);
   1109     q2 = _mm_or_si128(work_a, q2);
   1110 
   1111     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
   1112     p0 = _mm_loadl_epi64((__m128i *)flat_op0);
   1113     work_a = _mm_andnot_si128(flat, work_a);
   1114     p0 = _mm_and_si128(flat, p0);
   1115     p0 = _mm_or_si128(work_a, p0);
   1116 
   1117     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
   1118     p1 = _mm_loadl_epi64((__m128i *)flat_op1);
   1119     work_a = _mm_andnot_si128(flat, work_a);
   1120     p1 = _mm_and_si128(flat, p1);
   1121     p1 = _mm_or_si128(work_a, p1);
   1122 
   1123     work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
   1124     p2 = _mm_loadl_epi64((__m128i *)flat_op2);
   1125     work_a = _mm_andnot_si128(flat, work_a);
   1126     p2 = _mm_and_si128(flat, p2);
   1127     p2 = _mm_or_si128(work_a, p2);
   1128 
   1129     _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
   1130     _mm_storel_epi64((__m128i *)(s - 2 * p), p1);
   1131     _mm_storel_epi64((__m128i *)(s - 1 * p), p0);
   1132     _mm_storel_epi64((__m128i *)(s + 0 * p), q0);
   1133     _mm_storel_epi64((__m128i *)(s + 1 * p), q1);
   1134     _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
   1135   }
   1136 }
   1137 
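         // Filters two adjacent 8-pixel horizontal edges in one pass: the
         // blimit/limit/thresh values of the two edges are packed into the low
         // and high 64 bits of each constant, so a full 16-pixel row is
         // processed per operation.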
   1138 void vpx_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
   1139                                     const uint8_t *_limit0,
   1140                                     const uint8_t *_thresh0,
   1141                                     const uint8_t *_blimit1,
   1142                                     const uint8_t *_limit1,
   1143                                     const uint8_t *_thresh1) {
   1144   DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
   1145   DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
   1146   DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
   1147   DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
   1148   DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
   1149   DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
    1150   const __m128i zero = _mm_setzero_si128();
   1151   const __m128i blimit =
   1152       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
   1153                          _mm_load_si128((const __m128i *)_blimit1));
   1154   const __m128i limit =
   1155       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
   1156                          _mm_load_si128((const __m128i *)_limit1));
   1157   const __m128i thresh =
   1158       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
   1159                          _mm_load_si128((const __m128i *)_thresh1));
   1160 
   1161   __m128i mask, hev, flat;
   1162   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   1163 
   1164   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
   1165   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
   1166   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
   1167   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
   1168   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
   1169   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
   1170   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
   1171   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
   1172   {
   1173     const __m128i abs_p1p0 =
   1174         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
   1175     const __m128i abs_q1q0 =
   1176         _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
   1177     const __m128i one = _mm_set1_epi8(1);
   1178     const __m128i fe = _mm_set1_epi8(0xfe);
   1179     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
   1180     __m128i abs_p0q0 =
   1181         _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
   1182     __m128i abs_p1q1 =
   1183         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
   1184     __m128i work;
   1185 
   1186     // filter_mask and hev_mask
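             // mask is ~0 where the edge is weak enough to filter at all:
             // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit and every
             // neighbouring difference out to p3/q3 is <= limit. hev is ~0
             // where abs(p1 - p0) or abs(q1 - q0) exceeds thresh; those
             // high-variance pixels keep p1/q1 unfiltered.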
   1187     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
   1188     hev = _mm_subs_epu8(flat, thresh);
   1189     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
   1190 
   1191     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
   1192     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
   1193     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
   1194     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
   1195     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
   1196     mask = _mm_max_epu8(flat, mask);
   1197     // mask |= (abs(p1 - p0) > limit) * -1;
   1198     // mask |= (abs(q1 - q0) > limit) * -1;
   1199     work = _mm_max_epu8(
   1200         _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
   1201         _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
   1202     mask = _mm_max_epu8(work, mask);
   1203     work = _mm_max_epu8(
   1204         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
   1205         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
   1206     mask = _mm_max_epu8(work, mask);
   1207     mask = _mm_subs_epu8(mask, limit);
   1208     mask = _mm_cmpeq_epi8(mask, zero);
   1209 
   1210     // flat_mask4
   1211     work = _mm_max_epu8(
   1212         _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
   1213         _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
   1214     flat = _mm_max_epu8(work, flat);
   1215     work = _mm_max_epu8(
   1216         _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
   1217         _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
   1218     flat = _mm_max_epu8(work, flat);
   1219     flat = _mm_subs_epu8(flat, one);
   1220     flat = _mm_cmpeq_epi8(flat, zero);
   1221     flat = _mm_and_si128(flat, mask);
   1222   }
   1223   {
   1224     const __m128i four = _mm_set1_epi16(4);
   1225     unsigned char *src = s;
   1226     int i = 0;
   1227 
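             // Two passes, one per 8-pixel half: each half is widened to
             // 16-bit lanes, run through the same incremental filter8 sums as
             // the single-edge version, and packed into its half of the
             // flat_* buffers.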
   1228     do {
   1229       __m128i workp_a, workp_b, workp_shft;
   1230       p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
   1231       p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
   1232       p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
   1233       p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
   1234       q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
   1235       q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
   1236       q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
   1237       q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
   1238 
   1239       workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
   1240       workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
   1241       workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
   1242       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1243       _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
   1244                        _mm_packus_epi16(workp_shft, workp_shft));
   1245 
   1246       workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
   1247       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1248       _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
   1249                        _mm_packus_epi16(workp_shft, workp_shft));
   1250 
   1251       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
   1252       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
   1253       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1254       _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
   1255                        _mm_packus_epi16(workp_shft, workp_shft));
   1256 
   1257       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
   1258       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
   1259       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1260       _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
   1261                        _mm_packus_epi16(workp_shft, workp_shft));
   1262 
   1263       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
   1264       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
   1265       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1266       _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
   1267                        _mm_packus_epi16(workp_shft, workp_shft));
   1268 
   1269       workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
   1270       workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
   1271       workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
   1272       _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
   1273                        _mm_packus_epi16(workp_shft, workp_shft));
   1274 
   1275       src += 8;
   1276     } while (++i < 2);
   1277   }
   1278   // lp filter
   1279   {
   1280     const __m128i t4 = _mm_set1_epi8(4);
   1281     const __m128i t3 = _mm_set1_epi8(3);
   1282     const __m128i t80 = _mm_set1_epi8(0x80);
   1283     const __m128i te0 = _mm_set1_epi8(0xe0);
   1284     const __m128i t1f = _mm_set1_epi8(0x1f);
   1285     const __m128i t1 = _mm_set1_epi8(0x1);
   1286     const __m128i t7f = _mm_set1_epi8(0x7f);
   1287 
   1288     const __m128i ps1 =
   1289         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
   1290     const __m128i ps0 =
   1291         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
   1292     const __m128i qs0 =
   1293         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
   1294     const __m128i qs1 =
   1295         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
   1296     __m128i filt;
   1297     __m128i work_a;
   1298     __m128i filter1, filter2;
   1299 
   1300     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
   1301     work_a = _mm_subs_epi8(qs0, ps0);
   1302     filt = _mm_adds_epi8(filt, work_a);
   1303     filt = _mm_adds_epi8(filt, work_a);
   1304     filt = _mm_adds_epi8(filt, work_a);
   1305     // (vpx_filter + 3 * (qs0 - ps0)) & mask
   1306     filt = _mm_and_si128(filt, mask);
   1307 
   1308     filter1 = _mm_adds_epi8(filt, t4);
   1309     filter2 = _mm_adds_epi8(filt, t3);
   1310 
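             // Per-byte signed >> 3 emulated with 16-bit shifts: shift the
             // whole lane logically, mask off the bits that leaked in from the
             // neighbouring byte (t1f), then OR the sign bits back in for
             // lanes that were negative (te0). The >> 1 further down uses the
             // same trick with t7f/t80.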
   1311     // Filter1 >> 3
   1312     work_a = _mm_cmpgt_epi8(zero, filter1);
   1313     filter1 = _mm_srli_epi16(filter1, 3);
   1314     work_a = _mm_and_si128(work_a, te0);
   1315     filter1 = _mm_and_si128(filter1, t1f);
   1316     filter1 = _mm_or_si128(filter1, work_a);
   1317 
   1318     // Filter2 >> 3
   1319     work_a = _mm_cmpgt_epi8(zero, filter2);
   1320     filter2 = _mm_srli_epi16(filter2, 3);
   1321     work_a = _mm_and_si128(work_a, te0);
   1322     filter2 = _mm_and_si128(filter2, t1f);
   1323     filter2 = _mm_or_si128(filter2, work_a);
   1324 
   1325     // filt >> 1
   1326     filt = _mm_adds_epi8(filter1, t1);
   1327     work_a = _mm_cmpgt_epi8(zero, filt);
   1328     filt = _mm_srli_epi16(filt, 1);
   1329     work_a = _mm_and_si128(work_a, t80);
   1330     filt = _mm_and_si128(filt, t7f);
   1331     filt = _mm_or_si128(filt, work_a);
   1332 
   1333     filt = _mm_andnot_si128(hev, filt);
   1334 
   1335     work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
   1336     q0 = _mm_load_si128((__m128i *)flat_oq0);
   1337     work_a = _mm_andnot_si128(flat, work_a);
   1338     q0 = _mm_and_si128(flat, q0);
   1339     q0 = _mm_or_si128(work_a, q0);
   1340 
   1341     work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
   1342     q1 = _mm_load_si128((__m128i *)flat_oq1);
   1343     work_a = _mm_andnot_si128(flat, work_a);
   1344     q1 = _mm_and_si128(flat, q1);
   1345     q1 = _mm_or_si128(work_a, q1);
   1346 
   1347     work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
   1348     q2 = _mm_load_si128((__m128i *)flat_oq2);
   1349     work_a = _mm_andnot_si128(flat, work_a);
   1350     q2 = _mm_and_si128(flat, q2);
   1351     q2 = _mm_or_si128(work_a, q2);
   1352 
   1353     work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
   1354     p0 = _mm_load_si128((__m128i *)flat_op0);
   1355     work_a = _mm_andnot_si128(flat, work_a);
   1356     p0 = _mm_and_si128(flat, p0);
   1357     p0 = _mm_or_si128(work_a, p0);
   1358 
   1359     work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
   1360     p1 = _mm_load_si128((__m128i *)flat_op1);
   1361     work_a = _mm_andnot_si128(flat, work_a);
   1362     p1 = _mm_and_si128(flat, p1);
   1363     p1 = _mm_or_si128(work_a, p1);
   1364 
   1365     work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
   1366     p2 = _mm_load_si128((__m128i *)flat_op2);
   1367     work_a = _mm_andnot_si128(flat, work_a);
   1368     p2 = _mm_and_si128(flat, p2);
   1369     p2 = _mm_or_si128(work_a, p2);
   1370 
   1371     _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
   1372     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
   1373     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
   1374     _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
   1375     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
   1376     _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
   1377   }
   1378 }
   1379 
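         // 4-tap-only dual filter: the same mask/hev computation and filter4
         // arithmetic as above, but with no flat mask and no filter8 path, so
         // only p1/p0/q0/q1 of the two 8-pixel edges are modified.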
   1380 void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
   1381                                     const unsigned char *_blimit0,
   1382                                     const unsigned char *_limit0,
   1383                                     const unsigned char *_thresh0,
   1384                                     const unsigned char *_blimit1,
   1385                                     const unsigned char *_limit1,
   1386                                     const unsigned char *_thresh1) {
   1387   const __m128i blimit =
   1388       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
   1389                          _mm_load_si128((const __m128i *)_blimit1));
   1390   const __m128i limit =
   1391       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
   1392                          _mm_load_si128((const __m128i *)_limit1));
   1393   const __m128i thresh =
   1394       _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
   1395                          _mm_load_si128((const __m128i *)_thresh1));
    1396   const __m128i zero = _mm_setzero_si128();
   1397   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
   1398   __m128i mask, hev, flat;
   1399 
   1400   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
   1401   p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
   1402   p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
   1403   p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
   1404   q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
   1405   q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
   1406   q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
   1407   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
   1408 
   1409   // filter_mask and hev_mask
   1410   {
   1411     const __m128i abs_p1p0 =
   1412         _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
   1413     const __m128i abs_q1q0 =
   1414         _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
   1415     const __m128i fe = _mm_set1_epi8(0xfe);
   1416     const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
   1417     __m128i abs_p0q0 =
   1418         _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
   1419     __m128i abs_p1q1 =
   1420         _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
   1421     __m128i work;
   1422 
   1423     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
   1424     hev = _mm_subs_epu8(flat, thresh);
   1425     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
   1426 
   1427     abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
   1428     abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
   1429     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
   1430     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
   1431     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
   1432     mask = _mm_max_epu8(flat, mask);
   1433     // mask |= (abs(p1 - p0) > limit) * -1;
   1434     // mask |= (abs(q1 - q0) > limit) * -1;
   1435     work = _mm_max_epu8(
   1436         _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
   1437         _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
   1438     mask = _mm_max_epu8(work, mask);
   1439     work = _mm_max_epu8(
   1440         _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
   1441         _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
   1442     mask = _mm_max_epu8(work, mask);
   1443     mask = _mm_subs_epu8(mask, limit);
   1444     mask = _mm_cmpeq_epi8(mask, zero);
   1445   }
   1446 
   1447   // filter4
   1448   {
   1449     const __m128i t4 = _mm_set1_epi8(4);
   1450     const __m128i t3 = _mm_set1_epi8(3);
   1451     const __m128i t80 = _mm_set1_epi8(0x80);
   1452     const __m128i te0 = _mm_set1_epi8(0xe0);
   1453     const __m128i t1f = _mm_set1_epi8(0x1f);
   1454     const __m128i t1 = _mm_set1_epi8(0x1);
   1455     const __m128i t7f = _mm_set1_epi8(0x7f);
   1456 
   1457     const __m128i ps1 =
   1458         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
   1459     const __m128i ps0 =
   1460         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
   1461     const __m128i qs0 =
   1462         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
   1463     const __m128i qs1 =
   1464         _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);
   1465     __m128i filt;
   1466     __m128i work_a;
   1467     __m128i filter1, filter2;
   1468 
   1469     filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
   1470     work_a = _mm_subs_epi8(qs0, ps0);
   1471     filt = _mm_adds_epi8(filt, work_a);
   1472     filt = _mm_adds_epi8(filt, work_a);
   1473     filt = _mm_adds_epi8(filt, work_a);
   1474     // (vpx_filter + 3 * (qs0 - ps0)) & mask
   1475     filt = _mm_and_si128(filt, mask);
   1476 
   1477     filter1 = _mm_adds_epi8(filt, t4);
   1478     filter2 = _mm_adds_epi8(filt, t3);
   1479 
   1480     // Filter1 >> 3
   1481     work_a = _mm_cmpgt_epi8(zero, filter1);
   1482     filter1 = _mm_srli_epi16(filter1, 3);
   1483     work_a = _mm_and_si128(work_a, te0);
   1484     filter1 = _mm_and_si128(filter1, t1f);
   1485     filter1 = _mm_or_si128(filter1, work_a);
   1486 
   1487     // Filter2 >> 3
   1488     work_a = _mm_cmpgt_epi8(zero, filter2);
   1489     filter2 = _mm_srli_epi16(filter2, 3);
   1490     work_a = _mm_and_si128(work_a, te0);
   1491     filter2 = _mm_and_si128(filter2, t1f);
   1492     filter2 = _mm_or_si128(filter2, work_a);
   1493 
   1494     // filt >> 1
   1495     filt = _mm_adds_epi8(filter1, t1);
   1496     work_a = _mm_cmpgt_epi8(zero, filt);
   1497     filt = _mm_srli_epi16(filt, 1);
   1498     work_a = _mm_and_si128(work_a, t80);
   1499     filt = _mm_and_si128(filt, t7f);
   1500     filt = _mm_or_si128(filt, work_a);
   1501 
   1502     filt = _mm_andnot_si128(hev, filt);
   1503 
   1504     q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
   1505     q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
   1506     p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
   1507     p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
   1508 
   1509     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
   1510     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
   1511     _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
   1512     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
   1513   }
   1514 }
   1515 
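         // Transposes two 8x8 byte blocks, read from in0 and in1 with row
         // stride in_p, into one 8x16 block at out with row stride out_p (in0
         // supplies the low 8 bytes of each output row, in1 the high 8). This
         // lets the horizontal filters above be reused on vertical edges.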
   1516 static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
   1517                                  int in_p, unsigned char *out, int out_p) {
   1518   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   1519   __m128i x8, x9, x10, x11, x12, x13, x14, x15;
   1520 
    1521   // 2-way interleave of the two 8-row inputs, with the unpack steps
           // hoisted in between the loads to overlap their latency.
   1522   x0 = _mm_loadl_epi64((__m128i *)in0);           // 1
   1523   x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));  // 3
   1524   x0 = _mm_unpacklo_epi8(x0, x1);                 // 1
   1525 
   1526   x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));  // 5
   1527   x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));  // 7
   1528   x1 = _mm_unpacklo_epi8(x2, x3);                     // 2
   1529 
   1530   x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));  // 9
   1531   x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));  // 11
   1532   x2 = _mm_unpacklo_epi8(x4, x5);                     // 3
   1533 
   1534   x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));  // 13
   1535   x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));  // 15
   1536   x3 = _mm_unpacklo_epi8(x6, x7);                     // 4
   1537   x4 = _mm_unpacklo_epi16(x0, x1);                    // 9
   1538 
   1539   x8 = _mm_loadl_epi64((__m128i *)in1);           // 2
   1540   x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));  // 4
   1541   x8 = _mm_unpacklo_epi8(x8, x9);                 // 5
   1542   x5 = _mm_unpacklo_epi16(x2, x3);                // 10
   1543 
   1544   x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));  // 6
   1545   x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));  // 8
   1546   x9 = _mm_unpacklo_epi8(x10, x11);                    // 6
   1547 
   1548   x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));  // 10
   1549   x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));  // 12
   1550   x10 = _mm_unpacklo_epi8(x12, x13);                   // 7
   1551   x12 = _mm_unpacklo_epi16(x8, x9);                    // 11
   1552 
   1553   x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));  // 14
   1554   x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));  // 16
   1555   x11 = _mm_unpacklo_epi8(x14, x15);                   // 8
   1556   x13 = _mm_unpacklo_epi16(x10, x11);                  // 12
   1557 
   1558   x6 = _mm_unpacklo_epi32(x4, x5);     // 13
   1559   x7 = _mm_unpackhi_epi32(x4, x5);     // 14
   1560   x14 = _mm_unpacklo_epi32(x12, x13);  // 15
   1561   x15 = _mm_unpackhi_epi32(x12, x13);  // 16
   1562 
   1563   // Store first 4-line result
   1564   _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
   1565   _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
   1566   _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
   1567   _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
   1568 
   1569   x4 = _mm_unpackhi_epi16(x0, x1);
   1570   x5 = _mm_unpackhi_epi16(x2, x3);
   1571   x12 = _mm_unpackhi_epi16(x8, x9);
   1572   x13 = _mm_unpackhi_epi16(x10, x11);
   1573 
   1574   x6 = _mm_unpacklo_epi32(x4, x5);
   1575   x7 = _mm_unpackhi_epi32(x4, x5);
   1576   x14 = _mm_unpacklo_epi32(x12, x13);
   1577   x15 = _mm_unpackhi_epi32(x12, x13);
   1578 
   1579   // Store second 4-line result
   1580   _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
   1581   _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
   1582   _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
   1583   _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
   1584 }
   1585 
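         // Transposes num_8x8_to_transpose independent 8x8 byte blocks; src[i]
         // and dst[i] point to the top-left corner of the i-th source and
         // destination block, with row strides in_p and out_p respectively.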
   1586 static INLINE void transpose(unsigned char *src[], int in_p,
   1587                              unsigned char *dst[], int out_p,
   1588                              int num_8x8_to_transpose) {
   1589   int idx8x8 = 0;
   1590   __m128i x0, x1, x2, x3, x4, x5, x6, x7;
   1591   do {
   1592     unsigned char *in = src[idx8x8];
   1593     unsigned char *out = dst[idx8x8];
   1594 
   1595     x0 =
   1596         _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
   1597     x1 =
   1598         _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
   1599     // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
   1600     x0 = _mm_unpacklo_epi8(x0, x1);
   1601 
   1602     x2 =
   1603         _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
   1604     x3 =
   1605         _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
   1606     // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
   1607     x1 = _mm_unpacklo_epi8(x2, x3);
   1608 
   1609     x4 =
   1610         _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
   1611     x5 =
   1612         _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
   1613     // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
   1614     x2 = _mm_unpacklo_epi8(x4, x5);
   1615 
   1616     x6 =
   1617         _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
   1618     x7 =
   1619         _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
   1620     // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
   1621     x3 = _mm_unpacklo_epi8(x6, x7);
   1622 
   1623     // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
   1624     x4 = _mm_unpacklo_epi16(x0, x1);
   1625     // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
   1626     x5 = _mm_unpacklo_epi16(x2, x3);
   1627     // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
   1628     x6 = _mm_unpacklo_epi32(x4, x5);
   1629     _mm_storel_pd((double *)(out + 0 * out_p),
   1630                   _mm_castsi128_pd(x6));  // 00 10 20 30 40 50 60 70
   1631     _mm_storeh_pd((double *)(out + 1 * out_p),
   1632                   _mm_castsi128_pd(x6));  // 01 11 21 31 41 51 61 71
   1633     // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
   1634     x7 = _mm_unpackhi_epi32(x4, x5);
   1635     _mm_storel_pd((double *)(out + 2 * out_p),
   1636                   _mm_castsi128_pd(x7));  // 02 12 22 32 42 52 62 72
   1637     _mm_storeh_pd((double *)(out + 3 * out_p),
   1638                   _mm_castsi128_pd(x7));  // 03 13 23 33 43 53 63 73
   1639 
   1640     // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
   1641     x4 = _mm_unpackhi_epi16(x0, x1);
   1642     // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
   1643     x5 = _mm_unpackhi_epi16(x2, x3);
   1644     // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
   1645     x6 = _mm_unpacklo_epi32(x4, x5);
   1646     _mm_storel_pd((double *)(out + 4 * out_p),
   1647                   _mm_castsi128_pd(x6));  // 04 14 24 34 44 54 64 74
   1648     _mm_storeh_pd((double *)(out + 5 * out_p),
   1649                   _mm_castsi128_pd(x6));  // 05 15 25 35 45 55 65 75
   1650     // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
   1651     x7 = _mm_unpackhi_epi32(x4, x5);
   1652 
   1653     _mm_storel_pd((double *)(out + 6 * out_p),
   1654                   _mm_castsi128_pd(x7));  // 06 16 26 36 46 56 66 76
   1655     _mm_storeh_pd((double *)(out + 7 * out_p),
   1656                   _mm_castsi128_pd(x7));  // 07 17 27 37 47 57 67 77
   1657   } while (++idx8x8 < num_8x8_to_transpose);
   1658 }
   1659 
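         // The vertical variants below share one strategy: transpose the pixel
         // neighbourhood of the edge into a scratch buffer, run the matching
         // horizontal filter on it, then transpose the filtered result back.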
   1660 void vpx_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   1661                                   const uint8_t *limit0, const uint8_t *thresh0,
   1662                                   const uint8_t *blimit1, const uint8_t *limit1,
   1663                                   const uint8_t *thresh1) {
   1664   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
   1665   unsigned char *src[2];
   1666   unsigned char *dst[2];
   1667 
   1668   // Transpose 8x16
   1669   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
   1670 
   1671   // Loop filtering
   1672   vpx_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
   1673                                  blimit1, limit1, thresh1);
   1674   src[0] = t_dst;
   1675   src[1] = t_dst + 8;
   1676   dst[0] = s - 4;
   1677   dst[1] = s - 4 + p * 8;
   1678 
   1679   // Transpose back
   1680   transpose(src, 16, dst, p, 2);
   1681 }
   1682 
   1683 void vpx_lpf_vertical_8_sse2(unsigned char *s, int p,
   1684                              const unsigned char *blimit,
   1685                              const unsigned char *limit,
   1686                              const unsigned char *thresh) {
   1687   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 8]);
   1688   unsigned char *src[1];
   1689   unsigned char *dst[1];
   1690 
   1691   // Transpose 8x8
   1692   src[0] = s - 4;
   1693   dst[0] = t_dst;
   1694 
   1695   transpose(src, p, dst, 8, 1);
   1696 
   1697   // Loop filtering
   1698   vpx_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh);
   1699 
   1700   src[0] = t_dst;
   1701   dst[0] = s - 4;
   1702 
   1703   // Transpose back
   1704   transpose(src, 8, dst, p, 1);
   1705 }
   1706 
   1707 void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *blimit0,
   1708                                   const uint8_t *limit0, const uint8_t *thresh0,
   1709                                   const uint8_t *blimit1, const uint8_t *limit1,
   1710                                   const uint8_t *thresh1) {
   1711   DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
   1712   unsigned char *src[2];
   1713   unsigned char *dst[2];
   1714 
   1715   // Transpose 8x16
   1716   transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);
   1717 
   1718   // Loop filtering
   1719   vpx_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0, thresh0,
   1720                                  blimit1, limit1, thresh1);
   1721   src[0] = t_dst;
   1722   src[1] = t_dst + 8;
   1723 
   1724   dst[0] = s - 4;
   1725   dst[1] = s - 4 + p * 8;
   1726 
   1727   // Transpose back
   1728   transpose(src, 16, dst, p, 2);
   1729 }
   1730 
   1731 void vpx_lpf_vertical_16_sse2(unsigned char *s, int p,
   1732                               const unsigned char *blimit,
   1733                               const unsigned char *limit,
   1734                               const unsigned char *thresh) {
   1735   DECLARE_ALIGNED(8, unsigned char, t_dst[8 * 16]);
   1736   unsigned char *src[2];
   1737   unsigned char *dst[2];
   1738 
   1739   src[0] = s - 8;
   1740   src[1] = s;
   1741   dst[0] = t_dst;
   1742   dst[1] = t_dst + 8 * 8;
   1743 
   1744   // Transpose 16x8
   1745   transpose(src, p, dst, 8, 2);
   1746 
   1747   // Loop filtering
   1748   vpx_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh);
   1749 
   1750   src[0] = t_dst;
   1751   src[1] = t_dst + 8 * 8;
   1752   dst[0] = s - 8;
   1753   dst[1] = s;
   1754 
   1755   // Transpose back
   1756   transpose(src, 8, dst, p, 2);
   1757 }
   1758 
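         // The 16-column neighbourhood is transposed in two 8x16 strips (p
         // side into the top half of the scratch block, q side into the
         // bottom), filtered as a horizontal edge at row 8, and transposed
         // back the same way.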
   1759 void vpx_lpf_vertical_16_dual_sse2(unsigned char *s, int p,
   1760                                    const uint8_t *blimit, const uint8_t *limit,
   1761                                    const uint8_t *thresh) {
   1762   DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
   1763 
   1764   // Transpose 16x16
   1765   transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
   1766   transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);
   1767 
   1768   // Loop filtering
   1769   vpx_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit, thresh);
   1770 
   1771   // Transpose back
   1772   transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
   1773   transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
   1774 }
   1775