/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <emmintrin.h>  // SSE2

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

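// Clamps each signed 16-bit lane of `value` to the signed range of the given
// bit depth, [-(1 << (bd - 1)), (1 << (bd - 1)) - 1], i.e. the "filter domain"
// range the loop filters below work in after subtracting t80.
// A scalar sketch of the same operation (illustrative only):
//   int16_t clamp_bd(int32_t v, int bd) {
//     const int32_t hi = (1 << (bd - 1)) - 1, lo = -(1 << (bd - 1));
//     return (int16_t)(v > hi ? hi : (v < lo ? lo : v));
//   }
// The SSE2 version below computes the same result branchlessly with
// compare/and/andnot/or masks.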
static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
  __m128i ubounded;
  __m128i lbounded;
  __m128i retval;

  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i t80, max, min;

  if (bd == 8) {
    t80 = _mm_set1_epi16(0x80);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 8), one), t80);
  } else if (bd == 10) {
    t80 = _mm_set1_epi16(0x200);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 10), one), t80);
  } else {  // bd == 12
    t80 = _mm_set1_epi16(0x800);
    max = _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, 12), one), t80);
  }

  min = _mm_subs_epi16(zero, t80);

  ubounded = _mm_cmpgt_epi16(value, max);
  lbounded = _mm_cmplt_epi16(value, min);
  retval = _mm_andnot_si128(_mm_or_si128(ubounded, lbounded), value);
  ubounded = _mm_and_si128(ubounded, max);
  lbounded = _mm_and_si128(lbounded, min);
  retval = _mm_or_si128(retval, ubounded);
  retval = _mm_or_si128(retval, lbounded);
  return retval;
}

// TODO(debargha, peter): Break up large functions into smaller ones
// in this file.
void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int p,
                                       const uint8_t *_blimit,
                                       const uint8_t *_limit,
                                       const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i blimit, limit, thresh;
  __m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
  __m128i mask, hev, flat, flat2, abs_p1p0, abs_q1q0;
  __m128i ps1, qs1, ps0, qs0;
  __m128i abs_p0q0, abs_p1q1, ffff, work;
  __m128i filt, work_a, filter1, filter2;
  __m128i flat2_q6, flat2_p6, flat2_q5, flat2_p5, flat2_q4, flat2_p4;
  __m128i flat2_q3, flat2_p3, flat2_q2, flat2_p2, flat2_q1, flat2_p1;
  __m128i flat2_q0, flat2_p0;
  __m128i flat_q2, flat_p2, flat_q1, flat_p1, flat_q0, flat_p0;
  __m128i pixelFilter_p, pixelFilter_q;
  __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
  __m128i sum_p7, sum_q7, sum_p3, sum_q3;
  __m128i t4, t3, t80, t1;
  __m128i eight, four;

  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
  }
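  // Note: _blimit, _limit and _thresh hold 8-bit threshold values; widening
  // them to 16 bits and shifting left by (bd - 8) keeps the comparisons below
  // at the precision of the current bit depth.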

  q4 = _mm_load_si128((__m128i *)(s + 4 * p));
  p4 = _mm_load_si128((__m128i *)(s - 5 * p));
  q3 = _mm_load_si128((__m128i *)(s + 3 * p));
  p3 = _mm_load_si128((__m128i *)(s - 4 * p));
  q2 = _mm_load_si128((__m128i *)(s + 2 * p));
  p2 = _mm_load_si128((__m128i *)(s - 3 * p));
  q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  p1 = _mm_load_si128((__m128i *)(s - 2 * p));
  q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  p0 = _mm_load_si128((__m128i *)(s - 1 * p));

  //  highbd_filter_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));

  //  highbd_hev_mask (in C code this is actually called from highbd_filter4)
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);  // abs(p0 - q0) * 2
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);         // abs(p1 - q1) / 2
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1)),
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);

  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);  // return ~mask
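  // Scalar equivalent of the two masks built above (per 16-bit lane):
  //   hev  is ~0 where abs(p1 - p0) > thresh or abs(q1 - q0) > thresh,
  //   mask is ~0 where abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit and
  //   every neighboring pair within p3..p0 and q0..q3 differs by <= limit.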

  // lp filter
  // highbd_filter4
  t4 = _mm_set1_epi16(4);
  t3 = _mm_set1_epi16(3);
  if (bd == 8)
    t80 = _mm_set1_epi16(0x80);
  else if (bd == 10)
    t80 = _mm_set1_epi16(0x200);
  else  // bd == 12
    t80 = _mm_set1_epi16(0x800);

  t1 = _mm_set1_epi16(0x1);

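  // Subtracting t80 (1 << (bd - 1)) moves the pixels into the signed "filter
  // domain" used by the 4-tap filter; it is added back after filtering.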
  ps1 = _mm_subs_epi16(p1, t80);
  qs1 = _mm_subs_epi16(q1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);

  filt = _mm_and_si128(signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd),
                       hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);
  filt = _mm_and_si128(filt, mask);
  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

  // Filter1 >> 3, Filter2 >> 3
  filter1 = _mm_srai_epi16(filter1, 0x3);
  filter2 = _mm_srai_epi16(filter2, 0x3);

  qs0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  ps0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  filt = _mm_andnot_si128(hev, filt);
  qs1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
                       t80);
  ps1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                       t80);

  // end highbd_filter4
  // loopfilter done

  // highbd_flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  work = _mm_max_epi16(abs_p1p0, abs_q1q0);
  flat = _mm_max_epi16(work, flat);

  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  // end flat_mask4
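  // Scalar meaning: flat is ~0 where p3, p2, p1 and q1, q2, q3 all lie within
  // 1 << (bd - 8) of p0/q0, i.e. the segment is smooth enough for the 7-tap
  // filter below.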

  // flat & mask = flat && mask (as used in filter8)
  // (because, in both vars, each 16-bit lane is either all 1s or all 0s)
  flat = _mm_and_si128(flat, mask);

  p5 = _mm_load_si128((__m128i *)(s - 6 * p));
  q5 = _mm_load_si128((__m128i *)(s + 5 * p));
  p6 = _mm_load_si128((__m128i *)(s - 7 * p));
  q6 = _mm_load_si128((__m128i *)(s + 6 * p));
  p7 = _mm_load_si128((__m128i *)(s - 8 * p));
  q7 = _mm_load_si128((__m128i *)(s + 7 * p));

  // highbd_flat_mask5 (arguments passed in are p0, q0, p4-p7, q4-q7
  // but referred to as p0-p4 & q0-q4 in fn)
  flat2 = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p4, p0), _mm_subs_epu16(p0, p4)),
      _mm_or_si128(_mm_subs_epu16(q4, q0), _mm_subs_epu16(q0, q4)));

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p5, p0), _mm_subs_epu16(p0, p5)),
      _mm_or_si128(_mm_subs_epu16(q5, q0), _mm_subs_epu16(q0, q5)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p6, p0), _mm_subs_epu16(p0, p6)),
      _mm_or_si128(_mm_subs_epu16(q6, q0), _mm_subs_epu16(q0, q6)));
  flat2 = _mm_max_epi16(work, flat2);

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p7, p0), _mm_subs_epu16(p0, p7)),
      _mm_or_si128(_mm_subs_epu16(q7, q0), _mm_subs_epu16(q0, q7)));
  flat2 = _mm_max_epi16(work, flat2);

  if (bd == 8)
    flat2 = _mm_subs_epu16(flat2, one);
  else if (bd == 10)
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat2 = _mm_subs_epu16(flat2, _mm_slli_epi16(one, 4));

  flat2 = _mm_cmpeq_epi16(flat2, zero);
  flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
  // end highbd_flat_mask5
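  // flat2 extends the same smoothness test out to p7/q7: it is ~0 only where
  // the full 16-pixel segment is flat, enabling the 15-tap filter below.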

  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  // flat and wide flat calculations
  eight = _mm_set1_epi16(8);
  four = _mm_set1_epi16(4);

  pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6, p5), _mm_add_epi16(p4, p3));
  pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6, q5), _mm_add_epi16(q4, q3));

  pixetFilter_p2p1p0 = _mm_add_epi16(p0, _mm_add_epi16(p2, p1));
  pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

  pixetFilter_q2q1q0 = _mm_add_epi16(q0, _mm_add_epi16(q2, q1));
  pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
  pixelFilter_p =
      _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
  pixetFilter_p2p1p0 = _mm_add_epi16(
      four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
  flat2_p0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7, p0)), 4);
  flat2_q0 =
      _mm_srli_epi16(_mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7, q0)), 4);
  flat_p0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3, p0)), 3);
  flat_q0 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3, q0)), 3);
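  // The outputs above match the reference C filters, e.g. for the first taps:
  //   flat2_p0 = ROUND_POWER_OF_TWO(p7 + p6 + p5 + p4 + p3 + p2 + p1 +
  //                                 2 * p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6, 4)
  //   flat_p0  = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3)
  // The remaining taps below are produced incrementally: one sample is
  // subtracted from the running sums while sum_p7/sum_q7 (and sum_p3/sum_q3)
  // accumulate extra copies of the outermost sample, sliding the window.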

  sum_p7 = _mm_add_epi16(p7, p7);
  sum_q7 = _mm_add_epi16(q7, q7);
  sum_p3 = _mm_add_epi16(p3, p3);
  sum_q3 = _mm_add_epi16(q3, q3);

  pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6);
  flat2_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1)), 4);
  flat2_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1)), 4);

  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2);
  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2);
  flat_p1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1)), 3);
  flat_q1 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  sum_p3 = _mm_add_epi16(sum_p3, p3);
  sum_q3 = _mm_add_epi16(sum_q3, q3);

  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5);
  flat2_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2)), 4);
  flat2_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2)), 4);

  pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1);
  pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1);
  flat_p2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2)), 3);
  flat_q2 = _mm_srli_epi16(
      _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2)), 3);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4);
  flat2_p3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3)), 4);
  flat2_q3 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3);
  flat2_p4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4)), 4);
  flat2_q4 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2);
  flat2_p5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5)), 4);
  flat2_q5 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5)), 4);

  sum_p7 = _mm_add_epi16(sum_p7, p7);
  sum_q7 = _mm_add_epi16(sum_q7, q7);
  pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1);
  pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1);
  flat2_p6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6)), 4);
  flat2_q6 = _mm_srli_epi16(
      _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6)), 4);

  //  wide flat
  //  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

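  // Each output below is selected branchlessly with the usual SSE2 idiom
  //   result = (mask & new_value) | (~mask & old_value)
  // where the mask lanes are all-ones or all-zeros, so and/andnot/or act as a
  // per-lane conditional move.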
  //  highbd_filter8
  p2 = _mm_andnot_si128(flat, p2);
  //  p2 remains unchanged if !(flat && mask)
  flat_p2 = _mm_and_si128(flat, flat_p2);
  //  when (flat && mask)
  p2 = _mm_or_si128(p2, flat_p2);  // full list of p2 values
  q2 = _mm_andnot_si128(flat, q2);
  flat_q2 = _mm_and_si128(flat, flat_q2);
  q2 = _mm_or_si128(q2, flat_q2);  // full list of q2 values

  ps1 = _mm_andnot_si128(flat, ps1);
  //  p1 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p1 = _mm_and_si128(flat, flat_p1);
  //  when (flat && mask)
  p1 = _mm_or_si128(ps1, flat_p1);  // full list of p1 values
  qs1 = _mm_andnot_si128(flat, qs1);
  flat_q1 = _mm_and_si128(flat, flat_q1);
  q1 = _mm_or_si128(qs1, flat_q1);  // full list of q1 values

  ps0 = _mm_andnot_si128(flat, ps0);
  //  p0 takes the value assigned to it in filter4 if !(flat && mask)
  flat_p0 = _mm_and_si128(flat, flat_p0);
  //  when (flat && mask)
  p0 = _mm_or_si128(ps0, flat_p0);  // full list of p0 values
  qs0 = _mm_andnot_si128(flat, qs0);
  flat_q0 = _mm_and_si128(flat, flat_q0);
  q0 = _mm_or_si128(qs0, flat_q0);  // full list of q0 values
  // end highbd_filter8

  // highbd_filter16
  p6 = _mm_andnot_si128(flat2, p6);
  //  p6 remains unchanged if !(flat2 && flat && mask)
  flat2_p6 = _mm_and_si128(flat2, flat2_p6);
  //  get values for when (flat2 && flat && mask)
  p6 = _mm_or_si128(p6, flat2_p6);  // full list of p6 values
  q6 = _mm_andnot_si128(flat2, q6);
  //  q6 remains unchanged if !(flat2 && flat && mask)
  flat2_q6 = _mm_and_si128(flat2, flat2_q6);
  //  get values for when (flat2 && flat && mask)
  q6 = _mm_or_si128(q6, flat2_q6);  // full list of q6 values
  _mm_store_si128((__m128i *)(s - 7 * p), p6);
  _mm_store_si128((__m128i *)(s + 6 * p), q6);

  p5 = _mm_andnot_si128(flat2, p5);
  //  p5 remains unchanged if !(flat2 && flat && mask)
  flat2_p5 = _mm_and_si128(flat2, flat2_p5);
  //  get values for when (flat2 && flat && mask)
  p5 = _mm_or_si128(p5, flat2_p5);
  //  full list of p5 values
  q5 = _mm_andnot_si128(flat2, q5);
  //  q5 remains unchanged if !(flat2 && flat && mask)
  flat2_q5 = _mm_and_si128(flat2, flat2_q5);
  //  get values for when (flat2 && flat && mask)
  q5 = _mm_or_si128(q5, flat2_q5);
  //  full list of q5 values
  _mm_store_si128((__m128i *)(s - 6 * p), p5);
  _mm_store_si128((__m128i *)(s + 5 * p), q5);

  p4 = _mm_andnot_si128(flat2, p4);
  //  p4 remains unchanged if !(flat2 && flat && mask)
  flat2_p4 = _mm_and_si128(flat2, flat2_p4);
  //  get values for when (flat2 && flat && mask)
  p4 = _mm_or_si128(p4, flat2_p4);  // full list of p4 values
  q4 = _mm_andnot_si128(flat2, q4);
  //  q4 remains unchanged if !(flat2 && flat && mask)
  flat2_q4 = _mm_and_si128(flat2, flat2_q4);
  //  get values for when (flat2 && flat && mask)
  q4 = _mm_or_si128(q4, flat2_q4);  // full list of q4 values
  _mm_store_si128((__m128i *)(s - 5 * p), p4);
  _mm_store_si128((__m128i *)(s + 4 * p), q4);

  p3 = _mm_andnot_si128(flat2, p3);
  //  p3 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p3 = _mm_and_si128(flat2, flat2_p3);
  //  get values for when (flat2 && flat && mask)
  p3 = _mm_or_si128(p3, flat2_p3);  // full list of p3 values
  q3 = _mm_andnot_si128(flat2, q3);
  //  q3 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q3 = _mm_and_si128(flat2, flat2_q3);
  //  get values for when (flat2 && flat && mask)
  q3 = _mm_or_si128(q3, flat2_q3);  // full list of q3 values
  _mm_store_si128((__m128i *)(s - 4 * p), p3);
  _mm_store_si128((__m128i *)(s + 3 * p), q3);

  p2 = _mm_andnot_si128(flat2, p2);
  //  p2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p2 = _mm_and_si128(flat2, flat2_p2);
  //  get values for when (flat2 && flat && mask)
  p2 = _mm_or_si128(p2, flat2_p2);
  //  full list of p2 values
  q2 = _mm_andnot_si128(flat2, q2);
  //  q2 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q2 = _mm_and_si128(flat2, flat2_q2);
  //  get values for when (flat2 && flat && mask)
  q2 = _mm_or_si128(q2, flat2_q2);  // full list of q2 values
  _mm_store_si128((__m128i *)(s - 3 * p), p2);
  _mm_store_si128((__m128i *)(s + 2 * p), q2);

  p1 = _mm_andnot_si128(flat2, p1);
  //  p1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p1 = _mm_and_si128(flat2, flat2_p1);
  //  get values for when (flat2 && flat && mask)
  p1 = _mm_or_si128(p1, flat2_p1);  // full list of p1 values
  q1 = _mm_andnot_si128(flat2, q1);
  //  q1 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q1 = _mm_and_si128(flat2, flat2_q1);
  //  get values for when (flat2 && flat && mask)
  q1 = _mm_or_si128(q1, flat2_q1);  // full list of q1 values
  _mm_store_si128((__m128i *)(s - 2 * p), p1);
  _mm_store_si128((__m128i *)(s + 1 * p), q1);

  p0 = _mm_andnot_si128(flat2, p0);
  //  p0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_p0 = _mm_and_si128(flat2, flat2_p0);
  //  get values for when (flat2 && flat && mask)
  p0 = _mm_or_si128(p0, flat2_p0);  // full list of p0 values
  q0 = _mm_andnot_si128(flat2, q0);
  //  q0 takes value from highbd_filter8 if !(flat2 && flat && mask)
  flat2_q0 = _mm_and_si128(flat2, flat2_q0);
  //  get values for when (flat2 && flat && mask)
  q0 = _mm_or_si128(q0, flat2_q0);  // full list of q0 values
  _mm_store_si128((__m128i *)(s - 1 * p), p0);
  _mm_store_si128((__m128i *)(s - 0 * p), q0);
}

void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int p,
                                            const uint8_t *_blimit,
                                            const uint8_t *_limit,
                                            const uint8_t *_thresh, int bd) {
  vpx_highbd_lpf_horizontal_16_sse2(s, p, _blimit, _limit, _thresh, bd);
  vpx_highbd_lpf_horizontal_16_sse2(s + 8, p, _blimit, _limit, _thresh, bd);
}

void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, flat_op2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_op0[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
  DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  __m128i p3 = _mm_load_si128((__m128i *)(s - 4 * p));
  __m128i q3 = _mm_load_si128((__m128i *)(s + 3 * p));
  __m128i p2 = _mm_load_si128((__m128i *)(s - 3 * p));
  __m128i q2 = _mm_load_si128((__m128i *)(s + 2 * p));
  __m128i p1 = _mm_load_si128((__m128i *)(s - 2 * p));
  __m128i q1 = _mm_load_si128((__m128i *)(s + 1 * p));
  __m128i p0 = _mm_load_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_load_si128((__m128i *)(s + 0 * p));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i ffff = _mm_cmpeq_epi16(one, one);
  __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
  const __m128i four = _mm_set1_epi16(4);
  __m128i workp_a, workp_b, workp_shft;

  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_set1_epi16(0x200);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_set1_epi16(0x800);
  }

  ps1 = _mm_subs_epi16(p1, t80);
  ps0 = _mm_subs_epi16(p0, t80);
  qs0 = _mm_subs_epi16(q0, t80);
  qs1 = _mm_subs_epi16(q1, t80);

  // filter_mask and hev_mask
  abs_p1p0 = _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  abs_q1q0 = _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));

  abs_p0q0 = _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  abs_p1q1 = _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(abs_p1p0, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  mask = _mm_max_epi16(abs_q1q0, mask);
  // mask |= (abs(q1 - q0) > limit) * -1;

  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // flat_mask4
  flat = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p0), _mm_subs_epu16(p0, p2)),
      _mm_or_si128(_mm_subs_epu16(q2, q0), _mm_subs_epu16(q0, q2)));
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p3, p0), _mm_subs_epu16(p0, p3)),
      _mm_or_si128(_mm_subs_epu16(q3, q0), _mm_subs_epu16(q0, q3)));
  flat = _mm_max_epi16(work, flat);
  flat = _mm_max_epi16(abs_p1p0, flat);
  flat = _mm_max_epi16(abs_q1q0, flat);

  if (bd == 8)
    flat = _mm_subs_epu16(flat, one);
  else if (bd == 10)
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 2));
  else  // bd == 12
    flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, 4));

  flat = _mm_cmpeq_epi16(flat, zero);
  flat = _mm_and_si128(flat, mask);  // flat & mask

  // The constant `four` is folded into workp_a below so that each shift by 3
  // includes the rounding term of ROUND_POWER_OF_TWO.

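  // Each flat_o* value below is a 7-tap ROUND_POWER_OF_TWO(sum, 3) output,
  // e.g. flat_op2 = ROUND_POWER_OF_TWO(3 * p3 + 2 * p2 + p1 + p0 + q0, 3).
  // workp_a/workp_b hold running partial sums that are updated with one
  // subtract and one add per output instead of recomputing the full window.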
  workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
  workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
  workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op2[0], workp_shft);

  workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_op0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq0[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq1[0], workp_shft);

  workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
  workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
  workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
  _mm_store_si128((__m128i *)&flat_oq2[0], workp_shft);

  // lp filter
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  // (vpx_filter + 3 * (qs0 - ps0)) & mask
  filt = signed_char_clamp_bd_sse2(filt, bd);
  filt = _mm_and_si128(filt, mask);

  filter1 = _mm_adds_epi16(filt, t4);
  filter2 = _mm_adds_epi16(filt, t3);

  // Filter1 >> 3
  filter1 = signed_char_clamp_bd_sse2(filter1, bd);
  filter1 = _mm_srai_epi16(filter1, 3);

  // Filter2 >> 3
  filter2 = signed_char_clamp_bd_sse2(filter2, bd);
  filter2 = _mm_srai_epi16(filter2, 3);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  filt = _mm_srai_epi16(filt, 1);
  // filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
  filt = _mm_andnot_si128(hev, filt);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q0 = _mm_load_si128((__m128i *)flat_oq0);
  work_a = _mm_andnot_si128(flat, work_a);
  q0 = _mm_and_si128(flat, q0);
  q0 = _mm_or_si128(work_a, q0);

  work_a = signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  q1 = _mm_load_si128((__m128i *)flat_oq1);
  work_a = _mm_andnot_si128(flat, work_a);
  q1 = _mm_and_si128(flat, q1);
  q1 = _mm_or_si128(work_a, q1);

  work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
  q2 = _mm_load_si128((__m128i *)flat_oq2);
  work_a = _mm_andnot_si128(flat, work_a);
  q2 = _mm_and_si128(flat, q2);
  q2 = _mm_or_si128(work_a, q2);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p0 = _mm_load_si128((__m128i *)flat_op0);
  work_a = _mm_andnot_si128(flat, work_a);
  p0 = _mm_and_si128(flat, p0);
  p0 = _mm_or_si128(work_a, p0);

  work_a = signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd);
  work_a = _mm_adds_epi16(work_a, t80);
  p1 = _mm_load_si128((__m128i *)flat_op1);
  work_a = _mm_andnot_si128(flat, work_a);
  p1 = _mm_and_si128(flat, p1);
  p1 = _mm_or_si128(work_a, p1);

  work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
  p2 = _mm_load_si128((__m128i *)flat_op2);
  work_a = _mm_andnot_si128(flat, work_a);
  p2 = _mm_and_si128(flat, p2);
  p2 = _mm_or_si128(work_a, p2);

  _mm_store_si128((__m128i *)(s - 3 * p), p2);
  _mm_store_si128((__m128i *)(s - 2 * p), p1);
  _mm_store_si128((__m128i *)(s - 1 * p), p0);
  _mm_store_si128((__m128i *)(s + 0 * p), q0);
  _mm_store_si128((__m128i *)(s + 1 * p), q1);
  _mm_store_si128((__m128i *)(s + 2 * p), q2);
}

void vpx_highbd_lpf_horizontal_8_dual_sse2(
    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
    const uint8_t *_thresh1, int bd) {
  vpx_highbd_lpf_horizontal_8_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
  vpx_highbd_lpf_horizontal_8_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}

void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
                                      const uint8_t *_blimit,
                                      const uint8_t *_limit,
                                      const uint8_t *_thresh, int bd) {
  const __m128i zero = _mm_set1_epi16(0);
  __m128i blimit, limit, thresh;
  __m128i mask, hev, flat;
  __m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
  __m128i p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
  __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
  __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
  __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
  __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
  __m128i q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
  __m128i q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
  const __m128i abs_p1p0 =
      _mm_or_si128(_mm_subs_epu16(p1, p0), _mm_subs_epu16(p0, p1));
  const __m128i abs_q1q0 =
      _mm_or_si128(_mm_subs_epu16(q1, q0), _mm_subs_epu16(q0, q1));
  const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
  const __m128i one = _mm_set1_epi16(1);
  __m128i abs_p0q0 =
      _mm_or_si128(_mm_subs_epu16(p0, q0), _mm_subs_epu16(q0, p0));
  __m128i abs_p1q1 =
      _mm_or_si128(_mm_subs_epu16(p1, q1), _mm_subs_epu16(q1, p1));
  __m128i work;
  const __m128i t4 = _mm_set1_epi16(4);
  const __m128i t3 = _mm_set1_epi16(3);
  __m128i t80;
  __m128i tff80;
  __m128i tffe0;
  __m128i t1f;
  // equivalent to shifting 0x1f left by bitdepth - 8
  // and setting new bits to 1
  const __m128i t1 = _mm_set1_epi16(0x1);
  __m128i t7f;
  // equivalent to shifting 0x7f left by bitdepth - 8
  // and setting new bits to 1
  __m128i ps1, ps0, qs0, qs1;
  __m128i filt;
  __m128i work_a;
  __m128i filter1, filter2;

  if (bd == 8) {
    blimit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero);
    limit = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero);
    thresh = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero);
    t80 = _mm_set1_epi16(0x80);
    tff80 = _mm_set1_epi16(0xff80);
    tffe0 = _mm_set1_epi16(0xffe0);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 8);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 8);
  } else if (bd == 10) {
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 2);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 2);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 2);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 2);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 2);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 2);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 6);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 6);
  } else {  // bd == 12
    blimit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit), zero), 4);
    limit = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit), zero), 4);
    thresh = _mm_slli_epi16(
        _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh), zero), 4);
    t80 = _mm_slli_epi16(_mm_set1_epi16(0x80), 4);
    tff80 = _mm_slli_epi16(_mm_set1_epi16(0xff80), 4);
    tffe0 = _mm_slli_epi16(_mm_set1_epi16(0xffe0), 4);
    t1f = _mm_srli_epi16(_mm_set1_epi16(0x1fff), 4);
    t7f = _mm_srli_epi16(_mm_set1_epi16(0x7fff), 4);
  }

  ps1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80);
  ps0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80);
  qs0 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80);
  qs1 = _mm_subs_epi16(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80);

  // filter_mask and hev_mask
  flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
  hev = _mm_subs_epu16(flat, thresh);
  hev = _mm_xor_si128(_mm_cmpeq_epi16(hev, zero), ffff);

  abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
  abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
  mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit);
  mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
  // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
  // So taking maximums continues to work:
  mask = _mm_and_si128(mask, _mm_adds_epu16(limit, one));
  mask = _mm_max_epi16(flat, mask);
  // mask |= (abs(p1 - p0) > limit) * -1;
  // mask |= (abs(q1 - q0) > limit) * -1;
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(p2, p1), _mm_subs_epu16(p1, p2)),
      _mm_or_si128(_mm_subs_epu16(p3, p2), _mm_subs_epu16(p2, p3)));
  mask = _mm_max_epi16(work, mask);
  work = _mm_max_epi16(
      _mm_or_si128(_mm_subs_epu16(q2, q1), _mm_subs_epu16(q1, q2)),
      _mm_or_si128(_mm_subs_epu16(q3, q2), _mm_subs_epu16(q2, q3)));
  mask = _mm_max_epi16(work, mask);
  mask = _mm_subs_epu16(mask, limit);
  mask = _mm_cmpeq_epi16(mask, zero);

  // filter4
  filt = signed_char_clamp_bd_sse2(_mm_subs_epi16(ps1, qs1), bd);
  filt = _mm_and_si128(filt, hev);
  work_a = _mm_subs_epi16(qs0, ps0);
  filt = _mm_adds_epi16(filt, work_a);
  filt = _mm_adds_epi16(filt, work_a);
  filt = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, work_a), bd);

  // (vpx_filter + 3 * (qs0 - ps0)) & mask
  filt = _mm_and_si128(filt, mask);

  filter1 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t4), bd);
  filter2 = signed_char_clamp_bd_sse2(_mm_adds_epi16(filt, t3), bd);

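  // The arithmetic shifts below are emulated with logical shifts: t1f/t7f keep
  // the valid low bits of the shifted magnitude and tffe0/tff80 reinsert the
  // sign bits for negative lanes. Because Filter1/Filter2/filt are clamped to
  // the signed bd-bit range, this matches an arithmetic >> 3 (or >> 1).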
  // Filter1 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter1);  // get the values that are <0
  filter1 = _mm_srli_epi16(filter1, 3);
  work_a = _mm_and_si128(work_a, tffe0);    // sign bits for the values < 0
  filter1 = _mm_and_si128(filter1, t1f);    // clamp the range
  filter1 = _mm_or_si128(filter1, work_a);  // reinsert the sign bits

  // Filter2 >> 3
  work_a = _mm_cmpgt_epi16(zero, filter2);
  filter2 = _mm_srli_epi16(filter2, 3);
  work_a = _mm_and_si128(work_a, tffe0);
  filter2 = _mm_and_si128(filter2, t1f);
  filter2 = _mm_or_si128(filter2, work_a);

  // filt >> 1
  filt = _mm_adds_epi16(filter1, t1);
  work_a = _mm_cmpgt_epi16(zero, filt);
  filt = _mm_srli_epi16(filt, 1);
  work_a = _mm_and_si128(work_a, tff80);
  filt = _mm_and_si128(filt, t7f);
  filt = _mm_or_si128(filt, work_a);

  filt = _mm_andnot_si128(hev, filt);

  q0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_subs_epi16(qs0, filter1), bd), t80);
  q1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_subs_epi16(qs1, filt), bd),
                      t80);
  p0 = _mm_adds_epi16(
      signed_char_clamp_bd_sse2(_mm_adds_epi16(ps0, filter2), bd), t80);
  p1 = _mm_adds_epi16(signed_char_clamp_bd_sse2(_mm_adds_epi16(ps1, filt), bd),
                      t80);

  _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
  _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
  _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
  _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
}

void vpx_highbd_lpf_horizontal_4_dual_sse2(
    uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
    const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
    const uint8_t *_thresh1, int bd) {
  vpx_highbd_lpf_horizontal_4_sse2(s, p, _blimit0, _limit0, _thresh0, bd);
  vpx_highbd_lpf_horizontal_4_sse2(s + 8, p, _blimit1, _limit1, _thresh1, bd);
}

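// Transposes num_8x8_to_transpose 8x8 blocks of 16-bit pixels: block i is read
// from src[i] with stride in_p and written, transposed, to dst[i] with stride
// out_p. Built from unpacklo/unpackhi interleaves at 16-, 32- and 64-bit
// granularity.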
static INLINE void highbd_transpose(uint16_t *src[], int in_p, uint16_t *dst[],
                                    int out_p, int num_8x8_to_transpose) {
  int idx8x8 = 0;
  __m128i p0, p1, p2, p3, p4, p5, p6, p7, x0, x1, x2, x3, x4, x5, x6, x7;
  do {
    uint16_t *in = src[idx8x8];
    uint16_t *out = dst[idx8x8];

    p0 =
        _mm_loadu_si128((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    p1 =
        _mm_loadu_si128((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    p2 =
        _mm_loadu_si128((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    p3 =
        _mm_loadu_si128((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    p4 =
        _mm_loadu_si128((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    p5 =
        _mm_loadu_si128((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    p6 =
        _mm_loadu_si128((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    p7 =
        _mm_loadu_si128((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    // 00 10 01 11 02 12 03 13
    x0 = _mm_unpacklo_epi16(p0, p1);
    // 20 30 21 31 22 32 23 33
    x1 = _mm_unpacklo_epi16(p2, p3);
    // 40 50 41 51 42 52 43 53
    x2 = _mm_unpacklo_epi16(p4, p5);
    // 60 70 61 71 62 72 63 73
    x3 = _mm_unpacklo_epi16(p6, p7);
    // 00 10 20 30 01 11 21 31
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 40 50 60 70 41 51 61 71
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 00 10 20 30 40 50 60 70
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 01 11 21 31 41 51 61 71
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 0 * out_p), x6);
    // 00 10 20 30 40 50 60 70
    _mm_storeu_si128((__m128i *)(out + 1 * out_p), x7);
    // 01 11 21 31 41 51 61 71

    // 02 12 22 32 03 13 23 33
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 42 52 62 72 43 53 63 73
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 02 12 22 32 42 52 62 72
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 03 13 23 33 43 53 63 73
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 2 * out_p), x6);
    // 02 12 22 32 42 52 62 72
    _mm_storeu_si128((__m128i *)(out + 3 * out_p), x7);
    // 03 13 23 33 43 53 63 73

    // 04 14 05 15 06 16 07 17
    x0 = _mm_unpackhi_epi16(p0, p1);
    // 24 34 25 35 26 36 27 37
    x1 = _mm_unpackhi_epi16(p2, p3);
    // 44 54 45 55 46 56 47 57
    x2 = _mm_unpackhi_epi16(p4, p5);
    // 64 74 65 75 66 76 67 77
    x3 = _mm_unpackhi_epi16(p6, p7);
    // 04 14 24 34 05 15 25 35
    x4 = _mm_unpacklo_epi32(x0, x1);
    // 44 54 64 74 45 55 65 75
    x5 = _mm_unpacklo_epi32(x2, x3);
    // 04 14 24 34 44 54 64 74
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 05 15 25 35 45 55 65 75
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 4 * out_p), x6);
    // 04 14 24 34 44 54 64 74
    _mm_storeu_si128((__m128i *)(out + 5 * out_p), x7);
    // 05 15 25 35 45 55 65 75

    // 06 16 26 36 07 17 27 37
    x4 = _mm_unpackhi_epi32(x0, x1);
    // 46 56 66 76 47 57 67 77
    x5 = _mm_unpackhi_epi32(x2, x3);
    // 06 16 26 36 46 56 66 76
    x6 = _mm_unpacklo_epi64(x4, x5);
    // 07 17 27 37 47 57 67 77
    x7 = _mm_unpackhi_epi64(x4, x5);

    _mm_storeu_si128((__m128i *)(out + 6 * out_p), x6);
    // 06 16 26 36 46 56 66 76
    _mm_storeu_si128((__m128i *)(out + 7 * out_p), x7);
    // 07 17 27 37 47 57 67 77
  } while (++idx8x8 < num_8x8_to_transpose);
}

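// Transposes two vertically adjacent 8x8 blocks (in0 above in1) into one
// 8x16 destination: the transpose of in0 fills columns 0-7 of out and the
// transpose of in1 fills columns 8-15.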
static INLINE void highbd_transpose8x16(uint16_t *in0, uint16_t *in1, int in_p,
                                        uint16_t *out, int out_p) {
  uint16_t *src0[1];
  uint16_t *src1[1];
  uint16_t *dest0[1];
  uint16_t *dest1[1];
  src0[0] = in0;
  src1[0] = in1;
  dest0[0] = out;
  dest1[0] = out + 8;
  highbd_transpose(src0, in_p, dest0, out_p, 1);
  highbd_transpose(src1, in_p, dest1, out_p, 1);
}

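// The vertical loop filters below reuse the horizontal kernels: each one
// transposes the relevant columns into a small row-major scratch buffer, runs
// the matching horizontal filter on it, then transposes the result back into
// place.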

void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_highbd_lpf_horizontal_4_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 1);
}

void vpx_highbd_lpf_vertical_4_dual_sse2(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_4_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;
  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, p, 2);
}

void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                    const uint8_t *limit, const uint8_t *thresh,
                                    int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 8]);
  uint16_t *src[1];
  uint16_t *dst[1];

  // Transpose 8x8
  src[0] = s - 4;
  dst[0] = t_dst;

  highbd_transpose(src, p, dst, 8, 1);

  // Loop filtering
  vpx_highbd_lpf_horizontal_8_sse2(t_dst + 4 * 8, 8, blimit, limit, thresh, bd);

  src[0] = t_dst;
  dst[0] = s - 4;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 1);
}

void vpx_highbd_lpf_vertical_8_dual_sse2(
    uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
    const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
    const uint8_t *thresh1, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[16 * 8]);
  uint16_t *src[2];
  uint16_t *dst[2];

  // Transpose 8x16
  highbd_transpose8x16(s - 4, s - 4 + p * 8, p, t_dst, 16);

  // Loop filtering
  vpx_highbd_lpf_horizontal_8_dual_sse2(t_dst + 4 * 16, 16, blimit0, limit0,
                                        thresh0, blimit1, limit1, thresh1, bd);
  src[0] = t_dst;
  src[1] = t_dst + 8;

  dst[0] = s - 4;
  dst[1] = s - 4 + p * 8;

  // Transpose back
  highbd_transpose(src, 16, dst, p, 2);
}

void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int p, const uint8_t *blimit,
                                     const uint8_t *limit,
                                     const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[8 * 16]);
  uint16_t *src[2];
  uint16_t *dst[2];

  src[0] = s - 8;
  src[1] = s;
  dst[0] = t_dst;
  dst[1] = t_dst + 8 * 8;

  // Transpose 16x8
  highbd_transpose(src, p, dst, 8, 2);

  // Loop filtering
  vpx_highbd_lpf_horizontal_16_sse2(t_dst + 8 * 8, 8, blimit, limit, thresh,
                                    bd);
  src[0] = t_dst;
  src[1] = t_dst + 8 * 8;
  dst[0] = s - 8;
  dst[1] = s;

  // Transpose back
  highbd_transpose(src, 8, dst, p, 2);
}

void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int p,
                                          const uint8_t *blimit,
                                          const uint8_t *limit,
                                          const uint8_t *thresh, int bd) {
  DECLARE_ALIGNED(16, uint16_t, t_dst[256]);

  //  Transpose 16x16
  highbd_transpose8x16(s - 8, s - 8 + 8 * p, p, t_dst, 16);
  highbd_transpose8x16(s, s + 8 * p, p, t_dst + 8 * 16, 16);

  //  Loop filtering
  vpx_highbd_lpf_horizontal_16_dual_sse2(t_dst + 8 * 16, 16, blimit, limit,
                                         thresh, bd);

  //  Transpose back
  highbd_transpose8x16(t_dst, t_dst + 8 * 16, 16, s - 8, p);
  highbd_transpose8x16(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * p, p);
}
   1139