/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h>  /* AVX2 */

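/* Wide ("mb") horizontal loop filter applied across an 8-pixel-wide edge.
 * s points at the first row below the edge (q0), p is the row stride in
 * bytes, and _blimit, _limit and _thresh are single-byte thresholds that
 * are broadcast to every lane below.  (Descriptive comment added; the
 * parameter meanings follow the usual libvpx loop-filter conventions.) */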
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
        const unsigned char *_blimit, const unsigned char *_limit,
        const unsigned char *_thresh) {
    __m128i mask, hev, flat, flat2;
    const __m128i zero = _mm_set1_epi16(0);
    const __m128i one = _mm_set1_epi8(1);
    __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
    __m128i abs_p1p0;

    const __m128i thresh = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _thresh[0]));
    const __m128i limit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _limit[0]));
    const __m128i blimit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _blimit[0]));

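    /* Load pairs of rows into single registers: each qXpX vector holds the
     * 8 pixels of row pX in its low 64 bits and the 8 pixels of row qX in
     * its high 64 bits, so the p and q sides are filtered together.  The
     * pXqX variants (shuffle imm 78) are the same data with the two 64-bit
     * halves swapped.  (Comment added for readability; the layout is
     * implied by the loadl/loadh pairs below.) */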
    q4p4 = _mm_loadl_epi64((__m128i *) (s - 5 * p));
    q4p4 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *) (s + 4 * p)));
    q3p3 = _mm_loadl_epi64((__m128i *) (s - 4 * p));
    q3p3 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *) (s + 3 * p)));
    q2p2 = _mm_loadl_epi64((__m128i *) (s - 3 * p));
    q2p2 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *) (s + 2 * p)));
    q1p1 = _mm_loadl_epi64((__m128i *) (s - 2 * p));
    q1p1 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *) (s + 1 * p)));
    p1q1 = _mm_shuffle_epi32(q1p1, 78);
    q0p0 = _mm_loadl_epi64((__m128i *) (s - 1 * p));
    q0p0 = _mm_castps_si128(
            _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *) (s - 0 * p)));
    p0q0 = _mm_shuffle_epi32(q0p0, 78);

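    /* Build the per-pixel filter mask and high-edge-variance (hev) mask.
     * Roughly: a pixel column is filtered when
     *   abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit
     * and every neighbouring-sample difference up to p3/q3 is <= limit;
     * hev is set where abs(p1 - p0) or abs(q1 - q0) exceeds thresh. */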
    {
        __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
        abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
                _mm_subs_epu8(q0p0, q1p1));
        abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
        fe = _mm_set1_epi8(0xfe);
        ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
        abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
                _mm_subs_epu8(p0q0, q0p0));
        abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
                _mm_subs_epu8(p1q1, q1p1));
        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
        hev = _mm_subs_epu8(flat, thresh);
        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
        mask = _mm_max_epu8(abs_p1p0, mask);
        // mask |= (abs(p1 - p0) > limit) * -1;
        // mask |= (abs(q1 - q0) > limit) * -1;

        work = _mm_max_epu8(
                _mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
                        _mm_subs_epu8(q1p1, q2p2)),
                _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
                        _mm_subs_epu8(q2p2, q3p3)));
        mask = _mm_max_epu8(work, mask);
        mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
        mask = _mm_subs_epu8(mask, limit);
        mask = _mm_cmpeq_epi8(mask, zero);
    }

    // lp filter
    {
        const __m128i t4 = _mm_set1_epi8(4);
        const __m128i t3 = _mm_set1_epi8(3);
        const __m128i t80 = _mm_set1_epi8(0x80);
        const __m128i t1 = _mm_set1_epi16(0x1);
        __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
        __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
        __m128i qs0 = _mm_xor_si128(p0q0, t80);
        __m128i qs1 = _mm_xor_si128(p1q1, t80);
        __m128i filt;
        __m128i work_a;
        __m128i filter1, filter2;
        __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
        __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

        filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
        work_a = _mm_subs_epi8(qs0, qs0ps0);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
        filt = _mm_and_si128(filt, mask);

        filter1 = _mm_adds_epi8(filt, t4);
        filter2 = _mm_adds_epi8(filt, t3);

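        /* There is no 8-bit arithmetic shift, so Filter1/Filter2 >> 3 is
         * done in 16 bits: unpacklo with zero places each signed byte in
         * the high byte of a word (value << 8), and an arithmetic shift by
         * 11 (0xB) then yields (value >> 3) with the sign preserved.  The
         * packs below applies +Filter2 to the p side (low half) and
         * -Filter1 to the q side (high half) in a single saturating add. */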
        filter1 = _mm_unpacklo_epi8(zero, filter1);
        filter1 = _mm_srai_epi16(filter1, 0xB);
        filter2 = _mm_unpacklo_epi8(zero, filter2);
        filter2 = _mm_srai_epi16(filter2, 0xB);

        /* Filter1 >> 3 */
        filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
        qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

        /* filt >> 1 */
        filt = _mm_adds_epi16(filter1, t1);
        filt = _mm_srai_epi16(filt, 1);
        filt = _mm_andnot_si128(
                _mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt);
        filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
        qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
        // loopfilter done

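        /* Decide where the flatter filters apply.  flat is set where p1..p3
         * and q1..q3 are each within 1 of p0/q0 (and the basic mask is set);
         * flat2 additionally requires p4..p7 and q4..q7 to be within 1 of
         * p0/q0.  flat selects the narrow (p3..q3 based) smoothing output,
         * flat2 the full wide output built from p7..q7. */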
        {
            __m128i work;
            flat = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
                            _mm_subs_epu8(q0p0, q2p2)),
                    _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
                            _mm_subs_epu8(q0p0, q3p3)));
            flat = _mm_max_epu8(abs_p1p0, flat);
            flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
            flat = _mm_subs_epu8(flat, one);
            flat = _mm_cmpeq_epi8(flat, zero);
            flat = _mm_and_si128(flat, mask);

            q5p5 = _mm_loadl_epi64((__m128i *) (s - 6 * p));
            q5p5 = _mm_castps_si128(
                    _mm_loadh_pi(_mm_castsi128_ps(q5p5),
                            (__m64 *) (s + 5 * p)));

            q6p6 = _mm_loadl_epi64((__m128i *) (s - 7 * p));
            q6p6 = _mm_castps_si128(
                    _mm_loadh_pi(_mm_castsi128_ps(q6p6),
                            (__m64 *) (s + 6 * p)));

            flat2 = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(q4p4, q0p0),
                            _mm_subs_epu8(q0p0, q4p4)),
                    _mm_or_si128(_mm_subs_epu8(q5p5, q0p0),
                            _mm_subs_epu8(q0p0, q5p5)));

            q7p7 = _mm_loadl_epi64((__m128i *) (s - 8 * p));
            q7p7 = _mm_castps_si128(
                    _mm_loadh_pi(_mm_castsi128_ps(q7p7),
                            (__m64 *) (s + 7 * p)));

            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(q6p6, q0p0),
                            _mm_subs_epu8(q0p0, q6p6)),
                    _mm_or_si128(_mm_subs_epu8(q7p7, q0p0),
                            _mm_subs_epu8(q0p0, q7p7)));

            flat2 = _mm_max_epu8(work, flat2);
            flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
            flat2 = _mm_subs_epu8(flat2, one);
            flat2 = _mm_cmpeq_epi8(flat2, zero);
            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
        }

        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // flat and wide flat calculations
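        /* The narrow (flat) and wide (flat2) outputs are computed with
         * running window sums kept in 16 bits: pixelFilter_* holds the wide
         * sum plus its rounding constant, pixetFilter_* the narrow one, and
         * each successive output subtracts the sample leaving the window
         * while the replicated edge samples (p7/q7, p3/q3) are accumulated
         * in sum_*. */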
        {
            const __m128i eight = _mm_set1_epi16(8);
            const __m128i four = _mm_set1_epi16(4);
            __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
            __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
            __m128i pixelFilter_p, pixelFilter_q;
            __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
            __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

            p7_16 = _mm_unpacklo_epi8(q7p7, zero);
            p6_16 = _mm_unpacklo_epi8(q6p6, zero);
            p5_16 = _mm_unpacklo_epi8(q5p5, zero);
            p4_16 = _mm_unpacklo_epi8(q4p4, zero);
            p3_16 = _mm_unpacklo_epi8(q3p3, zero);
            p2_16 = _mm_unpacklo_epi8(q2p2, zero);
            p1_16 = _mm_unpacklo_epi8(q1p1, zero);
            p0_16 = _mm_unpacklo_epi8(q0p0, zero);
            q0_16 = _mm_unpackhi_epi8(q0p0, zero);
            q1_16 = _mm_unpackhi_epi8(q1p1, zero);
            q2_16 = _mm_unpackhi_epi8(q2p2, zero);
            q3_16 = _mm_unpackhi_epi8(q3p3, zero);
            q4_16 = _mm_unpackhi_epi8(q4p4, zero);
            q5_16 = _mm_unpackhi_epi8(q5p5, zero);
            q6_16 = _mm_unpackhi_epi8(q6p6, zero);
            q7_16 = _mm_unpackhi_epi8(q7p7, zero);

            pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                    _mm_add_epi16(p4_16, p3_16));
            pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                    _mm_add_epi16(q4_16, q3_16));

            pixetFilter_p2p1p0 = _mm_add_epi16(p0_16,
                    _mm_add_epi16(p2_16, p1_16));
            pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

            pixetFilter_q2q1q0 = _mm_add_epi16(q0_16,
                    _mm_add_epi16(q2_16, q1_16));
            pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
            pixelFilter_p = _mm_add_epi16(eight,
                    _mm_add_epi16(pixelFilter_p, pixelFilter_q));
            pixetFilter_p2p1p0 = _mm_add_epi16(four,
                    _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)),
                    4);
            flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(p3_16, p0_16)), 3);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(q3_16, q0_16)), 3);

            flat_q0p0 = _mm_packus_epi16(res_p, res_q);

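            /* In scalar terms the two results above are, per pixel,
             *   wide:   (p7 + p6 + p5 + p4 + p3 + p2 + p1 + 2*p0
             *            + q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4
             *   narrow: (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3
             * (and symmetrically for the q side). */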
            sum_p7 = _mm_add_epi16(p7_16, p7_16);
            sum_q7 = _mm_add_epi16(q7_16, q7_16);
            sum_p3 = _mm_add_epi16(p3_16, p3_16);
            sum_q3 = _mm_add_epi16(q3_16, q3_16);

            pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)),
                    4);
            flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(sum_p3, p1_16)), 3);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_q2q1q0,
                            _mm_add_epi16(sum_q3, q1_16)), 3);
            flat_q1p1 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            sum_p3 = _mm_add_epi16(sum_p3, p3_16);
            sum_q3 = _mm_add_epi16(sum_q3, q3_16);

            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)),
                    4);
            flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

            pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
            pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_p2p1p0,
                            _mm_add_epi16(sum_p3, p2_16)), 3);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixetFilter_q2q1q0,
                            _mm_add_epi16(sum_q3, q2_16)), 3);
            flat_q2p2 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)),
                    4);
            flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)),
                    4);
            flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)),
                    4);
            flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

            sum_p7 = _mm_add_epi16(sum_p7, p7_16);
            sum_q7 = _mm_add_epi16(sum_q7, q7_16);
            pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
            pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
            res_p = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)),
                    4);
            res_q = _mm_srli_epi16(
                    _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)),
                    4);
            flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
        }
        // wide flat
        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        flat = _mm_shuffle_epi32(flat, 68);
        flat2 = _mm_shuffle_epi32(flat2, 68);
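        /* Shuffle imm 68 (0b01000100) copies the low 64 bits of each mask
         * into the high half, so flat/flat2 cover both the p rows (low) and
         * the q rows (high) of the packed qXpX registers.  The andnot/and/or
         * triples below are byte-wise selects: mask ? filtered : original. */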

        q2p2 = _mm_andnot_si128(flat, q2p2);
        flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
        q2p2 = _mm_or_si128(q2p2, flat_q2p2);

        qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
        flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
        q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

        qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
        flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
        q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

        q6p6 = _mm_andnot_si128(flat2, q6p6);
        flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
        q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
        _mm_storel_epi64((__m128i *) (s - 7 * p), q6p6);
        _mm_storeh_pi((__m64 *) (s + 6 * p), _mm_castsi128_ps(q6p6));

        q5p5 = _mm_andnot_si128(flat2, q5p5);
        flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
        q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
        _mm_storel_epi64((__m128i *) (s - 6 * p), q5p5);
        _mm_storeh_pi((__m64 *) (s + 5 * p), _mm_castsi128_ps(q5p5));

        q4p4 = _mm_andnot_si128(flat2, q4p4);
        flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
        q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
        _mm_storel_epi64((__m128i *) (s - 5 * p), q4p4);
        _mm_storeh_pi((__m64 *) (s + 4 * p), _mm_castsi128_ps(q4p4));

        q3p3 = _mm_andnot_si128(flat2, q3p3);
        flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
        q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
        _mm_storel_epi64((__m128i *) (s - 4 * p), q3p3);
        _mm_storeh_pi((__m64 *) (s + 3 * p), _mm_castsi128_ps(q3p3));

        q2p2 = _mm_andnot_si128(flat2, q2p2);
        flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
        q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
        _mm_storel_epi64((__m128i *) (s - 3 * p), q2p2);
        _mm_storeh_pi((__m64 *) (s + 2 * p), _mm_castsi128_ps(q2p2));

        q1p1 = _mm_andnot_si128(flat2, q1p1);
        flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
        q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
        _mm_storel_epi64((__m128i *) (s - 2 * p), q1p1);
        _mm_storeh_pi((__m64 *) (s + 1 * p), _mm_castsi128_ps(q1p1));

        q0p0 = _mm_andnot_si128(flat2, q0p0);
        flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
        q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
        _mm_storel_epi64((__m128i *) (s - 1 * p), q0p0);
        _mm_storeh_pi((__m64 *) (s - 0 * p), _mm_castsi128_ps(q0p0));
    }
}

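/* Same wide horizontal loop filter, but for a 16-pixel-wide edge: each row
 * is handled with full 128-bit unaligned loads/stores, and the 16-bit
 * arithmetic of the flat/wide-flat filters is done in 256-bit AVX2
 * registers.  (Descriptive comment added for readability.) */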
static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
        const unsigned char *_blimit, const unsigned char *_limit,
        const unsigned char *_thresh) {
    __m128i mask, hev, flat, flat2;
    const __m128i zero = _mm_set1_epi16(0);
    const __m128i one = _mm_set1_epi8(1);
    __m128i p7, p6, p5;
    __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
    __m128i q5, q6, q7;

    const __m128i thresh = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _thresh[0]));
    const __m128i limit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _limit[0]));
    const __m128i blimit = _mm_broadcastb_epi8(
            _mm_cvtsi32_si128((int) _blimit[0]));

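    /* p4..q4 are needed for the mask and flat decisions, so they are loaded
     * up front; p5..p7 and q5..q7 are loaded later, only once the wide
     * (flat2) decision requires them. */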
    p4 = _mm_loadu_si128((__m128i *) (s - 5 * p));
    p3 = _mm_loadu_si128((__m128i *) (s - 4 * p));
    p2 = _mm_loadu_si128((__m128i *) (s - 3 * p));
    p1 = _mm_loadu_si128((__m128i *) (s - 2 * p));
    p0 = _mm_loadu_si128((__m128i *) (s - 1 * p));
    q0 = _mm_loadu_si128((__m128i *) (s - 0 * p));
    q1 = _mm_loadu_si128((__m128i *) (s + 1 * p));
    q2 = _mm_loadu_si128((__m128i *) (s + 2 * p));
    q3 = _mm_loadu_si128((__m128i *) (s + 3 * p));
    q4 = _mm_loadu_si128((__m128i *) (s + 4 * p));

    {
        const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
                _mm_subs_epu8(p0, p1));
        const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
                _mm_subs_epu8(q0, q1));
        const __m128i fe = _mm_set1_epi8(0xfe);
        const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
        __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
                _mm_subs_epu8(q0, p0));
        __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
                _mm_subs_epu8(q1, p1));
        __m128i work;
        flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
        hev = _mm_subs_epu8(flat, thresh);
        hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

        abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
        abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
        mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
        mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
        // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
        mask = _mm_max_epu8(flat, mask);
        // mask |= (abs(p1 - p0) > limit) * -1;
        // mask |= (abs(q1 - q0) > limit) * -1;
        work = _mm_max_epu8(
                _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
                _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
        mask = _mm_max_epu8(work, mask);
        work = _mm_max_epu8(
                _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
                _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
        mask = _mm_max_epu8(work, mask);
        mask = _mm_subs_epu8(mask, limit);
        mask = _mm_cmpeq_epi8(mask, zero);
    }

    // lp filter
    {
        const __m128i t4 = _mm_set1_epi8(4);
        const __m128i t3 = _mm_set1_epi8(3);
        const __m128i t80 = _mm_set1_epi8(0x80);
        const __m128i te0 = _mm_set1_epi8(0xe0);
        const __m128i t1f = _mm_set1_epi8(0x1f);
        const __m128i t1 = _mm_set1_epi8(0x1);
        const __m128i t7f = _mm_set1_epi8(0x7f);

        __m128i ps1 = _mm_xor_si128(p1, t80);
        __m128i ps0 = _mm_xor_si128(p0, t80);
        __m128i qs0 = _mm_xor_si128(q0, t80);
        __m128i qs1 = _mm_xor_si128(q1, t80);
        __m128i filt;
        __m128i work_a;
        __m128i filter1, filter2;
        __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
                flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4,
                flat2_q5, flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1,
                flat_q2;

        filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
        work_a = _mm_subs_epi8(qs0, ps0);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        filt = _mm_adds_epi8(filt, work_a);
        /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
        filt = _mm_and_si128(filt, mask);

        filter1 = _mm_adds_epi8(filt, t4);
        filter2 = _mm_adds_epi8(filt, t3);

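        /* Signed >> 3 on bytes is emulated here: a 16-bit logical shift,
         * keep the low 5 bits of each byte (t1f), then OR the high bits
         * (te0) back in wherever the original value was negative. */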
        /* Filter1 >> 3 */
        work_a = _mm_cmpgt_epi8(zero, filter1);
        filter1 = _mm_srli_epi16(filter1, 3);
        work_a = _mm_and_si128(work_a, te0);
        filter1 = _mm_and_si128(filter1, t1f);
        filter1 = _mm_or_si128(filter1, work_a);
        qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

        /* Filter2 >> 3 */
        work_a = _mm_cmpgt_epi8(zero, filter2);
        filter2 = _mm_srli_epi16(filter2, 3);
        work_a = _mm_and_si128(work_a, te0);
        filter2 = _mm_and_si128(filter2, t1f);
        filter2 = _mm_or_si128(filter2, work_a);
        ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

        /* filt >> 1 */
        filt = _mm_adds_epi8(filter1, t1);
        work_a = _mm_cmpgt_epi8(zero, filt);
        filt = _mm_srli_epi16(filt, 1);
        work_a = _mm_and_si128(work_a, t80);
        filt = _mm_and_si128(filt, t7f);
        filt = _mm_or_si128(filt, work_a);
        filt = _mm_andnot_si128(hev, filt);
        ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
        qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
        // loopfilter done

        {
            __m128i work;
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
                    _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
            flat = _mm_max_epu8(work, flat);
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
                    _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
            flat = _mm_max_epu8(work, flat);
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
                    _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
            flat = _mm_subs_epu8(flat, one);
            flat = _mm_cmpeq_epi8(flat, zero);
            flat = _mm_and_si128(flat, mask);

            p5 = _mm_loadu_si128((__m128i *) (s - 6 * p));
            q5 = _mm_loadu_si128((__m128i *) (s + 5 * p));
            flat2 = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
                    _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

            flat2 = _mm_max_epu8(work, flat2);
            p6 = _mm_loadu_si128((__m128i *) (s - 7 * p));
            q6 = _mm_loadu_si128((__m128i *) (s + 6 * p));
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
                    _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

            flat2 = _mm_max_epu8(work, flat2);

            p7 = _mm_loadu_si128((__m128i *) (s - 8 * p));
            q7 = _mm_loadu_si128((__m128i *) (s + 7 * p));
            work = _mm_max_epu8(
                    _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
                    _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

            flat2 = _mm_max_epu8(work, flat2);
            flat2 = _mm_subs_epu8(flat2, one);
            flat2 = _mm_cmpeq_epi8(flat2, zero);
            flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
        }

        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // flat and wide flat calculations
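        /* Here all 16 pixels of each row are widened to 16 bits at once with
         * _mm256_cvtepu8_epi16, so the running-sum filters run in a single
         * 256-bit register per row instead of two 128-bit halves as in the
         * 8-pixel variant above. */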
        {
            const __m256i eight = _mm256_set1_epi16(8);
            const __m256i four = _mm256_set1_epi16(4);
            __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4,
                    q256_4, p256_3, q256_3, p256_2, q256_2, p256_1, q256_1,
                    p256_0, q256_0;
            __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
                    pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p,
                    res_q;

            p256_7 = _mm256_cvtepu8_epi16(p7);
            p256_6 = _mm256_cvtepu8_epi16(p6);
            p256_5 = _mm256_cvtepu8_epi16(p5);
            p256_4 = _mm256_cvtepu8_epi16(p4);
            p256_3 = _mm256_cvtepu8_epi16(p3);
            p256_2 = _mm256_cvtepu8_epi16(p2);
            p256_1 = _mm256_cvtepu8_epi16(p1);
            p256_0 = _mm256_cvtepu8_epi16(p0);
            q256_0 = _mm256_cvtepu8_epi16(q0);
            q256_1 = _mm256_cvtepu8_epi16(q1);
            q256_2 = _mm256_cvtepu8_epi16(q2);
            q256_3 = _mm256_cvtepu8_epi16(q3);
            q256_4 = _mm256_cvtepu8_epi16(q4);
            q256_5 = _mm256_cvtepu8_epi16(q5);
            q256_6 = _mm256_cvtepu8_epi16(q6);
            q256_7 = _mm256_cvtepu8_epi16(q7);

            pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                    _mm256_add_epi16(p256_4, p256_3));
            pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
                    _mm256_add_epi16(q256_4, q256_3));

            pixetFilter_p2p1p0 = _mm256_add_epi16(p256_0,
                    _mm256_add_epi16(p256_2, p256_1));
            pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

            pixetFilter_q2q1q0 = _mm256_add_epi16(q256_0,
                    _mm256_add_epi16(q256_2, q256_1));
            pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);

            pixelFilter_p = _mm256_add_epi16(eight,
                    _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

            pixetFilter_p2p1p0 = _mm256_add_epi16(four,
                    _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(p256_7, p256_0)), 4);

            flat2_p0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));

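            /* packus on (res, res) leaves the 16 result bytes split across
             * the two 128-bit lanes (qwords 0 and 2); permute4x64 with
             * imm 168 (0b10101000) moves qword 2 next to qword 0 so the
             * cast to __m128i yields all 16 packed pixels in order. */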
            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(q256_7, q256_0)), 4);
            flat2_q0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(p256_3, p256_0)), 3);
            flat_p0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));
            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(q256_3, q256_0)), 3);
            flat_q0 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(p256_7, p256_7);
            sum_q7 = _mm256_add_epi16(q256_7, q256_7);
            sum_p3 = _mm256_add_epi16(p256_3, p256_3);
            sum_q3 = _mm256_add_epi16(q256_3, q256_3);

            pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);
            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);
            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_1)), 4);
            flat2_p1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));
            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_1)), 4);
            flat2_q1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);
            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);
            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(sum_p3, p256_1)), 3);
            flat_p1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));
            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_q2q1q0,
                            _mm256_add_epi16(sum_q3, q256_1)), 3);
            flat_q1 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
            sum_p3 = _mm256_add_epi16(sum_p3, p256_3);
            sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);
            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);
            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_2)), 4);
            flat2_p2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));
            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_2)), 4);
            flat2_q2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);
            pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);
            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_p2p1p0,
                            _mm256_add_epi16(sum_p3, p256_2)), 3);
            flat_p2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));
            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixetFilter_q2q1q0,
                            _mm256_add_epi16(sum_q3, q256_2)), 3);
            flat_q2 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);
            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);
            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_3)), 4);
            flat2_p3 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));
            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_3)), 4);
            flat2_q3 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);
            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);
            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_4)), 4);
            flat2_p4 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));
            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_4)), 4);
            flat2_q4 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);
            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);
            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_5)), 4);
            flat2_p5 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));
            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_5)), 4);
            flat2_q5 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));

            sum_p7 = _mm256_add_epi16(sum_p7, p256_7);
            sum_q7 = _mm256_add_epi16(sum_q7, q256_7);
            pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);
            pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);
            res_p = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_p,
                            _mm256_add_epi16(sum_p7, p256_6)), 4);
            flat2_p6 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p),
                            168));
            res_q = _mm256_srli_epi16(
                    _mm256_add_epi16(pixelFilter_q,
                            _mm256_add_epi16(sum_q7, q256_6)), 4);
            flat2_q6 = _mm256_castsi256_si128(
                    _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q),
                            168));
        }

        // wide flat
        // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

        p2 = _mm_andnot_si128(flat, p2);
        flat_p2 = _mm_and_si128(flat, flat_p2);
        p2 = _mm_or_si128(flat_p2, p2);

        p1 = _mm_andnot_si128(flat, ps1);
        flat_p1 = _mm_and_si128(flat, flat_p1);
        p1 = _mm_or_si128(flat_p1, p1);

        p0 = _mm_andnot_si128(flat, ps0);
        flat_p0 = _mm_and_si128(flat, flat_p0);
        p0 = _mm_or_si128(flat_p0, p0);

        q0 = _mm_andnot_si128(flat, qs0);
        flat_q0 = _mm_and_si128(flat, flat_q0);
        q0 = _mm_or_si128(flat_q0, q0);

        q1 = _mm_andnot_si128(flat, qs1);
        flat_q1 = _mm_and_si128(flat, flat_q1);
        q1 = _mm_or_si128(flat_q1, q1);

        q2 = _mm_andnot_si128(flat, q2);
        flat_q2 = _mm_and_si128(flat, flat_q2);
        q2 = _mm_or_si128(flat_q2, q2);

        p6 = _mm_andnot_si128(flat2, p6);
        flat2_p6 = _mm_and_si128(flat2, flat2_p6);
        p6 = _mm_or_si128(flat2_p6, p6);
        _mm_storeu_si128((__m128i *) (s - 7 * p), p6);

        p5 = _mm_andnot_si128(flat2, p5);
        flat2_p5 = _mm_and_si128(flat2, flat2_p5);
        p5 = _mm_or_si128(flat2_p5, p5);
        _mm_storeu_si128((__m128i *) (s - 6 * p), p5);

        p4 = _mm_andnot_si128(flat2, p4);
        flat2_p4 = _mm_and_si128(flat2, flat2_p4);
        p4 = _mm_or_si128(flat2_p4, p4);
        _mm_storeu_si128((__m128i *) (s - 5 * p), p4);

        p3 = _mm_andnot_si128(flat2, p3);
        flat2_p3 = _mm_and_si128(flat2, flat2_p3);
        p3 = _mm_or_si128(flat2_p3, p3);
        _mm_storeu_si128((__m128i *) (s - 4 * p), p3);

        p2 = _mm_andnot_si128(flat2, p2);
        flat2_p2 = _mm_and_si128(flat2, flat2_p2);
        p2 = _mm_or_si128(flat2_p2, p2);
        _mm_storeu_si128((__m128i *) (s - 3 * p), p2);

        p1 = _mm_andnot_si128(flat2, p1);
        flat2_p1 = _mm_and_si128(flat2, flat2_p1);
        p1 = _mm_or_si128(flat2_p1, p1);
        _mm_storeu_si128((__m128i *) (s - 2 * p), p1);

        p0 = _mm_andnot_si128(flat2, p0);
        flat2_p0 = _mm_and_si128(flat2, flat2_p0);
        p0 = _mm_or_si128(flat2_p0, p0);
        _mm_storeu_si128((__m128i *) (s - 1 * p), p0);

        q0 = _mm_andnot_si128(flat2, q0);
        flat2_q0 = _mm_and_si128(flat2, flat2_q0);
        q0 = _mm_or_si128(flat2_q0, q0);
        _mm_storeu_si128((__m128i *) (s - 0 * p), q0);

        q1 = _mm_andnot_si128(flat2, q1);
        flat2_q1 = _mm_and_si128(flat2, flat2_q1);
        q1 = _mm_or_si128(flat2_q1, q1);
        _mm_storeu_si128((__m128i *) (s + 1 * p), q1);

        q2 = _mm_andnot_si128(flat2, q2);
        flat2_q2 = _mm_and_si128(flat2, flat2_q2);
        q2 = _mm_or_si128(flat2_q2, q2);
        _mm_storeu_si128((__m128i *) (s + 2 * p), q2);

        q3 = _mm_andnot_si128(flat2, q3);
        flat2_q3 = _mm_and_si128(flat2, flat2_q3);
        q3 = _mm_or_si128(flat2_q3, q3);
        _mm_storeu_si128((__m128i *) (s + 3 * p), q3);

        q4 = _mm_andnot_si128(flat2, q4);
        flat2_q4 = _mm_and_si128(flat2, flat2_q4);
        q4 = _mm_or_si128(flat2_q4, q4);
        _mm_storeu_si128((__m128i *) (s + 4 * p), q4);

        q5 = _mm_andnot_si128(flat2, q5);
        flat2_q5 = _mm_and_si128(flat2, flat2_q5);
        q5 = _mm_or_si128(flat2_q5, q5);
        _mm_storeu_si128((__m128i *) (s + 5 * p), q5);

        q6 = _mm_andnot_si128(flat2, q6);
        flat2_q6 = _mm_and_si128(flat2, flat2_q6);
        q6 = _mm_or_si128(flat2_q6, q6);
        _mm_storeu_si128((__m128i *) (s + 6 * p), q6);
    }
}

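/* Public entry point for the wide (super-block) horizontal loop filter.
 * count selects the edge width: 1 filters 8 columns, any other value
 * filters 16 columns.  A minimal usage sketch, assuming a frame buffer
 * `buf` with row stride `stride` and threshold vectors taken from the
 * caller's loop-filter parameter tables:
 *
 *     vp9_lpf_horizontal_16_avx2(buf + row * stride + col, stride,
 *                                lfi->mblim, lfi->lim, lfi->hev_thr, 1);
 *
 * (`lfi` is hypothetical shorthand for a loop-filter-threshold struct; the
 * exact field names depend on the caller.) */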
void vp9_lpf_horizontal_16_avx2(unsigned char *s, int p,
        const unsigned char *_blimit, const unsigned char *_limit,
        const unsigned char *_thresh, int count) {
    if (count == 1)
        mb_lpf_horizontal_edge_w_avx2_8(s, p, _blimit, _limit, _thresh);
    else
        mb_lpf_horizontal_edge_w_avx2_16(s, p, _blimit, _limit, _thresh);
}