/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h> /* AVX2 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

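// Filters a horizontal edge across 8 columns, reading up to eight rows above
// (p7..p0) and eight rows below (q0..q7) the edge. s points at the first row
// below the edge (q0) and p is the row stride in bytes. _blimit, _limit and
// _thresh each supply a single threshold byte that is broadcast to all lanes.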
void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

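  // Rows are 8 pixels wide here. Each qXpX register holds the pX row in its
  // low 64 bits and the qX row in its high 64 bits; pXqX is the same data
  // with the two 64-bit halves swapped.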
  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

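  // Build the filter mask (mask) and high-edge-variance mask (hev): hev
  // compares |p1 - p0| and |q1 - q0| against thresh, while mask combines the
  // blimit test on |p0 - q0| * 2 + |p1 - q1| / 2 with the limit test on the
  // neighbouring-pixel differences.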
  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8((char)0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((char)0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

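    // Signed >> 3 on bytes without an 8-bit arithmetic shift: move each byte
    // into the high half of a 16-bit word (unpack with zero), then shift
    // arithmetically right by 11 (= 8 + 3).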
    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

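    // flat is set where p1..p3 are all within 1 of p0 and q1..q3 within 1 of
    // q0 (select the flat_* outputs); flat2 additionally requires p4..p7 and
    // q4..q7 to be within 1 of p0/q0 (select the flat2_* outputs). Both are
    // ANDed with mask.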
    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

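      // Running-sum evaluation of the wide (flat2_*) and narrow (flat_*)
      // smoothing filters: pixelFilter_p/q hold the full p6..q6 neighbourhood
      // sum and pixetFilter_* the p2..q2 sum; each subsequent output adds or
      // subtracts single terms instead of re-summing everything.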
      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

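    // The mask computations above folded the q half into the low 64 bits;
    // duplicate those bits into both halves so the masks cover the p rows
    // (low) and q rows (high) of each qXpX register.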
    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

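// Shuffle control for _mm256_shuffle_epi8: within each 128-bit lane it
// interleaves eight source bytes (0..7 in the low lane, 8..15 in the high
// lane) with zeros (index 128 selects zero), so a 16-byte row broadcast into
// both lanes becomes sixteen 16-bit words.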
DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2,  128, 3,  128, 4,  128, 5,  128, 6,  128, 7,  128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};

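// Same filter as above, but 16 columns wide: whole 16-byte rows are kept in
// __m128i registers, and the flat/wide-flat arithmetic is carried out on
// 16-bit words in __m256i registers after each row is broadcast into both
// 128-bit lanes.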
void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
                                     const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
      p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  p256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
  p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
  q256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));

  p4 = _mm256_castsi256_si128(p256_4);
  p3 = _mm256_castsi256_si128(p256_3);
  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);
  q3 = _mm256_castsi256_si128(q256_3);
  q4 = _mm256_castsi256_si128(q256_4);

  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8((char)0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8((char)0x80);
    const __m128i te0 = _mm_set1_epi8((char)0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
        flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
        flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
          _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
          _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
          _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
      q256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
      p5 = _mm256_castsi256_si128(p256_5);
      q5 = _mm256_castsi256_si128(q256_5);
      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
          _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

      flat2 = _mm_max_epu8(work, flat2);
      p256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
      q256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
      p6 = _mm256_castsi256_si128(p256_6);
      q6 = _mm256_castsi256_si128(q256_6);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
          _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

      flat2 = _mm_max_epu8(work, flat2);

      p256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
      q256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
      p7 = _mm256_castsi256_si128(p256_7);
      q7 = _mm256_castsi256_si128(q256_7);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
          _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
          pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
      p256_7 = _mm256_shuffle_epi8(p256_7, filter);
      p256_6 = _mm256_shuffle_epi8(p256_6, filter);
      p256_5 = _mm256_shuffle_epi8(p256_5, filter);
      p256_4 = _mm256_shuffle_epi8(p256_4, filter);
      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
      q256_4 = _mm256_shuffle_epi8(q256_4, filter);
      q256_5 = _mm256_shuffle_epi8(q256_5, filter);
      q256_6 = _mm256_shuffle_epi8(q256_6, filter);
      q256_7 = _mm256_shuffle_epi8(q256_7, filter);

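      // Each 256-bit register now holds one row's 16 pixels widened to 16-bit
      // words. Results are packed back to bytes within each lane, and
      // _mm256_permute4x64_epi64(..., 168) (binary 10101000: 64-bit elements
      // 0 and 2 into the low half) gathers them into the low 128 bits.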
      pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                                       _mm256_add_epi16(p256_4, p256_3));
      pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
                                       _mm256_add_epi16(q256_4, q256_3));

      pixetFilter_p2p1p0 =
          _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 =
          _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
      pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);

      pixelFilter_p = _mm256_add_epi16(
          eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

      pixetFilter_p2p1p0 = _mm256_add_epi16(
          four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);

      flat2_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);

      flat2_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(p256_3, p256_0)),
                            3);

      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(q256_3, q256_0)),
                            3);

      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(p256_7, p256_7);

      sum_q7 = _mm256_add_epi16(q256_7, q256_7);

      sum_p3 = _mm256_add_epi16(p256_3, p256_3);

      sum_q3 = _mm256_add_epi16(q256_3, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);

      flat2_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);

      flat2_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_1)),
                            3);

      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_1)),
                            3);

      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      sum_p3 = _mm256_add_epi16(sum_p3, p256_3);

      sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);

      flat2_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);

      flat2_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_2)),
                            3);

      flat_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_2)),
                            3);

      flat_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);

      flat2_p3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);

      flat2_q3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);

      flat2_p4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);

      flat2_q4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);

      flat2_p5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);

      flat2_q5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);

      flat2_p6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);

      flat2_q6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
    }

    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

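    // Blend per row: take the wide-filter value where flat2 is set, else the
    // narrow-filter value where flat is set, else the loop-filter output (or
    // the unfiltered pixel), then store each full 16-byte row.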
    p2 = _mm_andnot_si128(flat, p2);
    flat_p2 = _mm_and_si128(flat, flat_p2);
    p2 = _mm_or_si128(flat_p2, p2);

    p1 = _mm_andnot_si128(flat, ps1);
    flat_p1 = _mm_and_si128(flat, flat_p1);
    p1 = _mm_or_si128(flat_p1, p1);

    p0 = _mm_andnot_si128(flat, ps0);
    flat_p0 = _mm_and_si128(flat, flat_p0);
    p0 = _mm_or_si128(flat_p0, p0);

    q0 = _mm_andnot_si128(flat, qs0);
    flat_q0 = _mm_and_si128(flat, flat_q0);
    q0 = _mm_or_si128(flat_q0, q0);

    q1 = _mm_andnot_si128(flat, qs1);
    flat_q1 = _mm_and_si128(flat, flat_q1);
    q1 = _mm_or_si128(flat_q1, q1);

    q2 = _mm_andnot_si128(flat, q2);
    flat_q2 = _mm_and_si128(flat, flat_q2);
    q2 = _mm_or_si128(flat_q2, q2);

    p6 = _mm_andnot_si128(flat2, p6);
    flat2_p6 = _mm_and_si128(flat2, flat2_p6);
    p6 = _mm_or_si128(flat2_p6, p6);
    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

    p5 = _mm_andnot_si128(flat2, p5);
    flat2_p5 = _mm_and_si128(flat2, flat2_p5);
    p5 = _mm_or_si128(flat2_p5, p5);
    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

    p4 = _mm_andnot_si128(flat2, p4);
    flat2_p4 = _mm_and_si128(flat2, flat2_p4);
    p4 = _mm_or_si128(flat2_p4, p4);
    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

    p3 = _mm_andnot_si128(flat2, p3);
    flat2_p3 = _mm_and_si128(flat2, flat2_p3);
    p3 = _mm_or_si128(flat2_p3, p3);
    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

    p2 = _mm_andnot_si128(flat2, p2);
    flat2_p2 = _mm_and_si128(flat2, flat2_p2);
    p2 = _mm_or_si128(flat2_p2, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    p1 = _mm_andnot_si128(flat2, p1);
    flat2_p1 = _mm_and_si128(flat2, flat2_p1);
    p1 = _mm_or_si128(flat2_p1, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    p0 = _mm_andnot_si128(flat2, p0);
    flat2_p0 = _mm_and_si128(flat2, flat2_p0);
    p0 = _mm_or_si128(flat2_p0, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    q0 = _mm_andnot_si128(flat2, q0);
    flat2_q0 = _mm_and_si128(flat2, flat2_q0);
    q0 = _mm_or_si128(flat2_q0, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    q1 = _mm_andnot_si128(flat2, q1);
    flat2_q1 = _mm_and_si128(flat2, flat2_q1);
    q1 = _mm_or_si128(flat2_q1, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    q2 = _mm_andnot_si128(flat2, q2);
    flat2_q2 = _mm_and_si128(flat2, flat2_q2);
    q2 = _mm_or_si128(flat2_q2, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    q3 = _mm_andnot_si128(flat2, q3);
    flat2_q3 = _mm_and_si128(flat2, flat2_q3);
    q3 = _mm_or_si128(flat2_q3, q3);
    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

    q4 = _mm_andnot_si128(flat2, q4);
    flat2_q4 = _mm_and_si128(flat2, flat2_q4);
    q4 = _mm_or_si128(flat2_q4, q4);
    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

    q5 = _mm_andnot_si128(flat2, q5);
    flat2_q5 = _mm_and_si128(flat2, flat2_q5);
    q5 = _mm_or_si128(flat2_q5, q5);
    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

    q6 = _mm_andnot_si128(flat2, q6);
    flat2_q6 = _mm_and_si128(flat2, flat2_q6);
    q6 = _mm_or_si128(flat2_q6, q6);
    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
  }
}