/*
 * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <immintrin.h> /* AVX2 */

#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/mem.h"

void vpx_lpf_horizontal_16_avx2(unsigned char *s, int p,
                                const unsigned char *_blimit,
                                const unsigned char *_limit,
                                const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  // Each q*p* register packs a p-side row in its low 64 bits and the
  // matching q-side row in its high 64 bits, so one 128-bit operation
  // filters both sides of the edge at once.
  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
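  // hev marks pixels whose edge variance (|p1 - p0| or |q1 - q0|) exceeds
  // thresh; for those pixels the 4-tap filter below keeps the p1/q1
  // difference term and leaves p1/q1 themselves unmodified.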
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;
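
      // Widen each half-register row to 16-bit words (p side from the low
      // half, q side from the high half) so the multi-pixel sums below
      // cannot overflow 8 bits.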
      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);
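
      // flat_q2p2 above is the last 7-tap output. Each remaining step
      // slides the 15-tap window outward: add another copy of p7 (or q7)
      // and drop the farthest remaining tap from the other side of the edge.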
      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);
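
    // flat and flat2 were reduced into the low 64 bits; the shuffles above
    // duplicate that half so the masks cover both rows of the doubled
    // layout. Each row below is a three-way select: the 15-tap result where
    // flat2 is set, the 7-tap result where only flat is set, and the 4-tap
    // result (or the unfiltered pixel) elsewhere.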
    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}

DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
  0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
  8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
};

void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int p,
                                     const unsigned char *_blimit,
                                     const unsigned char *_limit,
                                     const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
      p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  // Each 16-byte row is broadcast into both lanes of a 256-bit register;
  // the low lane feeds the 128-bit mask and 4-tap math below, while both
  // lanes feed the lane-wise byte-to-word shuffle in the wide-flat section.
  p256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
  p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
  q256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));

  p4 = _mm256_castsi256_si128(p256_4);
  p3 = _mm256_castsi256_si128(p256_3);
  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);
  q3 = _mm256_castsi256_si128(q256_3);
  q4 = _mm256_castsi256_si128(q256_4);

  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
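  // There is no packed 8-bit arithmetic shift, so the 4-tap filter below
  // shifts Filter1/Filter2 as 16-bit words and patches the sign bits back
  // in with the te0/t1f/t80/t7f masks.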
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
        flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
        flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
          _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
          _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
          _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);
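
      // Extend the flatness check to p5..p7/q5..q7 for the wide filter
      // (the p4/q4 work value above is folded into flat2 below). New rows
      // are broadcast into both lanes so they are ready for the wide-flat
      // arithmetic.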
      p256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
      q256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
      p5 = _mm256_castsi256_si128(p256_5);
      q5 = _mm256_castsi256_si128(q256_5);
      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
          _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

      flat2 = _mm_max_epu8(work, flat2);
      p256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
      q256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
      p6 = _mm256_castsi256_si128(p256_6);
      q6 = _mm256_castsi256_si128(q256_6);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
          _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

      flat2 = _mm_max_epu8(work, flat2);

      p256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
      q256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
      p7 = _mm256_castsi256_si128(p256_7);
      q7 = _mm256_castsi256_si128(q256_7);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
          _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
          pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      // In each lane the shuffle interleaves row bytes with zero (an index
      // with the high bit set yields 0), converting the duplicated 16-byte
      // row into sixteen 16-bit words across the two lanes.
      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
      p256_7 = _mm256_shuffle_epi8(p256_7, filter);
      p256_6 = _mm256_shuffle_epi8(p256_6, filter);
      p256_5 = _mm256_shuffle_epi8(p256_5, filter);
      p256_4 = _mm256_shuffle_epi8(p256_4, filter);
      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
      q256_4 = _mm256_shuffle_epi8(q256_4, filter);
      q256_5 = _mm256_shuffle_epi8(q256_5, filter);
      q256_6 = _mm256_shuffle_epi8(q256_6, filter);
      q256_7 = _mm256_shuffle_epi8(q256_7, filter);

      pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                                       _mm256_add_epi16(p256_4, p256_3));
      pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
                                       _mm256_add_epi16(q256_4, q256_3));

      pixetFilter_p2p1p0 =
          _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 =
          _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
      pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
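
      // pixelFilter_p/q now hold the seven-pixel sums p6+...+p0 and
      // q6+...+q0, and pixetFilter_p2p1p0/q2q1q0 the three-pixel sums.
      // With the rounding terms added below, every 15-tap output becomes a
      // (sum + 8) >> 4 and every 7-tap output a (sum + 4) >> 3.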
      pixelFilter_p = _mm256_add_epi16(
          eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

      pixetFilter_p2p1p0 = _mm256_add_epi16(
          four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);

      flat2_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);

      flat2_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(p256_3, p256_0)),
                            3);

      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(q256_3, q256_0)),
                            3);

      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(p256_7, p256_7);

      sum_q7 = _mm256_add_epi16(q256_7, q256_7);

      sum_p3 = _mm256_add_epi16(p256_3, p256_3);

      sum_q3 = _mm256_add_epi16(q256_3, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);

      flat2_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);

      flat2_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_1)),
                            3);

      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_1)),
                            3);

      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      sum_p3 = _mm256_add_epi16(sum_p3, p256_3);

      sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);

      flat2_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);

      flat2_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
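
      // flat_p2/flat_q2 below are the last 7-tap outputs; the remaining
      // steps only keep sliding the 15-tap window outward for p3..p6 and
      // q3..q6.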
      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_2)),
                            3);

      flat_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_2)),
                            3);

      flat_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);

      flat2_p3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);

      flat2_q3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);

      flat2_p4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);

      flat2_q4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);

      flat2_p5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);

      flat2_q5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);

      flat2_p6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);

      flat2_q6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
    }

    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
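
    // Final per-pixel selection, as in the 8-wide kernel: the 15-tap result
    // where flat2 is set, the 7-tap result where only flat is set, and the
    // 4-tap result (or the unfiltered pixel) elsewhere; each row is written
    // back with a full 16-byte store.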
    p2 = _mm_andnot_si128(flat, p2);
    flat_p2 = _mm_and_si128(flat, flat_p2);
    p2 = _mm_or_si128(flat_p2, p2);

    p1 = _mm_andnot_si128(flat, ps1);
    flat_p1 = _mm_and_si128(flat, flat_p1);
    p1 = _mm_or_si128(flat_p1, p1);

    p0 = _mm_andnot_si128(flat, ps0);
    flat_p0 = _mm_and_si128(flat, flat_p0);
    p0 = _mm_or_si128(flat_p0, p0);

    q0 = _mm_andnot_si128(flat, qs0);
    flat_q0 = _mm_and_si128(flat, flat_q0);
    q0 = _mm_or_si128(flat_q0, q0);

    q1 = _mm_andnot_si128(flat, qs1);
    flat_q1 = _mm_and_si128(flat, flat_q1);
    q1 = _mm_or_si128(flat_q1, q1);

    q2 = _mm_andnot_si128(flat, q2);
    flat_q2 = _mm_and_si128(flat, flat_q2);
    q2 = _mm_or_si128(flat_q2, q2);

    p6 = _mm_andnot_si128(flat2, p6);
    flat2_p6 = _mm_and_si128(flat2, flat2_p6);
    p6 = _mm_or_si128(flat2_p6, p6);
    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

    p5 = _mm_andnot_si128(flat2, p5);
    flat2_p5 = _mm_and_si128(flat2, flat2_p5);
    p5 = _mm_or_si128(flat2_p5, p5);
    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

    p4 = _mm_andnot_si128(flat2, p4);
    flat2_p4 = _mm_and_si128(flat2, flat2_p4);
    p4 = _mm_or_si128(flat2_p4, p4);
    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

    p3 = _mm_andnot_si128(flat2, p3);
    flat2_p3 = _mm_and_si128(flat2, flat2_p3);
    p3 = _mm_or_si128(flat2_p3, p3);
    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

    p2 = _mm_andnot_si128(flat2, p2);
    flat2_p2 = _mm_and_si128(flat2, flat2_p2);
    p2 = _mm_or_si128(flat2_p2, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    p1 = _mm_andnot_si128(flat2, p1);
    flat2_p1 = _mm_and_si128(flat2, flat2_p1);
    p1 = _mm_or_si128(flat2_p1, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    p0 = _mm_andnot_si128(flat2, p0);
    flat2_p0 = _mm_and_si128(flat2, flat2_p0);
    p0 = _mm_or_si128(flat2_p0, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    q0 = _mm_andnot_si128(flat2, q0);
    flat2_q0 = _mm_and_si128(flat2, flat2_q0);
    q0 = _mm_or_si128(flat2_q0, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    q1 = _mm_andnot_si128(flat2, q1);
    flat2_q1 = _mm_and_si128(flat2, flat2_q1);
    q1 = _mm_or_si128(flat2_q1, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    q2 = _mm_andnot_si128(flat2, q2);
    flat2_q2 = _mm_and_si128(flat2, flat2_q2);
    q2 = _mm_or_si128(flat2_q2, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    q3 = _mm_andnot_si128(flat2, q3);
    flat2_q3 = _mm_and_si128(flat2, flat2_q3);
    q3 = _mm_or_si128(flat2_q3, q3);
    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

    q4 = _mm_andnot_si128(flat2, q4);
    flat2_q4 = _mm_and_si128(flat2, flat2_q4);
    q4 = _mm_or_si128(flat2_q4, q4);
    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

    q5 = _mm_andnot_si128(flat2, q5);
    flat2_q5 = _mm_and_si128(flat2, flat2_q5);
    q5 = _mm_or_si128(flat2_q5, q5);
    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

    q6 = _mm_andnot_si128(flat2, q6);
    flat2_q6 = _mm_and_si128(flat2, flat2_q6);
    q6 = _mm_or_si128(flat2_q6, q6);
    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
  }
}