/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/

#ifndef __SWR_SIMDINTRIN_H__
#define __SWR_SIMDINTRIN_H__

#include "os.h"

#include <cassert>

#include <emmintrin.h>
#include <immintrin.h>
#include <xmmintrin.h>

#if KNOB_SIMD_WIDTH == 8
typedef __m256 simdscalar;
typedef __m256i simdscalari;
typedef uint8_t simdmask;
#else
#error Unsupported vector width
#endif

// simd vector
OSALIGNSIMD(union) simdvector
{
    simdscalar v[4];
    struct
    {
        simdscalar x, y, z, w;
    };

    simdscalar& operator[] (const int i) { return v[i]; }
    const simdscalar& operator[] (const int i) const { return v[i]; }
};
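
// Example (illustrative only -- "ExampleSwapXY" is a hypothetical helper, not
// part of the SWR API): the anonymous struct and the v[] array alias the same
// storage, so components can be addressed by name or by index interchangeably.
INLINE void ExampleSwapXY(simdvector& v)
{
    simdscalar tmp = v.x;   // v.x aliases v[0]
    v.x = v[1];             // v.y aliases v[1]
    v.y = tmp;
}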

#if KNOB_SIMD_WIDTH == 8
#define _simd128_maskstore_ps _mm_maskstore_ps
#define _simd_load_ps _mm256_load_ps
#define _simd_load1_ps _mm256_broadcast_ss
#define _simd_loadu_ps _mm256_loadu_ps
#define _simd_setzero_ps _mm256_setzero_ps
#define _simd_set1_ps _mm256_set1_ps
#define _simd_blend_ps _mm256_blend_ps
#define _simd_blendv_ps _mm256_blendv_ps
#define _simd_store_ps _mm256_store_ps
#define _simd_mul_ps _mm256_mul_ps
#define _simd_add_ps _mm256_add_ps
#define _simd_sub_ps _mm256_sub_ps
#define _simd_rsqrt_ps _mm256_rsqrt_ps
#define _simd_min_ps _mm256_min_ps
#define _simd_max_ps _mm256_max_ps
#define _simd_movemask_ps _mm256_movemask_ps
#define _simd_cvtps_epi32 _mm256_cvtps_epi32
#define _simd_cvttps_epi32 _mm256_cvttps_epi32
#define _simd_cvtepi32_ps _mm256_cvtepi32_ps
#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
#define _simd_and_ps _mm256_and_ps
#define _simd_or_ps _mm256_or_ps

#define _simd_rcp_ps _mm256_rcp_ps
#define _simd_div_ps _mm256_div_ps
#define _simd_castsi_ps _mm256_castsi256_ps
#define _simd_andnot_ps _mm256_andnot_ps
#define _simd_round_ps _mm256_round_ps
#define _simd_castpd_ps _mm256_castpd_ps
#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
#define _simd_stream_ps _mm256_stream_ps

#define _simd_load_sd _mm256_load_sd
#define _simd_movemask_pd _mm256_movemask_pd
#define _simd_castsi_pd _mm256_castsi256_pd

// emulated integer simd
#define SIMD_EMU_EPI(func, intrin) \
INLINE \
__m256i func(__m256i a, __m256i b)\
{\
    __m128i aHi = _mm256_extractf128_si256(a, 1);\
    __m128i bHi = _mm256_extractf128_si256(b, 1);\
    __m128i aLo = _mm256_castsi256_si128(a);\
    __m128i bLo = _mm256_castsi256_si128(b);\
\
    __m128i subLo = intrin(aLo, bLo);\
    __m128i subHi = intrin(aHi, bHi);\
\
    __m256i result = _mm256_castsi128_si256(subLo);\
            result = _mm256_insertf128_si256(result, subHi, 1);\
\
    return result;\
}

#if (KNOB_ARCH == KNOB_ARCH_AVX)
INLINE
__m256 _simdemu_permute_ps(__m256 a, __m256i b)
{
    __m128 aHi = _mm256_extractf128_ps(a, 1);
    __m128i bHi = _mm256_extractf128_si256(b, 1);
    __m128 aLo = _mm256_castps256_ps128(a);
    __m128i bLo = _mm256_castsi256_si128(b);

    __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
    __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
    __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
    __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));

    indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
    resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
    resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
    __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));

    __m256 result = _mm256_castps128_ps256(blendLowRes);
           result = _mm256_insertf128_ps(result, blendHiRes, 1);

    return result;
}

INLINE
__m256i _simdemu_permute_epi32(__m256i a, __m256i b)
{
    return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
}
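
// Example (illustrative only -- "ExampleReverseLanes" is hypothetical): AVX1
// has no full-width cross-lane permute, so _simdemu_permute_ps permutes each
// 128-bit half separately and blends on bit 2 of each index.  Reversing the
// eight lanes of a register looks like this:
INLINE __m256 ExampleReverseLanes(__m256 a)
{
    const __m256i vReversedIndices = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    return _simdemu_permute_ps(a, vReversedIndices);   // lane i = a[7 - i]
}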

INLINE
__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
{
    int32_t aHi, aLow, countHi, countLow;
    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));

    aHi = _mm_extract_epi32(vAHi, 0);
    countHi = _mm_extract_epi32(vCountHi, 0);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 0);

    aLow = _mm_extract_epi32(vALow, 0);
    countLow = _mm_extract_epi32(vCountLow, 0);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 0);

    aHi = _mm_extract_epi32(vAHi, 1);
    countHi = _mm_extract_epi32(vCountHi, 1);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 1);

    aLow = _mm_extract_epi32(vALow, 1);
    countLow = _mm_extract_epi32(vCountLow, 1);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 1);

    aHi = _mm_extract_epi32(vAHi, 2);
    countHi = _mm_extract_epi32(vCountHi, 2);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 2);

    aLow = _mm_extract_epi32(vALow, 2);
    countLow = _mm_extract_epi32(vCountLow, 2);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 2);

    aHi = _mm_extract_epi32(vAHi, 3);
    countHi = _mm_extract_epi32(vCountHi, 3);
    aHi >>= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 3);

    aLow = _mm_extract_epi32(vALow, 3);
    countLow = _mm_extract_epi32(vCountLow, 3);
    aLow >>= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 3);

    __m256i ret = _mm256_set1_epi32(0);
    ret = _mm256_insertf128_si256(ret, vAHi, 1);
    ret = _mm256_insertf128_si256(ret, vALow, 0);
    return ret;
}


INLINE
__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
{
    int32_t aHi, aLow, countHi, countLow;
    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));

    aHi = _mm_extract_epi32(vAHi, 0);
    countHi = _mm_extract_epi32(vCountHi, 0);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 0);

    aLow = _mm_extract_epi32(vALow, 0);
    countLow = _mm_extract_epi32(vCountLow, 0);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 0);

    aHi = _mm_extract_epi32(vAHi, 1);
    countHi = _mm_extract_epi32(vCountHi, 1);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 1);

    aLow = _mm_extract_epi32(vALow, 1);
    countLow = _mm_extract_epi32(vCountLow, 1);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 1);

    aHi = _mm_extract_epi32(vAHi, 2);
    countHi = _mm_extract_epi32(vCountHi, 2);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 2);

    aLow = _mm_extract_epi32(vALow, 2);
    countLow = _mm_extract_epi32(vCountLow, 2);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 2);

    aHi = _mm_extract_epi32(vAHi, 3);
    countHi = _mm_extract_epi32(vCountHi, 3);
    aHi <<= countHi;
    vAHi = _mm_insert_epi32(vAHi, aHi, 3);

    aLow = _mm_extract_epi32(vALow, 3);
    countLow = _mm_extract_epi32(vCountLow, 3);
    aLow <<= countLow;
    vALow = _mm_insert_epi32(vALow, aLow, 3);

    __m256i ret = _mm256_set1_epi32(0);
    ret = _mm256_insertf128_si256(ret, vAHi, 1);
    ret = _mm256_insertf128_si256(ret, vALow, 0);
    return ret;
}
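
// Example (illustrative only -- "ExampleUnpackNibbles" is hypothetical): the
// scalar extract/shift/insert sequences above give AVX1 the per-lane variable
// shifts that AVX2 provides natively, e.g. shifting lane i right by 4*i bits:
INLINE __m256i ExampleUnpackNibbles(__m256i vBits)
{
    const __m256i vShift = _mm256_set_epi32(28, 24, 20, 16, 12, 8, 4, 0);
    return _simdemu_srlv_epi32(vBits, vShift);
}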

#define _simd_mul_epi32 _simdemu_mul_epi32
#define _simd_mullo_epi32 _simdemu_mullo_epi32
#define _simd_sub_epi32 _simdemu_sub_epi32
#define _simd_sub_epi64 _simdemu_sub_epi64
#define _simd_min_epi32 _simdemu_min_epi32
#define _simd_min_epu32 _simdemu_min_epu32
#define _simd_max_epi32 _simdemu_max_epi32
#define _simd_max_epu32 _simdemu_max_epu32
#define _simd_add_epi32 _simdemu_add_epi32
#define _simd_and_si _simdemu_and_si
#define _simd_andnot_si _simdemu_andnot_si
#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
#define _simd_cmplt_epi32 _simdemu_cmplt_epi32
#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
#define _simd_or_si _simdemu_or_si
#define _simd_xor_si _simdemu_xor_si
#define _simd_castps_si _mm256_castps_si256
#define _simd_adds_epu8 _simdemu_adds_epu8
#define _simd_subs_epu8 _simdemu_subs_epu8
#define _simd_add_epi8 _simdemu_add_epi8
#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
#define _simd_movemask_epi8 _simdemu_movemask_epi8
#define _simd_permute_ps _simdemu_permute_ps
#define _simd_permute_epi32 _simdemu_permute_epi32
#define _simd_srlv_epi32 _simdemu_srlv_epi32
#define _simd_sllv_epi32 _simdemu_sllv_epi32

SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
SIMD_EMU_EPI(_simdemu_xor_si, _mm_xor_si128)
SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)

#define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
#define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
#define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
#define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
#define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))

#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))

#define _simd128_fmadd_ps _mm_fmaddemu_ps
#define _simd_fmadd_ps _mm_fmaddemu256_ps
#define _simd_fmsub_ps _mm_fmsubemu256_ps
#define _simd_shuffle_epi8 _simdemu_shuffle_epi8
SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)

INLINE
__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
{
    __m128 res = _mm_mul_ps(a, b);
    res = _mm_add_ps(res, c);
    return res;
}

INLINE
__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
{
    __m256 res = _mm256_mul_ps(a, b);
    res = _mm256_add_ps(res, c);
    return res;
}
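
// Note (illustrative only -- "ExampleMulAdd" is hypothetical): unlike a true
// fused multiply-add, the emulation above rounds twice (once after the
// multiply, once after the add), so results can differ from FMA3 hardware in
// the last ulp.  Usage through the _simd_* alias is otherwise identical:
INLINE __m256 ExampleMulAdd(__m256 vA, __m256 vX, __m256 vB)
{
    return _simd_fmadd_ps(vA, vX, vB);   // vA * vX + vB, emulated on AVX
}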

INLINE
__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
{
    __m256 res = _mm256_mul_ps(a, b);
    res = _mm256_sub_ps(res, c);
    return res;
}

INLINE
__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
{
    uint32_t *pOffsets = (uint32_t*)&vOffsets;
    simdscalar vResult;
    float* pResult = (float*)&vResult;
    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    {
        uint32_t offset = pOffsets[i];
        offset = offset * scale;
        pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
    }

    return vResult;
}

INLINE
__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
{
    uint32_t *pOffsets = (uint32_t*)&vOffsets;
    simdscalar vResult = vSrc;
    float* pResult = (float*)&vResult;
    DWORD index;
    uint32_t mask = _simd_movemask_ps(vMask);
    while (_BitScanForward(&index, mask))
    {
        mask &= ~(1 << index);
        uint32_t offset = pOffsets[index];
        offset = offset * scale;
        pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
    }

    return vResult;
}

INLINE
__m256i _simd_abs_epi32(__m256i a)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);
    __m128i absLo = _mm_abs_epi32(aLo);
    __m128i absHi = _mm_abs_epi32(aHi);
    __m256i result = _mm256_castsi128_si256(absLo);
    result = _mm256_insertf128_si256(result, absHi, 1);
    return result;
}

INLINE
int _simdemu_movemask_epi8(__m256i a)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    int resHi = _mm_movemask_epi8(aHi);
    int resLo = _mm_movemask_epi8(aLo);

    return (resHi << 16) | resLo;
}

INLINE
__m256i _simd_cvtepu8_epi16(__m128i a)
{
    __m128i resultlo = _mm_cvtepu8_epi16(a);
    __m128i resulthi = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}

INLINE
__m256i _simd_cvtepu8_epi32(__m128i a)
{
    __m128i resultlo = _mm_cvtepu8_epi32(a);
    __m128i resulthi = _mm_cvtepu8_epi32(_mm_srli_si128(a, 4));

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}

INLINE
__m256i _simd_cvtepu16_epi32(__m128i a)
{
    __m128i resultlo = _mm_cvtepu16_epi32(a);
    __m128i resulthi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}

INLINE
__m256i _simd_packus_epi16(__m256i a, __m256i b)
{
    __m128i alo = _mm256_extractf128_si256(a, 0);
    __m128i ahi = _mm256_extractf128_si256(a, 1);

    __m128i blo = _mm256_extractf128_si256(b, 0);
    __m128i bhi = _mm256_extractf128_si256(b, 1);

    __m128i resultlo = _mm_packus_epi16(alo, blo);
    __m128i resulthi = _mm_packus_epi16(ahi, bhi);

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}
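
// Example (illustrative only -- "ExampleGatherRow" is hypothetical): the
// scalar gather loops above mirror the AVX2 gather semantics; lane i reads
// *(float*)((const uint8_t*)pBase + vOffsets[i] * scale).  Gathering eight
// floats by element index therefore uses a scale of sizeof(float):
INLINE __m256 ExampleGatherRow(const float* pTable, __m256i vIndices)
{
    return _simd_i32gather_ps(pTable, vIndices, 4);
}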

INLINE
__m256i _simd_packs_epi16(__m256i a, __m256i b)
{
    __m128i alo = _mm256_extractf128_si256(a, 0);
    __m128i ahi = _mm256_extractf128_si256(a, 1);

    __m128i blo = _mm256_extractf128_si256(b, 0);
    __m128i bhi = _mm256_extractf128_si256(b, 1);

    __m128i resultlo = _mm_packs_epi16(alo, blo);
    __m128i resulthi = _mm_packs_epi16(ahi, bhi);

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}

INLINE
__m256i _simd_packus_epi32(__m256i a, __m256i b)
{
    __m128i alo = _mm256_extractf128_si256(a, 0);
    __m128i ahi = _mm256_extractf128_si256(a, 1);

    __m128i blo = _mm256_extractf128_si256(b, 0);
    __m128i bhi = _mm256_extractf128_si256(b, 1);

    __m128i resultlo = _mm_packus_epi32(alo, blo);
    __m128i resulthi = _mm_packus_epi32(ahi, bhi);

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}

INLINE
__m256i _simd_packs_epi32(__m256i a, __m256i b)
{
    __m128i alo = _mm256_extractf128_si256(a, 0);
    __m128i ahi = _mm256_extractf128_si256(a, 1);

    __m128i blo = _mm256_extractf128_si256(b, 0);
    __m128i bhi = _mm256_extractf128_si256(b, 1);

    __m128i resultlo = _mm_packs_epi32(alo, blo);
    __m128i resulthi = _mm_packs_epi32(ahi, bhi);

    __m256i result = _mm256_castsi128_si256(resultlo);

    return _mm256_insertf128_si256(result, resulthi, 1);
}

#else

#define _simd_mul_epi32 _mm256_mul_epi32
#define _simd_mullo_epi32 _mm256_mullo_epi32
#define _simd_sub_epi32 _mm256_sub_epi32
#define _simd_sub_epi64 _mm256_sub_epi64
#define _simd_min_epi32 _mm256_min_epi32
#define _simd_max_epi32 _mm256_max_epi32
#define _simd_min_epu32 _mm256_min_epu32
#define _simd_max_epu32 _mm256_max_epu32
#define _simd_add_epi32 _mm256_add_epi32
#define _simd_and_si _mm256_and_si256
#define _simd_andnot_si _mm256_andnot_si256
#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
#define _simd_or_si _mm256_or_si256
#define _simd_xor_si _mm256_xor_si256
#define _simd_castps_si _mm256_castps_si256

#define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
#define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
#define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
#define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
#define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
#define _simd_unpackhi_epi64 _mm256_unpackhi_epi64

#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
#define _simd_slli_epi32 _mm256_slli_epi32
#define _simd_srai_epi32 _mm256_srai_epi32
#define _simd_srli_epi32 _mm256_srli_epi32
#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
#define _simd128_fmadd_ps _mm_fmadd_ps
#define _simd_fmadd_ps _mm256_fmadd_ps
#define _simd_fmsub_ps _mm256_fmsub_ps
#define _simd_shuffle_epi8 _mm256_shuffle_epi8
#define _simd_adds_epu8 _mm256_adds_epu8
#define _simd_subs_epu8 _mm256_subs_epu8
#define _simd_add_epi8 _mm256_add_epi8
#define _simd_i32gather_ps _mm256_i32gather_ps
#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
#define _simd_abs_epi32 _mm256_abs_epi32

#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8
#define _simd_cmpeq_epi8 _mm256_cmpeq_epi8
#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16
#define _simd_cmpeq_epi16 _mm256_cmpeq_epi16
#define _simd_movemask_epi8 _mm256_movemask_epi8
#define _simd_permute_ps _mm256_permutevar8x32_ps
#define _simd_permute_epi32 _mm256_permutevar8x32_epi32
#define _simd_srlv_epi32 _mm256_srlv_epi32
#define _simd_sllv_epi32 _mm256_sllv_epi32
#define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16
#define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
#define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
#define _simd_packus_epi16 _mm256_packus_epi16
#define _simd_packs_epi16 _mm256_packs_epi16
#define _simd_packus_epi32 _mm256_packus_epi32
#define _simd_packs_epi32 _mm256_packs_epi32

#endif

#define _simd_unpacklo_ps _mm256_unpacklo_ps
#define _simd_unpackhi_ps _mm256_unpackhi_ps
#define _simd_unpacklo_pd _mm256_unpacklo_pd
#define _simd_unpackhi_pd _mm256_unpackhi_pd
#define _simd_insertf128_ps _mm256_insertf128_ps
#define _simd_insertf128_pd _mm256_insertf128_pd
#define _simd_insertf128_si _mm256_insertf128_si256
#define _simd_extractf128_ps _mm256_extractf128_ps
#define _simd_extractf128_pd _mm256_extractf128_pd
#define _simd_extractf128_si _mm256_extractf128_si256
#define _simd_permute2f128_ps _mm256_permute2f128_ps
#define _simd_permute2f128_pd _mm256_permute2f128_pd
#define _simd_permute2f128_si _mm256_permute2f128_si256
#define _simd_shuffle_ps _mm256_shuffle_ps
#define _simd_shuffle_pd _mm256_shuffle_pd
#define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
#define _simd_shuffle_epi64(a, b, imm8) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
#define _simd_set1_epi32 _mm256_set1_epi32
#define _simd_set_epi32 _mm256_set_epi32
#define _simd_set1_epi8 _mm256_set1_epi8
#define _simd_setzero_si _mm256_setzero_si256
#define _simd_cvttps_epi32 _mm256_cvttps_epi32
#define _simd_store_si _mm256_store_si256
#define _simd_broadcast_ss _mm256_broadcast_ss
#define _simd_maskstore_ps _mm256_maskstore_ps
#define _simd_load_si _mm256_load_si256
#define _simd_loadu_si _mm256_loadu_si256
#define _simd_sub_ps _mm256_sub_ps
#define _simd_testz_ps _mm256_testz_ps
#define _simd_xor_ps _mm256_xor_ps

INLINE
simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
{
    __m128i lo = _mm_loadu_si128(loaddr);
    __m128i hi = _mm_loadu_si128(hiaddr);

    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
}

INLINE
void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
{
    _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
    _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
}

INLINE
simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
{
    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
}

INLINE
simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
{
    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
}

// convert bitmask to vector mask
INLINE
simdscalar vMask(int32_t mask)
{
    __m256i vec = _mm256_set1_epi32(mask);
    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    vec = _simd_and_si(vec, bit);
    vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
    return _simd_castsi_ps(vec);
}
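
// Example (illustrative only -- "ExampleMaskRoundTrip" is hypothetical):
// vMask is the inverse of _simd_movemask_ps; bit i of the input becomes an
// all-ones (or all-zeros) 32-bit lane i, and the movemask recovers the bits:
INLINE int ExampleMaskRoundTrip(int32_t mask)
{
    simdscalar vm = vMask(mask & 0xFF);   // bit i set -> lane i = 0xFFFFFFFF
    return _simd_movemask_ps(vm);         // returns mask & 0xFF
}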

INLINE
void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
{
    OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
    _mm256_store_ps(rArray, r);
    _mm256_store_ps(sArray, s);
    rArray[rlane] = sArray[slane];
    r = _mm256_load_ps(rArray);
}

INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    __m128i resHi = _mm_slli_epi32(aHi, i);
    __m128i resLo = _mm_slli_epi32(aLo, i);

    __m256i result = _mm256_castsi128_si256(resLo);
    result = _mm256_insertf128_si256(result, resHi, 1);

    return result;
}

INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    __m128i resHi = _mm_srai_epi32(aHi, i);
    __m128i resLo = _mm_srai_epi32(aLo, i);

    __m256i result = _mm256_castsi128_si256(resLo);
    result = _mm256_insertf128_si256(result, resHi, 1);

    return result;
}

INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
{
    __m128i aHi = _mm256_extractf128_si256(a, 1);
    __m128i aLo = _mm256_castsi256_si128(a);

    __m128i resHi = _mm_srli_epi32(aHi, i);
    __m128i resLo = _mm_srli_epi32(aLo, i);

    __m256i result = _mm256_castsi128_si256(resLo);
    result = _mm256_insertf128_si256(result, resHi, 1);

    return result;
}

INLINE
void _simdvec_transpose(simdvector &v)
{
    SWR_ASSERT(false, "Need to implement 8 wide version");
}

#else
#error Unsupported vector width
#endif

// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
INLINE
void _simdvec_load_ps(simdvector& r, const float *p)
{
    r[0] = _simd_set1_ps(p[0]);
    r[1] = _simd_set1_ps(p[1]);
    r[2] = _simd_set1_ps(p[2]);
    r[3] = _simd_set1_ps(p[3]);
}

INLINE
void _simdvec_mov(simdvector& r, const simdscalar& s)
{
    r[0] = s;
    r[1] = s;
    r[2] = s;
    r[3] = s;
}

INLINE
void _simdvec_mov(simdvector& r, const simdvector& v)
{
    r[0] = v[0];
    r[1] = v[1];
    r[2] = v[2];
    r[3] = v[3];
}

#if 0
// just move a lane from the source simdvector to dest simdvector
INLINE
void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
{
    _simd_mov(r[0], rlane, s[0], slane);
    _simd_mov(r[1], rlane, s[1], slane);
    _simd_mov(r[2], rlane, s[2], slane);
    _simd_mov(r[3], rlane, s[3], slane);
}

#endif
INLINE
void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
    simdscalar tmp;
    r   = _simd_mul_ps(v0[0], v1[0]);  // (v0.x*v1.x)

    tmp = _simd_mul_ps(v0[1], v1[1]);  // (v0.y*v1.y)
    r   = _simd_add_ps(r, tmp);        // (v0.x*v1.x) + (v0.y*v1.y)

    tmp = _simd_mul_ps(v0[2], v1[2]);  // (v0.z*v1.z)
    r   = _simd_add_ps(r, tmp);        // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
}
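
// Example (illustrative only -- "ExampleDot3" is hypothetical): combining
// _simdvec_load_ps with the dot product above evaluates eight 3-component
// dot products against a single constant vector in one call:
INLINE simdscalar ExampleDot3(const float* pVec4, const simdvector& v)
{
    simdvector n;
    _simdvec_load_ps(n, pVec4);   // broadcast: n = xxxx yyyy zzzz wwww
    simdscalar r;
    _simdvec_dp3_ps(r, n, v);     // per lane: n.x*v.x + n.y*v.y + n.z*v.z
    return r;
}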

INLINE
void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
    simdscalar tmp;
    r   = _simd_mul_ps(v0[0], v1[0]);  // (v0.x*v1.x)

    tmp = _simd_mul_ps(v0[1], v1[1]);  // (v0.y*v1.y)
    r   = _simd_add_ps(r, tmp);        // (v0.x*v1.x) + (v0.y*v1.y)

    tmp = _simd_mul_ps(v0[2], v1[2]);  // (v0.z*v1.z)
    r   = _simd_add_ps(r, tmp);        // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)

    tmp = _simd_mul_ps(v0[3], v1[3]);  // (v0.w*v1.w)
    r   = _simd_add_ps(r, tmp);        // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
}

INLINE
simdscalar _simdvec_rcp_length_ps(const simdvector& v)
{
    simdscalar length;
    _simdvec_dp4_ps(length, v, v);
    return _simd_rsqrt_ps(length);
}

INLINE
void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
{
    simdscalar vecLength;
    vecLength = _simdvec_rcp_length_ps(v);

    r[0] = _simd_mul_ps(v[0], vecLength);
    r[1] = _simd_mul_ps(v[1], vecLength);
    r[2] = _simd_mul_ps(v[2], vecLength);
    r[3] = _simd_mul_ps(v[3], vecLength);
}

INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
{
    r[0] = _simd_mul_ps(v[0], s);
    r[1] = _simd_mul_ps(v[1], s);
    r[2] = _simd_mul_ps(v[2], s);
    r[3] = _simd_mul_ps(v[3], s);
}

INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
    r[0] = _simd_mul_ps(v0[0], v1[0]);
    r[1] = _simd_mul_ps(v0[1], v1[1]);
    r[2] = _simd_mul_ps(v0[2], v1[2]);
    r[3] = _simd_mul_ps(v0[3], v1[3]);
}

INLINE
void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
    r[0] = _simd_add_ps(v0[0], v1[0]);
    r[1] = _simd_add_ps(v0[1], v1[1]);
    r[2] = _simd_add_ps(v0[2], v1[2]);
    r[3] = _simd_add_ps(v0[3], v1[3]);
}

INLINE
void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
    r[0] = _simd_min_ps(v0[0], s);
    r[1] = _simd_min_ps(v0[1], s);
    r[2] = _simd_min_ps(v0[2], s);
    r[3] = _simd_min_ps(v0[3], s);
}

INLINE
void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
    r[0] = _simd_max_ps(v0[0], s);
    r[1] = _simd_max_ps(v0[1], s);
    r[2] = _simd_max_ps(v0[2], s);
    r[3] = _simd_max_ps(v0[3], s);
}

// Matrix4x4 * Vector4
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
INLINE
void _simd_mat4x4_vec4_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m00 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m01 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m00 * v.x) + (m01 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m02 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m03 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m10 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m11 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m10 * v.x) + (m11 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m12 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m13 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m20 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m21 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m20 * v.x) + (m21 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m22 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m23 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
    result[2] = r0;

    m = _simd_load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m30 * v.x)
    m = _simd_load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m31 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m30 * v.x) + (m31 * v.y)
    m = _simd_load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m32 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m30 * v.x) + (m31 * v.y) + (m32 * v.z)
    m = _simd_load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
    r1 = _simd_mul_ps(m, v[3]);             // (m33 * v.w)
    r0 = _simd_add_ps(r0, r1);              // (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
    result[3] = r0;
}

// Matrix4x4 (upper 3x3) * Vector3 - direction vector where w = 0.
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
// outVec.w = 0
INLINE
void _simd_mat3x3_vec3_w0_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m00 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m01 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m00 * v.x) + (m01 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m02 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m10 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m11 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m10 * v.x) + (m11 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m12 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m20 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m21 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m20 * v.x) + (m21 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m22 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
    result[2] = r0;

    result[3] = _simd_setzero_ps();
}
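
// Example (illustrative only -- "ExampleTransformPoints" is hypothetical):
// pMatrix is laid out row-major as float[16], with m[row][col] at
// pMatrix[row*4 + col], so transforming eight homogeneous points is one call:
INLINE void ExampleTransformPoints(simdvector& clip, const float* pMVP, const simdvector& pos)
{
    _simd_mat4x4_vec4_multiply(clip, pMVP, pos);   // clip = MVP * pos, per lane
}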

// Matrix4x4 * Vector3 - Position vector where w = 1.
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
INLINE
void _simd_mat4x4_vec3_w1_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m00 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m01 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m00 * v.x) + (m01 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m02 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m10 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m11 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m10 * v.x) + (m11 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m12 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m20 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m21 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m20 * v.x) + (m21 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m22 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
    result[2] = r0;

    m = _simd_load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m30 * v.x)
    m = _simd_load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m31 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m30 * v.x) + (m31 * v.y)
    m = _simd_load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m32 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m30 * v.x) + (m31 * v.y) + (m32 * v.z)
    m = _simd_load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
    result[3] = _simd_add_ps(r0, m);        // (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
}

// Matrix4x3 * Vector3 - Position vector where w = 1; the w output is forced to 1.
// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
// outVec.w = 1
INLINE
void _simd_mat4x3_vec3_w1_multiply(
    simdvector& result,
    const float *pMatrix,
    const simdvector& v)
{
    simdscalar m;
    simdscalar r0;
    simdscalar r1;

    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m00 * v.x)
    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m01 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m00 * v.x) + (m01 * v.y)
    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m02 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
    result[0] = r0;

    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m10 * v.x)
    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m11 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m10 * v.x) + (m11 * v.y)
    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m12 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
    result[1] = r0;

    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
    r0 = _simd_mul_ps(m, v[0]);             // (m20 * v.x)
    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
    r1 = _simd_mul_ps(m, v[1]);             // (m21 * v.y)
    r0 = _simd_add_ps(r0, r1);              // (m20 * v.x) + (m21 * v.y)
    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
    r1 = _simd_mul_ps(m, v[2]);             // (m22 * v.z)
    r0 = _simd_add_ps(r0, r1);              // (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
    r0 = _simd_add_ps(r0, m);               // (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
    result[2] = r0;
    result[3] = _simd_set1_ps(1.0f);
}

//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
{
    simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
    vOut = _simd_fmadd_ps(vB, vY, vOut);
    return vOut;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Compute plane equation vA * vX + vB * vY + vC
INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &vY)
{
    __m128 vOut = _simd128_fmadd_ps(vA, vX, vC);
    vOut = _simd128_fmadd_ps(vB, vY, vOut);
    return vOut;
}

//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component.
/// @param vI - barycentric I
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template<UINT Attrib, UINT Comp, UINT numComponents = 4>
static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
{
    const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
    const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
    const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];

    simdscalar vA = _simd_broadcast_ss(pInterpA);
    simdscalar vB = _simd_broadcast_ss(pInterpB);
    simdscalar vC = _simd_broadcast_ss(pInterpC);

    simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
    vC = _simd_mul_ps(vk, vC);

    return vplaneps(vA, vB, vC, vI, vJ);
}
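
// Example (illustrative only -- "ExampleInterpolateRed" is hypothetical, and
// the attribute layout is assumed): with barycentric weights i, j and
// k = 1 - i - j, the interpolated value is A*i + B*j + C*k, which the
// template above evaluates via the plane equation.  For a color stored in
// attribute 0, the red channel (component 0) of all eight fragments is:
static INLINE simdscalar ExampleInterpolateRed(simdscalar vI, simdscalar vJ, const float* pInterpBuffer)
{
    return InterpolateComponent<0, 0>(vI, vJ, pInterpBuffer);
}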

//////////////////////////////////////////////////////////////////////////
/// @brief Interpolates a single component.
/// @param vI - barycentric I
/// @param vJ - barycentric J
/// @param pInterpBuffer - pointer to attribute barycentric coeffs
template<UINT Attrib, UINT Comp, UINT numComponents = 4>
static INLINE __m128 InterpolateComponent(__m128 vI, __m128 vJ, const float *pInterpBuffer)
{
    const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
    const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
    const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];

    __m128 vA = _mm_broadcast_ss(pInterpA);
    __m128 vB = _mm_broadcast_ss(pInterpB);
    __m128 vC = _mm_broadcast_ss(pInterpC);

    __m128 vk = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), vI), vJ);
    vC = _mm_mul_ps(vk, vC);

    return vplaneps128(vA, vB, vC, vI, vJ);
}

static INLINE __m128 _simd128_abs_ps(__m128 a)
{
    __m128i ai = _mm_castps_si128(a);
    return _mm_castsi128_ps(_mm_and_si128(ai, _mm_set1_epi32(0x7fffffff)));
}

static INLINE simdscalar _simd_abs_ps(simdscalar a)
{
    simdscalari ai = _simd_castps_si(a);
    return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
}

INLINE
UINT pdep_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
    return _pdep_u32(a, mask);
#else
    UINT result = 0;

    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
    // using bsf instead of funky loop
    DWORD maskIndex;
    while (_BitScanForward(&maskIndex, mask))
    {
        // 1. isolate lowest set bit of mask
        const UINT lowest = 1 << maskIndex;

        // 2. populate LSB from src
        const UINT LSB = (UINT)((int)(a << 31) >> 31);

        // 3. copy bit from mask
        result |= LSB & lowest;

        // 4. clear lowest bit
        mask &= ~lowest;

        // 5. prepare for next iteration
        a >>= 1;
    }

    return result;
#endif
}

INLINE
UINT pext_u32(UINT a, UINT mask)
{
#if KNOB_ARCH >= KNOB_ARCH_AVX2
    return _pext_u32(a, mask);
#else
    UINT result = 0;
    DWORD maskIndex;
    uint32_t currentBit = 0;
    while (_BitScanForward(&maskIndex, mask))
    {
        // 1. isolate lowest set bit of mask
        const UINT lowest = 1 << maskIndex;

        // 2. copy bit from mask
        result |= ((a & lowest) > 0) << currentBit++;

        // 3. clear lowest bit
        mask &= ~lowest;
    }
    return result;
#endif
}

#if ENABLE_AVX512_SIMD16
#include "simd16intrin.h"
#endif // ENABLE_AVX512_SIMD16

#endif // __SWR_SIMDINTRIN_H__