1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file utils.h 24 * 25 * @brief Utilities used by SWR core. 26 * 27 ******************************************************************************/ 28 #pragma once 29 30 #include <string.h> 31 #include <type_traits> 32 #include <algorithm> 33 #include "common/os.h" 34 #include "common/simdintrin.h" 35 #include "common/swr_assert.h" 36 #include "core/api.h" 37 38 #if defined(_WIN64) || defined(__x86_64__) 39 #define _MM_INSERT_EPI64 _mm_insert_epi64 40 #define _MM_EXTRACT_EPI64 _mm_extract_epi64 41 #else 42 INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) 43 { 44 OSALIGNLINE(uint32_t) elems[4]; 45 _mm_store_si128((__m128i*)elems, a); 46 if (ndx == 0) 47 { 48 uint64_t foo = elems[0]; 49 foo |= (uint64_t)elems[1] << 32; 50 return foo; 51 } 52 else 53 { 54 uint64_t foo = elems[2]; 55 foo |= (uint64_t)elems[3] << 32; 56 return foo; 57 } 58 } 59 60 INLINE __m128i _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx) 61 { 62 OSALIGNLINE(int64_t) elems[2]; 63 _mm_store_si128((__m128i*)elems, a); 64 if (ndx == 0) 65 { 66 elems[0] = b; 67 } 68 else 69 { 70 elems[1] = b; 71 } 72 __m128i out; 73 out = _mm_load_si128((const __m128i*)elems); 74 return out; 75 } 76 #endif 77 78 struct simdBBox 79 { 80 simdscalari ymin; 81 simdscalari ymax; 82 simdscalari xmin; 83 simdscalari xmax; 84 }; 85 86 INLINE 87 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) 88 { 89 __m128i row0i = _mm_castps_si128(row0); 90 __m128i row1i = _mm_castps_si128(row1); 91 __m128i row2i = _mm_castps_si128(row2); 92 __m128i row3i = _mm_castps_si128(row3); 93 94 __m128i vTemp = row2i; 95 row2i = _mm_unpacklo_epi32(row2i, row3i); 96 vTemp = _mm_unpackhi_epi32(vTemp, row3i); 97 98 row3i = row0i; 99 row0i = _mm_unpacklo_epi32(row0i, row1i); 100 row3i = _mm_unpackhi_epi32(row3i, row1i); 101 102 row1i = row0i; 103 row0i = _mm_unpacklo_epi64(row0i, row2i); 104 row1i = _mm_unpackhi_epi64(row1i, row2i); 105 106 row2i = row3i; 107 row2i = _mm_unpacklo_epi64(row2i, vTemp); 108 row3i = _mm_unpackhi_epi64(row3i, vTemp); 109 110 row0 = _mm_castsi128_ps(row0i); 111 row1 = _mm_castsi128_ps(row1i); 112 row2 = _mm_castsi128_ps(row2i); 113 row3 = _mm_castsi128_ps(row3i); 114 } 115 116 INLINE 117 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) 118 { 119 __m128i vTemp = row2; 120 row2 = _mm_unpacklo_epi32(row2, row3); 121 vTemp = _mm_unpackhi_epi32(vTemp, row3); 122 123 row3 = row0; 124 row0 = _mm_unpacklo_epi32(row0, row1); 125 row3 = _mm_unpackhi_epi32(row3, row1); 126 127 row1 = row0; 128 row0 = _mm_unpacklo_epi64(row0, row2); 129 row1 = _mm_unpackhi_epi64(row1, row2); 130 131 row2 = row3; 132 row2 = _mm_unpacklo_epi64(row2, vTemp); 133 row3 = _mm_unpackhi_epi64(row3, vTemp); 134 } 135 136 #define GCC_VERSION (__GNUC__ * 10000 \ 137 + __GNUC_MINOR__ * 100 \ 138 + __GNUC_PATCHLEVEL__) 139 140 #if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900)) 141 #define _mm_undefined_ps _mm_setzero_ps 142 #define _mm_undefined_si128 _mm_setzero_si128 143 #if KNOB_SIMD_WIDTH == 8 144 #define _mm256_undefined_ps _mm256_setzero_ps 145 #endif 146 #endif 147 148 #if KNOB_SIMD_WIDTH == 8 149 INLINE 150 void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2) 151 { 152 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 153 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5 154 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 155 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 156 157 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 158 r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps()); //y2w2y3w3 y6w6yw77 159 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 160 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 161 162 vDst[0] = _mm256_castps256_ps128(r02r1xlolo); 163 vDst[1] = _mm256_castps256_ps128(r02r1xlohi); 164 vDst[2] = _mm256_castps256_ps128(r02r1xhilo); 165 vDst[3] = _mm256_castps256_ps128(r02r1xhihi); 166 167 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); 168 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); 169 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); 170 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); 171 } 172 173 INLINE 174 void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3) 175 { 176 __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 177 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 178 __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 179 __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 180 181 r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 182 r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3) ; //y2w2y3w3 y6w6yw77 183 __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 184 __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 185 186 vDst[0] = _mm256_castps256_ps128(r02r1xlolo); 187 vDst[1] = _mm256_castps256_ps128(r02r1xlohi); 188 vDst[2] = _mm256_castps256_ps128(r02r1xhilo); 189 vDst[3] = _mm256_castps256_ps128(r02r1xhihi); 190 191 vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1); 192 vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1); 193 vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1); 194 vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1); 195 } 196 197 #if ENABLE_AVX512_SIMD16 198 INLINE 199 void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3) 200 { 201 const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking 202 203 simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r 204 simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g 205 simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b 206 simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a 207 208 simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2); 209 simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3); 210 simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2); 211 simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3); 212 213 dst[0] = _simd16_unpacklo_ps(rblo, galo); 214 dst[1] = _simd16_unpackhi_ps(rblo, galo); 215 dst[2] = _simd16_unpacklo_ps(rbhi, gahi); 216 dst[3] = _simd16_unpackhi_ps(rbhi, gahi); 217 } 218 219 #endif 220 INLINE 221 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7) 222 { 223 __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1); 224 __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1); 225 __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3); 226 __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3); 227 __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5); 228 __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5); 229 __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7); 230 __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7); 231 __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); 232 __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); 233 __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); 234 __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); 235 __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); 236 __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); 237 __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); 238 __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); 239 vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20); 240 vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20); 241 vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20); 242 vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20); 243 vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31); 244 vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31); 245 vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31); 246 vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31); 247 } 248 249 INLINE 250 void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7) 251 { 252 vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3), 253 _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7)); 254 } 255 #endif 256 257 ////////////////////////////////////////////////////////////////////////// 258 /// TranposeSingleComponent 259 ////////////////////////////////////////////////////////////////////////// 260 template<uint32_t bpp> 261 struct TransposeSingleComponent 262 { 263 ////////////////////////////////////////////////////////////////////////// 264 /// @brief Pass-thru for single component. 265 /// @param pSrc - source data in SOA form 266 /// @param pDst - output data in AOS form 267 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 268 { 269 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); 270 } 271 #if ENABLE_AVX512_SIMD16 272 273 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 274 { 275 memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8); 276 } 277 #endif 278 }; 279 280 ////////////////////////////////////////////////////////////////////////// 281 /// Transpose8_8_8_8 282 ////////////////////////////////////////////////////////////////////////// 283 struct Transpose8_8_8_8 284 { 285 ////////////////////////////////////////////////////////////////////////// 286 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 287 /// @param pSrc - source data in SOA form 288 /// @param pDst - output data in AOS form 289 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 290 { 291 simdscalari src = _simd_load_si((const simdscalari*)pSrc); 292 293 #if KNOB_SIMD_WIDTH == 8 294 #if KNOB_ARCH == KNOB_ARCH_AVX 295 __m128i c0c1 = _mm256_castsi256_si128(src); // rrrrrrrrgggggggg 296 __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1)); // bbbbbbbbaaaaaaaa 297 __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb 298 __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa 299 __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg 300 __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3); // babababababababa 301 __m128i c0123lo = _mm_unpacklo_epi16(c01, c23); // rgbargbargbargba 302 __m128i c0123hi = _mm_unpackhi_epi16(c01, c23); // rgbargbargbargba 303 _mm_store_si128((__m128i*)pDst, c0123lo); 304 _mm_store_si128((__m128i*)(pDst + 16), c0123hi); 305 #elif KNOB_ARCH == KNOB_ARCH_AVX2 306 simdscalari dst01 = _mm256_shuffle_epi8(src, 307 _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); 308 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); 309 dst23 = _mm256_shuffle_epi8(dst23, 310 _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); 311 simdscalari dst = _mm256_or_si256(dst01, dst23); 312 _simd_store_si((simdscalari*)pDst, dst); 313 #endif 314 #else 315 #error Unsupported vector width 316 #endif 317 } 318 #if ENABLE_AVX512_SIMD16 319 320 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 321 { 322 __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr 323 __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg 324 __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb 325 __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa 326 327 simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); 328 simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); 329 simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); 330 simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3); 331 332 simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); 333 simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); 334 simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); 335 336 simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); 337 338 _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba 339 } 340 #endif 341 }; 342 343 ////////////////////////////////////////////////////////////////////////// 344 /// Transpose8_8_8 345 ////////////////////////////////////////////////////////////////////////// 346 struct Transpose8_8_8 347 { 348 ////////////////////////////////////////////////////////////////////////// 349 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. 350 /// @param pSrc - source data in SOA form 351 /// @param pDst - output data in AOS form 352 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 353 #if ENABLE_AVX512_SIMD16 354 355 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 356 #endif 357 }; 358 359 ////////////////////////////////////////////////////////////////////////// 360 /// Transpose8_8 361 ////////////////////////////////////////////////////////////////////////// 362 struct Transpose8_8 363 { 364 ////////////////////////////////////////////////////////////////////////// 365 /// @brief Performs an SOA to AOS conversion for packed 8_8 data. 366 /// @param pSrc - source data in SOA form 367 /// @param pDst - output data in AOS form 368 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 369 { 370 #if KNOB_SIMD_WIDTH == 8 371 simdscalari src = _simd_load_si((const simdscalari*)pSrc); 372 373 __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg 374 __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg 375 rg = _mm_unpacklo_epi8(rg, g); 376 _mm_store_si128((__m128i*)pDst, rg); 377 #else 378 #error Unsupported vector width 379 #endif 380 } 381 #if ENABLE_AVX512_SIMD16 382 383 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 384 { 385 __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc)); // rrrrrrrrrrrrrrrr 386 __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg 387 388 simdscalari cvt0 = _simd_cvtepu8_epi16(src0); 389 simdscalari cvt1 = _simd_cvtepu8_epi16(src1); 390 391 simdscalari shl1 = _simd_slli_epi32(cvt1, 8); 392 393 simdscalari dst = _simd_or_si(cvt0, shl1); 394 395 _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg 396 } 397 #endif 398 }; 399 400 ////////////////////////////////////////////////////////////////////////// 401 /// Transpose32_32_32_32 402 ////////////////////////////////////////////////////////////////////////// 403 struct Transpose32_32_32_32 404 { 405 ////////////////////////////////////////////////////////////////////////// 406 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. 407 /// @param pSrc - source data in SOA form 408 /// @param pDst - output data in AOS form 409 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 410 { 411 #if KNOB_SIMD_WIDTH == 8 412 simdscalar src0 = _simd_load_ps((const float*)pSrc); 413 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 414 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 415 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); 416 417 __m128 vDst[8]; 418 vTranspose4x8(vDst, src0, src1, src2, src3); 419 _mm_store_ps((float*)pDst, vDst[0]); 420 _mm_store_ps((float*)pDst+4, vDst[1]); 421 _mm_store_ps((float*)pDst+8, vDst[2]); 422 _mm_store_ps((float*)pDst+12, vDst[3]); 423 _mm_store_ps((float*)pDst+16, vDst[4]); 424 _mm_store_ps((float*)pDst+20, vDst[5]); 425 _mm_store_ps((float*)pDst+24, vDst[6]); 426 _mm_store_ps((float*)pDst+28, vDst[7]); 427 #else 428 #error Unsupported vector width 429 #endif 430 } 431 #if ENABLE_AVX512_SIMD16 432 433 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 434 { 435 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); 436 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); 437 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32); 438 simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48); 439 440 simd16scalar dst[4]; 441 442 vTranspose4x16(dst, src0, src1, src2, src3); 443 444 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]); 445 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]); 446 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]); 447 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]); 448 } 449 #endif 450 }; 451 452 ////////////////////////////////////////////////////////////////////////// 453 /// Transpose32_32_32 454 ////////////////////////////////////////////////////////////////////////// 455 struct Transpose32_32_32 456 { 457 ////////////////////////////////////////////////////////////////////////// 458 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. 459 /// @param pSrc - source data in SOA form 460 /// @param pDst - output data in AOS form 461 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 462 { 463 #if KNOB_SIMD_WIDTH == 8 464 simdscalar src0 = _simd_load_ps((const float*)pSrc); 465 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 466 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 467 468 __m128 vDst[8]; 469 vTranspose3x8(vDst, src0, src1, src2); 470 _mm_store_ps((float*)pDst, vDst[0]); 471 _mm_store_ps((float*)pDst + 4, vDst[1]); 472 _mm_store_ps((float*)pDst + 8, vDst[2]); 473 _mm_store_ps((float*)pDst + 12, vDst[3]); 474 _mm_store_ps((float*)pDst + 16, vDst[4]); 475 _mm_store_ps((float*)pDst + 20, vDst[5]); 476 _mm_store_ps((float*)pDst + 24, vDst[6]); 477 _mm_store_ps((float*)pDst + 28, vDst[7]); 478 #else 479 #error Unsupported vector width 480 #endif 481 } 482 #if ENABLE_AVX512_SIMD16 483 484 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 485 { 486 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); 487 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); 488 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32); 489 simd16scalar src3 = _simd16_setzero_ps(); 490 491 simd16scalar dst[4]; 492 493 vTranspose4x16(dst, src0, src1, src2, src3); 494 495 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]); 496 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]); 497 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]); 498 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]); 499 } 500 #endif 501 }; 502 503 ////////////////////////////////////////////////////////////////////////// 504 /// Transpose32_32 505 ////////////////////////////////////////////////////////////////////////// 506 struct Transpose32_32 507 { 508 ////////////////////////////////////////////////////////////////////////// 509 /// @brief Performs an SOA to AOS conversion for packed 32_32 data. 510 /// @param pSrc - source data in SOA form 511 /// @param pDst - output data in AOS form 512 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 513 { 514 #if KNOB_SIMD_WIDTH == 8 515 const float* pfSrc = (const float*)pSrc; 516 __m128 src_r0 = _mm_load_ps(pfSrc + 0); 517 __m128 src_r1 = _mm_load_ps(pfSrc + 4); 518 __m128 src_g0 = _mm_load_ps(pfSrc + 8); 519 __m128 src_g1 = _mm_load_ps(pfSrc + 12); 520 521 __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); 522 __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); 523 __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); 524 __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); 525 526 float* pfDst = (float*)pDst; 527 _mm_store_ps(pfDst + 0, dst0); 528 _mm_store_ps(pfDst + 4, dst1); 529 _mm_store_ps(pfDst + 8, dst2); 530 _mm_store_ps(pfDst + 12, dst3); 531 #else 532 #error Unsupported vector width 533 #endif 534 } 535 #if ENABLE_AVX512_SIMD16 536 537 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 538 { 539 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); // rrrrrrrrrrrrrrrr 540 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); // gggggggggggggggg 541 542 simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD 543 simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF 544 545 simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 546 simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF 547 548 simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 549 simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF 550 551 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg 552 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg 553 } 554 #endif 555 }; 556 557 ////////////////////////////////////////////////////////////////////////// 558 /// Transpose16_16_16_16 559 ////////////////////////////////////////////////////////////////////////// 560 struct Transpose16_16_16_16 561 { 562 ////////////////////////////////////////////////////////////////////////// 563 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. 564 /// @param pSrc - source data in SOA form 565 /// @param pDst - output data in AOS form 566 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 567 { 568 #if KNOB_SIMD_WIDTH == 8 569 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 570 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); 571 572 __m128i src_r = _mm256_extractf128_si256(src_rg, 0); 573 __m128i src_g = _mm256_extractf128_si256(src_rg, 1); 574 __m128i src_b = _mm256_extractf128_si256(src_ba, 0); 575 __m128i src_a = _mm256_extractf128_si256(src_ba, 1); 576 577 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); 578 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); 579 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); 580 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); 581 582 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); 583 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); 584 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); 585 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); 586 587 _mm_store_si128(((__m128i*)pDst) + 0, dst0); 588 _mm_store_si128(((__m128i*)pDst) + 1, dst1); 589 _mm_store_si128(((__m128i*)pDst) + 2, dst2); 590 _mm_store_si128(((__m128i*)pDst) + 3, dst3); 591 #else 592 #error Unsupported vector width 593 #endif 594 } 595 #if ENABLE_AVX512_SIMD16 596 597 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 598 { 599 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr 600 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg 601 simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb 602 simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa 603 604 simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB 605 simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF 606 simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB 607 simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF 608 609 simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 610 simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB 611 simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD 612 simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF 613 614 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 615 simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 616 simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB 617 simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF 618 619 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba 620 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba 621 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba 622 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba 623 } 624 #endif 625 }; 626 627 ////////////////////////////////////////////////////////////////////////// 628 /// Transpose16_16_16 629 ////////////////////////////////////////////////////////////////////////// 630 struct Transpose16_16_16 631 { 632 ////////////////////////////////////////////////////////////////////////// 633 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. 634 /// @param pSrc - source data in SOA form 635 /// @param pDst - output data in AOS form 636 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 637 { 638 #if KNOB_SIMD_WIDTH == 8 639 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 640 641 __m128i src_r = _mm256_extractf128_si256(src_rg, 0); 642 __m128i src_g = _mm256_extractf128_si256(src_rg, 1); 643 __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); 644 __m128i src_a = _mm_undefined_si128(); 645 646 __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); 647 __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); 648 __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); 649 __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); 650 651 __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); 652 __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); 653 __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); 654 __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); 655 656 _mm_store_si128(((__m128i*)pDst) + 0, dst0); 657 _mm_store_si128(((__m128i*)pDst) + 1, dst1); 658 _mm_store_si128(((__m128i*)pDst) + 2, dst2); 659 _mm_store_si128(((__m128i*)pDst) + 3, dst3); 660 #else 661 #error Unsupported vector width 662 #endif 663 } 664 #if ENABLE_AVX512_SIMD16 665 666 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 667 { 668 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr 669 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg 670 simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb 671 simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa 672 673 simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB 674 simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF 675 simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB 676 simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF 677 678 simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 679 simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB 680 simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD 681 simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF 682 683 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 684 simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 685 simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB 686 simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF 687 688 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba 689 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba 690 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba 691 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba 692 } 693 #endif 694 }; 695 696 ////////////////////////////////////////////////////////////////////////// 697 /// Transpose16_16 698 ////////////////////////////////////////////////////////////////////////// 699 struct Transpose16_16 700 { 701 ////////////////////////////////////////////////////////////////////////// 702 /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 703 /// @param pSrc - source data in SOA form 704 /// @param pDst - output data in AOS form 705 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 706 { 707 #if KNOB_SIMD_WIDTH == 8 708 simdscalar src = _simd_load_ps((const float*)pSrc); 709 710 __m128 comp0 = _mm256_castps256_ps128(src); 711 __m128 comp1 = _mm256_extractf128_ps(src, 1); 712 713 __m128i comp0i = _mm_castps_si128(comp0); 714 __m128i comp1i = _mm_castps_si128(comp1); 715 716 __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); 717 __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); 718 719 _mm_store_si128((__m128i*)pDst, resLo); 720 _mm_store_si128((__m128i*)pDst + 1, resHi); 721 #else 722 #error Unsupported vector width 723 #endif 724 } 725 #if ENABLE_AVX512_SIMD16 726 727 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 728 { 729 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr 730 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg 731 732 simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB 733 simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF 734 735 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 736 simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF 737 738 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg 739 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg 740 } 741 #endif 742 }; 743 744 ////////////////////////////////////////////////////////////////////////// 745 /// Transpose24_8 746 ////////////////////////////////////////////////////////////////////////// 747 struct Transpose24_8 748 { 749 ////////////////////////////////////////////////////////////////////////// 750 /// @brief Performs an SOA to AOS conversion for packed 24_8 data. 751 /// @param pSrc - source data in SOA form 752 /// @param pDst - output data in AOS form 753 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 754 #if ENABLE_AVX512_SIMD16 755 756 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 757 #endif 758 }; 759 760 ////////////////////////////////////////////////////////////////////////// 761 /// Transpose32_8_24 762 ////////////////////////////////////////////////////////////////////////// 763 struct Transpose32_8_24 764 { 765 ////////////////////////////////////////////////////////////////////////// 766 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. 767 /// @param pSrc - source data in SOA form 768 /// @param pDst - output data in AOS form 769 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 770 #if ENABLE_AVX512_SIMD16 771 772 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 773 #endif 774 }; 775 776 ////////////////////////////////////////////////////////////////////////// 777 /// Transpose4_4_4_4 778 ////////////////////////////////////////////////////////////////////////// 779 struct Transpose4_4_4_4 780 { 781 ////////////////////////////////////////////////////////////////////////// 782 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. 783 /// @param pSrc - source data in SOA form 784 /// @param pDst - output data in AOS form 785 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 786 #if ENABLE_AVX512_SIMD16 787 788 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 789 #endif 790 }; 791 792 ////////////////////////////////////////////////////////////////////////// 793 /// Transpose5_6_5 794 ////////////////////////////////////////////////////////////////////////// 795 struct Transpose5_6_5 796 { 797 ////////////////////////////////////////////////////////////////////////// 798 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. 799 /// @param pSrc - source data in SOA form 800 /// @param pDst - output data in AOS form 801 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 802 #if ENABLE_AVX512_SIMD16 803 804 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 805 #endif 806 }; 807 808 ////////////////////////////////////////////////////////////////////////// 809 /// Transpose9_9_9_5 810 ////////////////////////////////////////////////////////////////////////// 811 struct Transpose9_9_9_5 812 { 813 ////////////////////////////////////////////////////////////////////////// 814 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. 815 /// @param pSrc - source data in SOA form 816 /// @param pDst - output data in AOS form 817 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 818 #if ENABLE_AVX512_SIMD16 819 820 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 821 #endif 822 }; 823 824 ////////////////////////////////////////////////////////////////////////// 825 /// Transpose5_5_5_1 826 ////////////////////////////////////////////////////////////////////////// 827 struct Transpose5_5_5_1 828 { 829 ////////////////////////////////////////////////////////////////////////// 830 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 831 /// @param pSrc - source data in SOA form 832 /// @param pDst - output data in AOS form 833 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 834 #if ENABLE_AVX512_SIMD16 835 836 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 837 #endif 838 }; 839 840 ////////////////////////////////////////////////////////////////////////// 841 /// Transpose1_5_5_5 842 ////////////////////////////////////////////////////////////////////////// 843 struct Transpose1_5_5_5 844 { 845 ////////////////////////////////////////////////////////////////////////// 846 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 847 /// @param pSrc - source data in SOA form 848 /// @param pDst - output data in AOS form 849 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 850 }; 851 852 ////////////////////////////////////////////////////////////////////////// 853 /// Transpose10_10_10_2 854 ////////////////////////////////////////////////////////////////////////// 855 struct Transpose10_10_10_2 856 { 857 ////////////////////////////////////////////////////////////////////////// 858 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. 859 /// @param pSrc - source data in SOA form 860 /// @param pDst - output data in AOS form 861 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 862 #if ENABLE_AVX512_SIMD16 863 864 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 865 #endif 866 }; 867 868 ////////////////////////////////////////////////////////////////////////// 869 /// Transpose11_11_10 870 ////////////////////////////////////////////////////////////////////////// 871 struct Transpose11_11_10 872 { 873 ////////////////////////////////////////////////////////////////////////// 874 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 875 /// @param pSrc - source data in SOA form 876 /// @param pDst - output data in AOS form 877 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 878 #if ENABLE_AVX512_SIMD16 879 880 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 881 #endif 882 }; 883 884 ////////////////////////////////////////////////////////////////////////// 885 /// Transpose64 886 ////////////////////////////////////////////////////////////////////////// 887 struct Transpose64 888 { 889 ////////////////////////////////////////////////////////////////////////// 890 /// @brief Performs an SOA to AOS conversion 891 /// @param pSrc - source data in SOA form 892 /// @param pDst - output data in AOS form 893 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 894 #if ENABLE_AVX512_SIMD16 895 896 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 897 #endif 898 }; 899 900 ////////////////////////////////////////////////////////////////////////// 901 /// Transpose64_64 902 ////////////////////////////////////////////////////////////////////////// 903 struct Transpose64_64 904 { 905 ////////////////////////////////////////////////////////////////////////// 906 /// @brief Performs an SOA to AOS conversion 907 /// @param pSrc - source data in SOA form 908 /// @param pDst - output data in AOS form 909 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 910 #if ENABLE_AVX512_SIMD16 911 912 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 913 #endif 914 }; 915 916 ////////////////////////////////////////////////////////////////////////// 917 /// Transpose64_64_64 918 ////////////////////////////////////////////////////////////////////////// 919 struct Transpose64_64_64 920 { 921 ////////////////////////////////////////////////////////////////////////// 922 /// @brief Performs an SOA to AOS conversion 923 /// @param pSrc - source data in SOA form 924 /// @param pDst - output data in AOS form 925 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 926 #if ENABLE_AVX512_SIMD16 927 928 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 929 #endif 930 }; 931 932 ////////////////////////////////////////////////////////////////////////// 933 /// Transpose64_64_64_64 934 ////////////////////////////////////////////////////////////////////////// 935 struct Transpose64_64_64_64 936 { 937 ////////////////////////////////////////////////////////////////////////// 938 /// @brief Performs an SOA to AOS conversion 939 /// @param pSrc - source data in SOA form 940 /// @param pDst - output data in AOS form 941 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 942 #if ENABLE_AVX512_SIMD16 943 944 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 945 #endif 946 }; 947 948 // helper function to unroll loops 949 template<int Begin, int End, int Step = 1> 950 struct UnrollerL { 951 template<typename Lambda> 952 INLINE static void step(Lambda& func) { 953 func(Begin); 954 UnrollerL<Begin + Step, End, Step>::step(func); 955 } 956 }; 957 958 template<int End, int Step> 959 struct UnrollerL<End, End, Step> { 960 template<typename Lambda> 961 static void step(Lambda& func) { 962 } 963 }; 964 965 // helper function to unroll loops, with mask to skip specific iterations 966 template<int Begin, int End, int Step = 1, int Mask = 0x7f> 967 struct UnrollerLMask { 968 template<typename Lambda> 969 INLINE static void step(Lambda& func) { 970 if(Mask & (1 << Begin)) 971 { 972 func(Begin); 973 } 974 UnrollerL<Begin + Step, End, Step>::step(func); 975 } 976 }; 977 978 template<int End, int Step, int Mask> 979 struct UnrollerLMask<End, End, Step, Mask> { 980 template<typename Lambda> 981 static void step(Lambda& func) { 982 } 983 }; 984 985 // general CRC compute 986 INLINE 987 uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) 988 { 989 #if defined(_WIN64) || defined(__x86_64__) 990 uint32_t sizeInQwords = size / sizeof(uint64_t); 991 uint32_t sizeRemainderBytes = size % sizeof(uint64_t); 992 uint64_t* pDataWords = (uint64_t*)pData; 993 for (uint32_t i = 0; i < sizeInQwords; ++i) 994 { 995 crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); 996 } 997 #else 998 uint32_t sizeInDwords = size / sizeof(uint32_t); 999 uint32_t sizeRemainderBytes = size % sizeof(uint32_t); 1000 uint32_t* pDataWords = (uint32_t*)pData; 1001 for (uint32_t i = 0; i < sizeInDwords; ++i) 1002 { 1003 crc = _mm_crc32_u32(crc, *pDataWords++); 1004 } 1005 #endif 1006 1007 uint8_t* pRemainderBytes = (uint8_t*)pDataWords; 1008 for (uint32_t i = 0; i < sizeRemainderBytes; ++i) 1009 { 1010 crc = _mm_crc32_u8(crc, *pRemainderBytes++); 1011 } 1012 1013 return crc; 1014 } 1015 1016 ////////////////////////////////////////////////////////////////////////// 1017 /// Add byte offset to any-type pointer 1018 ////////////////////////////////////////////////////////////////////////// 1019 template <typename T> 1020 INLINE 1021 static T* PtrAdd(T* p, intptr_t offset) 1022 { 1023 intptr_t intp = reinterpret_cast<intptr_t>(p); 1024 return reinterpret_cast<T*>(intp + offset); 1025 } 1026 1027 ////////////////////////////////////////////////////////////////////////// 1028 /// Is a power-of-2? 1029 ////////////////////////////////////////////////////////////////////////// 1030 template <typename T> 1031 INLINE 1032 static bool IsPow2(T value) 1033 { 1034 return value == (value & (0 - value)); 1035 } 1036 1037 ////////////////////////////////////////////////////////////////////////// 1038 /// Align down to specified alignment 1039 /// Note: IsPow2(alignment) MUST be true 1040 ////////////////////////////////////////////////////////////////////////// 1041 template <typename T1, typename T2> 1042 INLINE 1043 static T1 AlignDownPow2(T1 value, T2 alignment) 1044 { 1045 SWR_ASSERT(IsPow2(alignment)); 1046 return value & ~T1(alignment - 1); 1047 } 1048 1049 ////////////////////////////////////////////////////////////////////////// 1050 /// Align up to specified alignment 1051 /// Note: IsPow2(alignment) MUST be true 1052 ////////////////////////////////////////////////////////////////////////// 1053 template <typename T1, typename T2> 1054 INLINE 1055 static T1 AlignUpPow2(T1 value, T2 alignment) 1056 { 1057 return AlignDownPow2(value + T1(alignment - 1), alignment); 1058 } 1059 1060 ////////////////////////////////////////////////////////////////////////// 1061 /// Align up ptr to specified alignment 1062 /// Note: IsPow2(alignment) MUST be true 1063 ////////////////////////////////////////////////////////////////////////// 1064 template <typename T1, typename T2> 1065 INLINE 1066 static T1* AlignUpPow2(T1* value, T2 alignment) 1067 { 1068 return reinterpret_cast<T1*>( 1069 AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment)); 1070 } 1071 1072 ////////////////////////////////////////////////////////////////////////// 1073 /// Align down to specified alignment 1074 ////////////////////////////////////////////////////////////////////////// 1075 template <typename T1, typename T2> 1076 INLINE 1077 static T1 AlignDown(T1 value, T2 alignment) 1078 { 1079 if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } 1080 return value - T1(value % alignment); 1081 } 1082 1083 ////////////////////////////////////////////////////////////////////////// 1084 /// Align down to specified alignment 1085 ////////////////////////////////////////////////////////////////////////// 1086 template <typename T1, typename T2> 1087 INLINE 1088 static T1* AlignDown(T1* value, T2 alignment) 1089 { 1090 return (T1*)AlignDown(uintptr_t(value), alignment); 1091 } 1092 1093 ////////////////////////////////////////////////////////////////////////// 1094 /// Align up to specified alignment 1095 /// Note: IsPow2(alignment) MUST be true 1096 ////////////////////////////////////////////////////////////////////////// 1097 template <typename T1, typename T2> 1098 INLINE 1099 static T1 AlignUp(T1 value, T2 alignment) 1100 { 1101 return AlignDown(value + T1(alignment - 1), alignment); 1102 } 1103 1104 ////////////////////////////////////////////////////////////////////////// 1105 /// Align up to specified alignment 1106 /// Note: IsPow2(alignment) MUST be true 1107 ////////////////////////////////////////////////////////////////////////// 1108 template <typename T1, typename T2> 1109 INLINE 1110 static T1* AlignUp(T1* value, T2 alignment) 1111 { 1112 return AlignDown(PtrAdd(value, alignment - 1), alignment); 1113 } 1114 1115 ////////////////////////////////////////////////////////////////////////// 1116 /// Helper structure used to access an array of elements that don't 1117 /// correspond to a typical word size. 1118 ////////////////////////////////////////////////////////////////////////// 1119 template<typename T, size_t BitsPerElementT, size_t ArrayLenT> 1120 class BitsArray 1121 { 1122 private: 1123 static const size_t BITS_PER_WORD = sizeof(size_t) * 8; 1124 static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT; 1125 static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD; 1126 static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1; 1127 1128 static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD, 1129 "Element size must an integral fraction of pointer size"); 1130 1131 size_t m_words[NUM_WORDS] = {}; 1132 1133 public: 1134 1135 T operator[] (size_t elementIndex) const 1136 { 1137 size_t word = m_words[elementIndex / ELEMENTS_PER_WORD]; 1138 word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT); 1139 return T(word & ELEMENT_MASK); 1140 } 1141 }; 1142 1143 // Ranged integer argument for TemplateArgUnroller 1144 template <uint32_t TMin, uint32_t TMax> 1145 struct IntArg 1146 { 1147 uint32_t val; 1148 }; 1149 1150 // Recursive template used to auto-nest conditionals. Converts dynamic boolean function 1151 // arguments to static template arguments. 1152 template <typename TermT, typename... ArgsB> 1153 struct TemplateArgUnroller 1154 { 1155 //----------------------------------------- 1156 // Boolean value 1157 //----------------------------------------- 1158 1159 // Last Arg Terminator 1160 static typename TermT::FuncType GetFunc(bool bArg) 1161 { 1162 if (bArg) 1163 { 1164 return TermT::template GetFunc<ArgsB..., std::true_type>(); 1165 } 1166 1167 return TermT::template GetFunc<ArgsB..., std::false_type>(); 1168 } 1169 1170 // Recursively parse args 1171 template <typename... TArgsT> 1172 static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs) 1173 { 1174 if (bArg) 1175 { 1176 return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...); 1177 } 1178 1179 return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...); 1180 } 1181 1182 //----------------------------------------- 1183 // Integer value (within specified range) 1184 //----------------------------------------- 1185 1186 // Last Arg Terminator 1187 template <uint32_t TMin, uint32_t TMax> 1188 static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg) 1189 { 1190 if (iArg.val == TMax) 1191 { 1192 return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>(); 1193 } 1194 if (TMax > TMin) 1195 { 1196 return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val}); 1197 } 1198 SWR_ASSUME(false); return nullptr; 1199 } 1200 template <uint32_t TVal> 1201 static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg) 1202 { 1203 SWR_ASSERT(iArg.val == TVal); 1204 return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>(); 1205 } 1206 1207 // Recursively parse args 1208 template <uint32_t TMin, uint32_t TMax, typename... TArgsT> 1209 static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs) 1210 { 1211 if (iArg.val == TMax) 1212 { 1213 return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...); 1214 } 1215 if (TMax > TMin) 1216 { 1217 return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...); 1218 } 1219 SWR_ASSUME(false); return nullptr; 1220 } 1221 template <uint32_t TVal, typename... TArgsT> 1222 static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs) 1223 { 1224 SWR_ASSERT(iArg.val == TVal); 1225 return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...); 1226 } 1227 }; 1228 1229 1230