1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * @file utils.h 24 * 25 * @brief Utilities used by SWR core related to pixel formats. 26 * 27 ******************************************************************************/ 28 #pragma once 29 30 #include "core/utils.h" 31 #include "common/simdintrin.h" 32 33 INLINE 34 void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3) 35 { 36 simd4scalari row0i = SIMD128::castps_si(row0); 37 simd4scalari row1i = SIMD128::castps_si(row1); 38 simd4scalari row2i = SIMD128::castps_si(row2); 39 simd4scalari row3i = SIMD128::castps_si(row3); 40 41 simd4scalari vTemp = row2i; 42 row2i = SIMD128::unpacklo_epi32(row2i, row3i); 43 vTemp = SIMD128::unpackhi_epi32(vTemp, row3i); 44 45 row3i = row0i; 46 row0i = SIMD128::unpacklo_epi32(row0i, row1i); 47 row3i = SIMD128::unpackhi_epi32(row3i, row1i); 48 49 row1i = row0i; 50 row0i = SIMD128::unpacklo_epi64(row0i, row2i); 51 row1i = SIMD128::unpackhi_epi64(row1i, row2i); 52 53 row2i = row3i; 54 row2i = SIMD128::unpacklo_epi64(row2i, vTemp); 55 row3i = SIMD128::unpackhi_epi64(row3i, vTemp); 56 57 row0 = SIMD128::castsi_ps(row0i); 58 row1 = SIMD128::castsi_ps(row1i); 59 row2 = SIMD128::castsi_ps(row2i); 60 row3 = SIMD128::castsi_ps(row3i); 61 } 62 63 INLINE 64 void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3) 65 { 66 simd4scalari vTemp = row2; 67 row2 = SIMD128::unpacklo_epi32(row2, row3); 68 vTemp = SIMD128::unpackhi_epi32(vTemp, row3); 69 70 row3 = row0; 71 row0 = SIMD128::unpacklo_epi32(row0, row1); 72 row3 = SIMD128::unpackhi_epi32(row3, row1); 73 74 row1 = row0; 75 row0 = SIMD128::unpacklo_epi64(row0, row2); 76 row1 = SIMD128::unpackhi_epi64(row1, row2); 77 78 row2 = row3; 79 row2 = SIMD128::unpacklo_epi64(row2, vTemp); 80 row3 = SIMD128::unpackhi_epi64(row3, vTemp); 81 } 82 83 #if KNOB_SIMD_WIDTH == 8 84 INLINE 85 void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2) 86 { 87 simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 88 simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps()); //y0w0y1w1 y4w4y5w5 89 simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 90 simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 91 92 r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 93 r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps()); //y2w2y3w3 y6w6yw77 94 simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 95 simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 96 97 vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0); 98 vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0); 99 vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0); 100 vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0); 101 102 vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1); 103 vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1); 104 vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1); 105 vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1); 106 } 107 108 INLINE 109 void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3) 110 { 111 simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 112 simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 113 simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx); //x0y0z0w0 x4y4z4w4 114 simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx); //x1y1z1w1 x5y5z5w5 115 116 r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2); //x2z2x3z3 x6z6x7z7 117 r1rx = _simd_unpackhi_ps(vSrc1, vSrc3); //y2w2y3w3 y6w6yw77 118 simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx); //x2y2z2w2 x6y6z6w6 119 simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx); //x3y3z3w3 x7y7z7w7 120 121 vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0); 122 vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0); 123 vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0); 124 vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0); 125 126 vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1); 127 vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1); 128 vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1); 129 vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1); 130 } 131 132 #if ENABLE_AVX512_SIMD16 133 INLINE 134 void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3) 135 { 136 const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking 137 138 simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r 139 simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g 140 simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b 141 simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a 142 143 simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2); 144 simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3); 145 simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2); 146 simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3); 147 148 dst[0] = _simd16_unpacklo_ps(rblo, galo); 149 dst[1] = _simd16_unpackhi_ps(rblo, galo); 150 dst[2] = _simd16_unpacklo_ps(rbhi, gahi); 151 dst[3] = _simd16_unpackhi_ps(rbhi, gahi); 152 } 153 154 #endif 155 INLINE 156 void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7) 157 { 158 simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1); 159 simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1); 160 simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3); 161 simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3); 162 simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5); 163 simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5); 164 simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7); 165 simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7); 166 simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0)); 167 simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2)); 168 simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0)); 169 simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2)); 170 simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0)); 171 simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2)); 172 simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0)); 173 simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2)); 174 vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20); 175 vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20); 176 vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20); 177 vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20); 178 vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31); 179 vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31); 180 vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31); 181 vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31); 182 } 183 184 INLINE 185 void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7) 186 { 187 vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3), 188 _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7)); 189 } 190 #endif 191 192 ////////////////////////////////////////////////////////////////////////// 193 /// TranposeSingleComponent 194 ////////////////////////////////////////////////////////////////////////// 195 template<uint32_t bpp> 196 struct TransposeSingleComponent 197 { 198 ////////////////////////////////////////////////////////////////////////// 199 /// @brief Pass-thru for single component. 200 /// @param pSrc - source data in SOA form 201 /// @param pDst - output data in AOS form 202 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 203 { 204 memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); 205 } 206 #if ENABLE_AVX512_SIMD16 207 208 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 209 { 210 memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8); 211 } 212 #endif 213 }; 214 215 ////////////////////////////////////////////////////////////////////////// 216 /// Transpose8_8_8_8 217 ////////////////////////////////////////////////////////////////////////// 218 struct Transpose8_8_8_8 219 { 220 ////////////////////////////////////////////////////////////////////////// 221 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 222 /// @param pSrc - source data in SOA form 223 /// @param pDst - output data in AOS form 224 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 225 { 226 simdscalari src = _simd_load_si((const simdscalari*)pSrc); 227 228 #if KNOB_SIMD_WIDTH == 8 229 #if KNOB_ARCH <= KNOB_ARCH_AVX 230 simd4scalari c0c1 = src.v4[0]; // rrrrrrrrgggggggg 231 simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1)); // bbbbbbbbaaaaaaaa 232 simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3); // rrrrrrrrbbbbbbbb 233 simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3); // ggggggggaaaaaaaa 234 simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3); // rgrgrgrgrgrgrgrg 235 simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3); // babababababababa 236 simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23); // rgbargbargbargba 237 simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23); // rgbargbargbargba 238 SIMD128::store_si((simd4scalari*)pDst, c0123lo); 239 SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi); 240 #else 241 simdscalari dst01 = _simd_shuffle_epi8(src, 242 _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800)); 243 simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01); 244 dst23 = _simd_shuffle_epi8(dst23, 245 _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080)); 246 simdscalari dst = _simd_or_si(dst01, dst23); 247 _simd_store_si((simdscalari*)pDst, dst); 248 #endif 249 #else 250 #error Unsupported vector width 251 #endif 252 } 253 #if ENABLE_AVX512_SIMD16 254 255 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 256 { 257 simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr 258 simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg 259 simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb 260 simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa 261 262 simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); 263 simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); 264 simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); 265 simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3); 266 267 simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); 268 simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); 269 simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); 270 271 simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); 272 273 _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba 274 } 275 #endif 276 }; 277 278 ////////////////////////////////////////////////////////////////////////// 279 /// Transpose8_8_8 280 ////////////////////////////////////////////////////////////////////////// 281 struct Transpose8_8_8 282 { 283 ////////////////////////////////////////////////////////////////////////// 284 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. 285 /// @param pSrc - source data in SOA form 286 /// @param pDst - output data in AOS form 287 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 288 #if ENABLE_AVX512_SIMD16 289 290 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 291 #endif 292 }; 293 294 ////////////////////////////////////////////////////////////////////////// 295 /// Transpose8_8 296 ////////////////////////////////////////////////////////////////////////// 297 struct Transpose8_8 298 { 299 ////////////////////////////////////////////////////////////////////////// 300 /// @brief Performs an SOA to AOS conversion for packed 8_8 data. 301 /// @param pSrc - source data in SOA form 302 /// @param pDst - output data in AOS form 303 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 304 { 305 #if KNOB_SIMD_WIDTH == 8 306 simdscalari src = _simd_load_si((const simdscalari*)pSrc); 307 308 simd4scalari rg = src.v4[0]; // rrrrrrrr gggggggg 309 simd4scalari g = SIMD128::unpackhi_epi64(rg, rg); // gggggggg gggggggg 310 rg = SIMD128::unpacklo_epi8(rg, g); 311 SIMD128::store_si((simd4scalari*)pDst, rg); 312 #else 313 #error Unsupported vector width 314 #endif 315 } 316 #if ENABLE_AVX512_SIMD16 317 318 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 319 { 320 simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc)); // rrrrrrrrrrrrrrrr 321 simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg 322 323 simdscalari cvt0 = _simd_cvtepu8_epi16(src0); 324 simdscalari cvt1 = _simd_cvtepu8_epi16(src1); 325 326 simdscalari shl1 = _simd_slli_epi32(cvt1, 8); 327 328 simdscalari dst = _simd_or_si(cvt0, shl1); 329 330 _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg 331 } 332 #endif 333 }; 334 335 ////////////////////////////////////////////////////////////////////////// 336 /// Transpose32_32_32_32 337 ////////////////////////////////////////////////////////////////////////// 338 struct Transpose32_32_32_32 339 { 340 ////////////////////////////////////////////////////////////////////////// 341 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. 342 /// @param pSrc - source data in SOA form 343 /// @param pDst - output data in AOS form 344 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 345 { 346 #if KNOB_SIMD_WIDTH == 8 347 simdscalar src0 = _simd_load_ps((const float*)pSrc); 348 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 349 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 350 simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); 351 352 simd4scalar vDst[8]; 353 vTranspose4x8(vDst, src0, src1, src2, src3); 354 SIMD128::store_ps((float*)pDst, vDst[0]); 355 SIMD128::store_ps((float*)pDst+4, vDst[1]); 356 SIMD128::store_ps((float*)pDst+8, vDst[2]); 357 SIMD128::store_ps((float*)pDst+12, vDst[3]); 358 SIMD128::store_ps((float*)pDst+16, vDst[4]); 359 SIMD128::store_ps((float*)pDst+20, vDst[5]); 360 SIMD128::store_ps((float*)pDst+24, vDst[6]); 361 SIMD128::store_ps((float*)pDst+28, vDst[7]); 362 #else 363 #error Unsupported vector width 364 #endif 365 } 366 #if ENABLE_AVX512_SIMD16 367 368 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 369 { 370 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); 371 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); 372 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32); 373 simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48); 374 375 simd16scalar dst[4]; 376 377 vTranspose4x16(dst, src0, src1, src2, src3); 378 379 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]); 380 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]); 381 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]); 382 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]); 383 } 384 #endif 385 }; 386 387 ////////////////////////////////////////////////////////////////////////// 388 /// Transpose32_32_32 389 ////////////////////////////////////////////////////////////////////////// 390 struct Transpose32_32_32 391 { 392 ////////////////////////////////////////////////////////////////////////// 393 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. 394 /// @param pSrc - source data in SOA form 395 /// @param pDst - output data in AOS form 396 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 397 { 398 #if KNOB_SIMD_WIDTH == 8 399 simdscalar src0 = _simd_load_ps((const float*)pSrc); 400 simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); 401 simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); 402 403 simd4scalar vDst[8]; 404 vTranspose3x8(vDst, src0, src1, src2); 405 SIMD128::store_ps((float*)pDst, vDst[0]); 406 SIMD128::store_ps((float*)pDst + 4, vDst[1]); 407 SIMD128::store_ps((float*)pDst + 8, vDst[2]); 408 SIMD128::store_ps((float*)pDst + 12, vDst[3]); 409 SIMD128::store_ps((float*)pDst + 16, vDst[4]); 410 SIMD128::store_ps((float*)pDst + 20, vDst[5]); 411 SIMD128::store_ps((float*)pDst + 24, vDst[6]); 412 SIMD128::store_ps((float*)pDst + 28, vDst[7]); 413 #else 414 #error Unsupported vector width 415 #endif 416 } 417 #if ENABLE_AVX512_SIMD16 418 419 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 420 { 421 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); 422 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); 423 simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32); 424 simd16scalar src3 = _simd16_setzero_ps(); 425 426 simd16scalar dst[4]; 427 428 vTranspose4x16(dst, src0, src1, src2, src3); 429 430 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst[0]); 431 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]); 432 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]); 433 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]); 434 } 435 #endif 436 }; 437 438 ////////////////////////////////////////////////////////////////////////// 439 /// Transpose32_32 440 ////////////////////////////////////////////////////////////////////////// 441 struct Transpose32_32 442 { 443 ////////////////////////////////////////////////////////////////////////// 444 /// @brief Performs an SOA to AOS conversion for packed 32_32 data. 445 /// @param pSrc - source data in SOA form 446 /// @param pDst - output data in AOS form 447 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 448 { 449 #if KNOB_SIMD_WIDTH == 8 450 const float* pfSrc = (const float*)pSrc; 451 simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0); 452 simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4); 453 simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8); 454 simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12); 455 456 simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0); 457 simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0); 458 simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1); 459 simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1); 460 461 float* pfDst = (float*)pDst; 462 SIMD128::store_ps(pfDst + 0, dst0); 463 SIMD128::store_ps(pfDst + 4, dst1); 464 SIMD128::store_ps(pfDst + 8, dst2); 465 SIMD128::store_ps(pfDst + 12, dst3); 466 #else 467 #error Unsupported vector width 468 #endif 469 } 470 #if ENABLE_AVX512_SIMD16 471 472 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 473 { 474 simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); // rrrrrrrrrrrrrrrr 475 simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16); // gggggggggggggggg 476 477 simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD 478 simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF 479 480 simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 481 simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF 482 483 simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 484 simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF 485 486 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg 487 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg 488 } 489 #endif 490 }; 491 492 ////////////////////////////////////////////////////////////////////////// 493 /// Transpose16_16_16_16 494 ////////////////////////////////////////////////////////////////////////// 495 struct Transpose16_16_16_16 496 { 497 ////////////////////////////////////////////////////////////////////////// 498 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. 499 /// @param pSrc - source data in SOA form 500 /// @param pDst - output data in AOS form 501 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 502 { 503 #if KNOB_SIMD_WIDTH == 8 504 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 505 simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); 506 507 simd4scalari src_r = _simd_extractf128_si(src_rg, 0); 508 simd4scalari src_g = _simd_extractf128_si(src_rg, 1); 509 simd4scalari src_b = _simd_extractf128_si(src_ba, 0); 510 simd4scalari src_a = _simd_extractf128_si(src_ba, 1); 511 512 simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g); 513 simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g); 514 simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a); 515 simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a); 516 517 simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0); 518 simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0); 519 simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1); 520 simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1); 521 522 SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0); 523 SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1); 524 SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2); 525 SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3); 526 #else 527 #error Unsupported vector width 528 #endif 529 } 530 #if ENABLE_AVX512_SIMD16 531 532 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 533 { 534 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr 535 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg 536 simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb 537 simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa 538 539 simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB 540 simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF 541 simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB 542 simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF 543 544 simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 545 simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB 546 simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD 547 simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF 548 549 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 550 simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 551 simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB 552 simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF 553 554 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba 555 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba 556 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba 557 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba 558 } 559 #endif 560 }; 561 562 ////////////////////////////////////////////////////////////////////////// 563 /// Transpose16_16_16 564 ////////////////////////////////////////////////////////////////////////// 565 struct Transpose16_16_16 566 { 567 ////////////////////////////////////////////////////////////////////////// 568 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. 569 /// @param pSrc - source data in SOA form 570 /// @param pDst - output data in AOS form 571 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 572 { 573 #if KNOB_SIMD_WIDTH == 8 574 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); 575 576 simd4scalari src_r = _simd_extractf128_si(src_rg, 0); 577 simd4scalari src_g = _simd_extractf128_si(src_rg, 1); 578 simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari))); 579 simd4scalari src_a = SIMD128::setzero_si(); 580 581 simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g); 582 simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g); 583 simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a); 584 simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a); 585 586 simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0); 587 simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0); 588 simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1); 589 simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1); 590 591 SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0); 592 SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1); 593 SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2); 594 SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3); 595 #else 596 #error Unsupported vector width 597 #endif 598 } 599 #if ENABLE_AVX512_SIMD16 600 601 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 602 { 603 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr 604 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg 605 simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb 606 simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa 607 608 simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB 609 simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF 610 simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB 611 simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF 612 613 simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 614 simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB 615 simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD 616 simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF 617 618 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 619 simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 620 simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB 621 simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF 622 623 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgbargbargbargba 624 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgbargbargbargba 625 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); // rgbargbargbargba 626 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); // rgbargbargbargba 627 } 628 #endif 629 }; 630 631 ////////////////////////////////////////////////////////////////////////// 632 /// Transpose16_16 633 ////////////////////////////////////////////////////////////////////////// 634 struct Transpose16_16 635 { 636 ////////////////////////////////////////////////////////////////////////// 637 /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 638 /// @param pSrc - source data in SOA form 639 /// @param pDst - output data in AOS form 640 INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) 641 { 642 #if KNOB_SIMD_WIDTH == 8 643 simdscalar src = _simd_load_ps((const float*)pSrc); 644 645 simd4scalar comp0 = _simd_extractf128_ps(src, 0); 646 simd4scalar comp1 = _simd_extractf128_ps(src, 1); 647 648 simd4scalari comp0i = SIMD128::castps_si(comp0); 649 simd4scalari comp1i = SIMD128::castps_si(comp1); 650 651 simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i); 652 simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i); 653 654 SIMD128::store_si((simd4scalari*)pDst, resLo); 655 SIMD128::store_si((simd4scalari*)pDst + 1, resHi); 656 #else 657 #error Unsupported vector width 658 #endif 659 } 660 #if ENABLE_AVX512_SIMD16 661 662 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) 663 { 664 simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrr 665 simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1); // gggggggggggggggg 666 667 simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB 668 simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF 669 670 simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 671 simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF 672 673 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg 674 _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg 675 } 676 #endif 677 }; 678 679 ////////////////////////////////////////////////////////////////////////// 680 /// Transpose24_8 681 ////////////////////////////////////////////////////////////////////////// 682 struct Transpose24_8 683 { 684 ////////////////////////////////////////////////////////////////////////// 685 /// @brief Performs an SOA to AOS conversion for packed 24_8 data. 686 /// @param pSrc - source data in SOA form 687 /// @param pDst - output data in AOS form 688 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 689 #if ENABLE_AVX512_SIMD16 690 691 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 692 #endif 693 }; 694 695 ////////////////////////////////////////////////////////////////////////// 696 /// Transpose32_8_24 697 ////////////////////////////////////////////////////////////////////////// 698 struct Transpose32_8_24 699 { 700 ////////////////////////////////////////////////////////////////////////// 701 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. 702 /// @param pSrc - source data in SOA form 703 /// @param pDst - output data in AOS form 704 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 705 #if ENABLE_AVX512_SIMD16 706 707 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 708 #endif 709 }; 710 711 ////////////////////////////////////////////////////////////////////////// 712 /// Transpose4_4_4_4 713 ////////////////////////////////////////////////////////////////////////// 714 struct Transpose4_4_4_4 715 { 716 ////////////////////////////////////////////////////////////////////////// 717 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. 718 /// @param pSrc - source data in SOA form 719 /// @param pDst - output data in AOS form 720 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 721 #if ENABLE_AVX512_SIMD16 722 723 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 724 #endif 725 }; 726 727 ////////////////////////////////////////////////////////////////////////// 728 /// Transpose5_6_5 729 ////////////////////////////////////////////////////////////////////////// 730 struct Transpose5_6_5 731 { 732 ////////////////////////////////////////////////////////////////////////// 733 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. 734 /// @param pSrc - source data in SOA form 735 /// @param pDst - output data in AOS form 736 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 737 #if ENABLE_AVX512_SIMD16 738 739 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 740 #endif 741 }; 742 743 ////////////////////////////////////////////////////////////////////////// 744 /// Transpose9_9_9_5 745 ////////////////////////////////////////////////////////////////////////// 746 struct Transpose9_9_9_5 747 { 748 ////////////////////////////////////////////////////////////////////////// 749 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. 750 /// @param pSrc - source data in SOA form 751 /// @param pDst - output data in AOS form 752 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 753 #if ENABLE_AVX512_SIMD16 754 755 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 756 #endif 757 }; 758 759 ////////////////////////////////////////////////////////////////////////// 760 /// Transpose5_5_5_1 761 ////////////////////////////////////////////////////////////////////////// 762 struct Transpose5_5_5_1 763 { 764 ////////////////////////////////////////////////////////////////////////// 765 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 766 /// @param pSrc - source data in SOA form 767 /// @param pDst - output data in AOS form 768 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 769 #if ENABLE_AVX512_SIMD16 770 771 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 772 #endif 773 }; 774 775 ////////////////////////////////////////////////////////////////////////// 776 /// Transpose1_5_5_5 777 ////////////////////////////////////////////////////////////////////////// 778 struct Transpose1_5_5_5 779 { 780 ////////////////////////////////////////////////////////////////////////// 781 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. 782 /// @param pSrc - source data in SOA form 783 /// @param pDst - output data in AOS form 784 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 785 }; 786 787 ////////////////////////////////////////////////////////////////////////// 788 /// Transpose10_10_10_2 789 ////////////////////////////////////////////////////////////////////////// 790 struct Transpose10_10_10_2 791 { 792 ////////////////////////////////////////////////////////////////////////// 793 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. 794 /// @param pSrc - source data in SOA form 795 /// @param pDst - output data in AOS form 796 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 797 #if ENABLE_AVX512_SIMD16 798 799 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 800 #endif 801 }; 802 803 ////////////////////////////////////////////////////////////////////////// 804 /// Transpose11_11_10 805 ////////////////////////////////////////////////////////////////////////// 806 struct Transpose11_11_10 807 { 808 ////////////////////////////////////////////////////////////////////////// 809 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 810 /// @param pSrc - source data in SOA form 811 /// @param pDst - output data in AOS form 812 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 813 #if ENABLE_AVX512_SIMD16 814 815 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 816 #endif 817 }; 818 819 ////////////////////////////////////////////////////////////////////////// 820 /// Transpose64 821 ////////////////////////////////////////////////////////////////////////// 822 struct Transpose64 823 { 824 ////////////////////////////////////////////////////////////////////////// 825 /// @brief Performs an SOA to AOS conversion 826 /// @param pSrc - source data in SOA form 827 /// @param pDst - output data in AOS form 828 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 829 #if ENABLE_AVX512_SIMD16 830 831 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 832 #endif 833 }; 834 835 ////////////////////////////////////////////////////////////////////////// 836 /// Transpose64_64 837 ////////////////////////////////////////////////////////////////////////// 838 struct Transpose64_64 839 { 840 ////////////////////////////////////////////////////////////////////////// 841 /// @brief Performs an SOA to AOS conversion 842 /// @param pSrc - source data in SOA form 843 /// @param pDst - output data in AOS form 844 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 845 #if ENABLE_AVX512_SIMD16 846 847 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 848 #endif 849 }; 850 851 ////////////////////////////////////////////////////////////////////////// 852 /// Transpose64_64_64 853 ////////////////////////////////////////////////////////////////////////// 854 struct Transpose64_64_64 855 { 856 ////////////////////////////////////////////////////////////////////////// 857 /// @brief Performs an SOA to AOS conversion 858 /// @param pSrc - source data in SOA form 859 /// @param pDst - output data in AOS form 860 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 861 #if ENABLE_AVX512_SIMD16 862 863 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 864 #endif 865 }; 866 867 ////////////////////////////////////////////////////////////////////////// 868 /// Transpose64_64_64_64 869 ////////////////////////////////////////////////////////////////////////// 870 struct Transpose64_64_64_64 871 { 872 ////////////////////////////////////////////////////////////////////////// 873 /// @brief Performs an SOA to AOS conversion 874 /// @param pSrc - source data in SOA form 875 /// @param pDst - output data in AOS form 876 static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; 877 #if ENABLE_AVX512_SIMD16 878 879 static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; 880 #endif 881 }; 882 883