Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file utils.h
     24 *
     25 * @brief Utilities used by SWR core.
     26 *
     27 ******************************************************************************/
     28 #pragma once
     29 
     30 #include <string.h>
     31 #include <type_traits>
     32 #include <algorithm>
     33 #include "common/os.h"
     34 #include "common/simdintrin.h"
     35 #include "common/swr_assert.h"
     36 #include "core/api.h"
     37 
     38 #if defined(_WIN64) || defined(__x86_64__)
     39 #define _MM_INSERT_EPI64 _mm_insert_epi64
     40 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
     41 #else
     42 INLINE int64_t _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx)
     43 {
     44     OSALIGNLINE(uint32_t) elems[4];
     45     _mm_store_si128((__m128i*)elems, a);
     46     if (ndx == 0)
     47     {
     48         uint64_t foo = elems[0];
     49         foo |= (uint64_t)elems[1] << 32;
     50         return foo;
     51     }
     52     else
     53     {
     54         uint64_t foo = elems[2];
     55         foo |= (uint64_t)elems[3] << 32;
     56         return foo;
     57     }
     58 }
     59 
     60 INLINE __m128i  _MM_INSERT_EPI64(__m128i a, int64_t b, const int32_t ndx)
     61 {
     62     OSALIGNLINE(int64_t) elems[2];
     63     _mm_store_si128((__m128i*)elems, a);
     64     if (ndx == 0)
     65     {
     66         elems[0] = b;
     67     }
     68     else
     69     {
     70         elems[1] = b;
     71     }
     72     __m128i out;
     73     out = _mm_load_si128((const __m128i*)elems);
     74     return out;
     75 }
     76 #endif
     77 
//////////////////////////////////////////////////////////////////////////
/// simdBBox - bounding-box extents stored SOA, one box per SIMD lane.
//////////////////////////////////////////////////////////////////////////
struct simdBBox
{
    simdscalari ymin;   // per-lane minimum Y
    simdscalari ymax;   // per-lane maximum Y
    simdscalari xmin;   // per-lane minimum X
    simdscalari xmax;   // per-lane maximum X
};
     85 
     86 INLINE
     87 void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
     88 {
     89     __m128i row0i = _mm_castps_si128(row0);
     90     __m128i row1i = _mm_castps_si128(row1);
     91     __m128i row2i = _mm_castps_si128(row2);
     92     __m128i row3i = _mm_castps_si128(row3);
     93 
     94     __m128i vTemp = row2i;
     95     row2i = _mm_unpacklo_epi32(row2i, row3i);
     96     vTemp = _mm_unpackhi_epi32(vTemp, row3i);
     97 
     98     row3i = row0i;
     99     row0i = _mm_unpacklo_epi32(row0i, row1i);
    100     row3i = _mm_unpackhi_epi32(row3i, row1i);
    101 
    102     row1i = row0i;
    103     row0i = _mm_unpacklo_epi64(row0i, row2i);
    104     row1i = _mm_unpackhi_epi64(row1i, row2i);
    105 
    106     row2i = row3i;
    107     row2i = _mm_unpacklo_epi64(row2i, vTemp);
    108     row3i = _mm_unpackhi_epi64(row3i, vTemp);
    109 
    110     row0 = _mm_castsi128_ps(row0i);
    111     row1 = _mm_castsi128_ps(row1i);
    112     row2 = _mm_castsi128_ps(row2i);
    113     row3 = _mm_castsi128_ps(row3i);
    114 }
    115 
    116 INLINE
    117 void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
    118 {
    119     __m128i vTemp = row2;
    120     row2 = _mm_unpacklo_epi32(row2, row3);
    121     vTemp = _mm_unpackhi_epi32(vTemp, row3);
    122 
    123     row3 = row0;
    124     row0 = _mm_unpacklo_epi32(row0, row1);
    125     row3 = _mm_unpackhi_epi32(row3, row1);
    126 
    127     row1 = row0;
    128     row0 = _mm_unpacklo_epi64(row0, row2);
    129     row1 = _mm_unpackhi_epi64(row1, row2);
    130 
    131     row2 = row3;
    132     row2 = _mm_unpacklo_epi64(row2, vTemp);
    133     row3 = _mm_unpackhi_epi64(row3, vTemp);
    134 }
    135 
// Collapse the GCC version triple into a single comparable integer
// (e.g. 4.9.0 -> 40900).  Only referenced inside #if expressions that
// first check defined(__GNUC__), so non-GCC builds are unaffected.
#define GCC_VERSION (__GNUC__ * 10000 \
                     + __GNUC_MINOR__ * 100 \
                     + __GNUC_PATCHLEVEL__)

// Map the _mm*_undefined_* intrinsics to their setzero equivalents on
// clang and on GCC < 4.9, where this codebase does not rely on them being
// provided.  Zeroing is always valid where "don't care" contents suffice.
#if defined(__clang__) || (defined(__GNUC__) && (GCC_VERSION < 40900))
#define _mm_undefined_ps _mm_setzero_ps
#define _mm_undefined_si128 _mm_setzero_si128
#if KNOB_SIMD_WIDTH == 8
#define _mm256_undefined_ps _mm256_setzero_ps
#endif
#endif
    147 
    148 #if KNOB_SIMD_WIDTH == 8
INLINE
void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
{
    // Transposes 3 SOA component vectors (x, y, z of 8 elements each) into
    // 8 AOS xmm outputs; the 4th (w) lane of each output is undefined.
    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());    //y0w0y1w1 y4w4y5w5 (w undefined)
    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5

    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());        //y2w2y3w3 y6w6y7w7
    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7

    // Elements 0-3 come from the low 128-bit lanes ...
    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);

    // ... and elements 4-7 from the high lanes.
    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
}
    172 
    173 INLINE
    174 void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
    175 {
    176     __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                    //x0z0x1z1 x4z4x5z5
    177     __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);                    //y0w0y1w1 y4w4y5w5
    178     __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);                //x0y0z0w0 x4y4z4w4
    179     __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);                //x1y1z1w1 x5y5z5w5
    180 
    181     r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                        //x2z2x3z3 x6z6x7z7
    182     r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3)                    ;        //y2w2y3w3 y6w6yw77
    183     __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);                //x2y2z2w2 x6y6z6w6
    184     __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);                //x3y3z3w3 x7y7z7w7
    185 
    186     vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
    187     vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
    188     vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
    189     vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
    190 
    191     vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
    192     vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
    193     vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
    194     vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
    195 }
    196 
    197 #if ENABLE_AVX512_SIMD16
    198 INLINE
    199 void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
    200 {
    201     const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
    202 
    203     simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
    204     simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
    205     simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
    206     simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
    207 
    208     simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
    209     simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
    210     simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
    211     simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
    212 
    213     dst[0] = _simd16_unpacklo_ps(rblo, galo);
    214     dst[1] = _simd16_unpackhi_ps(rblo, galo);
    215     dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
    216     dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
    217 }
    218 
    219 #endif
    220 INLINE
    221 void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
    222 {
    223     __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
    224     __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
    225     __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
    226     __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
    227     __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
    228     __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
    229     __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
    230     __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
    231     __m256 __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    232     __m256 __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    233     __m256 __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    234     __m256 __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    235     __m256 __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    236     __m256 __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    237     __m256 __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    238     __m256 __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    239     vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
    240     vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
    241     vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
    242     vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
    243     vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
    244     vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
    245     vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
    246     vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
    247 }
    248 
INLINE
void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
{
    // Integer-typed convenience overload: reinterpret each row as float
    // (a pure bit-cast, no conversion) and reuse the float 8x8 transpose.
    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
}
    255 #endif
    256 
    257 //////////////////////////////////////////////////////////////////////////
/// TransposeSingleComponent
    259 //////////////////////////////////////////////////////////////////////////
    260 template<uint32_t bpp>
    261 struct TransposeSingleComponent
    262 {
    263     //////////////////////////////////////////////////////////////////////////
    264     /// @brief Pass-thru for single component.
    265     /// @param pSrc - source data in SOA form
    266     /// @param pDst - output data in AOS form
    267     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    268     {
    269         memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
    270     }
    271 #if ENABLE_AVX512_SIMD16
    272 
    273     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    274     {
    275         memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
    276     }
    277 #endif
    278 };
    279 
    280 //////////////////////////////////////////////////////////////////////////
    281 /// Transpose8_8_8_8
    282 //////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    /// @param pSrc - source data in SOA form (8 r bytes, 8 g, 8 b, 8 a)
    /// @param pDst - output data in AOS form (rgba interleaved, 32 bytes)
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src = _simd_load_si((const simdscalari*)pSrc);

#if KNOB_SIMD_WIDTH == 8
#if KNOB_ARCH == KNOB_ARCH_AVX
        // AVX1 has no 256-bit integer byte shuffle: split into xmm halves
        // and interleave with successively wider unpacks (8 -> 16 bit).
        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
        _mm_store_si128((__m128i*)pDst, c0123lo);
        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
#elif KNOB_ARCH == KNOB_ARCH_AVX2
        // AVX2: pshufb each 128-bit lane into its rgba byte slots (0x80
        // index selects zero), swap lanes so each lane sees the other two
        // components, then OR the two partial results together.
        simdscalari dst01 = _mm256_shuffle_epi8(src,
            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
        dst23 = _mm256_shuffle_epi8(dst23,
            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
        simdscalari dst = _mm256_or_si256(dst01, dst23);
        _simd_store_si((simdscalari*)pDst, dst);
#endif
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        // Load 16 bytes of each component, zero-extend to 32 bits, then
        // merge the components into their byte positions via shift + OR.
        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
        __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
        __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa

        simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
        simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
        simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
        simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);

        simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);
        simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
        simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);

        simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));

        _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);             // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
    }
#endif
};
    342 
    343 //////////////////////////////////////////////////////////////////////////
    344 /// Transpose8_8_8
    345 //////////////////////////////////////////////////////////////////////////
struct Transpose8_8_8
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    /// @note Deleted: no 24bpp transpose is implemented, so any attempt to
    ///       call it is rejected at compile time.
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
#endif
};
    358 
    359 //////////////////////////////////////////////////////////////////////////
    360 /// Transpose8_8
    361 //////////////////////////////////////////////////////////////////////////
    362 struct Transpose8_8
    363 {
    364     //////////////////////////////////////////////////////////////////////////
    365     /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
    366     /// @param pSrc - source data in SOA form
    367     /// @param pDst - output data in AOS form
    368     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    369     {
    370 #if KNOB_SIMD_WIDTH == 8
    371         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
    372 
    373         __m128i rg = _mm256_castsi256_si128(src);           // rrrrrrrr gggggggg
    374         __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg gggggggg
    375         rg = _mm_unpacklo_epi8(rg, g);
    376         _mm_store_si128((__m128i*)pDst, rg);
    377 #else
    378 #error Unsupported vector width
    379 #endif
    380     }
    381 #if ENABLE_AVX512_SIMD16
    382 
    383     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    384     {
    385         __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
    386         __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
    387 
    388         simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
    389         simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
    390 
    391         simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
    392 
    393         simdscalari dst = _simd_or_si(cvt0, shl1);
    394 
    395         _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);                 // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
    396     }
    397 #endif
    398 };
    399 
    400 //////////////////////////////////////////////////////////////////////////
    401 /// Transpose32_32_32_32
    402 //////////////////////////////////////////////////////////////////////////
    403 struct Transpose32_32_32_32
    404 {
    405     //////////////////////////////////////////////////////////////////////////
    406     /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
    407     /// @param pSrc - source data in SOA form
    408     /// @param pDst - output data in AOS form
    409     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    410     {
    411 #if KNOB_SIMD_WIDTH == 8
    412         simdscalar src0 = _simd_load_ps((const float*)pSrc);
    413         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
    414         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
    415         simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
    416 
    417         __m128 vDst[8];
    418         vTranspose4x8(vDst, src0, src1, src2, src3);
    419         _mm_store_ps((float*)pDst, vDst[0]);
    420         _mm_store_ps((float*)pDst+4, vDst[1]);
    421         _mm_store_ps((float*)pDst+8, vDst[2]);
    422         _mm_store_ps((float*)pDst+12, vDst[3]);
    423         _mm_store_ps((float*)pDst+16, vDst[4]);
    424         _mm_store_ps((float*)pDst+20, vDst[5]);
    425         _mm_store_ps((float*)pDst+24, vDst[6]);
    426         _mm_store_ps((float*)pDst+28, vDst[7]);
    427 #else
    428 #error Unsupported vector width
    429 #endif
    430     }
    431 #if ENABLE_AVX512_SIMD16
    432 
    433     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    434     {
    435         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
    436         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
    437         simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
    438         simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
    439 
    440         simd16scalar dst[4];
    441 
    442         vTranspose4x16(dst, src0, src1, src2, src3);
    443 
    444         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
    445         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
    446         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
    447         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
    448     }
    449 #endif
    450 };
    451 
    452 //////////////////////////////////////////////////////////////////////////
    453 /// Transpose32_32_32
    454 //////////////////////////////////////////////////////////////////////////
    455 struct Transpose32_32_32
    456 {
    457     //////////////////////////////////////////////////////////////////////////
    458     /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
    459     /// @param pSrc - source data in SOA form
    460     /// @param pDst - output data in AOS form
    461     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    462     {
    463 #if KNOB_SIMD_WIDTH == 8
    464         simdscalar src0 = _simd_load_ps((const float*)pSrc);
    465         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
    466         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
    467 
    468         __m128 vDst[8];
    469         vTranspose3x8(vDst, src0, src1, src2);
    470         _mm_store_ps((float*)pDst, vDst[0]);
    471         _mm_store_ps((float*)pDst + 4, vDst[1]);
    472         _mm_store_ps((float*)pDst + 8, vDst[2]);
    473         _mm_store_ps((float*)pDst + 12, vDst[3]);
    474         _mm_store_ps((float*)pDst + 16, vDst[4]);
    475         _mm_store_ps((float*)pDst + 20, vDst[5]);
    476         _mm_store_ps((float*)pDst + 24, vDst[6]);
    477         _mm_store_ps((float*)pDst + 28, vDst[7]);
    478 #else
    479 #error Unsupported vector width
    480 #endif
    481     }
    482 #if ENABLE_AVX512_SIMD16
    483 
    484     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    485     {
    486         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
    487         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
    488         simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
    489         simd16scalar src3 = _simd16_setzero_ps();
    490 
    491         simd16scalar dst[4];
    492 
    493         vTranspose4x16(dst, src0, src1, src2, src3);
    494 
    495         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
    496         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
    497         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
    498         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
    499     }
    500 #endif
    501 };
    502 
    503 //////////////////////////////////////////////////////////////////////////
    504 /// Transpose32_32
    505 //////////////////////////////////////////////////////////////////////////
    506 struct Transpose32_32
    507 {
    508     //////////////////////////////////////////////////////////////////////////
    509     /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
    510     /// @param pSrc - source data in SOA form
    511     /// @param pDst - output data in AOS form
    512     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    513     {
    514 #if KNOB_SIMD_WIDTH == 8
    515         const float* pfSrc = (const float*)pSrc;
    516         __m128 src_r0 = _mm_load_ps(pfSrc + 0);
    517         __m128 src_r1 = _mm_load_ps(pfSrc + 4);
    518         __m128 src_g0 = _mm_load_ps(pfSrc + 8);
    519         __m128 src_g1 = _mm_load_ps(pfSrc + 12);
    520 
    521         __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
    522         __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
    523         __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
    524         __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
    525 
    526         float* pfDst = (float*)pDst;
    527         _mm_store_ps(pfDst + 0, dst0);
    528         _mm_store_ps(pfDst + 4, dst1);
    529         _mm_store_ps(pfDst + 8, dst2);
    530         _mm_store_ps(pfDst + 12, dst3);
    531 #else
    532 #error Unsupported vector width
    533 #endif
    534     }
    535 #if ENABLE_AVX512_SIMD16
    536 
    537     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    538     {
    539         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));                 // rrrrrrrrrrrrrrrr
    540         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);            // gggggggggggggggg
    541 
    542         simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                                        // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
    543         simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                                        // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
    544 
    545         simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44);  // (1, 0, 1, 0)             // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
    546         simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE);  // (3, 2, 3, 2)             // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
    547 
    548         simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8);  // (3, 1, 2, 0)             // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
    549         simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8);  // (3, 1, 2, 0)             // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
    550 
    551         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst0);                               // rgrgrgrgrgrgrgrg
    552         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1);                               // rgrgrgrgrgrgrgrg
    553     }
    554 #endif
    555 };
    556 
    557 //////////////////////////////////////////////////////////////////////////
    558 /// Transpose16_16_16_16
    559 //////////////////////////////////////////////////////////////////////////
struct Transpose16_16_16_16
{
    //////////////////////////////////////////////////////////////////////////
    /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
    /// @param pSrc - source data in SOA form
    /// @param pDst - output data in AOS form
    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    {
#if KNOB_SIMD_WIDTH == 8
        // First ymm holds r and g (8 words each); second holds b and a.
        simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
        simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));

        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);

        // Interleave at 16-bit then 32-bit granularity to form rgba quads.
        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);

        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);

        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
#else
#error Unsupported vector width
#endif
    }
#if ENABLE_AVX512_SIMD16

    INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    {
        simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
        simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
        simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
        simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3);          // aaaaaaaaaaaaaaaa

        // 16-bit unpacks interleave within 128-bit lanes, so elements come
        // out lane-grouped (0-7 / 8-F) ...
        simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
        simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
        simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba2 ba3 ba8 ba9 baA baB
        simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF

        simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rgba0 rgba1 rgba8 rgba9
        simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rgba2 rgba3 rgbaA rgbaB
        simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rgba4 rgba5 rgbaC rgbaD
        simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rgba6 rgba7 rgbaE rgbaF

        // ... and the final lane permutes restore ascending element order.
        simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rgba0 rgba1 rgba2 rgba3
        simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rgba4 rgba5 rgba6 rgba7
        simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rgba8 rgba9 rgbaA rgbaB
        simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rgbaC rgbaD rgbaE rgbaF

        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
        _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
    }
#endif
};
    626 
    627 //////////////////////////////////////////////////////////////////////////
    628 /// Transpose16_16_16
    629 //////////////////////////////////////////////////////////////////////////
    630 struct Transpose16_16_16
    631 {
    632     //////////////////////////////////////////////////////////////////////////
    633     /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    634     /// @param pSrc - source data in SOA form
    635     /// @param pDst - output data in AOS form
    636     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    637     {
    638 #if KNOB_SIMD_WIDTH == 8
    639         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
    640 
    641         __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
    642         __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
    643         __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
    644         __m128i src_a = _mm_undefined_si128();
    645 
    646         __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
    647         __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
    648         __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
    649         __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
    650 
    651         __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
    652         __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
    653         __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
    654         __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
    655 
    656         _mm_store_si128(((__m128i*)pDst) + 0, dst0);
    657         _mm_store_si128(((__m128i*)pDst) + 1, dst1);
    658         _mm_store_si128(((__m128i*)pDst) + 2, dst2);
    659         _mm_store_si128(((__m128i*)pDst) + 3, dst3);
    660 #else
    661 #error Unsupported vector width
    662 #endif
    663     }
    664 #if ENABLE_AVX512_SIMD16
    665 
    666     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    667     {
    668         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
    669         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
    670         simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
    671         simdscalari src3 = _simd_setzero_si();                                                      // aaaaaaaaaaaaaaaa
    672 
    673         simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
    674         simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
    675         simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
    676         simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
    677 
    678         simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
    679         simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
    680         simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
    681         simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
    682 
    683         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
    684         simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
    685         simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
    686         simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
    687 
    688         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
    689         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
    690         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
    691         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
    692     }
    693 #endif
    694 };
    695 
    696 //////////////////////////////////////////////////////////////////////////
    697 /// Transpose16_16
    698 //////////////////////////////////////////////////////////////////////////
    699 struct Transpose16_16
    700 {
    701     //////////////////////////////////////////////////////////////////////////
    702     /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
    703     /// @param pSrc - source data in SOA form
    704     /// @param pDst - output data in AOS form
    705     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    706     {
    707 #if KNOB_SIMD_WIDTH == 8
    708         simdscalar src = _simd_load_ps((const float*)pSrc);
    709 
    710         __m128 comp0 = _mm256_castps256_ps128(src);
    711         __m128 comp1 = _mm256_extractf128_ps(src, 1);
    712 
    713         __m128i comp0i = _mm_castps_si128(comp0);
    714         __m128i comp1i = _mm_castps_si128(comp1);
    715 
    716         __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
    717         __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
    718 
    719         _mm_store_si128((__m128i*)pDst, resLo);
    720         _mm_store_si128((__m128i*)pDst + 1, resHi);
    721 #else
    722 #error Unsupported vector width
    723 #endif
    724     }
    725 #if ENABLE_AVX512_SIMD16
    726 
    727     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    728     {
    729         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
    730         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
    731 
    732         simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
    733         simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
    734 
    735         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20);     // (2, 0)                   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
    736         simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31);     // (3, 1)                   // rg8 rg9 rgA rgB rgC rgD rgE rgF
    737 
    738         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgrgrgrgrgrgrgrg
    739         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgrgrgrgrgrgrgrg
    740     }
    741 #endif
    742 };
    743 
    744 //////////////////////////////////////////////////////////////////////////
    745 /// Transpose24_8
    746 //////////////////////////////////////////////////////////////////////////
    747 struct Transpose24_8
    748 {
    749     //////////////////////////////////////////////////////////////////////////
    750     /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    751     /// @param pSrc - source data in SOA form
    752     /// @param pDst - output data in AOS form
    753     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    754 #if ENABLE_AVX512_SIMD16
    755 
    756     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    757 #endif
    758 };
    759 
    760 //////////////////////////////////////////////////////////////////////////
    761 /// Transpose32_8_24
    762 //////////////////////////////////////////////////////////////////////////
    763 struct Transpose32_8_24
    764 {
    765     //////////////////////////////////////////////////////////////////////////
    766     /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    767     /// @param pSrc - source data in SOA form
    768     /// @param pDst - output data in AOS form
    769     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    770 #if ENABLE_AVX512_SIMD16
    771 
    772     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    773 #endif
    774 };
    775 
    776 //////////////////////////////////////////////////////////////////////////
    777 /// Transpose4_4_4_4
    778 //////////////////////////////////////////////////////////////////////////
    779 struct Transpose4_4_4_4
    780 {
    781     //////////////////////////////////////////////////////////////////////////
    782     /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    783     /// @param pSrc - source data in SOA form
    784     /// @param pDst - output data in AOS form
    785     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    786 #if ENABLE_AVX512_SIMD16
    787 
    788     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    789 #endif
    790 };
    791 
    792 //////////////////////////////////////////////////////////////////////////
    793 /// Transpose5_6_5
    794 //////////////////////////////////////////////////////////////////////////
    795 struct Transpose5_6_5
    796 {
    797     //////////////////////////////////////////////////////////////////////////
    798     /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    799     /// @param pSrc - source data in SOA form
    800     /// @param pDst - output data in AOS form
    801     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    802 #if ENABLE_AVX512_SIMD16
    803 
    804     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    805 #endif
    806 };
    807 
    808 //////////////////////////////////////////////////////////////////////////
    809 /// Transpose9_9_9_5
    810 //////////////////////////////////////////////////////////////////////////
    811 struct Transpose9_9_9_5
    812 {
    813     //////////////////////////////////////////////////////////////////////////
    814     /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    815     /// @param pSrc - source data in SOA form
    816     /// @param pDst - output data in AOS form
    817     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    818 #if ENABLE_AVX512_SIMD16
    819 
    820     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    821 #endif
    822 };
    823 
    824 //////////////////////////////////////////////////////////////////////////
    825 /// Transpose5_5_5_1
    826 //////////////////////////////////////////////////////////////////////////
    827 struct Transpose5_5_5_1
    828 {
    829     //////////////////////////////////////////////////////////////////////////
    830     /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    831     /// @param pSrc - source data in SOA form
    832     /// @param pDst - output data in AOS form
    833     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    834 #if ENABLE_AVX512_SIMD16
    835 
    836     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    837 #endif
    838 };
    839 
    840 //////////////////////////////////////////////////////////////////////////
    841 /// Transpose1_5_5_5
    842 //////////////////////////////////////////////////////////////////////////
    843 struct Transpose1_5_5_5
    844 {
    845     //////////////////////////////////////////////////////////////////////////
    846     /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    847     /// @param pSrc - source data in SOA form
    848     /// @param pDst - output data in AOS form
    849     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    850 };
    851 
    852 //////////////////////////////////////////////////////////////////////////
    853 /// Transpose10_10_10_2
    854 //////////////////////////////////////////////////////////////////////////
    855 struct Transpose10_10_10_2
    856 {
    857     //////////////////////////////////////////////////////////////////////////
    858     /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    859     /// @param pSrc - source data in SOA form
    860     /// @param pDst - output data in AOS form
    861     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    862 #if ENABLE_AVX512_SIMD16
    863 
    864     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    865 #endif
    866 };
    867 
    868 //////////////////////////////////////////////////////////////////////////
    869 /// Transpose11_11_10
    870 //////////////////////////////////////////////////////////////////////////
    871 struct Transpose11_11_10
    872 {
    873     //////////////////////////////////////////////////////////////////////////
    874     /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    875     /// @param pSrc - source data in SOA form
    876     /// @param pDst - output data in AOS form
    877     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    878 #if ENABLE_AVX512_SIMD16
    879 
    880     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    881 #endif
    882 };
    883 
    884 //////////////////////////////////////////////////////////////////////////
    885 /// Transpose64
    886 //////////////////////////////////////////////////////////////////////////
    887 struct Transpose64
    888 {
    889     //////////////////////////////////////////////////////////////////////////
    890     /// @brief Performs an SOA to AOS conversion
    891     /// @param pSrc - source data in SOA form
    892     /// @param pDst - output data in AOS form
    893     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    894 #if ENABLE_AVX512_SIMD16
    895 
    896     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    897 #endif
    898 };
    899 
    900 //////////////////////////////////////////////////////////////////////////
    901 /// Transpose64_64
    902 //////////////////////////////////////////////////////////////////////////
    903 struct Transpose64_64
    904 {
    905     //////////////////////////////////////////////////////////////////////////
    906     /// @brief Performs an SOA to AOS conversion
    907     /// @param pSrc - source data in SOA form
    908     /// @param pDst - output data in AOS form
    909     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    910 #if ENABLE_AVX512_SIMD16
    911 
    912     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    913 #endif
    914 };
    915 
    916 //////////////////////////////////////////////////////////////////////////
    917 /// Transpose64_64_64
    918 //////////////////////////////////////////////////////////////////////////
    919 struct Transpose64_64_64
    920 {
    921     //////////////////////////////////////////////////////////////////////////
    922     /// @brief Performs an SOA to AOS conversion
    923     /// @param pSrc - source data in SOA form
    924     /// @param pDst - output data in AOS form
    925     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    926 #if ENABLE_AVX512_SIMD16
    927 
    928     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    929 #endif
    930 };
    931 
    932 //////////////////////////////////////////////////////////////////////////
    933 /// Transpose64_64_64_64
    934 //////////////////////////////////////////////////////////////////////////
    935 struct Transpose64_64_64_64
    936 {
    937     //////////////////////////////////////////////////////////////////////////
    938     /// @brief Performs an SOA to AOS conversion
    939     /// @param pSrc - source data in SOA form
    940     /// @param pDst - output data in AOS form
    941     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    942 #if ENABLE_AVX512_SIMD16
    943 
    944     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    945 #endif
    946 };
    947 
    948 // helper function to unroll loops
    949 template<int Begin, int End, int Step = 1>
    950 struct UnrollerL {
    951     template<typename Lambda>
    952     INLINE static void step(Lambda& func) {
    953         func(Begin);
    954         UnrollerL<Begin + Step, End, Step>::step(func);
    955     }
    956 };
    957 
// Terminator partial specialization: reached when the running index equals
// End; deliberately does nothing, ending the unrolled recursion.
template<int End, int Step>
struct UnrollerL<End, End, Step> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
    964 
    965 // helper function to unroll loops, with mask to skip specific iterations
    966 template<int Begin, int End, int Step = 1, int Mask = 0x7f>
    967 struct UnrollerLMask {
    968     template<typename Lambda>
    969     INLINE static void step(Lambda& func) {
    970         if(Mask & (1 << Begin))
    971         {
    972             func(Begin);
    973         }
    974         UnrollerL<Begin + Step, End, Step>::step(func);
    975     }
    976 };
    977 
// Terminator partial specialization: reached when the running index equals
// End; deliberately does nothing, ending the masked unrolled recursion.
template<int End, int Step, int Mask>
struct UnrollerLMask<End, End, Step, Mask> {
    template<typename Lambda>
    static void step(Lambda& func) {
    }
};
    984 
// general CRC compute
//////////////////////////////////////////////////////////////////////////
/// @brief Accumulates a hardware CRC32 (SSE4.2 crc32 instruction, Castagnoli
///        polynomial) over an arbitrary byte buffer.
/// @param crc   - running CRC value from a previous call (or the seed value)
/// @param pData - buffer to checksum; no alignment requirement beyond the
///                word reads below (assumes pData is suitably aligned for
///                uint64_t/uint32_t loads - TODO confirm with callers)
/// @param size  - buffer length in bytes
/// @return updated CRC value
/// @note The main loop consumes 8 bytes per step on 64-bit builds and
///       4 bytes per step on 32-bit builds; trailing bytes are folded in
///       one at a time either way.
INLINE
uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
{
#if defined(_WIN64) || defined(__x86_64__)
    uint32_t sizeInQwords = size / sizeof(uint64_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint64_t);
    uint64_t* pDataWords = (uint64_t*)pData;
    for (uint32_t i = 0; i < sizeInQwords; ++i)
    {
        // 64-bit crc32 returns a 64-bit value whose low 32 bits hold the CRC.
        crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++);
    }
#else
    uint32_t sizeInDwords = size / sizeof(uint32_t);
    uint32_t sizeRemainderBytes = size % sizeof(uint32_t);
    uint32_t* pDataWords = (uint32_t*)pData;
    for (uint32_t i = 0; i < sizeInDwords; ++i)
    {
        crc = _mm_crc32_u32(crc, *pDataWords++);
    }
#endif

    // pDataWords now points just past the last whole word; fold in the tail.
    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
    for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
    {
        crc = _mm_crc32_u8(crc, *pRemainderBytes++);
    }

    return crc;
}
   1015 
   1016 //////////////////////////////////////////////////////////////////////////
   1017 /// Add byte offset to any-type pointer
   1018 //////////////////////////////////////////////////////////////////////////
   1019 template <typename T>
   1020 INLINE
   1021 static T* PtrAdd(T* p, intptr_t offset)
   1022 {
   1023     intptr_t intp = reinterpret_cast<intptr_t>(p);
   1024     return reinterpret_cast<T*>(intp + offset);
   1025 }
   1026 
   1027 //////////////////////////////////////////////////////////////////////////
   1028 /// Is a power-of-2?
   1029 //////////////////////////////////////////////////////////////////////////
   1030 template <typename T>
   1031 INLINE
   1032 static bool IsPow2(T value)
   1033 {
   1034     return value == (value & (0 - value));
   1035 }
   1036 
   1037 //////////////////////////////////////////////////////////////////////////
   1038 /// Align down to specified alignment
   1039 /// Note: IsPow2(alignment) MUST be true
   1040 //////////////////////////////////////////////////////////////////////////
   1041 template <typename T1, typename T2>
   1042 INLINE
   1043 static T1 AlignDownPow2(T1 value, T2 alignment)
   1044 {
   1045     SWR_ASSERT(IsPow2(alignment));
   1046     return value & ~T1(alignment - 1);
   1047 }
   1048 
   1049 //////////////////////////////////////////////////////////////////////////
   1050 /// Align up to specified alignment
   1051 /// Note: IsPow2(alignment) MUST be true
   1052 //////////////////////////////////////////////////////////////////////////
   1053 template <typename T1, typename T2>
   1054 INLINE
   1055 static T1 AlignUpPow2(T1 value, T2 alignment)
   1056 {
   1057     return AlignDownPow2(value + T1(alignment - 1), alignment);
   1058 }
   1059 
   1060 //////////////////////////////////////////////////////////////////////////
   1061 /// Align up ptr to specified alignment
   1062 /// Note: IsPow2(alignment) MUST be true
   1063 //////////////////////////////////////////////////////////////////////////
   1064 template <typename T1, typename T2>
   1065 INLINE
   1066 static T1* AlignUpPow2(T1* value, T2 alignment)
   1067 {
   1068     return reinterpret_cast<T1*>(
   1069         AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment));
   1070 }
   1071 
   1072 //////////////////////////////////////////////////////////////////////////
   1073 /// Align down to specified alignment
   1074 //////////////////////////////////////////////////////////////////////////
   1075 template <typename T1, typename T2>
   1076 INLINE
   1077 static T1 AlignDown(T1 value, T2 alignment)
   1078 {
   1079     if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); }
   1080     return value - T1(value % alignment);
   1081 }
   1082 
   1083 //////////////////////////////////////////////////////////////////////////
   1084 /// Align down to specified alignment
   1085 //////////////////////////////////////////////////////////////////////////
   1086 template <typename T1, typename T2>
   1087 INLINE
   1088 static T1* AlignDown(T1* value, T2 alignment)
   1089 {
   1090     return (T1*)AlignDown(uintptr_t(value), alignment);
   1091 }
   1092 
   1093 //////////////////////////////////////////////////////////////////////////
   1094 /// Align up to specified alignment
   1095 /// Note: IsPow2(alignment) MUST be true
   1096 //////////////////////////////////////////////////////////////////////////
   1097 template <typename T1, typename T2>
   1098 INLINE
   1099 static T1 AlignUp(T1 value, T2 alignment)
   1100 {
   1101     return AlignDown(value + T1(alignment - 1), alignment);
   1102 }
   1103 
   1104 //////////////////////////////////////////////////////////////////////////
   1105 /// Align up to specified alignment
   1106 /// Note: IsPow2(alignment) MUST be true
   1107 //////////////////////////////////////////////////////////////////////////
   1108 template <typename T1, typename T2>
   1109 INLINE
   1110 static T1* AlignUp(T1* value, T2 alignment)
   1111 {
   1112     return AlignDown(PtrAdd(value, alignment - 1), alignment);
   1113 }
   1114 
   1115 //////////////////////////////////////////////////////////////////////////
   1116 /// Helper structure used to access an array of elements that don't
   1117 /// correspond to a typical word size.
   1118 //////////////////////////////////////////////////////////////////////////
   1119 template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
   1120 class BitsArray
   1121 {
   1122 private:
   1123     static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
   1124     static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
   1125     static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
   1126     static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;
   1127 
   1128     static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
   1129         "Element size must an integral fraction of pointer size");
   1130 
   1131     size_t              m_words[NUM_WORDS] = {};
   1132 
   1133 public:
   1134 
   1135     T operator[] (size_t elementIndex) const
   1136     {
   1137         size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
   1138         word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
   1139         return T(word & ELEMENT_MASK);
   1140     }
   1141 };
   1142 
// Ranged integer argument for TemplateArgUnroller
// The [TMin, TMax] range bounds the compile-time expansion; val carries the
// runtime value and is expected to satisfy TMin <= val <= TMax.
template <uint32_t TMin, uint32_t TMax>
struct IntArg
{
    uint32_t val;
};
   1149 
   1150 // Recursive template used to auto-nest conditionals.  Converts dynamic boolean function
   1151 // arguments to static template arguments.
   1152 template <typename TermT, typename... ArgsB>
   1153 struct TemplateArgUnroller
   1154 {
   1155     //-----------------------------------------
   1156     // Boolean value
   1157     //-----------------------------------------
   1158 
   1159     // Last Arg Terminator
   1160     static typename TermT::FuncType GetFunc(bool bArg)
   1161     {
   1162         if (bArg)
   1163         {
   1164             return TermT::template GetFunc<ArgsB..., std::true_type>();
   1165         }
   1166 
   1167         return TermT::template GetFunc<ArgsB..., std::false_type>();
   1168     }
   1169 
   1170     // Recursively parse args
   1171     template <typename... TArgsT>
   1172     static typename TermT::FuncType GetFunc(bool bArg, TArgsT... remainingArgs)
   1173     {
   1174         if (bArg)
   1175         {
   1176             return TemplateArgUnroller<TermT, ArgsB..., std::true_type>::GetFunc(remainingArgs...);
   1177         }
   1178 
   1179         return TemplateArgUnroller<TermT, ArgsB..., std::false_type>::GetFunc(remainingArgs...);
   1180     }
   1181 
   1182     //-----------------------------------------
   1183     // Integer value (within specified range)
   1184     //-----------------------------------------
   1185 
   1186     // Last Arg Terminator
   1187     template <uint32_t TMin, uint32_t TMax>
   1188     static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg)
   1189     {
   1190         if (iArg.val == TMax)
   1191         {
   1192             return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TMax>>();
   1193         }
   1194         if (TMax > TMin)
   1195         {
   1196             return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax-1>{iArg.val});
   1197         }
   1198         SWR_ASSUME(false); return nullptr;
   1199     }
   1200     template <uint32_t TVal>
   1201     static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg)
   1202     {
   1203         SWR_ASSERT(iArg.val == TVal);
   1204         return TermT::template GetFunc<ArgsB..., std::integral_constant<uint32_t, TVal>>();
   1205     }
   1206 
   1207     // Recursively parse args
   1208     template <uint32_t TMin, uint32_t TMax, typename... TArgsT>
   1209     static typename TermT::FuncType GetFunc(IntArg<TMin, TMax> iArg, TArgsT... remainingArgs)
   1210     {
   1211         if (iArg.val == TMax)
   1212         {
   1213             return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TMax>>::GetFunc(remainingArgs...);
   1214         }
   1215         if (TMax > TMin)
   1216         {
   1217             return TemplateArgUnroller<TermT, ArgsB...>::GetFunc(IntArg<TMin, TMax - 1>{iArg.val}, remainingArgs...);
   1218         }
   1219         SWR_ASSUME(false); return nullptr;
   1220     }
   1221     template <uint32_t TVal, typename... TArgsT>
   1222     static typename TermT::FuncType GetFunc(IntArg<TVal, TVal> iArg, TArgsT... remainingArgs)
   1223     {
   1224         SWR_ASSERT(iArg.val == TVal);
   1225         return TemplateArgUnroller<TermT, ArgsB..., std::integral_constant<uint32_t, TVal>>::GetFunc(remainingArgs...);
   1226     }
   1227 };
   1228 
   1229 
   1230