Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file utils.h
     24 *
     25 * @brief Utilities used by SWR core related to pixel formats.
     26 *
     27 ******************************************************************************/
     28 #pragma once
     29 
     30 #include "core/utils.h"
     31 #include "common/simdintrin.h"
     32 
     33 INLINE
     34 void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
     35 {
     36     simd4scalari row0i = SIMD128::castps_si(row0);
     37     simd4scalari row1i = SIMD128::castps_si(row1);
     38     simd4scalari row2i = SIMD128::castps_si(row2);
     39     simd4scalari row3i = SIMD128::castps_si(row3);
     40 
     41     simd4scalari vTemp = row2i;
     42     row2i = SIMD128::unpacklo_epi32(row2i, row3i);
     43     vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
     44 
     45     row3i = row0i;
     46     row0i = SIMD128::unpacklo_epi32(row0i, row1i);
     47     row3i = SIMD128::unpackhi_epi32(row3i, row1i);
     48 
     49     row1i = row0i;
     50     row0i = SIMD128::unpacklo_epi64(row0i, row2i);
     51     row1i = SIMD128::unpackhi_epi64(row1i, row2i);
     52 
     53     row2i = row3i;
     54     row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
     55     row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
     56 
     57     row0 = SIMD128::castsi_ps(row0i);
     58     row1 = SIMD128::castsi_ps(row1i);
     59     row2 = SIMD128::castsi_ps(row2i);
     60     row3 = SIMD128::castsi_ps(row3i);
     61 }
     62 
     63 INLINE
     64 void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
     65 {
     66     simd4scalari vTemp = row2;
     67     row2 = SIMD128::unpacklo_epi32(row2, row3);
     68     vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
     69 
     70     row3 = row0;
     71     row0 = SIMD128::unpacklo_epi32(row0, row1);
     72     row3 = SIMD128::unpackhi_epi32(row3, row1);
     73 
     74     row1 = row0;
     75     row0 = SIMD128::unpacklo_epi64(row0, row2);
     76     row1 = SIMD128::unpackhi_epi64(row1, row2);
     77 
     78     row2 = row3;
     79     row2 = SIMD128::unpacklo_epi64(row2, vTemp);
     80     row3 = SIMD128::unpackhi_epi64(row3, vTemp);
     81 }
     82 
     83 #if KNOB_SIMD_WIDTH == 8
     84 INLINE
     85 void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
     86 {
     87     simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
     88     simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps());     //y0w0y1w1 y4w4y5w5
     89     simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);              //x0y0z0w0 x4y4z4w4
     90     simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);              //x1y1z1w1 x5y5z5w5
     91 
     92     r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                             //x2z2x3z3 x6z6x7z7
     93     r1rx = _simd_unpackhi_ps(vSrc1, _simd_setzero_ps());                //y2w2y3w3 y6w6yw77
     94     simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
     95     simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7
     96 
     97     vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
     98     vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
     99     vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
    100     vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
    101 
    102     vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
    103     vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
    104     vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
    105     vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
    106 }
    107 
    108 INLINE
    109 void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
    110 {
    111     simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);      //x0z0x1z1 x4z4x5z5
    112     simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3);      //y0w0y1w1 y4w4y5w5
    113     simdscalar r02r1xlolo = _simd_unpacklo_ps(r0r2, r1rx);  //x0y0z0w0 x4y4z4w4
    114     simdscalar r02r1xlohi = _simd_unpackhi_ps(r0r2, r1rx);  //x1y1z1w1 x5y5z5w5
    115 
    116     r0r2 = _simd_unpackhi_ps(vSrc0, vSrc2);                 //x2z2x3z3 x6z6x7z7
    117     r1rx = _simd_unpackhi_ps(vSrc1, vSrc3);                 //y2w2y3w3 y6w6yw77
    118     simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);  //x2y2z2w2 x6y6z6w6
    119     simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);  //x3y3z3w3 x7y7z7w7
    120 
    121     vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
    122     vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
    123     vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
    124     vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
    125 
    126     vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
    127     vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
    128     vDst[6] = _simd_extractf128_ps(r02r1xhilo, 1);
    129     vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
    130 }
    131 
    132 #if ENABLE_AVX512_SIMD16
    133 INLINE
    134 void vTranspose4x16(simd16scalar(&dst)[4], const simd16scalar &src0, const simd16scalar &src1, const simd16scalar &src2, const simd16scalar &src3)
    135 {
    136     const simd16scalari perm = _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); // pre-permute input to setup the right order after all the unpacking
    137 
    138     simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
    139     simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
    140     simd16scalar pre2 = _simd16_permute_ps(src2, perm); // b
    141     simd16scalar pre3 = _simd16_permute_ps(src3, perm); // a
    142 
    143     simd16scalar rblo = _simd16_unpacklo_ps(pre0, pre2);
    144     simd16scalar galo = _simd16_unpacklo_ps(pre1, pre3);
    145     simd16scalar rbhi = _simd16_unpackhi_ps(pre0, pre2);
    146     simd16scalar gahi = _simd16_unpackhi_ps(pre1, pre3);
    147 
    148     dst[0] = _simd16_unpacklo_ps(rblo, galo);
    149     dst[1] = _simd16_unpackhi_ps(rblo, galo);
    150     dst[2] = _simd16_unpacklo_ps(rbhi, gahi);
    151     dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
    152 }
    153 
    154 #endif
    155 INLINE
    156 void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar &vMask0, const simdscalar &vMask1, const simdscalar &vMask2, const simdscalar &vMask3, const simdscalar &vMask4, const simdscalar &vMask5, const simdscalar &vMask6, const simdscalar &vMask7)
    157 {
    158     simdscalar __t0 = _simd_unpacklo_ps(vMask0, vMask1);
    159     simdscalar __t1 = _simd_unpackhi_ps(vMask0, vMask1);
    160     simdscalar __t2 = _simd_unpacklo_ps(vMask2, vMask3);
    161     simdscalar __t3 = _simd_unpackhi_ps(vMask2, vMask3);
    162     simdscalar __t4 = _simd_unpacklo_ps(vMask4, vMask5);
    163     simdscalar __t5 = _simd_unpackhi_ps(vMask4, vMask5);
    164     simdscalar __t6 = _simd_unpacklo_ps(vMask6, vMask7);
    165     simdscalar __t7 = _simd_unpackhi_ps(vMask6, vMask7);
    166     simdscalar __tt0 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    167     simdscalar __tt1 = _simd_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    168     simdscalar __tt2 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    169     simdscalar __tt3 = _simd_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    170     simdscalar __tt4 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    171     simdscalar __tt5 = _simd_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    172     simdscalar __tt6 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    173     simdscalar __tt7 = _simd_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    174     vDst[0] = _simd_permute2f128_ps(__tt0, __tt4, 0x20);
    175     vDst[1] = _simd_permute2f128_ps(__tt1, __tt5, 0x20);
    176     vDst[2] = _simd_permute2f128_ps(__tt2, __tt6, 0x20);
    177     vDst[3] = _simd_permute2f128_ps(__tt3, __tt7, 0x20);
    178     vDst[4] = _simd_permute2f128_ps(__tt0, __tt4, 0x31);
    179     vDst[5] = _simd_permute2f128_ps(__tt1, __tt5, 0x31);
    180     vDst[6] = _simd_permute2f128_ps(__tt2, __tt6, 0x31);
    181     vDst[7] = _simd_permute2f128_ps(__tt3, __tt7, 0x31);
    182 }
    183 
    184 INLINE
    185 void vTranspose8x8(simdscalar (&vDst)[8], const simdscalari &vMask0, const simdscalari &vMask1, const simdscalari &vMask2, const simdscalari &vMask3, const simdscalari &vMask4, const simdscalari &vMask5, const simdscalari &vMask6, const simdscalari &vMask7)
    186 {
    187     vTranspose8x8(vDst, _simd_castsi_ps(vMask0), _simd_castsi_ps(vMask1), _simd_castsi_ps(vMask2), _simd_castsi_ps(vMask3),
    188         _simd_castsi_ps(vMask4), _simd_castsi_ps(vMask5), _simd_castsi_ps(vMask6), _simd_castsi_ps(vMask7));
    189 }
    190 #endif
    191 
    192 //////////////////////////////////////////////////////////////////////////
    193 /// TransposeSingleComponent
    194 //////////////////////////////////////////////////////////////////////////
    195 template<uint32_t bpp>
    196 struct TransposeSingleComponent
    197 {
    198     //////////////////////////////////////////////////////////////////////////
    199     /// @brief Pass-thru for single component.
    200     /// @param pSrc - source data in SOA form
    201     /// @param pDst - output data in AOS form
    202     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    203     {
    204         memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
    205     }
    206 #if ENABLE_AVX512_SIMD16
    207 
    208     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    209     {
    210         memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
    211     }
    212 #endif
    213 };
    214 
    215 //////////////////////////////////////////////////////////////////////////
    216 /// Transpose8_8_8_8
    217 //////////////////////////////////////////////////////////////////////////
    218 struct Transpose8_8_8_8
    219 {
    220     //////////////////////////////////////////////////////////////////////////
    221     /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
    222     /// @param pSrc - source data in SOA form
    223     /// @param pDst - output data in AOS form
    224     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    225     {
    226         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
    227 
    228 #if KNOB_SIMD_WIDTH == 8
    229 #if KNOB_ARCH <= KNOB_ARCH_AVX
    230         simd4scalari c0c1 = src.v4[0];                                                          // rrrrrrrrgggggggg
    231         simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1));  // bbbbbbbbaaaaaaaa
    232         simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
    233         simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
    234         simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
    235         simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
    236         simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
    237         simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
    238         SIMD128::store_si((simd4scalari*)pDst, c0123lo);
    239         SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
    240 #else
    241         simdscalari dst01 = _simd_shuffle_epi8(src,
    242             _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
    243         simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
    244         dst23 = _simd_shuffle_epi8(dst23,
    245             _simd_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
    246         simdscalari dst = _simd_or_si(dst01, dst23);
    247         _simd_store_si((simdscalari*)pDst, dst);
    248 #endif
    249 #else
    250 #error Unsupported vector width
    251 #endif
    252     }
    253 #if ENABLE_AVX512_SIMD16
    254 
    255     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    256     {
    257         simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc));     // rrrrrrrrrrrrrrrr
    258         simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
    259         simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
    260         simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
    261 
    262         simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
    263         simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
    264         simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
    265         simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
    266 
    267         simd16scalari shl1 = _simd16_slli_epi32(cvt1,  8);
    268         simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
    269         simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
    270 
    271         simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
    272 
    273         _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), dst);             // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
    274     }
    275 #endif
    276 };
    277 
    278 //////////////////////////////////////////////////////////////////////////
    279 /// Transpose8_8_8
    280 //////////////////////////////////////////////////////////////////////////
    281 struct Transpose8_8_8
    282 {
    283     //////////////////////////////////////////////////////////////////////////
    284     /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
    285     /// @param pSrc - source data in SOA form
    286     /// @param pDst - output data in AOS form
    287     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    288 #if ENABLE_AVX512_SIMD16
    289 
    290     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    291 #endif
    292 };
    293 
    294 //////////////////////////////////////////////////////////////////////////
    295 /// Transpose8_8
    296 //////////////////////////////////////////////////////////////////////////
    297 struct Transpose8_8
    298 {
    299     //////////////////////////////////////////////////////////////////////////
    300     /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
    301     /// @param pSrc - source data in SOA form
    302     /// @param pDst - output data in AOS form
    303     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    304     {
    305 #if KNOB_SIMD_WIDTH == 8
    306         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
    307 
    308         simd4scalari rg = src.v4[0];           // rrrrrrrr gggggggg
    309         simd4scalari g = SIMD128::unpackhi_epi64(rg, rg);             // gggggggg gggggggg
    310         rg = SIMD128::unpacklo_epi8(rg, g);
    311         SIMD128::store_si((simd4scalari*)pDst, rg);
    312 #else
    313 #error Unsupported vector width
    314 #endif
    315     }
    316 #if ENABLE_AVX512_SIMD16
    317 
    318     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    319     {
    320         simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc));     // rrrrrrrrrrrrrrrr
    321         simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
    322 
    323         simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
    324         simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
    325 
    326         simdscalari shl1 = _simd_slli_epi32(cvt1, 8);
    327 
    328         simdscalari dst = _simd_or_si(cvt0, shl1);
    329 
    330         _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);                 // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
    331     }
    332 #endif
    333 };
    334 
    335 //////////////////////////////////////////////////////////////////////////
    336 /// Transpose32_32_32_32
    337 //////////////////////////////////////////////////////////////////////////
    338 struct Transpose32_32_32_32
    339 {
    340     //////////////////////////////////////////////////////////////////////////
    341     /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
    342     /// @param pSrc - source data in SOA form
    343     /// @param pDst - output data in AOS form
    344     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    345     {
    346 #if KNOB_SIMD_WIDTH == 8
    347         simdscalar src0 = _simd_load_ps((const float*)pSrc);
    348         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
    349         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
    350         simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
    351 
    352         simd4scalar vDst[8];
    353         vTranspose4x8(vDst, src0, src1, src2, src3);
    354         SIMD128::store_ps((float*)pDst, vDst[0]);
    355         SIMD128::store_ps((float*)pDst+4, vDst[1]);
    356         SIMD128::store_ps((float*)pDst+8, vDst[2]);
    357         SIMD128::store_ps((float*)pDst+12, vDst[3]);
    358         SIMD128::store_ps((float*)pDst+16, vDst[4]);
    359         SIMD128::store_ps((float*)pDst+20, vDst[5]);
    360         SIMD128::store_ps((float*)pDst+24, vDst[6]);
    361         SIMD128::store_ps((float*)pDst+28, vDst[7]);
    362 #else
    363 #error Unsupported vector width
    364 #endif
    365     }
    366 #if ENABLE_AVX512_SIMD16
    367 
    368     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    369     {
    370         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
    371         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
    372         simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
    373         simd16scalar src3 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 48);
    374 
    375         simd16scalar dst[4];
    376 
    377         vTranspose4x16(dst, src0, src1, src2, src3);
    378 
    379         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
    380         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
    381         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
    382         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
    383     }
    384 #endif
    385 };
    386 
    387 //////////////////////////////////////////////////////////////////////////
    388 /// Transpose32_32_32
    389 //////////////////////////////////////////////////////////////////////////
    390 struct Transpose32_32_32
    391 {
    392     //////////////////////////////////////////////////////////////////////////
    393     /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
    394     /// @param pSrc - source data in SOA form
    395     /// @param pDst - output data in AOS form
    396     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    397     {
    398 #if KNOB_SIMD_WIDTH == 8
    399         simdscalar src0 = _simd_load_ps((const float*)pSrc);
    400         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
    401         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
    402 
    403         simd4scalar vDst[8];
    404         vTranspose3x8(vDst, src0, src1, src2);
    405         SIMD128::store_ps((float*)pDst, vDst[0]);
    406         SIMD128::store_ps((float*)pDst + 4, vDst[1]);
    407         SIMD128::store_ps((float*)pDst + 8, vDst[2]);
    408         SIMD128::store_ps((float*)pDst + 12, vDst[3]);
    409         SIMD128::store_ps((float*)pDst + 16, vDst[4]);
    410         SIMD128::store_ps((float*)pDst + 20, vDst[5]);
    411         SIMD128::store_ps((float*)pDst + 24, vDst[6]);
    412         SIMD128::store_ps((float*)pDst + 28, vDst[7]);
    413 #else
    414 #error Unsupported vector width
    415 #endif
    416     }
    417 #if ENABLE_AVX512_SIMD16
    418 
    419     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    420     {
    421         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
    422         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
    423         simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 32);
    424         simd16scalar src3 = _simd16_setzero_ps();
    425 
    426         simd16scalar dst[4];
    427 
    428         vTranspose4x16(dst, src0, src1, src2, src3);
    429 
    430         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst[0]);
    431         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst[1]);
    432         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, dst[2]);
    433         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, dst[3]);
    434     }
    435 #endif
    436 };
    437 
    438 //////////////////////////////////////////////////////////////////////////
    439 /// Transpose32_32
    440 //////////////////////////////////////////////////////////////////////////
    441 struct Transpose32_32
    442 {
    443     //////////////////////////////////////////////////////////////////////////
    444     /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
    445     /// @param pSrc - source data in SOA form
    446     /// @param pDst - output data in AOS form
    447     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    448     {
    449 #if KNOB_SIMD_WIDTH == 8
    450         const float* pfSrc = (const float*)pSrc;
    451         simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
    452         simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
    453         simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
    454         simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
    455 
    456         simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
    457         simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
    458         simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
    459         simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
    460 
    461         float* pfDst = (float*)pDst;
    462         SIMD128::store_ps(pfDst + 0, dst0);
    463         SIMD128::store_ps(pfDst + 4, dst1);
    464         SIMD128::store_ps(pfDst + 8, dst2);
    465         SIMD128::store_ps(pfDst + 12, dst3);
    466 #else
    467 #error Unsupported vector width
    468 #endif
    469     }
    470 #if ENABLE_AVX512_SIMD16
    471 
    472     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    473     {
    474         simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc));                 // rrrrrrrrrrrrrrrr
    475         simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float *>(pSrc) + 16);            // gggggggggggggggg
    476 
    477         simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1);                                        // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
    478         simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1);                                        // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
    479 
    480         simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44);  // (1, 0, 1, 0)             // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
    481         simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE);  // (3, 2, 3, 2)             // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
    482 
    483         simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8);  // (3, 1, 2, 0)             // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
    484         simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8);  // (3, 1, 2, 0)             // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
    485 
    486         _simd16_store_ps(reinterpret_cast<float *>(pDst) +  0, dst0);                               // rgrgrgrgrgrgrgrg
    487         _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, dst1);                               // rgrgrgrgrgrgrgrg
    488     }
    489 #endif
    490 };
    491 
    492 //////////////////////////////////////////////////////////////////////////
    493 /// Transpose16_16_16_16
    494 //////////////////////////////////////////////////////////////////////////
    495 struct Transpose16_16_16_16
    496 {
    497     //////////////////////////////////////////////////////////////////////////
    498     /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
    499     /// @param pSrc - source data in SOA form
    500     /// @param pDst - output data in AOS form
    501     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    502     {
    503 #if KNOB_SIMD_WIDTH == 8
    504         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
    505         simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
    506 
    507         simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
    508         simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
    509         simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
    510         simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
    511 
    512         simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
    513         simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
    514         simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
    515         simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
    516 
    517         simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
    518         simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
    519         simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
    520         simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
    521 
    522         SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
    523         SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
    524         SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
    525         SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
    526 #else
    527 #error Unsupported vector width
    528 #endif
    529     }
    530 #if ENABLE_AVX512_SIMD16
    531 
    532     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    533     {
    534         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
    535         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
    536         simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
    537         simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 3);          // aaaaaaaaaaaaaaaa
    538 
    539         simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
    540         simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
    541         simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
    542         simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
    543 
    544         simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
    545         simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
    546         simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
    547         simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
    548 
    549         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
    550         simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
    551         simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
    552         simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
    553 
    554         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
    555         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
    556         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
    557         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
    558     }
    559 #endif
    560 };
    561 
    562 //////////////////////////////////////////////////////////////////////////
    563 /// Transpose16_16_16
    564 //////////////////////////////////////////////////////////////////////////
    565 struct Transpose16_16_16
    566 {
    567     //////////////////////////////////////////////////////////////////////////
    568     /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
    569     /// @param pSrc - source data in SOA form
    570     /// @param pDst - output data in AOS form
    571     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    572     {
    573 #if KNOB_SIMD_WIDTH == 8
    574         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
    575 
    576         simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
    577         simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
    578         simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
    579         simd4scalari src_a = SIMD128::setzero_si();
    580 
    581         simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
    582         simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
    583         simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
    584         simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
    585 
    586         simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
    587         simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
    588         simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
    589         simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
    590 
    591         SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
    592         SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
    593         SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
    594         SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
    595 #else
    596 #error Unsupported vector width
    597 #endif
    598     }
    599 #if ENABLE_AVX512_SIMD16
    600 
    601     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    602     {
    603         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
    604         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
    605         simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 2);          // bbbbbbbbbbbbbbbb
    606         simdscalari src3 = _simd_setzero_si();                                                      // aaaaaaaaaaaaaaaa
    607 
    608         simdscalari pre0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
    609         simdscalari pre1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
    610         simdscalari pre2 = _simd_unpacklo_epi16(src2, src3);                                        // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
    611         simdscalari pre3 = _simd_unpackhi_epi16(src2, src3);                                        // ba4 ba5 ba6 ba7 baC baD baE baF
    612 
    613         simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2);                                        // rbga0 rbga1 rbga8 rbga9
    614         simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2);                                        // rbga2 rbga3 rbgaA rbgaB
    615         simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3);                                        // rbga4 rbga5 rgbaC rbgaD
    616         simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3);                                        // rbga6 rbga7 rbgaE rbgaF
    617 
    618         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0)                       // rbga0 rbga1 rbga2 rbga3
    619         simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0)                       // rbga4 rbga5 rbga6 rbga7
    620         simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1)                       // rbga8 rbga9 rbgaA rbgaB
    621         simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1)                       // rbgaC rbgaD rbgaE rbgaF
    622 
    623         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgbargbargbargba
    624         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgbargbargbargba
    625         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);                            // rgbargbargbargba
    626         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);                            // rgbargbargbargba
    627     }
    628 #endif
    629 };
    630 
    631 //////////////////////////////////////////////////////////////////////////
    632 /// Transpose16_16
    633 //////////////////////////////////////////////////////////////////////////
    634 struct Transpose16_16
    635 {
    636     //////////////////////////////////////////////////////////////////////////
    637     /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
    638     /// @param pSrc - source data in SOA form
    639     /// @param pDst - output data in AOS form
    640     INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
    641     {
    642 #if KNOB_SIMD_WIDTH == 8
    643         simdscalar src = _simd_load_ps((const float*)pSrc);
    644 
    645         simd4scalar comp0 = _simd_extractf128_ps(src, 0);
    646         simd4scalar comp1 = _simd_extractf128_ps(src, 1);
    647 
    648         simd4scalari comp0i = SIMD128::castps_si(comp0);
    649         simd4scalari comp1i = SIMD128::castps_si(comp1);
    650 
    651         simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
    652         simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
    653 
    654         SIMD128::store_si((simd4scalari*)pDst, resLo);
    655         SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
    656 #else
    657 #error Unsupported vector width
    658 #endif
    659     }
    660 #if ENABLE_AVX512_SIMD16
    661 
    662     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
    663     {
    664         simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc));              // rrrrrrrrrrrrrrrr
    665         simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc) + 1);          // gggggggggggggggg
    666 
    667         simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1);                                        // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
    668         simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1);                                        // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
    669 
    670         simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20);     // (2, 0)                   // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
    671         simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31);     // (3, 1)                   // rg8 rg9 rgA rgB rgC rgD rgE rgF
    672 
    673         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);                            // rgrgrgrgrgrgrgrg
    674         _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);                            // rgrgrgrgrgrgrgrg
    675     }
    676 #endif
    677 };
    678 
    679 //////////////////////////////////////////////////////////////////////////
    680 /// Transpose24_8
    681 //////////////////////////////////////////////////////////////////////////
    682 struct Transpose24_8
    683 {
    684     //////////////////////////////////////////////////////////////////////////
    685     /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
    686     /// @param pSrc - source data in SOA form
    687     /// @param pDst - output data in AOS form
    688     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    689 #if ENABLE_AVX512_SIMD16
    690 
    691     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    692 #endif
    693 };
    694 
    695 //////////////////////////////////////////////////////////////////////////
    696 /// Transpose32_8_24
    697 //////////////////////////////////////////////////////////////////////////
    698 struct Transpose32_8_24
    699 {
    700     //////////////////////////////////////////////////////////////////////////
    701     /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
    702     /// @param pSrc - source data in SOA form
    703     /// @param pDst - output data in AOS form
    704     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    705 #if ENABLE_AVX512_SIMD16
    706 
    707     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    708 #endif
    709 };
    710 
    711 //////////////////////////////////////////////////////////////////////////
    712 /// Transpose4_4_4_4
    713 //////////////////////////////////////////////////////////////////////////
    714 struct Transpose4_4_4_4
    715 {
    716     //////////////////////////////////////////////////////////////////////////
    717     /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
    718     /// @param pSrc - source data in SOA form
    719     /// @param pDst - output data in AOS form
    720     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    721 #if ENABLE_AVX512_SIMD16
    722 
    723     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    724 #endif
    725 };
    726 
    727 //////////////////////////////////////////////////////////////////////////
    728 /// Transpose5_6_5
    729 //////////////////////////////////////////////////////////////////////////
    730 struct Transpose5_6_5
    731 {
    732     //////////////////////////////////////////////////////////////////////////
    733     /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
    734     /// @param pSrc - source data in SOA form
    735     /// @param pDst - output data in AOS form
    736     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    737 #if ENABLE_AVX512_SIMD16
    738 
    739     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    740 #endif
    741 };
    742 
    743 //////////////////////////////////////////////////////////////////////////
    744 /// Transpose9_9_9_5
    745 //////////////////////////////////////////////////////////////////////////
    746 struct Transpose9_9_9_5
    747 {
    748     //////////////////////////////////////////////////////////////////////////
    749     /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
    750     /// @param pSrc - source data in SOA form
    751     /// @param pDst - output data in AOS form
    752     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    753 #if ENABLE_AVX512_SIMD16
    754 
    755     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    756 #endif
    757 };
    758 
    759 //////////////////////////////////////////////////////////////////////////
    760 /// Transpose5_5_5_1
    761 //////////////////////////////////////////////////////////////////////////
    762 struct Transpose5_5_5_1
    763 {
    764     //////////////////////////////////////////////////////////////////////////
    765     /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    766     /// @param pSrc - source data in SOA form
    767     /// @param pDst - output data in AOS form
    768     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    769 #if ENABLE_AVX512_SIMD16
    770 
    771     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    772 #endif
    773 };
    774 
    775 //////////////////////////////////////////////////////////////////////////
    776 /// Transpose1_5_5_5
    777 //////////////////////////////////////////////////////////////////////////
    778 struct Transpose1_5_5_5
    779 {
    780     //////////////////////////////////////////////////////////////////////////
    781     /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
    782     /// @param pSrc - source data in SOA form
    783     /// @param pDst - output data in AOS form
    784     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    785 };
    786 
    787 //////////////////////////////////////////////////////////////////////////
    788 /// Transpose10_10_10_2
    789 //////////////////////////////////////////////////////////////////////////
    790 struct Transpose10_10_10_2
    791 {
    792     //////////////////////////////////////////////////////////////////////////
    793     /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
    794     /// @param pSrc - source data in SOA form
    795     /// @param pDst - output data in AOS form
    796     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    797 #if ENABLE_AVX512_SIMD16
    798 
    799     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    800 #endif
    801 };
    802 
    803 //////////////////////////////////////////////////////////////////////////
    804 /// Transpose11_11_10
    805 //////////////////////////////////////////////////////////////////////////
    806 struct Transpose11_11_10
    807 {
    808     //////////////////////////////////////////////////////////////////////////
    809     /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
    810     /// @param pSrc - source data in SOA form
    811     /// @param pDst - output data in AOS form
    812     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    813 #if ENABLE_AVX512_SIMD16
    814 
    815     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    816 #endif
    817 };
    818 
    819 //////////////////////////////////////////////////////////////////////////
    820 /// Transpose64
    821 //////////////////////////////////////////////////////////////////////////
    822 struct Transpose64
    823 {
    824     //////////////////////////////////////////////////////////////////////////
    825     /// @brief Performs an SOA to AOS conversion
    826     /// @param pSrc - source data in SOA form
    827     /// @param pDst - output data in AOS form
    828     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    829 #if ENABLE_AVX512_SIMD16
    830 
    831     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    832 #endif
    833 };
    834 
    835 //////////////////////////////////////////////////////////////////////////
    836 /// Transpose64_64
    837 //////////////////////////////////////////////////////////////////////////
    838 struct Transpose64_64
    839 {
    840     //////////////////////////////////////////////////////////////////////////
    841     /// @brief Performs an SOA to AOS conversion
    842     /// @param pSrc - source data in SOA form
    843     /// @param pDst - output data in AOS form
    844     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    845 #if ENABLE_AVX512_SIMD16
    846 
    847     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    848 #endif
    849 };
    850 
    851 //////////////////////////////////////////////////////////////////////////
    852 /// Transpose64_64_64
    853 //////////////////////////////////////////////////////////////////////////
    854 struct Transpose64_64_64
    855 {
    856     //////////////////////////////////////////////////////////////////////////
    857     /// @brief Performs an SOA to AOS conversion
    858     /// @param pSrc - source data in SOA form
    859     /// @param pDst - output data in AOS form
    860     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    861 #if ENABLE_AVX512_SIMD16
    862 
    863     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    864 #endif
    865 };
    866 
    867 //////////////////////////////////////////////////////////////////////////
    868 /// Transpose64_64_64_64
    869 //////////////////////////////////////////////////////////////////////////
    870 struct Transpose64_64_64_64
    871 {
    872     //////////////////////////////////////////////////////////////////////////
    873     /// @brief Performs an SOA to AOS conversion
    874     /// @param pSrc - source data in SOA form
    875     /// @param pDst - output data in AOS form
    876     static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
    877 #if ENABLE_AVX512_SIMD16
    878 
    879     static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
    880 #endif
    881 };
    882 
    883