Home | History | Annotate | Download | only in core
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 *
     23 * @file formats.h
     24 *
     25 * @brief Definitions for SWR_FORMAT functions.
     26 *
     27 ******************************************************************************/
     28 #pragma once
     29 
     30 #include "utils.h"
     31 #include "common/simdintrin.h"
     32 
     33 //////////////////////////////////////////////////////////////////////////
     34 /// PackTraits - Helpers for packing / unpacking same pixel sizes
     35 //////////////////////////////////////////////////////////////////////////
     36 template <uint32_t NumBits, bool Signed = false>
     37 struct PackTraits
     38 {
     39     static const uint32_t MyNumBits = NumBits;
     40     static simdscalar loadSOA(const uint8_t *pSrc) = delete;
     41     static void storeSOA(uint8_t *pDst, simdscalar const &src) = delete;
     42     static simdscalar unpack(simdscalar &in) = delete;
     43     static simdscalar pack(simdscalar &in) = delete;
     44 #if ENABLE_AVX512_SIMD16
     45     static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
     46     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) = delete;
     47     static simd16scalar unpack(simd16scalar &in) = delete;
     48     static simd16scalar pack(simd16scalar &in) = delete;
     49 #endif
     50 };
     51 
     52 //////////////////////////////////////////////////////////////////////////
     53 /// PackTraits - Helpers for packing / unpacking unused channels
     54 //////////////////////////////////////////////////////////////////////////
     55 template <>
     56 struct PackTraits<0, false>
     57 {
     58     static const uint32_t MyNumBits = 0;
     59 
     60     static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
     61     static void storeSOA(uint8_t *pDst, simdscalar const &src) { return; }
     62     static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
     63     static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
     64 #if ENABLE_AVX512_SIMD16
     65     static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
     66     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src) { return; }
     67     static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
     68     static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
     69 #endif
     70 };
     71 
     72 //////////////////////////////////////////////////////////////////////////
     73 /// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
     74 //////////////////////////////////////////////////////////////////////////
     75 template <>
     76 struct PackTraits<8, false>
     77 {
     78     static const uint32_t MyNumBits = 8;
     79 
     80     static simdscalar loadSOA(const uint8_t *pSrc)
     81     {
     82 #if KNOB_SIMD_WIDTH == 8
     83         __m256 result = _mm256_setzero_ps();
     84         __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
     85         return _mm256_insertf128_ps(result, vLo, 0);
     86 #else
     87 #error Unsupported vector width
     88 #endif
     89     }
     90 
     91     static void storeSOA(uint8_t *pDst, simdscalar const &src)
     92     {
     93         // store simd bytes
     94 #if KNOB_SIMD_WIDTH == 8
     95         _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
     96 #else
     97 #error Unsupported vector width
     98 #endif
     99     }
    100 
    101     static simdscalar unpack(simdscalar &in)
    102     {
    103 #if KNOB_SIMD_WIDTH == 8
    104 #if KNOB_ARCH <= KNOB_ARCH_AVX
    105         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
    106         __m128i resLo = _mm_cvtepu8_epi32(src);
    107         __m128i resHi = _mm_shuffle_epi8(src,
    108             _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
    109 
    110         __m256i result = _mm256_castsi128_si256(resLo);
    111         result = _mm256_insertf128_si256(result, resHi, 1);
    112         return simdscalar{ _mm256_castsi256_ps(result) };
    113 #else
    114         return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
    115 #endif
    116 #else
    117 #error Unsupported vector width
    118 #endif
    119     }
    120 
    121     static simdscalar pack(simdscalar &in)
    122     {
    123 #if KNOB_SIMD_WIDTH == 8
    124         simdscalari src = _simd_castps_si(in);
    125         __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
    126         __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
    127         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
    128 #else
    129 #error Unsupported vector width
    130 #endif
    131     }
    132 #if ENABLE_AVX512_SIMD16
    133 
    134     static simd16scalar loadSOA_16(const uint8_t *pSrc)
    135     {
    136         simd16scalar result = _simd16_setzero_ps();
    137         simdscalar resultlo = _simd_setzero_ps();
    138 
    139         const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
    140 
    141         resultlo = _mm256_insertf128_ps(resultlo, src, 0);
    142         result = _simd16_insert_ps(result, resultlo, 0);
    143 
    144         return result;
    145     }
    146 
    147     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
    148     {
    149         // store simd16 bytes
    150         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
    151     }
    152 
    153     static simd16scalar unpack(simd16scalar &in)
    154     {
    155         simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
    156         simd16scalari result = _simd16_cvtepu8_epi32(tmp);
    157 
    158         return _simd16_castsi_ps(result);
    159     }
    160 
    161     static simd16scalar pack(simd16scalar &in)
    162     {
    163         simd16scalari result = _simd16_setzero_si();
    164 
    165         simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));          // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
    166         simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));          // r8 r9 rA rB rC rD rE rF
    167 
    168         simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);           // r0 r1 r2 r3 r8 r9 rA rB (32b)
    169         simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);           // r4 r5 r6 r7 rC rD rE rF (32b)
    170 
    171         simdscalari pack = _simd_packus_epi32(permlo, permhi);                  // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
    172 
    173         const simdscalari zero = _simd_setzero_si();
    174 
    175         permlo = _simd_permute2f128_si(pack, zero, 0x20);   // (2, 0)           // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
    176         permhi = _simd_permute2f128_si(pack, zero, 0x31);   // (3, 1)           // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
    177 
    178         pack = _simd_packus_epi16(permlo, permhi);                              // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
    179 
    180         result = _simd16_insert_si(result, pack, 0);
    181 
    182         return _simd16_castsi_ps(result);
    183     }
    184 #endif
    185 };
    186 
    187 //////////////////////////////////////////////////////////////////////////
    188 /// PackTraits - Helpers for packing / unpacking 8 bit signed channels
    189 //////////////////////////////////////////////////////////////////////////
    190 template <>
    191 struct PackTraits<8, true>
    192 {
    193     static const uint32_t MyNumBits = 8;
    194 
    195     static simdscalar loadSOA(const uint8_t *pSrc)
    196     {
    197 #if KNOB_SIMD_WIDTH == 8
    198         __m256 result = _mm256_setzero_ps();
    199         __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
    200         return _mm256_insertf128_ps(result, vLo, 0);
    201 #else
    202 #error Unsupported vector width
    203 #endif
    204     }
    205 
    206     static void storeSOA(uint8_t *pDst, simdscalar const &src)
    207     {
    208         // store simd bytes
    209 #if KNOB_SIMD_WIDTH == 8
    210         _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
    211 #else
    212 #error Unsupported vector width
    213 #endif
    214     }
    215 
    216     static simdscalar unpack(simdscalar &in)
    217     {
    218 #if KNOB_SIMD_WIDTH == 8
    219 #if KNOB_ARCH <= KNOB_ARCH_AVX
    220         SWR_INVALID("I think this may be incorrect.");
    221         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
    222         __m128i resLo = _mm_cvtepi8_epi32(src);
    223         __m128i resHi = _mm_shuffle_epi8(src,
    224             _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
    225 
    226         __m256i result = _mm256_castsi128_si256(resLo);
    227         result = _mm256_insertf128_si256(result, resHi, 1);
    228         return _mm256_castsi256_ps(result);
    229 #else
    230         return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
    231 #endif
    232 #else
    233 #error Unsupported vector width
    234 #endif
    235     }
    236 
    237     static simdscalar pack(simdscalar &in)
    238     {
    239 #if KNOB_SIMD_WIDTH == 8
    240         simdscalari src = _simd_castps_si(in);
    241         __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
    242         __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
    243         return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
    244 #else
    245 #error Unsupported vector width
    246 #endif
    247     }
    248 #if ENABLE_AVX512_SIMD16
    249 
    250     static simd16scalar loadSOA_16(const uint8_t *pSrc)
    251     {
    252         simd16scalar result = _simd16_setzero_ps();
    253         simdscalar resultlo = _simd_setzero_ps();
    254 
    255         const __m128 src = _mm_load_ps(reinterpret_cast<const float *>(pSrc));
    256 
    257         resultlo = _mm256_insertf128_ps(resultlo, src, 0);
    258         result = _simd16_insert_ps(result, resultlo, 0);
    259 
    260         return result;
    261     }
    262 
    263     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
    264     {
    265         // store simd16 bytes
    266         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
    267     }
    268 
    269     static simd16scalar unpack(simd16scalar &in)
    270     {
    271         simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
    272         simd16scalari result = _simd16_cvtepu8_epi32(tmp);
    273 
    274         return _simd16_castsi_ps(result);
    275     }
    276 
    277     static simd16scalar pack(simd16scalar &in)
    278     {
    279         simd16scalari result = _simd16_setzero_si();
    280 
    281         simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0));          // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
    282         simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1));          // r8 r9 rA rB rC rD rE rF
    283 
    284         simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20);           // r0 r1 r2 r3 r8 r9 rA rB (32b)
    285         simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31);           // r4 r5 r6 r7 rC rD rE rF (32b)
    286 
    287         simdscalari pack = _simd_packs_epi32(permlo, permhi);                   // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
    288 
    289         const simdscalari zero = _simd_setzero_si();
    290 
    291         permlo = _simd_permute2f128_si(pack, zero, 0x20);   // (2, 0)           // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
    292         permhi = _simd_permute2f128_si(pack, zero, 0x31);   // (3, 1)           // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
    293 
    294         pack = _simd_packs_epi16(permlo, permhi);                               // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
    295 
    296         result = _simd16_insert_si(result, pack, 0);
    297 
    298         return _simd16_castsi_ps(result);
    299     }
    300 #endif
    301 };
    302 
    303 //////////////////////////////////////////////////////////////////////////
    304 /// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
    305 //////////////////////////////////////////////////////////////////////////
    306 template <>
    307 struct PackTraits<16, false>
    308 {
    309     static const uint32_t MyNumBits = 16;
    310 
    311     static simdscalar loadSOA(const uint8_t *pSrc)
    312     {
    313 #if KNOB_SIMD_WIDTH == 8
    314         __m256 result = _mm256_setzero_ps();
    315         __m128 vLo = _mm_load_ps((const float*)pSrc);
    316         return _mm256_insertf128_ps(result, vLo, 0);
    317 #else
    318 #error Unsupported vector width
    319 #endif
    320     }
    321 
    322     static void storeSOA(uint8_t *pDst, simdscalar const &src)
    323     {
    324 #if KNOB_SIMD_WIDTH == 8
    325         // store 16B (2B * 8)
    326         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
    327 #else
    328 #error Unsupported vector width
    329 #endif
    330     }
    331 
    332     static simdscalar unpack(simdscalar &in)
    333     {
    334 #if KNOB_SIMD_WIDTH == 8
    335 #if KNOB_ARCH <= KNOB_ARCH_AVX
    336         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
    337         __m128i resLo = _mm_cvtepu16_epi32(src);
    338         __m128i resHi = _mm_shuffle_epi8(src,
    339             _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
    340 
    341         __m256i result = _mm256_castsi128_si256(resLo);
    342         result = _mm256_insertf128_si256(result, resHi, 1);
    343         return _mm256_castsi256_ps(result);
    344 #else
    345         return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
    346 #endif
    347 #else
    348 #error Unsupported vector width
    349 #endif
    350     }
    351 
    352     static simdscalar pack(simdscalar &in)
    353     {
    354 #if KNOB_SIMD_WIDTH == 8
    355         simdscalari src = _simd_castps_si(in);
    356         __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
    357         return _mm256_castsi256_ps(res);
    358 #else
    359 #error Unsupported vector width
    360 #endif
    361     }
    362 #if ENABLE_AVX512_SIMD16
    363 
    364     static simd16scalar loadSOA_16(const uint8_t *pSrc)
    365     {
    366         simd16scalar result = _simd16_setzero_ps();
    367 
    368         simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
    369 
    370         result = _simd16_insert_ps(result, resultlo, 0);
    371 
    372         return result;
    373     }
    374 
    375     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
    376     {
    377         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
    378     }
    379 
    380     static simd16scalar unpack(simd16scalar &in)
    381     {
    382         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
    383 
    384         return _simd16_castsi_ps(result);
    385     }
    386 
    387     static simd16scalar pack(simd16scalar &in)
    388     {
    389         const simd16scalari zero = _simd16_setzero_si();
    390 
    391         simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
    392         simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
    393 
    394         simd16scalari result = _simd16_packus_epi32(permlo, permhi);    // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
    395 
    396         return _simd16_castsi_ps(result);
    397     }
    398 #endif
    399 };
    400 
    401 //////////////////////////////////////////////////////////////////////////
    402 /// PackTraits - Helpers for packing / unpacking 16 bit signed channels
    403 //////////////////////////////////////////////////////////////////////////
    404 template <>
    405 struct PackTraits<16, true>
    406 {
    407     static const uint32_t MyNumBits = 16;
    408 
    409     static simdscalar loadSOA(const uint8_t *pSrc)
    410     {
    411 #if KNOB_SIMD_WIDTH == 8
    412         __m256 result = _mm256_setzero_ps();
    413         __m128 vLo = _mm_load_ps((const float*)pSrc);
    414         return _mm256_insertf128_ps(result, vLo, 0);
    415 #else
    416 #error Unsupported vector width
    417 #endif
    418     }
    419 
    420     static void storeSOA(uint8_t *pDst, simdscalar const &src)
    421     {
    422 #if KNOB_SIMD_WIDTH == 8
    423         // store 16B (2B * 8)
    424         _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
    425 #else
    426 #error Unsupported vector width
    427 #endif
    428     }
    429 
    430     static simdscalar unpack(simdscalar &in)
    431     {
    432 #if KNOB_SIMD_WIDTH == 8
    433 #if KNOB_ARCH <= KNOB_ARCH_AVX
    434         SWR_INVALID("I think this may be incorrect.");
    435         __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
    436         __m128i resLo = _mm_cvtepi16_epi32(src);
    437         __m128i resHi = _mm_shuffle_epi8(src,
    438             _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
    439 
    440         __m256i result = _mm256_castsi128_si256(resLo);
    441         result = _mm256_insertf128_si256(result, resHi, 1);
    442         return _mm256_castsi256_ps(result);
    443 #else
    444         return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
    445 #endif
    446 #else
    447 #error Unsupported vector width
    448 #endif
    449     }
    450 
    451     static simdscalar pack(simdscalar &in)
    452     {
    453 #if KNOB_SIMD_WIDTH == 8
    454         simdscalari src = _simd_castps_si(in);
    455         __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
    456         return _mm256_castsi256_ps(res);
    457 #else
    458 #error Unsupported vector width
    459 #endif
    460     }
    461 #if ENABLE_AVX512_SIMD16
    462 
    463     static simd16scalar loadSOA_16(const uint8_t *pSrc)
    464     {
    465         simd16scalar result = _simd16_setzero_ps();
    466 
    467         simdscalar resultlo = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
    468 
    469         result = _simd16_insert_ps(result, resultlo, 0);
    470 
    471         return result;
    472     }
    473 
    474     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
    475     {
    476         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
    477     }
    478 
    479     static simd16scalar unpack(simd16scalar &in)
    480     {
    481         simd16scalari result = _simd16_cvtepu16_epi32(_simd_castps_si(_simd16_extract_ps(in, 0)));
    482 
    483         return _simd16_castsi_ps(result);
    484     }
    485 
    486     static simd16scalar pack(simd16scalar &in)
    487     {
    488         const simd16scalari zero = _simd16_setzero_si();
    489 
    490         simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08);  // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
    491         simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D);  // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
    492 
    493         simd16scalari result = _simd16_packs_epi32(permlo, permhi);     // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
    494 
    495         return _simd16_castsi_ps(result);
    496     }
    497 #endif
    498 };
    499 
    500 //////////////////////////////////////////////////////////////////////////
    501 /// PackTraits - Helpers for packing / unpacking 32 bit channels
    502 //////////////////////////////////////////////////////////////////////////
    503 template <>
    504 struct PackTraits<32, false>
    505 {
    506     static const uint32_t MyNumBits = 32;
    507 
    508     static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
    509     static void storeSOA(uint8_t *pDst, simdscalar const &src) { _simd_store_ps((float*)pDst, src); }
    510     static simdscalar unpack(simdscalar &in) { return in; }
    511     static simdscalar pack(simdscalar &in) { return in; }
    512 #if ENABLE_AVX512_SIMD16
    513 
    514     static simd16scalar loadSOA_16(const uint8_t *pSrc)
    515     {
    516         return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
    517     }
    518 
    519     static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar const &src)
    520     {
    521         _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
    522     }
    523 
    524     static simd16scalar unpack(simd16scalar &in)
    525     {
    526         return in;
    527     }
    528 
    529     static simd16scalar pack(simd16scalar &in)
    530     {
    531         return in;
    532     }
    533 #endif
    534 };
    535 
    536 //////////////////////////////////////////////////////////////////////////
    537 /// TypeTraits - Format type traits.
    538 //////////////////////////////////////////////////////////////////////////
    539 template<SWR_TYPE type, uint32_t NumBits>
    540 struct TypeTraits : PackTraits<NumBits>
    541 {
    542     static const SWR_TYPE MyType = type;
    543     static float toFloat() { return 0.0; }
    544     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
    545     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    546 };
    547 
    548 //////////////////////////////////////////////////////////////////////////
    549 /// TypeTraits - Format type traits specialization for UINT8
    550 //////////////////////////////////////////////////////////////////////////
    551 template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
    552 {
    553     static const SWR_TYPE MyType = SWR_TYPE_UINT;
    554     static float toFloat() { return 0.0; }
    555     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
    556     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    557 };
    558 
    559 //////////////////////////////////////////////////////////////////////////
    560 /// TypeTraits - Format type traits specialization for UINT8
    561 //////////////////////////////////////////////////////////////////////////
    562 template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
    563 {
    564     static const SWR_TYPE MyType = SWR_TYPE_SINT;
    565     static float toFloat() { return 0.0; }
    566     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
    567     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    568 };
    569 
    570 //////////////////////////////////////////////////////////////////////////
    571 /// TypeTraits - Format type traits specialization for UINT16
    572 //////////////////////////////////////////////////////////////////////////
    573 template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
    574 {
    575     static const SWR_TYPE MyType = SWR_TYPE_UINT;
    576     static float toFloat() { return 0.0; }
    577     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
    578     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    579 };
    580 
    581 //////////////////////////////////////////////////////////////////////////
    582 /// TypeTraits - Format type traits specialization for SINT16
    583 //////////////////////////////////////////////////////////////////////////
    584 template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
    585 {
    586     static const SWR_TYPE MyType = SWR_TYPE_SINT;
    587     static float toFloat() { return 0.0; }
    588     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
    589     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    590 };
    591 
    592 //////////////////////////////////////////////////////////////////////////
    593 /// TypeTraits - Format type traits specialization for UINT32
    594 //////////////////////////////////////////////////////////////////////////
    595 template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
    596 {
    597     static const SWR_TYPE MyType = SWR_TYPE_UINT;
    598     static float toFloat() { return 0.0; }
    599     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
    600     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    601 };
    602 
    603 //////////////////////////////////////////////////////////////////////////
    604 /// TypeTraits - Format type traits specialization for UINT32
    605 //////////////////////////////////////////////////////////////////////////
    606 template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
    607 {
    608     static const SWR_TYPE MyType = SWR_TYPE_SINT;
    609     static float toFloat() { return 0.0; }
    610     static float fromFloat() { SWR_NOT_IMPL; return 0.0; }
    611     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    612 };
    613 
    614 //////////////////////////////////////////////////////////////////////////
    615 /// TypeTraits - Format type traits specialization for UNORM5
    616 //////////////////////////////////////////////////////////////////////////
    617 template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
    618 {
    619     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    620     static float toFloat() { return 1.0f / 31.0f; }
    621     static float fromFloat() { return 31.0f; }
    622     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    623 };
    624 
    625 //////////////////////////////////////////////////////////////////////////
    626 /// TypeTraits - Format type traits specialization for UNORM6
    627 //////////////////////////////////////////////////////////////////////////
    628 template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
    629 {
    630     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    631     static float toFloat() { return 1.0f / 63.0f; }
    632     static float fromFloat() { return 63.0f; }
    633     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    634 };
    635 
    636 //////////////////////////////////////////////////////////////////////////
    637 /// TypeTraits - Format type traits specialization for UNORM8
    638 //////////////////////////////////////////////////////////////////////////
    639 template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
    640 {
    641     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    642     static float toFloat() { return 1.0f / 255.0f; }
    643     static float fromFloat() { return 255.0f; }
    644     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    645 };
    646 
    647 //////////////////////////////////////////////////////////////////////////
    648 /// TypeTraits - Format type traits specialization for UNORM8
    649 //////////////////////////////////////////////////////////////////////////
    650 template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
    651 {
    652     static const SWR_TYPE MyType = SWR_TYPE_SNORM;
    653     static float toFloat() { return 1.0f / 127.0f; }
    654     static float fromFloat() { return 127.0f; }
    655     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    656 };
    657 
    658 //////////////////////////////////////////////////////////////////////////
    659 /// TypeTraits - Format type traits specialization for UNORM16
    660 //////////////////////////////////////////////////////////////////////////
    661 template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
    662 {
    663     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    664     static float toFloat() { return 1.0f / 65535.0f; }
    665     static float fromFloat() { return 65535.0f; }
    666     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    667 };
    668 
    669 //////////////////////////////////////////////////////////////////////////
    670 /// TypeTraits - Format type traits specialization for SNORM16
    671 //////////////////////////////////////////////////////////////////////////
    672 template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
    673 {
    674     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    675     static float toFloat() { return 1.0f / 32767.0f; }
    676     static float fromFloat() { return 32767.0f; }
    677     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    678 };
    679 
    680 //////////////////////////////////////////////////////////////////////////
    681 /// TypeTraits - Format type traits specialization for UNORM24
    682 //////////////////////////////////////////////////////////////////////////
    683 template<>
    684 struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
    685 {
    686     static const SWR_TYPE MyType = SWR_TYPE_UNORM;
    687     static float toFloat() { return 1.0f / 16777215.0f; }
    688     static float fromFloat() { return 16777215.0f; }
    689     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    690 };
    691 
    692 //////////////////////////////////////////////////////////////////////////
    693 // FLOAT Specializations from here on...
    694 //////////////////////////////////////////////////////////////////////////
// Reinterpret the bits of a __m128 (4 x float) as a __m128i; no value conversion.
#define TO_M128i(a) _mm_castps_si128(a)
// Reinterpret the bits of a __m128i as a __m128 (4 x float); no value conversion.
#define TO_M128(a) _mm_castsi128_ps(a)
    697 
    698 #include "math.h"
    699 
    700 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
    701 inline static __m128 fastpow(__m128 arg) {
    702     __m128 ret = arg;
    703 
    704     static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
    705         * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
    706 
    707     // Apply a constant pre-correction factor.
    708     ret = _mm_mul_ps(ret, factor);
    709 
    710     // Reinterpret arg as integer to obtain logarithm.
    711     //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
    712     ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
    713 
    714     // Multiply logarithm by power.
    715     ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));
    716 
    717     // Convert back to "integer" to exponentiate.
    718     //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
    719     ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
    720 
    721     return ret;
    722 }
    723 
    724 inline static __m128 pow512_4(__m128 arg) {
    725     // 5/12 is too small, so compute the 4th root of 20/12 instead.
    726     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
    727     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
    728     __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
    729     __m128 xover = _mm_mul_ps(arg, xf);
    730 
    731     __m128 xfm1 = _mm_rsqrt_ps(xf);
    732     __m128 x2 = _mm_mul_ps(arg, arg);
    733     __m128 xunder = _mm_mul_ps(x2, xfm1);
    734 
    735     // sqrt2 * over + 2 * sqrt2 * under
    736     __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
    737         _mm_add_ps(xover, xunder));
    738 
    739     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
    740     xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
    741     return xavg;
    742 }
    743 
    744 inline static __m128 powf_wrapper(__m128 Base, float Exp)
    745 {
    746     float *f = (float *)(&Base);
    747 
    748     return _mm_set_ps(powf(f[3], Exp),
    749                       powf(f[2], Exp),
    750                       powf(f[1], Exp),
    751                       powf(f[0], Exp));
    752 }
    753 
    754 static inline __m128 ConvertFloatToSRGB2(__m128& Src)
    755 {
    756     // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
    757     __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
    758 
    759     // squeeze the mask down to 16 bits (4 bits per DWORD)
    760     int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
    761 
    762     __m128 Result;
    763 
    764     //
    765     if (CompareResult == 0xFFFF)
    766     {
    767         // all DWORDs are <= the threshold
    768         Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
    769     }
    770     else if (CompareResult == 0x0)
    771     {
    772         // all DWORDs are > the threshold
    773         __m128 fSrc_0RGB = Src;
    774 
    775         // --> 1.055f * c(1.0f/2.4f) - 0.055f
    776 #if KNOB_USE_FAST_SRGB == TRUE
    777         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
    778         __m128 f = pow512_4(fSrc_0RGB);
    779 #else
    780         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
    781 #endif
    782         f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
    783         Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
    784     }
    785     else
    786     {
    787         // some DWORDs are <= the threshold and some are > threshold
    788         __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
    789 
    790         __m128 fSrc_0RGB = Src;
    791 
    792         // --> 1.055f * c(1.0f/2.4f) - 0.055f
    793 #if KNOB_USE_FAST_SRGB == TRUE
    794         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
    795         __m128 f = pow512_4(fSrc_0RGB);
    796 #else
    797         __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
    798 #endif
    799         f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
    800         f = _mm_sub_ps(f, _mm_set1_ps(0.055f));
    801 
    802         // Clear the alpha (is garbage after the sub)
    803         __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF));
    804 
    805         __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm));
    806         __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i);
    807         __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart);
    808 
    809         Result = TO_M128(CombinedParts);
    810     }
    811 
    812     return Result;
    813 }
    814 
    815 #if ENABLE_AVX512_SIMD16
    816 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
    817 inline static simd16scalar SIMDCALL fastpow(simd16scalar const &value)
    818 {
    819     static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
    820         * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
    821 
    822     // Apply a constant pre-correction factor.
    823     simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(factor1));
    824 
    825     // Reinterpret arg as integer to obtain logarithm.
    826     //asm("cvtdq2ps %1, %0" : "=x" (result) : "x" (result));
    827     result = _simd16_cvtepi32_ps(_simd16_castps_si(result));
    828 
    829     // Multiply logarithm by power.
    830     result = _simd16_mul_ps(result, _simd16_set1_ps(1.0f * expnum / expden));
    831 
    832     // Convert back to "integer" to exponentiate.
    833     //asm("cvtps2dq %1, %0" : "=x" (result) : "x" (result));
    834     result = _simd16_castsi_ps(_simd16_cvtps_epi32(result));
    835 
    836     return result;
    837 }
    838 
    839 inline static simd16scalar SIMDCALL pow512_4(simd16scalar const &arg)
    840 {
    841     // 5/12 is too small, so compute the 4th root of 20/12 instead.
    842     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
    843     // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
    844     simd16scalar xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
    845     simd16scalar xover = _simd16_mul_ps(arg, xf);
    846 
    847     simd16scalar xfm1 = _simd16_rsqrt_ps(xf);
    848     simd16scalar x2 = _simd16_mul_ps(arg, arg);
    849     simd16scalar xunder = _simd16_mul_ps(x2, xfm1);
    850 
    851     // sqrt2 * over + 2 * sqrt2 * under
    852     simd16scalar xavg = _simd16_mul_ps(_simd16_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f), _simd16_add_ps(xover, xunder));
    853 
    854     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
    855     xavg = _simd16_mul_ps(xavg, _simd16_rsqrt_ps(xavg));
    856 
    857     return xavg;
    858 }
    859 
    860 inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar &base, float exp)
    861 {
    862     const float *f = reinterpret_cast<const float *>(&base);
    863 
    864     return _simd16_set_ps(
    865         powf(f[15], exp),
    866         powf(f[14], exp),
    867         powf(f[13], exp),
    868         powf(f[12], exp),
    869         powf(f[11], exp),
    870         powf(f[10], exp),
    871         powf(f[ 9], exp),
    872         powf(f[ 8], exp),
    873         powf(f[ 7], exp),
    874         powf(f[ 6], exp),
    875         powf(f[ 5], exp),
    876         powf(f[ 4], exp),
    877         powf(f[ 3], exp),
    878         powf(f[ 2], exp),
    879         powf(f[ 1], exp),
    880         powf(f[ 0], exp)
    881     );
    882 }
    883 
    884 // float to SRGB conversion formula
    885 //
    886 // if (value < 0.0031308f)
    887 //     value *= 12.92f;
    888 // else
    889 //     value = 1.055f * pow(value, 1.0f / 2.4f) - 0.055f;
    890 //
    891 static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar &value)
    892 {
    893     // create a mask where the source is < the minimal SRGB float value
    894     const simd16mask mask = _simd16_cmplt_ps_mask(value, _simd16_set1_ps(0.0031308f));
    895 
    896     // if all elements are < the threshold, result = value * 12.92
    897     simd16scalar result = _simd16_mul_ps(value, _simd16_set1_ps(12.92f));
    898 
    899     if (_simd16_mask2int(mask) != 0xFFFF)
    900     {
    901         // some elements are >= threshold, result = 1.055 * power(value, 1.0 / 2.4) - 0.055
    902 #if KNOB_USE_FAST_SRGB == TRUE
    903         // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
    904         simd16scalar result2 = pow512_4(value);
    905 #else
    906         simd16scalar result2 = powf_wrapper(value, 1.0f / 2.4f);
    907 #endif
    908 
    909         result2 = _simd16_mul_ps(result2, _simd16_set1_ps(1.055f));
    910         result2 = _simd16_sub_ps(result2, _simd16_set1_ps(0.055f));
    911 
    912 #if (KNOB_ARCH == KNOB_ARCH_AVX512)
    913         // only native AVX512 can directly use the computed mask for the blend operation
    914         result = _mm512_mask_blend_ps(mask, result2, result);
    915 #else
    916         result = _simd16_blendv_ps(result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
    917 #endif
    918     }
    919 
    920     return result;
    921 }
    922 
    923 #endif
    924 //////////////////////////////////////////////////////////////////////////
    925 /// TypeTraits - Format type traits specialization for FLOAT16
    926 //////////////////////////////////////////////////////////////////////////
    927 template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
    928 {
    929     static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
    930     static float toFloat() { return 1.0f; }
    931     static float fromFloat() { return 1.0f; }
    932     static simdscalar convertSrgb(simdscalar &in) { SWR_NOT_IMPL; return _simd_setzero_ps(); }
    933 
    934     static simdscalar pack(const simdscalar &in)
    935     {
    936 #if KNOB_SIMD_WIDTH == 8
    937 #if (KNOB_ARCH == KNOB_ARCH_AVX)
    938         // input is 8 packed float32, output is 8 packed float16
    939         simdscalari src = _simd_castps_si(in);
    940 
    941         static const uint32_t FLOAT_EXP_BITS = 8;
    942         static const uint32_t FLOAT_MANTISSA_BITS = 23;
    943         static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1;
    944         static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS;
    945 
    946         static const uint32_t HALF_EXP_BITS = 5;
    947         static const uint32_t HALF_MANTISSA_BITS = 10;
    948         static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS;
    949 
    950         // minimum exponent required, exponents below this are flushed to 0.
    951         static const int32_t HALF_EXP_MIN = -14;
    952         static const int32_t FLOAT_EXP_BIAS = 127;
    953         static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS;
    954         static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand
    955 
    956         // maximum exponent required, exponents above this are set to infinity
    957         static const int32_t HALF_EXP_MAX = 15;
    958         static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS;
    959 
    960         const simdscalari vSignMask     = _simd_set1_epi32(0x80000000);
    961         const simdscalari vExpMask      = _simd_set1_epi32(FLOAT_EXP_MASK);
    962         const simdscalari vManMask      = _simd_set1_epi32(FLOAT_MANTISSA_MASK);
    963         const simdscalari vExpMin       = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS));
    964         const simdscalari vExpMinFtz    = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS));
    965         const simdscalari vExpMax       = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS));
    966 
    967         simdscalari vSign       = _simd_and_si(src, vSignMask);
    968         simdscalari vExp        = _simd_and_si(src, vExpMask);
    969         simdscalari vMan        = _simd_and_si(src, vManMask);
    970 
    971         simdscalari vFTZMask    = _simd_cmplt_epi32(vExp, vExpMinFtz);
    972         simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin));
    973         simdscalari vInfMask    = _simd_cmpeq_epi32(vExpMask, vExp);
    974         simdscalari vClampMask  = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp));
    975 
    976         simdscalari vHalfExp    = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS));
    977 
    978         // pack output 16-bits into the lower 16-bits of each 32-bit channel
    979         simdscalari vDst        = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK));
    980         vDst   = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
    981 
    982         // Flush To Zero
    983         vDst   = _simd_andnot_si(vFTZMask, vDst);
    984         // Apply Infinites / NaN
    985         vDst   = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK)));
    986 
    987         // Apply clamps
    988         vDst = _simd_andnot_si(vClampMask, vDst);
    989         vDst = _simd_or_si(vDst,
    990                 _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF)));
    991 
    992         // Compute Denormals (subnormals)
    993         if (!_mm256_testz_si256(vDenormMask, vDenormMask))
    994         {
    995             uint32_t *pDenormMask = (uint32_t*)&vDenormMask;
    996             uint32_t *pExp = (uint32_t*)&vExp;
    997             uint32_t *pMan = (uint32_t*)&vMan;
    998             uint32_t *pDst = (uint32_t*)&vDst;
    999             for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
   1000             {
   1001                 if (pDenormMask[i])
   1002                 {
   1003                     // Need to compute subnormal value
   1004                     uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS;
   1005                     uint32_t mantissa = pMan[i] |
   1006                                         (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s.  Make it explicit
   1007 
   1008                     pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS));
   1009                 }
   1010             }
   1011         }
   1012 
   1013         // Add in sign bits
   1014         vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16));
   1015 
   1016         // Pack to lower 128-bits
   1017         vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1)));
   1018 
   1019 #if 0
   1020 #if !defined(NDEBUG)
   1021         simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC));
   1022 
   1023         for (uint32_t i = 0; i < 4; ++i)
   1024         {
   1025             SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]);
   1026         }
   1027 #endif
   1028 #endif
   1029 
   1030         return _simd_castsi_ps(vDst);
   1031 
   1032 #else
   1033         return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)));
   1034 #endif
   1035 #else
   1036 #error Unsupported vector width
   1037 #endif
   1038     }
   1039 
   1040     static simdscalar unpack(const simdscalar &in)
   1041     {
   1042         // input is 8 packed float16, output is 8 packed float32
   1043         SWR_NOT_IMPL; // @todo
   1044         return _simd_setzero_ps();
   1045     }
   1046 #if ENABLE_AVX512_SIMD16
   1047 
   1048     static simd16scalar pack(const simd16scalar &in)
   1049     {
   1050         simd16scalari result = _simd16_setzero_si();
   1051         simdscalari resultlo = _simd_setzero_si();
   1052 
   1053 #if (KNOB_ARCH == KNOB_ARCH_AVX)
   1054         simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
   1055         simdscalar simdhi = pack(_simd16_extract_ps(in, 1));
   1056 
   1057         __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0);
   1058         __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0);
   1059 
   1060 #else
   1061         __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
   1062         __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);
   1063 
   1064 #endif
   1065         resultlo = _simd_insertf128_si(resultlo, templo, 0);
   1066         resultlo = _simd_insertf128_si(resultlo, temphi, 1);
   1067 
   1068         result = _simd16_insert_si(result, resultlo, 0);
   1069 
   1070         return _simd16_castsi_ps(result);
   1071     }
   1072 
   1073     static simd16scalar unpack(const simd16scalar &in)
   1074     {
   1075         // input is 16 packed float16, output is 16 packed float32
   1076         SWR_NOT_IMPL; //  @todo
   1077         return _simd16_setzero_ps();
   1078     }
   1079 #endif
   1080 };
   1081 
   1082 //////////////////////////////////////////////////////////////////////////
   1083 /// TypeTraits - Format type traits specialization for FLOAT32
   1084 //////////////////////////////////////////////////////////////////////////
   1085 template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
   1086 {
   1087     static const SWR_TYPE MyType = SWR_TYPE_FLOAT;
   1088     static float toFloat() { return 1.0f; }
   1089     static float fromFloat() { return 1.0f; }
   1090     static inline simdscalar convertSrgb(simdscalar &in)
   1091     {
   1092 #if KNOB_SIMD_WIDTH == 8
   1093         __m128 srcLo = _mm256_extractf128_ps(in, 0);
   1094         __m128 srcHi = _mm256_extractf128_ps(in, 1);
   1095 
   1096         srcLo = ConvertFloatToSRGB2(srcLo);
   1097         srcHi = ConvertFloatToSRGB2(srcHi);
   1098 
   1099         in = _mm256_insertf128_ps(in, srcLo, 0);
   1100         in = _mm256_insertf128_ps(in, srcHi, 1);
   1101 #else
   1102 #error Unsupported vector width
   1103 #endif
   1104         return in;
   1105     }
   1106 #if ENABLE_AVX512_SIMD16
   1107 
   1108     static inline simd16scalar convertSrgb(simd16scalar &in)
   1109     {
   1110         return ConvertFloatToSRGB2(in);
   1111     }
   1112 #endif
   1113 };
   1114 
   1115 //////////////////////////////////////////////////////////////////////////
/// FormatIntType - Calculate base integer type for pixel components based
///                 on total number of bits.  Components can be smaller
///                 than this type, but the entire pixel must not be
///                 any smaller than this type.
   1120 //////////////////////////////////////////////////////////////////////////
   1121 template <uint32_t bits, bool bits8 = bits <= 8, bool bits16 = bits <= 16>
   1122 struct FormatIntType
   1123 {
   1124     typedef uint32_t TYPE;
   1125 };
   1126 
   1127 template <uint32_t bits>
   1128 struct FormatIntType<bits, true, true>
   1129 {
   1130     typedef uint8_t TYPE;
   1131 };
   1132 
   1133 template <uint32_t bits>
   1134 struct FormatIntType<bits, false, true>
   1135 {
   1136     typedef uint16_t TYPE;
   1137 };
   1138 
   1139 //////////////////////////////////////////////////////////////////////////
   1140 /// Format1 - Bitfield for single component formats.
   1141 //////////////////////////////////////////////////////////////////////////
   1142 template<uint32_t x>
   1143 union Format1
   1144 {
   1145     typedef typename FormatIntType<x>::TYPE TYPE;
   1146     struct
   1147     {
   1148         TYPE r : x;
   1149     };
   1150 
   1151     ///@ The following are here to provide full template needed in Formats.
   1152     struct
   1153     {
   1154         TYPE g : x;
   1155     };
   1156     struct
   1157     {
   1158         TYPE b : x;
   1159     };
   1160     struct
   1161     {
   1162         TYPE a : x;
   1163     };
   1164 };
   1165 
   1166 //////////////////////////////////////////////////////////////////////////
   1167 /// Format2 - Bitfield for 2 component formats.
   1168 //////////////////////////////////////////////////////////////////////////
   1169 template<uint32_t x, uint32_t y>
   1170 union Format2
   1171 {
   1172     typedef typename FormatIntType<x + y>::TYPE TYPE;
   1173 
   1174     struct
   1175     {
   1176         TYPE r : x;
   1177         TYPE g : y;
   1178     };
   1179     struct
   1180     {
   1181         ///@ The following are here to provide full template needed in Formats.
   1182         TYPE b : x;
   1183         TYPE a : y;
   1184     };
   1185 };
   1186 
   1187 //////////////////////////////////////////////////////////////////////////
   1188 /// Format3 - Bitfield for 3 component formats.
   1189 //////////////////////////////////////////////////////////////////////////
   1190 template<uint32_t x, uint32_t y, uint32_t z>
   1191 union Format3
   1192 {
   1193     typedef typename FormatIntType<x + y + z>::TYPE TYPE;
   1194 
   1195     struct
   1196     {
   1197         TYPE r : x;
   1198         TYPE g : y;
   1199         TYPE b : z;
   1200     };
   1201     TYPE a;  ///@note This is here to provide full template needed in Formats.
   1202 };
   1203 
   1204 //////////////////////////////////////////////////////////////////////////
   1205 /// Format4 - Bitfield for 4 component formats.
   1206 //////////////////////////////////////////////////////////////////////////
   1207 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
   1208 struct Format4
   1209 {
   1210     typedef typename FormatIntType<x + y + z + w>::TYPE TYPE;
   1211 
   1212     TYPE r : x;
   1213     TYPE g : y;
   1214     TYPE b : z;
   1215     TYPE a : w;
   1216 };
   1217 
   1218 //////////////////////////////////////////////////////////////////////////
/// Defaults - Default component values
   1220 //////////////////////////////////////////////////////////////////////////
   1221 template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
   1222 struct Defaults
   1223 {
   1224     INLINE static uint32_t GetDefault(uint32_t comp)
   1225     {
   1226         static const uint32_t defaults[4]{ x, y, z, w };
   1227         return defaults[comp];
   1228     }
   1229 };
   1230 
   1231 //////////////////////////////////////////////////////////////////////////
   1232 /// ComponentTraits - Component type traits.
   1233 //////////////////////////////////////////////////////////////////////////
   1234 template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
   1235 struct ComponentTraits
   1236 {
   1237     INLINE static SWR_TYPE GetType(uint32_t comp)
   1238     {
   1239         static const SWR_TYPE CompType[4]{ X, Y, Z, W };
   1240         return CompType[comp];
   1241     }
   1242 
   1243     INLINE static constexpr uint32_t GetConstBPC(uint32_t comp)
   1244     {
   1245         return (comp == 3) ? NumBitsW :
   1246             ((comp == 2) ? NumBitsZ :
   1247                 ((comp == 1) ? NumBitsY : NumBitsX) );
   1248     }
   1249 
   1250     INLINE static uint32_t GetBPC(uint32_t comp)
   1251     {
   1252         static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
   1253         return MyBpc[comp];
   1254     }
   1255 
   1256     INLINE static bool isNormalized(uint32_t comp)
   1257     {
   1258         switch (comp)
   1259         {
   1260         case 0:
   1261             return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
   1262         case 1:
   1263             return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
   1264         case 2:
   1265             return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
   1266         case 3:
   1267             return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
   1268         }
   1269         SWR_INVALID("Invalid component: %d", comp);
   1270         return false;
   1271     }
   1272 
   1273     INLINE static float toFloat(uint32_t comp)
   1274     {
   1275         switch (comp)
   1276         {
   1277         case 0:
   1278             return TypeTraits<X, NumBitsX>::toFloat();
   1279         case 1:
   1280             return TypeTraits<Y, NumBitsY>::toFloat();
   1281         case 2:
   1282             return TypeTraits<Z, NumBitsZ>::toFloat();
   1283         case 3:
   1284             return TypeTraits<W, NumBitsW>::toFloat();
   1285         }
   1286         SWR_INVALID("Invalid component: %d", comp);
   1287         return TypeTraits<X, NumBitsX>::toFloat();
   1288 
   1289     }
   1290 
   1291     INLINE static float fromFloat(uint32_t comp)
   1292     {
   1293         switch (comp)
   1294         {
   1295         case 0:
   1296             return TypeTraits<X, NumBitsX>::fromFloat();
   1297         case 1:
   1298             return TypeTraits<Y, NumBitsY>::fromFloat();
   1299         case 2:
   1300             return TypeTraits<Z, NumBitsZ>::fromFloat();
   1301         case 3:
   1302             return TypeTraits<W, NumBitsW>::fromFloat();
   1303         }
   1304         SWR_INVALID("Invalid component: %d", comp);
   1305         return TypeTraits<X, NumBitsX>::fromFloat();
   1306     }
   1307 
   1308     INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
   1309     {
   1310         switch (comp)
   1311         {
   1312         case 0:
   1313             return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
   1314         case 1:
   1315             return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
   1316         case 2:
   1317             return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
   1318         case 3:
   1319             return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
   1320         }
   1321         SWR_INVALID("Invalid component: %d", comp);
   1322         return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
   1323     }
   1324 
   1325     INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar const &src)
   1326     {
   1327         switch (comp)
   1328         {
   1329         case 0:
   1330             TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
   1331             return;
   1332         case 1:
   1333             TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
   1334             return;
   1335         case 2:
   1336             TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
   1337             return;
   1338         case 3:
   1339             TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
   1340             return;
   1341         }
   1342         SWR_INVALID("Invalid component: %d", comp);
   1343     }
   1344 
   1345     INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
   1346     {
   1347         simdscalar out;
   1348         switch (comp)
   1349         {
   1350         case 0:
   1351             out = TypeTraits<X, NumBitsX>::unpack(in); break;
   1352         case 1:
   1353             out = TypeTraits<Y, NumBitsY>::unpack(in); break;
   1354         case 2:
   1355             out = TypeTraits<Z, NumBitsZ>::unpack(in); break;
   1356         case 3:
   1357             out = TypeTraits<W, NumBitsW>::unpack(in); break;
   1358         default:
   1359             SWR_INVALID("Invalid component: %d", comp);
   1360             out = in;
   1361             break;
   1362         }
   1363         return out;
   1364     }
   1365 
   1366     INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
   1367     {
   1368         simdscalar out;
   1369         switch (comp)
   1370         {
   1371         case 0:
   1372             out = TypeTraits<X, NumBitsX>::pack(in); break;
   1373         case 1:
   1374             out = TypeTraits<Y, NumBitsY>::pack(in); break;
   1375         case 2:
   1376             out = TypeTraits<Z, NumBitsZ>::pack(in); break;
   1377         case 3:
   1378             out = TypeTraits<W, NumBitsW>::pack(in); break;
   1379         default:
   1380             SWR_INVALID("Invalid component: %d", comp);
   1381             out = in;
   1382             break;
   1383         }
   1384         return out;
   1385     }
   1386 
   1387     INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
   1388     {
   1389         switch (comp)
   1390         {
   1391         case 0:
   1392             return TypeTraits<X, NumBitsX>::convertSrgb(in);
   1393         case 1:
   1394             return TypeTraits<Y, NumBitsY>::convertSrgb(in);
   1395         case 2:
   1396             return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
   1397         case 3:
   1398             return TypeTraits<W, NumBitsW>::convertSrgb(in);
   1399         }
   1400         SWR_INVALID("Invalid component: %d", comp);
   1401         return TypeTraits<X, NumBitsX>::convertSrgb(in);
   1402     }
   1403 #if ENABLE_AVX512_SIMD16
   1404 
   1405     INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc)
   1406     {
   1407         switch (comp)
   1408         {
   1409         case 0:
   1410             return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
   1411         case 1:
   1412             return TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
   1413         case 2:
   1414             return TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
   1415         case 3:
   1416             return TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
   1417         }
   1418         SWR_INVALID("Invalid component: %d", comp);
   1419         return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
   1420     }
   1421 
   1422     INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar const &src)
   1423     {
   1424         switch (comp)
   1425         {
   1426         case 0:
   1427             TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
   1428             return;
   1429         case 1:
   1430             TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
   1431             return;
   1432         case 2:
   1433             TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
   1434             return;
   1435         case 3:
   1436             TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
   1437             return;
   1438         }
   1439         SWR_INVALID("Invalid component: %d", comp);
   1440         TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
   1441     }
   1442 
   1443     INLINE static simd16scalar unpack(uint32_t comp, simd16scalar &in)
   1444     {
   1445         switch (comp)
   1446         {
   1447         case 0:
   1448             return TypeTraits<X, NumBitsX>::unpack(in);
   1449         case 1:
   1450             return TypeTraits<Y, NumBitsY>::unpack(in);
   1451         case 2:
   1452             return TypeTraits<Z, NumBitsZ>::unpack(in);
   1453         case 3:
   1454             return TypeTraits<W, NumBitsW>::unpack(in);
   1455         }
   1456         SWR_INVALID("Invalid component: %d", comp);
   1457         return TypeTraits<X, NumBitsX>::unpack(in);
   1458     }
   1459 
   1460     INLINE static simd16scalar pack(uint32_t comp, simd16scalar &in)
   1461     {
   1462         switch (comp)
   1463         {
   1464         case 0:
   1465             return TypeTraits<X, NumBitsX>::pack(in);
   1466         case 1:
   1467             return TypeTraits<Y, NumBitsY>::pack(in);
   1468         case 2:
   1469             return TypeTraits<Z, NumBitsZ>::pack(in);
   1470         case 3:
   1471             return TypeTraits<W, NumBitsW>::pack(in);
   1472         }
   1473         SWR_INVALID("Invalid component: %d", comp);
   1474         return TypeTraits<X, NumBitsX>::pack(in);
   1475     }
   1476 
   1477     INLINE static simd16scalar convertSrgb(uint32_t comp, simd16scalar &in)
   1478     {
   1479         switch (comp)
   1480         {
   1481         case 0:
   1482             return TypeTraits<X, NumBitsX>::convertSrgb(in);
   1483         case 1:
   1484             return TypeTraits<Y, NumBitsY>::convertSrgb(in);
   1485         case 2:
   1486             return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
   1487         case 3:
   1488             return TypeTraits<W, NumBitsW>::convertSrgb(in);
   1489         }
   1490         SWR_INVALID("Invalid component: %d", comp);
   1491         return TypeTraits<X, NumBitsX>::convertSrgb(in);
   1492     }
   1493 #endif
   1494 };
   1495