/****************************************************************************
* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/

#ifndef __SWR_SIMD16INTRIN_H__
#define __SWR_SIMD16INTRIN_H__

#if ENABLE_AVX512_SIMD16

#if KNOB_SIMD16_WIDTH == 16

#if ENABLE_AVX512_EMULATION
struct simd16scalar
{
    __m256  lo;
    __m256  hi;
};
struct simd16scalard
{
    __m256d lo;
    __m256d hi;
};
struct simd16scalari
{
    __m256i lo;
    __m256i hi;
};
typedef uint16_t simd16mask;

#define _simd16_masklo(mask) ((mask) & 0xFF)
#define _simd16_maskhi(mask) (((mask) >> 8))
#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
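// The three mask helpers above pack one AVX movemask byte per 256-bit half:
// bits [7:0] cover lanes 0-7 (lo) and bits [15:8] cover lanes 8-15 (hi). For
// example, _simd16_setmask(0x01, 0x80) yields 0x0180, selecting lane 7 of
// the low half and lane 8 (bit 0 of the high byte) of the 16-wide vector.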

#else
typedef __m512 simd16scalar;
typedef __m512d simd16scalard;
typedef __m512i simd16scalari;
typedef __mmask16 simd16mask;
#endif//ENABLE_AVX512_EMULATION
#else
#error Unsupported vector width
#endif//KNOB_SIMD16_WIDTH == 16

OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
{
    simd16scalar  v[4];
    struct
    {
        simd16scalar x, y, z, w;
    };

    simd16scalar& operator[] (const int i) { return v[i]; }
    const simd16scalar& operator[] (const int i) const { return v[i]; }
};

#if ENABLE_AVX512_EMULATION

#define SIMD16_EMU_AVX512_0(type, func, intrin) \
INLINE type func()\
{\
    type result;\
\
    result.lo = intrin();\
    result.hi = intrin();\
\
    return result;\
}

#define SIMD16_EMU_AVX512_1(type, func, intrin) \
INLINE type func(type a)\
{\
    type result;\
\
    result.lo = intrin(a.lo);\
    result.hi = intrin(a.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_2(type, func, intrin) \
INLINE type func(type a, type b)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo);\
    result.hi = intrin(a.hi, b.hi);\
\
    return result;\
}

#define SIMD16_EMU_AVX512_3(type, func, intrin) \
INLINE type func(type a, type b, type c)\
{\
    type result;\
\
    result.lo = intrin(a.lo, b.lo, c.lo);\
    result.hi = intrin(a.hi, b.hi, c.hi);\
\
    return result;\
}
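
// Each SIMD16_EMU_AVX512_<N> invocation below stamps out a 16-wide wrapper
// that applies the same 8-wide intrinsic to both 256-bit halves. For
// instance, SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
// expands to roughly:
//
//     INLINE simd16scalar _simd16_add_ps(simd16scalar a, simd16scalar b)
//     {
//         simd16scalar result;
//         result.lo = _mm256_add_ps(a.lo, b.lo);
//         result.hi = _mm256_add_ps(a.hi, b.hi);
//         return result;
//     }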

SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)

INLINE simd16scalar _simd16_set1_ps(float a)
{
    simd16scalar result;

    result.lo = _mm256_set1_ps(a);
    result.hi = _mm256_set1_ps(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi8(char a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi8(a);
    result.hi = _mm256_set1_epi8(a);

    return result;
}

INLINE simd16scalari _simd16_set1_epi32(int a)
{
    simd16scalari result;

    result.lo = _mm256_set1_epi32(a);
    result.hi = _mm256_set1_epi32(a);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);

    return result;
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    simd16scalar result;

    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    simd16scalari result;

    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);

    return result;
}

INLINE simd16scalar _simd16_load_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_load_ps(m);
    result.hi = _mm256_load_ps(n);

    return result;
}

INLINE simd16scalar _simd16_loadu_ps(float const *m)
{
    simd16scalar result;

    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));

    result.lo = _mm256_loadu_ps(m);
    result.hi = _mm256_loadu_ps(n);

    return result;
}

INLINE simd16scalar _simd16_load1_ps(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalari _simd16_load_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_load_si256(&m[0].lo);
    result.hi = _mm256_load_si256(&m[0].hi);

    return result;
}

INLINE simd16scalari _simd16_loadu_si(simd16scalari const *m)
{
    simd16scalari result;

    result.lo = _mm256_loadu_si256(&m[0].lo);
    result.hi = _mm256_loadu_si256(&m[0].hi);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ss(float const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ss(m);
    result.hi = _mm256_broadcast_ss(m);

    return result;
}

INLINE simd16scalar _simd16_broadcast_ps(__m128 const *m)
{
    simd16scalar result;

    result.lo = _mm256_broadcast_ps(m);
    result.hi = _mm256_broadcast_ps(m);

    return result;
}

INLINE void _simd16_store_ps(float *m, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_store_ps(m, a.lo);
    _mm256_store_ps(n, a.hi);
}

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));

    _mm256_maskstore_ps(m, mask.lo, a.lo);
    _mm256_maskstore_ps(n, mask.hi, a.hi);
}

INLINE void _simd16_store_si(simd16scalari *m, simd16scalari a)
{
    _mm256_store_si256(&m[0].lo, a.lo);
    _mm256_store_si256(&m[0].hi, a.hi);
}

INLINE simdscalar _simd16_extract_ps(simd16scalar a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_ps(0.0f);
}

INLINE simdscalari _simd16_extract_si(simd16scalari a, int imm8)
{
    switch (imm8)
    {
    case 0:
        return a.lo;
    case 1:
        return a.hi;
    }
    return _simd_set1_epi32(0);
}

INLINE simd16scalar _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

INLINE simd16scalari _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
{
    switch (imm8)
    {
    case 0:
        a.lo = b;
        break;
    case 1:
        a.hi = b;
        break;
    }
    return a;
}

template <simd16mask mask>
INLINE simd16scalar _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
    result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));

    return result;
}

#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)
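// _mm256_blend_ps requires its blend control to be a compile-time immediate,
// so the emulation routes the 16-bit mask through a non-type template
// parameter; the _simd16_blend_ps macro above preserves the intrinsic-style
// call syntax, e.g. _simd16_blend_ps(a, b, 0x00FF) takes the low eight lanes
// from b and the high eight from a.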

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));

    return result;
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16scalari result;

    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_ps(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_ps(a.hi);

    return mask;
}

INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_pd(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_pd(a.hi);

    return mask;
}

// caution: _mm256_movemask_epi8 produces 32 bits per 256-bit half, but only
// the low 8 bits of each half survive in the 16-bit emulated mask
INLINE simd16mask _simd16_movemask_epi8(simd16scalari a)
{
    simd16mask mask;

    reinterpret_cast<uint8_t *>(&mask)[0] = _mm256_movemask_epi8(a.lo);
    reinterpret_cast<uint8_t *>(&mask)[1] = _mm256_movemask_epi8(a.hi);

    return mask;
}

INLINE simd16scalari _simd16_cvtps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvtps_epi32(a.lo);
    result.hi = _mm256_cvtps_epi32(a.hi);

    return result;
}

INLINE simd16scalari _simd16_cvttps_epi32(simd16scalar a)
{
    simd16scalari result;

    result.lo = _mm256_cvttps_epi32(a.lo);
    result.hi = _mm256_cvttps_epi32(a.hi);

    return result;
}

INLINE simd16scalar _simd16_cvtepi32_ps(simd16scalari a)
{
    simd16scalar result;

    result.lo = _mm256_cvtepi32_ps(a.lo);
    result.hi = _mm256_cvtepi32_ps(a.hi);

    return result;
}

template <int comp>
INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);

    return result;
}

#define _simd16_cmplt_ps(a, b) _simd16_cmp_ps<_CMP_LT_OQ>(a, b)
#define _simd16_cmpgt_ps(a, b) _simd16_cmp_ps<_CMP_GT_OQ>(a, b)
#define _simd16_cmpneq_ps(a, b) _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b)
#define _simd16_cmpeq_ps(a, b) _simd16_cmp_ps<_CMP_EQ_OQ>(a, b)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)
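// All of the comparison helpers above use ordered, non-signaling (_OQ)
// predicates: a NaN in either operand produces a false (all-zero) lane and
// raises no floating-point exception, so e.g. _simd16_cmpeq_ps(x, x) yields
// zero lanes wherever x holds a NaN.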

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)

INLINE simd16scalar _simd16_castsi_ps(simd16scalari a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalari _simd16_castps_si(simd16scalar a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalard _simd16_castsi_pd(simd16scalari a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

INLINE simd16scalari _simd16_castpd_si(simd16scalard a)
{
    return *reinterpret_cast<simd16scalari *>(&a);
}

INLINE simd16scalar _simd16_castpd_ps(simd16scalard a)
{
    return *reinterpret_cast<simd16scalar *>(&a);
}

INLINE simd16scalard _simd16_castps_pd(simd16scalar a)
{
    return *reinterpret_cast<simd16scalard *>(&a);
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _mm256_andnot_ps)

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    simd16scalar result;

    result.lo = _mm256_round_ps(a.lo, mode);
    result.hi = _mm256_round_ps(a.hi, mode);

    return result;
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)

INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}

SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)

template <int imm8>
INLINE simd16scalari _simd16_slli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_slli_epi32(a.lo, imm8);
    result.hi = _simd_slli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srai_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srai_epi32(a.lo, imm8);
    result.hi = _simd_srai_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)

template <int imm8>
INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)
{
    simd16scalari result;

    result.lo = _simd_srli_epi32(a.lo, imm8);
    result.hi = _simd_srli_epi32(a.hi, imm8);

    return result;
}

#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)

SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)

//__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
template <int scale>
INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
{
    simd16scalar result;

    result.lo = _simd_i32gather_ps(m, index.lo, scale);
    result.hi = _simd_i32gather_ps(m, index.hi, scale);

    return result;
}

#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)

//__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
template <int scale>
INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
{
    simd16scalar result;

    result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
    result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);

    return result;
}

#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)

INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
    simd16scalar result;

    const simdscalari mask = _simd_set1_epi32(7);

    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));

    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));

    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));

    return result;
}

INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
{
    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)

template <int imm8>
INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)
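// The imm8 control mirrors _mm512_shuffle_f32x4: four 2-bit fields select a
// 128-bit lane per output position, with the low two fields indexing lanes
// of a and the high two indexing lanes of b. The shifts above repack each
// pair of fields into an _mm256_permute2f128 control, whose selector values
// 0-3 likewise pick one of the four source lanes (lo.lo, lo.hi, hi.lo, hi.hi).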

template <int imm8>
INLINE simd16scalard _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
{
    simd16scalari result;

    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));

    return result;
}

#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalar _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16scalar result;

    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);

    return result;
}

#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalard _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
{
    simd16scalard result;

    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));

    return result;
}

#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16scalari _simd16_cvtepu8_epi16(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));

    return result;
}

INLINE simd16scalari _simd16_cvtepu8_epi32(__m128i a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu8_epi32(a);
    result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));

    return result;
}

INLINE simd16scalari _simd16_cvtepu16_epi32(simdscalari a)
{
    simd16scalari result;

    result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
    result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));

    return result;
}

SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return mask;
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return mask;
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}
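
// Example: vMask16(0x00FF) returns a vector whose low eight lanes are
// all-ones (lane i is set when bit i of the mask is set) and whose high
// eight lanes are zero, ready for use with the blendv-style helpers above.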

#else

INLINE simd16mask _simd16_scalari2mask(simd16scalari mask)
{
    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
}

#if 0
INLINE simd16mask _simd16_scalard2mask(simd16scalard mask)
{
    return _mm512_cmpneq_epu64_mask(mask, _mm512_setzero_epi64());
}
#endif

#define _simd16_setzero_ps      _mm512_setzero_ps
#define _simd16_setzero_si      _mm512_setzero_si512
#define _simd16_set1_ps         _mm512_set1_ps
#define _simd16_set1_epi8       _mm512_set1_epi8
#define _simd16_set1_epi32      _mm512_set1_epi32

INLINE simd16scalar _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalar _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
{
    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
{
    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
}

#define _simd16_load_ps         _mm512_load_ps
#define _simd16_loadu_ps        _mm512_loadu_ps
#if 1
#define _simd16_load1_ps        _simd16_broadcast_ss
#endif
#define _simd16_load_si         _mm512_load_si512
#define _simd16_loadu_si        _mm512_loadu_si512
#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
#define _simd16_store_ps        _mm512_store_ps
#define _simd16_store_si        _mm512_store_si512
#define _simd16_extract_ps      _mm512_extractf32x8_ps
#define _simd16_extract_si      _mm512_extracti32x8_epi32
#define _simd16_insert_ps       _mm512_insertf32x8
#define _simd16_insert_si       _mm512_inserti32x8

INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
{
    simd16mask k = _simd16_scalari2mask(mask);

    _mm512_mask_store_ps(m, k, a);
}

#define _simd16_blend_ps(a, b, mask)    _mm512_mask_blend_ps(mask, a, b)

INLINE simd16scalar _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_ps(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
{
    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));

    return _mm512_mask_blend_epi32(k, a, b);
}

INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
{
    simd16mask k = _simd16_scalari2mask(mask);

    return _mm512_mask_blend_epi32(k, a, b);
}

#define _simd16_mul_ps          _mm512_mul_ps
#define _simd16_add_ps          _mm512_add_ps
#define _simd16_sub_ps          _mm512_sub_ps
#define _simd16_rsqrt_ps        _mm512_rsqrt14_ps
#define _simd16_min_ps          _mm512_min_ps
#define _simd16_max_ps          _mm512_max_ps

INLINE simd16mask _simd16_movemask_ps(simd16scalar a)
{
    return  _simd16_scalari2mask(_mm512_castps_si512(a));
}

#if 0
INLINE simd16mask _simd16_movemask_pd(simd16scalard a)
{
    return  _simd16_scalard2mask(_mm512_castpd_si512(a));
}
#endif

#if 0
INLINE int _simd16_movemask_epi8(simd16scalari a)
{
    return  _simd16_scalari2mask(a);
}
#endif

#define _simd16_cvtps_epi32     _mm512_cvtps_epi32
#define _simd16_cvttps_epi32    _mm512_cvttps_epi32
#define _simd16_cvtepi32_ps     _mm512_cvtepi32_ps

template <int comp>
INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
{
    simd16mask k = _mm512_cmp_ps_mask(a, b, comp);

    return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
}

#define _simd16_cmp_ps(a, b, comp)  _simd16_cmp_ps_temp<comp>(a, b)

#define _simd16_cmplt_ps(a, b)      _simd16_cmp_ps<_CMP_LT_OQ>(a, b)
#define _simd16_cmpgt_ps(a, b)      _simd16_cmp_ps<_CMP_GT_OQ>(a, b)
#define _simd16_cmpneq_ps(a, b)     _simd16_cmp_ps<_CMP_NEQ_OQ>(a, b)
#define _simd16_cmpeq_ps(a, b)      _simd16_cmp_ps<_CMP_EQ_OQ>(a, b)
#define _simd16_cmpge_ps(a, b)      _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b)      _simd16_cmp_ps<_CMP_LE_OQ>(a, b)

#define _simd16_castsi_ps           _mm512_castsi512_ps
#define _simd16_castps_si           _mm512_castps_si512
#define _simd16_castsi_pd           _mm512_castsi512_pd
#define _simd16_castpd_si           _mm512_castpd_si512
#define _simd16_castpd_ps           _mm512_castpd_ps
#define _simd16_castps_pd           _mm512_castps_pd

#define _simd16_andnot_ps           _mm512_andnot_ps

template <int mode>
INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
{
    return _mm512_roundscale_ps(a, mode);
}

#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)

#define _simd16_mul_epi32         _mm512_mul_epi32
#define _simd16_mullo_epi32       _mm512_mullo_epi32
#define _simd16_sub_epi32         _mm512_sub_epi32
#define _simd16_sub_epi64         _mm512_sub_epi64
#define _simd16_min_epi32         _mm512_min_epi32
#define _simd16_max_epi32         _mm512_max_epi32
#define _simd16_min_epu32         _mm512_min_epu32
#define _simd16_max_epu32         _mm512_max_epu32
#define _simd16_add_epi32         _mm512_add_epi32
#define _simd16_and_si            _mm512_and_si512
#define _simd16_andnot_si         _mm512_andnot_si512
#define _simd16_or_si             _mm512_or_si512
#define _simd16_xor_si            _mm512_xor_si512

INLINE simd16scalari _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpeq_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmpgt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
{
    simd16mask k = _mm512_cmplt_epi32_mask(a, b);

    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
}

#if 0
INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
    int lo = _mm256_testz_ps(a.lo, b.lo);
    int hi = _mm256_testz_ps(a.hi, b.hi);

    return lo & hi;
}

#endif

#define _simd16_unpacklo_ps       _mm512_unpacklo_ps
#define _simd16_unpackhi_ps       _mm512_unpackhi_ps
#define _simd16_unpacklo_pd       _mm512_unpacklo_pd
#define _simd16_unpackhi_pd       _mm512_unpackhi_pd
#define _simd16_unpacklo_epi8     _mm512_unpacklo_epi8
#define _simd16_unpackhi_epi8     _mm512_unpackhi_epi8
#define _simd16_unpacklo_epi16    _mm512_unpacklo_epi16
#define _simd16_unpackhi_epi16    _mm512_unpackhi_epi16
#define _simd16_unpacklo_epi32    _mm512_unpacklo_epi32
#define _simd16_unpackhi_epi32    _mm512_unpackhi_epi32
#define _simd16_unpacklo_epi64    _mm512_unpacklo_epi64
#define _simd16_unpackhi_epi64    _mm512_unpackhi_epi64
#define _simd16_slli_epi32        _mm512_slli_epi32
#define _simd16_srli_epi32        _mm512_srli_epi32
#define _simd16_srai_epi32        _mm512_srai_epi32
#define _simd16_fmadd_ps          _mm512_fmadd_ps
#define _simd16_fmsub_ps          _mm512_fmsub_ps
#define _simd16_adds_epu8         _mm512_adds_epu8
#define _simd16_subs_epu8         _mm512_subs_epu8
#define _simd16_add_epi8          _mm512_add_epi8
#define _simd16_shuffle_epi8      _mm512_shuffle_epi8

#define _simd16_i32gather_ps(m, index, scale)               _mm512_i32gather_ps(index, m, scale)
#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _mm512_mask_i32gather_ps(a, _simd16_scalari2mask(mask), index, m, scale)
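// Note the operand order: the AVX-512 gathers take the index vector before
// the base pointer (the reverse of the AVX2 form), and the masked variant
// takes a __mmask16 writemask, hence the _simd16_scalari2mask conversion in
// the macro above.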

#define _simd16_abs_epi32         _mm512_abs_epi32

INLINE simd16scalari _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
{
    __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);

    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
{
    __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);

    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

INLINE simd16scalari _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
{
    __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);

    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
}

#define _simd16_permute_ps(a, i)        _mm512_permutexvar_ps(i, a)
#define _simd16_permute_epi32(a, i)     _mm512_permutexvar_epi32(i, a)
#define _simd16_sllv_epi32              _mm512_sllv_epi32
#define _simd16_srlv_epi32              _mm512_srlv_epi32
#define _simd16_permute2f128_ps         _mm512_shuffle_f32x4
#define _simd16_permute2f128_pd         _mm512_shuffle_f64x2
#define _simd16_permute2f128_si         _mm512_shuffle_i32x4
#define _simd16_shuffle_ps              _mm512_shuffle_ps
#define _simd16_shuffle_pd              _mm512_shuffle_pd
#define _simd16_cvtepu8_epi16           _mm512_cvtepu8_epi16
#define _simd16_cvtepu8_epi32           _mm512_cvtepu8_epi32
#define _simd16_cvtepu16_epi32          _mm512_cvtepu16_epi32
#define _simd16_packus_epi16            _mm512_packus_epi16
#define _simd16_packs_epi16             _mm512_packs_epi16
#define _simd16_packus_epi32            _mm512_packus_epi32
#define _simd16_packs_epi32             _mm512_packs_epi32

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
}

#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)

template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
{
    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
}

#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)

INLINE simd16mask _simd16_int2mask(int mask)
{
    return _mm512_int2mask(mask);
}

INLINE int _simd16_mask2int(simd16mask mask)
{
    return _mm512_mask2int(mask);
}

INLINE simd16mask _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
{
    return _mm512_cmplt_ps_mask(a, b);
}

// convert bitmask to vector mask
INLINE simd16scalar vMask16(int32_t mask)
{
    simd16scalari temp = _simd16_set1_epi32(mask);

    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);

    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));

    return _simd16_castsi_ps(result);
}

#endif//ENABLE_AVX512_EMULATION

#endif//ENABLE_AVX512_SIMD16

#endif//__SWR_SIMD16INTRIN_H__