// Home | History | Annotate | Download | only in common  (code-viewer navigation residue; kept as a comment so the file stays valid C++)
      1 /****************************************************************************
      2 * Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 ****************************************************************************/
// Guard: this header is a class-body fragment and only makes sense when
// textually included by simdlib.hpp inside the wrapper class.
#if !defined(__SIMD_LIB_AVX512_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

#if defined(__GNUC__) && !defined( __clang__) && !defined(__INTEL_COMPILER)
// gcc as of 7.1 was missing these intrinsics
// Each fallback forwards to the generic compare intrinsic with the same
// predicate the named form uses: NEQ_UQ = not-equal, unordered, quiet;
// LT_OS = less-than, ordered, signaling.
#ifndef _mm512_cmpneq_ps_mask
#define _mm512_cmpneq_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_NEQ_UQ)
#endif

#ifndef _mm512_cmplt_ps_mask
#define _mm512_cmplt_ps_mask(a,b) _mm512_cmp_ps_mask((a),(b),_CMP_LT_OS)
#endif

#ifndef _mm512_cmplt_pd_mask
#define _mm512_cmplt_pd_mask(a,b) _mm512_cmp_pd_mask((a),(b),_CMP_LT_OS)
#endif

#endif
     42 
     43 //============================================================================
     44 // SIMD16 AVX512 (F) implementation (compatible with Knights and Core
     45 // processors)
     46 //
     47 //============================================================================
     48 
// Number of 32-bit lanes in this implementation (512 bits / 32).
static const int TARGET_SIMD_WIDTH = 16;
// NOTE(review): presumably the half-width implementation used by the
// front end for ops emulated as two 256-bit halves — confirm in simdlib.hpp.
using SIMD256T = SIMD256Impl::AVX2Impl;
     51 
// Boilerplate generators: each SIMD_*WRAPPER* macro expands to one static
// member function forwarding to a _mm512_* intrinsic.
//   *WRAPPER*   -> Float operands, *DWRAPPER* -> Double, *IWRAPPER* -> Integer
//   trailing _  -> the intrinsic (or its suffix) is given explicitly
//   *_2I        -> binary op with an extra immediate template parameter
// NOTE(review): SIMD_WRAPPER_1_ takes the FULL intrinsic name while
// SIMD_WRAPPER_2_ prepends _mm512_ to its second argument — inconsistent
// but relied upon by the expansions below.
#define SIMD_WRAPPER_1_(op, intrin)  \
    static SIMDINLINE Float SIMDCALL op(Float a)   \
    {\
        return intrin(a);\
    }

#define SIMD_WRAPPER_1(op)  \
    SIMD_WRAPPER_1_(op, _mm512_##op)

#define SIMD_WRAPPER_2_(op, intrin)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)

// Float op implemented on the integer domain via bit-preserving casts.
#define SIMD_WRAPPERI_2_(op, intrin)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return _mm512_castsi512_ps(_mm512_##intrin(\
            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
    }

#define SIMD_DWRAPPER_2(op)  \
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
    {\
        return _mm512_##op(a, b);\
    }

// Binary op taking a compile-time immediate (e.g. shuffle control).
#define SIMD_WRAPPER_2I_(op, intrin)  \
    template<int ImmT>\
    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_WRAPPER_2I(op)  SIMD_WRAPPER_2I_(op, op)

#define SIMD_DWRAPPER_2I_(op, intrin)  \
    template<int ImmT>\
    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_DWRAPPER_2I(op)  SIMD_DWRAPPER_2I_(op, op)

#define SIMD_WRAPPER_3(op)  \
    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
    {\
        return _mm512_##op(a, b, c);\
    }

#define SIMD_IWRAPPER_1(op)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
    {\
        return _mm512_##op(a);\
    }
// Widening op taking a 256-bit integer source.
#define SIMD_IWRAPPER_1_8(op)  \
    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a)   \
    {\
        return _mm512_##op(a);\
    }

// Widening op taking a 128-bit integer source.
#define SIMD_IWRAPPER_1_4(op)  \
    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a)   \
    {\
        return _mm512_##op(a);\
    }

#define SIMD_IWRAPPER_1I_(op, intrin)  \
    template<int ImmT> \
    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
    {\
        return intrin(a, ImmT);\
    }
#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)

#define SIMD_IWRAPPER_2_(op, intrin)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return _mm512_##intrin(a, b);\
    }
#define SIMD_IWRAPPER_2(op)  SIMD_IWRAPPER_2_(op, op)

// Forwards to one of the cmp_epi* member templates defined below
// (not to an intrinsic), so no _mm512_ prefix is applied.
#define SIMD_IWRAPPER_2_CMP(op, cmp)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return cmp(a, b);\
    }

// Integer op implemented with a float intrinsic via bit-preserving casts.
#define SIMD_IFWRAPPER_2(op, intrin)  \
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
    }

#define SIMD_IWRAPPER_2I_(op, intrin)  \
    template<int ImmT>\
    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
    {\
        return _mm512_##intrin(a, b, ImmT);\
    }
#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
    154 
    155 private:
    // Expand an AVX512 predicate mask into a legacy vector mask:
    // 32-bit lane i becomes all-ones when bit i of m is set, else zero.
    static SIMDINLINE Integer vmask(__mmask16 m)
    {
        return _mm512_maskz_set1_epi32(m, -1);
    }

    // Same expansion for 64-bit lanes (8 lanes per 512-bit vector).
    static SIMDINLINE Integer vmask(__mmask8 m)
    {
        return _mm512_maskz_set1_epi64(m, -1LL);
    }
    165 
    166 public:
    167 //-----------------------------------------------------------------------
    168 // Single precision floating point arithmetic operations
    169 //-----------------------------------------------------------------------
SIMD_WRAPPER_2(add_ps);     // return a + b
SIMD_WRAPPER_2(div_ps);     // return a / b
SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
SIMD_WRAPPER_2(mul_ps);     // return a * b
// rcp14/rsqrt14 are the AVX512F approximation instructions (~2^-14
// relative error) — faster than, but not bit-equal to, exact IEEE results.
SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp14_ps);       // return 1.0f / a
SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt14_ps);   // return 1.0f / sqrt(a)
SIMD_WRAPPER_2(sub_ps);     // return a - b
    180 
    181 template <RoundMode RMT>
    182 static SIMDINLINE Float SIMDCALL round_ps(Float a)
    183 {
    184     return _mm512_roundscale_ps(a, static_cast<int>(RMT));
    185 }
    186 
    187 static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
    188 static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
    189 
    190 //-----------------------------------------------------------------------
    191 // Integer (various width) arithmetic operations
    192 //-----------------------------------------------------------------------
    193 SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
    194 SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
    195 //SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
    196 //SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
    197 SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
    198 SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
    199 SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
    200 SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
    201 SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
    202 
    203                             // return (a * b) & 0xFFFFFFFF
    204                             //
    205                             // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
    206                             // and store the low 32 bits of the intermediate integers in dst.
    207 SIMD_IWRAPPER_2(mullo_epi32);
    208 SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
    209 SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
    210 //SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
    211 
    212 //-----------------------------------------------------------------------
    213 // Logical operations
    214 //-----------------------------------------------------------------------
    215 SIMD_IWRAPPER_2_(and_si, and_si512);        // return a & b       (int)
    216 SIMD_IWRAPPER_2_(andnot_si, andnot_si512);  // return (~a) & b    (int)
    217 SIMD_IWRAPPER_2_(or_si, or_si512);          // return a | b       (int)
    218 SIMD_IWRAPPER_2_(xor_si, xor_si512);        // return a ^ b       (int)
    219 
    220 // SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
    221 // SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
    222 // SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
    223 // SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)
    224 
    225 
    226 //-----------------------------------------------------------------------
    227 // Shift operations
    228 //-----------------------------------------------------------------------
    229 SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
    230 SIMD_IWRAPPER_2(sllv_epi32);
    231 SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
    232 SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
    233 
    234 #if 0
    235 SIMD_IWRAPPER_1I_(srli_si, srli_si512);     // return a >> (ImmT*8) (uint)
    236 
    237 template<int ImmT>                              // same as srli_si, but with Float cast to int
    238 static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
    239 {
    240     return castsi_ps(srli_si<ImmT>(castps_si(a)));
    241 }
    242 #endif
    243 
    244 SIMD_IWRAPPER_2(srlv_epi32);
    245 
    246 //-----------------------------------------------------------------------
    247 // Conversion operations
    248 //-----------------------------------------------------------------------
// Bit-pattern reinterpretation casts — no value conversion, no codegen cost.
static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
{
    return _mm512_castpd_ps(a);
}

static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
{
    return _mm512_castps_si512(a);
}

static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
{
    return _mm512_castsi512_pd(a);
}

static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
{
    return _mm512_castps_pd(a);
}

static SIMDINLINE Integer SIMDCALL castpd_si(Double a)   // return *(Integer*)(&a)
{
    return _mm512_castpd_si512(a);
}

static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
{
    return _mm512_castsi512_ps(a);
}

static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
{
    return _mm512_cvtepi32_ps(a);
}

// Widening zero-extensions from narrower (128/256-bit) integer sources.
//SIMD_IWRAPPER_1_8(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
SIMD_IWRAPPER_1_4(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
SIMD_IWRAPPER_1_8(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
SIMD_IWRAPPER_1_4(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
SIMD_IWRAPPER_1_8(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)

// Rounds per the current MXCSR rounding mode (round-to-nearest by default).
static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
{
    return _mm512_cvtps_epi32(a);
}

// Truncates toward zero (the extra 't'), matching C's float->int cast.
static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
{
    return _mm512_cvttps_epi32(a);
}
    299 
    300 //-----------------------------------------------------------------------
    301 // Comparison operations
    302 //-----------------------------------------------------------------------
// Compare a against b with predicate CmpTypeT and return the result as an
// AVX512 predicate mask (one bit per 32-bit lane).
template<CompareType CmpTypeT>
static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
{
    return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
}
    308 
    309 template<CompareType CmpTypeT>
    310 static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
    311 {
    312     // Legacy vector mask generator
    313     __mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
    314     return castsi_ps(vmask(result));
    315 }
    316 
// Convenience forms for the common ordered, non-signaling (OQ) predicates.
// Each returns a per-lane all-ones/all-zeros vector mask.
static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
    323 
    324 template<CompareTypeInt CmpTypeT>
    325 static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
    326 {
    327     // Legacy vector mask generator
    328     __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
    329     return vmask(result);
    330 }
    331 template<CompareTypeInt CmpTypeT>
    332 static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
    333 {
    334     // Legacy vector mask generator
    335     __mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
    336     return vmask(result);
    337 }
    338 
// Named integer compares; each produces a full-width per-lane vector mask
// (via cmp_epi32/cmp_epi64 above), not an AVX512 predicate mask.
//SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);    // return a == b (int8)
//SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>);   // return a == b (int16)
SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>);   // return a == b (int32)
SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>);   // return a == b (int64)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);    // return a > b (int8)
//SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>);   // return a > b (int16)
SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>);   // return a > b (int32)
SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>);   // return a > b (int64)
SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>);   // return a < b (int32)
    348 
    349 static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
    350 {
    351     return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
    352 }
    353 
    354 static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
    355 {
    356     return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
    357 }
    358 
    359 //-----------------------------------------------------------------------
    360 // Blend / shuffle / permute operations
    361 //-----------------------------------------------------------------------
    362 template <int ImmT>
    363 static SIMDINLINE Float blend_ps(Float a, Float b) // return ImmT ? b : a  (float)
    364 {
    365     return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
    366 }
    367 
    368 template <int ImmT>
    369 static SIMDINLINE Integer blend_epi32(Integer a, Integer b) // return ImmT ? b : a  (int32)
    370 {
    371     return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
    372 }
    373 
    374 static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a  (float)
    375 {
    376     return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
    377 }
    378 
    379 
    380 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
    381 {
    382     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
    383 }
    384 
    385 static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
    386 {
    387     return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
    388 }
    389 
    390 static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
    391 {
    392     return _mm512_set1_ps(*p);
    393 }
    394 
// 256-bit half extract/insert. AVX512F only provides the 64-bit-granularity
// f64x4/i64x4 forms (the f32x8/i32x8 variants require AVX512DQ), so the
// float variants are routed through them with bit-preserving casts.
template<int imm>
static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
{
    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
}

template<int imm>
static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
{
    return _mm512_extractf64x4_pd(a, imm);
}

template<int imm>
static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
{
    return _mm512_extracti64x4_epi64(a, imm);
}

// Replace 256-bit half `imm` (0 = low, 1 = high) of a with b.
template<int imm>
static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
{
    return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
}

template<int imm>
static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
{
    return _mm512_insertf64x4(a, b, imm);
}

template<int imm>
static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
{
    return _mm512_inserti64x4(a, b, imm);
}
    430 
    431 // SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
    432 // SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
    433 // SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
    434 // SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
    435 
// Full-width cross-lane permute: lane i of the result is a[swiz[i] & 0xf].
static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (int32)
{
    return _mm512_permutexvar_epi32(swiz, a);
}

static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
{
    return _mm512_permutexvar_ps(swiz, a);
}

// "permute2f128"-style ops emulated with the AVX512 128-bit-chunk shuffles.
// NOTE(review): shuffle_f32x4/f64x2/i32x4 select chunks but cannot zero
// them the way vperm2f128's bit 3 could — presumably callers never use
// the zeroing form; confirm.
SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);

SIMD_IWRAPPER_1I(shuffle_epi32);

//SIMD_IWRAPPER_2(shuffle_epi8);
SIMD_DWRAPPER_2I(shuffle_pd);
SIMD_WRAPPER_2I(shuffle_ps);
    455 
    456 template<int ImmT>
    457 static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
    458 {
    459     return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
    460 }
    461 
// Unpack/interleave ops. On AVX512 these operate independently within each
// 128-bit sub-lane, matching the SSE/AVX behavior callers expect.
SIMD_IWRAPPER_2(unpackhi_epi16);

// NOTE(review): written out by hand rather than via SIMD_IFWRAPPER_2
// (commented out below) — the macro form appears equivalent; confirm why
// it was abandoned before re-enabling it.
//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
{
    return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
}

SIMD_IWRAPPER_2(unpackhi_epi64);
//SIMD_IWRAPPER_2(unpackhi_epi8);
SIMD_DWRAPPER_2(unpackhi_pd);
SIMD_WRAPPER_2(unpackhi_ps);
//SIMD_IWRAPPER_2(unpacklo_epi16);
SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
SIMD_IWRAPPER_2(unpacklo_epi64);
//SIMD_IWRAPPER_2(unpacklo_epi8);
SIMD_DWRAPPER_2(unpacklo_pd);
SIMD_WRAPPER_2(unpacklo_ps);
    479 SIMD_WRAPPER_2(unpacklo_ps);
    480 
    481 //-----------------------------------------------------------------------
    482 // Load / store operations
    483 //-----------------------------------------------------------------------
// Gather: lane i loads from byte address p + idx[i] * ScaleT.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
{
    return _mm512_i32gather_ps(idx, p, static_cast<int>(ScaleT));
}

static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
{
    return broadcast_ss(p);
}

// Aligned load: p must be 64-byte aligned.
static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
{
    return _mm512_load_ps(p);
}

static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
{
    return _mm512_load_si512(&p->v);
}

static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
{
    return _mm512_loadu_ps(p);
}

static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
{
    return _mm512_loadu_si512(p);
}
    514 
// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
// NOTE(review): the implementation below actually gathers lanes where
// mask != 0.0f (unordered compare, so NaN mask lanes also gather) rather
// than testing only the sign bit described above — confirm callers always
// pass all-ones/all-zeros masks.
template<ScaleFactor ScaleT>
static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
{
    __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());

    return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
}

// Store only the lanes whose mask value is negative (sign bit set).
static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
{
    Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
    _mm512_mask_store_ps(p, m, src);
}
    529 
//static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
//{
//    __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
//    return static_cast<uint64_t>(m);
//}

// Collect the sign bit of each 64-bit lane into bits [7:0] of the result
// (test against a sign-bit-only constant yields one mask bit per lane).
static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
{
    __mmask8 m = _mm512_test_epi64_mask(castpd_si(a), set1_epi64(0x8000000000000000LL));
    return static_cast<uint32_t>(m);
}
// Collect the sign bit of each 32-bit lane into bits [15:0] of the result.
static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
{
    __mmask16 m = _mm512_test_epi32_mask(castps_si(a), set1_epi32(0x80000000));
    return static_cast<uint32_t>(m);
}
    546 
// Broadcast a scalar to every lane of the corresponding element width.
static SIMDINLINE Integer SIMDCALL set1_epi64(long long i) // return i (all elements are same value)
{
    return _mm512_set1_epi64(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
{
    return _mm512_set1_epi32(i);
}

static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
{
    return _mm512_set1_epi8(i);
}

static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
{
    return _mm512_set1_ps(f);
}

static SIMDINLINE Double SIMDCALL setzero_pd()      // return 0 (double)
{
    return _mm512_setzero_pd();
}

static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
{
    return _mm512_setzero_ps();
}

static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
{
    return _mm512_setzero_si512();
}
    581 
// Aligned store: p must be 64-byte aligned.
static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
{
    _mm512_store_ps(p, a);
}

static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
{
    _mm512_store_si512(&p->v, a);
}

static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
{
    _mm512_storeu_si512(&p->v, a);
}

// Non-temporal store: bypasses the cache hierarchy for write-once data.
static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
{
    _mm512_stream_ps(p, a);
}
    601 
// Build a vector from 16 explicit lane values (highest lane listed first,
// matching the _mm512_set_* argument order).
static SIMDINLINE Integer SIMDCALL set_epi32(
    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return _mm512_set_epi32(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

// 8-lane (AVX2-width) convenience form: fills the low 8 lanes and zeroes
// the upper 8.
static SIMDINLINE Integer SIMDCALL set_epi32(
    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
{
    return set_epi32(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

static SIMDINLINE Float SIMDCALL set_ps(
    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return _mm512_set_ps(
        i15, i14, i13, i12, i11, i10, i9, i8,
        i7, i6, i5, i4, i3, i2, i1, i0);
}

// 8-lane convenience form: low 8 lanes set, upper 8 lanes zeroed.
static SIMDINLINE Float SIMDCALL set_ps(
    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
{
    return set_ps(
        0, 0, 0, 0, 0, 0, 0, 0,
        i7, i6, i5, i4, i3, i2, i1, i0);
}
    635 
    636 static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
    637 {
    638     return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
    639 }
    640 
    641 #undef SIMD_WRAPPER_1_
    642 #undef SIMD_WRAPPER_1
    643 #undef SIMD_WRAPPER_2
    644 #undef SIMD_WRAPPER_2_
    645 #undef SIMD_WRAPPERI_2_
    646 #undef SIMD_DWRAPPER_2
    647 #undef SIMD_DWRAPPER_2I
    648 #undef SIMD_WRAPPER_2I_
    649 #undef SIMD_WRAPPER_3_
    650 #undef SIMD_WRAPPER_2I
    651 #undef SIMD_WRAPPER_3
    652 #undef SIMD_IWRAPPER_1
    653 #undef SIMD_IWRAPPER_2
    654 #undef SIMD_IFWRAPPER_2
    655 #undef SIMD_IWRAPPER_2I
    656 #undef SIMD_IWRAPPER_1
    657 #undef SIMD_IWRAPPER_1I
    658 #undef SIMD_IWRAPPER_1I_
    659 #undef SIMD_IWRAPPER_2
    660 #undef SIMD_IWRAPPER_2_
    661 #undef SIMD_IWRAPPER_2I
    662 
    663