      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 ****************************************************************************/
     23 
     24 #ifndef __SWR_SIMDINTRIN_H__
     25 #define __SWR_SIMDINTRIN_H__
     26 
     27 #include "os.h"
     28 
     29 #include <cassert>
     30 
     31 #include <emmintrin.h>
     32 #include <immintrin.h>
     33 #include <xmmintrin.h>
     34 
     35 #if KNOB_SIMD_WIDTH == 8
     36 typedef __m256 simdscalar;
     37 typedef __m256i simdscalari;
     38 typedef uint8_t simdmask;
     39 #else
     40 #error Unsupported vector width
     41 #endif
     42 
     43 // simd vector
     44 OSALIGNSIMD(union) simdvector
     45 {
     46     simdscalar  v[4];
     47     struct
     48     {
     49         simdscalar x, y, z, w;
     50     };
     51 
     52     simdscalar& operator[] (const int i) { return v[i]; }
     53     const simdscalar& operator[] (const int i) const { return v[i]; }
     54 };
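        // Illustrative note (not part of the original header): simdvector uses an
        // SoA layout -- one simdscalar per component, so with KNOB_SIMD_WIDTH == 8
        // v.x holds the x coordinates of 8 vertices, v.y their y coordinates, etc.
        // A hypothetical scale-by-constant helper might look like:
        //
        //     INLINE void simdvec_scale(simdvector& out, const simdvector& in, float f)
        //     {
        //         simdscalar vf = _simd_set1_ps(f);      // broadcast f to every lane
        //         for (int i = 0; i < 4; ++i)
        //             out[i] = _simd_mul_ps(in[i], vf);  // scale the x, y, z, w planes
        //     }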
     55 
     56 #if KNOB_SIMD_WIDTH == 8
     57 #define _simd128_maskstore_ps _mm_maskstore_ps
     58 #define _simd_load_ps _mm256_load_ps
     59 #define _simd_load1_ps _mm256_broadcast_ss
     60 #define _simd_loadu_ps _mm256_loadu_ps
     61 #define _simd_setzero_ps _mm256_setzero_ps
     62 #define _simd_set1_ps   _mm256_set1_ps
     63 #define _simd_blend_ps  _mm256_blend_ps
     64 #define _simd_blendv_ps _mm256_blendv_ps
     65 #define _simd_store_ps _mm256_store_ps
     66 #define _simd_mul_ps _mm256_mul_ps
     67 #define _simd_add_ps _mm256_add_ps
     68 #define _simd_sub_ps _mm256_sub_ps
     69 #define _simd_rsqrt_ps _mm256_rsqrt_ps
     70 #define _simd_min_ps _mm256_min_ps
     71 #define _simd_max_ps _mm256_max_ps
     72 #define _simd_movemask_ps _mm256_movemask_ps
     73 #define _simd_cvtps_epi32 _mm256_cvtps_epi32
     74 #define _simd_cvttps_epi32 _mm256_cvttps_epi32
     75 #define _simd_cvtepi32_ps _mm256_cvtepi32_ps
     76 #define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
     77 #define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
     78 #define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
     79 #define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
     80 #define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
     81 #define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
     82 #define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
     83 #define _simd_and_ps _mm256_and_ps
     84 #define _simd_or_ps _mm256_or_ps
     85 
     86 #define _simd_rcp_ps _mm256_rcp_ps
     87 #define _simd_div_ps _mm256_div_ps
     88 #define _simd_castsi_ps _mm256_castsi256_ps
     89 #define _simd_andnot_ps _mm256_andnot_ps
     90 #define _simd_round_ps _mm256_round_ps
     91 #define _simd_castpd_ps _mm256_castpd_ps
     92 #define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
     93 #define _simd_stream_ps _mm256_stream_ps
     94 
     95 #define _simd_load_sd _mm256_load_sd
     96 #define _simd_movemask_pd _mm256_movemask_pd
     97 #define _simd_castsi_pd _mm256_castsi256_pd
     98 
     99 // emulated integer simd
    100 #define SIMD_EMU_EPI(func, intrin) \
    101 INLINE \
    102 __m256i func(__m256i a, __m256i b)\
    103 {\
    104     __m128i aHi = _mm256_extractf128_si256(a, 1);\
    105     __m128i bHi = _mm256_extractf128_si256(b, 1);\
    106     __m128i aLo = _mm256_castsi256_si128(a);\
    107     __m128i bLo = _mm256_castsi256_si128(b);\
    108 \
    109     __m128i subLo = intrin(aLo, bLo);\
    110     __m128i subHi = intrin(aHi, bHi);\
    111 \
    112     __m256i result = _mm256_castsi128_si256(subLo);\
    113             result = _mm256_insertf128_si256(result, subHi, 1);\
    114 \
    115     return result;\
    116 }
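        // Illustrative note (assumption about intent, not original text): each
        // SIMD_EMU_EPI(func, intrin) instantiation below builds a 256-bit integer op
        // for AVX1 targets by applying the 128-bit SSE 'intrin' to the low and high
        // lanes separately and recombining the halves.  For example,
        //
        //     SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
        //
        // expands to an INLINE __m256i _simdemu_add_epi32(__m256i a, __m256i b)
        // that performs a lane-wise 32-bit add.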
    117 
    118 #if (KNOB_ARCH == KNOB_ARCH_AVX)
    119 INLINE
    120 __m256 _simdemu_permute_ps(__m256 a, __m256i b)
    121 {
    122     __m128 aHi = _mm256_extractf128_ps(a, 1);
    123     __m128i bHi = _mm256_extractf128_si256(b, 1);
    124     __m128 aLo = _mm256_castps256_ps128(a);
    125     __m128i bLo = _mm256_castsi256_si128(b);
    126 
    127     __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
    128     __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
    129     __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
    130     __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
    131 
    132     indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
    133     resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
    134     resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
    135     __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
    136 
    137     __m256 result = _mm256_castps128_ps256(blendLowRes);
    138     result = _mm256_insertf128_ps(result, blendHiRes, 1);
    139 
    140     return result;
    141 }
    142 
    143 INLINE
    144 __m256i _simdemu_permute_epi32(__m256i a, __m256i b)
    145 {
    146     return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
    147 }
    148 
    149 INLINE
    150 __m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
    151 {
    152     int32_t aHi, aLow, countHi, countLow;
    153     __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
    154     __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
    155     __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
    156     __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
    157 
    158     aHi = _mm_extract_epi32(vAHi, 0);
    159     countHi = _mm_extract_epi32(vCountHi, 0);
    160     aHi >>= countHi;
    161     vAHi = _mm_insert_epi32(vAHi, aHi, 0);
    162 
    163     aLow = _mm_extract_epi32(vALow, 0);
    164     countLow = _mm_extract_epi32(vCountLow, 0);
    165     aLow >>= countLow;
    166     vALow = _mm_insert_epi32(vALow, aLow, 0);
    167 
    168     aHi = _mm_extract_epi32(vAHi, 1);
    169     countHi = _mm_extract_epi32(vCountHi, 1);
    170     aHi >>= countHi;
    171     vAHi = _mm_insert_epi32(vAHi, aHi, 1);
    172 
    173     aLow = _mm_extract_epi32(vALow, 1);
    174     countLow = _mm_extract_epi32(vCountLow, 1);
    175     aLow >>= countLow;
    176     vALow = _mm_insert_epi32(vALow, aLow, 1);
    177 
    178     aHi = _mm_extract_epi32(vAHi, 2);
    179     countHi = _mm_extract_epi32(vCountHi, 2);
    180     aHi >>= countHi;
    181     vAHi = _mm_insert_epi32(vAHi, aHi, 2);
    182 
    183     aLow = _mm_extract_epi32(vALow, 2);
    184     countLow = _mm_extract_epi32(vCountLow, 2);
    185     aLow >>= countLow;
    186     vALow = _mm_insert_epi32(vALow, aLow, 2);
    187 
    188     aHi = _mm_extract_epi32(vAHi, 3);
    189     countHi = _mm_extract_epi32(vCountHi, 3);
    190     aHi >>= countHi;
    191     vAHi = _mm_insert_epi32(vAHi, aHi, 3);
    192 
    193     aLow = _mm_extract_epi32(vALow, 3);
    194     countLow = _mm_extract_epi32(vCountLow, 3);
    195     aLow >>= countLow;
    196     vALow = _mm_insert_epi32(vALow, aLow, 3);
    197 
    198     __m256i ret = _mm256_set1_epi32(0);
    199     ret = _mm256_insertf128_si256(ret, vAHi, 1);
    200     ret = _mm256_insertf128_si256(ret, vALow, 0);
    201     return ret;
    202 }
    203 
    204 
    205 INLINE
    206 __m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
    207 {
    208     int32_t aHi, aLow, countHi, countLow;
    209     __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
    210     __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
    211     __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
    212     __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
    213 
    214     aHi = _mm_extract_epi32(vAHi, 0);
    215     countHi = _mm_extract_epi32(vCountHi, 0);
    216     aHi <<= countHi;
    217     vAHi = _mm_insert_epi32(vAHi, aHi, 0);
    218 
    219     aLow = _mm_extract_epi32(vALow, 0);
    220     countLow = _mm_extract_epi32(vCountLow, 0);
    221     aLow <<= countLow;
    222     vALow = _mm_insert_epi32(vALow, aLow, 0);
    223 
    224     aHi = _mm_extract_epi32(vAHi, 1);
    225     countHi = _mm_extract_epi32(vCountHi, 1);
    226     aHi <<= countHi;
    227     vAHi = _mm_insert_epi32(vAHi, aHi, 1);
    228 
    229     aLow = _mm_extract_epi32(vALow, 1);
    230     countLow = _mm_extract_epi32(vCountLow, 1);
    231     aLow <<= countLow;
    232     vALow = _mm_insert_epi32(vALow, aLow, 1);
    233 
    234     aHi = _mm_extract_epi32(vAHi, 2);
    235     countHi = _mm_extract_epi32(vCountHi, 2);
    236     aHi <<= countHi;
    237     vAHi = _mm_insert_epi32(vAHi, aHi, 2);
    238 
    239     aLow = _mm_extract_epi32(vALow, 2);
    240     countLow = _mm_extract_epi32(vCountLow, 2);
    241     aLow <<= countLow;
    242     vALow = _mm_insert_epi32(vALow, aLow, 2);
    243 
    244     aHi = _mm_extract_epi32(vAHi, 3);
    245     countHi = _mm_extract_epi32(vCountHi, 3);
    246     aHi <<= countHi;
    247     vAHi = _mm_insert_epi32(vAHi, aHi, 3);
    248 
    249     aLow = _mm_extract_epi32(vALow, 3);
    250     countLow = _mm_extract_epi32(vCountLow, 3);
    251     aLow <<= countLow;
    252     vALow = _mm_insert_epi32(vALow, aLow, 3);
    253 
    254     __m256i ret = _mm256_set1_epi32(0);
    255     ret = _mm256_insertf128_si256(ret, vAHi, 1);
    256     ret = _mm256_insertf128_si256(ret, vALow, 0);
    257     return ret;
    258 }
    259 
    260 #define _simd_mul_epi32 _simdemu_mul_epi32
    261 #define _simd_mullo_epi32 _simdemu_mullo_epi32
    262 #define _simd_sub_epi32 _simdemu_sub_epi32
    263 #define _simd_sub_epi64 _simdemu_sub_epi64
    264 #define _simd_min_epi32 _simdemu_min_epi32
    265 #define _simd_min_epu32 _simdemu_min_epu32
    266 #define _simd_max_epi32 _simdemu_max_epi32
    267 #define _simd_max_epu32 _simdemu_max_epu32
    268 #define _simd_add_epi32 _simdemu_add_epi32
    269 #define _simd_and_si _simdemu_and_si
    270 #define _simd_andnot_si _simdemu_andnot_si
    271 #define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
    272 #define _simd_cmplt_epi32 _simdemu_cmplt_epi32
    273 #define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
    274 #define _simd_or_si _simdemu_or_si
    275 #define _simd_xor_si _simdemu_xor_si
    276 #define _simd_castps_si _mm256_castps_si256
    277 #define _simd_adds_epu8 _simdemu_adds_epu8
    278 #define _simd_subs_epu8 _simdemu_subs_epu8
    279 #define _simd_add_epi8 _simdemu_add_epi8
    280 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
    281 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
    282 #define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
    283 #define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
    284 #define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
    285 #define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
    286 #define _simd_movemask_epi8 _simdemu_movemask_epi8
    287 #define _simd_permute_ps _simdemu_permute_ps
    288 #define _simd_permute_epi32 _simdemu_permute_epi32
    289 #define _simd_srlv_epi32 _simdemu_srlv_epi32
    290 #define _simd_sllv_epi32 _simdemu_sllv_epi32
    291 
    292 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
    293 SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
    294 SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
    295 SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
    296 SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
    297 SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
    298 SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
    299 SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
    300 SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
    301 SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
    302 SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
    303 SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
    304 SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
    305 SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
    306 SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
    307 SIMD_EMU_EPI(_simdemu_xor_si, _mm_xor_si128)
    308 SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
    309 SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
    310 SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
    311 SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
    312 SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
    313 SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
    314 SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
    315 SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
    316 SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
    317 SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
    318 SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
    319 SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
    320 SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)
    321 
    322 #define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
    323 #define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
    324 #define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
    325 #define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
    326 #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
    327 #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
    328 #define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
    329 #define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
    330 
    331 #define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
    332 #define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
    333 #define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
    334 #define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
    335 
    336 #define _simd128_fmadd_ps _mm_fmaddemu_ps
    337 #define _simd_fmadd_ps _mm_fmaddemu256_ps
    338 #define _simd_fmsub_ps _mm_fmsubemu256_ps
    339 #define _simd_shuffle_epi8 _simdemu_shuffle_epi8
    340 SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
    341 
    342 INLINE
    343 __m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
    344 {
    345     __m128 res = _mm_mul_ps(a, b);
    346     res = _mm_add_ps(res, c);
    347     return res;
    348 }
    349 
    350 INLINE
    351 __m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
    352 {
    353     __m256 res = _mm256_mul_ps(a, b);
    354     res = _mm256_add_ps(res, c);
    355     return res;
    356 }
    357 
    358 INLINE
    359 __m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
    360 {
    361     __m256 res = _mm256_mul_ps(a, b);
    362     res = _mm256_sub_ps(res, c);
    363     return res;
    364 }
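        // Illustrative note (assumption, not original text): unlike true FMA
        // instructions, the emulated forms above round the intermediate product
        // before the add/subtract, so results can differ from hardware
        // _mm256_fmadd_ps/_mm256_fmsub_ps in the last ULP.  Minimal usage sketch:
        //
        //     simdscalar r = _simd_fmadd_ps(a, b, c);   // r = a * b + c (two roundings here)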
    365 
    366 INLINE
    367 __m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
    368 {
    369     uint32_t *pOffsets = (uint32_t*)&vOffsets;
    370     simdscalar vResult;
    371     float* pResult = (float*)&vResult;
    372     for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
    373     {
    374         uint32_t offset = pOffsets[i];
    375         offset = offset * scale;
    376         pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
    377     }
    378 
    379     return vResult;
    380 }
    381 
    382 INLINE
    383 __m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
    384 {
    385     uint32_t *pOffsets = (uint32_t*)&vOffsets;
    386     simdscalar vResult = vSrc;
    387     float* pResult = (float*)&vResult;
    388     DWORD index;
    389     uint32_t mask = _simd_movemask_ps(vMask);
    390     while (_BitScanForward(&index, mask))
    391     {
    392         mask &= ~(1 << index);
    393         uint32_t offset = pOffsets[index];
    394         offset = offset * scale;
    395         pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
    396     }
    397 
    398     return vResult;
    399 }
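        // Illustrative usage sketch (hypothetical variable names, not original text):
        // gather 8 floats from pBase at byte offsets vOffsets * 1; only lanes whose
        // sign bit is set in vLaneMask are fetched, the rest keep their value from vOld.
        //
        //     __m256 vGathered = _simd_mask_i32gather_ps(vOld, pBase, vOffsets, vLaneMask, 1);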
    400 
    401 INLINE
    402 __m256i _simd_abs_epi32(__m256i a)
    403 {
    404         __m128i aHi = _mm256_extractf128_si256(a, 1);
    405         __m128i aLo = _mm256_castsi256_si128(a);
    406         __m128i absLo = _mm_abs_epi32(aLo);
    407         __m128i absHi = _mm_abs_epi32(aHi);
    408         __m256i result = _mm256_castsi128_si256(absLo);
    409         result = _mm256_insertf128_si256(result, absHi, 1);
    410         return result;
    411 }
    412 
    413 INLINE
    414 int _simdemu_movemask_epi8(__m256i a)
    415 {
    416     __m128i aHi = _mm256_extractf128_si256(a, 1);
    417     __m128i aLo = _mm256_castsi256_si128(a);
    418 
    419     int resHi = _mm_movemask_epi8(aHi);
    420     int resLo = _mm_movemask_epi8(aLo);
    421 
    422     return (resHi << 16) | resLo;
    423 }
    424 
    425 INLINE
    426 __m256i _simd_cvtepu8_epi16(__m128i a)
    427 {
    428     __m128i resultlo = _mm_cvtepu8_epi16(a);
    429     __m128i resulthi = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
    430 
    431     __m256i result = _mm256_castsi128_si256(resultlo);
    432 
    433     return _mm256_insertf128_si256(result, resulthi, 1);
    434 }
    435 
    436 INLINE
    437 __m256i _simd_cvtepu8_epi32(__m128i a)
    438 {
    439     __m128i resultlo = _mm_cvtepu8_epi32(a);
    440     __m128i resulthi = _mm_cvtepu8_epi32(_mm_srli_si128(a, 4));
    441 
    442     __m256i result = _mm256_castsi128_si256(resultlo);
    443 
    444     return _mm256_insertf128_si256(result, resulthi, 1);
    445 }
    446 
    447 INLINE
    448 __m256i _simd_cvtepu16_epi32(__m128i a)
    449 {
    450     __m128i resultlo = _mm_cvtepu16_epi32(a);
    451     __m128i resulthi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
    452 
    453     __m256i result = _mm256_castsi128_si256(resultlo);
    454 
    455     return _mm256_insertf128_si256(result, resulthi, 1);
    456 }
    457 
    458 INLINE
    459 __m256i _simd_packus_epi16(__m256i a, __m256i b)
    460 {
    461     __m128i alo = _mm256_extractf128_si256(a, 0);
    462     __m128i ahi = _mm256_extractf128_si256(a, 1);
    463 
    464     __m128i blo = _mm256_extractf128_si256(b, 0);
    465     __m128i bhi = _mm256_extractf128_si256(b, 1);
    466 
    467     __m128i resultlo = _mm_packus_epi16(alo, blo);
    468     __m128i resulthi = _mm_packus_epi16(ahi, bhi);
    469 
    470     __m256i result = _mm256_castsi128_si256(resultlo);
    471 
    472     return _mm256_insertf128_si256(result, resulthi, 1);
    473 }
    474 
    475 INLINE
    476 __m256i _simd_packs_epi16(__m256i a, __m256i b)
    477 {
    478     __m128i alo = _mm256_extractf128_si256(a, 0);
    479     __m128i ahi = _mm256_extractf128_si256(a, 1);
    480 
    481     __m128i blo = _mm256_extractf128_si256(b, 0);
    482     __m128i bhi = _mm256_extractf128_si256(b, 1);
    483 
    484     __m128i resultlo = _mm_packs_epi16(alo, blo);
    485     __m128i resulthi = _mm_packs_epi16(ahi, bhi);
    486 
    487     __m256i result = _mm256_castsi128_si256(resultlo);
    488 
    489     return _mm256_insertf128_si256(result, resulthi, 1);
    490 }
    491 
    492 INLINE
    493 __m256i _simd_packus_epi32(__m256i a, __m256i b)
    494 {
    495     __m128i alo = _mm256_extractf128_si256(a, 0);
    496     __m128i ahi = _mm256_extractf128_si256(a, 1);
    497 
    498     __m128i blo = _mm256_extractf128_si256(b, 0);
    499     __m128i bhi = _mm256_extractf128_si256(b, 1);
    500 
    501     __m128i resultlo = _mm_packus_epi32(alo, blo);
    502     __m128i resulthi = _mm_packus_epi32(ahi, bhi);
    503 
    504     __m256i result = _mm256_castsi128_si256(resultlo);
    505 
    506     return _mm256_insertf128_si256(result, resulthi, 1);
    507 }
    508 
    509 INLINE
    510 __m256i _simd_packs_epi32(__m256i a, __m256i b)
    511 {
    512     __m128i alo = _mm256_extractf128_si256(a, 0);
    513     __m128i ahi = _mm256_extractf128_si256(a, 1);
    514 
    515     __m128i blo = _mm256_extractf128_si256(b, 0);
    516     __m128i bhi = _mm256_extractf128_si256(b, 1);
    517 
    518     __m128i resultlo = _mm_packs_epi32(alo, blo);
    519     __m128i resulthi = _mm_packs_epi32(ahi, bhi);
    520 
    521     __m256i result = _mm256_castsi128_si256(resultlo);
    522 
    523     return _mm256_insertf128_si256(result, resulthi, 1);
    524 }
    525 
    526 #else
    527 
    528 #define _simd_mul_epi32 _mm256_mul_epi32
    529 #define _simd_mullo_epi32 _mm256_mullo_epi32
    530 #define _simd_sub_epi32 _mm256_sub_epi32
    531 #define _simd_sub_epi64 _mm256_sub_epi64
    532 #define _simd_min_epi32 _mm256_min_epi32
    533 #define _simd_max_epi32 _mm256_max_epi32
    534 #define _simd_min_epu32 _mm256_min_epu32
    535 #define _simd_max_epu32 _mm256_max_epu32
    536 #define _simd_add_epi32 _mm256_add_epi32
    537 #define _simd_and_si _mm256_and_si256
    538 #define _simd_andnot_si _mm256_andnot_si256
    539 #define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
    540 #define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
    541 #define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
    542 #define _simd_or_si _mm256_or_si256
    543 #define _simd_xor_si _mm256_xor_si256
    544 #define _simd_castps_si _mm256_castps_si256
    545 
    546 #define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
    547 #define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
    548 #define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
    549 #define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
    550 #define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
    551 #define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
    552 #define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
    553 #define _simd_unpackhi_epi64 _mm256_unpackhi_epi64
    554 
    555 #define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
    556 #define _simd_slli_epi32 _mm256_slli_epi32
    557 #define _simd_srai_epi32 _mm256_srai_epi32
    558 #define _simd_srli_epi32 _mm256_srli_epi32
    559 #define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
    560 #define _simd128_fmadd_ps _mm_fmadd_ps
    561 #define _simd_fmadd_ps _mm256_fmadd_ps
    562 #define _simd_fmsub_ps _mm256_fmsub_ps
    563 #define _simd_shuffle_epi8 _mm256_shuffle_epi8
    564 #define _simd_adds_epu8 _mm256_adds_epu8
    565 #define _simd_subs_epu8 _mm256_subs_epu8
    566 #define _simd_add_epi8 _mm256_add_epi8
    567 #define _simd_i32gather_ps _mm256_i32gather_ps
    568 #define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
    569 #define _simd_abs_epi32 _mm256_abs_epi32
    570 
    571 #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
    572 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
    573 #define _simd_cmpgt_epi8  _mm256_cmpgt_epi8
    574 #define _simd_cmpeq_epi8  _mm256_cmpeq_epi8
    575 #define _simd_cmpgt_epi16  _mm256_cmpgt_epi16
    576 #define _simd_cmpeq_epi16  _mm256_cmpeq_epi16
    577 #define _simd_movemask_epi8 _mm256_movemask_epi8
    578 #define _simd_permute_ps _mm256_permutevar8x32_ps
    579 #define _simd_permute_epi32 _mm256_permutevar8x32_epi32
    580 #define _simd_srlv_epi32 _mm256_srlv_epi32
    581 #define _simd_sllv_epi32 _mm256_sllv_epi32
    582 #define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16
    583 #define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
    584 #define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
    585 #define _simd_packus_epi16 _mm256_packus_epi16
    586 #define _simd_packs_epi16 _mm256_packs_epi16
    587 #define _simd_packus_epi32 _mm256_packus_epi32
    588 #define _simd_packs_epi32 _mm256_packs_epi32
    589 
    590 #endif
    591 
    592 #define _simd_unpacklo_ps _mm256_unpacklo_ps
    593 #define _simd_unpackhi_ps _mm256_unpackhi_ps
    594 #define _simd_unpacklo_pd _mm256_unpacklo_pd
    595 #define _simd_unpackhi_pd _mm256_unpackhi_pd
    596 #define _simd_insertf128_ps _mm256_insertf128_ps
    597 #define _simd_insertf128_pd _mm256_insertf128_pd
    598 #define _simd_insertf128_si _mm256_insertf128_si256
    599 #define _simd_extractf128_ps _mm256_extractf128_ps
    600 #define _simd_extractf128_pd _mm256_extractf128_pd
    601 #define _simd_extractf128_si _mm256_extractf128_si256
    602 #define _simd_permute2f128_ps _mm256_permute2f128_ps
    603 #define _simd_permute2f128_pd _mm256_permute2f128_pd
    604 #define _simd_permute2f128_si _mm256_permute2f128_si256
    605 #define _simd_shuffle_ps _mm256_shuffle_ps
    606 #define _simd_shuffle_pd _mm256_shuffle_pd
    607 #define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
    608 #define _simd_shuffle_epi64(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
    609 #define _simd_set1_epi32 _mm256_set1_epi32
    610 #define _simd_set_epi32 _mm256_set_epi32
    611 #define _simd_set1_epi8 _mm256_set1_epi8
    612 #define _simd_setzero_si _mm256_setzero_si256
    613 #define _simd_cvttps_epi32 _mm256_cvttps_epi32
    614 #define _simd_store_si _mm256_store_si256
    615 #define _simd_broadcast_ss _mm256_broadcast_ss
    616 #define _simd_maskstore_ps _mm256_maskstore_ps
    617 #define _simd_load_si _mm256_load_si256
    618 #define _simd_loadu_si _mm256_loadu_si256
    619 #define _simd_sub_ps _mm256_sub_ps
    620 #define _simd_testz_ps _mm256_testz_ps
    621 #define _simd_xor_ps _mm256_xor_ps
    622 
    623 INLINE
    624 simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
    625 {
    626     __m128i lo = _mm_loadu_si128(loaddr);
    627     __m128i hi = _mm_loadu_si128(hiaddr);
    628 
    629     return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
    630 }
    631 
    632 INLINE
    633 void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
    634 {
    635     _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
    636     _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
    637 }
    638 
    639 INLINE
    640 simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
    641 {
    642     return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
    643 }
    644 
    645 INLINE
    646 simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
    647 {
    648     return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
    649 }
    650 
    651 // convert bitmask to vector mask
    652 INLINE
    653 simdscalar vMask(int32_t mask)
    654 {
    655     __m256i vec = _mm256_set1_epi32(mask);
    656     const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
    657     vec = _simd_and_si(vec, bit);
    658     vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
    659     return _simd_castsi_ps(vec);
    660 }
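        // Illustrative sketch (hypothetical helper, not part of the original API):
        // use vMask() to turn a scalar coverage bitmask into a per-lane select.
        INLINE simdscalar _simd_example_select_by_bitmask(int32_t mask, simdscalar a, simdscalar b)
        {
            simdscalar vM = vMask(mask);        // lane i = all ones if bit i of mask is set
            return _simd_blendv_ps(a, b, vM);   // take b where the bit was set, else a
        }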
    661 
    662 INLINE
    663 void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
    664 {
    665     OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
    666     _mm256_store_ps(rArray, r);
    667     _mm256_store_ps(sArray, s);
    668     rArray[rlane] = sArray[slane];
    669     r = _mm256_load_ps(rArray);
    670 }
    671 
    672 INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
    673 {
    674     __m128i aHi = _mm256_extractf128_si256(a, 1);
    675     __m128i aLo = _mm256_castsi256_si128(a);
    676 
    677     __m128i resHi = _mm_slli_epi32(aHi, i);
    678     __m128i resLo = _mm_slli_epi32(aLo, i);
    679 
    680     __m256i result = _mm256_castsi128_si256(resLo);
    681             result = _mm256_insertf128_si256(result, resHi, 1);
    682 
    683     return result;
    684 }
    685 
    686 INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
    687 {
    688     __m128i aHi = _mm256_extractf128_si256(a, 1);
    689     __m128i aLo = _mm256_castsi256_si128(a);
    690 
    691     __m128i resHi = _mm_srai_epi32(aHi, i);
    692     __m128i resLo = _mm_srai_epi32(aLo, i);
    693 
    694     __m256i result = _mm256_castsi128_si256(resLo);
    695             result = _mm256_insertf128_si256(result, resHi, 1);
    696 
    697     return result;
    698 }
    699 
    700 INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
    701 {
    702     __m128i aHi = _mm256_extractf128_si256(a, 1);
    703     __m128i aLo = _mm256_castsi256_si128(a);
    704 
    705     __m128i resHi = _mm_srli_epi32(aHi, i);
    706     __m128i resLo = _mm_srli_epi32(aLo, i);
    707 
    708     __m256i result = _mm256_castsi128_si256(resLo);
    709     result = _mm256_insertf128_si256(result, resHi, 1);
    710 
    711     return result;
    712 }
    713 
    714 INLINE
    715 void _simdvec_transpose(simdvector &v)
    716 {
    717     SWR_ASSERT(false, "Need to implement 8 wide version");
    718 }
    719 
    720 #else
    721 #error Unsupported vector width
    722 #endif
    723 
    724 // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
    725 INLINE
    726 void _simdvec_load_ps(simdvector& r, const float *p)
    727 {
    728     r[0] = _simd_set1_ps(p[0]);
    729     r[1] = _simd_set1_ps(p[1]);
    730     r[2] = _simd_set1_ps(p[2]);
    731     r[3] = _simd_set1_ps(p[3]);
    732 }
    733 
    734 INLINE
    735 void _simdvec_mov(simdvector& r, const simdscalar& s)
    736 {
    737     r[0] = s;
    738     r[1] = s;
    739     r[2] = s;
    740     r[3] = s;
    741 }
    742 
    743 INLINE
    744 void _simdvec_mov(simdvector& r, const simdvector& v)
    745 {
    746     r[0] = v[0];
    747     r[1] = v[1];
    748     r[2] = v[2];
    749     r[3] = v[3];
    750 }
    751 
    752 #if 0
    753 // just move a lane from the source simdvector to dest simdvector
    754 INLINE
    755 void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
    756 {
    757     _simd_mov(r[0], rlane, s[0], slane);
    758     _simd_mov(r[1], rlane, s[1], slane);
    759     _simd_mov(r[2], rlane, s[2], slane);
    760     _simd_mov(r[3], rlane, s[3], slane);
    761 }
    762 
    763 #endif
    764 INLINE
    765 void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
    766 {
    767     simdscalar tmp;
    768     r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
    769 
    770     tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
    771     r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
    772 
    773     tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
    774     r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
    775 }
    776 
    777 INLINE
    778 void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
    779 {
    780     simdscalar tmp;
    781     r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
    782 
    783     tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
    784     r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
    785 
    786     tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
    787     r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
    788 
    789     tmp = _simd_mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
    790     r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
    791 }
    792 
    793 INLINE
    794 simdscalar _simdvec_rcp_length_ps(const simdvector& v)
    795 {
    796     simdscalar length;
    797     _simdvec_dp4_ps(length, v, v);
    798     return _simd_rsqrt_ps(length);
    799 }
    800 
    801 INLINE
    802 void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
    803 {
    804     simdscalar vecLength;
    805     vecLength = _simdvec_rcp_length_ps(v);
    806 
    807     r[0] = _simd_mul_ps(v[0], vecLength);
    808     r[1] = _simd_mul_ps(v[1], vecLength);
    809     r[2] = _simd_mul_ps(v[2], vecLength);
    810     r[3] = _simd_mul_ps(v[3], vecLength);
    811 }
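        // Illustrative sketch (hypothetical helper, not part of the original API):
        // broadcast a single xyzw vector across all SIMD lanes and normalize it.
        INLINE void _simdvec_example_normalize_broadcast(simdvector& r, const float xyzw[4])
        {
            simdvector v;
            _simdvec_load_ps(v, xyzw);       // v = xxxx yyyy zzzz wwww
            _simdvec_normalize_ps(r, v);     // r = v * rsqrt(dot4(v, v)) per lane
        }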
    812 
    813 INLINE
    814 void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
    815 {
    816     r[0] = _simd_mul_ps(v[0], s);
    817     r[1] = _simd_mul_ps(v[1], s);
    818     r[2] = _simd_mul_ps(v[2], s);
    819     r[3] = _simd_mul_ps(v[3], s);
    820 }
    821 
    822 INLINE
    823 void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
    824 {
    825     r[0] = _simd_mul_ps(v0[0], v1[0]);
    826     r[1] = _simd_mul_ps(v0[1], v1[1]);
    827     r[2] = _simd_mul_ps(v0[2], v1[2]);
    828     r[3] = _simd_mul_ps(v0[3], v1[3]);
    829 }
    830 
    831 INLINE
    832 void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
    833 {
    834     r[0] = _simd_add_ps(v0[0], v1[0]);
    835     r[1] = _simd_add_ps(v0[1], v1[1]);
    836     r[2] = _simd_add_ps(v0[2], v1[2]);
    837     r[3] = _simd_add_ps(v0[3], v1[3]);
    838 }
    839 
    840 INLINE
    841 void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
    842 {
    843     r[0] = _simd_min_ps(v0[0], s);
    844     r[1] = _simd_min_ps(v0[1], s);
    845     r[2] = _simd_min_ps(v0[2], s);
    846     r[3] = _simd_min_ps(v0[3], s);
    847 }
    848 
    849 INLINE
    850 void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
    851 {
    852     r[0] = _simd_max_ps(v0[0], s);
    853     r[1] = _simd_max_ps(v0[1], s);
    854     r[2] = _simd_max_ps(v0[2], s);
    855     r[3] = _simd_max_ps(v0[3], s);
    856 }
    857 
    858 // Matrix4x4 * Vector4
    859 //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
    860 //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
    861 //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
    862 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
    863 INLINE
    864 void _simd_mat4x4_vec4_multiply(
    865     simdvector& result,
    866     const float *pMatrix,
    867     const simdvector& v)
    868 {
    869     simdscalar m;
    870     simdscalar r0;
    871     simdscalar r1;
    872 
    873     m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
    874     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
    875     m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
    876     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
    877     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
    878     m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
    879     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
    880     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    881     m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
    882     r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.w)
    883     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    884     result[0] = r0;
    885 
    886     m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
    887     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
    888     m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
    889     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
    890     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
    891     m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
    892     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
    893     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    894     m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
    895     r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.w)
    896     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    897     result[1] = r0;
    898 
    899     m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
    900     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
    901     m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
    902     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
    903     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
    904     m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
    905     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
    906     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    907     m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
    908     r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.w)
    909     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    910     result[2] = r0;
    911 
    912     m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
    913     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
    914     m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
    915     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
    916     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
    917     m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
    918     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
    919     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    920     m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
    921     r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.w)
    922     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
    923     result[3] = r0;
    924 }
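        // Illustrative sketch (hypothetical helper, not part of the original API):
        // transform a single xyzw position, broadcast across all SIMD lanes, by a
        // 4x4 matrix stored as four rows of four floats.
        INLINE void _simd_example_transform_point(simdvector& clip, const float *pMatrix, const float xyzw[4])
        {
            simdvector pos;
            _simdvec_load_ps(pos, xyzw);                      // pos = xxxx yyyy zzzz wwww
            _simd_mat4x4_vec4_multiply(clip, pMatrix, pos);   // clip = M * pos in every lane
        }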
    925 
    926 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
    927 //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
    928 //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
    929 //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
    930 //   outVec.w = 0
    931 INLINE
    932 void _simd_mat3x3_vec3_w0_multiply(
    933     simdvector& result,
    934     const float *pMatrix,
    935     const simdvector& v)
    936 {
    937     simdscalar m;
    938     simdscalar r0;
    939     simdscalar r1;
    940 
    941     m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
    942     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
    943     m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
    944     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
    945     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
    946     m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
    947     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
    948     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    949     result[0] = r0;
    950 
    951     m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
    952     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
    953     m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
    954     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
    955     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
    956     m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
    957     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
    958     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    959     result[1] = r0;
    960 
    961     m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
    962     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
    963     m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
    964     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
    965     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
    966     m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
    967     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
    968     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    969     result[2] = r0;
    970 
    971     result[3] = _simd_setzero_ps();
    972 }
    973 
    974 // Matrix4x4 * Vector3 - Position vector where w = 1.
    975 //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
    976 //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
    977 //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
    978 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
    979 INLINE
    980 void _simd_mat4x4_vec3_w1_multiply(
    981     simdvector& result,
    982     const float *pMatrix,
    983     const simdvector& v)
    984 {
    985     simdscalar m;
    986     simdscalar r0;
    987     simdscalar r1;
    988 
    989     m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
    990     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
    991     m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
    992     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
    993     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
    994     m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
    995     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
    996     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
    997     m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
    998     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
    999     result[0] = r0;
   1000 
   1001     m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
   1002     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
   1003     m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
   1004     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
   1005     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
   1006     m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
   1007     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
   1008     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
   1009     m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
   1010     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
   1011     result[1] = r0;
   1012 
   1013     m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
   1014     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
   1015     m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
   1016     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
   1017     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
   1018     m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
   1019     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
   1020     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
   1021     m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
   1022     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
   1023     result[2] = r0;
   1024 
   1025     m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
   1026     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
   1027     m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
   1028     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
   1029     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
   1030     m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
   1031     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
   1032     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
   1033     m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
   1034     result[3]   = _simd_add_ps(r0, m);          // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
   1035 }
   1036 
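        // Matrix4x3 * Vector3 - Position vector where w = 1; the matrix carries no
        // projective row, so:
        //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + m03
        //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + m13
        //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + m23
        //   outVec.w = 1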
   1037 INLINE
   1038 void _simd_mat4x3_vec3_w1_multiply(
   1039     simdvector& result,
   1040     const float *pMatrix,
   1041     const simdvector& v)
   1042 {
   1043     simdscalar m;
   1044     simdscalar r0;
   1045     simdscalar r1;
   1046 
   1047     m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
   1048     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
   1049     m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
   1050     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
   1051     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
   1052     m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
   1053     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
   1054     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
   1055     m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
   1056     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
   1057     result[0] = r0;
   1058 
   1059     m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
   1060     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
   1061     m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
   1062     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
   1063     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
   1064     m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
   1065     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
   1066     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
   1067     m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
   1068     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
   1069     result[1] = r0;
   1070 
   1071     m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
   1072     r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
   1073     m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
   1074     r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
   1075     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
   1076     m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
   1077     r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
   1078     r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
   1079     m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
   1080     r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
   1081     result[2] = r0;
   1082     result[3] = _simd_set1_ps(1.0f);
   1083 }
   1084 
   1085 //////////////////////////////////////////////////////////////////////////
   1086 /// @brief Compute plane equation vA * vX + vB * vY + vC
   1087 INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
   1088 {
   1089     simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
   1090     vOut = _simd_fmadd_ps(vB, vY, vOut);
   1091     return vOut;
   1092 }
   1093 
   1094 //////////////////////////////////////////////////////////////////////////
   1095 /// @brief Compute plane equation vA * vX + vB * vY + vC
   1096 INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &vY)
   1097 {
   1098     __m128 vOut = _simd128_fmadd_ps(vA, vX, vC);
   1099     vOut = _simd128_fmadd_ps(vB, vY, vOut);
   1100     return vOut;
   1101 }
   1102 
   1103 //////////////////////////////////////////////////////////////////////////
   1104 /// @brief Interpolates a single component.
   1105 /// @param vI - barycentric I
   1106 /// @param vJ - barycentric J
   1107 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
   1108 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
   1109 static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
   1110 {
   1111     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
   1112     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
   1113     const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
   1114 
   1115     simdscalar vA = _simd_broadcast_ss(pInterpA);
   1116     simdscalar vB = _simd_broadcast_ss(pInterpB);
   1117     simdscalar vC = _simd_broadcast_ss(pInterpC);
   1118 
   1119     simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
   1120     vC = _simd_mul_ps(vk, vC);
   1121 
   1122     return vplaneps(vA, vB, vC, vI, vJ);
   1123 }
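        // Illustrative usage sketch (hypothetical attribute layout, not original text):
        // interpolate component 0 (e.g. red) of attribute slot 2 at the barycentric
        // coordinates (vI, vJ) of the current pixels:
        //
        //     simdscalar vRed = InterpolateComponent<2, 0>(vI, vJ, pInterpBuffer);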
   1124 
   1125 //////////////////////////////////////////////////////////////////////////
   1126 /// @brief Interpolates a single component.
   1127 /// @param vI - barycentric I
   1128 /// @param vJ - barycentric J
   1129 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
   1130 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
   1131 static INLINE __m128 InterpolateComponent(__m128 vI, __m128 vJ, const float *pInterpBuffer)
   1132 {
   1133     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
   1134     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
   1135     const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
   1136 
   1137     __m128 vA = _mm_broadcast_ss(pInterpA);
   1138     __m128 vB = _mm_broadcast_ss(pInterpB);
   1139     __m128 vC = _mm_broadcast_ss(pInterpC);
   1140 
   1141     __m128 vk = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), vI), vJ);
   1142     vC = _mm_mul_ps(vk, vC);
   1143 
   1144     return vplaneps128(vA, vB, vC, vI, vJ);
   1145 }
   1146 
   1147 static INLINE __m128 _simd128_abs_ps(__m128 a)
   1148 {
   1149     __m128i ai = _mm_castps_si128(a);
   1150     return _mm_castsi128_ps(_mm_and_si128(ai, _mm_set1_epi32(0x7fffffff)));
   1151 }
   1152 
   1153 static INLINE simdscalar _simd_abs_ps(simdscalar a)
   1154 {
   1155     simdscalari ai = _simd_castps_si(a);
   1156     return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
   1157 }
   1158 
   1159 INLINE
   1160 UINT pdep_u32(UINT a, UINT mask)
   1161 {
   1162 #if KNOB_ARCH >= KNOB_ARCH_AVX2
   1163     return _pdep_u32(a, mask);
   1164 #else
   1165     UINT result = 0;
   1166 
   1167     // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
   1168     // using bsf instead of funky loop
   1169     DWORD maskIndex;
   1170     while (_BitScanForward(&maskIndex, mask))
   1171     {
   1172         // 1. isolate lowest set bit of mask
   1173         const UINT lowest = 1 << maskIndex;
   1174 
   1175         // 2. populate LSB from src
   1176         const UINT LSB = (UINT)((int)(a << 31) >> 31);
   1177 
   1178         // 3. copy bit from mask
   1179         result |= LSB & lowest;
   1180 
   1181         // 4. clear lowest bit
   1182         mask &= ~lowest;
   1183 
   1184         // 5. prepare for next iteration
   1185         a >>= 1;
   1186     }
   1187 
   1188     return result;
   1189 #endif
   1190 }
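        // Worked example (illustrative): pdep_u32(0x5, 0x1A) deposits the low bits of
        // a = 0b101 into the set bit positions of mask = 0b11010, low to high:
        // bit 0 of a -> bit 1, bit 1 of a -> bit 3, bit 2 of a -> bit 4,
        // giving 0b10010 = 0x12.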
   1191 
   1192 INLINE
   1193 UINT pext_u32(UINT a, UINT mask)
   1194 {
   1195 #if KNOB_ARCH >= KNOB_ARCH_AVX2
   1196     return _pext_u32(a, mask);
   1197 #else
   1198     UINT result = 0;
   1199     DWORD maskIndex;
   1200     uint32_t currentBit = 0;
   1201     while (_BitScanForward(&maskIndex, mask))
   1202     {
   1203         // 1. isolate lowest set bit of mask
   1204         const UINT lowest = 1 << maskIndex;
   1205 
   1206         // 2. copy bit from mask
   1207         result |= ((a & lowest) > 0) << currentBit++;
   1208 
   1209         // 3. clear lowest bit
   1210         mask &= ~lowest;
   1211     }
   1212     return result;
   1213 #endif
   1214 }
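        // Worked example (illustrative): pext_u32(0x12, 0x1A) gathers the bits of
        // a = 0b10010 at the set positions of mask = 0b11010 (bits 1, 3, 4) and packs
        // them low to high, giving 0b101 = 0x5 -- the inverse of the pdep_u32 example.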
   1215 
   1216 #if ENABLE_AVX512_SIMD16
   1217 #include "simd16intrin.h"
   1218 #endif//ENABLE_AVX512_SIMD16
   1219 
   1220 #endif//__SWR_SIMDINTRIN_H__
   1221