1 /**************************************************************************** 2 * Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 ****************************************************************************/ 23 24 #ifndef __SWR_INTRIN_H__ 25 #define __SWR_INTRIN_H__ 26 27 #include "os.h" 28 29 #define SIMD_ARCH KNOB_ARCH 30 #include "simdlib_types.hpp" 31 32 typedef SIMDImpl::SIMD128Impl::Float simd4scalar; 33 typedef SIMDImpl::SIMD128Impl::Double simd4scalard; 34 typedef SIMDImpl::SIMD128Impl::Integer simd4scalari; 35 typedef SIMDImpl::SIMD128Impl::Vec4 simd4vector; 36 typedef SIMDImpl::SIMD128Impl::Mask simd4mask; 37 38 typedef SIMDImpl::SIMD256Impl::Float simd8scalar; 39 typedef SIMDImpl::SIMD256Impl::Double simd8scalard; 40 typedef SIMDImpl::SIMD256Impl::Integer simd8scalari; 41 typedef SIMDImpl::SIMD256Impl::Vec4 simd8vector; 42 typedef SIMDImpl::SIMD256Impl::Mask simd8mask; 43 44 typedef SIMDImpl::SIMD512Impl::Float simd16scalar; 45 typedef SIMDImpl::SIMD512Impl::Double simd16scalard; 46 typedef SIMDImpl::SIMD512Impl::Integer simd16scalari; 47 typedef SIMDImpl::SIMD512Impl::Vec4 simd16vector; 48 typedef SIMDImpl::SIMD512Impl::Mask simd16mask; 49 50 #if KNOB_SIMD_WIDTH == 8 51 typedef simd8scalar simdscalar; 52 typedef simd8scalard simdscalard; 53 typedef simd8scalari simdscalari; 54 typedef simd8vector simdvector; 55 typedef simd8mask simdmask; 56 #else 57 #error Unsupported vector width 58 #endif 59 60 INLINE 61 UINT pdep_u32(UINT a, UINT mask) 62 { 63 #if KNOB_ARCH >= KNOB_ARCH_AVX2 64 return _pdep_u32(a, mask); 65 #else 66 UINT result = 0; 67 68 // copied from http://wm.ite.pl/articles/pdep-soft-emu.html 69 // using bsf instead of funky loop 70 DWORD maskIndex; 71 while (_BitScanForward(&maskIndex, mask)) 72 { 73 // 1. isolate lowest set bit of mask 74 const UINT lowest = 1 << maskIndex; 75 76 // 2. populate LSB from src 77 const UINT LSB = (UINT)((int)(a << 31) >> 31); 78 79 // 3. copy bit from mask 80 result |= LSB & lowest; 81 82 // 4. clear lowest bit 83 mask &= ~lowest; 84 85 // 5. prepare for next iteration 86 a >>= 1; 87 } 88 89 return result; 90 #endif 91 } 92 93 INLINE 94 UINT pext_u32(UINT a, UINT mask) 95 { 96 #if KNOB_ARCH >= KNOB_ARCH_AVX2 97 return _pext_u32(a, mask); 98 #else 99 UINT result = 0; 100 DWORD maskIndex; 101 uint32_t currentBit = 0; 102 while (_BitScanForward(&maskIndex, mask)) 103 { 104 // 1. isolate lowest set bit of mask 105 const UINT lowest = 1 << maskIndex; 106 107 // 2. copy bit from mask 108 result |= ((a & lowest) > 0) << currentBit++; 109 110 // 3. clear lowest bit 111 mask &= ~lowest; 112 } 113 return result; 114 #endif 115 } 116 117 #endif//__SWR_INTRIN_H__ 118