Home | History | Annotate | Download | only in common
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 ****************************************************************************/
     23 
     24 #ifndef __SWR_INTRIN_H__
     25 #define __SWR_INTRIN_H__
     26 
     27 #include "os.h"
     28 
     29 #define SIMD_ARCH KNOB_ARCH
     30 #include "simdlib_types.hpp"
     31 
     32 typedef SIMDImpl::SIMD128Impl::Float                      simd4scalar;
     33 typedef SIMDImpl::SIMD128Impl::Double                     simd4scalard;
     34 typedef SIMDImpl::SIMD128Impl::Integer                    simd4scalari;
     35 typedef SIMDImpl::SIMD128Impl::Vec4                       simd4vector;
     36 typedef SIMDImpl::SIMD128Impl::Mask                       simd4mask;
     37 
     38 typedef SIMDImpl::SIMD256Impl::Float                      simd8scalar;
     39 typedef SIMDImpl::SIMD256Impl::Double                     simd8scalard;
     40 typedef SIMDImpl::SIMD256Impl::Integer                    simd8scalari;
     41 typedef SIMDImpl::SIMD256Impl::Vec4                       simd8vector;
     42 typedef SIMDImpl::SIMD256Impl::Mask                       simd8mask;
     43 
     44 typedef SIMDImpl::SIMD512Impl::Float                      simd16scalar;
     45 typedef SIMDImpl::SIMD512Impl::Double                     simd16scalard;
     46 typedef SIMDImpl::SIMD512Impl::Integer                    simd16scalari;
     47 typedef SIMDImpl::SIMD512Impl::Vec4                       simd16vector;
     48 typedef SIMDImpl::SIMD512Impl::Mask                       simd16mask;
     49 
     50 #if KNOB_SIMD_WIDTH == 8
     51 typedef simd8scalar     simdscalar;
     52 typedef simd8scalard    simdscalard;
     53 typedef simd8scalari    simdscalari;
     54 typedef simd8vector     simdvector;
     55 typedef simd8mask       simdmask;
     56 #else
     57 #error Unsupported vector width
     58 #endif
     59 
     60 INLINE
     61 UINT pdep_u32(UINT a, UINT mask)
     62 {
     63 #if KNOB_ARCH >= KNOB_ARCH_AVX2
     64     return _pdep_u32(a, mask);
     65 #else
     66     UINT result = 0;
     67 
     68     // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
     69     // using bsf instead of funky loop
     70     DWORD maskIndex;
     71     while (_BitScanForward(&maskIndex, mask))
     72     {
     73         // 1. isolate lowest set bit of mask
     74         const UINT lowest = 1 << maskIndex;
     75 
     76         // 2. populate LSB from src
     77         const UINT LSB = (UINT)((int)(a << 31) >> 31);
     78 
     79         // 3. copy bit from mask
     80         result |= LSB & lowest;
     81 
     82         // 4. clear lowest bit
     83         mask &= ~lowest;
     84 
     85         // 5. prepare for next iteration
     86         a >>= 1;
     87     }
     88 
     89     return result;
     90 #endif
     91 }
     92 
     93 INLINE
     94 UINT pext_u32(UINT a, UINT mask)
     95 {
     96 #if KNOB_ARCH >= KNOB_ARCH_AVX2
     97     return _pext_u32(a, mask);
     98 #else
     99     UINT result = 0;
    100     DWORD maskIndex;
    101     uint32_t currentBit = 0;
    102     while (_BitScanForward(&maskIndex, mask))
    103     {
    104         // 1. isolate lowest set bit of mask
    105         const UINT lowest = 1 << maskIndex;
    106 
    107         // 2. copy bit from mask
    108         result |= ((a & lowest) > 0) << currentBit++;
    109 
    110         // 3. clear lowest bit
    111         mask &= ~lowest;
    112     }
    113     return result;
    114 #endif
    115 }
    116 
    117 #endif//__SWR_INTRIN_H__
    118