Home | History | Annotate | Download | only in common
      1 /****************************************************************************
      2 * Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
     21 * IN THE SOFTWARE.
     22 ****************************************************************************/
     23 
     24 #ifndef __SWR_SIMDINTRIN_H__
     25 #define __SWR_SIMDINTRIN_H__
     26 
     27 #include "common/intrin.h"
     28 #include "common/simdlib.hpp"
     29 
     30 #if KNOB_SIMD_WIDTH == 8
     31 typedef SIMD256                             SIMD;
     32 #else
     33 #error Unsupported vector width
     34 #endif//KNOB_SIMD16_WIDTH == 16
     35 
     36 
     37 #define _simd128_maskstore_ps               SIMD128::maskstore_ps
     38 #define _simd128_fmadd_ps                   SIMD128::fmadd_ps
     39 
     40 #define _simd_load_ps                       SIMD::load_ps
     41 #define _simd_load1_ps                      SIMD::broadcast_ss
     42 #define _simd_loadu_ps                      SIMD::loadu_ps
     43 #define _simd_setzero_ps                    SIMD::setzero_ps
     44 #define _simd_set1_ps                       SIMD::set1_ps
     45 #define _simd_blend_ps(a, b, i)             SIMD::blend_ps<i>(a, b)
     46 #define _simd_blend_epi32(a, b, i)          SIMD::blend_epi32<i>(a, b)
     47 #define _simd_blendv_ps                     SIMD::blendv_ps
     48 #define _simd_store_ps                      SIMD::store_ps
     49 #define _simd_mul_ps                        SIMD::mul_ps
     50 #define _simd_add_ps                        SIMD::add_ps
     51 #define _simd_sub_ps                        SIMD::sub_ps
     52 #define _simd_rsqrt_ps                      SIMD::rsqrt_ps
     53 #define _simd_min_ps                        SIMD::min_ps
     54 #define _simd_max_ps                        SIMD::max_ps
     55 #define _simd_movemask_ps                   SIMD::movemask_ps
     56 #define _simd_cvtps_epi32                   SIMD::cvtps_epi32
     57 #define _simd_cvttps_epi32                  SIMD::cvttps_epi32
     58 #define _simd_cvtepi32_ps                   SIMD::cvtepi32_ps
     59 #define _simd_cmplt_ps                      SIMD::cmplt_ps
     60 #define _simd_cmpgt_ps                      SIMD::cmpgt_ps
     61 #define _simd_cmpneq_ps                     SIMD::cmpneq_ps
     62 #define _simd_cmpeq_ps                      SIMD::cmpeq_ps
     63 #define _simd_cmpge_ps                      SIMD::cmpge_ps
     64 #define _simd_cmple_ps                      SIMD::cmple_ps
     65 #define _simd_cmp_ps(a, b, imm)             SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
     66 #define _simd_and_ps                        SIMD::and_ps
     67 #define _simd_or_ps                         SIMD::or_ps
     68 #define _simd_rcp_ps                        SIMD::rcp_ps
     69 #define _simd_div_ps                        SIMD::div_ps
     70 #define _simd_castsi_ps                     SIMD::castsi_ps
     71 #define _simd_castps_pd                     SIMD::castps_pd
     72 #define _simd_castpd_ps                     SIMD::castpd_ps
     73 #define _simd_andnot_ps                     SIMD::andnot_ps
     74 #define _simd_round_ps(a, i)                SIMD::round_ps<SIMD::RoundMode(i)>(a)
     75 #define _simd_castpd_ps                     SIMD::castpd_ps
     76 #define _simd_broadcast_ps(a)               SIMD::broadcast_ps((SIMD128::Float const *)(a))
     77 #define _simd_stream_ps                     SIMD::stream_ps
     78 
     79 #define _simd_movemask_pd                   SIMD::movemask_pd
     80 #define _simd_castsi_pd                     SIMD::castsi_pd
     81 
     82 #define _simd_mul_epi32                     SIMD::mul_epi32
     83 #define _simd_mullo_epi32                   SIMD::mullo_epi32
     84 #define _simd_sub_epi32                     SIMD::sub_epi32
     85 #define _simd_sub_epi64                     SIMD::sub_epi64
     86 #define _simd_min_epi32                     SIMD::min_epi32
     87 #define _simd_min_epu32                     SIMD::min_epu32
     88 #define _simd_max_epi32                     SIMD::max_epi32
     89 #define _simd_max_epu32                     SIMD::max_epu32
     90 #define _simd_add_epi32                     SIMD::add_epi32
     91 #define _simd_and_si                        SIMD::and_si
     92 #define _simd_andnot_si                     SIMD::andnot_si
     93 #define _simd_cmpeq_epi32                   SIMD::cmpeq_epi32
     94 #define _simd_cmplt_epi32                   SIMD::cmplt_epi32
     95 #define _simd_cmpgt_epi32                   SIMD::cmpgt_epi32
     96 #define _simd_or_si                         SIMD::or_si
     97 #define _simd_xor_si                        SIMD::xor_si
     98 #define _simd_castps_si                     SIMD::castps_si
     99 #define _simd_adds_epu8                     SIMD::adds_epu8
    100 #define _simd_subs_epu8                     SIMD::subs_epu8
    101 #define _simd_add_epi8                      SIMD::add_epi8
    102 #define _simd_cmpeq_epi64                   SIMD::cmpeq_epi64
    103 #define _simd_cmpgt_epi64                   SIMD::cmpgt_epi64
    104 #define _simd_cmpgt_epi8                    SIMD::cmpgt_epi8
    105 #define _simd_cmpeq_epi8                    SIMD::cmpeq_epi8
    106 #define _simd_cmpgt_epi16                   SIMD::cmpgt_epi16
    107 #define _simd_cmpeq_epi16                   SIMD::cmpeq_epi16
    108 #define _simd_movemask_epi8                 SIMD::movemask_epi8
    109 #define _simd_permute_ps                    SIMD::permute_ps
    110 #define _simd_permute_epi32                 SIMD::permute_epi32
    111 #define _simd_srlv_epi32                    SIMD::srlv_epi32
    112 #define _simd_sllv_epi32                    SIMD::sllv_epi32
    113 
    114 #define _simd_unpacklo_epi8                 SIMD::unpacklo_epi8
    115 #define _simd_unpackhi_epi8                 SIMD::unpackhi_epi8
    116 #define _simd_unpacklo_epi16                SIMD::unpacklo_epi16
    117 #define _simd_unpackhi_epi16                SIMD::unpackhi_epi16
    118 #define _simd_unpacklo_epi32                SIMD::unpacklo_epi32
    119 #define _simd_unpackhi_epi32                SIMD::unpackhi_epi32
    120 #define _simd_unpacklo_epi64                SIMD::unpacklo_epi64
    121 #define _simd_unpackhi_epi64                SIMD::unpackhi_epi64
    122 
    123 #define _simd_slli_epi32(a,i)               SIMD::slli_epi32<i>(a)
    124 #define _simd_srai_epi32(a,i)               SIMD::srai_epi32<i>(a)
    125 #define _simd_srli_epi32(a,i)               SIMD::srli_epi32<i>(a)
    126 #define _simd_srlisi_ps(a,i)                SIMD::srlisi_ps<i>(a)
    127 
    128 #define _simd_fmadd_ps                      SIMD::fmadd_ps
    129 #define _simd_fmsub_ps                      SIMD::fmsub_ps
    130 #define _simd_shuffle_epi8                  SIMD::shuffle_epi8
    131 
    132 #define _simd_i32gather_ps(p, o, s)         SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
    133 #define _simd_mask_i32gather_ps(r, p, o, m, s) SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
    134 #define _simd_abs_epi32                     SIMD::abs_epi32
    135 
    136 #define _simd_cvtepu8_epi16                 SIMD::cvtepu8_epi16
    137 #define _simd_cvtepu8_epi32                 SIMD::cvtepu8_epi32
    138 #define _simd_cvtepu16_epi32                SIMD::cvtepu16_epi32
    139 #define _simd_cvtepu16_epi64                SIMD::cvtepu16_epi64
    140 #define _simd_cvtepu32_epi64                SIMD::cvtepu32_epi64
    141 
    142 #define _simd_packus_epi16                  SIMD::packus_epi16
    143 #define _simd_packs_epi16                   SIMD::packs_epi16
    144 #define _simd_packus_epi32                  SIMD::packus_epi32
    145 #define _simd_packs_epi32                   SIMD::packs_epi32
    146 
    147 #define _simd_unpacklo_ps                   SIMD::unpacklo_ps
    148 #define _simd_unpackhi_ps                   SIMD::unpackhi_ps
    149 #define _simd_unpacklo_pd                   SIMD::unpacklo_pd
    150 #define _simd_unpackhi_pd                   SIMD::unpackhi_pd
    151 #define _simd_insertf128_ps                 SIMD::insertf128_ps
    152 #define _simd_insertf128_pd                 SIMD::insertf128_pd
    153 #define _simd_insertf128_si(a, b, i)        SIMD::insertf128_si<i>(a, b)
    154 #define _simd_extractf128_ps(a, i)          SIMD::extractf128_ps<i>(a)
    155 #define _simd_extractf128_pd(a, i)          SIMD::extractf128_pd<i>(a)
    156 #define _simd_extractf128_si(a, i)          SIMD::extractf128_si<i>(a)
    157 #define _simd_permute2f128_ps(a, b, i)      SIMD::permute2f128_ps<i>(a, b)
    158 #define _simd_permute2f128_pd(a, b, i)      SIMD::permute2f128_pd<i>(a, b)
    159 #define _simd_permute2f128_si(a, b, i)      SIMD::permute2f128_si<i>(a, b)
    160 #define _simd_shuffle_ps(a, b, i)           SIMD::shuffle_ps<i>(a, b)
    161 #define _simd_shuffle_pd(a, b, i)           SIMD::shuffle_pd<i>(a, b)
    162 #define _simd_shuffle_epi32(a, b, imm8)     SIMD::shuffle_epi32<imm8>(a, b)
    163 #define _simd_shuffle_epi64(a, b, imm8)     SIMD::shuffle_epi64<imm8>(a, b)
    164 #define _simd_set1_epi32                    SIMD::set1_epi32
    165 #define _simd_set_epi32                     SIMD::set_epi32
    166 #define _simd_set_ps                        SIMD::set_ps
    167 #define _simd_set1_epi8                     SIMD::set1_epi8
    168 #define _simd_setzero_si                    SIMD::setzero_si
    169 #define _simd_cvttps_epi32                  SIMD::cvttps_epi32
    170 #define _simd_store_si                      SIMD::store_si
    171 #define _simd_broadcast_ss                  SIMD::broadcast_ss
    172 #define _simd_maskstore_ps                  SIMD::maskstore_ps
    173 #define _simd_load_si                       SIMD::load_si
    174 #define _simd_loadu_si                      SIMD::loadu_si
    175 #define _simd_sub_ps                        SIMD::sub_ps
    176 #define _simd_testz_ps                      SIMD::testz_ps
    177 #define _simd_testz_si                      SIMD::testz_si
    178 #define _simd_xor_ps                        SIMD::xor_ps
    179 
    180 #define _simd_loadu2_si                     SIMD::loadu2_si
    181 #define _simd_storeu2_si                    SIMD::storeu2_si
    182 
    183 #define _simd_blendv_epi32                  SIMD::blendv_epi32
    184 #define _simd_vmask_ps                      SIMD::vmask_ps
    185 
    186 template<int mask> SIMDINLINE
    187 SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer const &a, SIMD128::Integer const &b)
    188 {
    189     return SIMD128::castps_si(SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
    190 }
    191 
    192 SIMDINLINE
    193 void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
    194 {
    195     OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
    196     SIMD256::store_ps(rArray, r);
    197     SIMD256::store_ps(sArray, s);
    198     rArray[rlane] = sArray[slane];
    199     r = SIMD256::load_ps(rArray);
    200 }
    201 
    202 // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
    203 #define _simdvec_load_ps SIMD::vec4_load1_ps
    204 
    205 SIMDINLINE
    206 void _simdvec_mov(simdvector& r, const simdscalar& s)
    207 {
    208     SIMD::vec4_set1_vps(r, s);
    209 }
    210 
    211 SIMDINLINE
    212 void _simdvec_mov(simdvector& r, const simdvector& v)
    213 {
    214     r = v;
    215 }
    216 
    217 #if 0
    218 // just move a lane from the source simdvector to dest simdvector
    219 SIMDINLINE
    220 void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
    221 {
    222     _simd_mov(r[0], rlane, s[0], slane);
    223     _simd_mov(r[1], rlane, s[1], slane);
    224     _simd_mov(r[2], rlane, s[2], slane);
    225     _simd_mov(r[3], rlane, s[3], slane);
    226 }
    227 
    228 #endif
    229 
    230 #define _simdvec_dp3_ps                 SIMD::vec4_dp3_ps
    231 #define _simdvec_dp4_ps                 SIMD::vec4_dp4_ps
    232 #define _simdvec_rcp_length_ps          SIMD::vec4_rcp_length_ps
    233 #define _simdvec_normalize_ps           SIMD::vec4_normalize_ps
    234 #define _simdvec_mul_ps                 SIMD::vec4_mul_ps
    235 #define _simdvec_add_ps                 SIMD::vec4_add_ps
    236 #define _simdvec_min_ps                 SIMD::vec4_min_ps
    237 #define _simdvec_max_ps                 SIMD::vec4_max_ps
    238 #define _simd_mat4x4_vec4_multiply      SIMD::mat4x4_vec4_multiply
    239 #define _simd_mat3x3_vec3_w0_multiply   SIMD::mat3x3_vec3_w0_multiply
    240 #define _simd_mat4x4_vec3_w1_multiply   SIMD::mat4x4_vec3_w1_multiply
    241 #define _simd_mat4x3_vec3_w1_multiply   SIMD::mat4x3_vec3_w1_multiply
    242 
    243 //////////////////////////////////////////////////////////////////////////
    244 /// @brief Compute plane equation vA * vX + vB * vY + vC
    245 SIMDINLINE simdscalar vplaneps(simdscalar const &vA, simdscalar const &vB, simdscalar const &vC, simdscalar const &vX, simdscalar const &vY)
    246 {
    247     simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
    248     vOut = _simd_fmadd_ps(vB, vY, vOut);
    249     return vOut;
    250 }
    251 
    252 //////////////////////////////////////////////////////////////////////////
    253 /// @brief Compute plane equation vA * vX + vB * vY + vC
    254 SIMDINLINE simd4scalar vplaneps(simd4scalar const &vA, simd4scalar const &vB, simd4scalar const &vC, simd4scalar const &vX, simd4scalar const &vY)
    255 {
    256     simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
    257     vOut = _simd128_fmadd_ps(vB, vY, vOut);
    258     return vOut;
    259 }
    260 
    261 //////////////////////////////////////////////////////////////////////////
    262 /// @brief Interpolates a single component.
    263 /// @param vI - barycentric I
    264 /// @param vJ - barycentric J
    265 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
    266 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
    267 static SIMDINLINE simdscalar InterpolateComponent(simdscalar const &vI, simdscalar const &vJ, const float *pInterpBuffer)
    268 {
    269     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
    270     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
    271     const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
    272 
    273     simdscalar vA = _simd_broadcast_ss(pInterpA);
    274     simdscalar vB = _simd_broadcast_ss(pInterpB);
    275     simdscalar vC = _simd_broadcast_ss(pInterpC);
    276 
    277     simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
    278     vC = _simd_mul_ps(vk, vC);
    279 
    280     return vplaneps(vA, vB, vC, vI, vJ);
    281 }
    282 
    283 //////////////////////////////////////////////////////////////////////////
    284 /// @brief Interpolates a single component (flat shade).
    285 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
    286 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
    287 static SIMDINLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
    288 {
    289     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
    290 
    291     simdscalar vA = _simd_broadcast_ss(pInterpA);
    292 
    293     return vA;
    294 }
    295 
    296 //////////////////////////////////////////////////////////////////////////
    297 /// @brief Interpolates a single component.
    298 /// @param vI - barycentric I
    299 /// @param vJ - barycentric J
    300 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
    301 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
    302 static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar const &vI, simd4scalar const &vJ, const float *pInterpBuffer)
    303 {
    304     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
    305     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
    306     const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
    307 
    308     simd4scalar vA = SIMD128::broadcast_ss(pInterpA);
    309     simd4scalar vB = SIMD128::broadcast_ss(pInterpB);
    310     simd4scalar vC = SIMD128::broadcast_ss(pInterpC);
    311 
    312     simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ);
    313     vC = SIMD128::mul_ps(vk, vC);
    314 
    315     return vplaneps(vA, vB, vC, vI, vJ);
    316 }
    317 
    318 static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar const &a)
    319 {
    320     simd4scalari ai = SIMD128::castps_si(a);
    321     return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
    322 }
    323 
    324 static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const &a)
    325 {
    326     simdscalari ai = _simd_castps_si(a);
    327     return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
    328 }
    329 
    330 
    331 #if ENABLE_AVX512_SIMD16
    332 #include "simd16intrin.h"
    333 #endif//ENABLE_AVX512_SIMD16
    334 
    335 #endif//__SWR_SIMDINTRIN_H__
    336