/****************************************************************************
* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#if !defined(__SIMD_LIB_AVX2_HPP__)
#error Do not include this file directly, use "simdlib.hpp" instead.
#endif

//============================================================================
// SIMD4 AVX (2) implementation
//
// Since this implementation inherits from the AVX (1) implementation,
// the only operations below are ones that replace AVX (1) operations.
32 // Only 2 shifts and 2 gathers were introduced with AVX 2 33 // Also, add native support for FMA operations 34 //============================================================================ 35 #define SIMD_WRAPPER_3(op) \ 36 static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c) \ 37 {\ 38 return _mm_##op(a, b, c);\ 39 } 40 41 SIMD_WRAPPER_3(fmadd_ps); // return (a * b) + c 42 SIMD_WRAPPER_3(fmsub_ps); // return (a * b) - c 43 44 static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b (uint32) 45 { 46 return _mm_sllv_epi32(vA, vB); 47 } 48 49 static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b (uint32) 50 { 51 return _mm_srlv_epi32(vA, vB); 52 } 53 54 template<ScaleFactor ScaleT> 55 static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT)) 56 { 57 return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT)); 58 } 59 60 // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old 61 template<ScaleFactor ScaleT> 62 static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask) 63 { 64 return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT)); 65 } 66 67 #undef SIMD_WRAPPER_3 68 69