/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __PMMINTRIN_H
#define __PMMINTRIN_H

/* NOTE(review): presumably supplies the __m128* / __v* vector types and the
 * _mm_load1_pd, _mm_getcsr and _mm_setcsr names used below, none of which are
 * defined in this file -- confirm against the included header. */
#include <emmintrin.h>

/* Define the default attributes for the functions in this file: every function
 * declared with this macro is marked always-inline and nodebug, and is compiled
 * for the "sse3" target feature. The macro is removed again with #undef at the
 * end of this header, just before the include guard is closed. */
#define __DEFAULT_FN_ATTRS \
  __attribute__((__always_inline__, __nodebug__, __target__("sse3")))

/// \brief Loads data from an unaligned memory location to elements in a 128-bit
///    vector. If the address of the data is not 16-byte aligned, the
///    instruction may read two adjacent aligned blocks of memory to retrieve
///    the requested data.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the \c VLDDQU instruction.
41 /// 42 /// \param __p 43 /// A pointer to a 128-bit integer vector containing integer values. 44 /// \returns A 128-bit vector containing the moved values. 45 static __inline__ __m128i __DEFAULT_FN_ATTRS 46 _mm_lddqu_si128(__m128i const *__p) 47 { 48 return (__m128i)__builtin_ia32_lddqu((char const *)__p); 49 } 50 51 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of 52 /// two 128-bit vectors of [4 x float]. 53 /// 54 /// \headerfile <x86intrin.h> 55 /// 56 /// This intrinsic corresponds to the \c VADDSUBPS instruction. 57 /// 58 /// \param __a 59 /// A 128-bit vector of [4 x float] containing the left source operand. 60 /// \param __b 61 /// A 128-bit vector of [4 x float] containing the right source operand. 62 /// \returns A 128-bit vector of [4 x float] containing the alternating sums and 63 /// differences of both operands. 64 static __inline__ __m128 __DEFAULT_FN_ATTRS 65 _mm_addsub_ps(__m128 __a, __m128 __b) 66 { 67 return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); 68 } 69 70 /// \brief Horizontally adds the adjacent pairs of values contained in two 71 /// 128-bit vectors of [4 x float]. 72 /// 73 /// \headerfile <x86intrin.h> 74 /// 75 /// This intrinsic corresponds to the \c VHADDPS instruction. 76 /// 77 /// \param __a 78 /// A 128-bit vector of [4 x float] containing one of the source operands. 79 /// The horizontal sums of the values are stored in the lower bits of the 80 /// destination. 81 /// \param __b 82 /// A 128-bit vector of [4 x float] containing one of the source operands. 83 /// The horizontal sums of the values are stored in the upper bits of the 84 /// destination. 85 /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of 86 /// both operands. 
87 static __inline__ __m128 __DEFAULT_FN_ATTRS 88 _mm_hadd_ps(__m128 __a, __m128 __b) 89 { 90 return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); 91 } 92 93 /// \brief Horizontally subtracts the adjacent pairs of values contained in two 94 /// 128-bit vectors of [4 x float]. 95 /// 96 /// \headerfile <x86intrin.h> 97 /// 98 /// This intrinsic corresponds to the \c VHSUBPS instruction. 99 /// 100 /// \param __a 101 /// A 128-bit vector of [4 x float] containing one of the source operands. 102 /// The horizontal differences between the values are stored in the lower 103 /// bits of the destination. 104 /// \param __b 105 /// A 128-bit vector of [4 x float] containing one of the source operands. 106 /// The horizontal differences between the values are stored in the upper 107 /// bits of the destination. 108 /// \returns A 128-bit vector of [4 x float] containing the horizontal 109 /// differences of both operands. 110 static __inline__ __m128 __DEFAULT_FN_ATTRS 111 _mm_hsub_ps(__m128 __a, __m128 __b) 112 { 113 return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); 114 } 115 116 /// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit 117 /// vector of [4 x float] to float values stored in a 128-bit vector of 118 /// [4 x float]. 119 /// Bits [127:96] of the source are written to bits [127:96] and [95:64] of 120 /// the destination. 121 /// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the 122 /// destination. 123 /// 124 /// \headerfile <x86intrin.h> 125 /// 126 /// This intrinsic corresponds to the \c VMOVSHDUP instruction. 127 /// 128 /// \param __a 129 /// A 128-bit vector of [4 x float]. 130 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated 131 /// values. 
132 static __inline__ __m128 __DEFAULT_FN_ATTRS 133 _mm_movehdup_ps(__m128 __a) 134 { 135 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); 136 } 137 138 /// \brief Duplicates low-order (even-indexed) values from a 128-bit 139 /// vector of [4 x float] to float values stored in a 128-bit vector of 140 /// [4 x float]. 141 /// Bits [95:64] of the source are written to bits [127:96] and [95:64] of 142 /// the destination. 143 /// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the 144 /// destination. 145 /// 146 /// \headerfile <x86intrin.h> 147 /// 148 /// This intrinsic corresponds to the \c VMOVSLDUP instruction. 149 /// 150 /// \param __a 151 /// A 128-bit vector of [4 x float]. 152 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated 153 /// values. 154 static __inline__ __m128 __DEFAULT_FN_ATTRS 155 _mm_moveldup_ps(__m128 __a) 156 { 157 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); 158 } 159 160 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of 161 /// two 128-bit vectors of [2 x double]. 162 /// 163 /// \headerfile <x86intrin.h> 164 /// 165 /// This intrinsic corresponds to the \c VADDSUBPD instruction. 166 /// 167 /// \param __a 168 /// A 128-bit vector of [2 x double] containing the left source operand. 169 /// \param __b 170 /// A 128-bit vector of [2 x double] containing the right source operand. 171 /// \returns A 128-bit vector of [2 x double] containing the alternating sums 172 /// and differences of both operands. 173 static __inline__ __m128d __DEFAULT_FN_ATTRS 174 _mm_addsub_pd(__m128d __a, __m128d __b) 175 { 176 return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); 177 } 178 179 /// \brief Horizontally adds the pairs of values contained in two 128-bit 180 /// vectors of [2 x double]. 181 /// 182 /// \headerfile <x86intrin.h> 183 /// 184 /// This intrinsic corresponds to the \c VHADDPD instruction. 
185 /// 186 /// \param __a 187 /// A 128-bit vector of [2 x double] containing one of the source operands. 188 /// The horizontal sum of the values is stored in the lower bits of the 189 /// destination. 190 /// \param __b 191 /// A 128-bit vector of [2 x double] containing one of the source operands. 192 /// The horizontal sum of the values is stored in the upper bits of the 193 /// destination. 194 /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of 195 /// both operands. 196 static __inline__ __m128d __DEFAULT_FN_ATTRS 197 _mm_hadd_pd(__m128d __a, __m128d __b) 198 { 199 return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); 200 } 201 202 /// \brief Horizontally subtracts the pairs of values contained in two 128-bit 203 /// vectors of [2 x double]. 204 /// 205 /// \headerfile <x86intrin.h> 206 /// 207 /// This intrinsic corresponds to the \c VHSUBPD instruction. 208 /// 209 /// \param __a 210 /// A 128-bit vector of [2 x double] containing one of the source operands. 211 /// The horizontal difference of the values is stored in the lower bits of 212 /// the destination. 213 /// \param __b 214 /// A 128-bit vector of [2 x double] containing one of the source operands. 215 /// The horizontal difference of the values is stored in the upper bits of 216 /// the destination. 217 /// \returns A 128-bit vector of [2 x double] containing the horizontal 218 /// differences of both operands. 219 static __inline__ __m128d __DEFAULT_FN_ATTRS 220 _mm_hsub_pd(__m128d __a, __m128d __b) 221 { 222 return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); 223 } 224 225 /// \brief Moves and duplicates one double-precision value to double-precision 226 /// values stored in a 128-bit vector of [2 x double]. 227 /// 228 /// \headerfile <x86intrin.h> 229 /// 230 /// \code 231 /// __m128d _mm_loaddup_pd(double const * dp); 232 /// \endcode 233 /// 234 /// This intrinsic corresponds to the \c VMOVDDUP instruction. 
235 /// 236 /// \param dp 237 /// A pointer to a double-precision value to be moved and duplicated. 238 /// \returns A 128-bit vector of [2 x double] containing the moved and 239 /// duplicated values. 240 #define _mm_loaddup_pd(dp) _mm_load1_pd(dp) 241 242 /// \brief Moves and duplicates the double-precision value in the lower bits of 243 /// a 128-bit vector of [2 x double] to double-precision values stored in a 244 /// 128-bit vector of [2 x double]. 245 /// 246 /// \headerfile <x86intrin.h> 247 /// 248 /// This intrinsic corresponds to the \c VMOVDDUP instruction. 249 /// 250 /// \param __a 251 /// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits 252 /// [127:64] and [63:0] of the destination. 253 /// \returns A 128-bit vector of [2 x double] containing the moved and 254 /// duplicated values. 255 static __inline__ __m128d __DEFAULT_FN_ATTRS 256 _mm_movedup_pd(__m128d __a) 257 { 258 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 259 } 260 261 #define _MM_DENORMALS_ZERO_ON (0x0040) 262 #define _MM_DENORMALS_ZERO_OFF (0x0000) 263 264 #define _MM_DENORMALS_ZERO_MASK (0x0040) 265 266 #define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) 267 #define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) 268 269 /// \brief Establishes a linear address memory range to be monitored and puts 270 /// the processor in the monitor event pending state. Data stored in the 271 /// monitored address range causes the processor to exit the pending state. 272 /// 273 /// \headerfile <x86intrin.h> 274 /// 275 /// This intrinsic corresponds to the \c MONITOR instruction. 276 /// 277 /// \param __p 278 /// The memory range to be monitored. The size of the range is determined by 279 /// CPUID function 0000_0005h. 280 /// \param __extensions 281 /// Optional extensions for the monitoring state. 282 /// \param __hints 283 /// Optional hints for the monitoring state. 
284 static __inline__ void __DEFAULT_FN_ATTRS 285 _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) 286 { 287 __builtin_ia32_monitor((void *)__p, __extensions, __hints); 288 } 289 290 /// \brief Used with the MONITOR instruction to wait while the processor is in 291 /// the monitor event pending state. Data stored in the monitored address 292 /// range causes the processor to exit the pending state. 293 /// 294 /// \headerfile <x86intrin.h> 295 /// 296 /// This intrinsic corresponds to the \c MWAIT instruction. 297 /// 298 /// \param __extensions 299 /// Optional extensions for the monitoring state, which may vary by 300 /// processor. 301 /// \param __hints 302 /// Optional hints for the monitoring state, which may vary by processor. 303 static __inline__ void __DEFAULT_FN_ATTRS 304 _mm_mwait(unsigned __extensions, unsigned __hints) 305 { 306 __builtin_ia32_mwait(__extensions, __hints); 307 } 308 309 #undef __DEFAULT_FN_ATTRS 310 311 #endif /* __PMMINTRIN_H */ 312