1 /*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef __PMMINTRIN_H 25 #define __PMMINTRIN_H 26 27 #include <emmintrin.h> 28 29 /* Define the default attributes for the functions in this file. */ 30 #define __DEFAULT_FN_ATTRS \ 31 __attribute__((__always_inline__, __nodebug__, __target__("sse3"))) 32 33 /// \brief Loads data from an unaligned memory location to elements in a 128-bit 34 /// vector. 35 /// 36 /// If the address of the data is not 16-byte aligned, the instruction may 37 /// read two adjacent aligned blocks of memory to retrieve the requested 38 /// data. 39 /// 40 /// \headerfile <x86intrin.h> 41 /// 42 /// This intrinsic corresponds to the <c> VLDDQU </c> instruction. 43 /// 44 /// \param __p 45 /// A pointer to a 128-bit integer vector containing integer values. 46 /// \returns A 128-bit vector containing the moved values. 47 static __inline__ __m128i __DEFAULT_FN_ATTRS 48 _mm_lddqu_si128(__m128i const *__p) 49 { 50 return (__m128i)__builtin_ia32_lddqu((char const *)__p); 51 } 52 53 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of 54 /// two 128-bit vectors of [4 x float]. 55 /// 56 /// \headerfile <x86intrin.h> 57 /// 58 /// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction. 59 /// 60 /// \param __a 61 /// A 128-bit vector of [4 x float] containing the left source operand. 62 /// \param __b 63 /// A 128-bit vector of [4 x float] containing the right source operand. 64 /// \returns A 128-bit vector of [4 x float] containing the alternating sums and 65 /// differences of both operands. 66 static __inline__ __m128 __DEFAULT_FN_ATTRS 67 _mm_addsub_ps(__m128 __a, __m128 __b) 68 { 69 return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); 70 } 71 72 /// \brief Horizontally adds the adjacent pairs of values contained in two 73 /// 128-bit vectors of [4 x float]. 74 /// 75 /// \headerfile <x86intrin.h> 76 /// 77 /// This intrinsic corresponds to the <c> VHADDPS </c> instruction. 78 /// 79 /// \param __a 80 /// A 128-bit vector of [4 x float] containing one of the source operands. 81 /// The horizontal sums of the values are stored in the lower bits of the 82 /// destination. 83 /// \param __b 84 /// A 128-bit vector of [4 x float] containing one of the source operands. 85 /// The horizontal sums of the values are stored in the upper bits of the 86 /// destination. 87 /// \returns A 128-bit vector of [4 x float] containing the horizontal sums of 88 /// both operands. 89 static __inline__ __m128 __DEFAULT_FN_ATTRS 90 _mm_hadd_ps(__m128 __a, __m128 __b) 91 { 92 return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); 93 } 94 95 /// \brief Horizontally subtracts the adjacent pairs of values contained in two 96 /// 128-bit vectors of [4 x float]. 97 /// 98 /// \headerfile <x86intrin.h> 99 /// 100 /// This intrinsic corresponds to the <c> VHSUBPS </c> instruction. 101 /// 102 /// \param __a 103 /// A 128-bit vector of [4 x float] containing one of the source operands. 104 /// The horizontal differences between the values are stored in the lower 105 /// bits of the destination. 106 /// \param __b 107 /// A 128-bit vector of [4 x float] containing one of the source operands. 108 /// The horizontal differences between the values are stored in the upper 109 /// bits of the destination. 110 /// \returns A 128-bit vector of [4 x float] containing the horizontal 111 /// differences of both operands. 112 static __inline__ __m128 __DEFAULT_FN_ATTRS 113 _mm_hsub_ps(__m128 __a, __m128 __b) 114 { 115 return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); 116 } 117 118 /// \brief Moves and duplicates high-order (odd-indexed) values from a 128-bit 119 /// vector of [4 x float] to float values stored in a 128-bit vector of 120 /// [4 x float]. 121 /// 122 /// \headerfile <x86intrin.h> 123 /// 124 /// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction. 125 /// 126 /// \param __a 127 /// A 128-bit vector of [4 x float]. \n 128 /// Bits [127:96] of the source are written to bits [127:96] and [95:64] of 129 /// the destination. \n 130 /// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the 131 /// destination. 132 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated 133 /// values. 134 static __inline__ __m128 __DEFAULT_FN_ATTRS 135 _mm_movehdup_ps(__m128 __a) 136 { 137 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); 138 } 139 140 /// \brief Duplicates low-order (even-indexed) values from a 128-bit vector of 141 /// [4 x float] to float values stored in a 128-bit vector of [4 x float]. 142 /// 143 /// \headerfile <x86intrin.h> 144 /// 145 /// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction. 146 /// 147 /// \param __a 148 /// A 128-bit vector of [4 x float] \n 149 /// Bits [95:64] of the source are written to bits [127:96] and [95:64] of 150 /// the destination. \n 151 /// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the 152 /// destination. 153 /// \returns A 128-bit vector of [4 x float] containing the moved and duplicated 154 /// values. 155 static __inline__ __m128 __DEFAULT_FN_ATTRS 156 _mm_moveldup_ps(__m128 __a) 157 { 158 return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); 159 } 160 161 /// \brief Adds the even-indexed values and subtracts the odd-indexed values of 162 /// two 128-bit vectors of [2 x double]. 163 /// 164 /// \headerfile <x86intrin.h> 165 /// 166 /// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction. 167 /// 168 /// \param __a 169 /// A 128-bit vector of [2 x double] containing the left source operand. 170 /// \param __b 171 /// A 128-bit vector of [2 x double] containing the right source operand. 172 /// \returns A 128-bit vector of [2 x double] containing the alternating sums 173 /// and differences of both operands. 174 static __inline__ __m128d __DEFAULT_FN_ATTRS 175 _mm_addsub_pd(__m128d __a, __m128d __b) 176 { 177 return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); 178 } 179 180 /// \brief Horizontally adds the pairs of values contained in two 128-bit 181 /// vectors of [2 x double]. 182 /// 183 /// \headerfile <x86intrin.h> 184 /// 185 /// This intrinsic corresponds to the <c> VHADDPD </c> instruction. 186 /// 187 /// \param __a 188 /// A 128-bit vector of [2 x double] containing one of the source operands. 189 /// The horizontal sum of the values is stored in the lower bits of the 190 /// destination. 191 /// \param __b 192 /// A 128-bit vector of [2 x double] containing one of the source operands. 193 /// The horizontal sum of the values is stored in the upper bits of the 194 /// destination. 195 /// \returns A 128-bit vector of [2 x double] containing the horizontal sums of 196 /// both operands. 197 static __inline__ __m128d __DEFAULT_FN_ATTRS 198 _mm_hadd_pd(__m128d __a, __m128d __b) 199 { 200 return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); 201 } 202 203 /// \brief Horizontally subtracts the pairs of values contained in two 128-bit 204 /// vectors of [2 x double]. 205 /// 206 /// \headerfile <x86intrin.h> 207 /// 208 /// This intrinsic corresponds to the <c> VHSUBPD </c> instruction. 209 /// 210 /// \param __a 211 /// A 128-bit vector of [2 x double] containing one of the source operands. 212 /// The horizontal difference of the values is stored in the lower bits of 213 /// the destination. 214 /// \param __b 215 /// A 128-bit vector of [2 x double] containing one of the source operands. 216 /// The horizontal difference of the values is stored in the upper bits of 217 /// the destination. 218 /// \returns A 128-bit vector of [2 x double] containing the horizontal 219 /// differences of both operands. 220 static __inline__ __m128d __DEFAULT_FN_ATTRS 221 _mm_hsub_pd(__m128d __a, __m128d __b) 222 { 223 return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); 224 } 225 226 /// \brief Moves and duplicates one double-precision value to double-precision 227 /// values stored in a 128-bit vector of [2 x double]. 228 /// 229 /// \headerfile <x86intrin.h> 230 /// 231 /// \code 232 /// __m128d _mm_loaddup_pd(double const * dp); 233 /// \endcode 234 /// 235 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 236 /// 237 /// \param dp 238 /// A pointer to a double-precision value to be moved and duplicated. 239 /// \returns A 128-bit vector of [2 x double] containing the moved and 240 /// duplicated values. 241 #define _mm_loaddup_pd(dp) _mm_load1_pd(dp) 242 243 /// \brief Moves and duplicates the double-precision value in the lower bits of 244 /// a 128-bit vector of [2 x double] to double-precision values stored in a 245 /// 128-bit vector of [2 x double]. 246 /// 247 /// \headerfile <x86intrin.h> 248 /// 249 /// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction. 250 /// 251 /// \param __a 252 /// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits 253 /// [127:64] and [63:0] of the destination. 254 /// \returns A 128-bit vector of [2 x double] containing the moved and 255 /// duplicated values. 256 static __inline__ __m128d __DEFAULT_FN_ATTRS 257 _mm_movedup_pd(__m128d __a) 258 { 259 return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); 260 } 261 262 /// \brief Establishes a linear address memory range to be monitored and puts 263 /// the processor in the monitor event pending state. Data stored in the 264 /// monitored address range causes the processor to exit the pending state. 265 /// 266 /// \headerfile <x86intrin.h> 267 /// 268 /// This intrinsic corresponds to the <c> MONITOR </c> instruction. 269 /// 270 /// \param __p 271 /// The memory range to be monitored. The size of the range is determined by 272 /// CPUID function 0000_0005h. 273 /// \param __extensions 274 /// Optional extensions for the monitoring state. 275 /// \param __hints 276 /// Optional hints for the monitoring state. 277 static __inline__ void __DEFAULT_FN_ATTRS 278 _mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) 279 { 280 __builtin_ia32_monitor((void *)__p, __extensions, __hints); 281 } 282 283 /// \brief Used with the MONITOR instruction to wait while the processor is in 284 /// the monitor event pending state. Data stored in the monitored address 285 /// range causes the processor to exit the pending state. 286 /// 287 /// \headerfile <x86intrin.h> 288 /// 289 /// This intrinsic corresponds to the <c> MWAIT </c> instruction. 290 /// 291 /// \param __extensions 292 /// Optional extensions for the monitoring state, which may vary by 293 /// processor. 294 /// \param __hints 295 /// Optional hints for the monitoring state, which may vary by processor. 296 static __inline__ void __DEFAULT_FN_ATTRS 297 _mm_mwait(unsigned __extensions, unsigned __hints) 298 { 299 __builtin_ia32_mwait(__extensions, __hints); 300 } 301 302 #undef __DEFAULT_FN_ATTRS 303 304 #endif /* __PMMINTRIN_H */ 305