1 /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef _SMMINTRIN_H 25 #define _SMMINTRIN_H 26 27 #include <tmmintrin.h> 28 29 /* Define the default attributes for the functions in this file. */ 30 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"))) 31 32 /* SSE4 Rounding macros. 
*/ 33 #define _MM_FROUND_TO_NEAREST_INT 0x00 34 #define _MM_FROUND_TO_NEG_INF 0x01 35 #define _MM_FROUND_TO_POS_INF 0x02 36 #define _MM_FROUND_TO_ZERO 0x03 37 #define _MM_FROUND_CUR_DIRECTION 0x04 38 39 #define _MM_FROUND_RAISE_EXC 0x00 40 #define _MM_FROUND_NO_EXC 0x08 41 42 #define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) 43 #define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) 44 #define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) 45 #define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) 46 #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) 47 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) 48 49 /// \brief Rounds up each element of the 128-bit vector of [4 x float] to an 50 /// integer and returns the rounded values in a 128-bit vector of 51 /// [4 x float]. 52 /// 53 /// \headerfile <x86intrin.h> 54 /// 55 /// \code 56 /// __m128 _mm_ceil_ps(__m128 X); 57 /// \endcode 58 /// 59 /// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction. 60 /// 61 /// \param X 62 /// A 128-bit vector of [4 x float] values to be rounded up. 63 /// \returns A 128-bit vector of [4 x float] containing the rounded values. 64 #define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) 65 66 /// \brief Rounds up each element of the 128-bit vector of [2 x double] to an 67 /// integer and returns the rounded values in a 128-bit vector of 68 /// [2 x double]. 69 /// 70 /// \headerfile <x86intrin.h> 71 /// 72 /// \code 73 /// __m128d _mm_ceil_pd(__m128d X); 74 /// \endcode 75 /// 76 /// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction. 77 /// 78 /// \param X 79 /// A 128-bit vector of [2 x double] values to be rounded up. 80 /// \returns A 128-bit vector of [2 x double] containing the rounded values. 
#define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds up the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// \brief Rounds down each element of the 128-bit vector of [4 x float] to
///    an integer and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)

/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an
///    integer and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds down the lower element of the second 128-bit vector operand to an
///    integer and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// \brief Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
/* Expands to a fully parenthesized plain expression rather than a GNU
 * statement expression, so the macro is usable wherever an expression is
 * allowed (statement expressions are only accepted inside function bodies). */
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
/* Expands to a fully parenthesized plain expression rather than a GNU
 * statement expression, so the macro is usable wherever an expression is
 * allowed (statement expressions are only accepted inside function bodies). */
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (M)))

/// \brief Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
/* Expands to a fully parenthesized plain expression rather than a GNU
 * statement expression, so the macro is usable wherever an expression is
 * allowed (statement expressions are only accepted inside function bodies). */
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
/* Same plain-expression form as the packed variant; see the rationale on
 * statement expressions being restricted to function bodies. */
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (M)))

/* SSE4 Packed Blending Intrinsics.  */
/// \brief Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* Expands to a fully parenthesized plain expression rather than a GNU
 * statement expression, so the macro is usable wherever an expression is
 * allowed. Shuffle indices 0-1 pick from V1 and 2-3 pick from V2; M is
 * expanded once per lane and must be a constant. */
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
                                    (__v2df)(__m128d)(V2), \
                                    (((M) & 0x01) ? 2 : 0), \
                                    (((M) & 0x02) ? 3 : 1)))

/// \brief Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* Plain-expression form (no statement expression). Shuffle indices 0-3 pick
 * from V1 and 4-7 pick from V2. */
#define _mm_blend_ps(V1, V2, M) \
  ((__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), \
                                   (__v4sf)(__m128)(V2), \
                                   (((M) & 0x01) ? 4 : 0), \
                                   (((M) & 0x02) ? 5 : 1), \
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)))

/// \brief Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
///
/// \param __V1
///    A 128-bit vector of [2 x double].
439 /// \param __V2 440 /// A 128-bit vector of [2 x double]. 441 /// \param __M 442 /// A 128-bit vector operand, with mask bits 127 and 63 specifying how the 443 /// values are to be copied. The position of the mask bit corresponds to the 444 /// most significant bit of a copied value. When a mask bit is 0, the 445 /// corresponding 64-bit element in operand \a __V1 is copied to the same 446 /// position in the result. When a mask bit is 1, the corresponding 64-bit 447 /// element in operand \a __V2 is copied to the same position in the result. 448 /// \returns A 128-bit vector of [2 x double] containing the copied values. 449 static __inline__ __m128d __DEFAULT_FN_ATTRS 450 _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) 451 { 452 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, 453 (__v2df)__M); 454 } 455 456 /// \brief Returns a 128-bit vector of [4 x float] where the values are 457 /// selected from either the first or second operand as specified by the 458 /// third operand, the control mask. 459 /// 460 /// \headerfile <x86intrin.h> 461 /// 462 /// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction. 463 /// 464 /// \param __V1 465 /// A 128-bit vector of [4 x float]. 466 /// \param __V2 467 /// A 128-bit vector of [4 x float]. 468 /// \param __M 469 /// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying 470 /// how the values are to be copied. The position of the mask bit corresponds 471 /// to the most significant bit of a copied value. When a mask bit is 0, the 472 /// corresponding 32-bit element in operand \a __V1 is copied to the same 473 /// position in the result. When a mask bit is 1, the corresponding 32-bit 474 /// element in operand \a __V2 is copied to the same position in the result. 475 /// \returns A 128-bit vector of [4 x float] containing the copied values. 
476 static __inline__ __m128 __DEFAULT_FN_ATTRS 477 _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) 478 { 479 return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, 480 (__v4sf)__M); 481 } 482 483 /// \brief Returns a 128-bit vector of [16 x i8] where the values are selected 484 /// from either of the first or second operand as specified by the third 485 /// operand, the control mask. 486 /// 487 /// \headerfile <x86intrin.h> 488 /// 489 /// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction. 490 /// 491 /// \param __V1 492 /// A 128-bit vector of [16 x i8]. 493 /// \param __V2 494 /// A 128-bit vector of [16 x i8]. 495 /// \param __M 496 /// A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying 497 /// how the values are to be copied. The position of the mask bit corresponds 498 /// to the most significant bit of a copied value. When a mask bit is 0, the 499 /// corresponding 8-bit element in operand \a __V1 is copied to the same 500 /// position in the result. When a mask bit is 1, the corresponding 8-bit 501 /// element in operand \a __V2 is copied to the same position in the result. 502 /// \returns A 128-bit vector of [16 x i8] containing the copied values. 503 static __inline__ __m128i __DEFAULT_FN_ATTRS 504 _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) 505 { 506 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, 507 (__v16qi)__M); 508 } 509 510 /// \brief Returns a 128-bit vector of [8 x i16] where the values are selected 511 /// from either of the first or second operand as specified by the third 512 /// operand, the control mask. 513 /// 514 /// \headerfile <x86intrin.h> 515 /// 516 /// \code 517 /// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); 518 /// \endcode 519 /// 520 /// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction. 521 /// 522 /// \param V1 523 /// A 128-bit vector of [8 x i16]. 
524 /// \param V2 525 /// A 128-bit vector of [8 x i16]. 526 /// \param M 527 /// An immediate integer operand, with mask bits [7:0] specifying how the 528 /// values are to be copied. The position of the mask bit corresponds to the 529 /// index of a copied value. When a mask bit is 0, the corresponding 16-bit 530 /// element in operand \a V1 is copied to the same position in the result. 531 /// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 532 /// is copied to the same position in the result. 533 /// \returns A 128-bit vector of [8 x i16] containing the copied values. 534 #define _mm_blend_epi16(V1, V2, M) __extension__ ({ \ 535 (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \ 536 (__v8hi)(__m128i)(V2), \ 537 (((M) & 0x01) ? 8 : 0), \ 538 (((M) & 0x02) ? 9 : 1), \ 539 (((M) & 0x04) ? 10 : 2), \ 540 (((M) & 0x08) ? 11 : 3), \ 541 (((M) & 0x10) ? 12 : 4), \ 542 (((M) & 0x20) ? 13 : 5), \ 543 (((M) & 0x40) ? 14 : 6), \ 544 (((M) & 0x80) ? 15 : 7)); }) 545 546 /* SSE4 Dword Multiply Instructions. */ 547 /// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32] 548 /// and returns the lower 32 bits of the each product in a 128-bit vector of 549 /// [4 x i32]. 550 /// 551 /// \headerfile <x86intrin.h> 552 /// 553 /// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction. 554 /// 555 /// \param __V1 556 /// A 128-bit integer vector. 557 /// \param __V2 558 /// A 128-bit integer vector. 559 /// \returns A 128-bit integer vector containing the products of both operands. 560 static __inline__ __m128i __DEFAULT_FN_ATTRS 561 _mm_mullo_epi32 (__m128i __V1, __m128i __V2) 562 { 563 return (__m128i) ((__v4su)__V1 * (__v4su)__V2); 564 } 565 566 /// \brief Multiplies corresponding even-indexed elements of two 128-bit 567 /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] 568 /// containing the products. 
569 /// 570 /// \headerfile <x86intrin.h> 571 /// 572 /// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction. 573 /// 574 /// \param __V1 575 /// A 128-bit vector of [4 x i32]. 576 /// \param __V2 577 /// A 128-bit vector of [4 x i32]. 578 /// \returns A 128-bit vector of [2 x i64] containing the products of both 579 /// operands. 580 static __inline__ __m128i __DEFAULT_FN_ATTRS 581 _mm_mul_epi32 (__m128i __V1, __m128i __V2) 582 { 583 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); 584 } 585 586 /* SSE4 Floating Point Dot Product Instructions. */ 587 /// \brief Computes the dot product of the two 128-bit vectors of [4 x float] 588 /// and returns it in the elements of the 128-bit result vector of 589 /// [4 x float]. 590 /// 591 /// The immediate integer operand controls which input elements 592 /// will contribute to the dot product, and where the final results are 593 /// returned. 594 /// 595 /// \headerfile <x86intrin.h> 596 /// 597 /// \code 598 /// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); 599 /// \endcode 600 /// 601 /// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction. 602 /// 603 /// \param X 604 /// A 128-bit vector of [4 x float]. 605 /// \param Y 606 /// A 128-bit vector of [4 x float]. 607 /// \param M 608 /// An immediate integer operand. Mask bits [7:4] determine which elements 609 /// of the input vectors are used, with bit [4] corresponding to the lowest 610 /// element and bit [7] corresponding to the highest element of each [4 x 611 /// float] vector. If a bit is set, the corresponding elements from the two 612 /// input vectors are used as an input for dot product; otherwise that input 613 /// is treated as zero. Bits [3:0] determine which elements of the result 614 /// will receive a copy of the final dot product, with bit [0] corresponding 615 /// to the lowest element and bit [3] corresponding to the highest element of 616 /// each [4 x float] subvector. 
If a bit is set, the dot product is returned 617 /// in the corresponding element; otherwise that element is set to zero. 618 /// \returns A 128-bit vector of [4 x float] containing the dot product. 619 #define _mm_dp_ps(X, Y, M) __extension__ ({ \ 620 (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ 621 (__v4sf)(__m128)(Y), (M)); }) 622 623 /// \brief Computes the dot product of the two 128-bit vectors of [2 x double] 624 /// and returns it in the elements of the 128-bit result vector of 625 /// [2 x double]. 626 /// 627 /// The immediate integer operand controls which input 628 /// elements will contribute to the dot product, and where the final results 629 /// are returned. 630 /// 631 /// \headerfile <x86intrin.h> 632 /// 633 /// \code 634 /// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M); 635 /// \endcode 636 /// 637 /// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction. 638 /// 639 /// \param X 640 /// A 128-bit vector of [2 x double]. 641 /// \param Y 642 /// A 128-bit vector of [2 x double]. 643 /// \param M 644 /// An immediate integer operand. Mask bits [5:4] determine which elements 645 /// of the input vectors are used, with bit [4] corresponding to the lowest 646 /// element and bit [5] corresponding to the highest element of each of [2 x 647 /// double] vector. If a bit is set, the corresponding elements from the two 648 /// input vectors are used as an input for dot product; otherwise that input 649 /// is treated as zero. Bits [1:0] determine which elements of the result 650 /// will receive a copy of the final dot product, with bit [0] corresponding 651 /// to the lowest element and bit [3] corresponding to the highest element of 652 /// each [2 x double] vector. If a bit is set, the dot product is returned in 653 /// the corresponding element; otherwise that element is set to zero. 
654 #define _mm_dp_pd(X, Y, M) __extension__ ({\ 655 (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ 656 (__v2df)(__m128d)(Y), (M)); }) 657 658 /* SSE4 Streaming Load Hint Instruction. */ 659 /// \brief Loads integer values from a 128-bit aligned memory location to a 660 /// 128-bit integer vector. 661 /// 662 /// \headerfile <x86intrin.h> 663 /// 664 /// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction. 665 /// 666 /// \param __V 667 /// A pointer to a 128-bit aligned memory location that contains the integer 668 /// values. 669 /// \returns A 128-bit integer vector containing the data stored at the 670 /// specified memory location. 671 static __inline__ __m128i __DEFAULT_FN_ATTRS 672 _mm_stream_load_si128 (__m128i const *__V) 673 { 674 return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V); 675 } 676 677 /* SSE4 Packed Integer Min/Max Instructions. */ 678 /// \brief Compares the corresponding elements of two 128-bit vectors of 679 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser 680 /// of the two values. 681 /// 682 /// \headerfile <x86intrin.h> 683 /// 684 /// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction. 685 /// 686 /// \param __V1 687 /// A 128-bit vector of [16 x i8]. 688 /// \param __V2 689 /// A 128-bit vector of [16 x i8] 690 /// \returns A 128-bit vector of [16 x i8] containing the lesser values. 691 static __inline__ __m128i __DEFAULT_FN_ATTRS 692 _mm_min_epi8 (__m128i __V1, __m128i __V2) 693 { 694 return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); 695 } 696 697 /// \brief Compares the corresponding elements of two 128-bit vectors of 698 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the 699 /// greater value of the two. 700 /// 701 /// \headerfile <x86intrin.h> 702 /// 703 /// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction. 
704 /// 705 /// \param __V1 706 /// A 128-bit vector of [16 x i8]. 707 /// \param __V2 708 /// A 128-bit vector of [16 x i8]. 709 /// \returns A 128-bit vector of [16 x i8] containing the greater values. 710 static __inline__ __m128i __DEFAULT_FN_ATTRS 711 _mm_max_epi8 (__m128i __V1, __m128i __V2) 712 { 713 return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); 714 } 715 716 /// \brief Compares the corresponding elements of two 128-bit vectors of 717 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser 718 /// value of the two. 719 /// 720 /// \headerfile <x86intrin.h> 721 /// 722 /// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction. 723 /// 724 /// \param __V1 725 /// A 128-bit vector of [8 x u16]. 726 /// \param __V2 727 /// A 128-bit vector of [8 x u16]. 728 /// \returns A 128-bit vector of [8 x u16] containing the lesser values. 729 static __inline__ __m128i __DEFAULT_FN_ATTRS 730 _mm_min_epu16 (__m128i __V1, __m128i __V2) 731 { 732 return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); 733 } 734 735 /// \brief Compares the corresponding elements of two 128-bit vectors of 736 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the 737 /// greater value of the two. 738 /// 739 /// \headerfile <x86intrin.h> 740 /// 741 /// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction. 742 /// 743 /// \param __V1 744 /// A 128-bit vector of [8 x u16]. 745 /// \param __V2 746 /// A 128-bit vector of [8 x u16]. 747 /// \returns A 128-bit vector of [8 x u16] containing the greater values. 748 static __inline__ __m128i __DEFAULT_FN_ATTRS 749 _mm_max_epu16 (__m128i __V1, __m128i __V2) 750 { 751 return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); 752 } 753 754 /// \brief Compares the corresponding elements of two 128-bit vectors of 755 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser 756 /// value of the two. 
757 /// 758 /// \headerfile <x86intrin.h> 759 /// 760 /// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction. 761 /// 762 /// \param __V1 763 /// A 128-bit vector of [4 x i32]. 764 /// \param __V2 765 /// A 128-bit vector of [4 x i32]. 766 /// \returns A 128-bit vector of [4 x i32] containing the lesser values. 767 static __inline__ __m128i __DEFAULT_FN_ATTRS 768 _mm_min_epi32 (__m128i __V1, __m128i __V2) 769 { 770 return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); 771 } 772 773 /// \brief Compares the corresponding elements of two 128-bit vectors of 774 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the 775 /// greater value of the two. 776 /// 777 /// \headerfile <x86intrin.h> 778 /// 779 /// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction. 780 /// 781 /// \param __V1 782 /// A 128-bit vector of [4 x i32]. 783 /// \param __V2 784 /// A 128-bit vector of [4 x i32]. 785 /// \returns A 128-bit vector of [4 x i32] containing the greater values. 786 static __inline__ __m128i __DEFAULT_FN_ATTRS 787 _mm_max_epi32 (__m128i __V1, __m128i __V2) 788 { 789 return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); 790 } 791 792 /// \brief Compares the corresponding elements of two 128-bit vectors of 793 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser 794 /// value of the two. 795 /// 796 /// \headerfile <x86intrin.h> 797 /// 798 /// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction. 799 /// 800 /// \param __V1 801 /// A 128-bit vector of [4 x u32]. 802 /// \param __V2 803 /// A 128-bit vector of [4 x u32]. 804 /// \returns A 128-bit vector of [4 x u32] containing the lesser values. 
805 static __inline__ __m128i __DEFAULT_FN_ATTRS 806 _mm_min_epu32 (__m128i __V1, __m128i __V2) 807 { 808 return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); 809 } 810 811 /// \brief Compares the corresponding elements of two 128-bit vectors of 812 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the 813 /// greater value of the two. 814 /// 815 /// \headerfile <x86intrin.h> 816 /// 817 /// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction. 818 /// 819 /// \param __V1 820 /// A 128-bit vector of [4 x u32]. 821 /// \param __V2 822 /// A 128-bit vector of [4 x u32]. 823 /// \returns A 128-bit vector of [4 x u32] containing the greater values. 824 static __inline__ __m128i __DEFAULT_FN_ATTRS 825 _mm_max_epu32 (__m128i __V1, __m128i __V2) 826 { 827 return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2); 828 } 829 830 /* SSE4 Insertion and Extraction from XMM Register Instructions. */ 831 /// \brief Takes the first argument \a X and inserts an element from the second 832 /// argument \a Y as selected by the third argument \a N. That result then 833 /// has elements zeroed out also as selected by the third argument \a N. The 834 /// resulting 128-bit vector of [4 x float] is then returned. 835 /// 836 /// \headerfile <x86intrin.h> 837 /// 838 /// \code 839 /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); 840 /// \endcode 841 /// 842 /// This intrinsic corresponds to the <c> VINSERTPS </c> instruction. 843 /// 844 /// \param X 845 /// A 128-bit vector source operand of [4 x float]. With the exception of 846 /// those bits in the result copied from parameter \a Y and zeroed by bits 847 /// [3:0] of \a N, all bits from this parameter are copied to the result. 848 /// \param Y 849 /// A 128-bit vector source operand of [4 x float]. One single-precision 850 /// floating-point element from this source, as determined by the immediate 851 /// parameter, is copied to the result. 
/// \param N
///    Specifies which bits from operand \a Y will be copied, which bits in the
///    result they will be copied to, and which bits in the result will be
///    cleared. The following assignments are made: \n
///    Bits [7:6] specify the bits to copy from operand \a Y: \n
///      00: Selects bits [31:0] from operand \a Y. \n
///      01: Selects bits [63:32] from operand \a Y. \n
///      10: Selects bits [95:64] from operand \a Y. \n
///      11: Selects bits [127:96] from operand \a Y. \n
///    Bits [5:4] specify the bits in the result to which the selected bits
///    from operand \a Y are copied: \n
///      00: Copies the selected bits from \a Y to result bits [31:0]. \n
///      01: Copies the selected bits from \a Y to result bits [63:32]. \n
///      10: Copies the selected bits from \a Y to result bits [95:64]. \n
///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
///    Bits[3:0]: If any of these bits are set, the corresponding result
///    element is cleared.
/// \returns A 128-bit vector of [4 x float] containing the copied single-
///    precision floating point elements from the operands.
#define _mm_insert_ps(X, Y, N) \
  __builtin_ia32_insertps128((X), (Y), (N))

/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
///    returns it, using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] determines which bits from the argument
///    \a X are extracted and returned: \n
///    00: Bits [31:0] of parameter \a X are returned. \n
///    01: Bits [63:32] of parameter \a X are returned.
///    \n
///    10: Bits [95:64] of parameter \a X are returned. \n
///    11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) (__extension__                      \
                              ({ union { int __i; float __f; } __t;  \
                                 __v4sf __a = (__v4sf)(__m128)(X);   \
                                 __t.__f = __a[(N) & 3];             \
                                 __t.__i;}))

/* Miscellaneous insert and extract macros.  */
/* Extract a single-precision float from X at index N into D.  Only bits [1:0]
   of N select the element, exactly as in _mm_extract_ps above, so an index
   outside 0..3 can no longer subscript past the end of the vector.  X is
   converted through __m128 first, again matching _mm_extract_ps.  The value
   of the expression is the value assigned to D.  */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__                       \
                                    ({ __v4sf __a = (__v4sf)(__m128)(X); \
                                       (D) = __a[(N) & 3]; }))

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps: X lands in bits [7:6], Y in bits [5:4]
   and Z is or-ed in unshifted.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return.  The
   zeroing mask 0x0e passed to _mm_insert_ps has bits 1..3 set.  */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

/* Insert int into packed integer array at index.  */
/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
///    of an integer parameter \a I into an offset specified by the immediate
///    value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
///    result and then one of the sixteen elements in the result vector is
///    replaced by the lower 8 bits of \a I.
/// \param I
///    An integer. The lower 8 bits of this operand are written to the result
///    beginning at the offset specified by \a N.
/// \param N
///    An immediate value.
///    Bits [3:0] specify the bit offset in the result at
///    which the lower 8 bits of \a I are written. \n
///    0000: Bits [7:0] of the result are used for insertion. \n
///    0001: Bits [15:8] of the result are used for insertion. \n
///    0010: Bits [23:16] of the result are used for insertion. \n
///    0011: Bits [31:24] of the result are used for insertion. \n
///    0100: Bits [39:32] of the result are used for insertion. \n
///    0101: Bits [47:40] of the result are used for insertion. \n
///    0110: Bits [55:48] of the result are used for insertion. \n
///    0111: Bits [63:56] of the result are used for insertion. \n
///    1000: Bits [71:64] of the result are used for insertion. \n
///    1001: Bits [79:72] of the result are used for insertion. \n
///    1010: Bits [87:80] of the result are used for insertion. \n
///    1011: Bits [95:88] of the result are used for insertion. \n
///    1100: Bits [103:96] of the result are used for insertion. \n
///    1101: Bits [111:104] of the result are used for insertion. \n
///    1110: Bits [119:112] of the result are used for insertion. \n
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) \
  (__extension__ ({ \
    __v16qi __a = (__v16qi)(__m128i)(X); \
    __a[(N) & 15] = (I); \
    (__m128i)__a; \
  }))

/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 32-bit
///    integer parameter \a I at the offset specified by the immediate value
///    parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32].
///    This vector is copied to the
///    result and then one of the four elements in the result vector is
///    replaced by \a I.
/// \param I
///    A 32-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bits [1:0] specify the bit offset in the result at
///    which the integer \a I is written. \n
///    00: Bits [31:0] of the result are used for insertion. \n
///    01: Bits [63:32] of the result are used for insertion. \n
///    10: Bits [95:64] of the result are used for insertion. \n
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
  (__extension__ ({ \
    __v4si __a = (__v4si)(__m128i)(X); \
    __a[(N) & 3] = (I); \
    (__m128i)__a; \
  }))

#ifdef __x86_64__
/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 64-bit
///    integer parameter \a I, using the immediate value parameter \a N as an
///    insertion location selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
///    result and then one of the two elements in the result vector is replaced
///    by \a I.
/// \param I
///    A 64-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bit [0] specifies the bit offset in the result at
///    which the integer \a I is written. \n
///    0: Bits [63:0] of the result are used for insertion. \n
///    1: Bits [127:64] of the result are used for insertion. \n
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
  (__extension__ ({ \
    __v2di __a = (__v2di)(__m128i)(X); \
    __a[(N) & 1] = (I); \
    (__m128i)__a; \
  }))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index.  This returns the element
 * as a zero extended value, so it is unsigned.
 */
/// \brief Extracts an 8-bit element from the 128-bit integer vector of
///    [16 x i8], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] specify which 8-bit vector element from
///    the argument \a X to extract and copy to the result. \n
///    0000: Bits [7:0] of parameter \a X are extracted. \n
///    0001: Bits [15:8] of the parameter \a X are extracted. \n
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
///    1100: Bits [103:96] of the parameter \a X are extracted.
\n 1058 /// 1101: Bits [111:104] of the parameter \a X are extracted. \n 1059 /// 1110: Bits [119:112] of the parameter \a X are extracted. \n 1060 /// 1111: Bits [127:120] of the parameter \a X are extracted. 1061 /// \returns An unsigned integer, whose lower 8 bits are selected from the 1062 /// 128-bit integer vector parameter and the remaining bits are assigned 1063 /// zeros. 1064 #define _mm_extract_epi8(X, N) (__extension__ \ 1065 ({ __v16qi __a = (__v16qi)(__m128i)(X); \ 1066 (int)(unsigned char) __a[(N) & 15];})) 1067 1068 /// \brief Extracts a 32-bit element from the 128-bit integer vector of 1069 /// [4 x i32], using the immediate value parameter \a N as a selector. 1070 /// 1071 /// \headerfile <x86intrin.h> 1072 /// 1073 /// \code 1074 /// int _mm_extract_epi32(__m128i X, const int N); 1075 /// \endcode 1076 /// 1077 /// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction. 1078 /// 1079 /// \param X 1080 /// A 128-bit integer vector. 1081 /// \param N 1082 /// An immediate value. Bits [1:0] specify which 32-bit vector element from 1083 /// the argument \a X to extract and copy to the result. \n 1084 /// 00: Bits [31:0] of the parameter \a X are extracted. \n 1085 /// 01: Bits [63:32] of the parameter \a X are extracted. \n 1086 /// 10: Bits [95:64] of the parameter \a X are extracted. \n 1087 /// 11: Bits [127:96] of the parameter \a X are exracted. 1088 /// \returns An integer, whose lower 32 bits are selected from the 128-bit 1089 /// integer vector parameter and the remaining bits are assigned zeros. 1090 #define _mm_extract_epi32(X, N) (__extension__ \ 1091 ({ __v4si __a = (__v4si)(__m128i)(X); \ 1092 (int)__a[(N) & 3];})) 1093 1094 #ifdef __x86_64__ 1095 /// \brief Extracts a 64-bit element from the 128-bit integer vector of 1096 /// [2 x i64], using the immediate value parameter \a N as a selector. 
1097 /// 1098 /// \headerfile <x86intrin.h> 1099 /// 1100 /// \code 1101 /// long long _mm_extract_epi64(__m128i X, const int N); 1102 /// \endcode 1103 /// 1104 /// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction. 1105 /// 1106 /// \param X 1107 /// A 128-bit integer vector. 1108 /// \param N 1109 /// An immediate value. Bit [0] specifies which 64-bit vector element from 1110 /// the argument \a X to return. \n 1111 /// 0: Bits [63:0] are returned. \n 1112 /// 1: Bits [127:64] are returned. \n 1113 /// \returns A 64-bit integer. 1114 #define _mm_extract_epi64(X, N) (__extension__ \ 1115 ({ __v2di __a = (__v2di)(__m128i)(X); \ 1116 (long long)__a[(N) & 1];})) 1117 #endif /* __x86_64 */ 1118 1119 /* SSE4 128-bit Packed Integer Comparisons. */ 1120 /// \brief Tests whether the specified bits in a 128-bit integer vector are all 1121 /// zeros. 1122 /// 1123 /// \headerfile <x86intrin.h> 1124 /// 1125 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1126 /// 1127 /// \param __M 1128 /// A 128-bit integer vector containing the bits to be tested. 1129 /// \param __V 1130 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1131 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1132 static __inline__ int __DEFAULT_FN_ATTRS 1133 _mm_testz_si128(__m128i __M, __m128i __V) 1134 { 1135 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 1136 } 1137 1138 /// \brief Tests whether the specified bits in a 128-bit integer vector are all 1139 /// ones. 1140 /// 1141 /// \headerfile <x86intrin.h> 1142 /// 1143 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1144 /// 1145 /// \param __M 1146 /// A 128-bit integer vector containing the bits to be tested. 1147 /// \param __V 1148 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1149 /// \returns TRUE if the specified bits are all ones; FALSE otherwise. 
1150 static __inline__ int __DEFAULT_FN_ATTRS 1151 _mm_testc_si128(__m128i __M, __m128i __V) 1152 { 1153 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 1154 } 1155 1156 /// \brief Tests whether the specified bits in a 128-bit integer vector are 1157 /// neither all zeros nor all ones. 1158 /// 1159 /// \headerfile <x86intrin.h> 1160 /// 1161 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1162 /// 1163 /// \param __M 1164 /// A 128-bit integer vector containing the bits to be tested. 1165 /// \param __V 1166 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1167 /// \returns TRUE if the specified bits are neither all zeros nor all ones; 1168 /// FALSE otherwise. 1169 static __inline__ int __DEFAULT_FN_ATTRS 1170 _mm_testnzc_si128(__m128i __M, __m128i __V) 1171 { 1172 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 1173 } 1174 1175 /// \brief Tests whether the specified bits in a 128-bit integer vector are all 1176 /// ones. 1177 /// 1178 /// \headerfile <x86intrin.h> 1179 /// 1180 /// \code 1181 /// int _mm_test_all_ones(__m128i V); 1182 /// \endcode 1183 /// 1184 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1185 /// 1186 /// \param V 1187 /// A 128-bit integer vector containing the bits to be tested. 1188 /// \returns TRUE if the bits specified in the operand are all set to 1; FALSE 1189 /// otherwise. 1190 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) 1191 1192 /// \brief Tests whether the specified bits in a 128-bit integer vector are 1193 /// neither all zeros nor all ones. 1194 /// 1195 /// \headerfile <x86intrin.h> 1196 /// 1197 /// \code 1198 /// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); 1199 /// \endcode 1200 /// 1201 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1202 /// 1203 /// \param M 1204 /// A 128-bit integer vector containing the bits to be tested. 
1205 /// \param V 1206 /// A 128-bit integer vector selecting which bits to test in operand \a M. 1207 /// \returns TRUE if the specified bits are neither all zeros nor all ones; 1208 /// FALSE otherwise. 1209 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) 1210 1211 /// \brief Tests whether the specified bits in a 128-bit integer vector are all 1212 /// zeros. 1213 /// 1214 /// \headerfile <x86intrin.h> 1215 /// 1216 /// \code 1217 /// int _mm_test_all_zeros(__m128i M, __m128i V); 1218 /// \endcode 1219 /// 1220 /// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction. 1221 /// 1222 /// \param M 1223 /// A 128-bit integer vector containing the bits to be tested. 1224 /// \param V 1225 /// A 128-bit integer vector selecting which bits to test in operand \a M. 1226 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1227 #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) 1228 1229 /* SSE4 64-bit Packed Integer Comparisons. */ 1230 /// \brief Compares each of the corresponding 64-bit values of the 128-bit 1231 /// integer vectors for equality. 1232 /// 1233 /// \headerfile <x86intrin.h> 1234 /// 1235 /// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction. 1236 /// 1237 /// \param __V1 1238 /// A 128-bit integer vector. 1239 /// \param __V2 1240 /// A 128-bit integer vector. 1241 /// \returns A 128-bit integer vector containing the comparison results. 1242 static __inline__ __m128i __DEFAULT_FN_ATTRS 1243 _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) 1244 { 1245 return (__m128i)((__v2di)__V1 == (__v2di)__V2); 1246 } 1247 1248 /* SSE4 Packed Integer Sign-Extension. */ 1249 /// \brief Sign-extends each of the lower eight 8-bit integer elements of a 1250 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1251 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1252 /// are unused. 
1253 /// 1254 /// \headerfile <x86intrin.h> 1255 /// 1256 /// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction. 1257 /// 1258 /// \param __V 1259 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign- 1260 /// extended to 16-bit values. 1261 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. 1262 static __inline__ __m128i __DEFAULT_FN_ATTRS 1263 _mm_cvtepi8_epi16(__m128i __V) 1264 { 1265 /* This function always performs a signed extension, but __v16qi is a char 1266 which may be signed or unsigned, so use __v16qs. */ 1267 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1268 } 1269 1270 /// \brief Sign-extends each of the lower four 8-bit integer elements of a 1271 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1272 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1273 /// vector are unused. 1274 /// 1275 /// \headerfile <x86intrin.h> 1276 /// 1277 /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. 1278 /// 1279 /// \param __V 1280 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign- 1281 /// extended to 32-bit values. 1282 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1283 static __inline__ __m128i __DEFAULT_FN_ATTRS 1284 _mm_cvtepi8_epi32(__m128i __V) 1285 { 1286 /* This function always performs a signed extension, but __v16qi is a char 1287 which may be signed or unsigned, so use __v16qs. */ 1288 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); 1289 } 1290 1291 /// \brief Sign-extends each of the lower two 8-bit integer elements of a 1292 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1293 /// a 128-bit vector of [2 x i64]. 
The upper fourteen elements of the input 1294 /// vector are unused. 1295 /// 1296 /// \headerfile <x86intrin.h> 1297 /// 1298 /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. 1299 /// 1300 /// \param __V 1301 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign- 1302 /// extended to 64-bit values. 1303 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1304 static __inline__ __m128i __DEFAULT_FN_ATTRS 1305 _mm_cvtepi8_epi64(__m128i __V) 1306 { 1307 /* This function always performs a signed extension, but __v16qi is a char 1308 which may be signed or unsigned, so use __v16qs. */ 1309 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); 1310 } 1311 1312 /// \brief Sign-extends each of the lower four 16-bit integer elements of a 1313 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1314 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1315 /// vector are unused. 1316 /// 1317 /// \headerfile <x86intrin.h> 1318 /// 1319 /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. 1320 /// 1321 /// \param __V 1322 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign- 1323 /// extended to 32-bit values. 1324 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1325 static __inline__ __m128i __DEFAULT_FN_ATTRS 1326 _mm_cvtepi16_epi32(__m128i __V) 1327 { 1328 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); 1329 } 1330 1331 /// \brief Sign-extends each of the lower two 16-bit integer elements of a 1332 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1333 /// a 128-bit vector of [2 x i64]. The upper six elements of the input 1334 /// vector are unused. 
1335 /// 1336 /// \headerfile <x86intrin.h> 1337 /// 1338 /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. 1339 /// 1340 /// \param __V 1341 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign- 1342 /// extended to 64-bit values. 1343 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1344 static __inline__ __m128i __DEFAULT_FN_ATTRS 1345 _mm_cvtepi16_epi64(__m128i __V) 1346 { 1347 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); 1348 } 1349 1350 /// \brief Sign-extends each of the lower two 32-bit integer elements of a 1351 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1352 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1353 /// are unused. 1354 /// 1355 /// \headerfile <x86intrin.h> 1356 /// 1357 /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. 1358 /// 1359 /// \param __V 1360 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign- 1361 /// extended to 64-bit values. 1362 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1363 static __inline__ __m128i __DEFAULT_FN_ATTRS 1364 _mm_cvtepi32_epi64(__m128i __V) 1365 { 1366 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); 1367 } 1368 1369 /* SSE4 Packed Integer Zero-Extension. */ 1370 /// \brief Zero-extends each of the lower eight 8-bit integer elements of a 1371 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1372 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1373 /// are unused. 1374 /// 1375 /// \headerfile <x86intrin.h> 1376 /// 1377 /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. 1378 /// 1379 /// \param __V 1380 /// A 128-bit vector of [16 x i8]. 
The lower eight 8-bit elements are zero- 1381 /// extended to 16-bit values. 1382 /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. 1383 static __inline__ __m128i __DEFAULT_FN_ATTRS 1384 _mm_cvtepu8_epi16(__m128i __V) 1385 { 1386 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1387 } 1388 1389 /// \brief Zero-extends each of the lower four 8-bit integer elements of a 1390 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1391 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1392 /// vector are unused. 1393 /// 1394 /// \headerfile <x86intrin.h> 1395 /// 1396 /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. 1397 /// 1398 /// \param __V 1399 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero- 1400 /// extended to 32-bit values. 1401 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1402 static __inline__ __m128i __DEFAULT_FN_ATTRS 1403 _mm_cvtepu8_epi32(__m128i __V) 1404 { 1405 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); 1406 } 1407 1408 /// \brief Zero-extends each of the lower two 8-bit integer elements of a 1409 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1410 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1411 /// vector are unused. 1412 /// 1413 /// \headerfile <x86intrin.h> 1414 /// 1415 /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. 1416 /// 1417 /// \param __V 1418 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero- 1419 /// extended to 64-bit values. 1420 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
1421 static __inline__ __m128i __DEFAULT_FN_ATTRS 1422 _mm_cvtepu8_epi64(__m128i __V) 1423 { 1424 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); 1425 } 1426 1427 /// \brief Zero-extends each of the lower four 16-bit integer elements of a 1428 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1429 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1430 /// vector are unused. 1431 /// 1432 /// \headerfile <x86intrin.h> 1433 /// 1434 /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. 1435 /// 1436 /// \param __V 1437 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero- 1438 /// extended to 32-bit values. 1439 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1440 static __inline__ __m128i __DEFAULT_FN_ATTRS 1441 _mm_cvtepu16_epi32(__m128i __V) 1442 { 1443 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); 1444 } 1445 1446 /// \brief Zero-extends each of the lower two 16-bit integer elements of a 1447 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1448 /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector 1449 /// are unused. 1450 /// 1451 /// \headerfile <x86intrin.h> 1452 /// 1453 /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. 1454 /// 1455 /// \param __V 1456 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero- 1457 /// extended to 64-bit values. 1458 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
1459 static __inline__ __m128i __DEFAULT_FN_ATTRS 1460 _mm_cvtepu16_epi64(__m128i __V) 1461 { 1462 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); 1463 } 1464 1465 /// \brief Zero-extends each of the lower two 32-bit integer elements of a 1466 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1467 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1468 /// are unused. 1469 /// 1470 /// \headerfile <x86intrin.h> 1471 /// 1472 /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. 1473 /// 1474 /// \param __V 1475 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero- 1476 /// extended to 64-bit values. 1477 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1478 static __inline__ __m128i __DEFAULT_FN_ATTRS 1479 _mm_cvtepu32_epi64(__m128i __V) 1480 { 1481 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); 1482 } 1483 1484 /* SSE4 Pack with Unsigned Saturation. */ 1485 /// \brief Converts 32-bit signed integers from both 128-bit integer vector 1486 /// operands into 16-bit unsigned integers, and returns the packed result. 1487 /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than 1488 /// 0x0000 are saturated to 0x0000. 1489 /// 1490 /// \headerfile <x86intrin.h> 1491 /// 1492 /// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction. 1493 /// 1494 /// \param __V1 1495 /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1496 /// signed integer and is converted to a 16-bit unsigned integer with 1497 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1498 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1499 /// are written to the lower 64 bits of the result. 1500 /// \param __V2 1501 /// A 128-bit vector of [4 x i32]. 
Each 32-bit element is treated as a 1502 /// signed integer and is converted to a 16-bit unsigned integer with 1503 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1504 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1505 /// are written to the higher 64 bits of the result. 1506 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 1507 static __inline__ __m128i __DEFAULT_FN_ATTRS 1508 _mm_packus_epi32(__m128i __V1, __m128i __V2) 1509 { 1510 return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 1511 } 1512 1513 /* SSE4 Multiple Packed Sums of Absolute Difference. */ 1514 /// \brief Subtracts 8-bit unsigned integer values and computes the absolute 1515 /// values of the differences to the corresponding bits in the destination. 1516 /// Then sums of the absolute differences are returned according to the bit 1517 /// fields in the immediate operand. 1518 /// 1519 /// \headerfile <x86intrin.h> 1520 /// 1521 /// \code 1522 /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); 1523 /// \endcode 1524 /// 1525 /// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction. 1526 /// 1527 /// \param X 1528 /// A 128-bit vector of [16 x i8]. 1529 /// \param Y 1530 /// A 128-bit vector of [16 x i8]. 
1531 /// \param M 1532 /// An 8-bit immediate operand specifying how the absolute differences are to 1533 /// be calculated, according to the following algorithm: 1534 /// \code 1535 /// // M2 represents bit 2 of the immediate operand 1536 /// // M10 represents bits [1:0] of the immediate operand 1537 /// i = M2 * 4 1538 /// j = M10 * 4 1539 /// for (k = 0; k < 8; k = k + 1) { 1540 /// d0 = abs(X[i + k + 0] - Y[j + 0]) 1541 /// d1 = abs(X[i + k + 1] - Y[j + 1]) 1542 /// d2 = abs(X[i + k + 2] - Y[j + 2]) 1543 /// d3 = abs(X[i + k + 3] - Y[j + 3]) 1544 /// r[k] = d0 + d1 + d2 + d3 1545 /// } 1546 /// \endcode 1547 /// \returns A 128-bit integer vector containing the sums of the sets of 1548 /// absolute differences between both operands. 1549 #define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \ 1550 (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ 1551 (__v16qi)(__m128i)(Y), (M)); }) 1552 1553 /// \brief Finds the minimum unsigned 16-bit element in the input 128-bit 1554 /// vector of [8 x u16] and returns it and along with its index. 1555 /// 1556 /// \headerfile <x86intrin.h> 1557 /// 1558 /// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c> 1559 /// instruction. 1560 /// 1561 /// \param __V 1562 /// A 128-bit vector of [8 x u16]. 1563 /// \returns A 128-bit value where bits [15:0] contain the minimum value found 1564 /// in parameter \a __V, bits [18:16] contain the index of the minimum value 1565 /// and the remaining bits are set to 0. 1566 static __inline__ __m128i __DEFAULT_FN_ATTRS 1567 _mm_minpos_epu16(__m128i __V) 1568 { 1569 return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); 1570 } 1571 1572 /* Handle the sse4.2 definitions here. */ 1573 1574 /* These definitions are normally in nmmintrin.h, but gcc puts them in here 1575 so we'll do the same. 
*/ 1576 1577 #undef __DEFAULT_FN_ATTRS 1578 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) 1579 1580 /* These specify the type of data that we're comparing. */ 1581 #define _SIDD_UBYTE_OPS 0x00 1582 #define _SIDD_UWORD_OPS 0x01 1583 #define _SIDD_SBYTE_OPS 0x02 1584 #define _SIDD_SWORD_OPS 0x03 1585 1586 /* These specify the type of comparison operation. */ 1587 #define _SIDD_CMP_EQUAL_ANY 0x00 1588 #define _SIDD_CMP_RANGES 0x04 1589 #define _SIDD_CMP_EQUAL_EACH 0x08 1590 #define _SIDD_CMP_EQUAL_ORDERED 0x0c 1591 1592 /* These macros specify the polarity of the operation. */ 1593 #define _SIDD_POSITIVE_POLARITY 0x00 1594 #define _SIDD_NEGATIVE_POLARITY 0x10 1595 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20 1596 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 1597 1598 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1599 #define _SIDD_LEAST_SIGNIFICANT 0x00 1600 #define _SIDD_MOST_SIGNIFICANT 0x40 1601 1602 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1603 #define _SIDD_BIT_MASK 0x00 1604 #define _SIDD_UNIT_MASK 0x40 1605 1606 /* SSE4.2 Packed Comparison Intrinsics. */ 1607 /// \brief Uses the immediate operand \a M to perform a comparison of string 1608 /// data with implicitly defined lengths that is contained in source operands 1609 /// \a A and \a B. Returns a 128-bit integer vector representing the result 1610 /// mask of the comparison. 1611 /// 1612 /// \headerfile <x86intrin.h> 1613 /// 1614 /// \code 1615 /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); 1616 /// \endcode 1617 /// 1618 /// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c> 1619 /// instruction. 1620 /// 1621 /// \param A 1622 /// A 128-bit integer vector containing one of the source operands to be 1623 /// compared. 1624 /// \param B 1625 /// A 128-bit integer vector containing one of the source operands to be 1626 /// compared. 
1627 /// \param M 1628 /// An 8-bit immediate operand specifying whether the characters are bytes or 1629 /// words, the type of comparison to perform, and the format of the return 1630 /// value. \n 1631 /// Bits [1:0]: Determine source data format. \n 1632 /// 00: 16 unsigned bytes \n 1633 /// 01: 8 unsigned words \n 1634 /// 10: 16 signed bytes \n 1635 /// 11: 8 signed words \n 1636 /// Bits [3:2]: Determine comparison type and aggregation method. \n 1637 /// 00: Subset: Each character in \a B is compared for equality with all 1638 /// the characters in \a A. \n 1639 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 1640 /// basis is greater than or equal for even-indexed elements in \a A, 1641 /// and less than or equal for odd-indexed elements in \a A. \n 1642 /// 10: Match: Compare each pair of corresponding characters in \a A and 1643 /// \a B for equality. \n 1644 /// 11: Substring: Search \a B for substring matches of \a A. \n 1645 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 1646 /// mask of the comparison results. \n 1647 /// 00: No effect. \n 1648 /// 01: Negate the bit mask. \n 1649 /// 10: No effect. \n 1650 /// 11: Negate the bit mask only for bits with an index less than or equal 1651 /// to the size of \a A or \a B. \n 1652 /// Bit [6]: Determines whether the result is zero-extended or expanded to 16 1653 /// bytes. \n 1654 /// 0: The result is zero-extended to 16 bytes. \n 1655 /// 1: The result is expanded to 16 bytes (this expansion is performed by 1656 /// repeating each bit 8 or 16 times). 1657 /// \returns Returns a 128-bit integer vector representing the result mask of 1658 /// the comparison. 
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
1989 /// 1990 /// \headerfile <x86intrin.h> 1991 /// 1992 /// \code 1993 /// int _mm_cmpistrs(__m128i A, __m128i B, const int M); 1994 /// \endcode 1995 /// 1996 /// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c> 1997 /// instruction. 1998 /// 1999 /// \param A 2000 /// A 128-bit integer vector containing one of the source operands to be 2001 /// compared. 2002 /// \param B 2003 /// A 128-bit integer vector containing one of the source operands to be 2004 /// compared. 2005 /// \param M 2006 /// An 8-bit immediate operand specifying whether the characters are bytes or 2007 /// words and the type of comparison to perform. \n 2008 /// Bits [1:0]: Determine source data format. \n 2009 /// 00: 16 unsigned bytes \n 2010 /// 01: 8 unsigned words \n 2011 /// 10: 16 signed bytes \n 2012 /// 11: 8 signed words \n 2013 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2014 /// 00: Subset: Each character in \a B is compared for equality with all 2015 /// the characters in \a A. \n 2016 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2017 /// basis is greater than or equal for even-indexed elements in \a A, 2018 /// and less than or equal for odd-indexed elements in \a A. \n 2019 /// 10: Match: Compare each pair of corresponding characters in \a A and 2020 /// \a B for equality. \n 2021 /// 11: Substring: Search \a B for substring matches of \a A. \n 2022 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 2023 /// mask of the comparison results. \n 2024 /// 00: No effect. \n 2025 /// 01: Negate the bit mask. \n 2026 /// 10: No effect. \n 2027 /// 11: Negate the bit mask only for bits with an index less than or equal 2028 /// to the size of \a A or \a B. \n 2029 /// \returns Returns 1 if the length of the string in \a A is less than the 2030 /// maximum, otherwise, returns 0. 
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
2197 /// 2198 /// \headerfile <x86intrin.h> 2199 /// 2200 /// \code 2201 /// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M); 2202 /// \endcode 2203 /// 2204 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 2205 /// instruction. 2206 /// 2207 /// \param A 2208 /// A 128-bit integer vector containing one of the source operands to be 2209 /// compared. 2210 /// \param LA 2211 /// An integer that specifies the length of the string in \a A. 2212 /// \param B 2213 /// A 128-bit integer vector containing one of the source operands to be 2214 /// compared. 2215 /// \param LB 2216 /// An integer that specifies the length of the string in \a B. 2217 /// \param M 2218 /// An 8-bit immediate operand specifying whether the characters are bytes or 2219 /// words and the type of comparison to perform. \n 2220 /// Bits [1:0]: Determine source data format. \n 2221 /// 00: 16 unsigned bytes \n 2222 /// 01: 8 unsigned words \n 2223 /// 10: 16 signed bytes \n 2224 /// 11: 8 signed words \n 2225 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2226 /// 00: Subset: Each character in \a B is compared for equality with all 2227 /// the characters in \a A. \n 2228 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2229 /// basis is greater than or equal for even-indexed elements in \a A, 2230 /// and less than or equal for odd-indexed elements in \a A. \n 2231 /// 10: Match: Compare each pair of corresponding characters in \a A and 2232 /// \a B for equality. \n 2233 /// 11: Substring: Search \a B for substring matches of \a A. \n 2234 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 2235 /// mask of the comparison results. \n 2236 /// 00: No effect. \n 2237 /// 01: Negate the bit mask. \n 2238 /// 10: No effect. \n 2239 /// 11: Negate the bit mask only for bits with an index less than or equal 2240 /// to the size of \a A or \a B. 
2241 /// \returns Returns bit 0 of the resulting bit mask. 2242 #define _mm_cmpestro(A, LA, B, LB, M) \ 2243 (int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \ 2244 (__v16qi)(__m128i)(B), (int)(LB), \ 2245 (int)(M)) 2246 2247 /// \brief Uses the immediate operand \a M to perform a comparison of string 2248 /// data with explicitly defined lengths that is contained in source operands 2249 /// \a A and \a B. Returns 1 if the length of the string in \a A is less than 2250 /// the maximum, otherwise, returns 0. 2251 /// 2252 /// \headerfile <x86intrin.h> 2253 /// 2254 /// \code 2255 /// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M); 2256 /// \endcode 2257 /// 2258 /// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> 2259 /// instruction. 2260 /// 2261 /// \param A 2262 /// A 128-bit integer vector containing one of the source operands to be 2263 /// compared. 2264 /// \param LA 2265 /// An integer that specifies the length of the string in \a A. 2266 /// \param B 2267 /// A 128-bit integer vector containing one of the source operands to be 2268 /// compared. 2269 /// \param LB 2270 /// An integer that specifies the length of the string in \a B. 2271 /// \param M 2272 /// An 8-bit immediate operand specifying whether the characters are bytes or 2273 /// words and the type of comparison to perform. \n 2274 /// Bits [1:0]: Determine source data format. \n 2275 /// 00: 16 unsigned bytes \n 2276 /// 01: 8 unsigned words \n 2277 /// 10: 16 signed bytes \n 2278 /// 11: 8 signed words \n 2279 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2280 /// 00: Subset: Each character in \a B is compared for equality with all 2281 /// the characters in \a A. \n 2282 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2283 /// basis is greater than or equal for even-indexed elements in \a A, 2284 /// and less than or equal for odd-indexed elements in \a A. 
\n 2285 /// 10: Match: Compare each pair of corresponding characters in \a A and 2286 /// \a B for equality. \n 2287 /// 11: Substring: Search \a B for substring matches of \a A. \n 2288 /// Bits [5:4]: Determine whether to perform a one's complement in the bit 2289 /// mask of the comparison results. \n 2290 /// 00: No effect. \n 2291 /// 01: Negate the bit mask. \n 2292 /// 10: No effect. \n 2293 /// 11: Negate the bit mask only for bits with an index less than or equal 2294 /// to the size of \a A or \a B. \n 2295 /// \returns Returns 1 if the length of the string in \a A is less than the 2296 /// maximum, otherwise, returns 0. 2297 #define _mm_cmpestrs(A, LA, B, LB, M) \ 2298 (int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ 2299 (__v16qi)(__m128i)(B), (int)(LB), \ 2300 (int)(M)) 2301 2302 /// \brief Uses the immediate operand \a M to perform a comparison of string 2303 /// data with explicitly defined lengths that is contained in source operands 2304 /// \a A and \a B. Returns 1 if the length of the string in \a B is less than 2305 /// the maximum, otherwise, returns 0. 2306 /// 2307 /// \headerfile <x86intrin.h> 2308 /// 2309 /// \code 2310 /// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M); 2311 /// \endcode 2312 /// 2313 /// This intrinsic corresponds to the <c> VPCMPESTRI </c> instruction. 2314 /// 2315 /// \param A 2316 /// A 128-bit integer vector containing one of the source operands to be 2317 /// compared. 2318 /// \param LA 2319 /// An integer that specifies the length of the string in \a A. 2320 /// \param B 2321 /// A 128-bit integer vector containing one of the source operands to be 2322 /// compared. 2323 /// \param LB 2324 /// An integer that specifies the length of the string in \a B. 2325 /// \param M 2326 /// An 8-bit immediate operand specifying whether the characters are bytes or 2327 /// words and the type of comparison to perform. \n 2328 /// Bits [1:0]: Determine source data format. 
\n 2329 /// 00: 16 unsigned bytes \n 2330 /// 01: 8 unsigned words \n 2331 /// 10: 16 signed bytes \n 2332 /// 11: 8 signed words \n 2333 /// Bits [3:2]: Determine comparison type and aggregation method. \n 2334 /// 00: Subset: Each character in \a B is compared for equality with all 2335 /// the characters in \a A. \n 2336 /// 01: Ranges: Each character in \a B is compared to \a A. The comparison 2337 /// basis is greater than or equal for even-indexed elements in \a A, 2338 /// and less than or equal for odd-indexed elements in \a A. \n 2339 /// 10: Match: Compare each pair of corresponding characters in \a A and 2340 /// \a B for equality. \n 2341 /// 11: Substring: Search \a B for substring matches of \a A. \n 2342 /// Bits [5:4]: Determine whether to perform a one's complement on the bit 2343 /// mask of the comparison results. \n 2344 /// 00: No effect. \n 2345 /// 01: Negate the bit mask. \n 2346 /// 10: No effect. \n 2347 /// 11: Negate the bit mask only for bits with an index less than or equal 2348 /// to the size of \a A or \a B. 2349 /// \returns Returns 1 if the length of the string in \a B is less than the 2350 /// maximum, otherwise, returns 0. 2351 #define _mm_cmpestrz(A, LA, B, LB, M) \ 2352 (int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ 2353 (__v16qi)(__m128i)(B), (int)(LB), \ 2354 (int)(M)) 2355 2356 /* SSE4.2 Compare Packed Data -- Greater Than. */ 2357 /// \brief Compares each of the corresponding 64-bit values of the 128-bit 2358 /// integer vectors to determine if the values in the first operand are 2359 /// greater than those in the second operand. 2360 /// 2361 /// \headerfile <x86intrin.h> 2362 /// 2363 /// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction. 2364 /// 2365 /// \param __V1 2366 /// A 128-bit integer vector. 2367 /// \param __V2 2368 /// A 128-bit integer vector. 2369 /// \returns A 128-bit integer vector containing the comparison results. 
2370 static __inline__ __m128i __DEFAULT_FN_ATTRS 2371 _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) 2372 { 2373 return (__m128i)((__v2di)__V1 > (__v2di)__V2); 2374 } 2375 2376 /* SSE4.2 Accumulate CRC32. */ 2377 /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the 2378 /// unsigned char operand. 2379 /// 2380 /// \headerfile <x86intrin.h> 2381 /// 2382 /// This intrinsic corresponds to the <c> CRC32B </c> instruction. 2383 /// 2384 /// \param __C 2385 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2386 /// \a __D. 2387 /// \param __D 2388 /// An unsigned 8-bit integer operand used to compute the CRC-32C checksum. 2389 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2390 /// operand \a __D. 2391 static __inline__ unsigned int __DEFAULT_FN_ATTRS 2392 _mm_crc32_u8(unsigned int __C, unsigned char __D) 2393 { 2394 return __builtin_ia32_crc32qi(__C, __D); 2395 } 2396 2397 /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the 2398 /// unsigned short operand. 2399 /// 2400 /// \headerfile <x86intrin.h> 2401 /// 2402 /// This intrinsic corresponds to the <c> CRC32W </c> instruction. 2403 /// 2404 /// \param __C 2405 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2406 /// \a __D. 2407 /// \param __D 2408 /// An unsigned 16-bit integer operand used to compute the CRC-32C checksum. 2409 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2410 /// operand \a __D. 2411 static __inline__ unsigned int __DEFAULT_FN_ATTRS 2412 _mm_crc32_u16(unsigned int __C, unsigned short __D) 2413 { 2414 return __builtin_ia32_crc32hi(__C, __D); 2415 } 2416 2417 /// \brief Adds the first unsigned integer operand to the CRC-32C checksum of 2418 /// the second unsigned integer operand. 2419 /// 2420 /// \headerfile <x86intrin.h> 2421 /// 2422 /// This intrinsic corresponds to the <c> CRC32L </c> instruction. 
2423 /// 2424 /// \param __C 2425 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2426 /// \a __D. 2427 /// \param __D 2428 /// An unsigned 32-bit integer operand used to compute the CRC-32C checksum. 2429 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2430 /// operand \a __D. 2431 static __inline__ unsigned int __DEFAULT_FN_ATTRS 2432 _mm_crc32_u32(unsigned int __C, unsigned int __D) 2433 { 2434 return __builtin_ia32_crc32si(__C, __D); 2435 } 2436 2437 #ifdef __x86_64__ 2438 /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the 2439 /// unsigned 64-bit integer operand. 2440 /// 2441 /// \headerfile <x86intrin.h> 2442 /// 2443 /// This intrinsic corresponds to the <c> CRC32Q </c> instruction. 2444 /// 2445 /// \param __C 2446 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2447 /// \a __D. 2448 /// \param __D 2449 /// An unsigned 64-bit integer operand used to compute the CRC-32C checksum. 2450 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2451 /// operand \a __D. 2452 static __inline__ unsigned long long __DEFAULT_FN_ATTRS 2453 _mm_crc32_u64(unsigned long long __C, unsigned long long __D) 2454 { 2455 return __builtin_ia32_crc32di(__C, __D); 2456 } 2457 #endif /* __x86_64__ */ 2458 2459 #undef __DEFAULT_FN_ATTRS 2460 2461 #ifdef __POPCNT__ 2462 #include <popcntintrin.h> 2463 #endif 2464 2465 #endif /* _SMMINTRIN_H */ 2466