1 /*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef _SMMINTRIN_H 25 #define _SMMINTRIN_H 26 27 #include <tmmintrin.h> 28 29 /* Define the default attributes for the functions in this file. */ 30 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"))) 31 32 /* SSE4 Rounding macros. 
*/ 33 #define _MM_FROUND_TO_NEAREST_INT 0x00 34 #define _MM_FROUND_TO_NEG_INF 0x01 35 #define _MM_FROUND_TO_POS_INF 0x02 36 #define _MM_FROUND_TO_ZERO 0x03 37 #define _MM_FROUND_CUR_DIRECTION 0x04 38 39 #define _MM_FROUND_RAISE_EXC 0x00 40 #define _MM_FROUND_NO_EXC 0x08 41 42 #define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) 43 #define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) 44 #define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) 45 #define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) 46 #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) 47 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) 48 49 /// \brief Rounds up each element of the 128-bit vector of [4 x float] to an 50 /// integer and returns the rounded values in a 128-bit vector of 51 /// [4 x float]. 52 /// 53 /// \headerfile <x86intrin.h> 54 /// 55 /// \code 56 /// __m128 _mm_ceil_ps(__m128 X); 57 /// \endcode 58 /// 59 /// This intrinsic corresponds to the <c> <i> VROUNDPS / ROUNDPS </i> </c> 60 /// instruction. 61 /// 62 /// \param X 63 /// A 128-bit vector of [4 x float] values to be rounded up. 64 /// \returns A 128-bit vector of [4 x float] containing the rounded values. 65 #define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) 66 67 /// \brief Rounds up each element of the 128-bit vector of [2 x double] to an 68 /// integer and returns the rounded values in a 128-bit vector of 69 /// [2 x double]. 70 /// 71 /// \headerfile <x86intrin.h> 72 /// 73 /// \code 74 /// __m128d _mm_ceil_pd(__m128d X); 75 /// \endcode 76 /// 77 /// This intrinsic corresponds to the <c> <i> VROUNDPD / ROUNDPD </i> </c> 78 /// instruction. 79 /// 80 /// \param X 81 /// A 128-bit vector of [2 x double] values to be rounded up. 82 /// \returns A 128-bit vector of [2 x double] containing the rounded values. 
#define _mm_ceil_pd(X) \
  _mm_round_pd((X), _MM_FROUND_CEIL)

/// \brief Copies the three upper elements of the first 128-bit vector operand
///    to the corresponding elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand up to an integer value and stores it in the lowest element of
///    the result vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VROUNDSS / ROUNDSS </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_ceil_ss(X, Y) \
  _mm_round_ss((X), (Y), _MM_FROUND_CEIL)

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    upper element of the 128-bit result vector of [2 x double]. Rounds the
///    lower element of the second 128-bit vector operand up to an integer
///    value and stores it in the lower element of the result vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VROUNDSD / ROUNDSD </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded up to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_ceil_sd(X, Y) \
  _mm_round_sd((X), (Y), _MM_FROUND_CEIL)

/// \brief Rounds each element of a 128-bit vector of [4 x float] down to an
///    integer value and returns the results in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ps(__m128 X);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VROUNDPS / ROUNDPS </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float] values to be rounded down.
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
#define _mm_floor_ps(X) \
  _mm_round_ps((X), _MM_FROUND_FLOOR)

/// \brief Rounds each element of a 128-bit vector of [2 x double] down to an
///    integer value and returns the results in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_pd(__m128d X);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VROUNDPD / ROUNDPD </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [2 x double] values to be rounded down.
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
#define _mm_floor_pd(X) \
  _mm_round_pd((X), _MM_FROUND_FLOOR)

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
///    operand to an integer and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VROUNDSS / ROUNDSS </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
#define _mm_floor_ss(X, Y) \
  _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    upper element of the 128-bit result vector of [2 x double]. Rounds the
///    lower element of the second 128-bit vector operand down to an integer
///    value and stores it in the lower element of the result vector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VROUNDSD / ROUNDSD </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded down to the nearest integer and copied to the corresponding bits
///    of the result.
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
#define _mm_floor_sd(X, Y) \
  _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)

/// \brief Rounds each element of the 128-bit vector of [4 x float] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ps(__m128 X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VROUNDPS / ROUNDPS </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the rounded values.
/* Plain parenthesized expression rather than a GNU statement expression, so
   the macro can also be used where statement expressions are rejected
   (for example at file scope). */
#define _mm_round_ps(X, M) \
  ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))

/// \brief Copies three upper elements of the first 128-bit vector operand to
///    the corresponding three upper elements of the 128-bit result vector of
///    [4 x float]. Rounds the lowest element of the second 128-bit vector
///    operand to an integer value according to the rounding control specified
///    by the third argument and copies it to the lowest element of the 128-bit
///    result vector of [4 x float].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VROUNDSS / ROUNDSS </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
///    values.
/* Plain parenthesized expression rather than a GNU statement expression, so
   the macro can also be used where statement expressions are rejected
   (for example at file scope). */
#define _mm_round_ss(X, Y, M) \
  ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
                                  (__v4sf)(__m128)(Y), (M)))

/// \brief Rounds each element of the 128-bit vector of [2 x double] to an
///    integer value according to the rounding control specified by the second
///    argument and returns the rounded values in a 128-bit vector of
///    [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_pd(__m128d X, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VROUNDPD / ROUNDPD </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [2 x double].
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the rounded values.
/* Plain parenthesized expression rather than a GNU statement expression, so
   the macro can also be used where statement expressions are rejected
   (for example at file scope). */
#define _mm_round_pd(X, M) \
  ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))

/// \brief Copies the upper element of the first 128-bit vector operand to the
///    corresponding upper element of the 128-bit result vector of [2 x double].
///    Rounds the lower element of the second 128-bit vector operand to an
///    integer value according to the rounding control specified by the third
///    argument and copies it to the lower element of the 128-bit result vector
///    of [2 x double].
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VROUNDSD / ROUNDSD </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
///    copied to the corresponding bits of the result.
/// \param Y
///    A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
///    rounded to the nearest integer using the specified rounding control and
///    copied to the corresponding bits of the result.
/// \param M
///    An integer value that specifies the rounding operation. \n
///    Bits [7:4] are reserved. \n
///    Bit [3] is a precision exception value: \n
///      0: A normal PE exception is used \n
///      1: The PE field is not updated \n
///    Bit [2] is the rounding control source: \n
///      0: Use bits [1:0] of \a M \n
///      1: Use the current MXCSR setting \n
///    Bits [1:0] contain the rounding control definition: \n
///      00: Nearest \n
///      01: Downward (toward negative infinity) \n
///      10: Upward (toward positive infinity) \n
///      11: Truncated
/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
///    values.
/* Plain parenthesized expression rather than a GNU statement expression, so
   the macro can also be used where statement expressions are rejected
   (for example at file scope). */
#define _mm_round_sd(X, Y, M) \
  ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
                                   (__v2df)(__m128d)(Y), (M)))

/* SSE4 Packed Blending Intrinsics. */
/// \brief Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VBLENDPD / BLENDPD </i> </c>
/// instruction.
///
/// \param V1
///    A 128-bit vector of [2 x double].
/// \param V2
///    A 128-bit vector of [2 x double].
/// \param M
///    An immediate integer operand, with mask bits [1:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 64-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [2 x double] containing the copied values.
/* Plain parenthesized expression rather than a GNU statement expression, so
   the macro can also be used where statement expressions are rejected
   (for example at file scope). For each result element, a clear mask bit
   selects the shuffle index of the V1 element (0..1) and a set mask bit
   selects the index of the matching V2 element (2..3). */
#define _mm_blend_pd(V1, V2, M) \
  ((__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
                                    (__v2df)(__m128d)(V2), \
                                    (((M) & 0x01) ? 2 : 0), \
                                    (((M) & 0x02) ? 3 : 1)))

/// \brief Returns a 128-bit vector of [4 x float] where the values are selected
///    from either the first or second operand as specified by the third
///    operand, the control mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VBLENDPS / BLENDPS </i> </c>
/// instruction.
///
/// \param V1
///    A 128-bit vector of [4 x float].
/// \param V2
///    A 128-bit vector of [4 x float].
/// \param M
///    An immediate integer operand, with mask bits [3:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 32-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [4 x float] containing the copied values.
/* Plain parenthesized expression rather than a GNU statement expression, so
   the macro can also be used where statement expressions are rejected
   (for example at file scope). For each result element, a clear mask bit
   selects the shuffle index of the V1 element (0..3) and a set mask bit
   selects the index of the matching V2 element (4..7). */
#define _mm_blend_ps(V1, V2, M) \
  ((__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), \
                                   (__v4sf)(__m128)(V2), \
                                   (((M) & 0x01) ? 4 : 0), \
                                   (((M) & 0x02) ? 5 : 1), \
                                   (((M) & 0x04) ? 6 : 2), \
                                   (((M) & 0x08) ? 7 : 3)))

/// \brief Returns a 128-bit vector of [2 x double] where the values are
///    selected from either the first or second operand as specified by the
///    third operand, the control mask.
447 /// 448 /// \headerfile <x86intrin.h> 449 /// 450 /// This intrinsic corresponds to the <c> <i> VBLENDVPD / BLENDVPD </i> </c> 451 /// instruction. 452 /// 453 /// \param __V1 454 /// A 128-bit vector of [2 x double]. 455 /// \param __V2 456 /// A 128-bit vector of [2 x double]. 457 /// \param __M 458 /// A 128-bit vector operand, with mask bits 127 and 63 specifying how the 459 /// values are to be copied. The position of the mask bit corresponds to the 460 /// most significant bit of a copied value. When a mask bit is 0, the 461 /// corresponding 64-bit element in operand \a __V1 is copied to the same 462 /// position in the result. When a mask bit is 1, the corresponding 64-bit 463 /// element in operand \a __V2 is copied to the same position in the result. 464 /// \returns A 128-bit vector of [2 x double] containing the copied values. 465 static __inline__ __m128d __DEFAULT_FN_ATTRS 466 _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) 467 { 468 return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, 469 (__v2df)__M); 470 } 471 472 /// \brief Returns a 128-bit vector of [4 x float] where the values are 473 /// selected from either the first or second operand as specified by the 474 /// third operand, the control mask. 475 /// 476 /// \headerfile <x86intrin.h> 477 /// 478 /// This intrinsic corresponds to the <c> <i> VBLENDVPS / BLENDVPS </i> </c> 479 /// instruction. 480 /// 481 /// \param __V1 482 /// A 128-bit vector of [4 x float]. 483 /// \param __V2 484 /// A 128-bit vector of [4 x float]. 485 /// \param __M 486 /// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying 487 /// how the values are to be copied. The position of the mask bit corresponds 488 /// to the most significant bit of a copied value. When a mask bit is 0, the 489 /// corresponding 32-bit element in operand \a __V1 is copied to the same 490 /// position in the result. 
When a mask bit is 1, the corresponding 32-bit 491 /// element in operand \a __V2 is copied to the same position in the result. 492 /// \returns A 128-bit vector of [4 x float] containing the copied values. 493 static __inline__ __m128 __DEFAULT_FN_ATTRS 494 _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) 495 { 496 return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, 497 (__v4sf)__M); 498 } 499 500 /// \brief Returns a 128-bit vector of [16 x i8] where the values are selected 501 /// from either of the first or second operand as specified by the third 502 /// operand, the control mask. 503 /// 504 /// \headerfile <x86intrin.h> 505 /// 506 /// This intrinsic corresponds to the <c> <i> VPBLENDVB / PBLENDVB </i> </c> 507 /// instruction. 508 /// 509 /// \param __V1 510 /// A 128-bit vector of [16 x i8]. 511 /// \param __V2 512 /// A 128-bit vector of [16 x i8]. 513 /// \param __M 514 /// A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying 515 /// how the values are to be copied. The position of the mask bit corresponds 516 /// to the most significant bit of a copied value. When a mask bit is 0, the 517 /// corresponding 8-bit element in operand \a __V1 is copied to the same 518 /// position in the result. When a mask bit is 1, the corresponding 8-bit 519 /// element in operand \a __V2 is copied to the same position in the result. 520 /// \returns A 128-bit vector of [16 x i8] containing the copied values. 521 static __inline__ __m128i __DEFAULT_FN_ATTRS 522 _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) 523 { 524 return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, 525 (__v16qi)__M); 526 } 527 528 /// \brief Returns a 128-bit vector of [8 x i16] where the values are selected 529 /// from either of the first or second operand as specified by the third 530 /// operand, the control mask. 
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPBLENDW / PBLENDW </i> </c>
/// instruction.
///
/// \param V1
///    A 128-bit vector of [8 x i16].
/// \param V2
///    A 128-bit vector of [8 x i16].
/// \param M
///    An immediate integer operand, with mask bits [7:0] specifying how the
///    values are to be copied. The position of the mask bit corresponds to the
///    index of a copied value. When a mask bit is 0, the corresponding 16-bit
///    element in operand \a V1 is copied to the same position in the result.
///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
///    is copied to the same position in the result.
/// \returns A 128-bit vector of [8 x i16] containing the copied values.
/* Plain parenthesized expression rather than a GNU statement expression, so
   the macro can also be used where statement expressions are rejected
   (for example at file scope). For each result element, a clear mask bit
   selects the shuffle index of the V1 element (0..7) and a set mask bit
   selects the index of the matching V2 element (8..15). */
#define _mm_blend_epi16(V1, V2, M) \
  ((__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
                                    (__v8hi)(__m128i)(V2), \
                                    (((M) & 0x01) ?  8 : 0), \
                                    (((M) & 0x02) ?  9 : 1), \
                                    (((M) & 0x04) ? 10 : 2), \
                                    (((M) & 0x08) ? 11 : 3), \
                                    (((M) & 0x10) ? 12 : 4), \
                                    (((M) & 0x20) ? 13 : 5), \
                                    (((M) & 0x40) ? 14 : 6), \
                                    (((M) & 0x80) ? 15 : 7)))

/* SSE4 Dword Multiply Instructions. */
/// \brief Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
///    and returns the lower 32 bits of each product in a 128-bit vector of
///    [4 x i32].
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> <i> VPMULLD / PMULLD </i> </c>
/// instruction.
///
/// \param __V1
///    A 128-bit integer vector.
/// \param __V2
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the products of both operands.
580 static __inline__ __m128i __DEFAULT_FN_ATTRS 581 _mm_mullo_epi32 (__m128i __V1, __m128i __V2) 582 { 583 return (__m128i) ((__v4su)__V1 * (__v4su)__V2); 584 } 585 586 /// \brief Multiplies corresponding even-indexed elements of two 128-bit 587 /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] 588 /// containing the products. 589 /// 590 /// \headerfile <x86intrin.h> 591 /// 592 /// This intrinsic corresponds to the <c> <i> VPMULDQ / PMULDQ </i> </c> 593 /// instruction. 594 /// 595 /// \param __V1 596 /// A 128-bit vector of [4 x i32]. 597 /// \param __V2 598 /// A 128-bit vector of [4 x i32]. 599 /// \returns A 128-bit vector of [2 x i64] containing the products of both 600 /// operands. 601 static __inline__ __m128i __DEFAULT_FN_ATTRS 602 _mm_mul_epi32 (__m128i __V1, __m128i __V2) 603 { 604 return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); 605 } 606 607 /* SSE4 Floating Point Dot Product Instructions. */ 608 /// \brief Computes the dot product of the two 128-bit vectors of [4 x float] 609 /// and returns it in the elements of the 128-bit result vector of 610 /// [4 x float]. The immediate integer operand controls which input elements 611 /// will contribute to the dot product, and where the final results are 612 /// returned. 613 /// 614 /// \headerfile <x86intrin.h> 615 /// 616 /// \code 617 /// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); 618 /// \endcode 619 /// 620 /// This intrinsic corresponds to the <c> <i> VDPPS / DPPS </i> </c> 621 /// instruction. 622 /// 623 /// \param X 624 /// A 128-bit vector of [4 x float]. 625 /// \param Y 626 /// A 128-bit vector of [4 x float]. 627 /// \param M 628 /// An immediate integer operand. Mask bits [7:4] determine which elements 629 /// of the input vectors are used, with bit [4] corresponding to the lowest 630 /// element and bit [7] corresponding to the highest element of each [4 x 631 /// float] vector. 
If a bit is set, the corresponding elements from the two 632 /// input vectors are used as an input for dot product; otherwise that input 633 /// is treated as zero. Bits [3:0] determine which elements of the result 634 /// will receive a copy of the final dot product, with bit [0] corresponding 635 /// to the lowest element and bit [3] corresponding to the highest element of 636 /// each [4 x float] subvector. If a bit is set, the dot product is returned 637 /// in the corresponding element; otherwise that element is set to zero. 638 /// \returns A 128-bit vector of [4 x float] containing the dot product. 639 #define _mm_dp_ps(X, Y, M) __extension__ ({ \ 640 (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ 641 (__v4sf)(__m128)(Y), (M)); }) 642 643 /// \brief Computes the dot product of the two 128-bit vectors of [2 x double] 644 /// and returns it in the elements of the 128-bit result vector of 645 /// [2 x double]. The immediate integer operand controls which input 646 /// elements will contribute to the dot product, and where the final results 647 /// are returned. 648 /// 649 /// \headerfile <x86intrin.h> 650 /// 651 /// \code 652 /// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M); 653 /// \endcode 654 /// 655 /// This intrinsic corresponds to the <c> <i> VDPPD / DPPD </i> </c> 656 /// instruction. 657 /// 658 /// \param X 659 /// A 128-bit vector of [2 x double]. 660 /// \param Y 661 /// A 128-bit vector of [2 x double]. 662 /// \param M 663 /// An immediate integer operand. Mask bits [5:4] determine which elements 664 /// of the input vectors are used, with bit [4] corresponding to the lowest 665 /// element and bit [5] corresponding to the highest element of each of [2 x 666 /// double] vector. If a bit is set, the corresponding elements from the two 667 /// input vectors are used as an input for dot product; otherwise that input 668 /// is treated as zero. 
Bits [1:0] determine which elements of the result 669 /// will receive a copy of the final dot product, with bit [0] corresponding 670 /// to the lowest element and bit [3] corresponding to the highest element of 671 /// each [2 x double] vector. If a bit is set, the dot product is returned in 672 /// the corresponding element; otherwise that element is set to zero. 673 #define _mm_dp_pd(X, Y, M) __extension__ ({\ 674 (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ 675 (__v2df)(__m128d)(Y), (M)); }) 676 677 /* SSE4 Streaming Load Hint Instruction. */ 678 /// \brief Loads integer values from a 128-bit aligned memory location to a 679 /// 128-bit integer vector. 680 /// 681 /// \headerfile <x86intrin.h> 682 /// 683 /// This intrinsic corresponds to the <c> <i> VMOVNTDQA / MOVNTDQA </i> </c> 684 /// instruction. 685 /// 686 /// \param __V 687 /// A pointer to a 128-bit aligned memory location that contains the integer 688 /// values. 689 /// \returns A 128-bit integer vector containing the data stored at the 690 /// specified memory location. 691 static __inline__ __m128i __DEFAULT_FN_ATTRS 692 _mm_stream_load_si128 (__m128i const *__V) 693 { 694 return (__m128i) __builtin_ia32_movntdqa ((const __v2di *) __V); 695 } 696 697 /* SSE4 Packed Integer Min/Max Instructions. */ 698 /// \brief Compares the corresponding elements of two 128-bit vectors of 699 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser 700 /// of the two values. 701 /// 702 /// \headerfile <x86intrin.h> 703 /// 704 /// This intrinsic corresponds to the <c> <i> VPMINSB / PMINSB </i> </c> 705 /// instruction. 706 /// 707 /// \param __V1 708 /// A 128-bit vector of [16 x i8]. 709 /// \param __V2 710 /// A 128-bit vector of [16 x i8] 711 /// \returns A 128-bit vector of [16 x i8] containing the lesser values. 
712 static __inline__ __m128i __DEFAULT_FN_ATTRS 713 _mm_min_epi8 (__m128i __V1, __m128i __V2) 714 { 715 return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); 716 } 717 718 /// \brief Compares the corresponding elements of two 128-bit vectors of 719 /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the 720 /// greater value of the two. 721 /// 722 /// \headerfile <x86intrin.h> 723 /// 724 /// This intrinsic corresponds to the <c> <i> VPMAXSB / PMAXSB </i> </c> 725 /// instruction. 726 /// 727 /// \param __V1 728 /// A 128-bit vector of [16 x i8]. 729 /// \param __V2 730 /// A 128-bit vector of [16 x i8]. 731 /// \returns A 128-bit vector of [16 x i8] containing the greater values. 732 static __inline__ __m128i __DEFAULT_FN_ATTRS 733 _mm_max_epi8 (__m128i __V1, __m128i __V2) 734 { 735 return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); 736 } 737 738 /// \brief Compares the corresponding elements of two 128-bit vectors of 739 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser 740 /// value of the two. 741 /// 742 /// \headerfile <x86intrin.h> 743 /// 744 /// This intrinsic corresponds to the <c> <i> VPMINUW / PMINUW </i> </c> 745 /// instruction. 746 /// 747 /// \param __V1 748 /// A 128-bit vector of [8 x u16]. 749 /// \param __V2 750 /// A 128-bit vector of [8 x u16]. 751 /// \returns A 128-bit vector of [8 x u16] containing the lesser values. 752 static __inline__ __m128i __DEFAULT_FN_ATTRS 753 _mm_min_epu16 (__m128i __V1, __m128i __V2) 754 { 755 return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); 756 } 757 758 /// \brief Compares the corresponding elements of two 128-bit vectors of 759 /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the 760 /// greater value of the two. 761 /// 762 /// \headerfile <x86intrin.h> 763 /// 764 /// This intrinsic corresponds to the <c> <i> VPMAXUW / PMAXUW </i> </c> 765 /// instruction. 
766 /// 767 /// \param __V1 768 /// A 128-bit vector of [8 x u16]. 769 /// \param __V2 770 /// A 128-bit vector of [8 x u16]. 771 /// \returns A 128-bit vector of [8 x u16] containing the greater values. 772 static __inline__ __m128i __DEFAULT_FN_ATTRS 773 _mm_max_epu16 (__m128i __V1, __m128i __V2) 774 { 775 return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); 776 } 777 778 /// \brief Compares the corresponding elements of two 128-bit vectors of 779 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser 780 /// value of the two. 781 /// 782 /// \headerfile <x86intrin.h> 783 /// 784 /// This intrinsic corresponds to the <c> <i> VPMINSD / PMINSD </i> </c> 785 /// instruction. 786 /// 787 /// \param __V1 788 /// A 128-bit vector of [4 x i32]. 789 /// \param __V2 790 /// A 128-bit vector of [4 x i32]. 791 /// \returns A 128-bit vector of [4 x i32] containing the lesser values. 792 static __inline__ __m128i __DEFAULT_FN_ATTRS 793 _mm_min_epi32 (__m128i __V1, __m128i __V2) 794 { 795 return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); 796 } 797 798 /// \brief Compares the corresponding elements of two 128-bit vectors of 799 /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the 800 /// greater value of the two. 801 /// 802 /// \headerfile <x86intrin.h> 803 /// 804 /// This intrinsic corresponds to the <c> <i> VPMAXSD / PMAXSD </i> </c> 805 /// instruction. 806 /// 807 /// \param __V1 808 /// A 128-bit vector of [4 x i32]. 809 /// \param __V2 810 /// A 128-bit vector of [4 x i32]. 811 /// \returns A 128-bit vector of [4 x i32] containing the greater values. 
812 static __inline__ __m128i __DEFAULT_FN_ATTRS 813 _mm_max_epi32 (__m128i __V1, __m128i __V2) 814 { 815 return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); 816 } 817 818 /// \brief Compares the corresponding elements of two 128-bit vectors of 819 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser 820 /// value of the two. 821 /// 822 /// \headerfile <x86intrin.h> 823 /// 824 /// This intrinsic corresponds to the <c> <i> VPMINUD / PMINUD </i> </c> 825 /// instruction. 826 /// 827 /// \param __V1 828 /// A 128-bit vector of [4 x u32]. 829 /// \param __V2 830 /// A 128-bit vector of [4 x u32]. 831 /// \returns A 128-bit vector of [4 x u32] containing the lesser values. 832 static __inline__ __m128i __DEFAULT_FN_ATTRS 833 _mm_min_epu32 (__m128i __V1, __m128i __V2) 834 { 835 return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); 836 } 837 838 /// \brief Compares the corresponding elements of two 128-bit vectors of 839 /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the 840 /// greater value of the two. 841 /// 842 /// \headerfile <x86intrin.h> 843 /// 844 /// This intrinsic corresponds to the <c> <i> VPMAXUD / PMAXUD </i> </c> 845 /// instruction. 846 /// 847 /// \param __V1 848 /// A 128-bit vector of [4 x u32]. 849 /// \param __V2 850 /// A 128-bit vector of [4 x u32]. 851 /// \returns A 128-bit vector of [4 x u32] containing the greater values. 852 static __inline__ __m128i __DEFAULT_FN_ATTRS 853 _mm_max_epu32 (__m128i __V1, __m128i __V2) 854 { 855 return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2); 856 } 857 858 /* SSE4 Insertion and Extraction from XMM Register Instructions. */ 859 /// \brief Takes the first argument \a X and inserts an element from the second 860 /// argument \a Y as selected by the third argument \a N. That result then 861 /// has elements zeroed out also as selected by the third argument \a N. 
The 862 /// resulting 128-bit vector of [4 x float] is then returned. 863 /// 864 /// \headerfile <x86intrin.h> 865 /// 866 /// \code 867 /// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); 868 /// \endcode 869 /// 870 /// This intrinsic corresponds to the <c> <i> VINSERTPS </i> </c> instruction. 871 /// 872 /// \param X 873 /// A 128-bit vector source operand of [4 x float]. With the exception of 874 /// those bits in the result copied from parameter \a Y and zeroed by bits 875 /// [3:0] of \a N, all bits from this parameter are copied to the result. 876 /// \param Y 877 /// A 128-bit vector source operand of [4 x float]. One single-precision 878 /// floating-point element from this source, as determined by the immediate 879 /// parameter, is copied to the result. 880 /// \param N 881 /// Specifies which bits from operand \a Y will be copied, which bits in the 882 /// result they will be be copied to, and which bits in the result will be 883 /// cleared. The following assignments are made: \n 884 /// Bits [7:6] specify the bits to copy from operand \a Y: \n 885 /// 00: Selects bits [31:0] from operand \a Y. \n 886 /// 01: Selects bits [63:32] from operand \a Y. \n 887 /// 10: Selects bits [95:64] from operand \a Y. \n 888 /// 11: Selects bits [127:96] from operand \a Y. \n 889 /// Bits [5:4] specify the bits in the result to which the selected bits 890 /// from operand \a Y are copied: \n 891 /// 00: Copies the selected bits from \a Y to result bits [31:0]. \n 892 /// 01: Copies the selected bits from \a Y to result bits [63:32]. \n 893 /// 10: Copies the selected bits from \a Y to result bits [95:64]. \n 894 /// 11: Copies the selected bits from \a Y to result bits [127:96]. \n 895 /// Bits[3:0]: If any of these bits are set, the corresponding result 896 /// element is cleared. 897 /// \returns A 128-bit vector of [4 x float] containing the copied single- 898 /// precision floating point elements from the operands. 
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))

/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
///    returns it, using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_ps(__m128 X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VEXTRACTPS / EXTRACTPS </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit vector of [4 x float].
/// \param N
///    An immediate value. Bits [1:0] determines which bits from the argument
///    \a X are extracted and returned: \n
///    00: Bits [31:0] of parameter \a X are returned. \n
///    01: Bits [63:32] of parameter \a X are returned. \n
///    10: Bits [95:64] of parameter \a X are returned. \n
///    11: Bits [127:96] of parameter \a X are returned.
/// \returns A 32-bit integer containing the extracted 32 bits of float data.
#define _mm_extract_ps(X, N) (__extension__                      \
                              ({ union { int __i; float __f; } __t;  \
                                 __v4sf __a = (__v4sf)(__m128)(X);       \
                                 __t.__f = __a[(N) & 3];                 \
                                 __t.__i;}))

/* Miscellaneous insert and extract macros.  */
/* Extract a single-precision float from X at index N into D.  Only bits [1:0]
   of N select the element, exactly as in _mm_extract_ps, so the vector is
   never subscripted out of range.  X goes through the same (__m128) cast the
   sibling macros use.  */
#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__                      \
                                    ({ __v4sf __a = (__v4sf)(__m128)(X); \
                                       (D) = __a[(N) & 3]; }))

/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
   an index suitable for _mm_insert_ps.  */
#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z))

/* Extract a float from X at index N into the first index of the return.  */
#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X),   \
                                             _MM_MK_INSERTPS_NDX((N), 0, 0x0e))

/* Insert int into packed integer array at index.  */
/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
///    of an integer parameter \a I into an offset specified by the immediate
///    value parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPINSRB / PINSRB </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit integer vector of [16 x i8]. This vector is copied to the
///    result and then one of the sixteen elements in the result vector is
///    replaced by the lower 8 bits of \a I.
/// \param I
///    An integer. The lower 8 bits of this operand are written to the result
///    beginning at the offset specified by \a N.
/// \param N
///    An immediate value. Bits [3:0] specify the bit offset in the result at
///    which the lower 8 bits of \a I are written. \n
///    0000: Bits [7:0] of the result are used for insertion. \n
///    0001: Bits [15:8] of the result are used for insertion. \n
///    0010: Bits [23:16] of the result are used for insertion. \n
///    0011: Bits [31:24] of the result are used for insertion. \n
///    0100: Bits [39:32] of the result are used for insertion. \n
///    0101: Bits [47:40] of the result are used for insertion. \n
///    0110: Bits [55:48] of the result are used for insertion. \n
///    0111: Bits [63:56] of the result are used for insertion. \n
///    1000: Bits [71:64] of the result are used for insertion. \n
///    1001: Bits [79:72] of the result are used for insertion. \n
///    1010: Bits [87:80] of the result are used for insertion. \n
///    1011: Bits [95:88] of the result are used for insertion. \n
///    1100: Bits [103:96] of the result are used for insertion. \n
///    1101: Bits [111:104] of the result are used for insertion. \n
///    1110: Bits [119:112] of the result are used for insertion. \n
///    1111: Bits [127:120] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi8(X, I, N) (__extension__                           \
                                  ({ __v16qi __a = (__v16qi)(__m128i)(X); \
                                     __a[(N) & 15] = (I);                 \
                                     (__m128i)__a;}))

/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 32-bit
///    integer parameter \a I at the offset specified by the immediate value
///    parameter \a N.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPINSRD / PINSRD </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit integer vector of [4 x i32]. This vector is copied to the
///    result and then one of the four elements in the result vector is
///    replaced by \a I.
/// \param I
///    A 32-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bits [1:0] specify the bit offset in the result at
///    which the integer \a I is written. \n
///    00: Bits [31:0] of the result are used for insertion. \n
///    01: Bits [63:32] of the result are used for insertion. \n
///    10: Bits [95:64] of the result are used for insertion. \n
///    11: Bits [127:96] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi32(X, I, N) \
  (__extension__ ({ \
    __v4si __a = (__v4si)(__m128i)(X); \
    __a[(N) & 3] = (I); \
    (__m128i)__a; \
  }))

#ifdef __x86_64__
/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of
///    the 128-bit integer vector parameter, and then inserting the 64-bit
///    integer parameter \a I, using the immediate value parameter \a N as an
///    insertion location selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPINSRQ / PINSRQ </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit integer vector of [2 x i64]. This vector is copied to the
///    result and then one of the two elements in the result vector is replaced
///    by \a I.
/// \param I
///    A 64-bit integer that is written to the result beginning at the offset
///    specified by \a N.
/// \param N
///    An immediate value. Bit [0] specifies the bit offset in the result at
///    which the integer \a I is written. \n
///    0: Bits [63:0] of the result are used for insertion. \n
///    1: Bits [127:64] of the result are used for insertion.
/// \returns A 128-bit integer vector containing the constructed values.
#define _mm_insert_epi64(X, I, N) \
  (__extension__ ({ \
    __v2di __a = (__v2di)(__m128i)(X); \
    __a[(N) & 1] = (I); \
    (__m128i)__a; \
  }))
#endif /* __x86_64__ */

/* Extract int from packed integer array at index.  This returns the element
 * as a zero extended value, so it is unsigned.
 */
/// \brief Extracts an 8-bit element from the 128-bit integer vector of
///    [16 x i8], using the immediate value parameter \a N as a selector.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_extract_epi8(__m128i X, const int N);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPEXTRB / PEXTRB </i> </c>
/// instruction.
///
/// \param X
///    A 128-bit integer vector.
/// \param N
///    An immediate value. Bits [3:0] specify which 8-bit vector element
///    from the argument \a X to extract and copy to the result. \n
///    0000: Bits [7:0] of parameter \a X are extracted. \n
///    0001: Bits [15:8] of the parameter \a X are extracted. \n
///    0010: Bits [23:16] of the parameter \a X are extracted. \n
///    0011: Bits [31:24] of the parameter \a X are extracted. \n
///    0100: Bits [39:32] of the parameter \a X are extracted. \n
///    0101: Bits [47:40] of the parameter \a X are extracted. \n
///    0110: Bits [55:48] of the parameter \a X are extracted. \n
///    0111: Bits [63:56] of the parameter \a X are extracted. \n
///    1000: Bits [71:64] of the parameter \a X are extracted. \n
///    1001: Bits [79:72] of the parameter \a X are extracted. \n
///    1010: Bits [87:80] of the parameter \a X are extracted. \n
///    1011: Bits [95:88] of the parameter \a X are extracted. \n
///    1100: Bits [103:96] of the parameter \a X are extracted. \n
///    1101: Bits [111:104] of the parameter \a X are extracted. \n
///    1110: Bits [119:112] of the parameter \a X are extracted. \n
///    1111: Bits [127:120] of the parameter \a X are extracted.
/// \returns An unsigned integer, whose lower 8 bits are selected from the
///    128-bit integer vector parameter and the remaining bits are assigned
///    zeros.
1095 #define _mm_extract_epi8(X, N) (__extension__ \ 1096 ({ __v16qi __a = (__v16qi)(__m128i)(X); \ 1097 (int)(unsigned char) __a[(N) & 15];})) 1098 1099 /// \brief Extracts a 32-bit element from the 128-bit integer vector of 1100 /// [4 x i32], using the immediate value parameter \a N as a selector. 1101 /// 1102 /// \headerfile <x86intrin.h> 1103 /// 1104 /// \code 1105 /// int _mm_extract_epi32(__m128i X, const int N); 1106 /// \endcode 1107 /// 1108 /// This intrinsic corresponds to the <c> <i> VPEXTRD / PEXTRD </i> </c> 1109 /// instruction. 1110 /// 1111 /// \param X 1112 /// A 128-bit integer vector. 1113 /// \param N 1114 /// An immediate value. Bits [1:0] specify which 32-bit vector element 1115 /// from the argument \a X to extract and copy to the result. \n 1116 /// 00: Bits [31:0] of the parameter \a X are extracted. \n 1117 /// 01: Bits [63:32] of the parameter \a X are extracted. \n 1118 /// 10: Bits [95:64] of the parameter \a X are extracted. \n 1119 /// 11: Bits [127:96] of the parameter \a X are exracted. 1120 /// \returns An integer, whose lower 32 bits are selected from the 128-bit 1121 /// integer vector parameter and the remaining bits are assigned zeros. 1122 #define _mm_extract_epi32(X, N) (__extension__ \ 1123 ({ __v4si __a = (__v4si)(__m128i)(X); \ 1124 (int)__a[(N) & 3];})) 1125 #ifdef __x86_64__ 1126 /// \brief Extracts a 64-bit element from the 128-bit integer vector of 1127 /// [2 x i64], using the immediate value parameter \a N as a selector. 1128 /// 1129 /// \headerfile <x86intrin.h> 1130 /// 1131 /// \code 1132 /// long long _mm_extract_epi64(__m128i X, const int N); 1133 /// \endcode 1134 /// 1135 /// This intrinsic corresponds to the <c> <i> VPEXTRQ / PEXTRQ </i> </c> 1136 /// instruction. 1137 /// 1138 /// \param X 1139 /// A 128-bit integer vector. 1140 /// \param N 1141 /// An immediate value. Bit [0] specifies which 64-bit vector element 1142 /// from the argument \a X to return. \n 1143 /// 0: Bits [63:0] are returned. 
\n 1144 /// 1: Bits [127:64] are returned. \n 1145 /// \returns A 64-bit integer. 1146 #define _mm_extract_epi64(X, N) (__extension__ \ 1147 ({ __v2di __a = (__v2di)(__m128i)(X); \ 1148 (long long)__a[(N) & 1];})) 1149 #endif /* __x86_64 */ 1150 1151 /* SSE4 128-bit Packed Integer Comparisons. */ 1152 /// \brief Tests whether the specified bits in a 128-bit integer vector are all 1153 /// zeros. 1154 /// 1155 /// \headerfile <x86intrin.h> 1156 /// 1157 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c> 1158 /// instruction. 1159 /// 1160 /// \param __M 1161 /// A 128-bit integer vector containing the bits to be tested. 1162 /// \param __V 1163 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1164 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1165 static __inline__ int __DEFAULT_FN_ATTRS 1166 _mm_testz_si128(__m128i __M, __m128i __V) 1167 { 1168 return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); 1169 } 1170 1171 /// \brief Tests whether the specified bits in a 128-bit integer vector are all 1172 /// ones. 1173 /// 1174 /// \headerfile <x86intrin.h> 1175 /// 1176 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c> 1177 /// instruction. 1178 /// 1179 /// \param __M 1180 /// A 128-bit integer vector containing the bits to be tested. 1181 /// \param __V 1182 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1183 /// \returns TRUE if the specified bits are all ones; FALSE otherwise. 1184 static __inline__ int __DEFAULT_FN_ATTRS 1185 _mm_testc_si128(__m128i __M, __m128i __V) 1186 { 1187 return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); 1188 } 1189 1190 /// \brief Tests whether the specified bits in a 128-bit integer vector are 1191 /// neither all zeros nor all ones. 1192 /// 1193 /// \headerfile <x86intrin.h> 1194 /// 1195 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c> 1196 /// instruction. 
1197 /// 1198 /// \param __M 1199 /// A 128-bit integer vector containing the bits to be tested. 1200 /// \param __V 1201 /// A 128-bit integer vector selecting which bits to test in operand \a __M. 1202 /// \returns TRUE if the specified bits are neither all zeros nor all ones; 1203 /// FALSE otherwise. 1204 static __inline__ int __DEFAULT_FN_ATTRS 1205 _mm_testnzc_si128(__m128i __M, __m128i __V) 1206 { 1207 return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); 1208 } 1209 1210 /// \brief Tests whether the specified bits in a 128-bit integer vector are all 1211 /// ones. 1212 /// 1213 /// \headerfile <x86intrin.h> 1214 /// 1215 /// \code 1216 /// int _mm_test_all_ones(__m128i V); 1217 /// \endcode 1218 /// 1219 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c> 1220 /// instruction. 1221 /// 1222 /// \param V 1223 /// A 128-bit integer vector containing the bits to be tested. 1224 /// \returns TRUE if the bits specified in the operand are all set to 1; FALSE 1225 /// otherwise. 1226 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) 1227 1228 /// \brief Tests whether the specified bits in a 128-bit integer vector are 1229 /// neither all zeros nor all ones. 1230 /// 1231 /// \headerfile <x86intrin.h> 1232 /// 1233 /// \code 1234 /// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); 1235 /// \endcode 1236 /// 1237 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c> 1238 /// instruction. 1239 /// 1240 /// \param M 1241 /// A 128-bit integer vector containing the bits to be tested. 1242 /// \param V 1243 /// A 128-bit integer vector selecting which bits to test in operand \a M. 1244 /// \returns TRUE if the specified bits are neither all zeros nor all ones; 1245 /// FALSE otherwise. 1246 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) 1247 1248 /// \brief Tests whether the specified bits in a 128-bit integer vector are all 1249 /// zeros. 
1250 /// 1251 /// \headerfile <x86intrin.h> 1252 /// 1253 /// \code 1254 /// int _mm_test_all_zeros(__m128i M, __m128i V); 1255 /// \endcode 1256 /// 1257 /// This intrinsic corresponds to the <c> <i> VPTEST / PTEST </i> </c> 1258 /// instruction. 1259 /// 1260 /// \param M 1261 /// A 128-bit integer vector containing the bits to be tested. 1262 /// \param V 1263 /// A 128-bit integer vector selecting which bits to test in operand \a M. 1264 /// \returns TRUE if the specified bits are all zeros; FALSE otherwise. 1265 #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) 1266 1267 /* SSE4 64-bit Packed Integer Comparisons. */ 1268 /// \brief Compares each of the corresponding 64-bit values of the 128-bit 1269 /// integer vectors for equality. 1270 /// 1271 /// \headerfile <x86intrin.h> 1272 /// 1273 /// This intrinsic corresponds to the <c> <i> VPCMPEQQ / PCMPEQQ </i> </c> 1274 /// instruction. 1275 /// 1276 /// \param __V1 1277 /// A 128-bit integer vector. 1278 /// \param __V2 1279 /// A 128-bit integer vector. 1280 /// \returns A 128-bit integer vector containing the comparison results. 1281 static __inline__ __m128i __DEFAULT_FN_ATTRS 1282 _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) 1283 { 1284 return (__m128i)((__v2di)__V1 == (__v2di)__V2); 1285 } 1286 1287 /* SSE4 Packed Integer Sign-Extension. */ 1288 /// \brief Sign-extends each of the lower eight 8-bit integer elements of a 1289 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1290 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1291 /// are unused. 1292 /// 1293 /// \headerfile <x86intrin.h> 1294 /// 1295 /// This intrinsic corresponds to the <c> <i> VPMOVSXBW / PMOVSXBW </i> </c> 1296 /// instruction. 1297 /// 1298 /// \param __V 1299 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign- 1300 /// extended to 16-bit values. 1301 /// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. 
1302 static __inline__ __m128i __DEFAULT_FN_ATTRS 1303 _mm_cvtepi8_epi16(__m128i __V) 1304 { 1305 /* This function always performs a signed extension, but __v16qi is a char 1306 which may be signed or unsigned, so use __v16qs. */ 1307 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1308 } 1309 1310 /// \brief Sign-extends each of the lower four 8-bit integer elements of a 1311 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1312 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1313 /// vector are unused. 1314 /// 1315 /// \headerfile <x86intrin.h> 1316 /// 1317 /// This intrinsic corresponds to the <c> <i> VPMOVSXBD / PMOVSXBD </i> </c> 1318 /// instruction. 1319 /// 1320 /// \param __V 1321 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign- 1322 /// extended to 32-bit values. 1323 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1324 static __inline__ __m128i __DEFAULT_FN_ATTRS 1325 _mm_cvtepi8_epi32(__m128i __V) 1326 { 1327 /* This function always performs a signed extension, but __v16qi is a char 1328 which may be signed or unsigned, so use __v16qs. */ 1329 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); 1330 } 1331 1332 /// \brief Sign-extends each of the lower two 8-bit integer elements of a 1333 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1334 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1335 /// vector are unused. 1336 /// 1337 /// \headerfile <x86intrin.h> 1338 /// 1339 /// This intrinsic corresponds to the <c> <i> VPMOVSXBQ / PMOVSXBQ </i> </c> 1340 /// instruction. 1341 /// 1342 /// \param __V 1343 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign- 1344 /// extended to 64-bit values. 
1345 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1346 static __inline__ __m128i __DEFAULT_FN_ATTRS 1347 _mm_cvtepi8_epi64(__m128i __V) 1348 { 1349 /* This function always performs a signed extension, but __v16qi is a char 1350 which may be signed or unsigned, so use __v16qs. */ 1351 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); 1352 } 1353 1354 /// \brief Sign-extends each of the lower four 16-bit integer elements of a 1355 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1356 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1357 /// vector are unused. 1358 /// 1359 /// \headerfile <x86intrin.h> 1360 /// 1361 /// This intrinsic corresponds to the <c> <i> VPMOVSXWD / PMOVSXWD </i> </c> 1362 /// instruction. 1363 /// 1364 /// \param __V 1365 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign- 1366 /// extended to 32-bit values. 1367 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. 1368 static __inline__ __m128i __DEFAULT_FN_ATTRS 1369 _mm_cvtepi16_epi32(__m128i __V) 1370 { 1371 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); 1372 } 1373 1374 /// \brief Sign-extends each of the lower two 16-bit integer elements of a 1375 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1376 /// a 128-bit vector of [2 x i64]. The upper six elements of the input 1377 /// vector are unused. 1378 /// 1379 /// \headerfile <x86intrin.h> 1380 /// 1381 /// This intrinsic corresponds to the <c> <i> VPMOVSXWQ / PMOVSXWQ </i> </c> 1382 /// instruction. 1383 /// 1384 /// \param __V 1385 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign- 1386 /// extended to 64-bit values. 1387 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 
1388 static __inline__ __m128i __DEFAULT_FN_ATTRS 1389 _mm_cvtepi16_epi64(__m128i __V) 1390 { 1391 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); 1392 } 1393 1394 /// \brief Sign-extends each of the lower two 32-bit integer elements of a 1395 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1396 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1397 /// are unused. 1398 /// 1399 /// \headerfile <x86intrin.h> 1400 /// 1401 /// This intrinsic corresponds to the <c> <i> VPMOVSXDQ / PMOVSXDQ </i> </c> 1402 /// instruction. 1403 /// 1404 /// \param __V 1405 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign- 1406 /// extended to 64-bit values. 1407 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 1408 static __inline__ __m128i __DEFAULT_FN_ATTRS 1409 _mm_cvtepi32_epi64(__m128i __V) 1410 { 1411 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); 1412 } 1413 1414 /* SSE4 Packed Integer Zero-Extension. */ 1415 /// \brief Zero-extends each of the lower eight 8-bit integer elements of a 1416 /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a 1417 /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector 1418 /// are unused. 1419 /// 1420 /// \headerfile <x86intrin.h> 1421 /// 1422 /// This intrinsic corresponds to the <c> <i> VPMOVZXBW / PMOVZXBW </i> </c> 1423 /// instruction. 1424 /// 1425 /// \param __V 1426 /// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero- 1427 /// extended to 16-bit values. 1428 /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. 
1429 static __inline__ __m128i __DEFAULT_FN_ATTRS 1430 _mm_cvtepu8_epi16(__m128i __V) 1431 { 1432 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); 1433 } 1434 1435 /// \brief Zero-extends each of the lower four 8-bit integer elements of a 1436 /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a 1437 /// 128-bit vector of [4 x i32]. The upper twelve elements of the input 1438 /// vector are unused. 1439 /// 1440 /// \headerfile <x86intrin.h> 1441 /// 1442 /// This intrinsic corresponds to the <c> <i> VPMOVZXBD / PMOVZXBD </i> </c> 1443 /// instruction. 1444 /// 1445 /// \param __V 1446 /// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero- 1447 /// extended to 32-bit values. 1448 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1449 static __inline__ __m128i __DEFAULT_FN_ATTRS 1450 _mm_cvtepu8_epi32(__m128i __V) 1451 { 1452 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); 1453 } 1454 1455 /// \brief Zero-extends each of the lower two 8-bit integer elements of a 1456 /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in 1457 /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input 1458 /// vector are unused. 1459 /// 1460 /// \headerfile <x86intrin.h> 1461 /// 1462 /// This intrinsic corresponds to the <c> <i> VPMOVZXBQ / PMOVZXBQ </i> </c> 1463 /// instruction. 1464 /// 1465 /// \param __V 1466 /// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero- 1467 /// extended to 64-bit values. 1468 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
1469 static __inline__ __m128i __DEFAULT_FN_ATTRS 1470 _mm_cvtepu8_epi64(__m128i __V) 1471 { 1472 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); 1473 } 1474 1475 /// \brief Zero-extends each of the lower four 16-bit integer elements of a 1476 /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in 1477 /// a 128-bit vector of [4 x i32]. The upper four elements of the input 1478 /// vector are unused. 1479 /// 1480 /// \headerfile <x86intrin.h> 1481 /// 1482 /// This intrinsic corresponds to the <c> <i> VPMOVZXWD / PMOVZXWD </i> </c> 1483 /// instruction. 1484 /// 1485 /// \param __V 1486 /// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero- 1487 /// extended to 32-bit values. 1488 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. 1489 static __inline__ __m128i __DEFAULT_FN_ATTRS 1490 _mm_cvtepu16_epi32(__m128i __V) 1491 { 1492 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); 1493 } 1494 1495 /// \brief Zero-extends each of the lower two 16-bit integer elements of a 1496 /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in 1497 /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector 1498 /// are unused. 1499 /// 1500 /// \headerfile <x86intrin.h> 1501 /// 1502 /// This intrinsic corresponds to the <c> <i> VPMOVZXWQ / PMOVZXWQ </i> </c> 1503 /// instruction. 1504 /// 1505 /// \param __V 1506 /// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero- 1507 /// extended to 64-bit values. 1508 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 
1509 static __inline__ __m128i __DEFAULT_FN_ATTRS 1510 _mm_cvtepu16_epi64(__m128i __V) 1511 { 1512 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); 1513 } 1514 1515 /// \brief Zero-extends each of the lower two 32-bit integer elements of a 1516 /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in 1517 /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector 1518 /// are unused. 1519 /// 1520 /// \headerfile <x86intrin.h> 1521 /// 1522 /// This intrinsic corresponds to the <c> <i> VPMOVZXDQ / PMOVZXDQ </i> </c> 1523 /// instruction. 1524 /// 1525 /// \param __V 1526 /// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero- 1527 /// extended to 64-bit values. 1528 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. 1529 static __inline__ __m128i __DEFAULT_FN_ATTRS 1530 _mm_cvtepu32_epi64(__m128i __V) 1531 { 1532 return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); 1533 } 1534 1535 /* SSE4 Pack with Unsigned Saturation. */ 1536 /// \brief Converts 32-bit signed integers from both 128-bit integer vector 1537 /// operands into 16-bit unsigned integers, and returns the packed result. 1538 /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than 1539 /// 0x0000 are saturated to 0x0000. 1540 /// 1541 /// \headerfile <x86intrin.h> 1542 /// 1543 /// This intrinsic corresponds to the <c> <i> VPACKUSDW / PACKUSDW </i> </c> 1544 /// instruction. 1545 /// 1546 /// \param __V1 1547 /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1548 /// signed integer and is converted to a 16-bit unsigned integer with 1549 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1550 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1551 /// are written to the lower 64 bits of the result. 
1552 /// \param __V2 1553 /// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a 1554 /// signed integer and is converted to a 16-bit unsigned integer with 1555 /// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values 1556 /// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values 1557 /// are written to the higher 64 bits of the result. 1558 /// \returns A 128-bit vector of [8 x i16] containing the converted values. 1559 static __inline__ __m128i __DEFAULT_FN_ATTRS 1560 _mm_packus_epi32(__m128i __V1, __m128i __V2) 1561 { 1562 return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); 1563 } 1564 1565 /* SSE4 Multiple Packed Sums of Absolute Difference. */ 1566 /// \brief Subtracts 8-bit unsigned integer values and computes the absolute 1567 /// values of the differences to the corresponding bits in the destination. 1568 /// Then sums of the absolute differences are returned according to the bit 1569 /// fields in the immediate operand. 1570 /// 1571 /// \headerfile <x86intrin.h> 1572 /// 1573 /// \code 1574 /// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); 1575 /// \endcode 1576 /// 1577 /// This intrinsic corresponds to the <c> <i> VMPSADBW / MPSADBW </i> </c> 1578 /// instruction. 1579 /// 1580 /// \param X 1581 /// A 128-bit vector of [16 x i8]. 1582 /// \param Y 1583 /// A 128-bit vector of [16 x i8]. 
1584 /// \param M 1585 /// An 8-bit immediate operand specifying how the absolute differences are to 1586 /// be calculated, according to the following algorithm: 1587 /// \code 1588 /// // M2 represents bit 2 of the immediate operand 1589 /// // M10 represents bits [1:0] of the immediate operand 1590 /// i = M2 * 4 1591 /// j = M10 * 4 1592 /// for (k = 0; k < 8; k = k + 1) { 1593 /// d0 = abs(X[i + k + 0] - Y[j + 0]) 1594 /// d1 = abs(X[i + k + 1] - Y[j + 1]) 1595 /// d2 = abs(X[i + k + 2] - Y[j + 2]) 1596 /// d3 = abs(X[i + k + 3] - Y[j + 3]) 1597 /// r[k] = d0 + d1 + d2 + d3 1598 /// } 1599 /// \endcode 1600 /// \returns A 128-bit integer vector containing the sums of the sets of 1601 /// absolute differences between both operands. 1602 #define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \ 1603 (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ 1604 (__v16qi)(__m128i)(Y), (M)); }) 1605 1606 /// \brief Finds the minimum unsigned 16-bit element in the input 128-bit 1607 /// vector of [8 x u16] and returns it and along with its index. 1608 /// 1609 /// \headerfile <x86intrin.h> 1610 /// 1611 /// This intrinsic corresponds to the <c> <i> VPHMINPOSUW / PHMINPOSUW </i> </c> 1612 /// instruction. 1613 /// 1614 /// \param __V 1615 /// A 128-bit vector of [8 x u16]. 1616 /// \returns A 128-bit value where bits [15:0] contain the minimum value found 1617 /// in parameter \a __V, bits [18:16] contain the index of the minimum value 1618 /// and the remaining bits are set to 0. 1619 static __inline__ __m128i __DEFAULT_FN_ATTRS 1620 _mm_minpos_epu16(__m128i __V) 1621 { 1622 return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); 1623 } 1624 1625 /* Handle the sse4.2 definitions here. */ 1626 1627 /* These definitions are normally in nmmintrin.h, but gcc puts them in here 1628 so we'll do the same. 
*/ 1629 1630 #undef __DEFAULT_FN_ATTRS 1631 #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) 1632 1633 /* These specify the type of data that we're comparing. */ 1634 #define _SIDD_UBYTE_OPS 0x00 1635 #define _SIDD_UWORD_OPS 0x01 1636 #define _SIDD_SBYTE_OPS 0x02 1637 #define _SIDD_SWORD_OPS 0x03 1638 1639 /* These specify the type of comparison operation. */ 1640 #define _SIDD_CMP_EQUAL_ANY 0x00 1641 #define _SIDD_CMP_RANGES 0x04 1642 #define _SIDD_CMP_EQUAL_EACH 0x08 1643 #define _SIDD_CMP_EQUAL_ORDERED 0x0c 1644 1645 /* These macros specify the polarity of the operation. */ 1646 #define _SIDD_POSITIVE_POLARITY 0x00 1647 #define _SIDD_NEGATIVE_POLARITY 0x10 1648 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20 1649 #define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 1650 1651 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1652 #define _SIDD_LEAST_SIGNIFICANT 0x00 1653 #define _SIDD_MOST_SIGNIFICANT 0x40 1654 1655 /* These macros are used in _mm_cmpXstri() to specify the return. */ 1656 #define _SIDD_BIT_MASK 0x00 1657 #define _SIDD_UNIT_MASK 0x40 1658 1659 /* SSE4.2 Packed Comparison Intrinsics. */ 1660 /// \brief Uses the immediate operand \a M to perform a comparison of string 1661 /// data with implicitly defined lengths that is contained in source operands 1662 /// \a A and \a B. Returns a 128-bit integer vector representing the result 1663 /// mask of the comparison. 1664 /// 1665 /// \headerfile <x86intrin.h> 1666 /// 1667 /// \code 1668 /// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); 1669 /// \endcode 1670 /// 1671 /// This intrinsic corresponds to the <c> <i> VPCMPISTRM / PCMPISTRM </i> </c> 1672 /// instruction. 1673 /// 1674 /// \param A 1675 /// A 128-bit integer vector containing one of the source operands to be 1676 /// compared. 1677 /// \param B 1678 /// A 128-bit integer vector containing one of the source operands to be 1679 /// compared. 
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times).
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpistrm(A, B, M) \
  ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpistri(A, B, M) \
  ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns a 128-bit integer vector representing the result
///    mask of the comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPESTRM / PCMPESTRM </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the result is zero-extended or expanded to 16
///             bytes. \n
///      0: The result is zero-extended to 16 bytes. \n
///      1: The result is expanded to 16 bytes (this expansion is performed by
///         repeating each bit 8 or 16 times). \n
/// \returns Returns a 128-bit integer vector representing the result mask of
///    the comparison.
#define _mm_cmpestrm(A, LA, B, LB, M) \
  ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns an integer representing the result index of the
///    comparison.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words, the type of comparison to perform, and the format of the return
///    value. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
///    Bit [6]: Determines whether the index of the lowest set bit or the
///             highest set bit is returned. \n
///      0: The index of the least significant set bit. \n
///      1: The index of the most significant set bit. \n
/// \returns Returns an integer representing the result index of the comparison.
#define _mm_cmpestri(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
                                    (__v16qi)(__m128i)(B), (int)(LB), \
                                    (int)(M)))

/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum; otherwise, returns 0.
#define _mm_cmpistra(A, B, M) \
  ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
///    0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0.
#define _mm_cmpistrc(A, B, M) \
  ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpistro(A, B, M) \
  ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrs(A, B, M) \
  ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with implicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPISTRI / PCMPISTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpistrz(A, B, M) \
  ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
///    string in \a B is the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the bit mask is zero and the length of the string in
///    \a B is the maximum, otherwise, returns 0.
#define _mm_cmpestra(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
///    returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B. \n
/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
#define _mm_cmpestrc(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns bit 0 of the resulting bit mask.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns bit 0 of the resulting bit mask.
#define _mm_cmpestro(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format. \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A.
/// \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a A is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrs(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/// \brief Uses the immediate operand \a M to perform a comparison of string
///    data with explicitly defined lengths that is contained in source operands
///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
///    the maximum, otherwise, returns 0.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
/// \endcode
///
/// This intrinsic corresponds to the <c> <i> VPCMPESTRI / PCMPESTRI </i> </c>
/// instruction.
///
/// \param A
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LA
///    An integer that specifies the length of the string in \a A.
/// \param B
///    A 128-bit integer vector containing one of the source operands to be
///    compared.
/// \param LB
///    An integer that specifies the length of the string in \a B.
/// \param M
///    An 8-bit immediate operand specifying whether the characters are bytes or
///    words and the type of comparison to perform. \n
///    Bits [1:0]: Determine source data format.
/// \n
///      00: 16 unsigned bytes \n
///      01: 8 unsigned words \n
///      10: 16 signed bytes \n
///      11: 8 signed words \n
///    Bits [3:2]: Determine comparison type and aggregation method. \n
///      00: Subset: Each character in \a B is compared for equality with all
///          the characters in \a A. \n
///      01: Ranges: Each character in \a B is compared to \a A. The comparison
///          basis is greater than or equal for even-indexed elements in \a A,
///          and less than or equal for odd-indexed elements in \a A. \n
///      10: Match: Compare each pair of corresponding characters in \a A and
///          \a B for equality. \n
///      11: Substring: Search \a B for substring matches of \a A. \n
///    Bits [5:4]: Determine whether to perform a one's complement on the bit
///                mask of the comparison results. \n
///      00: No effect. \n
///      01: Negate the bit mask. \n
///      10: No effect. \n
///      11: Negate the bit mask only for bits with an index less than or equal
///          to the size of \a A or \a B.
/// \returns Returns 1 if the length of the string in \a B is less than the
///    maximum, otherwise, returns 0.
#define _mm_cmpestrz(A, LA, B, LB, M) \
  ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M)))

/* SSE4.2 Compare Packed Data -- Greater Than.  */
/// \brief Compares each of the corresponding 64-bit values of the 128-bit
///    integer vectors to determine if the values in the first operand are
///    greater than those in the second operand.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> <i> VPCMPGTQ / PCMPGTQ </i> </c>
/// instruction.
///
/// \param __V1
///    A 128-bit integer vector.
/// \param __V2
///    A 128-bit integer vector.
/// \returns A 128-bit integer vector containing the comparison results.
2423 static __inline__ __m128i __DEFAULT_FN_ATTRS 2424 _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) 2425 { 2426 return (__m128i)((__v2di)__V1 > (__v2di)__V2); 2427 } 2428 2429 /* SSE4.2 Accumulate CRC32. */ 2430 /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the 2431 /// unsigned char operand. 2432 /// 2433 /// \headerfile <x86intrin.h> 2434 /// 2435 /// This intrinsic corresponds to the <c> <i> CRC32B </i> </c> instruction. 2436 /// 2437 /// \param __C 2438 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2439 /// \a __D. 2440 /// \param __D 2441 /// An unsigned 8-bit integer operand used to compute the CRC-32C checksum. 2442 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2443 /// operand \a __D. 2444 static __inline__ unsigned int __DEFAULT_FN_ATTRS 2445 _mm_crc32_u8(unsigned int __C, unsigned char __D) 2446 { 2447 return __builtin_ia32_crc32qi(__C, __D); 2448 } 2449 2450 /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the 2451 /// unsigned short operand. 2452 /// 2453 /// \headerfile <x86intrin.h> 2454 /// 2455 /// This intrinsic corresponds to the <c> <i> CRC32W </i> </c> instruction. 2456 /// 2457 /// \param __C 2458 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2459 /// \a __D. 2460 /// \param __D 2461 /// An unsigned 16-bit integer operand used to compute the CRC-32C checksum. 2462 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2463 /// operand \a __D. 2464 static __inline__ unsigned int __DEFAULT_FN_ATTRS 2465 _mm_crc32_u16(unsigned int __C, unsigned short __D) 2466 { 2467 return __builtin_ia32_crc32hi(__C, __D); 2468 } 2469 2470 /// \brief Adds the first unsigned integer operand to the CRC-32C checksum of 2471 /// the second unsigned integer operand. 2472 /// 2473 /// \headerfile <x86intrin.h> 2474 /// 2475 /// This intrinsic corresponds to the <c> <i> CRC32L </i> </c> instruction. 
2476 /// 2477 /// \param __C 2478 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2479 /// \a __D. 2480 /// \param __D 2481 /// An unsigned 32-bit integer operand used to compute the CRC-32C checksum. 2482 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2483 /// operand \a __D. 2484 static __inline__ unsigned int __DEFAULT_FN_ATTRS 2485 _mm_crc32_u32(unsigned int __C, unsigned int __D) 2486 { 2487 return __builtin_ia32_crc32si(__C, __D); 2488 } 2489 2490 #ifdef __x86_64__ 2491 /// \brief Adds the unsigned integer operand to the CRC-32C checksum of the 2492 /// unsigned 64-bit integer operand. 2493 /// 2494 /// \headerfile <x86intrin.h> 2495 /// 2496 /// This intrinsic corresponds to the <c> <i> CRC32Q </i> </c> instruction. 2497 /// 2498 /// \param __C 2499 /// An unsigned integer operand to add to the CRC-32C checksum of operand 2500 /// \a __D. 2501 /// \param __D 2502 /// An unsigned 64-bit integer operand used to compute the CRC-32C checksum. 2503 /// \returns The result of adding operand \a __C to the CRC-32C checksum of 2504 /// operand \a __D. 2505 static __inline__ unsigned long long __DEFAULT_FN_ATTRS 2506 _mm_crc32_u64(unsigned long long __C, unsigned long long __D) 2507 { 2508 return __builtin_ia32_crc32di(__C, __D); 2509 } 2510 #endif /* __x86_64__ */ 2511 2512 #undef __DEFAULT_FN_ATTRS 2513 2514 #ifdef __POPCNT__ 2515 #include <popcntintrin.h> 2516 #endif 2517 2518 #endif /* _SMMINTRIN_H */ 2519