/* Copyright (C) 2002-2013 Free Software Foundation, Inc.

   This file is part of GCC.

   GCC is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   GCC is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#ifndef __MMX__
# error "MMX instruction set not enabled"
#else
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

/* Internal data types for implementing the intrinsics.  Each is an
   8-byte vector; only the element type/count differs.  */
typedef int __v2si __attribute__ ((__vector_size__ (8)));	/* 2 x 32-bit */
typedef short __v4hi __attribute__ ((__vector_size__ (8)));	/* 4 x 16-bit */
typedef char __v8qi __attribute__ ((__vector_size__ (8)));	/* 8 x 8-bit  */
typedef long long __v1di __attribute__ ((__vector_size__ (8)));	/* 1 x 64-bit */
typedef float __v2sf __attribute__ ((__vector_size__ (8)));	/* 2 x float  */

/* Empty the multimedia state.
*/
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
  __builtin_ia32_emms ();
}

/* Alias of _mm_empty.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_empty (void)
{
  _mm_empty ();
}

/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi32_si64 (int __i)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
}

/* Alias of _mm_cvtsi32_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int (int __i)
{
  return _mm_cvtsi32_si64 (__i);
}

#ifdef __x86_64__
/* Convert I to a __m64 object.  */

/* Intel intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_from_int64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_m64 (long long __i)
{
  return (__m64) __i;
}

/* Microsoft intrinsic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64x_si64 (long long __i)
{
  return (__m64) __i;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi64x (long long __i)
{
  return (__m64) __i;
}
#endif

/* Convert the lower 32 bits of the __m64 object into an integer.
*/
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si32 (__m64 __i)
{
  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
}

/* Alias of _mm_cvtsi64_si32.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int (__m64 __i)
{
  return _mm_cvtsi64_si32 (__i);
}

#ifdef __x86_64__
/* Convert the __m64 object to a 64bit integer.  */

/* Intel intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_to_int64 (__m64 __i)
{
  return (long long)__i;
}

extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtm64_si64 (__m64 __i)
{
  return (long long)__i;
}

/* Microsoft intrinsic.  */
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtsi64_si64x (__m64 __i)
{
  return (long long)__i;
}
#endif

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_packs_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packsswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi16 (__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
}

/* Alias of _mm_packs_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packssdw (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pi32 (__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_packs_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_packuswb (__m64 __m1, __m64 __m2)
{
  return _mm_packs_pu16 (__m1, __m2);
}

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias of _mm_unpackhi_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_unpackhi_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
}

/* Alias of _mm_unpackhi_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckhdq (__m64 __m1, __m64 __m2)
{
  return _mm_unpackhi_pi32 (__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias of _mm_unpacklo_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklbw (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi8 (__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_unpacklo_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpcklwd (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi16 (__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
}

/* Alias of _mm_unpacklo_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_punpckldq (__m64 __m1, __m64 __m2)
{
  return _mm_unpacklo_pi32 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias of _mm_add_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddb (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_add_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddw (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi16 (__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias of _mm_add_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddd (__m64 __m1, __m64 __m2)
{
  return _mm_add_pi32 (__m1, __m2);
}

/* Add the 64-bit values in M1 to the 64-bit values in M2.  */
#ifdef __SSE2__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
}
#endif

/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias of _mm_adds_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_adds_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddsw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pi16 (__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias of _mm_adds_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusb (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu8 (__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_adds_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_paddusw (__m64 __m1, __m64 __m2)
{
  return _mm_adds_pu16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias of _mm_sub_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubb (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_sub_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubw (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi16 (__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias of _mm_sub_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubd (__m64 __m1, __m64 __m2)
{
  return _mm_sub_pi32 (__m1, __m2);
}

/* Subtract the 64-bit value in M2 from the 64-bit value in M1.
   (The old comment said "Add", a copy-paste error: this uses psubq.)  */
#ifdef __SSE2__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_si64 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
}
#endif

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias of _mm_subs_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_subs_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubsw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pi16 (__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias of _mm_subs_pu8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusb (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu8 (__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_subs_pu16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_subs_pu16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psubusw (__m64 __m1, __m64 __m2)
{
  return _mm_subs_pu16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_madd_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmaddwd (__m64 __m1, __m64 __m2)
{
  return _mm_madd_pi16 (__m1, __m2);
}

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_mulhi_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmulhw (__m64 __m1, __m64 __m2)
{
  return _mm_mulhi_pi16 (__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_mullo_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pmullw (__m64 __m1, __m64 __m2)
{
  return _mm_mullo_pi16 (__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
}

/* Alias of _mm_sll_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllw (__m64 __m, __m64 __count)
{
  return _mm_sll_pi16 (__m, __count);
}

/* Same, with an immediate (int) shift count.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
}

/* Alias of _mm_slli_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllwi (__m64 __m, int __count)
{
  return _mm_slli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M left by COUNT.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
}

/* Alias of _mm_sll_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslld (__m64 __m, __m64 __count)
{
  return _mm_sll_pi32 (__m, __count);
}

/* Same, with an immediate (int) shift count.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
}

/* Alias of _mm_slli_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pslldi (__m64 __m, int __count)
{
  return _mm_slli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M left by COUNT.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sll_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
}

/* Alias of _mm_sll_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllq (__m64 __m, __m64 __count)
{
  return _mm_sll_si64 (__m, __count);
}

/* Same, with an immediate (int) shift count.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
}

/* Alias of _mm_slli_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psllqi (__m64 __m, int __count)
{
  return _mm_slli_si64 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
}

/* Alias of _mm_sra_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psraw (__m64 __m, __m64 __count)
{
  return _mm_sra_pi16 (__m, __count);
}

/* Same, with an immediate (int) shift count.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
}

/* Alias of _mm_srai_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrawi (__m64 __m, int __count)
{
  return _mm_srai_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sra_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
}

/* Alias of _mm_sra_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrad (__m64 __m, __m64 __count)
{
  return _mm_sra_pi32 (__m, __count);
}

/* Same, with an immediate (int) shift count.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srai_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
}

/* Alias of _mm_srai_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psradi (__m64 __m, int __count)
{
  return _mm_srai_pi32 (__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi16 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
}

/* Alias of _mm_srl_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlw (__m64 __m, __m64 __count)
{
  return _mm_srl_pi16 (__m, __count);
}

/* Same, with an immediate (int) shift count.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
}

/* Alias of _mm_srli_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlwi (__m64 __m, int __count)
{
  return _mm_srli_pi16 (__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_pi32 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
}

/* Alias of _mm_srl_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrld (__m64 __m, __m64 __count)
{
  return _mm_srl_pi32 (__m, __count);
}

/* Same, with an immediate (int) shift count.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
}

/* Alias of _mm_srli_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrldi (__m64 __m, int __count)
{
  return _mm_srli_pi32 (__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srl_si64 (__m64 __m, __m64 __count)
{
  return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
}

/* Alias of _mm_srl_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlq (__m64 __m, __m64 __count)
{
  return _mm_srl_si64 (__m, __count);
}

/* Same, with an immediate (int) shift count.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int __count)
{
  return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
}

/* Alias of _mm_srli_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_psrlqi (__m64 __m, int __count)
{
  return _mm_srli_si64 (__m, __count);
}

/* Bit-wise AND the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pand (__m1, __m2);
}

/* Alias of _mm_and_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pand (__m64 __m1, __m64 __m2)
{
  return _mm_and_si64 (__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_andnot_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pandn (__m1, __m2);
}

/* Alias of _mm_andnot_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pandn (__m64 __m1, __m64 __m2)
{
  return _mm_andnot_si64 (__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_por (__m1, __m2);
}

/* Alias of _mm_or_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_por (__m64 __m1, __m64 __m2)
{
  return _mm_or_si64 (__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
  return __builtin_ia32_pxor (__m1, __m2);
}

/* Alias of _mm_xor_si64.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pxor (__m64 __m1, __m64 __m2)
{
  return _mm_xor_si64 (__m1, __m2);
}

/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
   test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias of _mm_cmpeq_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi8 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
}

/* Alias of _mm_cmpgt_pi8.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtb (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi8 (__m1, __m2);
}

/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
   the test is true and zero if false.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_cmpeq_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi16 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
}

/* Alias of _mm_cmpgt_pi16.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtw (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi16 (__m1, __m2);
}

/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias of _mm_cmpeq_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpeqd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpeq_pi32 (__m1, __m2);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
{
  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
}

/* Alias of _mm_cmpgt_pi32.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_m_pcmpgtd (__m64 __m1, __m64 __m2)
{
  return _mm_cmpgt_pi32 (__m1, __m2);
}

/* Creates a 64-bit zero.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
  return (__m64)0LL;
}

/* Creates a vector of two 32-bit values; I0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (int __i1, int __i0)
{
  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
}

/* Creates a vector of four 16-bit values; W0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
{
  return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
}

/* Creates a vector of eight 8-bit values; B0 is least significant.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
	     char __b3, char __b2, char __b1, char __b0)
{
  return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
					       __b4, __b5, __b6, __b7);
}

/* Similar, but with the arguments in reverse order.
*/
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi32 (int __i0, int __i1)
{
  return _mm_set_pi32 (__i1, __i0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
{
  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
	      char __b4, char __b5, char __b6, char __b7)
{
  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}

/* Creates a vector of two 32-bit values, both elements containing I.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi32 (int __i)
{
  return _mm_set_pi32 (__i, __i);
}

/* Creates a vector of four 16-bit values, all elements containing W.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi16 (short __w)
{
  return _mm_set_pi16 (__w, __w, __w, __w);
}

/* Creates a vector of eight 8-bit values, all elements containing B.  */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set1_pi8 (char __b)
{
  return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
}

#endif /* __MMX__ */
#endif /* _MMINTRIN_H_INCLUDED */