1 /*===---- mmintrin.h - MMX intrinsics --------------------------------------=== 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a copy 4 * of this software and associated documentation files (the "Software"), to deal 5 * in the Software without restriction, including without limitation the rights 6 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 * copies of the Software, and to permit persons to whom the Software is 8 * furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice shall be included in 11 * all copies or substantial portions of the Software. 12 * 13 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 * THE SOFTWARE. 20 * 21 *===-----------------------------------------------------------------------=== 22 */ 23 24 #ifndef __MMINTRIN_H 25 #define __MMINTRIN_H 26 27 #ifndef __MMX__ 28 #error "MMX instruction set not enabled" 29 #else 30 31 typedef long long __m64 __attribute__((__vector_size__(8))); 32 33 typedef int __v2si __attribute__((__vector_size__(8))); 34 typedef short __v4hi __attribute__((__vector_size__(8))); 35 typedef char __v8qi __attribute__((__vector_size__(8))); 36 37 static __inline__ void __attribute__((__always_inline__, __nodebug__)) 38 _mm_empty(void) 39 { 40 __builtin_ia32_emms(); 41 } 42 43 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 44 _mm_cvtsi32_si64(int __i) 45 { 46 return (__m64)(__v2si){__i, 0}; 47 } 48 49 static __inline__ int __attribute__((__always_inline__, __nodebug__)) 50 _mm_cvtsi64_si32(__m64 __m) 51 { 52 __v2si __mmx_var2 = (__v2si)__m; 53 return __mmx_var2[0]; 54 } 55 56 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 57 _mm_cvtsi64_m64(long long __i) 58 { 59 return (__m64)__i; 60 } 61 62 static __inline__ long long __attribute__((__always_inline__, __nodebug__)) 63 _mm_cvtm64_si64(__m64 __m) 64 { 65 return (long long)__m; 66 } 67 68 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 69 _mm_packs_pi16(__m64 __m1, __m64 __m2) 70 { 71 return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); 72 } 73 74 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 75 _mm_packs_pi32(__m64 __m1, __m64 __m2) 76 { 77 return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); 78 } 79 80 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 81 _mm_packs_pu16(__m64 __m1, __m64 __m2) 82 { 83 return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); 84 } 85 86 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 87 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) 88 { 89 return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 4, 8+4, 5, 90 8+5, 6, 8+6, 7, 8+7); 91 } 92 93 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 94 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) 95 { 96 return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 2, 4+2, 3, 97 4+3); 98 } 99 100 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 101 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) 102 { 103 return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 1, 2+1); 104 } 105 106 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 107 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) 108 { 109 return (__m64)__builtin_shufflevector((__v8qi)__m1, (__v8qi)__m2, 0, 8+0, 1, 110 8+1, 2, 8+2, 3, 8+3); 111 } 112 113 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 114 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) 115 { 116 return (__m64)__builtin_shufflevector((__v4hi)__m1, (__v4hi)__m2, 0, 4+0, 1, 117 4+1); 118 } 119 120 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 121 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) 122 { 123 return (__m64)__builtin_shufflevector((__v2si)__m1, (__v2si)__m2, 0, 2+0); 124 } 125 126 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 127 _mm_add_pi8(__m64 __m1, __m64 __m2) 128 { 129 return (__m64)((__v8qi)__m1 + (__v8qi)__m2); 130 } 131 132 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 133 _mm_add_pi16(__m64 __m1, __m64 __m2) 134 { 135 return (__m64)((__v4hi)__m1 + (__v4hi)__m2); 136 } 137 138 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 139 _mm_add_pi32(__m64 __m1, __m64 __m2) 140 { 141 return (__m64)((__v2si)__m1 + (__v2si)__m2); 142 } 143 144 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 145 _mm_adds_pi8(__m64 __m1, __m64 __m2) 146 { 147 return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); 148 } 149 150 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 151 _mm_adds_pi16(__m64 __m1, __m64 __m2) 152 { 153 return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); 154 } 155 156 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 157 _mm_adds_pu8(__m64 __m1, __m64 __m2) 158 { 159 return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); 160 } 161 162 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 163 _mm_adds_pu16(__m64 __m1, __m64 __m2) 164 { 165 return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); 166 } 167 168 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 169 _mm_sub_pi8(__m64 __m1, __m64 __m2) 170 { 171 return (__m64)((__v8qi)__m1 - (__v8qi)__m2); 172 } 173 174 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 175 _mm_sub_pi16(__m64 __m1, __m64 __m2) 176 { 177 return (__m64)((__v4hi)__m1 - (__v4hi)__m2); 178 } 179 180 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 181 _mm_sub_pi32(__m64 __m1, __m64 __m2) 182 { 183 return (__m64)((__v2si)__m1 - (__v2si)__m2); 184 } 185 186 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 187 _mm_subs_pi8(__m64 __m1, __m64 __m2) 188 { 189 return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); 190 } 191 192 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 193 _mm_subs_pi16(__m64 __m1, __m64 __m2) 194 { 195 return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); 196 } 197 198 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 199 _mm_subs_pu8(__m64 __m1, __m64 __m2) 200 { 201 return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); 202 } 203 204 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 205 _mm_subs_pu16(__m64 __m1, __m64 __m2) 206 { 207 return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); 208 } 209 210 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 211 _mm_madd_pi16(__m64 __m1, __m64 __m2) 212 { 213 return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); 214 } 215 216 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 217 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) 218 { 219 return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); 220 } 221 222 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 223 _mm_mullo_pi16(__m64 __m1, __m64 __m2) 224 { 225 return (__m64)((__v4hi)__m1 * (__v4hi)__m2); 226 } 227 228 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 229 _mm_sll_pi16(__m64 __m, __m64 __count) 230 { 231 return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); 232 } 233 234 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 235 _mm_slli_pi16(__m64 __m, int __count) 236 { 237 return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); 238 } 239 240 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 241 _mm_sll_pi32(__m64 __m, __m64 __count) 242 { 243 return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); 244 } 245 246 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 247 _mm_slli_pi32(__m64 __m, int __count) 248 { 249 return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); 250 } 251 252 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 253 _mm_sll_si64(__m64 __m, __m64 __count) 254 { 255 return __builtin_ia32_psllq(__m, __count); 256 } 257 258 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 259 _mm_slli_si64(__m64 __m, int __count) 260 { 261 return __builtin_ia32_psllqi(__m, __count); 262 } 263 264 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 265 _mm_sra_pi16(__m64 __m, __m64 __count) 266 { 267 return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); 268 } 269 270 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 271 _mm_srai_pi16(__m64 __m, int __count) 272 { 273 return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); 274 } 275 276 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 277 _mm_sra_pi32(__m64 __m, __m64 __count) 278 { 279 return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); 280 } 281 282 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 283 _mm_srai_pi32(__m64 __m, int __count) 284 { 285 return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); 286 } 287 288 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 289 _mm_srl_pi16(__m64 __m, __m64 __count) 290 { 291 return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); 292 } 293 294 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 295 _mm_srli_pi16(__m64 __m, int __count) 296 { 297 return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); 298 } 299 300 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 301 _mm_srl_pi32(__m64 __m, __m64 __count) 302 { 303 return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); 304 } 305 306 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 307 _mm_srli_pi32(__m64 __m, int __count) 308 { 309 return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); 310 } 311 312 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 313 _mm_srl_si64(__m64 __m, __m64 __count) 314 { 315 return (__m64)__builtin_ia32_psrlq(__m, __count); 316 } 317 318 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 319 _mm_srli_si64(__m64 __m, int __count) 320 { 321 return __builtin_ia32_psrlqi(__m, __count); 322 } 323 324 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 325 _mm_and_si64(__m64 __m1, __m64 __m2) 326 { 327 return __m1 & __m2; 328 } 329 330 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 331 _mm_andnot_si64(__m64 __m1, __m64 __m2) 332 { 333 return ~__m1 & __m2; 334 } 335 336 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 337 _mm_or_si64(__m64 __m1, __m64 __m2) 338 { 339 return __m1 | __m2; 340 } 341 342 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 343 _mm_xor_si64(__m64 __m1, __m64 __m2) 344 { 345 return __m1 ^ __m2; 346 } 347 348 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 349 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) 350 { 351 return (__m64)((__v8qi)__m1 == (__v8qi)__m2); 352 } 353 354 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 355 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) 356 { 357 return (__m64)((__v4hi)__m1 == (__v4hi)__m2); 358 } 359 360 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 361 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) 362 { 363 return (__m64)((__v2si)__m1 == (__v2si)__m2); 364 } 365 366 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 367 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) 368 { 369 return (__m64)((__v8qi)__m1 > (__v8qi)__m2); 370 } 371 372 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 373 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) 374 { 375 return (__m64)((__v4hi)__m1 > (__v4hi)__m2); 376 } 377 378 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 379 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) 380 { 381 return (__m64)((__v2si)__m1 > (__v2si)__m2); 382 } 383 384 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 385 _mm_setzero_si64(void) 386 { 387 return (__m64){ 0LL }; 388 } 389 390 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 391 _mm_set_pi32(int __i1, int __i0) 392 { 393 return (__m64)(__v2si){ __i0, __i1 }; 394 } 395 396 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 397 _mm_set_pi16(short __s3, short __s2, short __s1, short __s0) 398 { 399 return (__m64)(__v4hi){ __s0, __s1, __s2, __s3 }; 400 } 401 402 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 403 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, 404 char __b1, char __b0) 405 { 406 return (__m64)(__v8qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7 }; 407 } 408 409 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 410 _mm_set1_pi32(int __i) 411 { 412 return (__m64)(__v2si){ __i, __i }; 413 } 414 415 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 416 _mm_set1_pi16(short __s) 417 { 418 return (__m64)(__v4hi){ __s, __s, __s, __s }; 419 } 420 421 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 422 _mm_set1_pi8(char __b) 423 { 424 return (__m64)(__v8qi){ __b, __b, __b, __b, __b, __b, __b, __b }; 425 } 426 427 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 428 _mm_setr_pi32(int __i1, int __i0) 429 { 430 return (__m64)(__v2si){ __i1, __i0 }; 431 } 432 433 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 434 _mm_setr_pi16(short __s3, short __s2, short __s1, short __s0) 435 { 436 return (__m64)(__v4hi){ __s3, __s2, __s1, __s0 }; 437 } 438 439 static __inline__ __m64 __attribute__((__always_inline__, __nodebug__)) 440 _mm_setr_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, 441 char __b1, char __b0) 442 { 443 return (__m64)(__v8qi){ __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0 }; 444 } 445 446 447 /* Aliases for compatibility. */ 448 #define _m_empty _mm_empty 449 #define _m_from_int _mm_cvtsi32_si64 450 #define _m_to_int _mm_cvtsi64_si32 451 #define _m_packsswb _mm_packs_pi16 452 #define _m_packssdw _mm_packs_pi32 453 #define _m_packuswb _mm_packs_pu16 454 #define _m_punpckhbw _mm_unpackhi_pi8 455 #define _m_punpckhwd _mm_unpackhi_pi16 456 #define _m_punpckhdq _mm_unpackhi_pi32 457 #define _m_punpcklbw _mm_unpacklo_pi8 458 #define _m_punpcklwd _mm_unpacklo_pi16 459 #define _m_punpckldq _mm_unpacklo_pi32 460 #define _m_paddb _mm_add_pi8 461 #define _m_paddw _mm_add_pi16 462 #define _m_paddd _mm_add_pi32 463 #define _m_paddsb _mm_adds_pi8 464 #define _m_paddsw _mm_adds_pi16 465 #define _m_paddusb _mm_adds_pu8 466 #define _m_paddusw _mm_adds_pu16 467 #define _m_psubb _mm_sub_pi8 468 #define _m_psubw _mm_sub_pi16 469 #define _m_psubd _mm_sub_pi32 470 #define _m_psubsb _mm_subs_pi8 471 #define _m_psubsw _mm_subs_pi16 472 #define _m_psubusb _mm_subs_pu8 473 #define _m_psubusw _mm_subs_pu16 474 #define _m_pmaddwd _mm_madd_pi16 475 #define _m_pmulhw _mm_mulhi_pi16 476 #define _m_pmullw _mm_mullo_pi16 477 #define _m_psllw _mm_sll_pi16 478 #define _m_psllwi _mm_slli_pi16 479 #define _m_pslld _mm_sll_pi32 480 #define _m_pslldi _mm_slli_pi32 481 #define _m_psllq _mm_sll_si64 482 #define _m_psllqi _mm_slli_si64 483 #define _m_psraw _mm_sra_pi16 484 #define _m_psrawi _mm_srai_pi16 485 #define _m_psrad _mm_sra_pi32 486 #define _m_psradi _mm_srai_pi32 487 #define _m_psrlw _mm_srl_pi16 488 #define _m_psrlwi _mm_srli_pi16 489 #define _m_psrld _mm_srl_pi32 490 #define _m_psrldi _mm_srli_pi32 491 #define _m_psrlq _mm_srl_si64 492 #define _m_psrlqi _mm_srli_si64 493 #define _m_pand _mm_and_si64 494 #define _m_pandn _mm_andnot_si64 495 #define _m_por _mm_or_si64 496 #define _m_pxor _mm_xor_si64 497 #define _m_pcmpeqb _mm_cmpeq_pi8 498 #define _m_pcmpeqw _mm_cmpeq_pi16 499 #define _m_pcmpeqd _mm_cmpeq_pi32 500 #define _m_pcmpgtb _mm_cmpgt_pi8 501 #define _m_pcmpgtw _mm_cmpgt_pi16 502 #define _m_pcmpgtd _mm_cmpgt_pi32 503 504 #endif /* __MMX__ */ 505 506 #endif /* __MMINTRIN_H */ 507 508