1 /* Copyright (C) 2002-2014 Free Software Foundation, Inc. 2 3 This file is part of GCC. 4 5 GCC is free software; you can redistribute it and/or modify it 6 under the terms of the GNU General Public License as published 7 by the Free Software Foundation; either version 3, or (at your 8 option) any later version. 9 10 GCC is distributed in the hope that it will be useful, but WITHOUT 11 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 12 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public 13 License for more details. 14 15 Under Section 7 of GPL version 3, you are granted additional 16 permissions described in the GCC Runtime Library Exception, version 17 3.1, as published by the Free Software Foundation. 18 19 You should have received a copy of the GNU General Public License and 20 a copy of the GCC Runtime Library Exception along with this program; 21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 22 <http://www.gnu.org/licenses/>. */ 23 24 #ifndef _MMINTRIN_H_INCLUDED 25 #define _MMINTRIN_H_INCLUDED 26 27 #ifndef __IWMMXT__ 28 #error mmintrin.h included without enabling WMMX/WMMX2 instructions (e.g. -march=iwmmxt or -march=iwmmxt2) 29 #endif 30 31 32 #if defined __cplusplus 33 extern "C" { 34 /* Intrinsics use C name-mangling. */ 35 #endif /* __cplusplus */ 36 37 /* The data type intended for user use. */ 38 typedef unsigned long long __m64, __int64; 39 40 /* Internal data types for implementing the intrinsics. */ 41 typedef int __v2si __attribute__ ((vector_size (8))); 42 typedef short __v4hi __attribute__ ((vector_size (8))); 43 typedef signed char __v8qi __attribute__ ((vector_size (8))); 44 45 /* Provided for source compatibility with MMX. */ 46 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 47 _mm_empty (void) 48 { 49 } 50 51 /* "Convert" __m64 and __int64 into each other. */ 52 static __inline __m64 53 _mm_cvtsi64_m64 (__int64 __i) 54 { 55 return __i; 56 } 57 58 static __inline __int64 59 _mm_cvtm64_si64 (__m64 __i) 60 { 61 return __i; 62 } 63 64 static __inline int 65 _mm_cvtsi64_si32 (__int64 __i) 66 { 67 return __i; 68 } 69 70 static __inline __int64 71 _mm_cvtsi32_si64 (int __i) 72 { 73 return (__i & 0xffffffff); 74 } 75 76 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of 77 the result, and the four 16-bit values from M2 into the upper four 8-bit 78 values of the result, all with signed saturation. */ 79 static __inline __m64 80 _mm_packs_pi16 (__m64 __m1, __m64 __m2) 81 { 82 return (__m64) __builtin_arm_wpackhss ((__v4hi)__m1, (__v4hi)__m2); 83 } 84 85 /* Pack the two 32-bit values from M1 in to the lower two 16-bit values of 86 the result, and the two 32-bit values from M2 into the upper two 16-bit 87 values of the result, all with signed saturation. */ 88 static __inline __m64 89 _mm_packs_pi32 (__m64 __m1, __m64 __m2) 90 { 91 return (__m64) __builtin_arm_wpackwss ((__v2si)__m1, (__v2si)__m2); 92 } 93 94 /* Copy the 64-bit value from M1 into the lower 32-bits of the result, and 95 the 64-bit value from M2 into the upper 32-bits of the result, all with 96 signed saturation for values that do not fit exactly into 32-bits. */ 97 static __inline __m64 98 _mm_packs_pi64 (__m64 __m1, __m64 __m2) 99 { 100 return (__m64) __builtin_arm_wpackdss ((long long)__m1, (long long)__m2); 101 } 102 103 /* Pack the four 16-bit values from M1 into the lower four 8-bit values of 104 the result, and the four 16-bit values from M2 into the upper four 8-bit 105 values of the result, all with unsigned saturation. */ 106 static __inline __m64 107 _mm_packs_pu16 (__m64 __m1, __m64 __m2) 108 { 109 return (__m64) __builtin_arm_wpackhus ((__v4hi)__m1, (__v4hi)__m2); 110 } 111 112 /* Pack the two 32-bit values from M1 into the lower two 16-bit values of 113 the result, and the two 32-bit values from M2 into the upper two 16-bit 114 values of the result, all with unsigned saturation. */ 115 static __inline __m64 116 _mm_packs_pu32 (__m64 __m1, __m64 __m2) 117 { 118 return (__m64) __builtin_arm_wpackwus ((__v2si)__m1, (__v2si)__m2); 119 } 120 121 /* Copy the 64-bit value from M1 into the lower 32-bits of the result, and 122 the 64-bit value from M2 into the upper 32-bits of the result, all with 123 unsigned saturation for values that do not fit exactly into 32-bits. */ 124 static __inline __m64 125 _mm_packs_pu64 (__m64 __m1, __m64 __m2) 126 { 127 return (__m64) __builtin_arm_wpackdus ((long long)__m1, (long long)__m2); 128 } 129 130 /* Interleave the four 8-bit values from the high half of M1 with the four 131 8-bit values from the high half of M2. */ 132 static __inline __m64 133 _mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) 134 { 135 return (__m64) __builtin_arm_wunpckihb ((__v8qi)__m1, (__v8qi)__m2); 136 } 137 138 /* Interleave the two 16-bit values from the high half of M1 with the two 139 16-bit values from the high half of M2. */ 140 static __inline __m64 141 _mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) 142 { 143 return (__m64) __builtin_arm_wunpckihh ((__v4hi)__m1, (__v4hi)__m2); 144 } 145 146 /* Interleave the 32-bit value from the high half of M1 with the 32-bit 147 value from the high half of M2. */ 148 static __inline __m64 149 _mm_unpackhi_pi32 (__m64 __m1, __m64 __m2) 150 { 151 return (__m64) __builtin_arm_wunpckihw ((__v2si)__m1, (__v2si)__m2); 152 } 153 154 /* Interleave the four 8-bit values from the low half of M1 with the four 155 8-bit values from the low half of M2. */ 156 static __inline __m64 157 _mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) 158 { 159 return (__m64) __builtin_arm_wunpckilb ((__v8qi)__m1, (__v8qi)__m2); 160 } 161 162 /* Interleave the two 16-bit values from the low half of M1 with the two 163 16-bit values from the low half of M2. */ 164 static __inline __m64 165 _mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) 166 { 167 return (__m64) __builtin_arm_wunpckilh ((__v4hi)__m1, (__v4hi)__m2); 168 } 169 170 /* Interleave the 32-bit value from the low half of M1 with the 32-bit 171 value from the low half of M2. */ 172 static __inline __m64 173 _mm_unpacklo_pi32 (__m64 __m1, __m64 __m2) 174 { 175 return (__m64) __builtin_arm_wunpckilw ((__v2si)__m1, (__v2si)__m2); 176 } 177 178 /* Take the four 8-bit values from the low half of M1, sign extend them, 179 and return the result as a vector of four 16-bit quantities. */ 180 static __inline __m64 181 _mm_unpackel_pi8 (__m64 __m1) 182 { 183 return (__m64) __builtin_arm_wunpckelsb ((__v8qi)__m1); 184 } 185 186 /* Take the two 16-bit values from the low half of M1, sign extend them, 187 and return the result as a vector of two 32-bit quantities. */ 188 static __inline __m64 189 _mm_unpackel_pi16 (__m64 __m1) 190 { 191 return (__m64) __builtin_arm_wunpckelsh ((__v4hi)__m1); 192 } 193 194 /* Take the 32-bit value from the low half of M1, and return it sign extended 195 to 64 bits. */ 196 static __inline __m64 197 _mm_unpackel_pi32 (__m64 __m1) 198 { 199 return (__m64) __builtin_arm_wunpckelsw ((__v2si)__m1); 200 } 201 202 /* Take the four 8-bit values from the high half of M1, sign extend them, 203 and return the result as a vector of four 16-bit quantities. */ 204 static __inline __m64 205 _mm_unpackeh_pi8 (__m64 __m1) 206 { 207 return (__m64) __builtin_arm_wunpckehsb ((__v8qi)__m1); 208 } 209 210 /* Take the two 16-bit values from the high half of M1, sign extend them, 211 and return the result as a vector of two 32-bit quantities. */ 212 static __inline __m64 213 _mm_unpackeh_pi16 (__m64 __m1) 214 { 215 return (__m64) __builtin_arm_wunpckehsh ((__v4hi)__m1); 216 } 217 218 /* Take the 32-bit value from the high half of M1, and return it sign extended 219 to 64 bits. */ 220 static __inline __m64 221 _mm_unpackeh_pi32 (__m64 __m1) 222 { 223 return (__m64) __builtin_arm_wunpckehsw ((__v2si)__m1); 224 } 225 226 /* Take the four 8-bit values from the low half of M1, zero extend them, 227 and return the result as a vector of four 16-bit quantities. */ 228 static __inline __m64 229 _mm_unpackel_pu8 (__m64 __m1) 230 { 231 return (__m64) __builtin_arm_wunpckelub ((__v8qi)__m1); 232 } 233 234 /* Take the two 16-bit values from the low half of M1, zero extend them, 235 and return the result as a vector of two 32-bit quantities. */ 236 static __inline __m64 237 _mm_unpackel_pu16 (__m64 __m1) 238 { 239 return (__m64) __builtin_arm_wunpckeluh ((__v4hi)__m1); 240 } 241 242 /* Take the 32-bit value from the low half of M1, and return it zero extended 243 to 64 bits. */ 244 static __inline __m64 245 _mm_unpackel_pu32 (__m64 __m1) 246 { 247 return (__m64) __builtin_arm_wunpckeluw ((__v2si)__m1); 248 } 249 250 /* Take the four 8-bit values from the high half of M1, zero extend them, 251 and return the result as a vector of four 16-bit quantities. */ 252 static __inline __m64 253 _mm_unpackeh_pu8 (__m64 __m1) 254 { 255 return (__m64) __builtin_arm_wunpckehub ((__v8qi)__m1); 256 } 257 258 /* Take the two 16-bit values from the high half of M1, zero extend them, 259 and return the result as a vector of two 32-bit quantities. */ 260 static __inline __m64 261 _mm_unpackeh_pu16 (__m64 __m1) 262 { 263 return (__m64) __builtin_arm_wunpckehuh ((__v4hi)__m1); 264 } 265 266 /* Take the 32-bit value from the high half of M1, and return it zero extended 267 to 64 bits. */ 268 static __inline __m64 269 _mm_unpackeh_pu32 (__m64 __m1) 270 { 271 return (__m64) __builtin_arm_wunpckehuw ((__v2si)__m1); 272 } 273 274 /* Add the 8-bit values in M1 to the 8-bit values in M2. */ 275 static __inline __m64 276 _mm_add_pi8 (__m64 __m1, __m64 __m2) 277 { 278 return (__m64) __builtin_arm_waddb ((__v8qi)__m1, (__v8qi)__m2); 279 } 280 281 /* Add the 16-bit values in M1 to the 16-bit values in M2. */ 282 static __inline __m64 283 _mm_add_pi16 (__m64 __m1, __m64 __m2) 284 { 285 return (__m64) __builtin_arm_waddh ((__v4hi)__m1, (__v4hi)__m2); 286 } 287 288 /* Add the 32-bit values in M1 to the 32-bit values in M2. */ 289 static __inline __m64 290 _mm_add_pi32 (__m64 __m1, __m64 __m2) 291 { 292 return (__m64) __builtin_arm_waddw ((__v2si)__m1, (__v2si)__m2); 293 } 294 295 /* Add the 8-bit values in M1 to the 8-bit values in M2 using signed 296 saturated arithmetic. */ 297 static __inline __m64 298 _mm_adds_pi8 (__m64 __m1, __m64 __m2) 299 { 300 return (__m64) __builtin_arm_waddbss ((__v8qi)__m1, (__v8qi)__m2); 301 } 302 303 /* Add the 16-bit values in M1 to the 16-bit values in M2 using signed 304 saturated arithmetic. */ 305 static __inline __m64 306 _mm_adds_pi16 (__m64 __m1, __m64 __m2) 307 { 308 return (__m64) __builtin_arm_waddhss ((__v4hi)__m1, (__v4hi)__m2); 309 } 310 311 /* Add the 32-bit values in M1 to the 32-bit values in M2 using signed 312 saturated arithmetic. */ 313 static __inline __m64 314 _mm_adds_pi32 (__m64 __m1, __m64 __m2) 315 { 316 return (__m64) __builtin_arm_waddwss ((__v2si)__m1, (__v2si)__m2); 317 } 318 319 /* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned 320 saturated arithmetic. */ 321 static __inline __m64 322 _mm_adds_pu8 (__m64 __m1, __m64 __m2) 323 { 324 return (__m64) __builtin_arm_waddbus ((__v8qi)__m1, (__v8qi)__m2); 325 } 326 327 /* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned 328 saturated arithmetic. */ 329 static __inline __m64 330 _mm_adds_pu16 (__m64 __m1, __m64 __m2) 331 { 332 return (__m64) __builtin_arm_waddhus ((__v4hi)__m1, (__v4hi)__m2); 333 } 334 335 /* Add the 32-bit values in M1 to the 32-bit values in M2 using unsigned 336 saturated arithmetic. */ 337 static __inline __m64 338 _mm_adds_pu32 (__m64 __m1, __m64 __m2) 339 { 340 return (__m64) __builtin_arm_waddwus ((__v2si)__m1, (__v2si)__m2); 341 } 342 343 /* Subtract the 8-bit values in M2 from the 8-bit values in M1. */ 344 static __inline __m64 345 _mm_sub_pi8 (__m64 __m1, __m64 __m2) 346 { 347 return (__m64) __builtin_arm_wsubb ((__v8qi)__m1, (__v8qi)__m2); 348 } 349 350 /* Subtract the 16-bit values in M2 from the 16-bit values in M1. */ 351 static __inline __m64 352 _mm_sub_pi16 (__m64 __m1, __m64 __m2) 353 { 354 return (__m64) __builtin_arm_wsubh ((__v4hi)__m1, (__v4hi)__m2); 355 } 356 357 /* Subtract the 32-bit values in M2 from the 32-bit values in M1. */ 358 static __inline __m64 359 _mm_sub_pi32 (__m64 __m1, __m64 __m2) 360 { 361 return (__m64) __builtin_arm_wsubw ((__v2si)__m1, (__v2si)__m2); 362 } 363 364 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed 365 saturating arithmetic. */ 366 static __inline __m64 367 _mm_subs_pi8 (__m64 __m1, __m64 __m2) 368 { 369 return (__m64) __builtin_arm_wsubbss ((__v8qi)__m1, (__v8qi)__m2); 370 } 371 372 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using 373 signed saturating arithmetic. */ 374 static __inline __m64 375 _mm_subs_pi16 (__m64 __m1, __m64 __m2) 376 { 377 return (__m64) __builtin_arm_wsubhss ((__v4hi)__m1, (__v4hi)__m2); 378 } 379 380 /* Subtract the 32-bit values in M2 from the 32-bit values in M1 using 381 signed saturating arithmetic. */ 382 static __inline __m64 383 _mm_subs_pi32 (__m64 __m1, __m64 __m2) 384 { 385 return (__m64) __builtin_arm_wsubwss ((__v2si)__m1, (__v2si)__m2); 386 } 387 388 /* Subtract the 8-bit values in M2 from the 8-bit values in M1 using 389 unsigned saturating arithmetic. */ 390 static __inline __m64 391 _mm_subs_pu8 (__m64 __m1, __m64 __m2) 392 { 393 return (__m64) __builtin_arm_wsubbus ((__v8qi)__m1, (__v8qi)__m2); 394 } 395 396 /* Subtract the 16-bit values in M2 from the 16-bit values in M1 using 397 unsigned saturating arithmetic. */ 398 static __inline __m64 399 _mm_subs_pu16 (__m64 __m1, __m64 __m2) 400 { 401 return (__m64) __builtin_arm_wsubhus ((__v4hi)__m1, (__v4hi)__m2); 402 } 403 404 /* Subtract the 32-bit values in M2 from the 32-bit values in M1 using 405 unsigned saturating arithmetic. */ 406 static __inline __m64 407 _mm_subs_pu32 (__m64 __m1, __m64 __m2) 408 { 409 return (__m64) __builtin_arm_wsubwus ((__v2si)__m1, (__v2si)__m2); 410 } 411 412 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing 413 four 32-bit intermediate results, which are then summed by pairs to 414 produce two 32-bit results. */ 415 static __inline __m64 416 _mm_madd_pi16 (__m64 __m1, __m64 __m2) 417 { 418 return (__m64) __builtin_arm_wmadds ((__v4hi)__m1, (__v4hi)__m2); 419 } 420 421 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing 422 four 32-bit intermediate results, which are then summed by pairs to 423 produce two 32-bit results. */ 424 static __inline __m64 425 _mm_madd_pu16 (__m64 __m1, __m64 __m2) 426 { 427 return (__m64) __builtin_arm_wmaddu ((__v4hi)__m1, (__v4hi)__m2); 428 } 429 430 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in 431 M2 and produce the high 16 bits of the 32-bit results. */ 432 static __inline __m64 433 _mm_mulhi_pi16 (__m64 __m1, __m64 __m2) 434 { 435 return (__m64) __builtin_arm_wmulsm ((__v4hi)__m1, (__v4hi)__m2); 436 } 437 438 /* Multiply four signed 16-bit values in M1 by four signed 16-bit values in 439 M2 and produce the high 16 bits of the 32-bit results. */ 440 static __inline __m64 441 _mm_mulhi_pu16 (__m64 __m1, __m64 __m2) 442 { 443 return (__m64) __builtin_arm_wmulum ((__v4hi)__m1, (__v4hi)__m2); 444 } 445 446 /* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce 447 the low 16 bits of the results. */ 448 static __inline __m64 449 _mm_mullo_pi16 (__m64 __m1, __m64 __m2) 450 { 451 return (__m64) __builtin_arm_wmulul ((__v4hi)__m1, (__v4hi)__m2); 452 } 453 454 /* Shift four 16-bit values in M left by COUNT. */ 455 static __inline __m64 456 _mm_sll_pi16 (__m64 __m, __m64 __count) 457 { 458 return (__m64) __builtin_arm_wsllh ((__v4hi)__m, __count); 459 } 460 461 static __inline __m64 462 _mm_slli_pi16 (__m64 __m, int __count) 463 { 464 return (__m64) __builtin_arm_wsllhi ((__v4hi)__m, __count); 465 } 466 467 /* Shift two 32-bit values in M left by COUNT. */ 468 static __inline __m64 469 _mm_sll_pi32 (__m64 __m, __m64 __count) 470 { 471 return (__m64) __builtin_arm_wsllw ((__v2si)__m, __count); 472 } 473 474 static __inline __m64 475 _mm_slli_pi32 (__m64 __m, int __count) 476 { 477 return (__m64) __builtin_arm_wsllwi ((__v2si)__m, __count); 478 } 479 480 /* Shift the 64-bit value in M left by COUNT. */ 481 static __inline __m64 482 _mm_sll_si64 (__m64 __m, __m64 __count) 483 { 484 return (__m64) __builtin_arm_wslld (__m, __count); 485 } 486 487 static __inline __m64 488 _mm_slli_si64 (__m64 __m, int __count) 489 { 490 return (__m64) __builtin_arm_wslldi (__m, __count); 491 } 492 493 /* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */ 494 static __inline __m64 495 _mm_sra_pi16 (__m64 __m, __m64 __count) 496 { 497 return (__m64) __builtin_arm_wsrah ((__v4hi)__m, __count); 498 } 499 500 static __inline __m64 501 _mm_srai_pi16 (__m64 __m, int __count) 502 { 503 return (__m64) __builtin_arm_wsrahi ((__v4hi)__m, __count); 504 } 505 506 /* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */ 507 static __inline __m64 508 _mm_sra_pi32 (__m64 __m, __m64 __count) 509 { 510 return (__m64) __builtin_arm_wsraw ((__v2si)__m, __count); 511 } 512 513 static __inline __m64 514 _mm_srai_pi32 (__m64 __m, int __count) 515 { 516 return (__m64) __builtin_arm_wsrawi ((__v2si)__m, __count); 517 } 518 519 /* Shift the 64-bit value in M right by COUNT; shift in the sign bit. */ 520 static __inline __m64 521 _mm_sra_si64 (__m64 __m, __m64 __count) 522 { 523 return (__m64) __builtin_arm_wsrad (__m, __count); 524 } 525 526 static __inline __m64 527 _mm_srai_si64 (__m64 __m, int __count) 528 { 529 return (__m64) __builtin_arm_wsradi (__m, __count); 530 } 531 532 /* Shift four 16-bit values in M right by COUNT; shift in zeros. */ 533 static __inline __m64 534 _mm_srl_pi16 (__m64 __m, __m64 __count) 535 { 536 return (__m64) __builtin_arm_wsrlh ((__v4hi)__m, __count); 537 } 538 539 static __inline __m64 540 _mm_srli_pi16 (__m64 __m, int __count) 541 { 542 return (__m64) __builtin_arm_wsrlhi ((__v4hi)__m, __count); 543 } 544 545 /* Shift two 32-bit values in M right by COUNT; shift in zeros. */ 546 static __inline __m64 547 _mm_srl_pi32 (__m64 __m, __m64 __count) 548 { 549 return (__m64) __builtin_arm_wsrlw ((__v2si)__m, __count); 550 } 551 552 static __inline __m64 553 _mm_srli_pi32 (__m64 __m, int __count) 554 { 555 return (__m64) __builtin_arm_wsrlwi ((__v2si)__m, __count); 556 } 557 558 /* Shift the 64-bit value in M left by COUNT; shift in zeros. */ 559 static __inline __m64 560 _mm_srl_si64 (__m64 __m, __m64 __count) 561 { 562 return (__m64) __builtin_arm_wsrld (__m, __count); 563 } 564 565 static __inline __m64 566 _mm_srli_si64 (__m64 __m, int __count) 567 { 568 return (__m64) __builtin_arm_wsrldi (__m, __count); 569 } 570 571 /* Rotate four 16-bit values in M right by COUNT. */ 572 static __inline __m64 573 _mm_ror_pi16 (__m64 __m, __m64 __count) 574 { 575 return (__m64) __builtin_arm_wrorh ((__v4hi)__m, __count); 576 } 577 578 static __inline __m64 579 _mm_rori_pi16 (__m64 __m, int __count) 580 { 581 return (__m64) __builtin_arm_wrorhi ((__v4hi)__m, __count); 582 } 583 584 /* Rotate two 32-bit values in M right by COUNT. */ 585 static __inline __m64 586 _mm_ror_pi32 (__m64 __m, __m64 __count) 587 { 588 return (__m64) __builtin_arm_wrorw ((__v2si)__m, __count); 589 } 590 591 static __inline __m64 592 _mm_rori_pi32 (__m64 __m, int __count) 593 { 594 return (__m64) __builtin_arm_wrorwi ((__v2si)__m, __count); 595 } 596 597 /* Rotate two 64-bit values in M right by COUNT. */ 598 static __inline __m64 599 _mm_ror_si64 (__m64 __m, __m64 __count) 600 { 601 return (__m64) __builtin_arm_wrord (__m, __count); 602 } 603 604 static __inline __m64 605 _mm_rori_si64 (__m64 __m, int __count) 606 { 607 return (__m64) __builtin_arm_wrordi (__m, __count); 608 } 609 610 /* Bit-wise AND the 64-bit values in M1 and M2. */ 611 static __inline __m64 612 _mm_and_si64 (__m64 __m1, __m64 __m2) 613 { 614 return __builtin_arm_wand (__m1, __m2); 615 } 616 617 /* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the 618 64-bit value in M2. */ 619 static __inline __m64 620 _mm_andnot_si64 (__m64 __m1, __m64 __m2) 621 { 622 return __builtin_arm_wandn (__m2, __m1); 623 } 624 625 /* Bit-wise inclusive OR the 64-bit values in M1 and M2. */ 626 static __inline __m64 627 _mm_or_si64 (__m64 __m1, __m64 __m2) 628 { 629 return __builtin_arm_wor (__m1, __m2); 630 } 631 632 /* Bit-wise exclusive OR the 64-bit values in M1 and M2. */ 633 static __inline __m64 634 _mm_xor_si64 (__m64 __m1, __m64 __m2) 635 { 636 return __builtin_arm_wxor (__m1, __m2); 637 } 638 639 /* Compare eight 8-bit values. The result of the comparison is 0xFF if the 640 test is true and zero if false. */ 641 static __inline __m64 642 _mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) 643 { 644 return (__m64) __builtin_arm_wcmpeqb ((__v8qi)__m1, (__v8qi)__m2); 645 } 646 647 static __inline __m64 648 _mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) 649 { 650 return (__m64) __builtin_arm_wcmpgtsb ((__v8qi)__m1, (__v8qi)__m2); 651 } 652 653 static __inline __m64 654 _mm_cmpgt_pu8 (__m64 __m1, __m64 __m2) 655 { 656 return (__m64) __builtin_arm_wcmpgtub ((__v8qi)__m1, (__v8qi)__m2); 657 } 658 659 /* Compare four 16-bit values. The result of the comparison is 0xFFFF if 660 the test is true and zero if false. */ 661 static __inline __m64 662 _mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) 663 { 664 return (__m64) __builtin_arm_wcmpeqh ((__v4hi)__m1, (__v4hi)__m2); 665 } 666 667 static __inline __m64 668 _mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) 669 { 670 return (__m64) __builtin_arm_wcmpgtsh ((__v4hi)__m1, (__v4hi)__m2); 671 } 672 673 static __inline __m64 674 _mm_cmpgt_pu16 (__m64 __m1, __m64 __m2) 675 { 676 return (__m64) __builtin_arm_wcmpgtuh ((__v4hi)__m1, (__v4hi)__m2); 677 } 678 679 /* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if 680 the test is true and zero if false. */ 681 static __inline __m64 682 _mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) 683 { 684 return (__m64) __builtin_arm_wcmpeqw ((__v2si)__m1, (__v2si)__m2); 685 } 686 687 static __inline __m64 688 _mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) 689 { 690 return (__m64) __builtin_arm_wcmpgtsw ((__v2si)__m1, (__v2si)__m2); 691 } 692 693 static __inline __m64 694 _mm_cmpgt_pu32 (__m64 __m1, __m64 __m2) 695 { 696 return (__m64) __builtin_arm_wcmpgtuw ((__v2si)__m1, (__v2si)__m2); 697 } 698 699 /* Element-wise multiplication of unsigned 16-bit values __B and __C, followed 700 by accumulate across all elements and __A. */ 701 static __inline __m64 702 _mm_mac_pu16 (__m64 __A, __m64 __B, __m64 __C) 703 { 704 return __builtin_arm_wmacu (__A, (__v4hi)__B, (__v4hi)__C); 705 } 706 707 /* Element-wise multiplication of signed 16-bit values __B and __C, followed 708 by accumulate across all elements and __A. */ 709 static __inline __m64 710 _mm_mac_pi16 (__m64 __A, __m64 __B, __m64 __C) 711 { 712 return __builtin_arm_wmacs (__A, (__v4hi)__B, (__v4hi)__C); 713 } 714 715 /* Element-wise multiplication of unsigned 16-bit values __B and __C, followed 716 by accumulate across all elements. */ 717 static __inline __m64 718 _mm_macz_pu16 (__m64 __A, __m64 __B) 719 { 720 return __builtin_arm_wmacuz ((__v4hi)__A, (__v4hi)__B); 721 } 722 723 /* Element-wise multiplication of signed 16-bit values __B and __C, followed 724 by accumulate across all elements. */ 725 static __inline __m64 726 _mm_macz_pi16 (__m64 __A, __m64 __B) 727 { 728 return __builtin_arm_wmacsz ((__v4hi)__A, (__v4hi)__B); 729 } 730 731 /* Accumulate across all unsigned 8-bit values in __A. */ 732 static __inline __m64 733 _mm_acc_pu8 (__m64 __A) 734 { 735 return __builtin_arm_waccb ((__v8qi)__A); 736 } 737 738 /* Accumulate across all unsigned 16-bit values in __A. */ 739 static __inline __m64 740 _mm_acc_pu16 (__m64 __A) 741 { 742 return __builtin_arm_wacch ((__v4hi)__A); 743 } 744 745 /* Accumulate across all unsigned 32-bit values in __A. */ 746 static __inline __m64 747 _mm_acc_pu32 (__m64 __A) 748 { 749 return __builtin_arm_waccw ((__v2si)__A); 750 } 751 752 static __inline __m64 753 _mm_mia_si64 (__m64 __A, int __B, int __C) 754 { 755 return __builtin_arm_tmia (__A, __B, __C); 756 } 757 758 static __inline __m64 759 _mm_miaph_si64 (__m64 __A, int __B, int __C) 760 { 761 return __builtin_arm_tmiaph (__A, __B, __C); 762 } 763 764 static __inline __m64 765 _mm_miabb_si64 (__m64 __A, int __B, int __C) 766 { 767 return __builtin_arm_tmiabb (__A, __B, __C); 768 } 769 770 static __inline __m64 771 _mm_miabt_si64 (__m64 __A, int __B, int __C) 772 { 773 return __builtin_arm_tmiabt (__A, __B, __C); 774 } 775 776 static __inline __m64 777 _mm_miatb_si64 (__m64 __A, int __B, int __C) 778 { 779 return __builtin_arm_tmiatb (__A, __B, __C); 780 } 781 782 static __inline __m64 783 _mm_miatt_si64 (__m64 __A, int __B, int __C) 784 { 785 return __builtin_arm_tmiatt (__A, __B, __C); 786 } 787 788 /* Extract one of the elements of A and sign extend. The selector N must 789 be immediate. */ 790 #define _mm_extract_pi8(A, N) __builtin_arm_textrmsb ((__v8qi)(A), (N)) 791 #define _mm_extract_pi16(A, N) __builtin_arm_textrmsh ((__v4hi)(A), (N)) 792 #define _mm_extract_pi32(A, N) __builtin_arm_textrmsw ((__v2si)(A), (N)) 793 794 /* Extract one of the elements of A and zero extend. The selector N must 795 be immediate. */ 796 #define _mm_extract_pu8(A, N) __builtin_arm_textrmub ((__v8qi)(A), (N)) 797 #define _mm_extract_pu16(A, N) __builtin_arm_textrmuh ((__v4hi)(A), (N)) 798 #define _mm_extract_pu32(A, N) __builtin_arm_textrmuw ((__v2si)(A), (N)) 799 800 /* Inserts word D into one of the elements of A. The selector N must be 801 immediate. */ 802 #define _mm_insert_pi8(A, D, N) \ 803 ((__m64) __builtin_arm_tinsrb ((__v8qi)(A), (D), (N))) 804 #define _mm_insert_pi16(A, D, N) \ 805 ((__m64) __builtin_arm_tinsrh ((__v4hi)(A), (D), (N))) 806 #define _mm_insert_pi32(A, D, N) \ 807 ((__m64) __builtin_arm_tinsrw ((__v2si)(A), (D), (N))) 808 809 /* Compute the element-wise maximum of signed 8-bit values. */ 810 static __inline __m64 811 _mm_max_pi8 (__m64 __A, __m64 __B) 812 { 813 return (__m64) __builtin_arm_wmaxsb ((__v8qi)__A, (__v8qi)__B); 814 } 815 816 /* Compute the element-wise maximum of signed 16-bit values. */ 817 static __inline __m64 818 _mm_max_pi16 (__m64 __A, __m64 __B) 819 { 820 return (__m64) __builtin_arm_wmaxsh ((__v4hi)__A, (__v4hi)__B); 821 } 822 823 /* Compute the element-wise maximum of signed 32-bit values. */ 824 static __inline __m64 825 _mm_max_pi32 (__m64 __A, __m64 __B) 826 { 827 return (__m64) __builtin_arm_wmaxsw ((__v2si)__A, (__v2si)__B); 828 } 829 830 /* Compute the element-wise maximum of unsigned 8-bit values. */ 831 static __inline __m64 832 _mm_max_pu8 (__m64 __A, __m64 __B) 833 { 834 return (__m64) __builtin_arm_wmaxub ((__v8qi)__A, (__v8qi)__B); 835 } 836 837 /* Compute the element-wise maximum of unsigned 16-bit values. */ 838 static __inline __m64 839 _mm_max_pu16 (__m64 __A, __m64 __B) 840 { 841 return (__m64) __builtin_arm_wmaxuh ((__v4hi)__A, (__v4hi)__B); 842 } 843 844 /* Compute the element-wise maximum of unsigned 32-bit values. */ 845 static __inline __m64 846 _mm_max_pu32 (__m64 __A, __m64 __B) 847 { 848 return (__m64) __builtin_arm_wmaxuw ((__v2si)__A, (__v2si)__B); 849 } 850 851 /* Compute the element-wise minimum of signed 16-bit values. */ 852 static __inline __m64 853 _mm_min_pi8 (__m64 __A, __m64 __B) 854 { 855 return (__m64) __builtin_arm_wminsb ((__v8qi)__A, (__v8qi)__B); 856 } 857 858 /* Compute the element-wise minimum of signed 16-bit values. */ 859 static __inline __m64 860 _mm_min_pi16 (__m64 __A, __m64 __B) 861 { 862 return (__m64) __builtin_arm_wminsh ((__v4hi)__A, (__v4hi)__B); 863 } 864 865 /* Compute the element-wise minimum of signed 32-bit values. */ 866 static __inline __m64 867 _mm_min_pi32 (__m64 __A, __m64 __B) 868 { 869 return (__m64) __builtin_arm_wminsw ((__v2si)__A, (__v2si)__B); 870 } 871 872 /* Compute the element-wise minimum of unsigned 16-bit values. */ 873 static __inline __m64 874 _mm_min_pu8 (__m64 __A, __m64 __B) 875 { 876 return (__m64) __builtin_arm_wminub ((__v8qi)__A, (__v8qi)__B); 877 } 878 879 /* Compute the element-wise minimum of unsigned 16-bit values. */ 880 static __inline __m64 881 _mm_min_pu16 (__m64 __A, __m64 __B) 882 { 883 return (__m64) __builtin_arm_wminuh ((__v4hi)__A, (__v4hi)__B); 884 } 885 886 /* Compute the element-wise minimum of unsigned 32-bit values. */ 887 static __inline __m64 888 _mm_min_pu32 (__m64 __A, __m64 __B) 889 { 890 return (__m64) __builtin_arm_wminuw ((__v2si)__A, (__v2si)__B); 891 } 892 893 /* Create an 8-bit mask of the signs of 8-bit values. */ 894 static __inline int 895 _mm_movemask_pi8 (__m64 __A) 896 { 897 return __builtin_arm_tmovmskb ((__v8qi)__A); 898 } 899 900 /* Create an 8-bit mask of the signs of 16-bit values. */ 901 static __inline int 902 _mm_movemask_pi16 (__m64 __A) 903 { 904 return __builtin_arm_tmovmskh ((__v4hi)__A); 905 } 906 907 /* Create an 8-bit mask of the signs of 32-bit values. */ 908 static __inline int 909 _mm_movemask_pi32 (__m64 __A) 910 { 911 return __builtin_arm_tmovmskw ((__v2si)__A); 912 } 913 914 /* Return a combination of the four 16-bit values in A. The selector 915 must be an immediate. */ 916 #define _mm_shuffle_pi16(A, N) \ 917 ((__m64) __builtin_arm_wshufh ((__v4hi)(A), (N))) 918 919 920 /* Compute the rounded averages of the unsigned 8-bit values in A and B. */ 921 static __inline __m64 922 _mm_avg_pu8 (__m64 __A, __m64 __B) 923 { 924 return (__m64) __builtin_arm_wavg2br ((__v8qi)__A, (__v8qi)__B); 925 } 926 927 /* Compute the rounded averages of the unsigned 16-bit values in A and B. */ 928 static __inline __m64 929 _mm_avg_pu16 (__m64 __A, __m64 __B) 930 { 931 return (__m64) __builtin_arm_wavg2hr ((__v4hi)__A, (__v4hi)__B); 932 } 933 934 /* Compute the averages of the unsigned 8-bit values in A and B. */ 935 static __inline __m64 936 _mm_avg2_pu8 (__m64 __A, __m64 __B) 937 { 938 return (__m64) __builtin_arm_wavg2b ((__v8qi)__A, (__v8qi)__B); 939 } 940 941 /* Compute the averages of the unsigned 16-bit values in A and B. */ 942 static __inline __m64 943 _mm_avg2_pu16 (__m64 __A, __m64 __B) 944 { 945 return (__m64) __builtin_arm_wavg2h ((__v4hi)__A, (__v4hi)__B); 946 } 947 948 /* Compute the sum of the absolute differences of the unsigned 8-bit 949 values in A and B. Return the value in the lower 16-bit word; the 950 upper words are cleared. */ 951 static __inline __m64 952 _mm_sad_pu8 (__m64 __A, __m64 __B) 953 { 954 return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B); 955 } 956 957 static __inline __m64 958 _mm_sada_pu8 (__m64 __A, __m64 __B, __m64 __C) 959 { 960 return (__m64) __builtin_arm_wsadb ((__v2si)__A, (__v8qi)__B, (__v8qi)__C); 961 } 962 963 /* Compute the sum of the absolute differences of the unsigned 16-bit 964 values in A and B. Return the value in the lower 32-bit word; the 965 upper words are cleared. */ 966 static __inline __m64 967 _mm_sad_pu16 (__m64 __A, __m64 __B) 968 { 969 return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B); 970 } 971 972 static __inline __m64 973 _mm_sada_pu16 (__m64 __A, __m64 __B, __m64 __C) 974 { 975 return (__m64) __builtin_arm_wsadh ((__v2si)__A, (__v4hi)__B, (__v4hi)__C); 976 } 977 978 979 /* Compute the sum of the absolute differences of the unsigned 8-bit 980 values in A and B. Return the value in the lower 16-bit word; the 981 upper words are cleared. */ 982 static __inline __m64 983 _mm_sadz_pu8 (__m64 __A, __m64 __B) 984 { 985 return (__m64) __builtin_arm_wsadbz ((__v8qi)__A, (__v8qi)__B); 986 } 987 988 /* Compute the sum of the absolute differences of the unsigned 16-bit 989 values in A and B. Return the value in the lower 32-bit word; the 990 upper words are cleared. */ 991 static __inline __m64 992 _mm_sadz_pu16 (__m64 __A, __m64 __B) 993 { 994 return (__m64) __builtin_arm_wsadhz ((__v4hi)__A, (__v4hi)__B); 995 } 996 997 #define _mm_align_si64(__A,__B, N) \ 998 (__m64) __builtin_arm_walign ((__v8qi) (__A),(__v8qi) (__B), (N)) 999 1000 /* Creates a 64-bit zero. */ 1001 static __inline __m64 1002 _mm_setzero_si64 (void) 1003 { 1004 return __builtin_arm_wzero (); 1005 } 1006 1007 /* Set and Get arbitrary iWMMXt Control registers. 1008 Note only registers 0-3 and 8-11 are currently defined, 1009 the rest are reserved. */ 1010 1011 static __inline void 1012 _mm_setwcx (const int __value, const int __regno) 1013 { 1014 switch (__regno) 1015 { 1016 case 0: 1017 __asm __volatile ("tmcr wcid, %0" :: "r"(__value)); 1018 break; 1019 case 1: 1020 __asm __volatile ("tmcr wcon, %0" :: "r"(__value)); 1021 break; 1022 case 2: 1023 __asm __volatile ("tmcr wcssf, %0" :: "r"(__value)); 1024 break; 1025 case 3: 1026 __asm __volatile ("tmcr wcasf, %0" :: "r"(__value)); 1027 break; 1028 case 8: 1029 __builtin_arm_setwcgr0 (__value); 1030 break; 1031 case 9: 1032 __builtin_arm_setwcgr1 (__value); 1033 break; 1034 case 10: 1035 __builtin_arm_setwcgr2 (__value); 1036 break; 1037 case 11: 1038 __builtin_arm_setwcgr3 (__value); 1039 break; 1040 default: 1041 break; 1042 } 1043 } 1044 1045 static __inline int 1046 _mm_getwcx (const int __regno) 1047 { 1048 int __value; 1049 switch (__regno) 1050 { 1051 case 0: 1052 __asm __volatile ("tmrc %0, wcid" : "=r"(__value)); 1053 break; 1054 case 1: 1055 __asm __volatile ("tmrc %0, wcon" : "=r"(__value)); 1056 break; 1057 case 2: 1058 __asm __volatile ("tmrc %0, wcssf" : "=r"(__value)); 1059 break; 1060 case 3: 1061 __asm __volatile ("tmrc %0, wcasf" : "=r"(__value)); 1062 break; 1063 case 8: 1064 return __builtin_arm_getwcgr0 (); 1065 case 9: 1066 return __builtin_arm_getwcgr1 (); 1067 case 10: 1068 return __builtin_arm_getwcgr2 (); 1069 case 11: 1070 return __builtin_arm_getwcgr3 (); 1071 default: 1072 break; 1073 } 1074 return __value; 1075 } 1076 1077 /* Creates a vector of two 32-bit values; I0 is least significant. */ 1078 static __inline __m64 1079 _mm_set_pi32 (int __i1, int __i0) 1080 { 1081 union 1082 { 1083 __m64 __q; 1084 struct 1085 { 1086 unsigned int __i0; 1087 unsigned int __i1; 1088 } __s; 1089 } __u; 1090 1091 __u.__s.__i0 = __i0; 1092 __u.__s.__i1 = __i1; 1093 1094 return __u.__q; 1095 } 1096 1097 /* Creates a vector of four 16-bit values; W0 is least significant. */ 1098 static __inline __m64 1099 _mm_set_pi16 (short __w3, short __w2, short __w1, short __w0) 1100 { 1101 unsigned int __i1 = (unsigned short) __w3 << 16 | (unsigned short) __w2; 1102 unsigned int __i0 = (unsigned short) __w1 << 16 | (unsigned short) __w0; 1103 1104 return _mm_set_pi32 (__i1, __i0); 1105 } 1106 1107 /* Creates a vector of eight 8-bit values; B0 is least significant. */ 1108 static __inline __m64 1109 _mm_set_pi8 (char __b7, char __b6, char __b5, char __b4, 1110 char __b3, char __b2, char __b1, char __b0) 1111 { 1112 unsigned int __i1, __i0; 1113 1114 __i1 = (unsigned char)__b7; 1115 __i1 = __i1 << 8 | (unsigned char)__b6; 1116 __i1 = __i1 << 8 | (unsigned char)__b5; 1117 __i1 = __i1 << 8 | (unsigned char)__b4; 1118 1119 __i0 = (unsigned char)__b3; 1120 __i0 = __i0 << 8 | (unsigned char)__b2; 1121 __i0 = __i0 << 8 | (unsigned char)__b1; 1122 __i0 = __i0 << 8 | (unsigned char)__b0; 1123 1124 return _mm_set_pi32 (__i1, __i0); 1125 } 1126 1127 /* Similar, but with the arguments in reverse order. */ 1128 static __inline __m64 1129 _mm_setr_pi32 (int __i0, int __i1) 1130 { 1131 return _mm_set_pi32 (__i1, __i0); 1132 } 1133 1134 static __inline __m64 1135 _mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3) 1136 { 1137 return _mm_set_pi16 (__w3, __w2, __w1, __w0); 1138 } 1139 1140 static __inline __m64 1141 _mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3, 1142 char __b4, char __b5, char __b6, char __b7) 1143 { 1144 return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); 1145 } 1146 1147 /* Creates a vector of two 32-bit values, both elements containing I. */ 1148 static __inline __m64 1149 _mm_set1_pi32 (int __i) 1150 { 1151 return _mm_set_pi32 (__i, __i); 1152 } 1153 1154 /* Creates a vector of four 16-bit values, all elements containing W. */ 1155 static __inline __m64 1156 _mm_set1_pi16 (short __w) 1157 { 1158 unsigned int __i = (unsigned short)__w << 16 | (unsigned short)__w; 1159 return _mm_set1_pi32 (__i); 1160 } 1161 1162 /* Creates a vector of four 16-bit values, all elements containing B. */ 1163 static __inline __m64 1164 _mm_set1_pi8 (char __b) 1165 { 1166 unsigned int __w = (unsigned char)__b << 8 | (unsigned char)__b; 1167 unsigned int __i = __w << 16 | __w; 1168 return _mm_set1_pi32 (__i); 1169 } 1170 1171 #ifdef __IWMMXT2__ 1172 static __inline __m64 1173 _mm_abs_pi8 (__m64 m1) 1174 { 1175 return (__m64) __builtin_arm_wabsb ((__v8qi)m1); 1176 } 1177 1178 static __inline __m64 1179 _mm_abs_pi16 (__m64 m1) 1180 { 1181 return (__m64) __builtin_arm_wabsh ((__v4hi)m1); 1182 1183 } 1184 1185 static __inline __m64 1186 _mm_abs_pi32 (__m64 m1) 1187 { 1188 return (__m64) __builtin_arm_wabsw ((__v2si)m1); 1189 1190 } 1191 1192 static __inline __m64 1193 _mm_addsubhx_pi16 (__m64 a, __m64 b) 1194 { 1195 return (__m64) __builtin_arm_waddsubhx ((__v4hi)a, (__v4hi)b); 1196 } 1197 1198 static __inline __m64 1199 _mm_absdiff_pu8 (__m64 a, __m64 b) 1200 { 1201 return (__m64) __builtin_arm_wabsdiffb ((__v8qi)a, (__v8qi)b); 1202 } 1203 1204 static __inline __m64 1205 _mm_absdiff_pu16 (__m64 a, __m64 b) 1206 { 1207 return (__m64) __builtin_arm_wabsdiffh ((__v4hi)a, (__v4hi)b); 1208 } 1209 1210 static __inline __m64 1211 _mm_absdiff_pu32 (__m64 a, __m64 b) 1212 { 1213 return (__m64) __builtin_arm_wabsdiffw ((__v2si)a, (__v2si)b); 1214 } 1215 1216 static __inline __m64 1217 _mm_addc_pu16 (__m64 a, __m64 b) 1218 { 1219 __m64 result; 1220 __asm__ __volatile__ ("waddhc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b)); 1221 return result; 1222 } 1223 1224 static __inline __m64 1225 _mm_addc_pu32 (__m64 a, __m64 b) 1226 { 1227 __m64 result; 1228 __asm__ __volatile__ ("waddwc %0, %1, %2" : "=y" (result) : "y" (a), "y" (b)); 1229 return result; 1230 } 1231 1232 static __inline __m64 1233 _mm_avg4_pu8 (__m64 a, __m64 b) 1234 { 1235 return (__m64) __builtin_arm_wavg4 ((__v8qi)a, (__v8qi)b); 1236 } 1237 1238 static __inline __m64 1239 _mm_avg4r_pu8 (__m64 a, __m64 b) 1240 { 1241 return (__m64) __builtin_arm_wavg4r ((__v8qi)a, (__v8qi)b); 1242 } 1243 1244 static __inline __m64 1245 _mm_maddx_pi16 (__m64 a, __m64 b) 1246 { 1247 return (__m64) __builtin_arm_wmaddsx ((__v4hi)a, (__v4hi)b); 1248 } 1249 1250 static __inline __m64 1251 _mm_maddx_pu16 (__m64 a, __m64 b) 1252 { 1253 return (__m64) __builtin_arm_wmaddux ((__v4hi)a, (__v4hi)b); 1254 } 1255 1256 static __inline __m64 1257 _mm_msub_pi16 (__m64 a, __m64 b) 1258 { 1259 return (__m64) __builtin_arm_wmaddsn ((__v4hi)a, (__v4hi)b); 1260 } 1261 1262 static __inline __m64 1263 _mm_msub_pu16 (__m64 a, __m64 b) 1264 { 1265 return (__m64) __builtin_arm_wmaddun ((__v4hi)a, (__v4hi)b); 1266 } 1267 1268 static __inline __m64 1269 _mm_mulhi_pi32 (__m64 a, __m64 b) 1270 { 1271 return (__m64) __builtin_arm_wmulwsm ((__v2si)a, (__v2si)b); 1272 } 1273 1274 static __inline __m64 1275 _mm_mulhi_pu32 (__m64 a, __m64 b) 1276 { 1277 return (__m64) __builtin_arm_wmulwum ((__v2si)a, (__v2si)b); 1278 } 1279 1280 static __inline __m64 1281 _mm_mulhir_pi16 (__m64 a, __m64 b) 1282 { 1283 return (__m64) __builtin_arm_wmulsmr ((__v4hi)a, (__v4hi)b); 1284 } 1285 1286 static __inline __m64 1287 _mm_mulhir_pi32 (__m64 a, __m64 b) 1288 { 1289 return (__m64) __builtin_arm_wmulwsmr ((__v2si)a, (__v2si)b); 1290 } 1291 1292 static __inline __m64 1293 _mm_mulhir_pu16 (__m64 a, __m64 b) 1294 { 1295 return (__m64) __builtin_arm_wmulumr ((__v4hi)a, (__v4hi)b); 1296 } 1297 1298 static __inline __m64 1299 _mm_mulhir_pu32 (__m64 a, __m64 b) 1300 { 1301 return (__m64) __builtin_arm_wmulwumr ((__v2si)a, (__v2si)b); 1302 } 1303 1304 static __inline __m64 1305 _mm_mullo_pi32 (__m64 a, __m64 b) 1306 { 1307 return (__m64) __builtin_arm_wmulwl ((__v2si)a, (__v2si)b); 1308 } 1309 1310 static __inline __m64 1311 _mm_qmulm_pi16 (__m64 a, __m64 b) 1312 { 1313 return (__m64) __builtin_arm_wqmulm ((__v4hi)a, (__v4hi)b); 1314 } 1315 1316 static __inline __m64 1317 _mm_qmulm_pi32 (__m64 a, __m64 b) 1318 { 1319 return (__m64) __builtin_arm_wqmulwm ((__v2si)a, (__v2si)b); 1320 } 1321 1322 static __inline __m64 1323 _mm_qmulmr_pi16 (__m64 a, __m64 b) 1324 { 1325 return (__m64) __builtin_arm_wqmulmr ((__v4hi)a, (__v4hi)b); 1326 } 1327 1328 static __inline __m64 1329 _mm_qmulmr_pi32 (__m64 a, __m64 b) 1330 { 1331 return (__m64) __builtin_arm_wqmulwmr ((__v2si)a, (__v2si)b); 1332 } 1333 1334 static __inline __m64 1335 _mm_subaddhx_pi16 (__m64 a, __m64 b) 1336 { 1337 return (__m64) __builtin_arm_wsubaddhx ((__v4hi)a, (__v4hi)b); 1338 } 1339 1340 static __inline __m64 1341 _mm_addbhusl_pu8 (__m64 a, __m64 b) 1342 { 1343 return (__m64) __builtin_arm_waddbhusl ((__v4hi)a, (__v8qi)b); 1344 } 1345 1346 static __inline __m64 1347 _mm_addbhusm_pu8 (__m64 a, __m64 b) 1348 { 1349 return (__m64) __builtin_arm_waddbhusm ((__v4hi)a, (__v8qi)b); 1350 } 1351 1352 #define _mm_qmiabb_pi32(acc, m1, m2) \ 1353 ({\ 1354 __m64 _acc = acc;\ 1355 __m64 _m1 = m1;\ 1356 __m64 _m2 = m2;\ 1357 _acc = (__m64) __builtin_arm_wqmiabb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1358 _acc;\ 1359 }) 1360 1361 #define _mm_qmiabbn_pi32(acc, m1, m2) \ 1362 ({\ 1363 __m64 _acc = acc;\ 1364 __m64 _m1 = m1;\ 1365 __m64 _m2 = m2;\ 1366 _acc = (__m64) __builtin_arm_wqmiabbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1367 _acc;\ 1368 }) 1369 1370 #define _mm_qmiabt_pi32(acc, m1, m2) \ 1371 ({\ 1372 __m64 _acc = acc;\ 1373 __m64 _m1 = m1;\ 1374 __m64 _m2 = m2;\ 1375 _acc = (__m64) __builtin_arm_wqmiabt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1376 _acc;\ 1377 }) 1378 1379 #define _mm_qmiabtn_pi32(acc, m1, m2) \ 1380 ({\ 1381 __m64 _acc=acc;\ 1382 __m64 _m1=m1;\ 1383 __m64 _m2=m2;\ 1384 _acc = (__m64) __builtin_arm_wqmiabtn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1385 _acc;\ 1386 }) 1387 1388 #define _mm_qmiatb_pi32(acc, m1, m2) \ 1389 ({\ 1390 __m64 _acc = acc;\ 1391 __m64 _m1 = m1;\ 1392 __m64 _m2 = m2;\ 1393 _acc = (__m64) __builtin_arm_wqmiatb ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1394 _acc;\ 1395 }) 1396 1397 #define _mm_qmiatbn_pi32(acc, m1, m2) \ 1398 ({\ 1399 __m64 _acc = acc;\ 1400 __m64 _m1 = m1;\ 1401 __m64 _m2 = m2;\ 1402 _acc = (__m64) __builtin_arm_wqmiatbn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1403 _acc;\ 1404 }) 1405 1406 #define _mm_qmiatt_pi32(acc, m1, m2) \ 1407 ({\ 1408 __m64 _acc = acc;\ 1409 __m64 _m1 = m1;\ 1410 __m64 _m2 = m2;\ 1411 _acc = (__m64) __builtin_arm_wqmiatt ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1412 _acc;\ 1413 }) 1414 1415 #define _mm_qmiattn_pi32(acc, m1, m2) \ 1416 ({\ 1417 __m64 _acc = acc;\ 1418 __m64 _m1 = m1;\ 1419 __m64 _m2 = m2;\ 1420 _acc = (__m64) __builtin_arm_wqmiattn ((__v2si)_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1421 _acc;\ 1422 }) 1423 1424 #define _mm_wmiabb_si64(acc, m1, m2) \ 1425 ({\ 1426 __m64 _acc = acc;\ 1427 __m64 _m1 = m1;\ 1428 __m64 _m2 = m2;\ 1429 _acc = (__m64) __builtin_arm_wmiabb (_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1430 _acc;\ 1431 }) 1432 1433 #define _mm_wmiabbn_si64(acc, m1, m2) \ 1434 ({\ 1435 __m64 _acc = acc;\ 1436 __m64 _m1 = m1;\ 1437 __m64 _m2 = m2;\ 1438 _acc = (__m64) __builtin_arm_wmiabbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1439 _acc;\ 1440 }) 1441 1442 #define _mm_wmiabt_si64(acc, m1, m2) \ 1443 ({\ 1444 __m64 _acc = acc;\ 1445 __m64 _m1 = m1;\ 1446 __m64 _m2 = m2;\ 1447 _acc = (__m64) __builtin_arm_wmiabt (_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1448 _acc;\ 1449 }) 1450 1451 #define _mm_wmiabtn_si64(acc, m1, m2) \ 1452 ({\ 1453 __m64 _acc = acc;\ 1454 __m64 _m1 = m1;\ 1455 __m64 _m2 = m2;\ 1456 _acc = (__m64) __builtin_arm_wmiabtn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1457 _acc;\ 1458 }) 1459 1460 #define _mm_wmiatb_si64(acc, m1, m2) \ 1461 ({\ 1462 __m64 _acc = acc;\ 1463 __m64 _m1 = m1;\ 1464 __m64 _m2 = m2;\ 1465 _acc = (__m64) __builtin_arm_wmiatb (_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1466 _acc;\ 1467 }) 1468 1469 #define _mm_wmiatbn_si64(acc, m1, m2) \ 1470 ({\ 1471 __m64 _acc = acc;\ 1472 __m64 _m1 = m1;\ 1473 __m64 _m2 = m2;\ 1474 _acc = (__m64) __builtin_arm_wmiatbn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1475 _acc;\ 1476 }) 1477 1478 #define _mm_wmiatt_si64(acc, m1, m2) \ 1479 ({\ 1480 __m64 _acc = acc;\ 1481 __m64 _m1 = m1;\ 1482 __m64 _m2 = m2;\ 1483 _acc = (__m64) __builtin_arm_wmiatt (_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1484 _acc;\ 1485 }) 1486 1487 #define _mm_wmiattn_si64(acc, m1, m2) \ 1488 ({\ 1489 __m64 _acc = acc;\ 1490 __m64 _m1 = m1;\ 1491 __m64 _m2 = m2;\ 1492 _acc = (__m64) __builtin_arm_wmiattn (_acc, (__v4hi)_m1, (__v4hi)_m2);\ 1493 _acc;\ 1494 }) 1495 1496 #define _mm_wmiawbb_si64(acc, m1, m2) \ 1497 ({\ 1498 __m64 _acc = acc;\ 1499 __m64 _m1 = m1;\ 1500 __m64 _m2 = m2;\ 1501 _acc = (__m64) __builtin_arm_wmiawbb (_acc, (__v2si)_m1, (__v2si)_m2);\ 1502 _acc;\ 1503 }) 1504 1505 #define _mm_wmiawbbn_si64(acc, m1, m2) \ 1506 ({\ 1507 __m64 _acc = acc;\ 1508 __m64 _m1 = m1;\ 1509 __m64 _m2 = m2;\ 1510 _acc = (__m64) __builtin_arm_wmiawbbn (_acc, (__v2si)_m1, (__v2si)_m2);\ 1511 _acc;\ 1512 }) 1513 1514 #define _mm_wmiawbt_si64(acc, m1, m2) \ 1515 ({\ 1516 __m64 _acc = acc;\ 1517 __m64 _m1 = m1;\ 1518 __m64 _m2 = m2;\ 1519 _acc = (__m64) __builtin_arm_wmiawbt (_acc, (__v2si)_m1, (__v2si)_m2);\ 1520 _acc;\ 1521 }) 1522 1523 #define _mm_wmiawbtn_si64(acc, m1, m2) \ 1524 ({\ 1525 __m64 _acc = acc;\ 1526 __m64 _m1 = m1;\ 1527 __m64 _m2 = m2;\ 1528 _acc = (__m64) __builtin_arm_wmiawbtn (_acc, (__v2si)_m1, (__v2si)_m2);\ 1529 _acc;\ 1530 }) 1531 1532 #define _mm_wmiawtb_si64(acc, m1, m2) \ 1533 ({\ 1534 __m64 _acc = acc;\ 1535 __m64 _m1 = m1;\ 1536 __m64 _m2 = m2;\ 1537 _acc = (__m64) __builtin_arm_wmiawtb (_acc, (__v2si)_m1, (__v2si)_m2);\ 1538 _acc;\ 1539 }) 1540 1541 #define _mm_wmiawtbn_si64(acc, m1, m2) \ 1542 ({\ 1543 __m64 _acc = acc;\ 1544 __m64 _m1 = m1;\ 1545 __m64 _m2 = m2;\ 1546 _acc = (__m64) __builtin_arm_wmiawtbn (_acc, (__v2si)_m1, (__v2si)_m2);\ 1547 _acc;\ 1548 }) 1549 1550 #define _mm_wmiawtt_si64(acc, m1, m2) \ 1551 ({\ 1552 __m64 _acc = acc;\ 1553 __m64 _m1 = m1;\ 1554 __m64 _m2 = m2;\ 1555 _acc = (__m64) __builtin_arm_wmiawtt (_acc, (__v2si)_m1, (__v2si)_m2);\ 1556 _acc;\ 1557 }) 1558 1559 #define _mm_wmiawttn_si64(acc, m1, m2) \ 1560 ({\ 1561 __m64 _acc = acc;\ 1562 __m64 _m1 = m1;\ 1563 __m64 _m2 = m2;\ 1564 _acc = (__m64) __builtin_arm_wmiawttn (_acc, (__v2si)_m1, (__v2si)_m2);\ 1565 _acc;\ 1566 }) 1567 1568 /* The third arguments should be an immediate. */ 1569 #define _mm_merge_si64(a, b, n) \ 1570 ({\ 1571 __m64 result;\ 1572 result = (__m64) __builtin_arm_wmerge ((__m64) (a), (__m64) (b), (n));\ 1573 result;\ 1574 }) 1575 #endif /* __IWMMXT2__ */ 1576 1577 static __inline __m64 1578 _mm_alignr0_si64 (__m64 a, __m64 b) 1579 { 1580 return (__m64) __builtin_arm_walignr0 ((__v8qi) a, (__v8qi) b); 1581 } 1582 1583 static __inline __m64 1584 _mm_alignr1_si64 (__m64 a, __m64 b) 1585 { 1586 return (__m64) __builtin_arm_walignr1 ((__v8qi) a, (__v8qi) b); 1587 } 1588 1589 static __inline __m64 1590 _mm_alignr2_si64 (__m64 a, __m64 b) 1591 { 1592 return (__m64) __builtin_arm_walignr2 ((__v8qi) a, (__v8qi) b); 1593 } 1594 1595 static __inline __m64 1596 _mm_alignr3_si64 (__m64 a, __m64 b) 1597 { 1598 return (__m64) __builtin_arm_walignr3 ((__v8qi) a, (__v8qi) b); 1599 } 1600 1601 static __inline void 1602 _mm_tandcb () 1603 { 1604 __asm __volatile ("tandcb r15"); 1605 } 1606 1607 static __inline void 1608 _mm_tandch () 1609 { 1610 __asm __volatile ("tandch r15"); 1611 } 1612 1613 static __inline void 1614 _mm_tandcw () 1615 { 1616 __asm __volatile ("tandcw r15"); 1617 } 1618 1619 #define _mm_textrcb(n) \ 1620 ({\ 1621 __asm__ __volatile__ (\ 1622 "textrcb r15, %0" : : "i" (n));\ 1623 }) 1624 1625 #define _mm_textrch(n) \ 1626 ({\ 1627 __asm__ __volatile__ (\ 1628 "textrch r15, %0" : : "i" (n));\ 1629 }) 1630 1631 #define _mm_textrcw(n) \ 1632 ({\ 1633 __asm__ __volatile__ (\ 1634 "textrcw r15, %0" : : "i" (n));\ 1635 }) 1636 1637 static __inline void 1638 _mm_torcb () 1639 { 1640 __asm __volatile ("torcb r15"); 1641 } 1642 1643 static __inline void 1644 _mm_torch () 1645 { 1646 __asm __volatile ("torch r15"); 1647 } 1648 1649 static __inline void 1650 _mm_torcw () 1651 { 1652 __asm __volatile ("torcw r15"); 1653 } 1654 1655 #ifdef __IWMMXT2__ 1656 static __inline void 1657 _mm_torvscb () 1658 { 1659 __asm __volatile ("torvscb r15"); 1660 } 1661 1662 static __inline void 1663 _mm_torvsch () 1664 { 1665 __asm __volatile ("torvsch r15"); 1666 } 1667 1668 static __inline void 1669 _mm_torvscw () 1670 { 1671 __asm __volatile ("torvscw r15"); 1672 } 1673 #endif /* __IWMMXT2__ */ 1674 1675 static __inline __m64 1676 _mm_tbcst_pi8 (int value) 1677 { 1678 return (__m64) __builtin_arm_tbcstb ((signed char) value); 1679 } 1680 1681 static __inline __m64 1682 _mm_tbcst_pi16 (int value) 1683 { 1684 return (__m64) __builtin_arm_tbcsth ((short) value); 1685 } 1686 1687 static __inline __m64 1688 _mm_tbcst_pi32 (int value) 1689 { 1690 return (__m64) __builtin_arm_tbcstw (value); 1691 } 1692 1693 #define _m_empty _mm_empty 1694 #define _m_packsswb _mm_packs_pi16 1695 #define _m_packssdw _mm_packs_pi32 1696 #define _m_packuswb _mm_packs_pu16 1697 #define _m_packusdw _mm_packs_pu32 1698 #define _m_packssqd _mm_packs_pi64 1699 #define _m_packusqd _mm_packs_pu64 1700 #define _mm_packs_si64 _mm_packs_pi64 1701 #define _mm_packs_su64 _mm_packs_pu64 1702 #define _m_punpckhbw _mm_unpackhi_pi8 1703 #define _m_punpckhwd _mm_unpackhi_pi16 1704 #define _m_punpckhdq _mm_unpackhi_pi32 1705 #define _m_punpcklbw _mm_unpacklo_pi8 1706 #define _m_punpcklwd _mm_unpacklo_pi16 1707 #define _m_punpckldq _mm_unpacklo_pi32 1708 #define _m_punpckehsbw _mm_unpackeh_pi8 1709 #define _m_punpckehswd _mm_unpackeh_pi16 1710 #define _m_punpckehsdq _mm_unpackeh_pi32 1711 #define _m_punpckehubw _mm_unpackeh_pu8 1712 #define _m_punpckehuwd _mm_unpackeh_pu16 1713 #define _m_punpckehudq _mm_unpackeh_pu32 1714 #define _m_punpckelsbw _mm_unpackel_pi8 1715 #define _m_punpckelswd _mm_unpackel_pi16 1716 #define _m_punpckelsdq _mm_unpackel_pi32 1717 #define _m_punpckelubw _mm_unpackel_pu8 1718 #define _m_punpckeluwd _mm_unpackel_pu16 1719 #define _m_punpckeludq _mm_unpackel_pu32 1720 #define _m_paddb _mm_add_pi8 1721 #define _m_paddw _mm_add_pi16 1722 #define _m_paddd _mm_add_pi32 1723 #define _m_paddsb _mm_adds_pi8 1724 #define _m_paddsw _mm_adds_pi16 1725 #define _m_paddsd _mm_adds_pi32 1726 #define _m_paddusb _mm_adds_pu8 1727 #define _m_paddusw _mm_adds_pu16 1728 #define _m_paddusd _mm_adds_pu32 1729 #define _m_psubb _mm_sub_pi8 1730 #define _m_psubw _mm_sub_pi16 1731 #define _m_psubd _mm_sub_pi32 1732 #define _m_psubsb _mm_subs_pi8 1733 #define _m_psubsw _mm_subs_pi16 1734 #define _m_psubuw _mm_subs_pi32 1735 #define _m_psubusb _mm_subs_pu8 1736 #define _m_psubusw _mm_subs_pu16 1737 #define _m_psubusd _mm_subs_pu32 1738 #define _m_pmaddwd _mm_madd_pi16 1739 #define _m_pmadduwd _mm_madd_pu16 1740 #define _m_pmulhw _mm_mulhi_pi16 1741 #define _m_pmulhuw _mm_mulhi_pu16 1742 #define _m_pmullw _mm_mullo_pi16 1743 #define _m_pmacsw _mm_mac_pi16 1744 #define _m_pmacuw _mm_mac_pu16 1745 #define _m_pmacszw _mm_macz_pi16 1746 #define _m_pmacuzw _mm_macz_pu16 1747 #define _m_paccb _mm_acc_pu8 1748 #define _m_paccw _mm_acc_pu16 1749 #define _m_paccd _mm_acc_pu32 1750 #define _m_pmia _mm_mia_si64 1751 #define _m_pmiaph _mm_miaph_si64 1752 #define _m_pmiabb _mm_miabb_si64 1753 #define _m_pmiabt _mm_miabt_si64 1754 #define _m_pmiatb _mm_miatb_si64 1755 #define _m_pmiatt _mm_miatt_si64 1756 #define _m_psllw _mm_sll_pi16 1757 #define _m_psllwi _mm_slli_pi16 1758 #define _m_pslld _mm_sll_pi32 1759 #define _m_pslldi _mm_slli_pi32 1760 #define _m_psllq _mm_sll_si64 1761 #define _m_psllqi _mm_slli_si64 1762 #define _m_psraw _mm_sra_pi16 1763 #define _m_psrawi _mm_srai_pi16 1764 #define _m_psrad _mm_sra_pi32 1765 #define _m_psradi _mm_srai_pi32 1766 #define _m_psraq _mm_sra_si64 1767 #define _m_psraqi _mm_srai_si64 1768 #define _m_psrlw _mm_srl_pi16 1769 #define _m_psrlwi _mm_srli_pi16 1770 #define _m_psrld _mm_srl_pi32 1771 #define _m_psrldi _mm_srli_pi32 1772 #define _m_psrlq _mm_srl_si64 1773 #define _m_psrlqi _mm_srli_si64 1774 #define _m_prorw _mm_ror_pi16 1775 #define _m_prorwi _mm_rori_pi16 1776 #define _m_prord _mm_ror_pi32 1777 #define _m_prordi _mm_rori_pi32 1778 #define _m_prorq _mm_ror_si64 1779 #define _m_prorqi _mm_rori_si64 1780 #define _m_pand _mm_and_si64 1781 #define _m_pandn _mm_andnot_si64 1782 #define _m_por _mm_or_si64 1783 #define _m_pxor _mm_xor_si64 1784 #define _m_pcmpeqb _mm_cmpeq_pi8 1785 #define _m_pcmpeqw _mm_cmpeq_pi16 1786 #define _m_pcmpeqd _mm_cmpeq_pi32 1787 #define _m_pcmpgtb _mm_cmpgt_pi8 1788 #define _m_pcmpgtub _mm_cmpgt_pu8 1789 #define _m_pcmpgtw _mm_cmpgt_pi16 1790 #define _m_pcmpgtuw _mm_cmpgt_pu16 1791 #define _m_pcmpgtd _mm_cmpgt_pi32 1792 #define _m_pcmpgtud _mm_cmpgt_pu32 1793 #define _m_pextrb _mm_extract_pi8 1794 #define _m_pextrw _mm_extract_pi16 1795 #define _m_pextrd _mm_extract_pi32 1796 #define _m_pextrub _mm_extract_pu8 1797 #define _m_pextruw _mm_extract_pu16 1798 #define _m_pextrud _mm_extract_pu32 1799 #define _m_pinsrb _mm_insert_pi8 1800 #define _m_pinsrw _mm_insert_pi16 1801 #define _m_pinsrd _mm_insert_pi32 1802 #define _m_pmaxsb _mm_max_pi8 1803 #define _m_pmaxsw _mm_max_pi16 1804 #define _m_pmaxsd _mm_max_pi32 1805 #define _m_pmaxub _mm_max_pu8 1806 #define _m_pmaxuw _mm_max_pu16 1807 #define _m_pmaxud _mm_max_pu32 1808 #define _m_pminsb _mm_min_pi8 1809 #define _m_pminsw _mm_min_pi16 1810 #define _m_pminsd _mm_min_pi32 1811 #define _m_pminub _mm_min_pu8 1812 #define _m_pminuw _mm_min_pu16 1813 #define _m_pminud _mm_min_pu32 1814 #define _m_pmovmskb _mm_movemask_pi8 1815 #define _m_pmovmskw _mm_movemask_pi16 1816 #define _m_pmovmskd _mm_movemask_pi32 1817 #define _m_pshufw _mm_shuffle_pi16 1818 #define _m_pavgb _mm_avg_pu8 1819 #define _m_pavgw _mm_avg_pu16 1820 #define _m_pavg2b _mm_avg2_pu8 1821 #define _m_pavg2w _mm_avg2_pu16 1822 #define _m_psadbw _mm_sad_pu8 1823 #define _m_psadwd _mm_sad_pu16 1824 #define _m_psadzbw _mm_sadz_pu8 1825 #define _m_psadzwd _mm_sadz_pu16 1826 #define _m_paligniq _mm_align_si64 1827 #define _m_cvt_si2pi _mm_cvtsi64_m64 1828 #define _m_cvt_pi2si _mm_cvtm64_si64 1829 #define _m_from_int _mm_cvtsi32_si64 1830 #define _m_to_int _mm_cvtsi64_si32 1831 1832 #if defined __cplusplus 1833 }; /* End "C" */ 1834 #endif /* __cplusplus */ 1835 1836 #endif /* _MMINTRIN_H_INCLUDED */ 1837