1 /* APPLE LOCAL file mainline 2005-06-30 Radar 4131077 */ 2 /* Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 3 Free Software Foundation, Inc. 4 5 This file is part of GCC. 6 7 GCC is free software; you can redistribute it and/or modify 8 it under the terms of the GNU General Public License as published by 9 the Free Software Foundation; either version 2, or (at your option) 10 any later version. 11 12 GCC is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License 18 along with GCC; see the file COPYING. If not, write to 19 the Free Software Foundation, 51 Franklin Street, Fifth Floor, 20 Boston, MA 02110-1301, USA. */ 21 22 /* As a special exception, if you include this header file into source 23 files compiled by GCC, this header file does not by itself cause 24 the resulting executable to be covered by the GNU General Public 25 License. This exception does not however invalidate any other 26 reasons why the executable file might be covered by the GNU General 27 Public License. */ 28 29 /* Implemented from the specification included in the Intel C++ Compiler 30 User Guide and Reference, version 9.0. */ 31 32 #ifndef _XMMINTRIN_H_INCLUDED 33 #define _XMMINTRIN_H_INCLUDED 34 35 #ifndef __SSE__ 36 # error "SSE instruction set not enabled" 37 #else 38 39 /* We need type definitions from the MMX header file. */ 40 #include <mmintrin.h> 41 42 /* Get _mm_malloc () and _mm_free (). */ 43 /* APPLE LOCAL begin xmmintrin.h for kernel 4123064 */ 44 #if __STDC_HOSTED__ 45 #include <mm_malloc.h> 46 #endif 47 /* APPLE LOCAL end xmmintrin.h for kernel 4123064 */ 48 49 /* The Intel API is flexible enough that we must allow aliasing with other 50 vector types, and their scalar components. 
*/ 51 typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); 52 53 /* Internal data types for implementing the intrinsics. */ 54 typedef float __v4sf __attribute__ ((__vector_size__ (16))); 55 56 #if defined(__clang__) && defined(WITH_SYNTAX_CHECK) 57 /* Workaround for "clang -fsyntax-only" happens to use this header, but may 58 * choke on something not supported in clang 59 */ 60 int __builtin_ia32_cvtss2si (__v4sf); 61 int __builtin_ia32_cvttss2si (__v4sf); 62 __m128 __builtin_ia32_addps (__v4sf, __v4sf); 63 __m128 __builtin_ia32_addss (__v4sf, __v4sf); 64 __m128 __builtin_ia32_addss (__v4sf, __v4sf); 65 __m128 __builtin_ia32_addss (__v4sf, __v4sf); 66 __m128 __builtin_ia32_andnps (__m128, __m128); 67 __m128 __builtin_ia32_andps (__m128, __m128); 68 __m128 __builtin_ia32_cmpeqps (__v4sf, __v4sf); 69 __m128 __builtin_ia32_cmpeqss (__v4sf, __v4sf); 70 __m128 __builtin_ia32_cmpgeps (__v4sf, __v4sf); 71 __m128 __builtin_ia32_cmpgtps (__v4sf, __v4sf); 72 __m128 __builtin_ia32_cmpleps (__v4sf, __v4sf); 73 __m128 __builtin_ia32_cmpless (__v4sf, __v4sf); 74 __m128 __builtin_ia32_cmpltps (__v4sf, __v4sf); 75 __m128 __builtin_ia32_cmpltss (__v4sf, __v4sf); 76 __m128 __builtin_ia32_cmpneqps (__v4sf, __v4sf); 77 __m128 __builtin_ia32_cmpneqss (__v4sf, __v4sf); 78 __m128 __builtin_ia32_cmpngeps (__v4sf, __v4sf); 79 __m128 __builtin_ia32_cmpngtps (__v4sf, __v4sf); 80 __m128 __builtin_ia32_cmpnleps (__v4sf, __v4sf); 81 __m128 __builtin_ia32_cmpnless (__v4sf, __v4sf); 82 __m128 __builtin_ia32_cmpnltps (__v4sf, __v4sf); 83 __m128 __builtin_ia32_cmpnltss (__v4sf, __v4sf); 84 __m128 __builtin_ia32_cmpordps (__v4sf, __v4sf); 85 __m128 __builtin_ia32_cmpordss (__v4sf, __v4sf); 86 __m128 __builtin_ia32_cmpunordps (__v4sf, __v4sf); 87 __m128 __builtin_ia32_cmpunordss (__v4sf, __v4sf); 88 __m128 __builtin_ia32_cvtsi2ss (__v4sf, int); 89 __m128 __builtin_ia32_divps (__v4sf, __v4sf); 90 __m128 __builtin_ia32_divss (__v4sf, __v4sf); 91 __m128 __builtin_ia32_movss 
(__v4sf, __v4sf); 92 __m128 __builtin_ia32_mulps (__v4sf, __v4sf); 93 __m128 __builtin_ia32_mulps (__v4sf, __v4sf); 94 __m128 __builtin_ia32_mulss (__v4sf, __v4sf); 95 __m128 __builtin_ia32_mulss (__v4sf, __v4sf); 96 __m128 __builtin_ia32_orps (__m128, __m128); 97 __m128 __builtin_ia32_subps (__v4sf, __v4sf); 98 __m128 __builtin_ia32_subss (__v4sf, __v4sf); 99 __m128 __builtin_ia32_subss (__v4sf, __v4sf); 100 __m128 __builtin_ia32_xorps (__m128, __m128); 101 __m128 __builtin_ia32_loadhps (__v4sf, const __v2si *); 102 __m128 __builtin_ia32_loadlps (__v4sf, const __v2si *); 103 __m128 __builtin_ia32_movhlps (__v4sf, __v4sf); 104 __m128 __builtin_ia32_movlhps (__v4sf, __v4sf); 105 __m128 __builtin_ia32_shufps (__v4sf, __v4sf, int const); 106 __m128 __builtin_ia32_unpckhps (__v4sf, __v4sf); 107 __m128 __builtin_ia32_unpcklps (__v4sf, __v4sf); 108 __m128 __builtin_ia32_loadups (float const *); 109 __m64 __builtin_ia32_vec_set_v4hi (__v4hi, int const, int const); 110 float __builtin_ia32_vec_ext_v4sf (__v4sf, const int); 111 int __builtin_ia32_vec_ext_v4hi (__v4hi, const int); 112 long long __builtin_ia32_cvtss2si64 (__v4sf); 113 long long __builtin_ia32_cvttss2si64 (__v4sf); 114 __m128 __builtin_ia32_cvtsi642ss (__v4sf, long long); 115 #endif 116 117 /* Create a selector for use with the SHUFPS instruction. */ 118 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ 119 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) 120 121 /* Constants for use with _mm_prefetch. */ 122 enum _mm_hint 123 { 124 _MM_HINT_T0 = 3, 125 _MM_HINT_T1 = 2, 126 _MM_HINT_T2 = 1, 127 _MM_HINT_NTA = 0 128 }; 129 130 /* Bits in the MXCSR. 
*/
/* Sticky exception-state flags (low six MXCSR bits).  */
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

/* Exception-mask bits (a set bit suppresses the corresponding trap).  */
#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

/* Rounding-control field.  */
#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

/* Flush-to-zero control bit.  */
#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000

/* APPLE LOCAL begin nodebug inline 4152603 */
/* Piggy-back __nodebug__ onto every __always_inline__ attribute list used
   below, so these wrappers emit no debug info.  */
#define __always_inline__ __always_inline__, __nodebug__
/* APPLE LOCAL end nodebug inline 4152603 */

/* APPLE LOCAL begin radar 5618945 */
/* Pick the inline linkage model matching the compiler's inline semantics:
   C99/GNU-STDC inline vs. traditional GNU89 static inline.  */
#undef __STATIC_INLINE
#ifdef __GNUC_STDC_INLINE__
#define __STATIC_INLINE __inline
#else
#define __STATIC_INLINE static __inline
#endif
/* APPLE LOCAL end radar 5618945 */

/* Create a vector of zeros.  */
/* APPLE LOCAL begin radar 4152603 */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_setzero_ps (void)
{
  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}

/* Perform the respective operation on the lower SPFP (single-precision
   floating-point) values of A and B; the upper three SPFP values are
   passed through from A.
*/

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_add_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_sub_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_mul_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_div_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
}

/* Square root of the lowest element; B is not involved (unary op).  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_sqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
}

/* Approximate reciprocal of the lowest element.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_rcp_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
}

/* Approximate reciprocal square root of the lowest element.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_rsqrt_ss (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_min_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_max_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
}

/* Perform the respective operation on the four SPFP values in A and B.  */

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_add_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_addps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_sub_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_subps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_mul_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_mulps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_div_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_divps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_sqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_rcp_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_rsqrt_ps (__m128 __A)
{
  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_min_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_max_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
}

/* Perform logical bit-wise operations on 128-bit values.  */

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_and_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andps (__A, __B);
}

/* Computes ~A & B (A's bits are complemented, then ANDed with B).  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_andnot_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_andnps (__A, __B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_or_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_orps (__A, __B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_xor_ps (__m128 __A, __m128 __B)
{
  return __builtin_ia32_xorps (__A, __B);
}

/* Perform a comparison on the lower SPFP values of A and B.
If the 365 comparison is true, place a mask of all ones in the result, otherwise a 366 mask of zeros. The upper three SPFP values are passed through from A. */ 367 368 /* APPLE LOCAL begin radar 5618945 */ 369 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 370 /* APPLE LOCAL end radar 5618945 */ 371 _mm_cmpeq_ss (__m128 __A, __m128 __B) 372 { 373 return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); 374 } 375 376 /* APPLE LOCAL begin radar 5618945 */ 377 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 378 /* APPLE LOCAL end radar 5618945 */ 379 _mm_cmplt_ss (__m128 __A, __m128 __B) 380 { 381 return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); 382 } 383 384 /* APPLE LOCAL begin radar 5618945 */ 385 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 386 /* APPLE LOCAL end radar 5618945 */ 387 _mm_cmple_ss (__m128 __A, __m128 __B) 388 { 389 return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); 390 } 391 392 /* APPLE LOCAL begin radar 5618945 */ 393 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 394 /* APPLE LOCAL end radar 5618945 */ 395 _mm_cmpgt_ss (__m128 __A, __m128 __B) 396 { 397 return (__m128) __builtin_ia32_movss ((__v4sf) __A, 398 (__v4sf) 399 __builtin_ia32_cmpltss ((__v4sf) __B, 400 (__v4sf) 401 __A)); 402 } 403 404 /* APPLE LOCAL begin radar 5618945 */ 405 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 406 /* APPLE LOCAL end radar 5618945 */ 407 _mm_cmpge_ss (__m128 __A, __m128 __B) 408 { 409 return (__m128) __builtin_ia32_movss ((__v4sf) __A, 410 (__v4sf) 411 __builtin_ia32_cmpless ((__v4sf) __B, 412 (__v4sf) 413 __A)); 414 } 415 416 /* APPLE LOCAL begin radar 5618945 */ 417 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 418 /* APPLE LOCAL end radar 5618945 */ 419 _mm_cmpneq_ss (__m128 __A, __m128 __B) 420 { 421 return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); 422 } 423 424 /* APPLE LOCAL begin radar 5618945 */ 425 __STATIC_INLINE 
__m128 __attribute__((__always_inline__)) 426 /* APPLE LOCAL end radar 5618945 */ 427 _mm_cmpnlt_ss (__m128 __A, __m128 __B) 428 { 429 return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); 430 } 431 432 /* APPLE LOCAL begin radar 5618945 */ 433 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 434 /* APPLE LOCAL end radar 5618945 */ 435 _mm_cmpnle_ss (__m128 __A, __m128 __B) 436 { 437 return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); 438 } 439 440 /* APPLE LOCAL begin radar 5618945 */ 441 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 442 /* APPLE LOCAL end radar 5618945 */ 443 _mm_cmpngt_ss (__m128 __A, __m128 __B) 444 { 445 return (__m128) __builtin_ia32_movss ((__v4sf) __A, 446 (__v4sf) 447 __builtin_ia32_cmpnltss ((__v4sf) __B, 448 (__v4sf) 449 __A)); 450 } 451 452 /* APPLE LOCAL begin radar 5618945 */ 453 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 454 /* APPLE LOCAL end radar 5618945 */ 455 _mm_cmpnge_ss (__m128 __A, __m128 __B) 456 { 457 return (__m128) __builtin_ia32_movss ((__v4sf) __A, 458 (__v4sf) 459 __builtin_ia32_cmpnless ((__v4sf) __B, 460 (__v4sf) 461 __A)); 462 } 463 464 /* APPLE LOCAL begin radar 5618945 */ 465 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 466 /* APPLE LOCAL end radar 5618945 */ 467 _mm_cmpord_ss (__m128 __A, __m128 __B) 468 { 469 return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); 470 } 471 472 /* APPLE LOCAL begin radar 5618945 */ 473 __STATIC_INLINE __m128 __attribute__((__always_inline__)) 474 /* APPLE LOCAL end radar 5618945 */ 475 _mm_cmpunord_ss (__m128 __A, __m128 __B) 476 { 477 return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); 478 } 479 480 /* Perform a comparison on the four SPFP values of A and B. For each 481 element, if the comparison is true, place a mask of all ones in the 482 result, otherwise a mask of zeros. 
*/

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmpeq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmplt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmple_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
}

/* Unlike the scalar forms, the packed compares have direct gt/ge
   builtins, so no operand swapping is needed here.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmpgt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmpge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmpneq_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmpnlt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmpnle_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmpngt_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmpnge_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
}

/* Per-element: true when neither element is NaN.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmpord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
}

/* Per-element: true when either element is NaN.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cmpunord_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
}

/* Compare the lower SPFP values of A and B and return 1 if true
   and 0 if false.
*/

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_comieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_comilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_comile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_comigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_comige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_comineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
}

/* The _mm_ucomi* variants map to the unordered (UCOMISS) builtin forms;
   otherwise they mirror the _mm_comi* functions above.  */

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_ucomieq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_ucomilt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_ucomile_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_ucomigt_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_ucomige_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_ucomineq_ss (__m128 __A, __m128 __B)
{
  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
}

/* Convert the lower SPFP value to a 32-bit integer according to the current
   rounding mode.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtss_si32 (__m128 __A)
{
  return __builtin_ia32_cvtss2si ((__v4sf) __A);
}

/* Older alias for _mm_cvtss_si32.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvt_ss2si (__m128 __A)
{
  return _mm_cvtss_si32 (__A);
}

#ifdef __x86_64__
/* Convert the lower SPFP value to a 32-bit integer according to the
   current rounding mode.  */

/* Intel intrinsic.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE long long __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtss_si64 (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  Same operation under the Microsoft name.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE long long __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtss_si64x (__m128 __A)
{
  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
}
#endif

/* Convert the two lower SPFP values to 32-bit integers according to the
   current rounding mode.  Return the integers in packed form.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
}

/* Older alias for _mm_cvtps_pi32.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvt_ps2pi (__m128 __A)
{
  return _mm_cvtps_pi32 (__A);
}

/* Truncate the lower SPFP value to a 32-bit integer.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvttss_si32 (__m128 __A)
{
  return __builtin_ia32_cvttss2si ((__v4sf) __A);
}

/* Older alias for _mm_cvttss_si32.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtt_ss2si (__m128 __A)
{
  return _mm_cvttss_si32 (__A);
}

#ifdef __x86_64__
/* Truncate the lower SPFP value to a 32-bit integer.  */

/* Intel intrinsic.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE long long __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvttss_si64 (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}

/* Microsoft intrinsic.  Same operation under the Microsoft name.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE long long __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvttss_si64x (__m128 __A)
{
  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
}
#endif

/* Truncate the two lower SPFP values to 32-bit integers.  Return the
   integers in packed form.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvttps_pi32 (__m128 __A)
{
  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
}

/* Older alias for _mm_cvttps_pi32.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtt_ps2pi (__m128 __A)
{
  return _mm_cvttps_pi32 (__A);
}

/* Convert B to a SPFP value and insert it as element zero in A.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtsi32_ss (__m128 __A, int __B)
{
  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
}

/* Older alias for _mm_cvtsi32_ss.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvt_si2ss (__m128 __A, int __B)
{
  return _mm_cvtsi32_ss (__A, __B);
}

#ifdef __x86_64__
/* Convert B to a SPFP value and insert it as element zero in A.  */

/* Intel intrinsic.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtsi64_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}

/* Microsoft intrinsic.  Same operation under the Microsoft name.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtsi64x_ss (__m128 __A, long long __B)
{
  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
}
#endif

/* Convert the two 32-bit values in B to SPFP form and insert them
   as the two lower elements in A.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtpi32_ps (__m128 __A, __m64 __B)
{
  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
}

/* Older alias for _mm_cvtpi32_ps.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvt_pi2ps (__m128 __A, __m64 __B)
{
  return _mm_cvtpi32_ps (__A, __B);
}

/* Convert the four signed 16-bit values in A to SPFP form.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtpi16_ps (__m64 __A)
{
  __v4hi __sign;
  __v2si __hisi, __losi;
  __v4sf __r;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);

  /* Convert the four words to doublewords.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) _mm_setzero_ps ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the four unsigned 16-bit values in A to SPFP form.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtpu16_ps (__m64 __A)
{
  __v2si __hisi, __losi;
  __v4sf __r;

  /* Convert the four words to doublewords.  Unsigned values zero-extend,
     so zero is unpacked in place of the sign mask used above.  */
  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);

  /* Convert the doublewords to floating point two at a time.  */
  __r = (__v4sf) _mm_setzero_ps ();
  __r = __builtin_ia32_cvtpi2ps (__r, __hisi);
  __r = __builtin_ia32_movlhps (__r, __r);
  __r = __builtin_ia32_cvtpi2ps (__r, __losi);

  return (__m128) __r;
}

/* Convert the low four signed 8-bit values in A to SPFP form.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtpi8_ps (__m64 __A)
{
  __v8qi __sign;

  /* This comparison against zero gives us a mask that can be used to
     fill in the missing sign bits in the unpack operations below, so
     that we get signed values after unpacking.  */
  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);

  /* Convert the four low bytes to words.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);

  return _mm_cvtpi16_ps(__A);
}

/* Convert the low four unsigned 8-bit values in A to SPFP form.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtpu8_ps(__m64 __A)
{
  /* Zero-extend the low four bytes to words, then reuse the unsigned
     16-bit conversion.  */
  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
  return _mm_cvtpu16_ps(__A);
}

/* Convert the four signed 32-bit values in A and B to SPFP form.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
{
  /* Convert each pair into the low half of a vector, then combine with
     MOVLHPS: A's pair lands in the low half, B's pair in the high half.  */
  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__B);
  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
}

/* Convert the four SPFP values in A to four signed 16-bit integers.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtps_pi16(__m128 __A)
{
  /* CVTPS2PI only reads the low two elements, so the high pair is moved
     down with MOVHLPS and the two halves are converted separately, then
     packed with signed saturation.  */
  __v4sf __hisf = (__v4sf)__A;
  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
}

/* Convert the four SPFP values in A to four signed 8-bit integers.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtps_pi8(__m128 __A)
{
  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
}

/* Selects four specific SPFP values from A and B based on MASK.
*/
#if 0
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_shuffle_ps (__m128 __A, __m128 __B, int __mask)
{
  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
}
#else
/* Macro form so MASK reaches the builtin as a compile-time constant,
   which the shufps instruction requires.  */
#define _mm_shuffle_ps(A, B, MASK) \
 ((__m128) __builtin_ia32_shufps ((__v4sf)(A), (__v4sf)(B), (MASK)))
#endif


/* Selects and interleaves the upper two SPFP values from A and B.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_unpackhi_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
}

/* Selects and interleaves the lower two SPFP values from A and B.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_unpacklo_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the upper two SPFP values with 64-bits of data loaded from P;
   the lower two values are passed through from A.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_loadh_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the upper two SPFP values of A into P.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_storeh_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storehps ((__v2si *)__P, (__v4sf)__A);
}

/* Moves the upper two values of B into the lower two values of A.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_movehl_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
}

/* Moves the lower two values of B into the upper two values of A.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_movelh_ps (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
}

/* Sets the lower two SPFP values with 64-bits of data loaded from P;
   the upper two values are passed through from A.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_loadl_pi (__m128 __A, __m64 const *__P)
{
  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (__v2si *)__P);
}

/* Stores the lower two SPFP values of A into P.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_storel_pi (__m64 *__P, __m128 __A)
{
  __builtin_ia32_storelps ((__v2si *)__P, (__v4sf)__A);
}

/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_movemask_ps (__m128 __A)
{
  return __builtin_ia32_movmskps ((__v4sf)__A);
}

/* Return the contents of the control register.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE unsigned int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_getcsr (void)
{
  /* Reads the MXCSR control/status register via STMXCSR.  */
  return __builtin_ia32_stmxcsr ();
}

/* Read exception bits from the control register.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE unsigned int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

/* Read the exception mask bits from the control register.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE unsigned int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

/* Read the rounding-mode bits from the control register.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE unsigned int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

/* Read the flush-to-zero bit from the control register.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE unsigned int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_MM_GET_FLUSH_ZERO_MODE (void)
{
  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
}

/* Set the control register to I.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_setcsr (unsigned int __I)
{
  /* Writes the MXCSR register via LDMXCSR.  */
  __builtin_ia32_ldmxcsr (__I);
}

/* Set exception bits in the control register.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  /* Read-modify-write MXCSR: clear the field, then OR in the new bits.  */
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}

/* Set the exception mask bits in the control register.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}

/* Set the rounding-mode bits in the control register.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_MM_SET_ROUNDING_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
}

/* Set the flush-to-zero bit in the control register.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
}

/* Create a vector with element 0 as F and the rest zero.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_set_ss (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, 0, 0, 0 };
}

/* Create a vector with all four elements equal to F.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_set1_ps (float __F)
{
  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
}

/* Alternate intrinsic name for _mm_set1_ps.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_set_ps1 (float __F)
{
  return _mm_set1_ps (__F);
}

/* Create a vector with element 0 as *P and the rest zero.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_load_ss (float const *__P)
{
  return _mm_set_ss (*__P);
}

/* Create a vector with all four elements equal to *P.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_load1_ps (float const *__P)
{
  return _mm_set1_ps (*__P);
}

/* Alternate intrinsic name for _mm_load1_ps.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_load_ps1 (float const *__P)
{
  return _mm_load1_ps (__P);
}

/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_load_ps (float const *__P)
{
  return (__m128) *(__v4sf *)__P;
}

/* Load four SPFP values from P.  The address need not be 16-byte aligned.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_loadu_ps (float const *__P)
{
  return (__m128) __builtin_ia32_loadups (__P);
}

/* Load four SPFP values in reverse order.  The address must be aligned.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_loadr_ps (float const *__P)
{
  /* Aligned load, then reverse the elements with a shuffle.  */
  __v4sf __tmp = *(__v4sf *)__P;
  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
}

/* Create the vector [Z Y X W].  Note the initializer lists elements in
   memory order, lowest first.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
{
  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
}

/* Create the vector [W X Y Z].  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_setr_ps (float __Z, float __Y, float __X, float __W)
{
  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
}

/* Stores the lower SPFP value.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_store_ss (float *__P, __m128 __A)
{
  *__P = __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

/* Return the lower SPFP value as a scalar float.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE float __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_cvtss_f32 (__m128 __A)
{
  return __builtin_ia32_vec_ext_v4sf ((__v4sf)__A, 0);
}

/* Store four SPFP values.  The address must be 16-byte aligned.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_store_ps (float *__P, __m128 __A)
{
  *(__v4sf *)__P = (__v4sf)__A;
}

/* Store four SPFP values.  The address need not be 16-byte aligned.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_storeu_ps (float *__P, __m128 __A)
{
  __builtin_ia32_storeups (__P, (__v4sf)__A);
}

/* Store the lower SPFP value across four words.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_store1_ps (float *__P, __m128 __A)
{
  /* Broadcast element 0 to all lanes, then store unaligned.  */
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
  _mm_storeu_ps (__P, __tmp);
}

/* Alternate intrinsic name for _mm_store1_ps.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_store_ps1 (float *__P, __m128 __A)
{
  _mm_store1_ps (__P, __A);
}

/* Store four SPFP values in reverse order.  The address must be aligned.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_storer_ps (float *__P, __m128 __A)
{
  /* Reverse the elements with a shuffle, then do an aligned store.  */
  __v4sf __va = (__v4sf)__A;
  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
  _mm_store_ps (__P, __tmp);
}

/* Sets the low SPFP value of A from the low value of B.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m128 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_move_ss (__m128 __A, __m128 __B)
{
  return (__m128) __builtin_ia32_movss ((__v4sf)__A, (__v4sf)__B);
}

/* Extracts one of the four words of A.  The selector N must be immediate.  */
#if 0
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_extract_pi16 (__m64 const __A, int const __N)
{
  return __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pextrw (__m64 const __A, int const __N)
{
  return _mm_extract_pi16 (__A, __N);
}
#else
/* Macro forms so N reaches the builtin as a compile-time constant.  */
#define _mm_extract_pi16(A, N) __builtin_ia32_vec_ext_v4hi ((__v4hi)(A), (N))
#define _m_pextrw(A, N) _mm_extract_pi16((A), (N))
#endif

/* Inserts word D into one of four words of A.  The selector N must be
   immediate.
*/
#if 0
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
{
  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pinsrw (__m64 const __A, int const __D, int const __N)
{
  return _mm_insert_pi16 (__A, __D, __N);
}
#else
/* Macro forms so N reaches the builtin as a compile-time constant.  */
#define _mm_insert_pi16(A, D, N) \
  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(A), (D), (N)))
#define _m_pinsrw(A, D, N) _mm_insert_pi16((A), (D), (N))
#endif

/* Compute the element-wise maximum of signed 16-bit values.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_max_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
}

/* MMX-style name for _mm_max_pi16.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pmaxsw (__m64 __A, __m64 __B)
{
  return _mm_max_pi16 (__A, __B);
}

/* Compute the element-wise maximum of unsigned 8-bit values.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_max_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
}

/* MMX-style name for _mm_max_pu8.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pmaxub (__m64 __A, __m64 __B)
{
  return _mm_max_pu8 (__A, __B);
}

/* Compute the element-wise minimum of signed 16-bit values.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_min_pi16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
}

/* MMX-style name for _mm_min_pi16.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pminsw (__m64 __A, __m64 __B)
{
  return _mm_min_pi16 (__A, __B);
}

/* Compute the element-wise minimum of unsigned 8-bit values.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_min_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
}

/* MMX-style name for _mm_min_pu8.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pminub (__m64 __A, __m64 __B)
{
  return _mm_min_pu8 (__A, __B);
}

/* Create an 8-bit mask of the signs of 8-bit values.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_movemask_pi8 (__m64 __A)
{
  return __builtin_ia32_pmovmskb ((__v8qi)__A);
}

/* MMX-style name for _mm_movemask_pi8.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE int __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pmovmskb (__m64 __A)
{
  return _mm_movemask_pi8 (__A);
}

/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
   in B and produce the high 16 bits of the 32-bit results.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
}

/* MMX-style name for _mm_mulhi_pu16.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pmulhuw (__m64 __A, __m64 __B)
{
  return _mm_mulhi_pu16 (__A, __B);
}

/* Return a combination of the four 16-bit values in A.  The selector
   must be an immediate.
*/
#if 0
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_shuffle_pi16 (__m64 __A, int __N)
{
  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
}

/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pshufw (__m64 __A, int __N)
{
  return _mm_shuffle_pi16 (__A, __N);
}
#else
/* Macro forms so N reaches the builtin as a compile-time constant.  */
#define _mm_shuffle_pi16(A, N) \
  ((__m64) __builtin_ia32_pshufw ((__v4hi)(A), (N)))
#define _m_pshufw(A, N) _mm_shuffle_pi16 ((A), (N))
#endif

/* Conditionally store byte elements of A into P.  The high bit of each
   byte in the selector N determines whether the corresponding byte from
   A is stored.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
{
  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
}

/* MMX-style name for _mm_maskmove_si64.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_maskmovq (__m64 __A, __m64 __N, char *__P)
{
  _mm_maskmove_si64 (__A, __N, __P);
}

/* Compute the rounded averages of the unsigned 8-bit values in A and B.
*/
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_avg_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
}

/* MMX-style name for _mm_avg_pu8.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pavgb (__m64 __A, __m64 __B)
{
  return _mm_avg_pu8 (__A, __B);
}

/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_avg_pu16 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
}

/* MMX-style name for _mm_avg_pu16.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_pavgw (__m64 __A, __m64 __B)
{
  return _mm_avg_pu16 (__A, __B);
}

/* Compute the sum of the absolute differences of the unsigned 8-bit
   values in A and B.  Return the value in the lower 16-bit word; the
   upper words are cleared.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_sad_pu8 (__m64 __A, __m64 __B)
{
  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
}

/* MMX-style name for _mm_sad_pu8.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE __m64 __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_m_psadbw (__m64 __A, __m64 __B)
{
  return _mm_sad_pu8 (__A, __B);
}

/* Loads one cache line from address P to a location "closer" to the
   processor.  The selector I specifies the type of prefetch operation.
*/
#if 0
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_prefetch (void *__P, enum _mm_hint __I)
{
  __builtin_prefetch (__P, 0, __I);
}
#else
/* Macro form so I reaches __builtin_prefetch as a compile-time constant.
   The 0 selects a read (not write) prefetch.  */
#define _mm_prefetch(P, I) \
  __builtin_prefetch ((P), 0, (I))
#endif

/* Stores the data in A to the address P without polluting the caches.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_stream_pi (__m64 *__P, __m64 __A)
{
  /* APPLE LOCAL 4656532 use V1DImode for _m64 */
  __builtin_ia32_movntq (__P, __A);
}

/* Likewise.  The address must be 16-byte aligned.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_stream_ps (float *__P, __m128 __A)
{
  __builtin_ia32_movntps (__P, (__v4sf)__A);
}

/* Guarantees that every preceding store is globally visible before
   any subsequent store.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_sfence (void)
{
  __builtin_ia32_sfence ();
}

/* The execution of the next instruction is delayed by an implementation
   specific amount of time.  The instruction does not modify the
   architectural state.  */
/* APPLE LOCAL begin radar 5618945 */
__STATIC_INLINE void __attribute__((__always_inline__))
/* APPLE LOCAL end radar 5618945 */
_mm_pause (void)
{
  /* "rep; nop" is the PAUSE instruction encoding, usable without SSE2.  */
  __asm__ __volatile__ ("rep; nop" : : );
}
/* APPLE LOCAL end radar 4152603 */

/* Transpose the 4x4 matrix composed of row[0-3].
*/
/* In-place 4x4 transpose built from two interleave stages: unpack pairs
   rows into column halves, then movlhps/movhlps recombine them.  */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)			\
do {									\
  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);	\
  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);			\
  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3);			\
  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1);			\
  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);			\
  (row0) = __builtin_ia32_movlhps (__t0, __t1);				\
  (row1) = __builtin_ia32_movhlps (__t1, __t0);				\
  (row2) = __builtin_ia32_movlhps (__t2, __t3);				\
  (row3) = __builtin_ia32_movhlps (__t3, __t2);				\
} while (0)

/* APPLE LOCAL begin nodebug inline 4152603 */
#undef __always_inline__
/* APPLE LOCAL end nodebug inline 4152603 */

/* For backward source compatibility.  */
#include <emmintrin.h>

#endif /* __SSE__ */
#endif /* _XMMINTRIN_H_INCLUDED */