1 /* Copyright (C) 2011-2013 Free Software Foundation, Inc. 2 3 This file is part of GCC. 4 5 GCC is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 GCC is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 Under Section 7 of GPL version 3, you are granted additional 16 permissions described in the GCC Runtime Library Exception, version 17 3.1, as published by the Free Software Foundation. 18 19 You should have received a copy of the GNU General Public License and 20 a copy of the GCC Runtime Library Exception along with this program; 21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 22 <http://www.gnu.org/licenses/>. */ 23 24 #ifndef _IMMINTRIN_H_INCLUDED 25 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead." 26 #endif 27 28 /* Sum absolute 8-bit integer difference of adjacent groups of 4 29 byte integers in the first 2 operands. Starting offsets within 30 operands are determined by the 3rd mask operand. 
*/ 31 #ifdef __OPTIMIZE__ 32 extern __inline __m256i 33 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 34 _mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M) 35 { 36 return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X, 37 (__v32qi)__Y, __M); 38 } 39 #else 40 #define _mm256_mpsadbw_epu8(X, Y, M) \ 41 ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \ 42 (__v32qi)(__m256i)(Y), (int)(M))) 43 #endif 44 45 extern __inline __m256i 46 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 47 _mm256_abs_epi8 (__m256i __A) 48 { 49 return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A); 50 } 51 52 extern __inline __m256i 53 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 54 _mm256_abs_epi16 (__m256i __A) 55 { 56 return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A); 57 } 58 59 extern __inline __m256i 60 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 61 _mm256_abs_epi32 (__m256i __A) 62 { 63 return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A); 64 } 65 66 extern __inline __m256i 67 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 68 _mm256_packs_epi32 (__m256i __A, __m256i __B) 69 { 70 return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B); 71 } 72 73 extern __inline __m256i 74 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 75 _mm256_packs_epi16 (__m256i __A, __m256i __B) 76 { 77 return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B); 78 } 79 80 extern __inline __m256i 81 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 82 _mm256_packus_epi32 (__m256i __A, __m256i __B) 83 { 84 return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B); 85 } 86 87 extern __inline __m256i 88 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 89 _mm256_packus_epi16 (__m256i __A, __m256i __B) 90 { 91 return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B); 92 } 
93 94 extern __inline __m256i 95 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 96 _mm256_add_epi8 (__m256i __A, __m256i __B) 97 { 98 return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B); 99 } 100 101 extern __inline __m256i 102 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 103 _mm256_add_epi16 (__m256i __A, __m256i __B) 104 { 105 return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B); 106 } 107 108 extern __inline __m256i 109 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 110 _mm256_add_epi32 (__m256i __A, __m256i __B) 111 { 112 return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B); 113 } 114 115 extern __inline __m256i 116 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 117 _mm256_add_epi64 (__m256i __A, __m256i __B) 118 { 119 return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B); 120 } 121 122 extern __inline __m256i 123 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 124 _mm256_adds_epi8 (__m256i __A, __m256i __B) 125 { 126 return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B); 127 } 128 129 extern __inline __m256i 130 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 131 _mm256_adds_epi16 (__m256i __A, __m256i __B) 132 { 133 return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B); 134 } 135 136 extern __inline __m256i 137 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 138 _mm256_adds_epu8 (__m256i __A, __m256i __B) 139 { 140 return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B); 141 } 142 143 extern __inline __m256i 144 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 145 _mm256_adds_epu16 (__m256i __A, __m256i __B) 146 { 147 return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B); 148 } 149 150 #ifdef __OPTIMIZE__ 151 extern __inline __m256i 152 __attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) 153 _mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N) 154 { 155 return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A, 156 (__v4di)__B, 157 __N * 8); 158 } 159 #else 160 /* In that case (__N*8) will be in vreg, and insn will not be matched. */ 161 /* Use define instead */ 162 #define _mm256_alignr_epi8(A, B, N) \ 163 ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \ 164 (__v4di)(__m256i)(B), \ 165 (int)(N) * 8)) 166 #endif 167 168 extern __inline __m256i 169 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 170 _mm256_and_si256 (__m256i __A, __m256i __B) 171 { 172 return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B); 173 } 174 175 extern __inline __m256i 176 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 177 _mm256_andnot_si256 (__m256i __A, __m256i __B) 178 { 179 return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B); 180 } 181 182 extern __inline __m256i 183 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 184 _mm256_avg_epu8 (__m256i __A, __m256i __B) 185 { 186 return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B); 187 } 188 189 extern __inline __m256i 190 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 191 _mm256_avg_epu16 (__m256i __A, __m256i __B) 192 { 193 return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B); 194 } 195 196 extern __inline __m256i 197 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 198 _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M) 199 { 200 return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X, 201 (__v32qi)__Y, 202 (__v32qi)__M); 203 } 204 205 #ifdef __OPTIMIZE__ 206 extern __inline __m256i 207 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 208 _mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M) 209 { 210 return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X, 211 
(__v16hi)__Y, 212 __M); 213 } 214 #else 215 #define _mm256_blend_epi16(X, Y, M) \ 216 ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \ 217 (__v16hi)(__m256i)(Y), (int)(M))) 218 #endif 219 220 extern __inline __m256i 221 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 222 _mm256_cmpeq_epi8 (__m256i __A, __m256i __B) 223 { 224 return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B); 225 } 226 227 extern __inline __m256i 228 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 229 _mm256_cmpeq_epi16 (__m256i __A, __m256i __B) 230 { 231 return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B); 232 } 233 234 extern __inline __m256i 235 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 236 _mm256_cmpeq_epi32 (__m256i __A, __m256i __B) 237 { 238 return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B); 239 } 240 241 extern __inline __m256i 242 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 243 _mm256_cmpeq_epi64 (__m256i __A, __m256i __B) 244 { 245 return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B); 246 } 247 248 extern __inline __m256i 249 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 250 _mm256_cmpgt_epi8 (__m256i __A, __m256i __B) 251 { 252 return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A, 253 (__v32qi)__B); 254 } 255 256 extern __inline __m256i 257 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 258 _mm256_cmpgt_epi16 (__m256i __A, __m256i __B) 259 { 260 return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A, 261 (__v16hi)__B); 262 } 263 264 extern __inline __m256i 265 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 266 _mm256_cmpgt_epi32 (__m256i __A, __m256i __B) 267 { 268 return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A, 269 (__v8si)__B); 270 } 271 272 extern __inline __m256i 273 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
274 _mm256_cmpgt_epi64 (__m256i __A, __m256i __B) 275 { 276 return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B); 277 } 278 279 extern __inline __m256i 280 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 281 _mm256_hadd_epi16 (__m256i __X, __m256i __Y) 282 { 283 return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X, 284 (__v16hi)__Y); 285 } 286 287 extern __inline __m256i 288 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 289 _mm256_hadd_epi32 (__m256i __X, __m256i __Y) 290 { 291 return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y); 292 } 293 294 extern __inline __m256i 295 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 296 _mm256_hadds_epi16 (__m256i __X, __m256i __Y) 297 { 298 return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X, 299 (__v16hi)__Y); 300 } 301 302 extern __inline __m256i 303 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 304 _mm256_hsub_epi16 (__m256i __X, __m256i __Y) 305 { 306 return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X, 307 (__v16hi)__Y); 308 } 309 310 extern __inline __m256i 311 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 312 _mm256_hsub_epi32 (__m256i __X, __m256i __Y) 313 { 314 return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y); 315 } 316 317 extern __inline __m256i 318 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 319 _mm256_hsubs_epi16 (__m256i __X, __m256i __Y) 320 { 321 return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X, 322 (__v16hi)__Y); 323 } 324 325 extern __inline __m256i 326 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 327 _mm256_maddubs_epi16 (__m256i __X, __m256i __Y) 328 { 329 return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X, 330 (__v32qi)__Y); 331 } 332 333 extern __inline __m256i 334 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 335 _mm256_madd_epi16 (__m256i __A, __m256i 
__B) 336 { 337 return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A, 338 (__v16hi)__B); 339 } 340 341 extern __inline __m256i 342 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 343 _mm256_max_epi8 (__m256i __A, __m256i __B) 344 { 345 return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B); 346 } 347 348 extern __inline __m256i 349 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 350 _mm256_max_epi16 (__m256i __A, __m256i __B) 351 { 352 return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B); 353 } 354 355 extern __inline __m256i 356 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 357 _mm256_max_epi32 (__m256i __A, __m256i __B) 358 { 359 return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B); 360 } 361 362 extern __inline __m256i 363 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 364 _mm256_max_epu8 (__m256i __A, __m256i __B) 365 { 366 return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B); 367 } 368 369 extern __inline __m256i 370 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 371 _mm256_max_epu16 (__m256i __A, __m256i __B) 372 { 373 return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B); 374 } 375 376 extern __inline __m256i 377 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 378 _mm256_max_epu32 (__m256i __A, __m256i __B) 379 { 380 return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B); 381 } 382 383 extern __inline __m256i 384 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 385 _mm256_min_epi8 (__m256i __A, __m256i __B) 386 { 387 return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B); 388 } 389 390 extern __inline __m256i 391 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 392 _mm256_min_epi16 (__m256i __A, __m256i __B) 393 { 394 return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B); 395 
} 396 397 extern __inline __m256i 398 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 399 _mm256_min_epi32 (__m256i __A, __m256i __B) 400 { 401 return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B); 402 } 403 404 extern __inline __m256i 405 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 406 _mm256_min_epu8 (__m256i __A, __m256i __B) 407 { 408 return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B); 409 } 410 411 extern __inline __m256i 412 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 413 _mm256_min_epu16 (__m256i __A, __m256i __B) 414 { 415 return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B); 416 } 417 418 extern __inline __m256i 419 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 420 _mm256_min_epu32 (__m256i __A, __m256i __B) 421 { 422 return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B); 423 } 424 425 extern __inline int 426 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 427 _mm256_movemask_epi8 (__m256i __A) 428 { 429 return __builtin_ia32_pmovmskb256 ((__v32qi)__A); 430 } 431 432 extern __inline __m256i 433 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 434 _mm256_cvtepi8_epi16 (__m128i __X) 435 { 436 return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X); 437 } 438 439 extern __inline __m256i 440 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 441 _mm256_cvtepi8_epi32 (__m128i __X) 442 { 443 return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X); 444 } 445 446 extern __inline __m256i 447 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 448 _mm256_cvtepi8_epi64 (__m128i __X) 449 { 450 return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X); 451 } 452 453 extern __inline __m256i 454 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 455 _mm256_cvtepi16_epi32 (__m128i __X) 456 { 457 return (__m256i) 
__builtin_ia32_pmovsxwd256 ((__v8hi)__X); 458 } 459 460 extern __inline __m256i 461 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 462 _mm256_cvtepi16_epi64 (__m128i __X) 463 { 464 return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X); 465 } 466 467 extern __inline __m256i 468 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 469 _mm256_cvtepi32_epi64 (__m128i __X) 470 { 471 return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X); 472 } 473 474 extern __inline __m256i 475 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 476 _mm256_cvtepu8_epi16 (__m128i __X) 477 { 478 return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X); 479 } 480 481 extern __inline __m256i 482 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 483 _mm256_cvtepu8_epi32 (__m128i __X) 484 { 485 return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X); 486 } 487 488 extern __inline __m256i 489 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 490 _mm256_cvtepu8_epi64 (__m128i __X) 491 { 492 return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X); 493 } 494 495 extern __inline __m256i 496 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 497 _mm256_cvtepu16_epi32 (__m128i __X) 498 { 499 return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X); 500 } 501 502 extern __inline __m256i 503 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 504 _mm256_cvtepu16_epi64 (__m128i __X) 505 { 506 return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X); 507 } 508 509 extern __inline __m256i 510 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 511 _mm256_cvtepu32_epi64 (__m128i __X) 512 { 513 return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X); 514 } 515 516 extern __inline __m256i 517 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 518 _mm256_mul_epi32 (__m256i __X, __m256i __Y) 519 { 520 return (__m256i) 
__builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y); 521 } 522 523 extern __inline __m256i 524 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 525 _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y) 526 { 527 return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X, 528 (__v16hi)__Y); 529 } 530 531 extern __inline __m256i 532 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 533 _mm256_mulhi_epu16 (__m256i __A, __m256i __B) 534 { 535 return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B); 536 } 537 538 extern __inline __m256i 539 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 540 _mm256_mulhi_epi16 (__m256i __A, __m256i __B) 541 { 542 return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B); 543 } 544 545 extern __inline __m256i 546 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 547 _mm256_mullo_epi16 (__m256i __A, __m256i __B) 548 { 549 return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B); 550 } 551 552 extern __inline __m256i 553 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 554 _mm256_mullo_epi32 (__m256i __A, __m256i __B) 555 { 556 return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B); 557 } 558 559 extern __inline __m256i 560 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 561 _mm256_mul_epu32 (__m256i __A, __m256i __B) 562 { 563 return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B); 564 } 565 566 extern __inline __m256i 567 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 568 _mm256_or_si256 (__m256i __A, __m256i __B) 569 { 570 return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B); 571 } 572 573 extern __inline __m256i 574 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 575 _mm256_sad_epu8 (__m256i __A, __m256i __B) 576 { 577 return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B); 578 } 579 580 extern 
__inline __m256i 581 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 582 _mm256_shuffle_epi8 (__m256i __X, __m256i __Y) 583 { 584 return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X, 585 (__v32qi)__Y); 586 } 587 588 #ifdef __OPTIMIZE__ 589 extern __inline __m256i 590 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 591 _mm256_shuffle_epi32 (__m256i __A, const int __mask) 592 { 593 return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask); 594 } 595 596 extern __inline __m256i 597 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 598 _mm256_shufflehi_epi16 (__m256i __A, const int __mask) 599 { 600 return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask); 601 } 602 603 extern __inline __m256i 604 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 605 _mm256_shufflelo_epi16 (__m256i __A, const int __mask) 606 { 607 return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask); 608 } 609 #else 610 #define _mm256_shuffle_epi32(A, N) \ 611 ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N))) 612 #define _mm256_shufflehi_epi16(A, N) \ 613 ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N))) 614 #define _mm256_shufflelo_epi16(A, N) \ 615 ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N))) 616 #endif 617 618 extern __inline __m256i 619 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 620 _mm256_sign_epi8 (__m256i __X, __m256i __Y) 621 { 622 return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y); 623 } 624 625 extern __inline __m256i 626 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 627 _mm256_sign_epi16 (__m256i __X, __m256i __Y) 628 { 629 return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y); 630 } 631 632 extern __inline __m256i 633 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 634 _mm256_sign_epi32 (__m256i __X, __m256i __Y) 635 { 
636 return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y); 637 } 638 639 #ifdef __OPTIMIZE__ 640 extern __inline __m256i 641 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 642 _mm256_slli_si256 (__m256i __A, const int __N) 643 { 644 return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); 645 } 646 #else 647 #define _mm256_slli_si256(A, N) \ 648 ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) 649 #endif 650 651 extern __inline __m256i 652 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 653 _mm256_slli_epi16 (__m256i __A, int __B) 654 { 655 return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); 656 } 657 658 extern __inline __m256i 659 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 660 _mm256_sll_epi16 (__m256i __A, __m128i __B) 661 { 662 return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B); 663 } 664 665 extern __inline __m256i 666 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 667 _mm256_slli_epi32 (__m256i __A, int __B) 668 { 669 return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B); 670 } 671 672 extern __inline __m256i 673 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 674 _mm256_sll_epi32 (__m256i __A, __m128i __B) 675 { 676 return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B); 677 } 678 679 extern __inline __m256i 680 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 681 _mm256_slli_epi64 (__m256i __A, int __B) 682 { 683 return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B); 684 } 685 686 extern __inline __m256i 687 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 688 _mm256_sll_epi64 (__m256i __A, __m128i __B) 689 { 690 return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B); 691 } 692 693 extern __inline __m256i 694 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 695 _mm256_srai_epi16 (__m256i __A, int __B) 696 
{ 697 return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B); 698 } 699 700 extern __inline __m256i 701 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 702 _mm256_sra_epi16 (__m256i __A, __m128i __B) 703 { 704 return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B); 705 } 706 707 extern __inline __m256i 708 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 709 _mm256_srai_epi32 (__m256i __A, int __B) 710 { 711 return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B); 712 } 713 714 extern __inline __m256i 715 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 716 _mm256_sra_epi32 (__m256i __A, __m128i __B) 717 { 718 return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B); 719 } 720 721 #ifdef __OPTIMIZE__ 722 extern __inline __m256i 723 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 724 _mm256_srli_si256 (__m256i __A, const int __N) 725 { 726 return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); 727 } 728 #else 729 #define _mm256_srli_si256(A, N) \ 730 ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) 731 #endif 732 733 extern __inline __m256i 734 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 735 _mm256_srli_epi16 (__m256i __A, int __B) 736 { 737 return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B); 738 } 739 740 extern __inline __m256i 741 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 742 _mm256_srl_epi16 (__m256i __A, __m128i __B) 743 { 744 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B); 745 } 746 747 extern __inline __m256i 748 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 749 _mm256_srli_epi32 (__m256i __A, int __B) 750 { 751 return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B); 752 } 753 754 extern __inline __m256i 755 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 756 _mm256_srl_epi32 (__m256i __A, __m128i __B) 757 
{ 758 return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B); 759 } 760 761 extern __inline __m256i 762 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 763 _mm256_srli_epi64 (__m256i __A, int __B) 764 { 765 return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B); 766 } 767 768 extern __inline __m256i 769 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 770 _mm256_srl_epi64 (__m256i __A, __m128i __B) 771 { 772 return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B); 773 } 774 775 extern __inline __m256i 776 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 777 _mm256_sub_epi8 (__m256i __A, __m256i __B) 778 { 779 return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B); 780 } 781 782 extern __inline __m256i 783 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 784 _mm256_sub_epi16 (__m256i __A, __m256i __B) 785 { 786 return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B); 787 } 788 789 extern __inline __m256i 790 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 791 _mm256_sub_epi32 (__m256i __A, __m256i __B) 792 { 793 return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B); 794 } 795 796 extern __inline __m256i 797 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 798 _mm256_sub_epi64 (__m256i __A, __m256i __B) 799 { 800 return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B); 801 } 802 803 extern __inline __m256i 804 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 805 _mm256_subs_epi8 (__m256i __A, __m256i __B) 806 { 807 return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B); 808 } 809 810 extern __inline __m256i 811 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 812 _mm256_subs_epi16 (__m256i __A, __m256i __B) 813 { 814 return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B); 815 } 816 817 extern __inline __m256i 
818 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 819 _mm256_subs_epu8 (__m256i __A, __m256i __B) 820 { 821 return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B); 822 } 823 824 extern __inline __m256i 825 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 826 _mm256_subs_epu16 (__m256i __A, __m256i __B) 827 { 828 return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B); 829 } 830 831 extern __inline __m256i 832 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 833 _mm256_unpackhi_epi8 (__m256i __A, __m256i __B) 834 { 835 return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B); 836 } 837 838 extern __inline __m256i 839 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 840 _mm256_unpackhi_epi16 (__m256i __A, __m256i __B) 841 { 842 return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B); 843 } 844 845 extern __inline __m256i 846 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 847 _mm256_unpackhi_epi32 (__m256i __A, __m256i __B) 848 { 849 return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B); 850 } 851 852 extern __inline __m256i 853 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 854 _mm256_unpackhi_epi64 (__m256i __A, __m256i __B) 855 { 856 return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B); 857 } 858 859 extern __inline __m256i 860 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 861 _mm256_unpacklo_epi8 (__m256i __A, __m256i __B) 862 { 863 return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B); 864 } 865 866 extern __inline __m256i 867 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 868 _mm256_unpacklo_epi16 (__m256i __A, __m256i __B) 869 { 870 return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B); 871 } 872 873 extern __inline __m256i 874 __attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__))
_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
}

/* Bitwise XOR of the full 256-bit operands.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_xor_si256 (__m256i __A, __m256i __B)
{
  return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B);
}

/* Non-temporal 256-bit load (VMOVNTDQA).  NOTE(review): the
   instruction requires an aligned memory operand -- not checked
   here; confirm callers.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_stream_load_si256 (__m256i const *__X)
{
  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
}

/* Broadcasts: replicate the lowest element (or low 128 bits) of the
   source into every element of the destination
   (VBROADCASTSS / VBROADCASTSD / VBROADCASTI128).  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastss_ps (__m128 __X)
{
  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastss_ps (__m128 __X)
{
  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsd_pd (__m128d __X)
{
  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastsi128_si256 (__m128i __X)
{
  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
}

/* VPBLENDD: select each 32-bit element from __X or __Y according to
   the corresponding bit of the immediate __M.  The immediate-taking
   forms are real functions only when optimizing (so the constant can
   propagate into the builtin); otherwise they are macros.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
{
  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
					      (__v4si)__Y,
					      __M);
}
#else
#define _mm_blend_epi32(X, Y, M)					\
  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),		\
					(__v4si)(__m128i)(Y), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
					      (__v8si)__Y,
					      __M);
}
#else
#define _mm256_blend_epi32(X, Y, M)					\
  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),		\
					(__v8si)(__m256i)(Y), (int)(M)))
#endif

/* VPBROADCASTB/W/D/Q: replicate the lowest element of __X into all
   elements of the 256-bit destination.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastb_epi8 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastw_epi16 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastd_epi32 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_broadcastq_epi64 (__m128i __X)
{
  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
}

/* 128-bit destination variants of the element broadcasts.  */
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastb_epi8 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastw_epi16 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastd_epi32 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_broadcastq_epi64 (__m128i __X)
{
  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
}

/* Full-width lane permutes (VPERMD/VPERMPS take a variable control in
   __Y; VPERMQ/VPERMPD take an immediate).  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_pd (__m256d __X, const int __M)
{
  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
}
#else
#define _mm256_permute4x64_pd(X, M)					\
  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
#endif

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute4x64_epi64 (__m256i __X, const int __M)
{
  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
}
#else
#define _mm256_permute4x64_epi64(X, M)					\
  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

/* VPERM2I128: combine/permute 128-bit halves of __X and __Y per the
   immediate __M.  */
#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
}
#else
#define _mm256_permute2x128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
#endif

/* VEXTRACTI128 / VINSERTI128: extract or insert one 128-bit half,
   selected by the immediate.  */
#ifdef __OPTIMIZE__
extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_extracti128_si256 (__m256i __X, const int __M)
{
  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
}
#else
#define _mm256_extracti128_si256(X, M)					\
  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
#endif

#ifdef __OPTIMIZE__
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
{
  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
}
#else
#define _mm256_inserti128_si256(X, Y, M)				\
  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X),	\
					   (__v2di)(__m128i)(Y),	\
					   (int)(M)))
#endif

/* VPMASKMOV loads: load only the elements whose mask element is
   selected; the builtin supplies the masked-off lanes.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi32 (int const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
						(__v8si)__M);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskload_epi64 (long long const *__X, __m256i __M )
{
  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
						(__v4di)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi32 (int const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
					     (__v4si)__M);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskload_epi64 (long long const *__X, __m128i __M )
{
  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
					     (__v2di)__M);
}

/* VPMASKMOV stores: store only the elements of __Y whose mask element
   in __M is selected.  */
extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
{
  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
}

extern __inline void
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
{
  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
}

/* Per-element variable shifts (VPSLLV*, VPSRAV*, VPSRLV*): each
   element of __X is shifted by the count in the corresponding element
   of __Y.  */
extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_sllv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srav_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srav_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi32 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
{
  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_srlv_epi64 (__m128i __X, __m128i __Y)
{
  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
}

/* AVX2 gathers (VGATHERDPD/VGATHERQPD/... ): load elements addressed
   by per-element indices scaled by `scale'.  The maskless forms pass
   a zero source and an all-ones mask so every element is gathered;
   the _mask_ forms load only where the mask element is set and take
   the corresponding `src' element elsewhere.  Function forms only
   when optimizing; macro forms otherwise (see the #else arm).  */
#ifdef __OPTIMIZE__
extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  /* cmpeq (x, x) yields all-ones bits: gather every element.  */
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gathersiv2df (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src,
						base,
						(__v4si)index,
						(__v2df)mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_pd (double const *base, __m128i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gathersiv4df (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_pd (__m256d src, double const *base,
			  __m128i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src,
						base,
						(__v4si)index,
						(__v4df)mask,
						scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_pd (double const *base, __m128i index, const int scale)
{
  __v2df src = _mm_setzero_pd ();
  __v2df mask = _mm_cmpeq_pd (src, src);

  return (__m128d) __builtin_ia32_gatherdiv2df (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

extern __inline __m128d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index,
		       __m128d mask, const int scale)
{
  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src,
						base,
						(__v2di)index,
						(__v2df)mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_pd (double const *base, __m256i index, const int scale)
{
  __v4df src = _mm256_setzero_pd ();
  __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ);

  return (__m256d) __builtin_ia32_gatherdiv4df (src,
						base,
						(__v4di)index,
						mask,
						scale);
}

extern __inline __m256d
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_pd (__m256d src, double const *base,
			  __m256i index, __m256d mask, const int scale)
{
  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src,
						base,
						(__v4di)index,
						(__v4df)mask,
						scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gathersiv4sf (src,
					       base,
					       (__v4si)index,
					       mask,
					       scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src,
					       base,
					       (__v4si)index,
					       (__v4sf)mask,
					       scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_ps (float const *base, __m256i index, const int scale)
{
  __v8sf src = _mm256_setzero_ps ();
  __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);

  return (__m256) __builtin_ia32_gathersiv8sf (src,
					       base,
					       (__v8si)index,
					       mask,
					       scale);
}

extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_ps (__m256 src, float const *base,
			  __m256i index, __m256 mask, const int scale)
{
  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src,
					       base,
					       (__v8si)index,
					       (__v8sf)mask,
					       scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_ps (float const *base, __m128i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf (src,
					       base,
					       (__v2di)index,
					       mask,
					       scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index,
		       __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src,
					       base,
					       (__v2di)index,
					       (__v4sf)mask,
					       scale);
}

/* 64-bit indices select only four floats, so the result is __m128
   even though the index vector is 256 bits wide.  */
extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_ps (float const *base, __m256i index, const int scale)
{
  __v4sf src = _mm_setzero_ps ();
  __v4sf mask = _mm_cmpeq_ps (src, src);

  return (__m128) __builtin_ia32_gatherdiv4sf256 (src,
						  base,
						  (__v4di)index,
						  mask,
						  scale);
}

extern __inline __m128
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_ps (__m128 src, float const *base,
			  __m256i index, __m128 mask, const int scale)
{
  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src,
						  base,
						  (__v4di)index,
						  (__v4sf)mask,
						  scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  /* Integer vectors have no cmpeq helper in scope here; build the
     all-zeros source and all-ones mask with vector literals.  */
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv2di (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi64 (__m128i src, long long int const *base,
			  __m128i index, __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src,
						base,
						(__v4si)index,
						(__v2di)mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i32gather_epi64 (long long int const *base,
			__m128i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gathersiv4di (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i32gather_epi64 (__m256i src, long long int const *base,
			     __m128i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src,
						base,
						(__v4si)index,
						(__v4di)mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i64gather_epi64 (long long int const *base,
		     __m128i index, const int scale)
{
  __v2di src = __extension__ (__v2di){ 0, 0 };
  __v2di mask = __extension__ (__v2di){ ~0, ~0 };

  return (__m128i) __builtin_ia32_gatherdiv2di (src,
						base,
						(__v2di)index,
						mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src,
						base,
						(__v2di)index,
						(__v2di)mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_i64gather_epi64 (long long int const *base,
			__m256i index, const int scale)
{
  __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 };
  __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };

  return (__m256i) __builtin_ia32_gatherdiv4di (src,
						base,
						(__v4di)index,
						mask,
						scale);
}

extern __inline __m256i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm256_mask_i64gather_epi64 (__m256i src, long long int const *base,
			     __m256i index, __m256i mask, const int scale)
{
  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src,
						base,
						(__v4di)index,
						(__v4di)mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_i32gather_epi32 (int const *base, __m128i index, const int scale)
{
  __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 };
  __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };

  return (__m128i) __builtin_ia32_gathersiv4si (src,
						base,
						(__v4si)index,
						mask,
						scale);
}

extern __inline __m128i
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
_mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index,
			  __m128i mask, const int scale)
{
  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src,
						base,
						(__v4si)index,
						(__v4si)mask,
						scale);
}

1565 extern __inline __m256i 1566 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1567 _mm256_i32gather_epi32 (int const *base, __m256i index, const int scale) 1568 { 1569 __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 }; 1570 __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }; 1571 1572 return (__m256i) __builtin_ia32_gathersiv8si (src, 1573 base, 1574 (__v8si)index, 1575 mask, 1576 scale); 1577 } 1578 1579 extern __inline __m256i 1580 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1581 _mm256_mask_i32gather_epi32 (__m256i src, int const *base, 1582 __m256i index, __m256i mask, const int scale) 1583 { 1584 return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src, 1585 base, 1586 (__v8si)index, 1587 (__v8si)mask, 1588 scale); 1589 } 1590 1591 extern __inline __m128i 1592 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1593 _mm_i64gather_epi32 (int const *base, __m128i index, const int scale) 1594 { 1595 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1596 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1597 1598 return (__m128i) __builtin_ia32_gatherdiv4si (src, 1599 base, 1600 (__v2di)index, 1601 mask, 1602 scale); 1603 } 1604 1605 extern __inline __m128i 1606 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1607 _mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index, 1608 __m128i mask, const int scale) 1609 { 1610 return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src, 1611 base, 1612 (__v2di)index, 1613 (__v4si)mask, 1614 scale); 1615 } 1616 1617 extern __inline __m128i 1618 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1619 _mm256_i64gather_epi32 (int const *base, __m256i index, const int scale) 1620 { 1621 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1622 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1623 1624 return (__m128i) __builtin_ia32_gatherdiv4si256 (src, 1625 base, 1626 
(__v4di)index, 1627 mask, 1628 scale); 1629 } 1630 1631 extern __inline __m128i 1632 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1633 _mm256_mask_i64gather_epi32 (__m128i src, int const *base, 1634 __m256i index, __m128i mask, const int scale) 1635 { 1636 return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src, 1637 base, 1638 (__v4di)index, 1639 (__v4si)mask, 1640 scale); 1641 } 1642 #else /* __OPTIMIZE__ */ 1643 #define _mm_i32gather_pd(BASE, INDEX, SCALE) \ 1644 (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \ 1645 (double const *)BASE, \ 1646 (__v4si)(__m128i)INDEX, \ 1647 (__v2df)_mm_set1_pd( \ 1648 (double)(long long int) -1), \ 1649 (int)SCALE) 1650 1651 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1652 (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \ 1653 (double const *)BASE, \ 1654 (__v4si)(__m128i)INDEX, \ 1655 (__v2df)(__m128d)MASK, \ 1656 (int)SCALE) 1657 1658 #define _mm256_i32gather_pd(BASE, INDEX, SCALE) \ 1659 (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \ 1660 (double const *)BASE, \ 1661 (__v4si)(__m128i)INDEX, \ 1662 (__v4df)_mm256_set1_pd( \ 1663 (double)(long long int) -1), \ 1664 (int)SCALE) 1665 1666 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1667 (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \ 1668 (double const *)BASE, \ 1669 (__v4si)(__m128i)INDEX, \ 1670 (__v4df)(__m256d)MASK, \ 1671 (int)SCALE) 1672 1673 #define _mm_i64gather_pd(BASE, INDEX, SCALE) \ 1674 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \ 1675 (double const *)BASE, \ 1676 (__v2di)(__m128i)INDEX, \ 1677 (__v2df)_mm_set1_pd( \ 1678 (double)(long long int) -1), \ 1679 (int)SCALE) 1680 1681 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1682 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \ 1683 (double const *)BASE, \ 1684 (__v2di)(__m128i)INDEX, \ 1685 (__v2df)(__m128d)MASK, \ 1686 
(int)SCALE) 1687 1688 #define _mm256_i64gather_pd(BASE, INDEX, SCALE) \ 1689 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \ 1690 (double const *)BASE, \ 1691 (__v4di)(__m256i)INDEX, \ 1692 (__v4df)_mm256_set1_pd( \ 1693 (double)(long long int) -1), \ 1694 (int)SCALE) 1695 1696 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1697 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \ 1698 (double const *)BASE, \ 1699 (__v4di)(__m256i)INDEX, \ 1700 (__v4df)(__m256d)MASK, \ 1701 (int)SCALE) 1702 1703 #define _mm_i32gather_ps(BASE, INDEX, SCALE) \ 1704 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \ 1705 (float const *)BASE, \ 1706 (__v4si)(__m128i)INDEX, \ 1707 _mm_set1_ps ((float)(int) -1), \ 1708 (int)SCALE) 1709 1710 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1711 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC, \ 1712 (float const *)BASE, \ 1713 (__v4si)(__m128i)INDEX, \ 1714 (__v4sf)(__m128d)MASK, \ 1715 (int)SCALE) 1716 1717 #define _mm256_i32gather_ps(BASE, INDEX, SCALE) \ 1718 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \ 1719 (float const *)BASE, \ 1720 (__v8si)(__m256i)INDEX, \ 1721 (__v8sf)_mm256_set1_ps ( \ 1722 (float)(int) -1), \ 1723 (int)SCALE) 1724 1725 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1726 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \ 1727 (float const *)BASE, \ 1728 (__v8si)(__m256i)INDEX, \ 1729 (__v8sf)(__m256d)MASK, \ 1730 (int)SCALE) 1731 1732 #define _mm_i64gather_ps(BASE, INDEX, SCALE) \ 1733 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \ 1734 (float const *)BASE, \ 1735 (__v2di)(__m128i)INDEX, \ 1736 (__v4sf)_mm_set1_ps ( \ 1737 (float)(int) -1), \ 1738 (int)SCALE) 1739 1740 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1741 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \ 1742 (float const *)BASE, \ 1743 
(__v2di)(__m128i)INDEX, \ 1744 (__v4sf)(__m128d)MASK, \ 1745 (int)SCALE) 1746 1747 #define _mm256_i64gather_ps(BASE, INDEX, SCALE) \ 1748 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \ 1749 (float const *)BASE, \ 1750 (__v4di)(__m256i)INDEX, \ 1751 (__v4sf)_mm_set1_ps( \ 1752 (float)(int) -1), \ 1753 (int)SCALE) 1754 1755 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1756 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \ 1757 (float const *)BASE, \ 1758 (__v4di)(__m256i)INDEX, \ 1759 (__v4sf)(__m128)MASK, \ 1760 (int)SCALE) 1761 1762 #define _mm_i32gather_epi64(BASE, INDEX, SCALE) \ 1763 (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \ 1764 (long long const *)BASE, \ 1765 (__v4si)(__m128i)INDEX, \ 1766 (__v2di)_mm_set1_epi64x (-1), \ 1767 (int)SCALE) 1768 1769 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1770 (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \ 1771 (long long const *)BASE, \ 1772 (__v4si)(__m128i)INDEX, \ 1773 (__v2di)(__m128i)MASK, \ 1774 (int)SCALE) 1775 1776 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \ 1777 (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \ 1778 (long long const *)BASE, \ 1779 (__v4si)(__m128i)INDEX, \ 1780 (__v4di)_mm256_set1_epi64x (-1), \ 1781 (int)SCALE) 1782 1783 #define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1784 (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \ 1785 (long long const *)BASE, \ 1786 (__v4si)(__m128i)INDEX, \ 1787 (__v4di)(__m256i)MASK, \ 1788 (int)SCALE) 1789 1790 #define _mm_i64gather_epi64(BASE, INDEX, SCALE) \ 1791 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \ 1792 (long long const *)BASE, \ 1793 (__v2di)(__m128i)INDEX, \ 1794 (__v2di)_mm_set1_epi64x (-1), \ 1795 (int)SCALE) 1796 1797 #define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1798 (__m128i) __builtin_ia32_gatherdiv2di 
((__v2di)(__m128i)SRC, \ 1799 (long long const *)BASE, \ 1800 (__v2di)(__m128i)INDEX, \ 1801 (__v2di)(__m128i)MASK, \ 1802 (int)SCALE) 1803 1804 #define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \ 1805 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \ 1806 (long long const *)BASE, \ 1807 (__v4di)(__m256i)INDEX, \ 1808 (__v4di)_mm256_set1_epi64x (-1), \ 1809 (int)SCALE) 1810 1811 #define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1812 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \ 1813 (long long const *)BASE, \ 1814 (__v4di)(__m256i)INDEX, \ 1815 (__v4di)(__m256i)MASK, \ 1816 (int)SCALE) 1817 1818 #define _mm_i32gather_epi32(BASE, INDEX, SCALE) \ 1819 (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \ 1820 (int const *)BASE, \ 1821 (__v4si)(__m128i)INDEX, \ 1822 (__v4si)_mm_set1_epi32 (-1), \ 1823 (int)SCALE) 1824 1825 #define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1826 (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \ 1827 (int const *)BASE, \ 1828 (__v4si)(__m128i)INDEX, \ 1829 (__v4si)(__m128i)MASK, \ 1830 (int)SCALE) 1831 1832 #define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \ 1833 (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \ 1834 (int const *)BASE, \ 1835 (__v8si)(__m256i)INDEX, \ 1836 (__v8si)_mm256_set1_epi32 (-1), \ 1837 (int)SCALE) 1838 1839 #define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1840 (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \ 1841 (int const *)BASE, \ 1842 (__v8si)(__m256i)INDEX, \ 1843 (__v8si)(__m256i)MASK, \ 1844 (int)SCALE) 1845 1846 #define _mm_i64gather_epi32(BASE, INDEX, SCALE) \ 1847 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \ 1848 (int const *)BASE, \ 1849 (__v2di)(__m128i)INDEX, \ 1850 (__v4si)_mm_set1_epi32 (-1), \ 1851 (int)SCALE) 1852 1853 #define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1854 (__m128i) 
__builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \ 1855 (int const *)BASE, \ 1856 (__v2di)(__m128i)INDEX, \ 1857 (__v4si)(__m128i)MASK, \ 1858 (int)SCALE) 1859 1860 #define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \ 1861 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \ 1862 (int const *)BASE, \ 1863 (__v4di)(__m256i)INDEX, \ 1864 (__v4si)_mm_set1_epi32(-1), \ 1865 (int)SCALE) 1866 1867 #define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1868 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \ 1869 (int const *)BASE, \ 1870 (__v4di)(__m256i)INDEX, \ 1871 (__v4si)(__m128i)MASK, \ 1872 (int)SCALE) 1873 #endif /* __OPTIMIZE__ */ 1874