1 /* Copyright (C) 2011-2014 Free Software Foundation, Inc. 2 3 This file is part of GCC. 4 5 GCC is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 3, or (at your option) 8 any later version. 9 10 GCC is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 Under Section 7 of GPL version 3, you are granted additional 16 permissions described in the GCC Runtime Library Exception, version 17 3.1, as published by the Free Software Foundation. 18 19 You should have received a copy of the GNU General Public License and 20 a copy of the GCC Runtime Library Exception along with this program; 21 see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 22 <http://www.gnu.org/licenses/>. */ 23 24 #ifndef _IMMINTRIN_H_INCLUDED 25 # error "Never use <avx2intrin.h> directly; include <immintrin.h> instead." 26 #endif 27 28 #ifndef _AVX2INTRIN_H_INCLUDED 29 #define _AVX2INTRIN_H_INCLUDED 30 31 #ifndef __AVX2__ 32 #pragma GCC push_options 33 #pragma GCC target("avx2") 34 #define __DISABLE_AVX2__ 35 #endif /* __AVX2__ */ 36 37 /* Sum absolute 8-bit integer difference of adjacent groups of 4 38 byte integers in the first 2 operands. Starting offsets within 39 operands are determined by the 3rd mask operand. */ 40 #ifdef __OPTIMIZE__ 41 extern __inline __m256i 42 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 43 _mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M) 44 { 45 return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X, 46 (__v32qi)__Y, __M); 47 } 48 #else 49 #define _mm256_mpsadbw_epu8(X, Y, M) \ 50 ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \ 51 (__v32qi)(__m256i)(Y), (int)(M))) 52 #endif 53 54 extern __inline __m256i 55 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 56 _mm256_abs_epi8 (__m256i __A) 57 { 58 return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A); 59 } 60 61 extern __inline __m256i 62 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 63 _mm256_abs_epi16 (__m256i __A) 64 { 65 return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A); 66 } 67 68 extern __inline __m256i 69 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 70 _mm256_abs_epi32 (__m256i __A) 71 { 72 return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A); 73 } 74 75 extern __inline __m256i 76 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 77 _mm256_packs_epi32 (__m256i __A, __m256i __B) 78 { 79 return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B); 80 } 81 82 extern __inline __m256i 83 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 84 _mm256_packs_epi16 (__m256i __A, __m256i __B) 85 { 86 return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B); 87 } 88 89 extern __inline __m256i 90 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 91 _mm256_packus_epi32 (__m256i __A, __m256i __B) 92 { 93 return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B); 94 } 95 96 extern __inline __m256i 97 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 98 _mm256_packus_epi16 (__m256i __A, __m256i __B) 99 { 100 return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B); 101 } 102 103 extern __inline __m256i 104 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 105 _mm256_add_epi8 (__m256i __A, __m256i __B) 106 { 107 return (__m256i)__builtin_ia32_paddb256 ((__v32qi)__A, (__v32qi)__B); 108 } 109 110 extern __inline __m256i 111 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 112 _mm256_add_epi16 (__m256i __A, __m256i __B) 113 { 114 return (__m256i)__builtin_ia32_paddw256 ((__v16hi)__A, (__v16hi)__B); 115 } 116 117 extern __inline __m256i 118 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 119 _mm256_add_epi32 (__m256i __A, __m256i __B) 120 { 121 return (__m256i)__builtin_ia32_paddd256 ((__v8si)__A, (__v8si)__B); 122 } 123 124 extern __inline __m256i 125 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 126 _mm256_add_epi64 (__m256i __A, __m256i __B) 127 { 128 return (__m256i)__builtin_ia32_paddq256 ((__v4di)__A, (__v4di)__B); 129 } 130 131 extern __inline __m256i 132 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 133 _mm256_adds_epi8 (__m256i __A, __m256i __B) 134 { 135 return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B); 136 } 137 138 extern __inline __m256i 139 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 140 _mm256_adds_epi16 (__m256i __A, __m256i __B) 141 { 142 return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B); 143 } 144 145 extern __inline __m256i 146 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 147 _mm256_adds_epu8 (__m256i __A, __m256i __B) 148 { 149 return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B); 150 } 151 152 extern __inline __m256i 153 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 154 _mm256_adds_epu16 (__m256i __A, __m256i __B) 155 { 156 return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B); 157 } 158 159 #ifdef __OPTIMIZE__ 160 extern __inline __m256i 161 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 162 _mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N) 163 { 164 return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A, 165 (__v4di)__B, 166 __N * 8); 167 } 168 #else 169 /* In that case (__N*8) will be in vreg, and insn will not be matched. */ 170 /* Use define instead */ 171 #define _mm256_alignr_epi8(A, B, N) \ 172 ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \ 173 (__v4di)(__m256i)(B), \ 174 (int)(N) * 8)) 175 #endif 176 177 extern __inline __m256i 178 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 179 _mm256_and_si256 (__m256i __A, __m256i __B) 180 { 181 return (__m256i) __builtin_ia32_andsi256 ((__v4di)__A, (__v4di)__B); 182 } 183 184 extern __inline __m256i 185 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 186 _mm256_andnot_si256 (__m256i __A, __m256i __B) 187 { 188 return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B); 189 } 190 191 extern __inline __m256i 192 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 193 _mm256_avg_epu8 (__m256i __A, __m256i __B) 194 { 195 return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B); 196 } 197 198 extern __inline __m256i 199 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 200 _mm256_avg_epu16 (__m256i __A, __m256i __B) 201 { 202 return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B); 203 } 204 205 extern __inline __m256i 206 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 207 _mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M) 208 { 209 return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X, 210 (__v32qi)__Y, 211 (__v32qi)__M); 212 } 213 214 #ifdef __OPTIMIZE__ 215 extern __inline __m256i 216 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 217 _mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M) 218 { 219 return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X, 220 (__v16hi)__Y, 221 __M); 222 } 223 #else 224 #define _mm256_blend_epi16(X, Y, M) \ 225 ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \ 226 (__v16hi)(__m256i)(Y), (int)(M))) 227 #endif 228 229 extern __inline __m256i 230 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 231 _mm256_cmpeq_epi8 (__m256i __A, __m256i __B) 232 { 233 return (__m256i)__builtin_ia32_pcmpeqb256 ((__v32qi)__A, (__v32qi)__B); 234 } 235 236 extern __inline __m256i 237 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 238 _mm256_cmpeq_epi16 (__m256i __A, __m256i __B) 239 { 240 return (__m256i)__builtin_ia32_pcmpeqw256 ((__v16hi)__A, (__v16hi)__B); 241 } 242 243 extern __inline __m256i 244 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 245 _mm256_cmpeq_epi32 (__m256i __A, __m256i __B) 246 { 247 return (__m256i)__builtin_ia32_pcmpeqd256 ((__v8si)__A, (__v8si)__B); 248 } 249 250 extern __inline __m256i 251 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 252 _mm256_cmpeq_epi64 (__m256i __A, __m256i __B) 253 { 254 return (__m256i)__builtin_ia32_pcmpeqq256 ((__v4di)__A, (__v4di)__B); 255 } 256 257 extern __inline __m256i 258 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 259 _mm256_cmpgt_epi8 (__m256i __A, __m256i __B) 260 { 261 return (__m256i)__builtin_ia32_pcmpgtb256 ((__v32qi)__A, 262 (__v32qi)__B); 263 } 264 265 extern __inline __m256i 266 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 267 _mm256_cmpgt_epi16 (__m256i __A, __m256i __B) 268 { 269 return (__m256i)__builtin_ia32_pcmpgtw256 ((__v16hi)__A, 270 (__v16hi)__B); 271 } 272 273 extern __inline __m256i 274 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 275 _mm256_cmpgt_epi32 (__m256i __A, __m256i __B) 276 { 277 return (__m256i)__builtin_ia32_pcmpgtd256 ((__v8si)__A, 278 (__v8si)__B); 279 } 280 281 extern __inline __m256i 282 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 283 _mm256_cmpgt_epi64 (__m256i __A, __m256i __B) 284 { 285 return (__m256i)__builtin_ia32_pcmpgtq256 ((__v4di)__A, (__v4di)__B); 286 } 287 288 extern __inline __m256i 289 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 290 _mm256_hadd_epi16 (__m256i __X, __m256i __Y) 291 { 292 return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X, 293 (__v16hi)__Y); 294 } 295 296 extern __inline __m256i 297 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 298 _mm256_hadd_epi32 (__m256i __X, __m256i __Y) 299 { 300 return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y); 301 } 302 303 extern __inline __m256i 304 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 305 _mm256_hadds_epi16 (__m256i __X, __m256i __Y) 306 { 307 return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X, 308 (__v16hi)__Y); 309 } 310 311 extern __inline __m256i 312 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 313 _mm256_hsub_epi16 (__m256i __X, __m256i __Y) 314 { 315 return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X, 316 (__v16hi)__Y); 317 } 318 319 extern __inline __m256i 320 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 321 _mm256_hsub_epi32 (__m256i __X, __m256i __Y) 322 { 323 return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y); 324 } 325 326 extern __inline __m256i 327 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 328 _mm256_hsubs_epi16 (__m256i __X, __m256i __Y) 329 { 330 return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X, 331 (__v16hi)__Y); 332 } 333 334 extern __inline __m256i 335 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 336 _mm256_maddubs_epi16 (__m256i __X, __m256i __Y) 337 { 338 return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X, 339 (__v32qi)__Y); 340 } 341 342 extern __inline __m256i 343 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 344 _mm256_madd_epi16 (__m256i __A, __m256i __B) 345 { 346 return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A, 347 (__v16hi)__B); 348 } 349 350 extern __inline __m256i 351 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 352 _mm256_max_epi8 (__m256i __A, __m256i __B) 353 { 354 return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B); 355 } 356 357 extern __inline __m256i 358 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 359 _mm256_max_epi16 (__m256i __A, __m256i __B) 360 { 361 return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B); 362 } 363 364 extern __inline __m256i 365 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 366 _mm256_max_epi32 (__m256i __A, __m256i __B) 367 { 368 return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B); 369 } 370 371 extern __inline __m256i 372 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 373 _mm256_max_epu8 (__m256i __A, __m256i __B) 374 { 375 return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B); 376 } 377 378 extern __inline __m256i 379 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 380 _mm256_max_epu16 (__m256i __A, __m256i __B) 381 { 382 return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B); 383 } 384 385 extern __inline __m256i 386 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 387 _mm256_max_epu32 (__m256i __A, __m256i __B) 388 { 389 return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B); 390 } 391 392 extern __inline __m256i 393 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 394 _mm256_min_epi8 (__m256i __A, __m256i __B) 395 { 396 return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B); 397 } 398 399 extern __inline __m256i 400 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 401 _mm256_min_epi16 (__m256i __A, __m256i __B) 402 { 403 return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B); 404 } 405 406 extern __inline __m256i 407 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 408 _mm256_min_epi32 (__m256i __A, __m256i __B) 409 { 410 return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B); 411 } 412 413 extern __inline __m256i 414 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 415 _mm256_min_epu8 (__m256i __A, __m256i __B) 416 { 417 return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B); 418 } 419 420 extern __inline __m256i 421 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 422 _mm256_min_epu16 (__m256i __A, __m256i __B) 423 { 424 return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B); 425 } 426 427 extern __inline __m256i 428 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 429 _mm256_min_epu32 (__m256i __A, __m256i __B) 430 { 431 return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B); 432 } 433 434 extern __inline int 435 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 436 _mm256_movemask_epi8 (__m256i __A) 437 { 438 return __builtin_ia32_pmovmskb256 ((__v32qi)__A); 439 } 440 441 extern __inline __m256i 442 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 443 _mm256_cvtepi8_epi16 (__m128i __X) 444 { 445 return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X); 446 } 447 448 extern __inline __m256i 449 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 450 _mm256_cvtepi8_epi32 (__m128i __X) 451 { 452 return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X); 453 } 454 455 extern __inline __m256i 456 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 457 _mm256_cvtepi8_epi64 (__m128i __X) 458 { 459 return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X); 460 } 461 462 extern __inline __m256i 463 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 464 _mm256_cvtepi16_epi32 (__m128i __X) 465 { 466 return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X); 467 } 468 469 extern __inline __m256i 470 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 471 _mm256_cvtepi16_epi64 (__m128i __X) 472 { 473 return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X); 474 } 475 476 extern __inline __m256i 477 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 478 _mm256_cvtepi32_epi64 (__m128i __X) 479 { 480 return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X); 481 } 482 483 extern __inline __m256i 484 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 485 _mm256_cvtepu8_epi16 (__m128i __X) 486 { 487 return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X); 488 } 489 490 extern __inline __m256i 491 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 492 _mm256_cvtepu8_epi32 (__m128i __X) 493 { 494 return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X); 495 } 496 497 extern __inline __m256i 498 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 499 _mm256_cvtepu8_epi64 (__m128i __X) 500 { 501 return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X); 502 } 503 504 extern __inline __m256i 505 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 506 _mm256_cvtepu16_epi32 (__m128i __X) 507 { 508 return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X); 509 } 510 511 extern __inline __m256i 512 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 513 _mm256_cvtepu16_epi64 (__m128i __X) 514 { 515 return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X); 516 } 517 518 extern __inline __m256i 519 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 520 _mm256_cvtepu32_epi64 (__m128i __X) 521 { 522 return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X); 523 } 524 525 extern __inline __m256i 526 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 527 _mm256_mul_epi32 (__m256i __X, __m256i __Y) 528 { 529 return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y); 530 } 531 532 extern __inline __m256i 533 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 534 _mm256_mulhrs_epi16 (__m256i __X, __m256i __Y) 535 { 536 return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X, 537 (__v16hi)__Y); 538 } 539 540 extern __inline __m256i 541 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 542 _mm256_mulhi_epu16 (__m256i __A, __m256i __B) 543 { 544 return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B); 545 } 546 547 extern __inline __m256i 548 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 549 _mm256_mulhi_epi16 (__m256i __A, __m256i __B) 550 { 551 return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B); 552 } 553 554 extern __inline __m256i 555 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 556 _mm256_mullo_epi16 (__m256i __A, __m256i __B) 557 { 558 return (__m256i)__builtin_ia32_pmullw256 ((__v16hi)__A, (__v16hi)__B); 559 } 560 561 extern __inline __m256i 562 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 563 _mm256_mullo_epi32 (__m256i __A, __m256i __B) 564 { 565 return (__m256i)__builtin_ia32_pmulld256 ((__v8si)__A, (__v8si)__B); 566 } 567 568 extern __inline __m256i 569 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 570 _mm256_mul_epu32 (__m256i __A, __m256i __B) 571 { 572 return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B); 573 } 574 575 extern __inline __m256i 576 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 577 _mm256_or_si256 (__m256i __A, __m256i __B) 578 { 579 return (__m256i)__builtin_ia32_por256 ((__v4di)__A, (__v4di)__B); 580 } 581 582 extern __inline __m256i 583 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 584 _mm256_sad_epu8 (__m256i __A, __m256i __B) 585 { 586 return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B); 587 } 588 589 extern __inline __m256i 590 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 591 _mm256_shuffle_epi8 (__m256i __X, __m256i __Y) 592 { 593 return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X, 594 (__v32qi)__Y); 595 } 596 597 #ifdef __OPTIMIZE__ 598 extern __inline __m256i 599 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 600 _mm256_shuffle_epi32 (__m256i __A, const int __mask) 601 { 602 return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask); 603 } 604 605 extern __inline __m256i 606 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 607 _mm256_shufflehi_epi16 (__m256i __A, const int __mask) 608 { 609 return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask); 610 } 611 612 extern __inline __m256i 613 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 614 _mm256_shufflelo_epi16 (__m256i __A, const int __mask) 615 { 616 return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask); 617 } 618 #else 619 #define _mm256_shuffle_epi32(A, N) \ 620 ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N))) 621 #define _mm256_shufflehi_epi16(A, N) \ 622 ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N))) 623 #define _mm256_shufflelo_epi16(A, N) \ 624 ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N))) 625 #endif 626 627 extern __inline __m256i 628 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 629 _mm256_sign_epi8 (__m256i __X, __m256i __Y) 630 { 631 return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y); 632 } 633 634 extern __inline __m256i 635 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 636 _mm256_sign_epi16 (__m256i __X, __m256i __Y) 637 { 638 return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y); 639 } 640 641 extern __inline __m256i 642 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 643 _mm256_sign_epi32 (__m256i __X, __m256i __Y) 644 { 645 return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y); 646 } 647 648 #ifdef __OPTIMIZE__ 649 extern __inline __m256i 650 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 651 _mm256_slli_si256 (__m256i __A, const int __N) 652 { 653 return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); 654 } 655 #else 656 #define _mm256_slli_si256(A, N) \ 657 ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) 658 #endif 659 660 extern __inline __m256i 661 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 662 _mm256_slli_epi16 (__m256i __A, int __B) 663 { 664 return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); 665 } 666 667 extern __inline __m256i 668 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 669 _mm256_sll_epi16 (__m256i __A, __m128i __B) 670 { 671 return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B); 672 } 673 674 extern __inline __m256i 675 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 676 _mm256_slli_epi32 (__m256i __A, int __B) 677 { 678 return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B); 679 } 680 681 extern __inline __m256i 682 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 683 _mm256_sll_epi32 (__m256i __A, __m128i __B) 684 { 685 return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B); 686 } 687 688 extern __inline __m256i 689 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 690 _mm256_slli_epi64 (__m256i __A, int __B) 691 { 692 return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B); 693 } 694 695 extern __inline __m256i 696 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 697 _mm256_sll_epi64 (__m256i __A, __m128i __B) 698 { 699 return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B); 700 } 701 702 extern __inline __m256i 703 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 704 _mm256_srai_epi16 (__m256i __A, int __B) 705 { 706 return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B); 707 } 708 709 extern __inline __m256i 710 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 711 _mm256_sra_epi16 (__m256i __A, __m128i __B) 712 { 713 return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B); 714 } 715 716 extern __inline __m256i 717 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 718 _mm256_srai_epi32 (__m256i __A, int __B) 719 { 720 return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B); 721 } 722 723 extern __inline __m256i 724 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 725 _mm256_sra_epi32 (__m256i __A, __m128i __B) 726 { 727 return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B); 728 } 729 730 #ifdef __OPTIMIZE__ 731 extern __inline __m256i 732 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 733 _mm256_srli_si256 (__m256i __A, const int __N) 734 { 735 return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); 736 } 737 #else 738 #define _mm256_srli_si256(A, N) \ 739 ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) 740 #endif 741 742 extern __inline __m256i 743 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 744 _mm256_srli_epi16 (__m256i __A, int __B) 745 { 746 return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B); 747 } 748 749 extern __inline __m256i 750 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 751 _mm256_srl_epi16 (__m256i __A, __m128i __B) 752 { 753 return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B); 754 } 755 756 extern __inline __m256i 757 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 758 _mm256_srli_epi32 (__m256i __A, int __B) 759 { 760 return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B); 761 } 762 763 extern __inline __m256i 764 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 765 _mm256_srl_epi32 (__m256i __A, __m128i __B) 766 { 767 return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B); 768 } 769 770 extern __inline __m256i 771 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 772 _mm256_srli_epi64 (__m256i __A, int __B) 773 { 774 return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B); 775 } 776 777 extern __inline __m256i 778 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 779 _mm256_srl_epi64 (__m256i __A, __m128i __B) 780 { 781 return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B); 782 } 783 784 extern __inline __m256i 785 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 786 _mm256_sub_epi8 (__m256i __A, __m256i __B) 787 { 788 return (__m256i)__builtin_ia32_psubb256 ((__v32qi)__A, (__v32qi)__B); 789 } 790 791 extern __inline __m256i 792 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 793 _mm256_sub_epi16 (__m256i __A, __m256i __B) 794 { 795 return (__m256i)__builtin_ia32_psubw256 ((__v16hi)__A, (__v16hi)__B); 796 } 797 798 extern __inline __m256i 799 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 800 _mm256_sub_epi32 (__m256i __A, __m256i __B) 801 { 802 return (__m256i)__builtin_ia32_psubd256 ((__v8si)__A, (__v8si)__B); 803 } 804 805 extern __inline __m256i 806 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 807 _mm256_sub_epi64 (__m256i __A, __m256i __B) 808 { 809 return (__m256i)__builtin_ia32_psubq256 ((__v4di)__A, (__v4di)__B); 810 } 811 812 extern __inline __m256i 813 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 814 _mm256_subs_epi8 (__m256i __A, __m256i __B) 815 { 816 return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B); 817 } 818 819 extern __inline __m256i 820 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 821 _mm256_subs_epi16 (__m256i __A, __m256i __B) 822 { 823 return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B); 824 } 825 826 extern __inline __m256i 827 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 828 _mm256_subs_epu8 (__m256i __A, __m256i __B) 829 { 830 return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B); 831 } 832 833 extern __inline __m256i 834 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 835 _mm256_subs_epu16 (__m256i __A, __m256i __B) 836 { 837 return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B); 838 } 839 840 extern __inline __m256i 841 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 842 _mm256_unpackhi_epi8 (__m256i __A, __m256i __B) 843 { 844 return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B); 845 } 846 847 extern __inline __m256i 848 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 849 _mm256_unpackhi_epi16 (__m256i __A, __m256i __B) 850 { 851 return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B); 852 } 853 854 extern __inline __m256i 855 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 856 _mm256_unpackhi_epi32 (__m256i __A, __m256i __B) 857 { 858 return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B); 859 } 860 861 extern __inline __m256i 862 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 863 _mm256_unpackhi_epi64 (__m256i __A, __m256i __B) 864 { 865 return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B); 866 } 867 868 extern __inline __m256i 869 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 870 _mm256_unpacklo_epi8 (__m256i __A, __m256i __B) 871 { 872 return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B); 873 } 874 875 extern __inline __m256i 876 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 877 _mm256_unpacklo_epi16 (__m256i __A, __m256i __B) 878 { 879 return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B); 880 } 881 882 extern __inline __m256i 883 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 884 _mm256_unpacklo_epi32 (__m256i __A, __m256i __B) 885 { 886 return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B); 887 } 888 889 extern __inline __m256i 890 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 891 _mm256_unpacklo_epi64 (__m256i __A, __m256i __B) 892 { 893 return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B); 894 } 895 896 extern __inline __m256i 897 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 898 _mm256_xor_si256 (__m256i __A, __m256i __B) 899 { 900 return (__m256i)__builtin_ia32_pxor256 ((__v4di)__A, (__v4di)__B); 901 } 902 903 extern __inline __m256i 904 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 905 _mm256_stream_load_si256 (__m256i const *__X) 906 { 907 return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X); 908 } 909 910 extern __inline __m128 911 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 912 _mm_broadcastss_ps (__m128 __X) 913 { 914 return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X); 915 } 916 917 extern __inline __m256 918 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 919 _mm256_broadcastss_ps (__m128 __X) 920 { 921 return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X); 922 } 923 924 extern __inline __m256d 925 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 926 _mm256_broadcastsd_pd (__m128d __X) 927 { 928 return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X); 929 } 930 931 extern __inline __m256i 932 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 933 _mm256_broadcastsi128_si256 (__m128i __X) 934 { 935 return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X); 936 } 937 938 #ifdef __OPTIMIZE__ 939 extern __inline __m128i 940 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 941 _mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M) 942 { 943 return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X, 944 (__v4si)__Y, 945 __M); 946 } 947 #else 948 #define _mm_blend_epi32(X, Y, M) \ 949 ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \ 950 (__v4si)(__m128i)(Y), (int)(M))) 951 #endif 952 953 #ifdef __OPTIMIZE__ 954 extern __inline __m256i 955 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 956 _mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M) 957 { 958 return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X, 959 (__v8si)__Y, 960 __M); 961 } 962 #else 963 #define _mm256_blend_epi32(X, Y, M) \ 964 ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \ 965 (__v8si)(__m256i)(Y), (int)(M))) 966 #endif 967 968 extern __inline __m256i 969 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 970 _mm256_broadcastb_epi8 (__m128i __X) 971 { 972 return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X); 973 } 974 975 extern __inline __m256i 976 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 977 _mm256_broadcastw_epi16 (__m128i __X) 978 { 979 return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X); 980 } 981 982 extern __inline __m256i 983 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 984 _mm256_broadcastd_epi32 (__m128i __X) 985 { 986 return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X); 987 } 988 989 extern __inline __m256i 990 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 991 _mm256_broadcastq_epi64 (__m128i __X) 992 { 993 return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X); 994 } 995 996 extern __inline __m128i 997 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 998 _mm_broadcastb_epi8 (__m128i __X) 999 { 1000 return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X); 1001 } 1002 1003 extern __inline __m128i 1004 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1005 _mm_broadcastw_epi16 (__m128i __X) 1006 { 1007 return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X); 1008 } 1009 1010 extern __inline __m128i 1011 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1012 _mm_broadcastd_epi32 (__m128i __X) 1013 { 1014 return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X); 1015 } 1016 1017 extern __inline __m128i 1018 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1019 _mm_broadcastq_epi64 (__m128i __X) 1020 { 1021 return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X); 1022 } 1023 1024 extern __inline __m256i 1025 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1026 _mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y) 1027 { 1028 return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y); 1029 } 1030 1031 #ifdef __OPTIMIZE__ 1032 extern __inline __m256d 1033 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1034 _mm256_permute4x64_pd (__m256d __X, const int __M) 1035 { 1036 return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M); 1037 } 1038 #else 1039 #define _mm256_permute4x64_pd(X, M) \ 1040 ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M))) 1041 #endif 1042 1043 extern __inline __m256 1044 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1045 _mm256_permutevar8x32_ps (__m256 __X, __m256i __Y) 1046 { 1047 return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y); 1048 } 1049 1050 #ifdef __OPTIMIZE__ 1051 extern __inline __m256i 1052 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1053 _mm256_permute4x64_epi64 (__m256i __X, const int __M) 1054 { 1055 return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M); 1056 } 1057 #else 1058 #define _mm256_permute4x64_epi64(X, M) \ 1059 ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M))) 1060 #endif 1061 1062 1063 #ifdef __OPTIMIZE__ 1064 extern __inline __m256i 1065 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1066 _mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M) 1067 { 1068 return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M); 1069 } 1070 #else 1071 #define _mm256_permute2x128_si256(X, Y, M) \ 1072 ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M))) 1073 #endif 1074 1075 #ifdef __OPTIMIZE__ 1076 extern __inline __m128i 1077 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1078 _mm256_extracti128_si256 (__m256i __X, const int __M) 1079 { 1080 return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M); 1081 } 1082 #else 1083 #define _mm256_extracti128_si256(X, M) \ 1084 ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M))) 1085 #endif 1086 1087 #ifdef __OPTIMIZE__ 1088 extern __inline __m256i 1089 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1090 _mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M) 1091 { 1092 return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M); 1093 } 1094 #else 1095 #define _mm256_inserti128_si256(X, Y, M) \ 1096 ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \ 1097 (__v2di)(__m128i)(Y), \ 1098 (int)(M))) 1099 #endif 1100 1101 extern __inline __m256i 1102 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1103 _mm256_maskload_epi32 (int const *__X, __m256i __M ) 1104 { 1105 return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X, 1106 (__v8si)__M); 1107 } 1108 1109 extern __inline __m256i 1110 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1111 _mm256_maskload_epi64 (long long const *__X, __m256i __M ) 1112 { 1113 return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X, 1114 (__v4di)__M); 1115 } 1116 1117 extern __inline __m128i 1118 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1119 _mm_maskload_epi32 (int const *__X, __m128i __M ) 1120 { 1121 return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X, 1122 (__v4si)__M); 1123 } 1124 1125 extern __inline __m128i 1126 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1127 _mm_maskload_epi64 (long long const *__X, __m128i __M ) 1128 { 1129 return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X, 1130 (__v2di)__M); 1131 } 1132 1133 extern __inline void 1134 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1135 _mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y ) 1136 { 1137 __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y); 1138 } 1139 1140 extern __inline void 1141 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1142 _mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y ) 1143 { 1144 __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y); 1145 } 1146 1147 extern __inline void 1148 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1149 _mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y ) 1150 { 1151 __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y); 1152 } 1153 1154 extern __inline void 1155 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1156 _mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y ) 1157 { 1158 __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y); 1159 } 1160 1161 extern __inline __m256i 1162 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1163 _mm256_sllv_epi32 (__m256i __X, __m256i __Y) 1164 { 1165 return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y); 1166 } 1167 1168 extern __inline __m128i 1169 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1170 _mm_sllv_epi32 (__m128i __X, __m128i __Y) 1171 { 1172 return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y); 1173 } 1174 1175 extern __inline __m256i 1176 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1177 _mm256_sllv_epi64 (__m256i __X, __m256i __Y) 1178 { 1179 return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y); 1180 } 1181 1182 extern __inline __m128i 1183 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1184 _mm_sllv_epi64 (__m128i __X, __m128i __Y) 1185 { 1186 return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y); 1187 } 1188 1189 extern __inline __m256i 1190 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1191 _mm256_srav_epi32 (__m256i __X, __m256i __Y) 1192 { 1193 return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y); 1194 } 1195 1196 extern __inline __m128i 1197 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1198 _mm_srav_epi32 (__m128i __X, __m128i __Y) 1199 { 1200 return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y); 1201 } 1202 1203 extern __inline __m256i 1204 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1205 _mm256_srlv_epi32 (__m256i __X, __m256i __Y) 1206 { 1207 return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y); 1208 } 1209 1210 extern __inline __m128i 1211 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1212 _mm_srlv_epi32 (__m128i __X, __m128i __Y) 1213 { 1214 return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y); 1215 } 1216 1217 extern __inline __m256i 1218 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1219 _mm256_srlv_epi64 (__m256i __X, __m256i __Y) 1220 { 1221 return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y); 1222 } 1223 1224 extern __inline __m128i 1225 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1226 _mm_srlv_epi64 (__m128i __X, __m128i __Y) 1227 { 1228 return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y); 1229 } 1230 1231 #ifdef __OPTIMIZE__ 1232 extern __inline __m128d 1233 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1234 _mm_i32gather_pd (double const *base, __m128i index, const int scale) 1235 { 1236 __v2df zero = _mm_setzero_pd (); 1237 __v2df mask = _mm_cmpeq_pd (zero, zero); 1238 1239 return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (), 1240 base, 1241 (__v4si)index, 1242 mask, 1243 scale); 1244 } 1245 1246 extern __inline __m128d 1247 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1248 _mm_mask_i32gather_pd (__m128d src, double const *base, __m128i index, 1249 __m128d mask, const int scale) 1250 { 1251 return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)src, 1252 base, 1253 (__v4si)index, 1254 (__v2df)mask, 1255 scale); 1256 } 1257 1258 extern __inline __m256d 1259 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1260 _mm256_i32gather_pd (double const *base, __m128i index, const int scale) 1261 { 1262 __v4df zero = _mm256_setzero_pd (); 1263 __v4df mask = _mm256_cmp_pd (zero, zero, _CMP_EQ_OQ); 1264 1265 return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (), 1266 base, 1267 (__v4si)index, 1268 mask, 1269 scale); 1270 } 1271 1272 extern __inline __m256d 1273 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1274 _mm256_mask_i32gather_pd (__m256d src, double const *base, 1275 __m128i index, __m256d mask, const int scale) 1276 { 1277 return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)src, 1278 base, 1279 (__v4si)index, 1280 (__v4df)mask, 1281 scale); 1282 } 1283 1284 extern __inline __m128d 1285 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1286 _mm_i64gather_pd (double const *base, __m128i index, const int scale) 1287 { 1288 __v2df src = _mm_setzero_pd (); 1289 __v2df mask = _mm_cmpeq_pd (src, src); 1290 1291 return (__m128d) __builtin_ia32_gatherdiv2df (src, 1292 base, 1293 (__v2di)index, 1294 mask, 1295 scale); 1296 } 1297 1298 extern __inline __m128d 1299 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1300 _mm_mask_i64gather_pd (__m128d src, double const *base, __m128i index, 1301 __m128d mask, const int scale) 1302 { 1303 return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)src, 1304 base, 1305 (__v2di)index, 1306 (__v2df)mask, 1307 scale); 1308 } 1309 1310 extern __inline __m256d 1311 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1312 _mm256_i64gather_pd (double const *base, __m256i index, const int scale) 1313 { 1314 __v4df src = _mm256_setzero_pd (); 1315 __v4df mask = _mm256_cmp_pd (src, src, _CMP_EQ_OQ); 1316 1317 return (__m256d) __builtin_ia32_gatherdiv4df (src, 1318 base, 1319 (__v4di)index, 1320 mask, 1321 scale); 1322 } 1323 1324 extern __inline __m256d 1325 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1326 _mm256_mask_i64gather_pd (__m256d src, double const *base, 1327 __m256i index, __m256d mask, const int scale) 1328 { 1329 return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)src, 1330 base, 1331 (__v4di)index, 1332 (__v4df)mask, 1333 scale); 1334 } 1335 1336 extern __inline __m128 1337 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1338 _mm_i32gather_ps (float const *base, __m128i index, const int scale) 1339 { 1340 __v4sf src = _mm_setzero_ps (); 1341 __v4sf mask = _mm_cmpeq_ps (src, src); 1342 1343 return (__m128) __builtin_ia32_gathersiv4sf (src, 1344 base, 1345 (__v4si)index, 1346 mask, 1347 scale); 1348 } 1349 1350 extern __inline __m128 1351 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1352 _mm_mask_i32gather_ps (__m128 src, float const *base, __m128i index, 1353 __m128 mask, const int scale) 1354 { 1355 return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)src, 1356 base, 1357 (__v4si)index, 1358 (__v4sf)mask, 1359 scale); 1360 } 1361 1362 extern __inline __m256 1363 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1364 _mm256_i32gather_ps (float const *base, __m256i index, const int scale) 1365 { 1366 __v8sf src = _mm256_setzero_ps (); 1367 __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ); 1368 1369 return (__m256) __builtin_ia32_gathersiv8sf (src, 1370 base, 1371 (__v8si)index, 1372 mask, 1373 scale); 1374 } 1375 1376 extern __inline __m256 1377 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1378 _mm256_mask_i32gather_ps (__m256 src, float const *base, 1379 __m256i index, __m256 mask, const int scale) 1380 { 1381 return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)src, 1382 base, 1383 (__v8si)index, 1384 (__v8sf)mask, 1385 scale); 1386 } 1387 1388 extern __inline __m128 1389 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1390 _mm_i64gather_ps (float const *base, __m128i index, const int scale) 1391 { 1392 __v4sf src = _mm_setzero_ps (); 1393 __v4sf mask = _mm_cmpeq_ps (src, src); 1394 1395 return (__m128) __builtin_ia32_gatherdiv4sf (src, 1396 base, 1397 (__v2di)index, 1398 mask, 1399 scale); 1400 } 1401 1402 extern __inline __m128 1403 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1404 _mm_mask_i64gather_ps (__m128 src, float const *base, __m128i index, 1405 __m128 mask, const int scale) 1406 { 1407 return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)src, 1408 base, 1409 (__v2di)index, 1410 (__v4sf)mask, 1411 scale); 1412 } 1413 1414 extern __inline __m128 1415 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1416 _mm256_i64gather_ps (float const *base, __m256i index, const int scale) 1417 { 1418 __v4sf src = _mm_setzero_ps (); 1419 __v4sf mask = _mm_cmpeq_ps (src, src); 1420 1421 return (__m128) __builtin_ia32_gatherdiv4sf256 (src, 1422 base, 1423 (__v4di)index, 1424 mask, 1425 scale); 1426 } 1427 1428 extern __inline __m128 1429 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1430 _mm256_mask_i64gather_ps (__m128 src, float const *base, 1431 __m256i index, __m128 mask, const int scale) 1432 { 1433 return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)src, 1434 base, 1435 (__v4di)index, 1436 (__v4sf)mask, 1437 scale); 1438 } 1439 1440 extern __inline __m128i 1441 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1442 _mm_i32gather_epi64 (long long int const *base, 1443 __m128i index, const int scale) 1444 { 1445 __v2di src = __extension__ (__v2di){ 0, 0 }; 1446 __v2di mask = __extension__ (__v2di){ ~0, ~0 }; 1447 1448 return (__m128i) __builtin_ia32_gathersiv2di (src, 1449 base, 1450 (__v4si)index, 1451 mask, 1452 scale); 1453 } 1454 1455 extern __inline __m128i 1456 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1457 _mm_mask_i32gather_epi64 (__m128i src, long long int const *base, 1458 __m128i index, __m128i mask, const int scale) 1459 { 1460 return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)src, 1461 base, 1462 (__v4si)index, 1463 (__v2di)mask, 1464 scale); 1465 } 1466 1467 extern __inline __m256i 1468 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1469 _mm256_i32gather_epi64 (long long int const *base, 1470 __m128i index, const int scale) 1471 { 1472 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 }; 1473 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; 1474 1475 return (__m256i) __builtin_ia32_gathersiv4di (src, 1476 base, 1477 (__v4si)index, 1478 mask, 1479 scale); 1480 } 1481 1482 extern __inline __m256i 1483 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1484 _mm256_mask_i32gather_epi64 (__m256i src, long long int const *base, 1485 __m128i index, __m256i mask, const int scale) 1486 { 1487 return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)src, 1488 base, 1489 (__v4si)index, 1490 (__v4di)mask, 1491 scale); 1492 } 1493 1494 extern __inline __m128i 1495 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1496 _mm_i64gather_epi64 (long long int const *base, 1497 __m128i index, const int scale) 1498 { 1499 __v2di src = __extension__ (__v2di){ 0, 0 }; 1500 __v2di mask = __extension__ (__v2di){ ~0, ~0 }; 1501 1502 return (__m128i) __builtin_ia32_gatherdiv2di (src, 1503 base, 1504 (__v2di)index, 1505 mask, 1506 scale); 1507 } 1508 1509 extern __inline __m128i 1510 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1511 _mm_mask_i64gather_epi64 (__m128i src, long long int const *base, __m128i index, 1512 __m128i mask, const int scale) 1513 { 1514 return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)src, 1515 base, 1516 (__v2di)index, 1517 (__v2di)mask, 1518 scale); 1519 } 1520 1521 extern __inline __m256i 1522 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1523 _mm256_i64gather_epi64 (long long int const *base, 1524 __m256i index, const int scale) 1525 { 1526 __v4di src = __extension__ (__v4di){ 0, 0, 0, 0 }; 1527 __v4di mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; 1528 1529 return (__m256i) __builtin_ia32_gatherdiv4di (src, 1530 base, 1531 (__v4di)index, 1532 mask, 1533 scale); 1534 } 1535 1536 extern __inline __m256i 1537 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1538 _mm256_mask_i64gather_epi64 (__m256i src, long long int const *base, 1539 __m256i index, __m256i mask, const int scale) 1540 { 1541 return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)src, 1542 base, 1543 (__v4di)index, 1544 (__v4di)mask, 1545 scale); 1546 } 1547 1548 extern __inline __m128i 1549 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1550 _mm_i32gather_epi32 (int const *base, __m128i index, const int scale) 1551 { 1552 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1553 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1554 1555 return (__m128i) __builtin_ia32_gathersiv4si (src, 1556 base, 1557 (__v4si)index, 1558 mask, 1559 scale); 1560 } 1561 1562 extern __inline __m128i 1563 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1564 _mm_mask_i32gather_epi32 (__m128i src, int const *base, __m128i index, 1565 __m128i mask, const int scale) 1566 { 1567 return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)src, 1568 base, 1569 (__v4si)index, 1570 (__v4si)mask, 1571 scale); 1572 } 1573 1574 extern __inline __m256i 1575 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1576 _mm256_i32gather_epi32 (int const *base, __m256i index, const int scale) 1577 { 1578 __v8si src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 }; 1579 __v8si mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }; 1580 1581 return (__m256i) __builtin_ia32_gathersiv8si (src, 1582 base, 1583 (__v8si)index, 1584 mask, 1585 scale); 1586 } 1587 1588 extern __inline __m256i 1589 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1590 _mm256_mask_i32gather_epi32 (__m256i src, int const *base, 1591 __m256i index, __m256i mask, const int scale) 1592 { 1593 return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)src, 1594 base, 1595 (__v8si)index, 1596 (__v8si)mask, 1597 scale); 1598 } 1599 1600 extern __inline __m128i 1601 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1602 _mm_i64gather_epi32 (int const *base, __m128i index, const int scale) 1603 { 1604 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1605 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1606 1607 return (__m128i) __builtin_ia32_gatherdiv4si (src, 1608 base, 1609 (__v2di)index, 1610 mask, 1611 scale); 1612 } 1613 1614 extern __inline __m128i 1615 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1616 _mm_mask_i64gather_epi32 (__m128i src, int const *base, __m128i index, 1617 __m128i mask, const int scale) 1618 { 1619 return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)src, 1620 base, 1621 (__v2di)index, 1622 (__v4si)mask, 1623 scale); 1624 } 1625 1626 extern __inline __m128i 1627 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1628 _mm256_i64gather_epi32 (int const *base, __m256i index, const int scale) 1629 { 1630 __v4si src = __extension__ (__v4si){ 0, 0, 0, 0 }; 1631 __v4si mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; 1632 1633 return (__m128i) __builtin_ia32_gatherdiv4si256 (src, 1634 base, 1635 (__v4di)index, 1636 mask, 1637 scale); 1638 } 1639 1640 extern __inline __m128i 1641 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 1642 _mm256_mask_i64gather_epi32 (__m128i src, int const *base, 1643 __m256i index, __m128i mask, const int scale) 1644 { 1645 return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)src, 1646 base, 1647 (__v4di)index, 1648 (__v4si)mask, 1649 scale); 1650 } 1651 #else /* __OPTIMIZE__ */ 1652 #define _mm_i32gather_pd(BASE, INDEX, SCALE) \ 1653 (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \ 1654 (double const *)BASE, \ 1655 (__v4si)(__m128i)INDEX, \ 1656 (__v2df)_mm_set1_pd( \ 1657 (double)(long long int) -1), \ 1658 (int)SCALE) 1659 1660 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1661 (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC, \ 1662 (double const *)BASE, \ 1663 (__v4si)(__m128i)INDEX, \ 1664 (__v2df)(__m128d)MASK, \ 1665 (int)SCALE) 1666 1667 #define _mm256_i32gather_pd(BASE, INDEX, SCALE) \ 1668 (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \ 1669 (double const *)BASE, \ 1670 (__v4si)(__m128i)INDEX, \ 1671 (__v4df)_mm256_set1_pd( \ 1672 (double)(long long int) -1), \ 1673 (int)SCALE) 1674 1675 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1676 (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC, \ 1677 (double const *)BASE, \ 1678 (__v4si)(__m128i)INDEX, \ 1679 (__v4df)(__m256d)MASK, \ 1680 (int)SCALE) 1681 1682 #define _mm_i64gather_pd(BASE, INDEX, SCALE) \ 1683 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \ 1684 (double const *)BASE, \ 1685 (__v2di)(__m128i)INDEX, \ 1686 (__v2df)_mm_set1_pd( \ 1687 (double)(long long int) -1), \ 1688 (int)SCALE) 1689 1690 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1691 (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC, \ 1692 (double const *)BASE, \ 1693 (__v2di)(__m128i)INDEX, \ 1694 (__v2df)(__m128d)MASK, \ 1695 (int)SCALE) 1696 1697 #define _mm256_i64gather_pd(BASE, INDEX, SCALE) \ 1698 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \ 1699 (double const *)BASE, \ 1700 (__v4di)(__m256i)INDEX, \ 1701 (__v4df)_mm256_set1_pd( \ 1702 (double)(long long int) -1), \ 1703 (int)SCALE) 1704 1705 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ 1706 (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC, \ 1707 (double const *)BASE, \ 1708 (__v4di)(__m256i)INDEX, \ 1709 (__v4df)(__m256d)MASK, \ 1710 (int)SCALE) 1711 1712 #define _mm_i32gather_ps(BASE, INDEX, SCALE) \ 1713 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \ 1714 (float const *)BASE, \ 1715 (__v4si)(__m128i)INDEX, \ 1716 _mm_set1_ps ((float)(int) -1), \ 1717 (int)SCALE) 1718 1719 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1720 (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC, \ 1721 (float const *)BASE, \ 1722 (__v4si)(__m128i)INDEX, \ 1723 (__v4sf)(__m128d)MASK, \ 1724 (int)SCALE) 1725 1726 #define _mm256_i32gather_ps(BASE, INDEX, SCALE) \ 1727 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \ 1728 (float const *)BASE, \ 1729 (__v8si)(__m256i)INDEX, \ 1730 (__v8sf)_mm256_set1_ps ( \ 1731 (float)(int) -1), \ 1732 (int)SCALE) 1733 1734 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1735 (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC, \ 1736 (float const *)BASE, \ 1737 (__v8si)(__m256i)INDEX, \ 1738 (__v8sf)(__m256d)MASK, \ 1739 (int)SCALE) 1740 1741 #define _mm_i64gather_ps(BASE, INDEX, SCALE) \ 1742 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \ 1743 (float const *)BASE, \ 1744 (__v2di)(__m128i)INDEX, \ 1745 (__v4sf)_mm_set1_ps ( \ 1746 (float)(int) -1), \ 1747 (int)SCALE) 1748 1749 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1750 (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC, \ 1751 (float const *)BASE, \ 1752 (__v2di)(__m128i)INDEX, \ 1753 (__v4sf)(__m128d)MASK, \ 1754 (int)SCALE) 1755 1756 #define _mm256_i64gather_ps(BASE, INDEX, SCALE) \ 1757 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \ 1758 (float const *)BASE, \ 1759 (__v4di)(__m256i)INDEX, \ 1760 (__v4sf)_mm_set1_ps( \ 1761 (float)(int) -1), \ 1762 (int)SCALE) 1763 1764 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ 1765 (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC, \ 1766 (float const *)BASE, \ 1767 (__v4di)(__m256i)INDEX, \ 1768 (__v4sf)(__m128)MASK, \ 1769 (int)SCALE) 1770 1771 #define _mm_i32gather_epi64(BASE, INDEX, SCALE) \ 1772 (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \ 1773 (long long const *)BASE, \ 1774 (__v4si)(__m128i)INDEX, \ 1775 (__v2di)_mm_set1_epi64x (-1), \ 1776 (int)SCALE) 1777 1778 #define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1779 (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i)SRC, \ 1780 (long long const *)BASE, \ 1781 (__v4si)(__m128i)INDEX, \ 1782 (__v2di)(__m128i)MASK, \ 1783 (int)SCALE) 1784 1785 #define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \ 1786 (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \ 1787 (long long const *)BASE, \ 1788 (__v4si)(__m128i)INDEX, \ 1789 (__v4di)_mm256_set1_epi64x (-1), \ 1790 (int)SCALE) 1791 1792 #define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1793 (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i)SRC, \ 1794 (long long const *)BASE, \ 1795 (__v4si)(__m128i)INDEX, \ 1796 (__v4di)(__m256i)MASK, \ 1797 (int)SCALE) 1798 1799 #define _mm_i64gather_epi64(BASE, INDEX, SCALE) \ 1800 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \ 1801 (long long const *)BASE, \ 1802 (__v2di)(__m128i)INDEX, \ 1803 (__v2di)_mm_set1_epi64x (-1), \ 1804 (int)SCALE) 1805 1806 #define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1807 (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i)SRC, \ 1808 (long long const *)BASE, \ 1809 (__v2di)(__m128i)INDEX, \ 1810 (__v2di)(__m128i)MASK, \ 1811 (int)SCALE) 1812 1813 #define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \ 1814 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \ 1815 (long long const *)BASE, \ 1816 (__v4di)(__m256i)INDEX, \ 1817 (__v4di)_mm256_set1_epi64x (-1), \ 1818 (int)SCALE) 1819 1820 #define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ 1821 (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i)SRC, \ 1822 (long long const *)BASE, \ 1823 (__v4di)(__m256i)INDEX, \ 1824 (__v4di)(__m256i)MASK, \ 1825 (int)SCALE) 1826 1827 #define _mm_i32gather_epi32(BASE, INDEX, SCALE) \ 1828 (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \ 1829 (int const *)BASE, \ 1830 (__v4si)(__m128i)INDEX, \ 1831 (__v4si)_mm_set1_epi32 (-1), \ 1832 (int)SCALE) 1833 1834 #define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1835 (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i)SRC, \ 1836 (int const *)BASE, \ 1837 (__v4si)(__m128i)INDEX, \ 1838 (__v4si)(__m128i)MASK, \ 1839 (int)SCALE) 1840 1841 #define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \ 1842 (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \ 1843 (int const *)BASE, \ 1844 (__v8si)(__m256i)INDEX, \ 1845 (__v8si)_mm256_set1_epi32 (-1), \ 1846 (int)SCALE) 1847 1848 #define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1849 (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i)SRC, \ 1850 (int const *)BASE, \ 1851 (__v8si)(__m256i)INDEX, \ 1852 (__v8si)(__m256i)MASK, \ 1853 (int)SCALE) 1854 1855 #define _mm_i64gather_epi32(BASE, INDEX, SCALE) \ 1856 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \ 1857 (int const *)BASE, \ 1858 (__v2di)(__m128i)INDEX, \ 1859 (__v4si)_mm_set1_epi32 (-1), \ 1860 (int)SCALE) 1861 1862 #define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1863 (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i)SRC, \ 1864 (int const *)BASE, \ 1865 (__v2di)(__m128i)INDEX, \ 1866 (__v4si)(__m128i)MASK, \ 1867 (int)SCALE) 1868 1869 #define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \ 1870 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \ 1871 (int const *)BASE, \ 1872 (__v4di)(__m256i)INDEX, \ 1873 (__v4si)_mm_set1_epi32(-1), \ 1874 (int)SCALE) 1875 1876 #define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ 1877 (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i)SRC, \ 1878 (int const *)BASE, \ 1879 (__v4di)(__m256i)INDEX, \ 1880 (__v4si)(__m128i)MASK, \ 1881 (int)SCALE) 1882 #endif /* __OPTIMIZE__ */ 1883 1884 #ifdef __DISABLE_AVX2__ 1885 #undef __DISABLE_AVX2__ 1886 #pragma GCC pop_options 1887 #endif /* __DISABLE_AVX2__ */ 1888 1889 #endif /* _AVX2INTRIN_H_INCLUDED */ 1890