; LLVM X86 codegen regression test: AVX2 intrinsic lowering (code-viewer navigation header removed).
; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=core-avx2 -mattr=avx2 | FileCheck %s
      2 
      3 define <16 x i16> @test_x86_avx2_packssdw(<8 x i32> %a0, <8 x i32> %a1) {
      4   ; CHECK: vpackssdw
      5   %res = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
      6   ret <16 x i16> %res
      7 }
      8 declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
      9 
     10 
     11 define <32 x i8> @test_x86_avx2_packsswb(<16 x i16> %a0, <16 x i16> %a1) {
     12   ; CHECK: vpacksswb
     13   %res = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
     14   ret <32 x i8> %res
     15 }
     16 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
     17 
     18 
     19 define <32 x i8> @test_x86_avx2_packuswb(<16 x i16> %a0, <16 x i16> %a1) {
     20   ; CHECK: vpackuswb
     21   %res = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1) ; <<32 x i8>> [#uses=1]
     22   ret <32 x i8> %res
     23 }
     24 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
     25 
     26 
     27 define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
     28   ; CHECK: vpaddsb
     29   %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
     30   ret <32 x i8> %res
     31 }
     32 declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
     33 
     34 
     35 define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
     36   ; CHECK: vpaddsw
     37   %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
     38   ret <16 x i16> %res
     39 }
     40 declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
     41 
     42 
     43 define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
     44   ; CHECK: vpaddusb
     45   %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
     46   ret <32 x i8> %res
     47 }
     48 declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
     49 
     50 
     51 define <16 x i16> @test_x86_avx2_paddus_w(<16 x i16> %a0, <16 x i16> %a1) {
     52   ; CHECK: vpaddusw
     53   %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
     54   ret <16 x i16> %res
     55 }
     56 declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
     57 
     58 
     59 define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) {
     60   ; CHECK: vpavgb
     61   %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
     62   ret <32 x i8> %res
     63 }
     64 declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
     65 
     66 
     67 define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) {
     68   ; CHECK: vpavgw
     69   %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
     70   ret <16 x i16> %res
     71 }
     72 declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
     73 
     74 
     75 define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
     76   ; CHECK: vpmaddwd
     77   %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1) ; <<8 x i32>> [#uses=1]
     78   ret <8 x i32> %res
     79 }
     80 declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
     81 
     82 
     83 define <16 x i16> @test_x86_avx2_pmaxs_w(<16 x i16> %a0, <16 x i16> %a1) {
     84   ; CHECK: vpmaxsw
     85   %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
     86   ret <16 x i16> %res
     87 }
     88 declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
     89 
     90 
     91 define <32 x i8> @test_x86_avx2_pmaxu_b(<32 x i8> %a0, <32 x i8> %a1) {
     92   ; CHECK: vpmaxub
     93   %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
     94   ret <32 x i8> %res
     95 }
     96 declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
     97 
     98 
     99 define <16 x i16> @test_x86_avx2_pmins_w(<16 x i16> %a0, <16 x i16> %a1) {
    100   ; CHECK: vpminsw
    101   %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    102   ret <16 x i16> %res
    103 }
    104 declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
    105 
    106 
    107 define <32 x i8> @test_x86_avx2_pminu_b(<32 x i8> %a0, <32 x i8> %a1) {
    108   ; CHECK: vpminub
    109   %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
    110   ret <32 x i8> %res
    111 }
    112 declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
    113 
    114 
    115 define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) {
    116   ; CHECK: vpmovmskb
    117   %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
    118   ret i32 %res
    119 }
    120 declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
    121 
    122 
    123 define <16 x i16> @test_x86_avx2_pmulh_w(<16 x i16> %a0, <16 x i16> %a1) {
    124   ; CHECK: vpmulhw
    125   %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    126   ret <16 x i16> %res
    127 }
    128 declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone
    129 
    130 
    131 define <16 x i16> @test_x86_avx2_pmulhu_w(<16 x i16> %a0, <16 x i16> %a1) {
    132   ; CHECK: vpmulhuw
    133   %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    134   ret <16 x i16> %res
    135 }
    136 declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone
    137 
    138 
    139 define <4 x i64> @test_x86_avx2_pmulu_dq(<8 x i32> %a0, <8 x i32> %a1) {
    140   ; CHECK: vpmuludq
    141   %res = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i64>> [#uses=1]
    142   ret <4 x i64> %res
    143 }
    144 declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
    145 
    146 
    147 define <4 x i64> @test_x86_avx2_psad_bw(<32 x i8> %a0, <32 x i8> %a1) {
    148   ; CHECK: vpsadbw
    149   %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; <<4 x i64>> [#uses=1]
    150   ret <4 x i64> %res
    151 }
    152 declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
    153 
    154 
    155 define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) {
    156   ; CHECK: vpslld
    157   %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
    158   ret <8 x i32> %res
    159 }
    160 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
    161 
    162 
    163 define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
    164   ; CHECK: vpslldq
    165   %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
    166   ret <4 x i64> %res
    167 }
    168 declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
    169 
    170 
    171 define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
    172   ; CHECK: vpslldq
    173   %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
    174   ret <4 x i64> %res
    175 }
    176 declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone
    177 
    178 
    179 define <4 x i64> @test_x86_avx2_psll_q(<4 x i64> %a0, <2 x i64> %a1) {
    180   ; CHECK: vpsllq
    181   %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
    182   ret <4 x i64> %res
    183 }
    184 declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
    185 
    186 
    187 define <16 x i16> @test_x86_avx2_psll_w(<16 x i16> %a0, <8 x i16> %a1) {
    188   ; CHECK: vpsllw
    189   %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
    190   ret <16 x i16> %res
    191 }
    192 declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
    193 
    194 
    195 define <8 x i32> @test_x86_avx2_pslli_d(<8 x i32> %a0) {
    196   ; CHECK: vpslld
    197   %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
    198   ret <8 x i32> %res
    199 }
    200 declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone
    201 
    202 
    203 define <4 x i64> @test_x86_avx2_pslli_q(<4 x i64> %a0) {
    204   ; CHECK: vpsllq
    205   %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
    206   ret <4 x i64> %res
    207 }
    208 declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone
    209 
    210 
    211 define <16 x i16> @test_x86_avx2_pslli_w(<16 x i16> %a0) {
    212   ; CHECK: vpsllw
    213   %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
    214   ret <16 x i16> %res
    215 }
    216 declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone
    217 
    218 
    219 define <8 x i32> @test_x86_avx2_psra_d(<8 x i32> %a0, <4 x i32> %a1) {
    220   ; CHECK: vpsrad
    221   %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
    222   ret <8 x i32> %res
    223 }
    224 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
    225 
    226 
    227 define <16 x i16> @test_x86_avx2_psra_w(<16 x i16> %a0, <8 x i16> %a1) {
    228   ; CHECK: vpsraw
    229   %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
    230   ret <16 x i16> %res
    231 }
    232 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
    233 
    234 
    235 define <8 x i32> @test_x86_avx2_psrai_d(<8 x i32> %a0) {
    236   ; CHECK: vpsrad
    237   %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
    238   ret <8 x i32> %res
    239 }
    240 declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
    241 
    242 
    243 define <16 x i16> @test_x86_avx2_psrai_w(<16 x i16> %a0) {
    244   ; CHECK: vpsraw
    245   %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
    246   ret <16 x i16> %res
    247 }
    248 declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
    249 
    250 
    251 define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) {
    252   ; CHECK: vpsrld
    253   %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1) ; <<8 x i32>> [#uses=1]
    254   ret <8 x i32> %res
    255 }
    256 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
    257 
    258 
    259 define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
    260   ; CHECK: vpsrldq
    261   %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
    262   ret <4 x i64> %res
    263 }
    264 declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
    265 
    266 
    267 define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
    268   ; CHECK: vpsrldq
    269   %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
    270   ret <4 x i64> %res
    271 }
    272 declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone
    273 
    274 
    275 define <4 x i64> @test_x86_avx2_psrl_q(<4 x i64> %a0, <2 x i64> %a1) {
    276   ; CHECK: vpsrlq
    277   %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
    278   ret <4 x i64> %res
    279 }
    280 declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
    281 
    282 
    283 define <16 x i16> @test_x86_avx2_psrl_w(<16 x i16> %a0, <8 x i16> %a1) {
    284   ; CHECK: vpsrlw
    285   %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1) ; <<16 x i16>> [#uses=1]
    286   ret <16 x i16> %res
    287 }
    288 declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
    289 
    290 
    291 define <8 x i32> @test_x86_avx2_psrli_d(<8 x i32> %a0) {
    292   ; CHECK: vpsrld
    293   %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %a0, i32 7) ; <<8 x i32>> [#uses=1]
    294   ret <8 x i32> %res
    295 }
    296 declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone
    297 
    298 
    299 define <4 x i64> @test_x86_avx2_psrli_q(<4 x i64> %a0) {
    300   ; CHECK: vpsrlq
    301   %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
    302   ret <4 x i64> %res
    303 }
    304 declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone
    305 
    306 
    307 define <16 x i16> @test_x86_avx2_psrli_w(<16 x i16> %a0) {
    308   ; CHECK: vpsrlw
    309   %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %a0, i32 7) ; <<16 x i16>> [#uses=1]
    310   ret <16 x i16> %res
    311 }
    312 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
    313 
    314 
    315 define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
    316   ; CHECK: vpsubsb
    317   %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
    318   ret <32 x i8> %res
    319 }
    320 declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
    321 
    322 
    323 define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
    324   ; CHECK: vpsubsw
    325   %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    326   ret <16 x i16> %res
    327 }
    328 declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
    329 
    330 
    331 define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
    332   ; CHECK: vpsubusb
    333   %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
    334   ret <32 x i8> %res
    335 }
    336 declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
    337 
    338 
    339 define <16 x i16> @test_x86_avx2_psubus_w(<16 x i16> %a0, <16 x i16> %a1) {
    340   ; CHECK: vpsubusw
    341   %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    342   ret <16 x i16> %res
    343 }
    344 declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
    345 
    346 
    347 define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) {
    348   ; CHECK: vpabsb
    349   %res = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0) ; <<32 x i8>> [#uses=1]
    350   ret <32 x i8> %res
    351 }
    352 declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
    353 
    354 
    355 define <8 x i32> @test_x86_avx2_pabs_d(<8 x i32> %a0) {
    356   ; CHECK: vpabsd
    357   %res = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0) ; <<8 x i32>> [#uses=1]
    358   ret <8 x i32> %res
    359 }
    360 declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
    361 
    362 
    363 define <16 x i16> @test_x86_avx2_pabs_w(<16 x i16> %a0) {
    364   ; CHECK: vpabsw
    365   %res = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0) ; <<16 x i16>> [#uses=1]
    366   ret <16 x i16> %res
    367 }
    368 declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
    369 
    370 
    371 define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
    372   ; CHECK: vphaddd
    373   %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    374   ret <8 x i32> %res
    375 }
    376 declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone
    377 
    378 
    379 define <16 x i16> @test_x86_avx2_phadd_sw(<16 x i16> %a0, <16 x i16> %a1) {
    380   ; CHECK: vphaddsw
    381   %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    382   ret <16 x i16> %res
    383 }
    384 declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone
    385 
    386 
    387 define <16 x i16> @test_x86_avx2_phadd_w(<16 x i16> %a0, <16 x i16> %a1) {
    388   ; CHECK: vphaddw
    389   %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    390   ret <16 x i16> %res
    391 }
    392 declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone
    393 
    394 
    395 define <8 x i32> @test_x86_avx2_phsub_d(<8 x i32> %a0, <8 x i32> %a1) {
    396   ; CHECK: vphsubd
    397   %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    398   ret <8 x i32> %res
    399 }
    400 declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone
    401 
    402 
    403 define <16 x i16> @test_x86_avx2_phsub_sw(<16 x i16> %a0, <16 x i16> %a1) {
    404   ; CHECK: vphsubsw
    405   %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    406   ret <16 x i16> %res
    407 }
    408 declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone
    409 
    410 
    411 define <16 x i16> @test_x86_avx2_phsub_w(<16 x i16> %a0, <16 x i16> %a1) {
    412   ; CHECK: vphsubw
    413   %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    414   ret <16 x i16> %res
    415 }
    416 declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone
    417 
    418 
    419 define <16 x i16> @test_x86_avx2_pmadd_ub_sw(<32 x i8> %a0, <32 x i8> %a1) {
    420   ; CHECK: vpmaddubsw
    421   %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i16>> [#uses=1]
    422   ret <16 x i16> %res
    423 }
    424 declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
    425 
    426 
    427 define <16 x i16> @test_x86_avx2_pmul_hr_sw(<16 x i16> %a0, <16 x i16> %a1) {
    428   ; CHECK: vpmulhrsw
    429   %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    430   ret <16 x i16> %res
    431 }
    432 declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone
    433 
    434 
    435 define <32 x i8> @test_x86_avx2_pshuf_b(<32 x i8> %a0, <32 x i8> %a1) {
    436   ; CHECK: vpshufb
    437   %res = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> %a1) ; <<16 x i8>> [#uses=1]
    438   ret <32 x i8> %res
    439 }
    440 declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone
    441 
    442 
    443 define <32 x i8> @test_x86_avx2_psign_b(<32 x i8> %a0, <32 x i8> %a1) {
    444   ; CHECK: vpsignb
    445   %res = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
    446   ret <32 x i8> %res
    447 }
    448 declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone
    449 
    450 
    451 define <8 x i32> @test_x86_avx2_psign_d(<8 x i32> %a0, <8 x i32> %a1) {
    452   ; CHECK: vpsignd
    453   %res = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %a0, <8 x i32> %a1) ; <<4 x i32>> [#uses=1]
    454   ret <8 x i32> %res
    455 }
    456 declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone
    457 
    458 
    459 define <16 x i16> @test_x86_avx2_psign_w(<16 x i16> %a0, <16 x i16> %a1) {
    460   ; CHECK: vpsignw
    461   %res = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    462   ret <16 x i16> %res
    463 }
    464 declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone
    465 
    466 
    467 define <4 x i64> @test_x86_avx2_movntdqa(i8* %a0) {
    468   ; CHECK: movl
    469   ; CHECK: vmovntdqa
    470   %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %a0) ; <<4 x i64>> [#uses=1]
    471   ret <4 x i64> %res
    472 }
    473 declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
    474 
    475 
    476 define <16 x i16> @test_x86_avx2_mpsadbw(<32 x i8> %a0, <32 x i8> %a1) {
    477   ; CHECK: vmpsadbw
    478   %res = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %a0, <32 x i8> %a1, i32 7) ; <<16 x i16>> [#uses=1]
    479   ret <16 x i16> %res
    480 }
    481 declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i32) nounwind readnone
    482 
    483 
    484 define <16 x i16> @test_x86_avx2_packusdw(<8 x i32> %a0, <8 x i32> %a1) {
    485   ; CHECK: vpackusdw
    486   %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1) ; <<16 x i16>> [#uses=1]
    487   ret <16 x i16> %res
    488 }
    489 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
    490 
    491 
    492 define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) {
    493   ; CHECK: vpblendvb
    494   %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %a2) ; <<32 x i8>> [#uses=1]
    495   ret <32 x i8> %res
    496 }
    497 declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone
    498 
    499 
    500 define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
    501   ; CHECK: vpblendw
    502   %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 7) ; <<16 x i16>> [#uses=1]
    503   ret <16 x i16> %res
    504 }
    505 declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32) nounwind readnone
    506 
    507 
    508 define <32 x i8> @test_x86_avx2_pmaxsb(<32 x i8> %a0, <32 x i8> %a1) {
    509   ; CHECK: vpmaxsb
    510   %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
    511   ret <32 x i8> %res
    512 }
    513 declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
    514 
    515 
    516 define <8 x i32> @test_x86_avx2_pmaxsd(<8 x i32> %a0, <8 x i32> %a1) {
    517   ; CHECK: vpmaxsd
    518   %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    519   ret <8 x i32> %res
    520 }
    521 declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
    522 
    523 
    524 define <8 x i32> @test_x86_avx2_pmaxud(<8 x i32> %a0, <8 x i32> %a1) {
    525   ; CHECK: vpmaxud
    526   %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    527   ret <8 x i32> %res
    528 }
    529 declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
    530 
    531 
    532 define <16 x i16> @test_x86_avx2_pmaxuw(<16 x i16> %a0, <16 x i16> %a1) {
    533   ; CHECK: vpmaxuw
    534   %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    535   ret <16 x i16> %res
    536 }
    537 declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
    538 
    539 
    540 define <32 x i8> @test_x86_avx2_pminsb(<32 x i8> %a0, <32 x i8> %a1) {
    541   ; CHECK: vpminsb
    542   %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
    543   ret <32 x i8> %res
    544 }
    545 declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
    546 
    547 
    548 define <8 x i32> @test_x86_avx2_pminsd(<8 x i32> %a0, <8 x i32> %a1) {
    549   ; CHECK: vpminsd
    550   %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    551   ret <8 x i32> %res
    552 }
    553 declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
    554 
    555 
    556 define <8 x i32> @test_x86_avx2_pminud(<8 x i32> %a0, <8 x i32> %a1) {
    557   ; CHECK: vpminud
    558   %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    559   ret <8 x i32> %res
    560 }
    561 declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
    562 
    563 
    564 define <16 x i16> @test_x86_avx2_pminuw(<16 x i16> %a0, <16 x i16> %a1) {
    565   ; CHECK: vpminuw
    566   %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
    567   ret <16 x i16> %res
    568 }
    569 declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
    570 
    571 
    572 define <8 x i32> @test_x86_avx2_pmovsxbd(<16 x i8> %a0) {
    573   ; CHECK: vpmovsxbd
    574   %res = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
    575   ret <8 x i32> %res
    576 }
    577 declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
    578 
    579 
    580 define <4 x i64> @test_x86_avx2_pmovsxbq(<16 x i8> %a0) {
    581   ; CHECK: vpmovsxbq
    582   %res = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
    583   ret <4 x i64> %res
    584 }
    585 declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
    586 
    587 
    588 define <16 x i16> @test_x86_avx2_pmovsxbw(<16 x i8> %a0) {
    589   ; CHECK: vpmovsxbw
    590   %res = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
    591   ret <16 x i16> %res
    592 }
    593 declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
    594 
    595 
    596 define <4 x i64> @test_x86_avx2_pmovsxdq(<4 x i32> %a0) {
    597   ; CHECK: vpmovsxdq
    598   %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
    599   ret <4 x i64> %res
    600 }
    601 declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
    602 
    603 
    604 define <8 x i32> @test_x86_avx2_pmovsxwd(<8 x i16> %a0) {
    605   ; CHECK: vpmovsxwd
    606   %res = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
    607   ret <8 x i32> %res
    608 }
    609 declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
    610 
    611 
    612 define <4 x i64> @test_x86_avx2_pmovsxwq(<8 x i16> %a0) {
    613   ; CHECK: vpmovsxwq
    614   %res = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
    615   ret <4 x i64> %res
    616 }
    617 declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
    618 
    619 
    620 define <8 x i32> @test_x86_avx2_pmovzxbd(<16 x i8> %a0) {
    621   ; CHECK: vpmovzxbd
    622   %res = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %a0) ; <<8 x i32>> [#uses=1]
    623   ret <8 x i32> %res
    624 }
    625 declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
    626 
    627 
    628 define <4 x i64> @test_x86_avx2_pmovzxbq(<16 x i8> %a0) {
    629   ; CHECK: vpmovzxbq
    630   %res = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %a0) ; <<4 x i64>> [#uses=1]
    631   ret <4 x i64> %res
    632 }
    633 declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
    634 
    635 
    636 define <16 x i16> @test_x86_avx2_pmovzxbw(<16 x i8> %a0) {
    637   ; CHECK: vpmovzxbw
    638   %res = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %a0) ; <<16 x i16>> [#uses=1]
    639   ret <16 x i16> %res
    640 }
    641 declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
    642 
    643 
    644 define <4 x i64> @test_x86_avx2_pmovzxdq(<4 x i32> %a0) {
    645   ; CHECK: vpmovzxdq
    646   %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %a0) ; <<4 x i64>> [#uses=1]
    647   ret <4 x i64> %res
    648 }
    649 declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
    650 
    651 
    652 define <8 x i32> @test_x86_avx2_pmovzxwd(<8 x i16> %a0) {
    653   ; CHECK: vpmovzxwd
    654   %res = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %a0) ; <<8 x i32>> [#uses=1]
    655   ret <8 x i32> %res
    656 }
    657 declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
    658 
    659 
    660 define <4 x i64> @test_x86_avx2_pmovzxwq(<8 x i16> %a0) {
    661   ; CHECK: vpmovzxwq
    662   %res = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %a0) ; <<4 x i64>> [#uses=1]
    663   ret <4 x i64> %res
    664 }
    665 declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
    666 
    667 
    668 define <4 x i64> @test_x86_avx2_pmul.dq(<8 x i32> %a0, <8 x i32> %a1) {
    669   ; CHECK: vpmuldq
    670   %res = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1) ; <<2 x i64>> [#uses=1]
    671   ret <4 x i64> %res
    672 }
    673 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
    674 
    675 
    676 define <4 x i64> @test_x86_avx2_vbroadcasti128(i8* %a0) {
    677   ; CHECK: vbroadcasti128
    678   %res = call <4 x i64> @llvm.x86.avx2.vbroadcasti128(i8* %a0) ; <<4 x i64>> [#uses=1]
    679   ret <4 x i64> %res
    680 }
    681 declare <4 x i64> @llvm.x86.avx2.vbroadcasti128(i8*) nounwind readonly
    682 
    683 define <4 x double> @test_x86_avx2_vbroadcast_sd_pd_256(<2 x double> %a0) {
    684   ; CHECK: vbroadcastsd
    685   %res = call <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double> %a0) ; <<4 x double>> [#uses=1]
    686   ret <4 x double> %res
    687 }
    688 declare <4 x double> @llvm.x86.avx2.vbroadcast.sd.pd.256(<2 x double>) nounwind readonly
    689 
    690 
    691 define <4 x float> @test_x86_avx2_vbroadcast_ss_ps(<4 x float> %a0) {
    692   ; CHECK: vbroadcastss
    693   %res = call <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
    694   ret <4 x float> %res
    695 }
    696 declare <4 x float> @llvm.x86.avx2.vbroadcast.ss.ps(<4 x float>) nounwind readonly
    697 
    698 
    699 define <8 x float> @test_x86_avx2_vbroadcast_ss_ps_256(<4 x float> %a0) {
    700   ; CHECK: vbroadcastss
    701   %res = call <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float> %a0) ; <<8 x float>> [#uses=1]
    702   ret <8 x float> %res
    703 }
    704 declare <8 x float> @llvm.x86.avx2.vbroadcast.ss.ps.256(<4 x float>) nounwind readonly
    705 
    706 
    707 define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
    708   ; CHECK: vpblendd
    709   %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 7) ; <<4 x i32>> [#uses=1]
    710   ret <4 x i32> %res
    711 }
    712 declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32) nounwind readnone
    713 
    714 
    715 define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
    716   ; CHECK: vpblendd
    717   %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 7) ; <<8 x i32>> [#uses=1]
    718   ret <8 x i32> %res
    719 }
    720 declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32) nounwind readnone
    721 
    722 
    723 define <16 x i8> @test_x86_avx2_pbroadcastb_128(<16 x i8> %a0) {
    724   ; CHECK: vpbroadcastb
    725   %res = call <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1]
    726   ret <16 x i8> %res
    727 }
    728 declare <16 x i8> @llvm.x86.avx2.pbroadcastb.128(<16 x i8>) nounwind readonly
    729 
    730 
    731 define <32 x i8> @test_x86_avx2_pbroadcastb_256(<16 x i8> %a0) {
    732   ; CHECK: vpbroadcastb
    733   %res = call <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8> %a0) ; <<32 x i8>> [#uses=1]
    734   ret <32 x i8> %res
    735 }
    736 declare <32 x i8> @llvm.x86.avx2.pbroadcastb.256(<16 x i8>) nounwind readonly
    737 
    738 
    739 define <8 x i16> @test_x86_avx2_pbroadcastw_128(<8 x i16> %a0) {
    740   ; CHECK: vpbroadcastw
    741   %res = call <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
    742   ret <8 x i16> %res
    743 }
    744 declare <8 x i16> @llvm.x86.avx2.pbroadcastw.128(<8 x i16>) nounwind readonly
    745 
    746 
    747 define <16 x i16> @test_x86_avx2_pbroadcastw_256(<8 x i16> %a0) {
    748   ; CHECK: vpbroadcastw
    749   %res = call <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16> %a0) ; <<16 x i16>> [#uses=1]
    750   ret <16 x i16> %res
    751 }
    752 declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
    753 
    754 
    755 define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
    756   ; CHECK: vpbroadcastd
    757   %res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
    758   ret <4 x i32> %res
    759 }
    760 declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
    761 
    762 
    763 define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
    764   ; CHECK: vpbroadcastd
    765   %res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) ; <<8 x i32>> [#uses=1]
    766   ret <8 x i32> %res
    767 }
    768 declare <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32>) nounwind readonly
    769 
    770 
    771 define <2 x i64> @test_x86_avx2_pbroadcastq_128(<2 x i64> %a0) {
    772   ; CHECK: vpbroadcastq
    773   %res = call <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64> %a0) ; <<2 x i64>> [#uses=1]
    774   ret <2 x i64> %res
    775 }
    776 declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
    777 
    778 
    779 define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
    780   ; CHECK: vpbroadcastq
    781   %res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) ; <<4 x i64>> [#uses=1]
    782   ret <4 x i64> %res
    783 }
    784 declare <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64>) nounwind readonly
    785 
    786 
    787 define <8 x i32> @test_x86_avx2_permd(<8 x i32> %a0, <8 x i32> %a1) {
    788   ; CHECK: vpermd
    789   %res = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    790   ret <8 x i32> %res
    791 }
    792 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
    793 
    794 
    795 define <8 x float> @test_x86_avx2_permps(<8 x float> %a0, <8 x float> %a1) {
    796   ; CHECK: vpermps
    797   %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x float> %a1) ; <<8 x float>> [#uses=1]
    798   ret <8 x float> %res
    799 }
    800 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x float>) nounwind readonly
    801 
    802 
    803 define <4 x i64> @test_x86_avx2_vperm2i128(<4 x i64> %a0, <4 x i64> %a1) {
    804   ; CHECK: vperm2i128
    805   %res = call <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64> %a0, <4 x i64> %a1, i8 1) ; <<4 x i64>> [#uses=1]
    806   ret <4 x i64> %res
    807 }
    808 declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly
    809 
    810 
    811 define <2 x i64> @test_x86_avx2_vextracti128(<4 x i64> %a0) {
    812   ; CHECK: vextracti128
    813   %res = call <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64> %a0, i8 7) ; <<2 x i64>> [#uses=1]
    814   ret <2 x i64> %res
    815 }
    816 declare <2 x i64> @llvm.x86.avx2.vextracti128(<4 x i64>, i8) nounwind readnone
    817 
    818 
    819 define <4 x i64> @test_x86_avx2_vinserti128(<4 x i64> %a0, <2 x i64> %a1) {
    820   ; CHECK: vinserti128
    821   %res = call <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64> %a0, <2 x i64> %a1, i8 7) ; <<4 x i64>> [#uses=1]
    822   ret <4 x i64> %res
    823 }
    824 declare <4 x i64> @llvm.x86.avx2.vinserti128(<4 x i64>, <2 x i64>, i8) nounwind readnone
    825 
    826 
    827 define <2 x i64> @test_x86_avx2_maskload_q(i8* %a0, <2 x i64> %a1) {
    828   ; CHECK: vpmaskmovq
    829   %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
    830   ret <2 x i64> %res
    831 }
    832 declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly
    833 
    834 
    835 define <4 x i64> @test_x86_avx2_maskload_q_256(i8* %a0, <4 x i64> %a1) {
    836   ; CHECK: vpmaskmovq
    837   %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
    838   ret <4 x i64> %res
    839 }
    840 declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly
    841 
    842 
    843 define <4 x i32> @test_x86_avx2_maskload_d(i8* %a0, <4 x i32> %a1) {
    844   ; CHECK: vpmaskmovd
    845   %res = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
    846   ret <4 x i32> %res
    847 }
    848 declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly
    849 
    850 
    851 define <8 x i32> @test_x86_avx2_maskload_d_256(i8* %a0, <8 x i32> %a1) {
    852   ; CHECK: vpmaskmovd
    853   %res = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    854   ret <8 x i32> %res
    855 }
    856 declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly
    857 
    858 
    859 define void @test_x86_avx2_maskstore_q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) {
    860   ; CHECK: vpmaskmovq
    861   call void @llvm.x86.avx2.maskstore.q(i8* %a0, <2 x i64> %a1, <2 x i64> %a2)
    862   ret void
    863 }
    864 declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
    865 
    866 
    867 define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
    868   ; CHECK: vpmaskmovq
    869   call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2)
    870   ret void
    871 }
    872 declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind
    873 
    874 
    875 define void @test_x86_avx2_maskstore_d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) {
    876   ; CHECK: vpmaskmovd
    877   call void @llvm.x86.avx2.maskstore.d(i8* %a0, <4 x i32> %a1, <4 x i32> %a2)
    878   ret void
    879 }
    880 declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
    881 
    882 
    883 define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
    884   ; CHECK: vpmaskmovd
    885   call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2)
    886   ret void
    887 }
    888 declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
    889 
    890 
    891 define <4 x i32> @test_x86_avx2_psllv_d(<4 x i32> %a0, <4 x i32> %a1) {
    892   ; CHECK: vpsllvd
    893   %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
    894   ret <4 x i32> %res
    895 }
    896 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
    897 
    898 
    899 define <8 x i32> @test_x86_avx2_psllv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
    900   ; CHECK: vpsllvd
    901   %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    902   ret <8 x i32> %res
    903 }
    904 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
    905 
    906 
    907 define <2 x i64> @test_x86_avx2_psllv_q(<2 x i64> %a0, <2 x i64> %a1) {
    908   ; CHECK: vpsllvq
    909   %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
    910   ret <2 x i64> %res
    911 }
    912 declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
    913 
    914 
    915 define <4 x i64> @test_x86_avx2_psllv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
    916   ; CHECK: vpsllvq
    917   %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
    918   ret <4 x i64> %res
    919 }
    920 declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
    921 
    922 
    923 define <4 x i32> @test_x86_avx2_psrlv_d(<4 x i32> %a0, <4 x i32> %a1) {
    924   ; CHECK: vpsrlvd
    925   %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
    926   ret <4 x i32> %res
    927 }
    928 declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
    929 
    930 
    931 define <8 x i32> @test_x86_avx2_psrlv_d_256(<8 x i32> %a0, <8 x i32> %a1) {
    932   ; CHECK: vpsrlvd
    933   %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    934   ret <8 x i32> %res
    935 }
    936 declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
    937 
    938 
    939 define <2 x i64> @test_x86_avx2_psrlv_q(<2 x i64> %a0, <2 x i64> %a1) {
    940   ; CHECK: vpsrlvq
    941   %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
    942   ret <2 x i64> %res
    943 }
    944 declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
    945 
    946 
    947 define <4 x i64> @test_x86_avx2_psrlv_q_256(<4 x i64> %a0, <4 x i64> %a1) {
    948   ; CHECK: vpsrlvq
    949   %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1) ; <<4 x i64>> [#uses=1]
    950   ret <4 x i64> %res
    951 }
    952 declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
    953 
    954 
    955 define <4 x i32> @test_x86_avx2_psrav_d(<4 x i32> %a0, <4 x i32> %a1) {
    956   ; CHECK: vpsravd
    957   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
    958   ret <4 x i32> %res
    959 }
    960 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
    961 
    962 
    963 define <8 x i32> @test_x86_avx2_psrav_d_256(<8 x i32> %a0, <8 x i32> %a1) {
    964   ; CHECK: vpsravd
    965   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1) ; <<8 x i32>> [#uses=1]
    966   ret <8 x i32> %res
    967 }
    968 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
    969 
    970 ; This is checked here because the execution dependency fix pass makes it hard to test in AVX mode since we don't have 256-bit integer instructions
    971 define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
    972   ; CHECK: vmovdqu
    973   ; add operation forces the execution domain.
    974   %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
    975   call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
    976   ret void
    977 }
    978 declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind
    979 
    980 define <2 x double> @test_x86_avx2_gather_d_pd(<2 x double> %a0, i8* %a1,
    981                      <4 x i32> %idx, <2 x double> %mask) {
    982   ; CHECK: vgatherdpd
    983   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0,
    984                             i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
    985   ret <2 x double> %res
    986 }
    987 declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
    988                       <4 x i32>, <2 x double>, i8) nounwind readonly
    989 
    990 define <4 x double> @test_x86_avx2_gather_d_pd_256(<4 x double> %a0, i8* %a1,
    991                      <4 x i32> %idx, <4 x double> %mask) {
    992   ; CHECK: vgatherdpd
    993   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0,
    994                             i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 2) ;
    995   ret <4 x double> %res
    996 }
    997 declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*,
    998                       <4 x i32>, <4 x double>, i8) nounwind readonly
    999 
   1000 define <2 x double> @test_x86_avx2_gather_q_pd(<2 x double> %a0, i8* %a1,
   1001                      <2 x i64> %idx, <2 x double> %mask) {
   1002   ; CHECK: vgatherqpd
   1003   %res = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0,
   1004                             i8* %a1, <2 x i64> %idx, <2 x double> %mask, i8 2) ;
   1005   ret <2 x double> %res
   1006 }
   1007 declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*,
   1008                       <2 x i64>, <2 x double>, i8) nounwind readonly
   1009 
   1010 define <4 x double> @test_x86_avx2_gather_q_pd_256(<4 x double> %a0, i8* %a1,
   1011                      <4 x i64> %idx, <4 x double> %mask) {
   1012   ; CHECK: vgatherqpd
   1013   %res = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0,
   1014                             i8* %a1, <4 x i64> %idx, <4 x double> %mask, i8 2) ;
   1015   ret <4 x double> %res
   1016 }
   1017 declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*,
   1018                       <4 x i64>, <4 x double>, i8) nounwind readonly
   1019 
   1020 define <4 x float> @test_x86_avx2_gather_d_ps(<4 x float> %a0, i8* %a1,
   1021                      <4 x i32> %idx, <4 x float> %mask) {
   1022   ; CHECK: vgatherdps
   1023   %res = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0,
   1024                             i8* %a1, <4 x i32> %idx, <4 x float> %mask, i8 2) ;
   1025   ret <4 x float> %res
   1026 }
   1027 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*,
   1028                       <4 x i32>, <4 x float>, i8) nounwind readonly
   1029 
   1030 define <8 x float> @test_x86_avx2_gather_d_ps_256(<8 x float> %a0, i8* %a1,
   1031                      <8 x i32> %idx, <8 x float> %mask) {
   1032   ; CHECK: vgatherdps
   1033   %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
   1034                             i8* %a1, <8 x i32> %idx, <8 x float> %mask, i8 2) ;
   1035   ret <8 x float> %res
   1036 }
   1037 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*,
   1038                       <8 x i32>, <8 x float>, i8) nounwind readonly
   1039 
   1040 define <4 x float> @test_x86_avx2_gather_q_ps(<4 x float> %a0, i8* %a1,
   1041                      <2 x i64> %idx, <4 x float> %mask) {
   1042   ; CHECK: vgatherqps
   1043   %res = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0,
   1044                             i8* %a1, <2 x i64> %idx, <4 x float> %mask, i8 2) ;
   1045   ret <4 x float> %res
   1046 }
   1047 declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*,
   1048                       <2 x i64>, <4 x float>, i8) nounwind readonly
   1049 
   1050 define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1,
   1051                      <4 x i64> %idx, <4 x float> %mask) {
   1052   ; CHECK: vgatherqps
   1053   %res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
   1054                             i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
   1055   ret <4 x float> %res
   1056 }
   1057 declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*,
   1058                       <4 x i64>, <4 x float>, i8) nounwind readonly
   1059 
   1060 define <2 x i64> @test_x86_avx2_gather_d_q(<2 x i64> %a0, i8* %a1,
   1061                      <4 x i32> %idx, <2 x i64> %mask) {
   1062   ; CHECK: vpgatherdq
   1063   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0,
   1064                             i8* %a1, <4 x i32> %idx, <2 x i64> %mask, i8 2) ;
   1065   ret <2 x i64> %res
   1066 }
   1067 declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*,
   1068                       <4 x i32>, <2 x i64>, i8) nounwind readonly
   1069 
   1070 define <4 x i64> @test_x86_avx2_gather_d_q_256(<4 x i64> %a0, i8* %a1,
   1071                      <4 x i32> %idx, <4 x i64> %mask) {
   1072   ; CHECK: vpgatherdq
   1073   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0,
   1074                             i8* %a1, <4 x i32> %idx, <4 x i64> %mask, i8 2) ;
   1075   ret <4 x i64> %res
   1076 }
   1077 declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*,
   1078                       <4 x i32>, <4 x i64>, i8) nounwind readonly
   1079 
   1080 define <2 x i64> @test_x86_avx2_gather_q_q(<2 x i64> %a0, i8* %a1,
   1081                      <2 x i64> %idx, <2 x i64> %mask) {
   1082   ; CHECK: vpgatherqq
   1083   %res = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0,
   1084                             i8* %a1, <2 x i64> %idx, <2 x i64> %mask, i8 2) ;
   1085   ret <2 x i64> %res
   1086 }
   1087 declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*,
   1088                       <2 x i64>, <2 x i64>, i8) nounwind readonly
   1089 
   1090 define <4 x i64> @test_x86_avx2_gather_q_q_256(<4 x i64> %a0, i8* %a1,
   1091                      <4 x i64> %idx, <4 x i64> %mask) {
   1092   ; CHECK: vpgatherqq
   1093   %res = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0,
   1094                             i8* %a1, <4 x i64> %idx, <4 x i64> %mask, i8 2) ;
   1095   ret <4 x i64> %res
   1096 }
   1097 declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*,
   1098                       <4 x i64>, <4 x i64>, i8) nounwind readonly
   1099 
   1100 define <4 x i32> @test_x86_avx2_gather_d_d(<4 x i32> %a0, i8* %a1,
   1101                      <4 x i32> %idx, <4 x i32> %mask) {
   1102   ; CHECK: vpgatherdd
   1103   %res = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %a0,
   1104                             i8* %a1, <4 x i32> %idx, <4 x i32> %mask, i8 2) ;
   1105   ret <4 x i32> %res
   1106 }
   1107 declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*,
   1108                       <4 x i32>, <4 x i32>, i8) nounwind readonly
   1109 
   1110 define <8 x i32> @test_x86_avx2_gather_d_d_256(<8 x i32> %a0, i8* %a1,
   1111                      <8 x i32> %idx, <8 x i32> %mask) {
   1112   ; CHECK: vpgatherdd
   1113   %res = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %a0,
   1114                             i8* %a1, <8 x i32> %idx, <8 x i32> %mask, i8 2) ;
   1115   ret <8 x i32> %res
   1116 }
   1117 declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*,
   1118                       <8 x i32>, <8 x i32>, i8) nounwind readonly
   1119 
   1120 define <4 x i32> @test_x86_avx2_gather_q_d(<4 x i32> %a0, i8* %a1,
   1121                      <2 x i64> %idx, <4 x i32> %mask) {
   1122   ; CHECK: vpgatherqd
   1123   %res = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %a0,
   1124                             i8* %a1, <2 x i64> %idx, <4 x i32> %mask, i8 2) ;
   1125   ret <4 x i32> %res
   1126 }
   1127 declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*,
   1128                       <2 x i64>, <4 x i32>, i8) nounwind readonly
   1129 
   1130 define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1,
   1131                      <4 x i64> %idx, <4 x i32> %mask) {
   1132   ; CHECK: vpgatherqd
   1133   %res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
   1134                             i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
   1135   ret <4 x i32> %res
   1136 }
   1137 declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*,
   1138                       <4 x i64>, <4 x i32>, i8) nounwind readonly
   1139 
   1140 ; PR13298
   1141 define <8 x float>  @test_gather_mask(<8 x float> %a0, float* %a,
   1142                                       <8 x i32> %idx, <8 x float> %mask,
   1143                                       float* nocapture %out) {
   1144 ; CHECK: test_gather_mask
   1145 ; CHECK: vmovdqa %ymm2, [[DEST:%.*]]
   1146 ; CHECK: vgatherdps [[DEST]]
   1147 ;; gather with mask
   1148   %a_i8 = bitcast float* %a to i8*
   1149   %res = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0,
   1150                            i8* %a_i8, <8 x i32> %idx, <8 x float> %mask, i8 4) ;
   1151 
   1152 ;; for debugging, we'll just dump out the mask
   1153   %out_ptr = bitcast float * %out to <8 x float> *
   1154   store <8 x float> %mask, <8 x float> * %out_ptr, align 4
   1155 
   1156   ret <8 x float> %res
   1157 }
   1158