; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx2-builtins.c

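; NOTE: Each __m256i value is passed and returned as <4 x i64> and bitcast to
; the element type under test, mirroring how clang lowers the intrinsic
; headers. Even under -fast-isel, each body is expected to select down to the
; AVX2 instruction(s) named in its CHECK lines.
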
define <4 x i64> @test_mm256_abs_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsb %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <32 x i8>
  %sub = sub <32 x i8> zeroinitializer, %arg
  %cmp = icmp sgt <32 x i8> %arg, zeroinitializer
  %sel = select <32 x i1> %cmp, <32 x i8> %arg, <32 x i8> %sub
  %res = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone

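; The compare/negate/select sequence above (repeated for words and dwords
; below) is the canonical abs idiom, select(x > 0, x, 0 - x), which the
; backend matches to a single vpabs{b,w,d}. A hedged C-level sketch of the
; source these tests model (assuming <immintrin.h>):
;   __m256i r = _mm256_abs_epi8(x); // |x| for each signed byte lane
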
define <4 x i64> @test_mm256_abs_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsw %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <16 x i16>
  %sub = sub <16 x i16> zeroinitializer, %arg
  %cmp = icmp sgt <16 x i16> %arg, zeroinitializer
  %sel = select <16 x i1> %cmp, <16 x i16> %arg, <16 x i16> %sub
  %res = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_abs_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_abs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpabsd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg = bitcast <4 x i64> %a0 to <8 x i32>
  %sub = sub <8 x i32> zeroinitializer, %arg
  %cmp = icmp sgt <8 x i32> %arg, zeroinitializer
  %sel = select <8 x i1> %cmp, <8 x i32> %arg, <8 x i32> %sub
  %res = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_add_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = add <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = add <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = add <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = add <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

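; The adds/addus tests below cover the saturating adds, which clamp to the
; lane's signed or unsigned range instead of wrapping (e.g. for signed bytes,
; 100 + 100 saturates to 127). These have no plain-IR idiom here, so the
; tests call the llvm.x86.avx2.padds.*/paddus.* intrinsics directly.
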
define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_adds_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone

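; vpalignr operates per 128-bit lane: it concatenates each lane of the two
; sources and extracts a byte-aligned window. The alignr tests below model
; this with <32 x i8> shufflevector masks, where indices 0-31 select from the
; first operand and 32-63 from the second.
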
define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test2_mm256_alignr_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = and <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

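; andnot computes (~a0) & a1. Note the expected fast-isel sequence
; materializes all-ones via vpcmpeqd and xors it in, rather than folding the
; pattern into a single vpandn.
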
define <4 x i64> @test_mm256_andnot_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vpand %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %not = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %not, %a1
  ret <4 x i64> %res
}

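; The avg tests below model vpavgb/vpavgw as a rounding average in a widened
; type: trunc((zext(a) + zext(b) + 1) >> 1). For byte lanes 1 and 2 this
; gives (1 + 2 + 1) >> 1 = 2, i.e. the average rounded up.
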
define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %zext0 = zext <32 x i8> %arg0 to <32 x i16>
  %zext1 = zext <32 x i8> %arg1 to <32 x i16>
  %add = add <32 x i16> %zext0, %zext1
  %add1 = add <32 x i16> %add, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %lshr = lshr <32 x i16> %add1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %res = trunc <32 x i16> %lshr to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_avg_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %zext0 = zext <16 x i16> %arg0 to <16 x i32>
  %zext1 = zext <16 x i16> %arg1 to <16 x i32>
  %add = add <16 x i32> %zext0, %zext1
  %add1 = add <16 x i32> %add, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %lshr = lshr <16 x i32> %add1, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %res = trunc <16 x i32> %lshr to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

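; In the blend tests below, the blend immediate is pre-expanded into a
; shufflevector mask: an index < N keeps the lane from the first source, an
; index >= N takes it from the second (N = lane count). vpblendw applies its
; 8-bit immediate to both 128-bit halves, so the epi16 mask repeats with a
; stride of 8.
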
define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7,8],ymm1[9],ymm0[10,11,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_blend_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> %arg1, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_blend_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_blend_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_blendv_epi8(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
; CHECK-LABEL: test_mm256_blendv_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %arg2 = bitcast <4 x i64> %a2 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %arg0, <32 x i8> %arg1, <32 x i8> %arg2)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>) nounwind readnone

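; The broadcast tests below splat element 0 via a shufflevector with an
; all-zero mask. Byte/word splats select vpbroadcast{b,w}, while 32/64-bit
; splats may come out in the FP domain (vbroadcastss/vbroadcastsd/vmovddup),
; as the CHECK lines show.
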
define <2 x i64> @test_mm_broadcastb_epi8(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i8> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastb_epi8(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastb_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastb %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> undef, <32 x i32> zeroinitializer
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastd_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %shuf = shufflevector <4 x i32> %arg0, <4 x i32> undef, <4 x i32> zeroinitializer
  %res = bitcast <4 x i32> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastd_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <2 x i64> @test_mm_broadcastq_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastq_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> zeroinitializer
  ret <4 x i64> %res
}

define <2 x double> @test_mm_broadcastsd_pd(<2 x double> %a0) {
; CHECK-LABEL: test_mm_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  ret <2 x double> %res
}

define <4 x double> @test_mm256_broadcastsd_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_broadcastsd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> zeroinitializer
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_broadcastsi128_si256(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

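; When the 128-bit source is in memory, the lane-duplicating shuffle folds
; into a single vbroadcastf128 load. The X86 run additionally loads the
; pointer argument from the stack, hence the separate X86/X64 CHECK prefixes.
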
define <4 x i64> @test_mm256_broadcastsi128_si256_mem(<2 x i64>* %p0) {
; X86-LABEL: test_mm256_broadcastsi128_si256_mem:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcastsi128_si256_mem:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %a0 = load <2 x i64>, <2 x i64>* %p0
  %res = shufflevector <2 x i64> %a0, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x i64> %res
}

define <4 x float> @test_mm_broadcastss_ps(<4 x float> %a0) {
; CHECK-LABEL: test_mm_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  ret <4 x float> %res
}

define <8 x float> @test_mm256_broadcastss_ps(<8 x float> %a0) {
; CHECK-LABEL: test_mm256_broadcastss_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vbroadcastss %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> zeroinitializer
  ret <8 x float> %res
}

define <2 x i64> @test_mm_broadcastw_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <8 x i32> zeroinitializer
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}

define <4 x i64> @test_mm256_broadcastw_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_broadcastw_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> zeroinitializer
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

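; bslli/bsrli shift each 128-bit lane left or right by a whole number of
; bytes (3 in these tests). The IR models this as a shuffle against
; zeroinitializer so the fill bytes select from the zero operand.
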
define <4 x i64> @test_mm256_bslli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bslli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_bsrli_epi128(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_bsrli_epi128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

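; The compare tests below use the generic icmp+sext pattern: the <N x i1>
; compare result is sign-extended so each lane becomes all-ones on true and
; all-zeros on false, matching the vpcmpeq*/vpcmpgt* semantics.
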
define <4 x i64> @test_mm256_cmpeq_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp eq <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp eq <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp eq <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpeq_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpeq_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cmpgt_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %res = sext <32 x i1> %cmp to <32 x i8>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %res = sext <16 x i1> %cmp to <16 x i16>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %res = sext <8 x i1> %cmp to <8 x i32>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmpgt_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cmp = icmp sgt <4 x i64> %a0, %a1
  %res = sext <4 x i1> %cmp to <4 x i64>
  ret <4 x i64> %res
}

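; The cvtepi/cvtepu tests below sign- or zero-extend low lanes of a 128-bit
; source into a 256-bit result (vpmovsx*/vpmovzx*). Where only some source
; lanes are consumed, a shufflevector first narrows to the low 8 or 4
; elements before the sext/zext.
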
define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = sext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = sext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = sext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = sext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepi32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = sext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = zext <16 x i8> %arg0 to <16 x i16>
  %res = bitcast <16 x i16> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %ext = zext <8 x i8> %shuf to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu8_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %shuf = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i8> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext = zext <8 x i16> %arg0 to <8 x i32>
  %res = bitcast <8 x i32> %ext to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu16_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ext = zext <4 x i16> %shuf to <4 x i64>
  ret <4 x i64> %ext
}

define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
; CHECK-LABEL: test_mm256_cvtepu32_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = zext <4 x i32> %arg0 to <4 x i64>
  ret <4 x i64> %ext
}

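; extracti128 of the upper half is modeled as a two-element shuffle; note the
; expected vzeroupper before returning a value in an xmm register.
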
define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extracti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

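; The horizontal add/sub tests below (phadd/phsub and their saturating
; variants) have no portable IR idiom, so they call the llvm.x86.avx2.*
; intrinsics directly and check the 1:1 instruction mapping.
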
define <4 x i64> @test_mm256_hadd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hadd_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phadd.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hadds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hadds_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphaddsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phadd.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_hsub_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.phsub.d(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_hsubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_hsubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vphsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.phsub.sw(<16 x i16>, <16 x i16>) nounwind readnone

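; The gather tests that follow take (passthrough, base pointer, index vector,
; mask, scale): a lane is loaded from base + index*scale only where the mask
; lane's sign bit is set, otherwise the passthrough lane is kept. The
; unmasked wrappers pass an all-ones mask and an undef passthrough. A hedged
; C-level sketch of the first test (assuming <immintrin.h>):
;   __m128i r = _mm_i32gather_epi32(base, idx, 2); // scale = 2 bytes
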
    761 define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
    762 ; X86-LABEL: test_mm_i32gather_epi32:
    763 ; X86:       # %bb.0:
    764 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    765 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
    766 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    767 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
    768 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
    769 ; X86-NEXT:    retl
    770 ;
    771 ; X64-LABEL: test_mm_i32gather_epi32:
    772 ; X64:       # %bb.0:
    773 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
    774 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    775 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
    776 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
    777 ; X64-NEXT:    retq
    778   %arg0 = bitcast i32 *%a0 to i8*
    779   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
    780   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
    781   %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> undef, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
    782   %bc = bitcast <4 x i32> %call to <2 x i64>
    783   ret <2 x i64> %bc
    784 }
    785 declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
    786 
    787 define <2 x i64> @test_mm_mask_i32gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
    788 ; X86-LABEL: test_mm_mask_i32gather_epi32:
    789 ; X86:       # %bb.0:
    790 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    791 ; X86-NEXT:    vpgatherdd %xmm2, (%eax,%xmm1,2), %xmm0
    792 ; X86-NEXT:    retl
    793 ;
    794 ; X64-LABEL: test_mm_mask_i32gather_epi32:
    795 ; X64:       # %bb.0:
    796 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm1,2), %xmm0
    797 ; X64-NEXT:    retq
    798   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
    799   %arg1 = bitcast i32 *%a1 to i8*
    800   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
    801   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
    802   %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> %arg0, i8* %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i8 2)
    803   %bc = bitcast <4 x i32> %call to <2 x i64>
    804   ret <2 x i64> %bc
    805 }
    806 
    807 define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
    808 ; X86-LABEL: test_mm256_i32gather_epi32:
    809 ; X86:       # %bb.0:
    810 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    811 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    812 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    813 ; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
    814 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
    815 ; X86-NEXT:    retl
    816 ;
    817 ; X64-LABEL: test_mm256_i32gather_epi32:
    818 ; X64:       # %bb.0:
    819 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    820 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    821 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
    822 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
    823 ; X64-NEXT:    retq
    824   %arg0 = bitcast i32 *%a0 to i8*
    825   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
    826   %mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x i32>
    827   %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> undef, i8* %arg0, <8 x i32> %arg1, <8 x i32> %mask, i8 2)
    828   %bc = bitcast <8 x i32> %call to <4 x i64>
    829   ret <4 x i64> %bc
    830 }
    831 declare <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32>, i8*, <8 x i32>, <8 x i32>, i8) nounwind readonly
    832 
    833 define <4 x i64> @test_mm256_mask_i32gather_epi32(<4 x i64> %a0, i32 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
    834 ; X86-LABEL: test_mm256_mask_i32gather_epi32:
    835 ; X86:       # %bb.0:
    836 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    837 ; X86-NEXT:    vpgatherdd %ymm2, (%eax,%ymm1,2), %ymm0
    838 ; X86-NEXT:    retl
    839 ;
    840 ; X64-LABEL: test_mm256_mask_i32gather_epi32:
    841 ; X64:       # %bb.0:
    842 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm1,2), %ymm0
    843 ; X64-NEXT:    retq
    844   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
    845   %arg1 = bitcast i32 *%a1 to i8*
    846   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
    847   %arg3 = bitcast <4 x i64> %a3 to <8 x i32>
    848   %call = call <8 x i32> @llvm.x86.avx2.gather.d.d.256(<8 x i32> %arg0, i8* %arg1, <8 x i32> %arg2, <8 x i32> %arg3, i8 2)
    849   %bc = bitcast <8 x i32> %call to <4 x i64>
    850   ret <4 x i64> %bc
    851 }
    852 
    853 define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
    854 ; X86-LABEL: test_mm_i32gather_epi64:
    855 ; X86:       # %bb.0:
    856 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    857 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
    858 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    859 ; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
    860 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
    861 ; X86-NEXT:    retl
    862 ;
    863 ; X64-LABEL: test_mm_i32gather_epi64:
    864 ; X64:       # %bb.0:
    865 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
    866 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    867 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
    868 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
    869 ; X64-NEXT:    retq
    870   %arg0 = bitcast i64 *%a0 to i8*
    871   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
    872   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> undef, i8* %arg0, <4 x i32> %arg1, <2 x i64> <i64 -1, i64 -1>, i8 2)
    873   ret <2 x i64> %res
    874 }
    875 declare <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64>, i8*, <4 x i32>, <2 x i64>, i8) nounwind readonly
    876 
    877 define <2 x i64> @test_mm_mask_i32gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
    878 ; X86-LABEL: test_mm_mask_i32gather_epi64:
    879 ; X86:       # %bb.0:
    880 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    881 ; X86-NEXT:    vpgatherdq %xmm2, (%eax,%xmm1,2), %xmm0
    882 ; X86-NEXT:    retl
    883 ;
    884 ; X64-LABEL: test_mm_mask_i32gather_epi64:
    885 ; X64:       # %bb.0:
    886 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm1,2), %xmm0
    887 ; X64-NEXT:    retq
    888   %arg1 = bitcast i64 *%a1 to i8*
    889   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
    890   %res = call <2 x i64> @llvm.x86.avx2.gather.d.q(<2 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <2 x i64> %a3, i8 2)
    891   ret <2 x i64> %res
    892 }
    893 
    894 define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
    895 ; X86-LABEL: test_mm256_i32gather_epi64:
    896 ; X86:       # %bb.0:
    897 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    898 ; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    899 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    900 ; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
    901 ; X86-NEXT:    vmovdqa %ymm1, %ymm0
    902 ; X86-NEXT:    retl
    903 ;
    904 ; X64-LABEL: test_mm256_i32gather_epi64:
    905 ; X64:       # %bb.0:
    906 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
    907 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    908 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
    909 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
    910 ; X64-NEXT:    retq
    911   %arg0 = bitcast i64 *%a0 to i8*
    912   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
    913   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> undef, i8* %arg0, <4 x i32> %arg1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
    914   ret <4 x i64> %res
    915 }
    916 declare <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64>, i8*, <4 x i32>, <4 x i64>, i8) nounwind readonly
    917 
    918 define <4 x i64> @test_mm256_mask_i32gather_epi64(<4 x i64> %a0, i64 *%a1, <2 x i64> %a2, <4 x i64> %a3) {
    919 ; X86-LABEL: test_mm256_mask_i32gather_epi64:
    920 ; X86:       # %bb.0:
    921 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    922 ; X86-NEXT:    vpgatherdq %ymm2, (%eax,%xmm1,2), %ymm0
    923 ; X86-NEXT:    retl
    924 ;
    925 ; X64-LABEL: test_mm256_mask_i32gather_epi64:
    926 ; X64:       # %bb.0:
    927 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm1,2), %ymm0
    928 ; X64-NEXT:    retq
    929   %arg1 = bitcast i64 *%a1 to i8*
    930   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
    931   %res = call <4 x i64> @llvm.x86.avx2.gather.d.q.256(<4 x i64> %a0, i8* %arg1, <4 x i32> %arg2, <4 x i64> %a3, i8 2)
    932   ret <4 x i64> %res
    933 }
    934 
    935 define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
    936 ; X86-LABEL: test_mm_i32gather_pd:
    937 ; X86:       # %bb.0:
    938 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    939 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
    940 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    941 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
    942 ; X86-NEXT:    vmovapd %xmm1, %xmm0
    943 ; X86-NEXT:    retl
    944 ;
    945 ; X64-LABEL: test_mm_i32gather_pd:
    946 ; X64:       # %bb.0:
    947 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
    948 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    949 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
    950 ; X64-NEXT:    vmovapd %xmm1, %xmm0
    951 ; X64-NEXT:    retq
    952   %arg0 = bitcast double *%a0 to i8*
    953   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
    954   %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
    955   %sext = sext <2 x i1> %cmp to <2 x i64>
    956   %mask = bitcast <2 x i64> %sext to <2 x double>
    957   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
    958   ret <2 x double> %res
    959 }
    960 declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*, <4 x i32>, <2 x double>, i8) nounwind readonly
    961 
    962 define <2 x double> @test_mm_mask_i32gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
    963 ; X86-LABEL: test_mm_mask_i32gather_pd:
    964 ; X86:       # %bb.0:
    965 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    966 ; X86-NEXT:    vgatherdpd %xmm2, (%eax,%xmm1,2), %xmm0
    967 ; X86-NEXT:    retl
    968 ;
    969 ; X64-LABEL: test_mm_mask_i32gather_pd:
    970 ; X64:       # %bb.0:
    971 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm1,2), %xmm0
    972 ; X64-NEXT:    retq
    973   %arg1 = bitcast double *%a1 to i8*
    974   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
    975   %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> %a0, i8* %arg1, <4 x i32> %arg2, <2 x double> %a3, i8 2)
    976   ret <2 x double> %res
    977 }
    978 
    979 define <4 x double> @test_mm256_i32gather_pd(double *%a0, <2 x i64> %a1) {
    980 ; X86-LABEL: test_mm256_i32gather_pd:
    981 ; X86:       # %bb.0:
    982 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
    983 ; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    984 ; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
    985 ; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm0,2), %ymm1
    986 ; X86-NEXT:    vmovapd %ymm1, %ymm0
    987 ; X86-NEXT:    retl
    988 ;
    989 ; X64-LABEL: test_mm256_i32gather_pd:
    990 ; X64:       # %bb.0:
    991 ; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
    992 ; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
    993 ; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm0,2), %ymm1
    994 ; X64-NEXT:    vmovapd %ymm1, %ymm0
    995 ; X64-NEXT:    retq
    996   %arg0 = bitcast double *%a0 to i8*
    997   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
    998   %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
    999   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> undef, i8* %arg0, <4 x i32> %arg1, <4 x double> %mask, i8 2)
   1000   ret <4 x double> %res
   1001 }
   1002 declare <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double>, i8*, <4 x i32>, <4 x double>, i8) nounwind readonly
   1003 
   1004 define <4 x double> @test_mm256_mask_i32gather_pd(<4 x double> %a0, double *%a1, <2 x i64> %a2, <4 x double> %a3) {
   1005 ; X86-LABEL: test_mm256_mask_i32gather_pd:
   1006 ; X86:       # %bb.0:
   1007 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1008 ; X86-NEXT:    vgatherdpd %ymm2, (%eax,%xmm1,2), %ymm0
   1009 ; X86-NEXT:    retl
   1010 ;
   1011 ; X64-LABEL: test_mm256_mask_i32gather_pd:
   1012 ; X64:       # %bb.0:
   1013 ; X64-NEXT:    vgatherdpd %ymm2, (%rdi,%xmm1,2), %ymm0
   1014 ; X64-NEXT:    retq
   1015   %arg1 = bitcast double *%a1 to i8*
   1016   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
   1017   %res = call <4 x double> @llvm.x86.avx2.gather.d.pd.256(<4 x double> %a0, i8* %arg1, <4 x i32> %arg2, <4 x double> %a3, i8 2)
   1018   ret <4 x double> %res
   1019 }
   1020 
   1021 define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
   1022 ; X86-LABEL: test_mm_i32gather_ps:
   1023 ; X86:       # %bb.0:
   1024 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1025 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
   1026 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1027 ; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
   1028 ; X86-NEXT:    vmovaps %xmm1, %xmm0
   1029 ; X86-NEXT:    retl
   1030 ;
   1031 ; X64-LABEL: test_mm_i32gather_ps:
   1032 ; X64:       # %bb.0:
   1033 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
   1034 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1035 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
   1036 ; X64-NEXT:    vmovaps %xmm1, %xmm0
   1037 ; X64-NEXT:    retq
   1038   %arg0 = bitcast float *%a0 to i8*
   1039   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   1040   %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
   1041   %sext = sext <4 x i1> %cmp to <4 x i32>
   1042   %mask = bitcast <4 x i32> %sext to <4 x float>
   1043   %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> undef, i8* %arg0, <4 x i32> %arg1, <4 x float> %mask, i8 2)
   1044   ret <4 x float> %call
   1045 }
   1046 declare <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float>, i8*, <4 x i32>, <4 x float>, i8) nounwind readonly
   1047 
   1048 define <4 x float> @test_mm_mask_i32gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
   1049 ; X86-LABEL: test_mm_mask_i32gather_ps:
   1050 ; X86:       # %bb.0:
   1051 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1052 ; X86-NEXT:    vgatherdps %xmm2, (%eax,%xmm1,2), %xmm0
   1053 ; X86-NEXT:    retl
   1054 ;
   1055 ; X64-LABEL: test_mm_mask_i32gather_ps:
   1056 ; X64:       # %bb.0:
   1057 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm1,2), %xmm0
   1058 ; X64-NEXT:    retq
   1059   %arg1 = bitcast float *%a1 to i8*
   1060   %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
   1061   %call = call <4 x float> @llvm.x86.avx2.gather.d.ps(<4 x float> %a0, i8* %arg1, <4 x i32> %arg2, <4 x float> %a3, i8 2)
   1062   ret <4 x float> %call
   1063 }
   1064 
   1065 define <8 x float> @test_mm256_i32gather_ps(float *%a0, <4 x i64> %a1) {
   1066 ; X86-LABEL: test_mm256_i32gather_ps:
   1067 ; X86:       # %bb.0:
   1068 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1069 ; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1070 ; X86-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
   1071 ; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm0,2), %ymm1
   1072 ; X86-NEXT:    vmovaps %ymm1, %ymm0
   1073 ; X86-NEXT:    retl
   1074 ;
   1075 ; X64-LABEL: test_mm256_i32gather_ps:
   1076 ; X64:       # %bb.0:
   1077 ; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
   1078 ; X64-NEXT:    vcmpeqps %ymm1, %ymm1, %ymm2
   1079 ; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm0,2), %ymm1
   1080 ; X64-NEXT:    vmovaps %ymm1, %ymm0
   1081 ; X64-NEXT:    retq
   1082   %arg0 = bitcast float *%a0 to i8*
   1083   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
   1084   %mask = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> zeroinitializer, <8 x float> zeroinitializer, i8 0)
   1085   %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, i8* %arg0, <8 x i32> %arg1, <8 x float> %mask, i8 2)
   1086   ret <8 x float> %call
   1087 }
   1088 declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, i8*, <8 x i32>, <8 x float>, i8) nounwind readonly
   1089 
   1090 define <8 x float> @test_mm256_mask_i32gather_ps(<8 x float> %a0, float *%a1, <4 x i64> %a2, <8 x float> %a3) {
   1091 ; X86-LABEL: test_mm256_mask_i32gather_ps:
   1092 ; X86:       # %bb.0:
   1093 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1094 ; X86-NEXT:    vgatherdps %ymm2, (%eax,%ymm1,2), %ymm0
   1095 ; X86-NEXT:    retl
   1096 ;
   1097 ; X64-LABEL: test_mm256_mask_i32gather_ps:
   1098 ; X64:       # %bb.0:
   1099 ; X64-NEXT:    vgatherdps %ymm2, (%rdi,%ymm1,2), %ymm0
   1100 ; X64-NEXT:    retq
   1101   %arg1 = bitcast float *%a1 to i8*
   1102   %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
   1103   %call = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> %a0, i8* %arg1, <8 x i32> %arg2, <8 x float> %a3, i8 2)
   1104   ret <8 x float> %call
   1105 }
   1106 
   1107 define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
   1108 ; X86-LABEL: test_mm_i64gather_epi32:
   1109 ; X86:       # %bb.0:
   1110 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1111 ; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
   1112 ; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1113 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
   1114 ; X86-NEXT:    vmovdqa %xmm1, %xmm0
   1115 ; X86-NEXT:    retl
   1116 ;
   1117 ; X64-LABEL: test_mm_i64gather_epi32:
   1118 ; X64:       # %bb.0:
   1119 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
   1120 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1121 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
   1122 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
   1123 ; X64-NEXT:    retq
   1124   %arg0 = bitcast i32 *%a0 to i8*
   1125   %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
   1126   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> undef, i8* %arg0, <2 x i64> %a1, <4 x i32> %mask, i8 2)
   1127   %bc = bitcast <4 x i32> %call to <2 x i64>
   1128   ret <2 x i64> %bc
   1129 }
   1130 declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*, <2 x i64>, <4 x i32>, i8) nounwind readonly
   1131 
   1132 define <2 x i64> @test_mm_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
   1133 ; X86-LABEL: test_mm_mask_i64gather_epi32:
   1134 ; X86:       # %bb.0:
   1135 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1136 ; X86-NEXT:    vpgatherqd %xmm2, (%eax,%xmm1,2), %xmm0
   1137 ; X86-NEXT:    retl
   1138 ;
   1139 ; X64-LABEL: test_mm_mask_i64gather_epi32:
   1140 ; X64:       # %bb.0:
   1141 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm1,2), %xmm0
   1142 ; X64-NEXT:    retq
   1143   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   1144   %arg1 = bitcast i32 *%a1 to i8*
   1145   %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
   1146   %call = call <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32> %arg0, i8* %arg1, <2 x i64> %a2, <4 x i32> %arg3, i8 2)
   1147   %bc = bitcast <4 x i32> %call to <2 x i64>
   1148   ret <2 x i64> %bc
   1149 }
   1150 
define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i32 *%a0 to i8*
  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> undef, i8* %arg0, <4 x i64> %a1, <4 x i32> %mask, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32>, i8*, <4 x i64>, <4 x i32>, i8) nounwind readonly

define <2 x i64> @test_mm256_mask_i64gather_epi32(<2 x i64> %a0, i32 *%a1, <4 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast i32 *%a1 to i8*
  %arg3 = bitcast <2 x i64> %a3 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %arg0, i8* %arg1, <4 x i64> %a2, <4 x i32> %arg3, i8 2)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}

define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovdqa %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> undef, i8* %arg0, <2 x i64> %a1, <2 x i64> <i64 -1, i64 -1>, i8 2)
  ret <2 x i64> %call
}
declare <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64>, i8*, <2 x i64>, <2 x i64>, i8) nounwind readonly

define <2 x i64> @test_mm_mask_i64gather_epi64(<2 x i64> %a0, i64 *%a1, <2 x i64> %a2, <2 x i64> %a3) {
; X86-LABEL: test_mm_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <2 x i64> @llvm.x86.avx2.gather.q.q(<2 x i64> %a0, i8* %arg1, <2 x i64> %a2, <2 x i64> %a3, i8 2)
  ret <2 x i64> %call
}

define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovdqa %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovdqa %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64 *%a0 to i8*
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> undef, i8* %arg0, <4 x i64> %a1, <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>, i8 2)
  ret <4 x i64> %call
}
declare <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64>, i8*, <4 x i64>, <4 x i64>, i8) nounwind readonly

define <4 x i64> @test_mm256_mask_i64gather_epi64(<4 x i64> %a0, i64 *%a1, <4 x i64> %a2, <4 x i64> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpgatherqq %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <4 x i64> @llvm.x86.avx2.gather.q.q.256(<4 x i64> %a0, i8* %arg1, <4 x i64> %a2, <4 x i64> %a3, i8 2)
  ret <4 x i64> %call
}

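; NOTE: For the floating-point gathers the all-ones mask is written as a
; trivially-true compare (fcmp oeq of zero with itself, sign-extended to
; all-ones bits); the 256-bit double case goes through llvm.x86.avx.cmp.pd.256
; instead, which lowers to vcmpeqpd.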
define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovapd %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
  %sext = sext <2 x i1> %cmp to <2 x i64>
  %mask = bitcast <2 x i64> %sext to <2 x double>
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> undef, i8* %arg0, <2 x i64> %a1, <2 x double> %mask, i8 2)
  ret <2 x double> %call
}
declare <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double>, i8*, <2 x i64>, <2 x double>, i8) nounwind readonly

define <2 x double> @test_mm_mask_i64gather_pd(<2 x double> %a0, double *%a1, <2 x i64> %a2, <2 x double> %a3) {
; X86-LABEL: test_mm_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast double *%a1 to i8*
  %call = call <2 x double> @llvm.x86.avx2.gather.q.pd(<2 x double> %a0, i8* %arg1, <2 x i64> %a2, <2 x double> %a3, i8 2)
  ret <2 x double> %call
}

define <4 x double> @test_mm256_i64gather_pd(double *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm0,2), %ymm1
; X86-NEXT:    vmovapd %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X64-NEXT:    vcmpeqpd %ymm1, %ymm1, %ymm2
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm0,2), %ymm1
; X64-NEXT:    vmovapd %ymm1, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double *%a0 to i8*
  %mask = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> zeroinitializer, <4 x double> zeroinitializer, i8 0)
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> undef, i8* %arg0, <4 x i64> %a1, <4 x double> %mask, i8 2)
  ret <4 x double> %call
}
declare <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double>, i8*, <4 x i64>, <4 x double>, i8) nounwind readonly

define <4 x double> @test_mm256_mask_i64gather_pd(<4 x double> %a0, i64 *%a1, <4 x i64> %a2, <4 x double> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqpd %ymm2, (%eax,%ymm1,2), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_pd:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqpd %ymm2, (%rdi,%ymm1,2), %ymm0
; X64-NEXT:    retq
  %arg1 = bitcast i64 *%a1 to i8*
  %call = call <4 x double> @llvm.x86.avx2.gather.q.pd.256(<4 x double> %a0, i8* %arg1, <4 x i64> %a2, <4 x double> %a3, i8 2)
  ret <4 x double> %call
}

define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
; X86-LABEL: test_mm_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> undef, i8* %arg0, <2 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*, <2 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm_mask_i64gather_ps(<4 x float> %a0, float *%a1, <2 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%xmm1,2), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm1,2), %xmm0
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float> %a0, i8* %arg1, <2 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
; X86-LABEL: test_mm256_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
; X64-NEXT:    vmovaps %xmm1, %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float *%a0 to i8*
  %cmp = fcmp oeq <4 x float> zeroinitializer, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %mask = bitcast <4 x i32> %sext to <4 x float>
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> undef, i8* %arg0, <4 x i64> %a1, <4 x float> %mask, i8 2)
  ret <4 x float> %call
}
declare <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float>, i8*, <4 x i64>, <4 x float>, i8) nounwind readonly

define <4 x float> @test_mm256_mask_i64gather_ps(<4 x float> %a0, float *%a1, <4 x i64> %a2, <4 x float> %a3) {
; X86-LABEL: test_mm256_mask_i64gather_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_mask_i64gather_ps:
; X64:       # %bb.0:
; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm1,2), %xmm0
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg1 = bitcast float *%a1 to i8*
  %call = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0, i8* %arg1, <4 x i64> %a2, <4 x float> %a3, i8 2)
  ret <4 x float> %call
}

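; NOTE: _mm256_inserti128_si256 is modeled as a pair of shufflevectors;
; inserting into the low half is matched to a blend, while inserting into
; the high half is matched to vinsertf128, as the two tests below show.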
define <4 x i64> @test0_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test0_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test1_mm256_inserti128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test1_mm256_inserti128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_madd_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_madd_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_maddubs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_maddubs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone

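; NOTE: The maskload/maskstore tests key off the sign bit of each mask
; element: masked-off load elements read as zero and masked-off store
; elements leave memory untouched.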
define <2 x i64> @test_mm_maskload_epi32(i32* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %call = call <4 x i32> @llvm.x86.avx2.maskload.d(i8* %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %call to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.maskload.d(i8*, <4 x i32>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi32(i32* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i32* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.maskload.d.256(i8* %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.maskload.d.256(i8*, <8 x i32>) nounwind readonly

define <2 x i64> @test_mm_maskload_epi64(i64* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <2 x i64> @llvm.x86.avx2.maskload.q(i8* %arg0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.maskload.q(i8*, <2 x i64>) nounwind readonly

define <4 x i64> @test_mm256_maskload_epi64(i64* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.maskload.q.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.maskload.q.256(i8*, <4 x i64>) nounwind readonly

define void @test_mm_maskstore_epi32(float* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %arg2 = bitcast <2 x i64> %a2 to <4 x i32>
  call void @llvm.x86.avx2.maskstore.d(i8* %arg0, <4 x i32> %arg1, <4 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind readnone

define void @test_mm256_maskstore_epi32(float* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %arg2 = bitcast <4 x i64> %a2 to <8 x i32>
  call void @llvm.x86.avx2.maskstore.d.256(i8* %arg0, <8 x i32> %arg1, <8 x i32> %arg2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind readnone

define void @test_mm_maskstore_epi64(i64* %a0, <2 x i64> %a1, <2 x i64> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q(i8* %arg0, <2 x i64> %a1, <2 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind readnone

define void @test_mm256_maskstore_epi64(i64* %a0, <4 x i64> %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_epi64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to i8*
  call void @llvm.x86.avx2.maskstore.q.256(i8* %arg0, <4 x i64> %a1, <4 x i64> %a2)
  ret void
}
declare void @llvm.x86.avx2.maskstore.q.256(i8*, <4 x i64>, <4 x i64>) nounwind readnone

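; NOTE: The integer max/min intrinsics are expressed as the canonical
; icmp+select pattern, which instruction selection matches back to the
; single vpmax*/vpmin* instructions.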
define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp sgt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp sgt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp sgt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ugt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ugt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_max_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmaxud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ugt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp slt <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp slt <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminsd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp slt <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminub %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %cmp = icmp ult <32 x i8> %arg0, %arg1
  %sel = select <32 x i1> %cmp, <32 x i8> %arg0, <32 x i8> %arg1
  %bc = bitcast <32 x i8> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %cmp = icmp ult <16 x i16> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i16> %arg0, <16 x i16> %arg1
  %bc = bitcast <16 x i16> %sel to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_min_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpminud %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %cmp = icmp ult <8 x i32> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i32> %arg0, <8 x i32> %arg1
  %bc = bitcast <8 x i32> %sel to <4 x i64>
  ret <4 x i64> %bc
}

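; NOTE: vpmovmskb packs the sign bit of each of the 32 bytes into a 32-bit
; general-purpose register.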
define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_movemask_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmovmskb %ymm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
  ret i32 %res
}
declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_mpsadbw_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mpsadbw_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmpsadbw $3, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8> %arg0, <32 x i8> %arg1, i8 3)
  %bc = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.mpsadbw(<32 x i8>, <32 x i8>, i8) nounwind readnone

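; NOTE: _mm256_mul_epi32 sign-extends the low 32 bits of each 64-bit element
; (shl by 32, then exact ashr by 32) and _mm256_mul_epu32 zero-extends them
; (and with 0xffffffff); the multiplies are then matched to vpmuldq and
; vpmuludq respectively.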
define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $32, %ymm0, %ymm0
; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm2
; CHECK-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; CHECK-NEXT:    vpsllq $32, %ymm1, %ymm1
; CHECK-NEXT:    vpsrad $31, %ymm1, %ymm2
; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT:    vpmuldq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = shl <4 x i64> %a0, <i64 32, i64 32, i64 32, i64 32>
  %A1 = ashr exact <4 x i64> %A, <i64 32, i64 32, i64 32, i64 32>
  %B = shl <4 x i64> %a1, <i64 32, i64 32, i64 32, i64 32>
  %B1 = ashr exact <4 x i64> %B, <i64 32, i64 32, i64 32, i64 32>
  %res = mul nsw <4 x i64> %A1, %B1
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mul_epu32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
; CHECK-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %A = and <4 x i64> %a0, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %B = and <4 x i64> %a1, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
  %res = mul nuw <4 x i64> %A, %B
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhi_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhi_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mulhrs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mulhrs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pmul.hr.sw(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = mul <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_mullo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = mul <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_or_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = or <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_packs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packs_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packs_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_packus_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_packus_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone

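; NOTE: The immediate-controlled 128-bit-lane permute is modeled as a
; shufflevector over the 64-bit elements; this test selects the high 128-bit
; half of both sources.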
define <4 x i64> @test_mm256_permute2x128_si256(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permute2x128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.vperm2i128(<4 x i64>, <4 x i64>, i8) nounwind readonly

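; NOTE: The permute4x64 intrinsics become single-source shufflevectors, and
; llc may choose the floating-point vpermpd encoding for both the integer
; and double variants, as the checks below show.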
define <4 x i64> @test_mm256_permute4x64_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[3,0,2,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 0, i32 2, i32 0>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_permute4x64_pd(<4 x double> %a0) {
; CHECK-LABEL: test_mm256_permute4x64_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> undef, <4 x i32> <i32 1, i32 2, i32 1, i32 0>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_permutevar8x32_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly

define <8 x float> @test_mm256_permutevar8x32_ps(<8 x float> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_permutevar8x32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) nounwind readonly

define <4 x i64> @test_mm256_sad_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sad_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %arg0, <32 x i8> %arg1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shuffle_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shuffle_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,3,0,0,7,7,4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %shuf = shufflevector <8 x i32> %arg0, <8 x i32> undef, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
  %res = bitcast <8 x i32> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shuffle_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_shuffle_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %shuf = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_shufflehi_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflehi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,6,6,5,8,9,10,11,15,14,14,13]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 6, i32 5, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 14, i32 13>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_shufflelo_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_shufflelo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[3,0,1,1,4,5,6,7,11,8,9,9,12,13,14,15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %shuf = shufflevector <16 x i16> %arg0, <16 x i16> undef, <16 x i32> <i32 3, i32 0, i32 1, i32 1, i32 4, i32 5, i32 6, i32 7, i32 11, i32 8, i32 9, i32 9, i32 12, i32 13, i32 14, i32 15>
  %res = bitcast <16 x i16> %shuf to <4 x i64>
  ret <4 x i64> %res
}

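; NOTE: vpsign* copies each element of the first operand negated, unchanged,
; or zeroed depending on whether the corresponding element of the second
; operand is negative, positive, or zero.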
define <4 x i64> @test_mm256_sign_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %call = call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %res = bitcast <32 x i8> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %call = call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %res = bitcast <16 x i16> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sign_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_sign_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsignd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %call = call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %arg0, <8 x i32> %arg1)
  %res = bitcast <8 x i32> %call to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32>, <8 x i32>) nounwind readnone

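; NOTE: Three shift flavors follow: the xmm-count forms (e.g. vpsllw) shift
; every element by the scalar count held in the low 64 bits of the second
; operand, the immediate forms take the count from the instruction encoding,
; and the per-element forms (e.g. vpsllvd) shift each element by its own count.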
define <4 x i64> @test_mm256_sll_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_sll_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_sll_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_slli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_slli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsllq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.pslli.q(<4 x i64>, i32) nounwind readnone

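; NOTE: _mm256_slli_si256 (and _mm256_srli_si256 further below) shift bytes
; within each 128-bit lane independently, which is why the shufflevector
; masks repeat the same pattern for elements 0-15 and 16-31.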
define <4 x i64> @test_mm256_slli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_slli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslldq {{.*#+}} ymm0 = zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12],zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26,27,28]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> zeroinitializer, <32 x i8> %arg0, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}

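; NOTE: For the per-element shifts, a count greater than or equal to the
; element width yields zero (or all sign bits for the arithmetic vpsravd).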
   2214 define <2 x i64> @test_mm_sllv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
   2215 ; CHECK-LABEL: test_mm_sllv_epi32:
   2216 ; CHECK:       # %bb.0:
   2217 ; CHECK-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
   2218 ; CHECK-NEXT:    ret{{[l|q]}}
   2219   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   2220   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   2221   %res = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %arg0, <4 x i32> %arg1)
   2222   %bc = bitcast <4 x i32> %res to <2 x i64>
   2223   ret <2 x i64> %bc
   2224 }
   2225 declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
   2226 
   2227 define <4 x i64> @test_mm256_sllv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
   2228 ; CHECK-LABEL: test_mm256_sllv_epi32:
   2229 ; CHECK:       # %bb.0:
   2230 ; CHECK-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
   2231 ; CHECK-NEXT:    ret{{[l|q]}}
   2232   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   2233   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
   2234   %res = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
   2235   %bc = bitcast <8 x i32> %res to <4 x i64>
   2236   ret <4 x i64> %bc
   2237 }
   2238 declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
   2239 
   2240 define <2 x i64> @test_mm_sllv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
   2241 ; CHECK-LABEL: test_mm_sllv_epi64:
   2242 ; CHECK:       # %bb.0:
   2243 ; CHECK-NEXT:    vpsllvq %xmm1, %xmm0, %xmm0
   2244 ; CHECK-NEXT:    ret{{[l|q]}}
   2245   %res = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
   2246   ret <2 x i64> %res
   2247 }
   2248 declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
   2249 
   2250 define <4 x i64> @test_mm256_sllv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
   2251 ; CHECK-LABEL: test_mm256_sllv_epi64:
   2252 ; CHECK:       # %bb.0:
   2253 ; CHECK-NEXT:    vpsllvq %ymm1, %ymm0, %ymm0
   2254 ; CHECK-NEXT:    ret{{[l|q]}}
   2255   %res = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
   2256   ret <4 x i64> %res
   2257 }
   2258 declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
   2259 
   2260 define <4 x i64> @test_mm256_sra_epi16(<4 x i64> %a0, <2 x i64> %a1) {
   2261 ; CHECK-LABEL: test_mm256_sra_epi16:
   2262 ; CHECK:       # %bb.0:
   2263 ; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
   2264 ; CHECK-NEXT:    ret{{[l|q]}}
   2265   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   2266   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
   2267   %res = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %arg0, <8 x i16> %arg1)
   2268   %bc = bitcast <16 x i16> %res to <4 x i64>
   2269   ret <4 x i64> %bc
   2270 }
   2271 declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
   2272 
   2273 define <4 x i64> @test_mm256_sra_epi32(<4 x i64> %a0, <2 x i64> %a1) {
   2274 ; CHECK-LABEL: test_mm256_sra_epi32:
   2275 ; CHECK:       # %bb.0:
   2276 ; CHECK-NEXT:    vpsrad %xmm1, %ymm0, %ymm0
   2277 ; CHECK-NEXT:    ret{{[l|q]}}
   2278   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   2279   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   2280   %res = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %arg0, <4 x i32> %arg1)
   2281   %bc = bitcast <8 x i32> %res to <4 x i64>
   2282   ret <4 x i64> %bc
   2283 }
   2284 declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
   2285 
   2286 define <4 x i64> @test_mm256_srai_epi16(<4 x i64> %a0) {
   2287 ; CHECK-LABEL: test_mm256_srai_epi16:
   2288 ; CHECK:       # %bb.0:
   2289 ; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0
   2290 ; CHECK-NEXT:    ret{{[l|q]}}
   2291   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   2292   %res = call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %arg0, i32 3)
   2293   %bc = bitcast <16 x i16> %res to <4 x i64>
   2294   ret <4 x i64> %bc
   2295 }
   2296 declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32) nounwind readnone
   2297 
   2298 define <4 x i64> @test_mm256_srai_epi32(<4 x i64> %a0) {
   2299 ; CHECK-LABEL: test_mm256_srai_epi32:
   2300 ; CHECK:       # %bb.0:
   2301 ; CHECK-NEXT:    vpsrad $3, %ymm0, %ymm0
   2302 ; CHECK-NEXT:    ret{{[l|q]}}
   2303   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   2304   %res = call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %arg0, i32 3)
   2305   %bc = bitcast <8 x i32> %res to <4 x i64>
   2306   ret <4 x i64> %bc
   2307 }
   2308 declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32) nounwind readnone
   2309 
   2310 define <2 x i64> @test_mm_srav_epi32(<2 x i64> %a0, <2 x i64> %a1) {
   2311 ; CHECK-LABEL: test_mm_srav_epi32:
   2312 ; CHECK:       # %bb.0:
   2313 ; CHECK-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
   2314 ; CHECK-NEXT:    ret{{[l|q]}}
   2315   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
   2316   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
   2317   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %arg0, <4 x i32> %arg1)
   2318   %bc = bitcast <4 x i32> %res to <2 x i64>
   2319   ret <2 x i64> %bc
   2320 }
   2321 declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
   2322 
   2323 define <4 x i64> @test_mm256_srav_epi32(<4 x i64> %a0, <4 x i64> %a1) {
   2324 ; CHECK-LABEL: test_mm256_srav_epi32:
   2325 ; CHECK:       # %bb.0:
   2326 ; CHECK-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
   2327 ; CHECK-NEXT:    ret{{[l|q]}}
   2328   %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
   2329   %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
   2330   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
   2331   %bc = bitcast <8 x i32> %res to <4 x i64>
   2332   ret <4 x i64> %bc
   2333 }
   2334 declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi16(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %arg0, <8 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi32(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srl_epi64(<4 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm256_srl_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srli_epi16(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16> %arg0, i32 3)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi32(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrld $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %arg0, i32 3)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_epi64(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $3, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %a0, i32 3)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64>, i32) nounwind readnone

define <4 x i64> @test_mm256_srli_si256(<4 x i64> %a0) {
; CHECK-LABEL: test_mm256_srli_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrldq {{.*#+}} ymm0 = ymm0[3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,ymm0[19,20,21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> zeroinitializer, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  %res = bitcast <32 x i8> %shuf to <4 x i64>
  ret <4 x i64> %res
}
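
; NOTE: _mm256_srli_si256 shifts each 128-bit lane right by the byte count
; independently, so the mask above encodes two separate 16-byte shifts:
; indices 3-15 then three zeros for the low lane, 19-31 then three zeros for
; the high lane. Indices of 32 and above select from the second shufflevector
; operand (zeroinitializer here), supplying the shifted-in zero bytes.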

define <2 x i64> @test_mm_srlv_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %arg0, <4 x i32> %arg1)
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi32(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %arg0, <8 x i32> %arg1)
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone

define <2 x i64> @test_mm_srlv_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; CHECK-LABEL: test_mm_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone

define <4 x i64> @test_mm256_srlv_epi64(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_srlv_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone

define <4 x i64> @test_mm256_stream_load_si256(<4 x i64> *%a0) {
; X86-LABEL: test_mm256_stream_load_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntdqa (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_load_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovntdqa (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> *%a0 to i8*
  %res = call <4 x i64> @llvm.x86.avx2.movntdqa(i8* %arg0)
  ret <4 x i64> %res
}
declare <4 x i64> @llvm.x86.avx2.movntdqa(i8*) nounwind readonly
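
; NOTE: vmovntdqa is a non-temporal (streaming) load that hints the data
; should bypass the cache hierarchy; the 256-bit form used here requires a
; 32-byte-aligned address. A rough C equivalent:
;   __m256i v = _mm256_stream_load_si256((__m256i const *)p); // p assumed 32-byte aligned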

define <4 x i64> @test_mm256_sub_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = sub <32 x i8> %arg0, %arg1
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = sub <16 x i16> %arg0, %arg1
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = sub <8 x i32> %arg0, %arg1
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = sub <4 x i64> %a0, %a1
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusb %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone

define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
; CHECK-LABEL: test_mm256_subs_epu16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
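
; NOTE: The saturating subtracts clamp instead of wrapping: the signed forms
; (vpsubsb/vpsubsw) clamp to [-128,127] and [-32768,32767], and the unsigned
; forms (vpsubusb/vpsubusw) clamp at 0. For example, in C:
;   _mm256_subs_epu8(_mm256_set1_epi8(10), _mm256_set1_epi8(20)); // every byte is 0, not -10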

define <4 x i64> @test_mm256_unpackhi_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpackhi_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x i64> %res
}
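
; NOTE: These 256-bit unpacks (the hi forms above and the lo forms below)
; interleave within each 128-bit lane rather than across the whole vector,
; i.e. they act as two independent 128-bit unpacks, which is why the shuffle
; masks split at the lane boundary. The epi32/epi64 cases lower to the
; FP-domain forms (vunpckhps/vunpckhpd, vunpcklps/vunpcklpd), which perform
; the identical byte-level shuffle.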

define <4 x i64> @test_mm256_unpacklo_epi8(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
  %res = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi16(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
  %res = shufflevector <16 x i16> %arg0, <16 x i16> %arg1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %arg0, <8 x i32> %arg1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_unpacklo_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_epi64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_xor_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = xor <4 x i64> %a0, %a1
  ret <4 x i64> %res
}
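
; NOTE: The plain IR xor on <4 x i64> is lowered to vxorps, the
; floating-point-domain encoding; for a bitwise operation this is
; interchangeable with vpxor, and the result bits are identical.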

declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone