; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=avx,aes,pclmul | FileCheck %s --check-prefixes=CHECK,X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx-builtins.c
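; To regenerate the CHECK lines after editing the RUN lines or the IR below,
; re-run the update script named above on this file, e.g. (test file path
; assumed, relative to an llvm checkout):
;   utils/update_llc_test_checks.py test/CodeGen/X86/avx-intrinsics-fast-isel.ll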

define <4 x double> @test_mm256_add_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fadd <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_add_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_add_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fadd <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <4 x double> @test_mm256_addsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_addsub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.addsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_addsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_addsub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.addsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_and_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = and <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_and_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = and <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %3 = xor <4 x i64> %1, <i64 -1, i64 -1, i64 -1, i64 -1>
  %res = and <4 x i64> %3, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_andnot_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_andnot_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vandnps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %3 = xor <8 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <8 x i32> %3, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}

define <4 x double> @test_mm256_blend_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_blend_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5],ymm0[6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_blend_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_blend_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4,5,6],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_blendv_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) nounwind {
; CHECK-LABEL: test_mm256_blendv_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_blendv_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) nounwind {
; CHECK-LABEL: test_mm256_blendv_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vblendvps %ymm2, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_broadcast_pd(<2 x double>* %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_pd:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %ld = load <2 x double>, <2 x double>* %a0
  %res = shufflevector <2 x double> %ld, <2 x double> %ld, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_broadcast_ps(<4 x float>* %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_ps:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
; X64-NEXT:    retq
  %ld = load <4 x float>, <4 x float>* %a0
  %res = shufflevector <4 x float> %ld, <4 x float> %ld, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_broadcast_sd(double* %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_sd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastsd (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_sd:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastsd (%rdi), %ymm0
; X64-NEXT:    retq
  %ld = load double, double* %a0
  %ins0 = insertelement <4 x double> undef, double %ld, i32 0
  %ins1 = insertelement <4 x double> %ins0, double %ld, i32 1
  %ins2 = insertelement <4 x double> %ins1, double %ld, i32 2
  %ins3 = insertelement <4 x double> %ins2, double %ld, i32 3
  ret <4 x double> %ins3
}

define <4 x float> @test_mm_broadcast_ss(float* %a0) nounwind {
; X86-LABEL: test_mm_broadcast_ss:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_broadcast_ss:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %xmm0
; X64-NEXT:    retq
  %ld = load float, float* %a0
  %ins0 = insertelement <4 x float> undef, float %ld, i32 0
  %ins1 = insertelement <4 x float> %ins0, float %ld, i32 1
  %ins2 = insertelement <4 x float> %ins1, float %ld, i32 2
  %ins3 = insertelement <4 x float> %ins2, float %ld, i32 3
  ret <4 x float> %ins3
}

define <8 x float> @test_mm256_broadcast_ss(float* %a0) nounwind {
; X86-LABEL: test_mm256_broadcast_ss:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vbroadcastss (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_broadcast_ss:
; X64:       # %bb.0:
; X64-NEXT:    vbroadcastss (%rdi), %ymm0
; X64-NEXT:    retq
  %ld = load float, float* %a0
  %ins0 = insertelement <8 x float> undef, float %ld, i32 0
  %ins1 = insertelement <8 x float> %ins0, float %ld, i32 1
  %ins2 = insertelement <8 x float> %ins1, float %ld, i32 2
  %ins3 = insertelement <8 x float> %ins2, float %ld, i32 3
  %ins4 = insertelement <8 x float> %ins3, float %ld, i32 4
  %ins5 = insertelement <8 x float> %ins4, float %ld, i32 5
  %ins6 = insertelement <8 x float> %ins5, float %ld, i32 6
  %ins7 = insertelement <8 x float> %ins6, float %ld, i32 7
  ret <8 x float> %ins7
}

define <8 x float> @test_mm256_castpd_ps(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x double> %a0 to <8 x float>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_castpd_si256(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x double> %a0 to <4 x i64>
  ret <4 x i64> %res
}

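; The 128-bit <-> 256-bit casts below compile to no instructions. The "# kill"
; lines are register-liveness annotations (xmm0 being redefined as the low half
; of ymm0); the only code emitted is vzeroupper when narrowing from ymm.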
define <4 x double> @test_mm256_castpd128_pd256(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd128_pd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x double> %res
}

define <2 x double> @test_mm256_castpd256_pd128(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_castpd256_pd128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 0, i32 1>
  ret <2 x double> %res
}

define <4 x double> @test_mm256_castps_pd(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <8 x float> %a0 to <4 x double>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_castps_si256(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <8 x float> %a0 to <4 x i64>
  ret <4 x i64> %res
}

define <8 x float> @test_mm256_castps128_ps256(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps128_ps256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x float> %res
}

define <4 x float> @test_mm256_castps256_ps128(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_castps256_ps128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x i64> @test_mm256_castsi128_si256(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> %a0, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_castsi256_pd(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x i64> %a0 to <4 x double>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_castsi256_ps(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %res = bitcast <4 x i64> %a0 to <8 x float>
  ret <8 x float> %res
}

define <2 x i64> @test_mm256_castsi256_si128(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_castsi256_si128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 0, i32 1>
  ret <2 x i64> %res
}

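; The rounding-control immediate 2 in the ceil tests below is _MM_FROUND_CEIL
; (round toward +infinity); the floor tests later in the file use 1
; (_MM_FROUND_FLOOR, round toward -infinity).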
define <4 x double> @test_mm256_ceil_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_ceil_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 2)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32) nounwind readnone

define <8 x float> @test_mm256_ceil_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_ceil_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $2, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 2)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32) nounwind readnone

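; Comparison immediate 13 used throughout the cmp tests below is the
; _CMP_GE_OS predicate (greater-than-or-equal, ordered, signaling), which is
; why the vcmpge* mnemonics appear in the CHECK lines.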
define <2 x double> @test_mm_cmp_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgepd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 13)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x double> @test_mm256_cmp_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmp_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgepd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double> %a0, <4 x double> %a1, i8 13)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.cmp.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

define <4 x float> @test_mm_cmp_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgeps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 13)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone

define <8 x float> @test_mm256_cmp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_cmp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgeps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 13)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define <2 x double> @test_mm_cmp_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_sd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgesd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 13)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone

define <4 x float> @test_mm_cmp_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_cmp_ss:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcmpgess %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 13)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

define <4 x double> @test_mm256_cvtepi32_pd(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtepi32_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res = sitofp <4 x i32> %arg0 to <4 x double>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_cvtepi32_ps(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtepi32_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = sitofp <8 x i32> %arg0 to <8 x float>
  ret <8 x float> %res
}

define <2 x i64> @test_mm256_cvtpd_epi32(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtpd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtpd2dq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0)
  %res = bitcast <4 x i32> %cvt to <2 x i64>
  ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x float> @test_mm256_cvtpd_ps(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtpd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtpd2ps %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvtps_epi32(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtps_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtps2dq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float> %a0)
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_cvtps_pd(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvtps_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fpext <4 x float> %a0 to <4 x double>
  ret <4 x double> %res
}

define <2 x i64> @test_mm256_cvttpd_epi32(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvttpd_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttpd2dq %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0)
  %res = bitcast <4 x i32> %cvt to <2 x i64>
  ret <2 x i64> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone

define <4 x i64> @test_mm256_cvttps_epi32(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_cvttps_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %cvt = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0)
  %res = bitcast <8 x i32> %cvt to <4 x i64>
  ret <4 x i64> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone

define <4 x double> @test_mm256_div_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_div_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fdiv <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_div_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_div_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdivps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fdiv <8 x float> %a0, %a1
  ret <8 x float> %res
}

define <8 x float> @test_mm256_dp_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_dp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i8 7)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define i32 @test_mm256_extract_epi8(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrb $15, %xmm0, %eax
; CHECK-NEXT:    movzbl %al, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %ext = extractelement <32 x i8> %arg0, i32 31
  %res = zext i8 %ext to i32
  ret i32 %res
}

define i32 @test_mm256_extract_epi16(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vpextrw $3, %xmm0, %eax
; CHECK-NEXT:    movzwl %ax, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %ext = extractelement <16 x i16> %arg0, i32 11
  %res = zext i16 %ext to i32
  ret i32 %res
}

define i32 @test_mm256_extract_epi32(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extract_epi32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vextractps $1, %xmm0, %eax
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = extractelement <8 x i32> %arg0, i32 5
  ret i32 %res
}

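; On the 32-bit target an i64 element is extracted as two 32-bit halves,
; returned in the eax/edx register pair.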
define i64 @test_mm256_extract_epi64(<4 x i64> %a0) nounwind {
; X86-LABEL: test_mm256_extract_epi64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vextractps $2, %xmm0, %eax
; X86-NEXT:    vextractps $3, %xmm0, %edx
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_extract_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %res = extractelement <4 x i64> %a0, i32 3
  ret i64 %res
}

define <2 x double> @test_mm256_extractf128_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x double> %res
}

define <4 x float> @test_mm256_extractf128_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ret <4 x float> %res
}

define <2 x i64> @test_mm256_extractf128_si256(<4 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_extractf128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
  ret <2 x i64> %res
}

define <4 x double> @test_mm256_floor_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_floor_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 1)
  ret <4 x double> %res
}

define <8 x float> @test_mm256_floor_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_floor_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 1)
  ret <8 x float> %res
}

define <4 x double> @test_mm256_hadd_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_hadd_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.hadd.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_hadd_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_hadd_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhaddps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.hadd.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x double> @test_mm256_hsub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_hsub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.hsub.pd.256(<4 x double>, <4 x double>) nounwind readnone

define <8 x float> @test_mm256_hsub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_hsub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vhsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.hsub.ps.256(<8 x float>, <8 x float>) nounwind readnone

define <4 x i64> @test_mm256_insert_epi8(<4 x i64> %a0, i8 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl %dil, %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
  %res = insertelement <32 x i8> %arg0, i8 %a1, i32 4
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi16(<4 x i64> %a0, i16 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi16:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vpinsrw $6, %edi, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
  %res = insertelement <16 x i16> %arg0, i16 %a1, i32 14
  %bc = bitcast <16 x i16> %res to <4 x i64>
  ret <4 x i64> %bc
}

define <4 x i64> @test_mm256_insert_epi32(<4 x i64> %a0, i32 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm1
; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm1
; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
  %res = insertelement <8 x i32> %arg0, i32 %a1, i32 3
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}

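; On the 32-bit target the i64 element is inserted as two 32-bit halves via
; a pair of vpinsrd instructions.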
define <4 x i64> @test_mm256_insert_epi64(<4 x i64> %a0, i64 %a1) nounwind {
; X86-LABEL: test_mm256_insert_epi64:
; X86:       # %bb.0:
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_insert_epi64:
; X64:       # %bb.0:
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
; X64-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-NEXT:    retq
  %res = insertelement <4 x i64> %a0, i64 %a1, i32 3
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_insertf128_pd(<4 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x double> %a1, <2 x double> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %a0, <4 x double> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_insertf128_ps(<8 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <4 x float> %a1, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %a0, <8 x float> %ext, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_insertf128_si256(<4 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_insertf128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = shufflevector <2 x i64> %a1, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %a0, <4 x i64> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_lddqu_si256(<4 x i64>* %a0) nounwind {
; X86-LABEL: test_mm256_lddqu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vlddqu (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_lddqu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vlddqu (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast <4 x i64>* %a0 to i8*
  %res = call <32 x i8> @llvm.x86.avx.ldu.dq.256(i8* %arg0)
  %bc = bitcast <32 x i8> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly

define <4 x double> @test_mm256_load_pd(double* %a0) nounwind {
; X86-LABEL: test_mm256_load_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  %res = load <4 x double>, <4 x double>* %arg0, align 32
  ret <4 x double> %res
}

define <8 x float> @test_mm256_load_ps(float* %a0) nounwind {
; X86-LABEL: test_mm256_load_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  %res = load <8 x float>, <8 x float>* %arg0, align 32
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_load_si256(<4 x i64>* %a0) nounwind {
; X86-LABEL: test_mm256_load_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_load_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x i64>, <4 x i64>* %a0, align 32
  ret <4 x i64> %res
}

define <4 x double> @test_mm256_loadu_pd(double* %a0) nounwind {
; X86-LABEL: test_mm256_loadu_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  %res = load <4 x double>, <4 x double>* %arg0, align 1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_loadu_ps(float* %a0) nounwind {
; X86-LABEL: test_mm256_loadu_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  %res = load <8 x float>, <8 x float>* %arg0, align 1
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_loadu_si256(<4 x i64>* %a0) nounwind {
; X86-LABEL: test_mm256_loadu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups (%eax), %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rdi), %ymm0
; X64-NEXT:    retq
  %res = load <4 x i64>, <4 x i64>* %a0, align 1
  ret <4 x i64> %res
}

define <8 x float> @test_mm256_loadu2_m128(float* %a0, float* %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %hi4 = load <4 x float>, <4 x float>* %arg0, align 1
  %hi8 = shufflevector <4 x float> %hi4, <4 x float> %hi4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %arg1 = bitcast float* %a1 to <4 x float>*
  %lo4 = load <4 x float>, <4 x float>* %arg1, align 1
  %lo8 = shufflevector <4 x float> %lo4, <4 x float> %lo4, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
  %res = shufflevector <8 x float> %lo8, <8 x float> %hi8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_loadu2_m128d(double* %a0, double* %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128d:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128d:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <2 x double>*
  %hi2 = load <2 x double>, <2 x double>* %arg0, align 1
  %hi4 = shufflevector <2 x double> %hi2, <2 x double> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %arg1 = bitcast double* %a1 to <2 x double>*
  %lo2 = load <2 x double>, <2 x double>* %arg1, align 1
  %lo4 = shufflevector <2 x double> %lo2, <2 x double> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x double> %lo4, <4 x double> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %res
}

define <4 x i64> @test_mm256_loadu2_m128i(i64* %a0, i64* %a1) nounwind {
; X86-LABEL: test_mm256_loadu2_m128i:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups (%eax), %xmm0
; X86-NEXT:    vinsertf128 $1, (%ecx), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_loadu2_m128i:
; X64:       # %bb.0:
; X64-NEXT:    vmovups (%rsi), %xmm0
; X64-NEXT:    vinsertf128 $1, (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast i64* %a0 to <2 x i64>*
  %hi2 = load <2 x i64>, <2 x i64>* %arg0, align 1
  %hi4 = shufflevector <2 x i64> %hi2, <2 x i64> %hi2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %arg1 = bitcast i64* %a1 to <2 x i64>*
  %lo2 = load <2 x i64>, <2 x i64>* %arg1, align 1
  %lo4 = shufflevector <2 x i64> %lo2, <2 x i64> %lo2, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x i64> %lo4, <4 x i64> %hi4, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x i64> %res
}

define <2 x double> @test_mm_maskload_pd(double* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  %res = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %arg0, <2 x i64> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.maskload.pd(i8*, <2 x i64>) nounwind readonly

define <4 x double> @test_mm256_maskload_pd(double* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  %res = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %arg0, <4 x i64> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.maskload.pd.256(i8*, <4 x i64>) nounwind readonly

define <4 x float> @test_mm_maskload_ps(float* %a0, <2 x i64> %a1) nounwind {
; X86-LABEL: test_mm_maskload_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps (%eax), %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskload_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %arg0, <4 x i32> %arg1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.maskload.ps(i8*, <4 x i32>) nounwind readonly

define <8 x float> @test_mm256_maskload_ps(float* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_maskload_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps (%eax), %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskload_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %arg0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.maskload.ps.256(i8*, <8 x i32>) nounwind readonly

define void @test_mm_maskstore_pd(double* %a0, <2 x i64> %a1, <2 x double> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  call void @llvm.x86.avx.maskstore.pd(i8* %arg0, <2 x i64> %a1, <2 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind

define void @test_mm256_maskstore_pd(double* %a0, <4 x i64> %a1, <4 x double> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovpd %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to i8*
  call void @llvm.x86.avx.maskstore.pd.256(i8* %arg0, <4 x i64> %a1, <4 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.pd.256(i8*, <4 x i64>, <4 x double>) nounwind

define void @test_mm_maskstore_ps(float* %a0, <2 x i64> %a1, <4 x float> %a2) nounwind {
; X86-LABEL: test_mm_maskstore_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps %xmm1, %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test_mm_maskstore_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  call void @llvm.x86.avx.maskstore.ps(i8* %arg0, <4 x i32> %arg1, <4 x float> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind

define void @test_mm256_maskstore_ps(float* %a0, <4 x i64> %a1, <8 x float> %a2) nounwind {
; X86-LABEL: test_mm256_maskstore_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmaskmovps %ymm1, %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_maskstore_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to i8*
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  call void @llvm.x86.avx.maskstore.ps.256(i8* %arg0, <8 x i32> %arg1, <8 x float> %a2)
  ret void
}
declare void @llvm.x86.avx.maskstore.ps.256(i8*, <8 x i32>, <8 x float>) nounwind
   1112 
   1113 define <4 x double> @test_mm256_max_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
   1114 ; CHECK-LABEL: test_mm256_max_pd:
   1115 ; CHECK:       # %bb.0:
   1116 ; CHECK-NEXT:    vmaxpd %ymm1, %ymm0, %ymm0
   1117 ; CHECK-NEXT:    ret{{[l|q]}}
   1118   %res = call <4 x double> @llvm.x86.avx.max.pd.256(<4 x double> %a0, <4 x double> %a1)
   1119   ret <4 x double> %res
   1120 }
   1121 declare <4 x double> @llvm.x86.avx.max.pd.256(<4 x double>, <4 x double>) nounwind readnone
   1122 
   1123 define <8 x float> @test_mm256_max_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
   1124 ; CHECK-LABEL: test_mm256_max_ps:
   1125 ; CHECK:       # %bb.0:
   1126 ; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0
   1127 ; CHECK-NEXT:    ret{{[l|q]}}
   1128   %res = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
   1129   ret <8 x float> %res
   1130 }
   1131 declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>) nounwind readnone
   1132 
   1133 define <4 x double> @test_mm256_min_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
   1134 ; CHECK-LABEL: test_mm256_min_pd:
   1135 ; CHECK:       # %bb.0:
   1136 ; CHECK-NEXT:    vminpd %ymm1, %ymm0, %ymm0
   1137 ; CHECK-NEXT:    ret{{[l|q]}}
   1138   %res = call <4 x double> @llvm.x86.avx.min.pd.256(<4 x double> %a0, <4 x double> %a1)
   1139   ret <4 x double> %res
   1140 }
   1141 declare <4 x double> @llvm.x86.avx.min.pd.256(<4 x double>, <4 x double>) nounwind readnone
   1142 
   1143 define <8 x float> @test_mm256_min_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
   1144 ; CHECK-LABEL: test_mm256_min_ps:
   1145 ; CHECK:       # %bb.0:
   1146 ; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm0
   1147 ; CHECK-NEXT:    ret{{[l|q]}}
   1148   %res = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
   1149   ret <8 x float> %res
   1150 }
   1151 declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind readnone
   1152 
   1153 define <4 x double> @test_mm256_movedup_pd(<4 x double> %a0) nounwind {
   1154 ; CHECK-LABEL: test_mm256_movedup_pd:
   1155 ; CHECK:       # %bb.0:
   1156 ; CHECK-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
   1157 ; CHECK-NEXT:    ret{{[l|q]}}
   1158   %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
   1159   ret <4 x double> %res
   1160 }
   1161 
   1162 define <8 x float> @test_mm256_movehdup_ps(<8 x float> %a0) nounwind {
   1163 ; CHECK-LABEL: test_mm256_movehdup_ps:
   1164 ; CHECK:       # %bb.0:
   1165 ; CHECK-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
   1166 ; CHECK-NEXT:    ret{{[l|q]}}
   1167   %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
   1168   ret <8 x float> %res
   1169 }
   1170 
   1171 define <8 x float> @test_mm256_moveldup_ps(<8 x float> %a0) nounwind {
   1172 ; CHECK-LABEL: test_mm256_moveldup_ps:
   1173 ; CHECK:       # %bb.0:
   1174 ; CHECK-NEXT:    vmovsldup {{.*#+}} ymm0 = ymm0[0,0,2,2,4,4,6,6]
   1175 ; CHECK-NEXT:    ret{{[l|q]}}
   1176   %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
   1177   ret <8 x float> %res
   1178 }
   1179 
   1180 define i32 @test_mm256_movemask_pd(<4 x double> %a0) nounwind {
   1181 ; CHECK-LABEL: test_mm256_movemask_pd:
   1182 ; CHECK:       # %bb.0:
   1183 ; CHECK-NEXT:    vmovmskpd %ymm0, %eax
   1184 ; CHECK-NEXT:    vzeroupper
   1185 ; CHECK-NEXT:    ret{{[l|q]}}
   1186   %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0)
   1187   ret i32 %res
   1188 }
   1189 declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
   1190 
   1191 define i32 @test_mm256_movemask_ps(<8 x float> %a0) nounwind {
   1192 ; CHECK-LABEL: test_mm256_movemask_ps:
   1193 ; CHECK:       # %bb.0:
   1194 ; CHECK-NEXT:    vmovmskps %ymm0, %eax
   1195 ; CHECK-NEXT:    vzeroupper
   1196 ; CHECK-NEXT:    ret{{[l|q]}}
   1197   %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0)
   1198   ret i32 %res
   1199 }
   1200 declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
   1201 
   1202 define <4 x double> @test_mm256_mul_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
   1203 ; CHECK-LABEL: test_mm256_mul_pd:
   1204 ; CHECK:       # %bb.0:
   1205 ; CHECK-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
   1206 ; CHECK-NEXT:    ret{{[l|q]}}
   1207   %res = fmul <4 x double> %a0, %a1
   1208   ret <4 x double> %res
   1209 }
   1210 
   1211 define <8 x float> @test_mm256_mul_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
   1212 ; CHECK-LABEL: test_mm256_mul_ps:
   1213 ; CHECK:       # %bb.0:
   1214 ; CHECK-NEXT:    vmulps %ymm1, %ymm0, %ymm0
   1215 ; CHECK-NEXT:    ret{{[l|q]}}
   1216   %res = fmul <8 x float> %a0, %a1
   1217   ret <8 x float> %res
   1218 }
   1219 
   1220 define <4 x double> @test_mm256_or_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
   1221 ; CHECK-LABEL: test_mm256_or_pd:
   1222 ; CHECK:       # %bb.0:
   1223 ; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
   1224 ; CHECK-NEXT:    ret{{[l|q]}}
   1225   %1 = bitcast <4 x double> %a0 to <4 x i64>
   1226   %2 = bitcast <4 x double> %a1 to <4 x i64>
   1227   %res = or <4 x i64> %1, %2
   1228   %bc = bitcast <4 x i64> %res to <4 x double>
   1229   ret <4 x double> %bc
   1230 }
   1231 
   1232 define <8 x float> @test_mm256_or_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
   1233 ; CHECK-LABEL: test_mm256_or_ps:
   1234 ; CHECK:       # %bb.0:
   1235 ; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0
   1236 ; CHECK-NEXT:    ret{{[l|q]}}
   1237   %1 = bitcast <8 x float> %a0 to <8 x i32>
   1238   %2 = bitcast <8 x float> %a1 to <8 x i32>
   1239   %res = or <8 x i32> %1, %2
   1240   %bc = bitcast <8 x i32> %res to <8 x float>
   1241   ret <8 x float> %bc
   1242 }

define <2 x double> @test_mm_permute_pd(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm_permute_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a0, <2 x i32> <i32 1, i32 0>
  ret <2 x double> %res
}

define <4 x double> @test_mm256_permute_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_permute_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a0, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x double> %res
}

define <4 x float> @test_mm_permute_ps(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %res
}

define <4 x float> @test2_mm_permute_ps(<4 x float> %a0) nounwind {
; CHECK-LABEL: test2_mm_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,1,2,3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 2, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <8 x float> @test_mm256_permute_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_permute_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a0, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x float> %res
}
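
; The permute tests use shufflevector with a constant mask, matching the
; immediate forms of vpermilpd/vpermilps. A hedged C-level sketch:
;   __m128 rev = _mm_permute_ps(v, 0x1B); // 0x1B = _MM_SHUFFLE(0,1,2,3): reverse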

define <4 x double> @test_mm256_permute2f128_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_permute2f128_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm1[0,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> zeroinitializer, <4 x double> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vperm2f128.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone

; PR26667
define <8 x float> @test_mm256_permute2f128_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_permute2f128_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a1, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vperm2f128.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone

define <4 x i64> @test_mm256_permute2f128_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_permute2f128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm1[2,3,0,1]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x i64> %a0 to <8 x i32>
  %2 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = shufflevector <8 x i32> %2, <8 x i32> %2, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %bc = bitcast <8 x i32> %res to <4 x i64>
  ret <4 x i64> %bc
}
declare <8 x i32> @llvm.x86.avx.vperm2f128.si.256(<8 x i32>, <8 x i32>, i8) nounwind readnone
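
; vperm2f128 selects whole 128-bit lanes; setting bit 3 (or bit 7) of the
; immediate zeroes a lane, which is why the pd test above shows "zero,zero"
; for the low half. A hedged sketch of the si256 test's lane swap:
;   __m256i swapped = _mm256_permute2f128_si256(a, a, 0x01); // swap the two lanes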

define <2 x double> @test_mm_permutevar_pd(<2 x double> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm_permutevar_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double> %a0, <2 x i64> %a1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vpermilvar.pd(<2 x double>, <2 x i64>) nounwind readnone

define <4 x double> @test_mm256_permutevar_pd(<4 x double> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_permutevar_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double> %a0, <4 x i64> %a1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vpermilvar.pd.256(<4 x double>, <4 x i64>) nounwind readnone

define <4 x float> @test_mm_permutevar_ps(<4 x float> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm_permutevar_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %res = call <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float> %a0, <4 x i32> %arg1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vpermilvar.ps(<4 x float>, <4 x i32>) nounwind readnone

define <8 x float> @test_mm256_permutevar_ps(<8 x float> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_permutevar_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpermilps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
  %res = call <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float> %a0, <8 x i32> %arg1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) nounwind readnone
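
; Unlike the immediate forms above, permutevar takes its shuffle selectors
; from a second vector (the register form of vpermilpd/ps): each f64 lane is
; steered by bit 1 of the matching i64, each f32 lane by the low 2 bits of
; the matching i32. Hedged sketch:
;   __m256 r = _mm256_permutevar_ps(v, idx); // per-lane variable shuffle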

define <8 x float> @test_mm256_rcp_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_rcp_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrcpps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %a0)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) nounwind readnone
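
; vrcpps is the fast approximate reciprocal (roughly 12 bits of precision):
;   __m256 approx = _mm256_rcp_ps(v); // ~1.5*2^-12 relative error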

define <4 x double> @test_mm256_round_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_round_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundpd $4, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a0, i32 4)
  ret <4 x double> %res
}

define <8 x float> @test_mm256_round_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_round_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vroundps $4, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a0, i32 4)
  ret <8 x float> %res
}
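
; The round tests pass i32 4, i.e. _MM_FROUND_CUR_DIRECTION (round using the
; current MXCSR mode), so a hedged C equivalent would be:
;   __m256d r = _mm256_round_pd(v, _MM_FROUND_CUR_DIRECTION);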

define <8 x float> @test_mm256_rsqrt_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_rsqrt_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vrsqrtps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %a0)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
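
; Like vrcpps, vrsqrtps is an approximation, not a full-precision sqrt:
;   __m256 r = _mm256_rsqrt_ps(v); // approximate 1/sqrt(x), ~12-bit accurate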

define <4 x i64> @test_mm256_set_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X86-LABEL: test_mm256_set_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovd %ecx, %xmm1
; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %r9b, %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %r8b, %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %dl, %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %sil, %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %dil, %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    vmovd %ecx, %xmm1
; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <32 x i8> undef,  i8 %a31, i32 0
  %res1  = insertelement <32 x i8> %res0,  i8 %a30, i32 1
  %res2  = insertelement <32 x i8> %res1,  i8 %a29, i32 2
  %res3  = insertelement <32 x i8> %res2,  i8 %a28, i32 3
  %res4  = insertelement <32 x i8> %res3,  i8 %a27, i32 4
  %res5  = insertelement <32 x i8> %res4,  i8 %a26, i32 5
  %res6  = insertelement <32 x i8> %res5,  i8 %a25, i32 6
  %res7  = insertelement <32 x i8> %res6,  i8 %a24, i32 7
  %res8  = insertelement <32 x i8> %res7,  i8 %a23, i32 8
  %res9  = insertelement <32 x i8> %res8,  i8 %a22, i32 9
  %res10 = insertelement <32 x i8> %res9,  i8 %a21, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a20, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a19, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a18, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a17, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a16, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a15, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a14, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a13, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a12, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a11, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a10, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a9 , i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a8 , i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a7 , i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a6 , i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a5 , i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a4 , i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a3 , i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a2 , i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a1 , i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a0 , i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X86-LABEL: test_mm256_set_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi16:
; X64:       # %bb.0:
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $2, %r9d, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $3, %r8d, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $4, %ecx, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $5, %edx, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $6, %esi, %xmm0, %xmm0
; X64-NEXT:    vpinsrw $7, %edi, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <16 x i16> undef,  i16 %a15, i32 0
  %res1  = insertelement <16 x i16> %res0,  i16 %a14, i32 1
  %res2  = insertelement <16 x i16> %res1,  i16 %a13, i32 2
  %res3  = insertelement <16 x i16> %res2,  i16 %a12, i32 3
  %res4  = insertelement <16 x i16> %res3,  i16 %a11, i32 4
  %res5  = insertelement <16 x i16> %res4,  i16 %a10, i32 5
  %res6  = insertelement <16 x i16> %res5,  i16 %a9 , i32 6
  %res7  = insertelement <16 x i16> %res6,  i16 %a8 , i32 7
  %res8  = insertelement <16 x i16> %res7,  i16 %a7 , i32 8
  %res9  = insertelement <16 x i16> %res8,  i16 %a6 , i32 9
  %res10 = insertelement <16 x i16> %res9,  i16 %a5 , i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a4 , i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a3 , i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a2 , i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a1 , i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a0 , i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X86-LABEL: test_mm256_set_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %ecx, %xmm0
; X64-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
; X64-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; X64-NEXT:    vpinsrd $2, %r9d, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $3, %r8d, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a7, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a6, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a5, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a4, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a3, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a2, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a1, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X86-LABEL: test_mm256_set_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vmovq %rsi, %xmm1
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %rcx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a3, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a2, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a1, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
  ret <4 x i64> %res3
}
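
; In all four set_epi tests the FIRST argument lands in the HIGHEST element
; (%a0 is inserted at the top index), matching the _mm256_set_* convention:
;   __m256i v = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0); // e0 = element 0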

define <8 x float> @test_mm256_set_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a1, <4 x float> %a0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_set_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128d:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x double> %a0 to <4 x float>
  %arg1 = bitcast <2 x double> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x double>
  ret <4 x double> %bc
}

define <4 x i64> @test_mm256_set_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_set_m128i:
; CHECK:       # %bb.0:
; CHECK-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x float>
  %arg1 = bitcast <2 x i64> %a1 to <4 x float>
  %res = shufflevector <4 x float> %arg1, <4 x float> %arg0, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %bc = bitcast <8 x float> %res to <4 x i64>
  ret <4 x i64> %bc
}
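
; set_m128* takes (high, low), so the lowering is a single vinsertf128 of the
; first argument into the upper lane. Hedged sketch:
;   __m256 v = _mm256_set_m128(hi, lo); // lo -> bits 0..127, hi -> bits 128..255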

define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X86-LABEL: test_mm256_set_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm3[0],xmm2[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a3, i32 0
  %res1 = insertelement <4 x double> %res0, double %a2, i32 1
  %res2 = insertelement <4 x double> %res1, double %a1, i32 2
  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X86-LABEL: test_mm256_set_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0]
; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set_ps:
; X64:       # %bb.0:
; X64-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a7, i32 0
  %res1 = insertelement <8 x float> %res0, float %a6, i32 1
  %res2 = insertelement <8 x float> %res1, float %a5, i32 2
  %res3 = insertelement <8 x float> %res2, float %a4, i32 3
  %res4 = insertelement <8 x float> %res3, float %a3, i32 4
  %res5 = insertelement <8 x float> %res4, float %a2, i32 5
  %res6 = insertelement <8 x float> %res5, float %a1, i32 6
  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
  ret <8 x float> %res7
}
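
; set_pd/set_ps follow the same highest-element-first argument ordering:
;   __m256d v = _mm256_set_pd(d3, d2, d1, d0); // d0 is element 0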

define <4 x i64> @test_mm256_set1_epi8(i8 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X86-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl %dil, %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; X64-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <32 x i8> undef,  i8 %a0, i32 0
  %res1  = insertelement <32 x i8> %res0,  i8 %a0, i32 1
  %res2  = insertelement <32 x i8> %res1,  i8 %a0, i32 2
  %res3  = insertelement <32 x i8> %res2,  i8 %a0, i32 3
  %res4  = insertelement <32 x i8> %res3,  i8 %a0, i32 4
  %res5  = insertelement <32 x i8> %res4,  i8 %a0, i32 5
  %res6  = insertelement <32 x i8> %res5,  i8 %a0, i32 6
  %res7  = insertelement <32 x i8> %res6,  i8 %a0, i32 7
  %res8  = insertelement <32 x i8> %res7,  i8 %a0, i32 8
  %res9  = insertelement <32 x i8> %res8,  i8 %a0, i32 9
  %res10 = insertelement <32 x i8> %res9,  i8 %a0, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a0, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a0, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a0, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a0, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a0, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a0, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a0, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a0, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a0, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a0, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a0, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a0, i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a0, i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a0, i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a0, i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a0, i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a0, i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a0, i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a0, i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a0, i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a0, i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; X86-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi16:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <16 x i16> undef,  i16 %a0, i32 0
  %res1  = insertelement <16 x i16> %res0,  i16 %a0, i32 1
  %res2  = insertelement <16 x i16> %res1,  i16 %a0, i32 2
  %res3  = insertelement <16 x i16> %res2,  i16 %a0, i32 3
  %res4  = insertelement <16 x i16> %res3,  i16 %a0, i32 4
  %res5  = insertelement <16 x i16> %res4,  i16 %a0, i32 5
  %res6  = insertelement <16 x i16> %res5,  i16 %a0, i32 6
  %res7  = insertelement <16 x i16> %res6,  i16 %a0, i32 7
  %res8  = insertelement <16 x i16> %res7,  i16 %a0, i32 8
  %res9  = insertelement <16 x i16> %res8,  i16 %a0, i32 9
  %res10 = insertelement <16 x i16> %res9,  i16 %a0, i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a0, i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a0, i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a0, i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a0, i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a0, i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi32(i32 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %edi, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a0, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a0, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a0, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a0, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a0, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a0, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a0, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_set1_epi64x(i64 %a0) nounwind {
; X86-LABEL: test_mm256_set1_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %eax, %xmm0, %xmm0
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rdi, %xmm0
; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a0, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a0, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a0, i32 3
  ret <4 x i64> %res3
}

define <4 x double> @test_mm256_set1_pd(double %a0) nounwind {
; X86-LABEL: test_mm256_set1_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a0, i32 0
  %res1 = insertelement <4 x double> %res0, double %a0, i32 1
  %res2 = insertelement <4 x double> %res1, double %a0, i32 2
  %res3 = insertelement <4 x double> %res2, double %a0, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_set1_ps(float %a0) nounwind {
; X86-LABEL: test_mm256_set1_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_set1_ps:
; X64:       # %bb.0:
; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a0, i32 0
  %res1 = insertelement <8 x float> %res0, float %a0, i32 1
  %res2 = insertelement <8 x float> %res1, float %a0, i32 2
  %res3 = insertelement <8 x float> %res2, float %a0, i32 3
  %res4 = insertelement <8 x float> %res3, float %a0, i32 4
  %res5 = insertelement <8 x float> %res4, float %a0, i32 5
  %res6 = insertelement <8 x float> %res5, float %a0, i32 6
  %res7 = insertelement <8 x float> %res6, float %a0, i32 7
  ret <8 x float> %res7
}
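
; The set1 tests all splat within an xmm first (vpshufb/vpshufd/vmovddup/
; vpermilps) and then vinsertf128 the result into both lanes, since AVX1 has
; no register-source 256-bit broadcast. Hedged sketch:
;   __m256 v = _mm256_set1_ps(x); // x replicated into all 8 elements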

define <4 x i64> @test_mm256_setr_epi8(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15, i8 %a16, i8 %a17, i8 %a18, i8 %a19, i8 %a20, i8 %a21, i8 %a22, i8 %a23, i8 %a24, i8 %a25, i8 %a26, i8 %a27, i8 %a28, i8 %a29, i8 %a30, i8 %a31) nounwind {
; X86-LABEL: test_mm256_setr_epi8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovd %ecx, %xmm1
; X86-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi8:
; X64:       # %bb.0:
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    vpinsrb $1, %r10d, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
; X64-NEXT:    movzbl %sil, %eax
; X64-NEXT:    movzbl %dil, %esi
; X64-NEXT:    vmovd %esi, %xmm1
; X64-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl %dl, %eax
; X64-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl %r8b, %eax
; X64-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl %r9b, %eax
; X64-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
; X64-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <32 x i8> undef,  i8 %a0 , i32 0
  %res1  = insertelement <32 x i8> %res0,  i8 %a1 , i32 1
  %res2  = insertelement <32 x i8> %res1,  i8 %a2 , i32 2
  %res3  = insertelement <32 x i8> %res2,  i8 %a3 , i32 3
  %res4  = insertelement <32 x i8> %res3,  i8 %a4 , i32 4
  %res5  = insertelement <32 x i8> %res4,  i8 %a5 , i32 5
  %res6  = insertelement <32 x i8> %res5,  i8 %a6 , i32 6
  %res7  = insertelement <32 x i8> %res6,  i8 %a7 , i32 7
  %res8  = insertelement <32 x i8> %res7,  i8 %a8 , i32 8
  %res9  = insertelement <32 x i8> %res8,  i8 %a9 , i32 9
  %res10 = insertelement <32 x i8> %res9,  i8 %a10, i32 10
  %res11 = insertelement <32 x i8> %res10, i8 %a11, i32 11
  %res12 = insertelement <32 x i8> %res11, i8 %a12, i32 12
  %res13 = insertelement <32 x i8> %res12, i8 %a13, i32 13
  %res14 = insertelement <32 x i8> %res13, i8 %a14, i32 14
  %res15 = insertelement <32 x i8> %res14, i8 %a15, i32 15
  %res16 = insertelement <32 x i8> %res15, i8 %a16, i32 16
  %res17 = insertelement <32 x i8> %res16, i8 %a17, i32 17
  %res18 = insertelement <32 x i8> %res17, i8 %a18, i32 18
  %res19 = insertelement <32 x i8> %res18, i8 %a19, i32 19
  %res20 = insertelement <32 x i8> %res19, i8 %a20, i32 20
  %res21 = insertelement <32 x i8> %res20, i8 %a21, i32 21
  %res22 = insertelement <32 x i8> %res21, i8 %a22, i32 22
  %res23 = insertelement <32 x i8> %res22, i8 %a23, i32 23
  %res24 = insertelement <32 x i8> %res23, i8 %a24, i32 24
  %res25 = insertelement <32 x i8> %res24, i8 %a25, i32 25
  %res26 = insertelement <32 x i8> %res25, i8 %a26, i32 26
  %res27 = insertelement <32 x i8> %res26, i8 %a27, i32 27
  %res28 = insertelement <32 x i8> %res27, i8 %a28, i32 28
  %res29 = insertelement <32 x i8> %res28, i8 %a29, i32 29
  %res30 = insertelement <32 x i8> %res29, i8 %a30, i32 30
  %res31 = insertelement <32 x i8> %res30, i8 %a31, i32 31
  %res = bitcast <32 x i8> %res31 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7, i16 %a8, i16 %a9, i16 %a10, i16 %a11, i16 %a12, i16 %a13, i16 %a14, i16 %a15) nounwind {
; X86-LABEL: test_mm256_setr_epi16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovd %eax, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $1, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $2, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $3, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $4, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $5, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi16:
; X64:       # %bb.0:
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vmovd %eax, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrw $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $4, %r8d, %xmm1, %xmm1
; X64-NEXT:    vpinsrw $5, %r9d, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
; X64-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; X64-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0  = insertelement <16 x i16> undef,  i16 %a0 , i32 0
  %res1  = insertelement <16 x i16> %res0,  i16 %a1 , i32 1
  %res2  = insertelement <16 x i16> %res1,  i16 %a2 , i32 2
  %res3  = insertelement <16 x i16> %res2,  i16 %a3 , i32 3
  %res4  = insertelement <16 x i16> %res3,  i16 %a4 , i32 4
  %res5  = insertelement <16 x i16> %res4,  i16 %a5 , i32 5
  %res6  = insertelement <16 x i16> %res5,  i16 %a6 , i32 6
  %res7  = insertelement <16 x i16> %res6,  i16 %a7 , i32 7
  %res8  = insertelement <16 x i16> %res7,  i16 %a8 , i32 8
  %res9  = insertelement <16 x i16> %res8,  i16 %a9 , i32 9
  %res10 = insertelement <16 x i16> %res9,  i16 %a10, i32 10
  %res11 = insertelement <16 x i16> %res10, i16 %a11, i32 11
  %res12 = insertelement <16 x i16> %res11, i16 %a12, i32 12
  %res13 = insertelement <16 x i16> %res12, i16 %a13, i32 13
  %res14 = insertelement <16 x i16> %res13, i16 %a14, i32 14
  %res15 = insertelement <16 x i16> %res14, i16 %a15, i32 15
  %res = bitcast <16 x i16> %res15 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7) nounwind {
; X86-LABEL: test_mm256_setr_epi32:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi32:
; X64:       # %bb.0:
; X64-NEXT:    vmovd %r8d, %xmm0
; X64-NEXT:    vpinsrd $1, %r9d, %xmm0, %xmm0
; X64-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; X64-NEXT:    vmovd %edi, %xmm1
; X64-NEXT:    vpinsrd $1, %esi, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
; X64-NEXT:    vpinsrd $3, %ecx, %xmm1, %xmm1
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x i32> undef, i32 %a0, i32 0
  %res1 = insertelement <8 x i32> %res0, i32 %a1, i32 1
  %res2 = insertelement <8 x i32> %res1, i32 %a2, i32 2
  %res3 = insertelement <8 x i32> %res2, i32 %a3, i32 3
  %res4 = insertelement <8 x i32> %res3, i32 %a4, i32 4
  %res5 = insertelement <8 x i32> %res4, i32 %a5, i32 5
  %res6 = insertelement <8 x i32> %res5, i32 %a6, i32 6
  %res7 = insertelement <8 x i32> %res6, i32 %a7, i32 7
  %res = bitcast <8 x i32> %res7 to <4 x i64>
  ret <4 x i64> %res
}

define <4 x i64> @test_mm256_setr_epi64x(i64 %a0, i64 %a1, i64 %a2, i64 %a3) nounwind {
; X86-LABEL: test_mm256_setr_epi64x:
; X86:       # %bb.0:
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_epi64x:
; X64:       # %bb.0:
; X64-NEXT:    vmovq %rcx, %xmm0
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X64-NEXT:    vmovq %rsi, %xmm1
; X64-NEXT:    vmovq %rdi, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x i64> undef, i64 %a0, i32 0
  %res1 = insertelement <4 x i64> %res0, i64 %a1, i32 1
  %res2 = insertelement <4 x i64> %res1, i64 %a2, i32 2
  %res3 = insertelement <4 x i64> %res2, i64 %a3, i32 3
  ret <4 x i64> %res3
}
   2352 
   2353 define <8 x float> @test_mm256_setr_m128(<4 x float> %a0, <4 x float> %a1) nounwind {
   2354 ; CHECK-LABEL: test_mm256_setr_m128:
   2355 ; CHECK:       # %bb.0:
   2356 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
   2357 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   2358 ; CHECK-NEXT:    ret{{[l|q]}}
   2359   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2360   ret <8 x float> %res
   2361 }
   2362 
   2363 define <4 x double> @test_mm256_setr_m128d(<2 x double> %a0, <2 x double> %a1) nounwind {
   2364 ; CHECK-LABEL: test_mm256_setr_m128d:
   2365 ; CHECK:       # %bb.0:
   2366 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
   2367 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   2368 ; CHECK-NEXT:    ret{{[l|q]}}
   2369   %arg0 = bitcast <2 x double> %a0 to <4 x float>
   2370   %arg1 = bitcast <2 x double> %a1 to <4 x float>
   2371   %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2372   %bc = bitcast <8 x float> %res to <4 x double>
   2373   ret <4 x double> %bc
   2374 }
   2375 
   2376 define <4 x i64> @test_mm256_setr_m128i(<2 x i64> %a0, <2 x i64> %a1) nounwind {
   2377 ; CHECK-LABEL: test_mm256_setr_m128i:
   2378 ; CHECK:       # %bb.0:
   2379 ; CHECK-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
   2380 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   2381 ; CHECK-NEXT:    ret{{[l|q]}}
   2382   %arg0 = bitcast <2 x i64> %a0 to <4 x float>
   2383   %arg1 = bitcast <2 x i64> %a1 to <4 x float>
   2384   %res = shufflevector <4 x float> %arg0, <4 x float> %arg1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   2385   %bc = bitcast <8 x float> %res to <4 x i64>
   2386   ret <4 x i64> %bc
   2387 }
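
; NOTE: All three _mm256_setr_m128* variants above bitcast to <4 x float>
; and use the same concatenating shufflevector, so they share identical
; codegen. The '# kill' lines are register-liveness annotations emitted by
; the backend, not instructions.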

define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, double %a3) nounwind {
; X86-LABEL: test_mm256_setr_pd:
; X86:       # %bb.0:
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm2 = mem[0],zero
; X86-NEXT:    vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <4 x double> undef, double %a0, i32 0
  %res1 = insertelement <4 x double> %res0, double %a1, i32 1
  %res2 = insertelement <4 x double> %res1, double %a2, i32 2
  %res3 = insertelement <4 x double> %res2, double %a3, i32 3
  ret <4 x double> %res3
}

define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7) nounwind {
; X86-LABEL: test_mm256_setr_ps:
; X86:       # %bb.0:
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0]
; X86-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_setr_ps:
; X64:       # %bb.0:
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; X64-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; X64-NEXT:    retq
  %res0 = insertelement <8 x float> undef, float %a0, i32 0
  %res1 = insertelement <8 x float> %res0, float %a1, i32 1
  %res2 = insertelement <8 x float> %res1, float %a2, i32 2
  %res3 = insertelement <8 x float> %res2, float %a3, i32 3
  %res4 = insertelement <8 x float> %res3, float %a4, i32 4
  %res5 = insertelement <8 x float> %res4, float %a5, i32 5
  %res6 = insertelement <8 x float> %res5, float %a6, i32 6
  %res7 = insertelement <8 x float> %res6, float %a7, i32 7
  ret <8 x float> %res7
}
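
; NOTE: For the scalar FP setr variants above, X86 must first load each
; argument from the stack (vmovss/vmovsd) before combining with
; vmovlhps/vinsertps, whereas on X64 the first eight FP arguments already
; arrive in %xmm0-%xmm7, so no loads are needed.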

define <4 x double> @test_mm256_setzero_pd() nounwind {
; CHECK-LABEL: test_mm256_setzero_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x double> zeroinitializer
}

define <8 x float> @test_mm256_setzero_ps() nounwind {
; CHECK-LABEL: test_mm256_setzero_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <8 x float> zeroinitializer
}

define <4 x i64> @test_mm256_setzero_si256() nounwind {
; CHECK-LABEL: test_mm256_setzero_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x i64> zeroinitializer
}
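
; NOTE: All three setzero tests above select the 128-bit zeroing idiom
; 'vxorps %xmm0, %xmm0, %xmm0': VEX-encoded 128-bit instructions implicitly
; clear bits 255:128 of the ymm register, so no 256-bit xor is required.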

define <4 x double> @test_mm256_shuffle_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_shuffle_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_shuffle_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_shuffle_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,0],ymm1[0,0],ymm0[4,4],ymm1[4,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 0, i32 8, i32 8, i32 4, i32 4, i32 12, i32 12>
  ret <8 x float> %res
}
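
; NOTE: The immediate-shuffle intrinsics are expressed as shufflevector with
; a constant mask, and AVX shuffles act per 128-bit lane: the pd mask
; <0, 4, 2, 6> takes element 0 of each source in both lanes, which is why it
; matches vunpcklpd. As an illustrative expansion (not a test in this file),
; _mm256_shuffle_pd(a, b, 5) would correspond to:
;   %r = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 4, i32 3, i32 6>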

define <4 x double> @test_mm256_sqrt_pd(<4 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_sqrt_pd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtpd %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0)
  ret <4 x double> %0
}

declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)

define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_sqrt_ps:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsqrtps %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
entry:
  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0)
  ret <8 x float> %0
}

declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
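
; NOTE: Unlike most tests in this file, the two sqrt tests use the
; target-independent llvm.sqrt.* intrinsics rather than x86-specific ones;
; they still select plain vsqrtpd/vsqrtps.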

define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_store_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_store_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  store <4 x double> %a1, <4 x double>* %arg0, align 32
  ret void
}

define void @test_mm256_store_ps(float* %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_store_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_store_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  store <8 x float> %a1, <8 x float>* %arg0, align 32
  ret void
}

define void @test_mm256_store_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_store_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovaps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_store_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovaps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, <4 x i64>* %a0, align 32
  ret void
}

define void @test_mm256_storeu_pd(double* %a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  store <4 x double> %a1, <4 x double>* %arg0, align 1
  ret void
}

define void @test_mm256_storeu_ps(float* %a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  store <8 x float> %a1, <8 x float>* %arg0, align 1
  ret void
}

define void @test_mm256_storeu_si256(<4 x i64>* %a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_storeu_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovups %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, <4 x i64>* %a0, align 1
  ret void
}
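
; NOTE: In the store tests above, the alignment on the IR store picks the
; instruction: 'align 32' selects the aligned vmovaps, while 'align 1' (the
; storeu variants) selects vmovups. Each of these functions also ends with
; vzeroupper, emitted because a ymm register was written, to avoid AVX/SSE
; transition penalties.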

define void @test_mm256_storeu2_m128(float* %a0, float* %a1, <8 x float> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %lo = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  store <4 x float> %lo, <4 x float>* %arg0, align 1
  %arg1 = bitcast float* %a1 to <4 x float>*
  %hi = shufflevector <8 x float> %a2, <8 x float> %a2, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x float> %hi, <4 x float>* %arg1, align 1
  ret void
}

define void @test_mm256_storeu2_m128d(double* %a0, double* %a1, <4 x double> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128d:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128d:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <2 x double>*
  %lo = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 0, i32 1>
  store <2 x double> %lo, <2 x double>* %arg0, align 1
  %arg1 = bitcast double* %a1 to <2 x double>*
  %hi = shufflevector <4 x double> %a2, <4 x double> %a2, <2 x i32> <i32 2, i32 3>
  store <2 x double> %hi, <2 x double>* %arg1, align 1
  ret void
}

define void @test_mm256_storeu2_m128i(<2 x i64>* %a0, <2 x i64>* %a1, <4 x i64> %a2) nounwind {
; X86-LABEL: test_mm256_storeu2_m128i:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    vmovups %xmm0, (%ecx)
; X86-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X86-NEXT:    vmovups %xmm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_storeu2_m128i:
; X64:       # %bb.0:
; X64-NEXT:    vmovups %xmm0, (%rdi)
; X64-NEXT:    vextractf128 $1, %ymm0, %xmm0
; X64-NEXT:    vmovups %xmm0, (%rsi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %lo = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 0, i32 1>
  store <2 x i64> %lo, <2 x i64>* %a0, align 1
  %hi = shufflevector <4 x i64> %a2, <4 x i64> %a2, <2 x i32> <i32 2, i32 3>
  store <2 x i64> %hi, <2 x i64>* %a1, align 1
  ret void
}
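
; NOTE: The _mm256_storeu2_* tests above split the 256-bit value into halves
; with shufflevector: the low half stores directly from %xmm0, while the
; high half is first extracted with vextractf128; both use unaligned
; vmovups.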

define void @test_mm256_stream_pd(double *%a0, <4 x double> %a1) nounwind {
; X86-LABEL: test_mm256_stream_pd:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_pd:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast double* %a0 to <4 x double>*
  store <4 x double> %a1, <4 x double>* %arg0, align 32, !nontemporal !0
  ret void
}

define void @test_mm256_stream_ps(float *%a0, <8 x float> %a1) nounwind {
; X86-LABEL: test_mm256_stream_ps:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_ps:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <8 x float>*
  store <8 x float> %a1, <8 x float>* %arg0, align 32, !nontemporal !0
  ret void
}

define void @test_mm256_stream_si256(<4 x i64> *%a0, <4 x i64> %a1) nounwind {
; X86-LABEL: test_mm256_stream_si256:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmovntps %ymm0, (%eax)
; X86-NEXT:    vzeroupper
; X86-NEXT:    retl
;
; X64-LABEL: test_mm256_stream_si256:
; X64:       # %bb.0:
; X64-NEXT:    vmovntps %ymm0, (%rdi)
; X64-NEXT:    vzeroupper
; X64-NEXT:    retq
  store <4 x i64> %a1, <4 x i64>* %a0, align 32, !nontemporal !0
  ret void
}
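
; NOTE: The stream tests attach '!nontemporal !0' (see '!0 = !{i32 1}' at
; the end of the file) to an aligned store, which selects the non-temporal
; vmovntps instead of vmovaps.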

define <4 x double> @test_mm256_sub_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fsub <4 x double> %a0, %a1
  ret <4 x double> %res
}

define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_sub_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsubps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = fsub <8 x float> %a0, %a1
  ret <8 x float> %res
}

define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testc_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    setb %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
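
; NOTE: The testc tests above and the testnzc/testz tests below all drive
; the same flag-setting instructions (vtestps/vtestpd/vptest, which set only
; CF and ZF): testc reads CF via setb, testz reads ZF via sete, and testnzc
; checks CF==0 && ZF==0 via seta. 'xorl %eax, %eax' pre-clears the result
; register because set<cc> writes only %al; for the FP forms, only the
; element sign bits participate.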

define i32 @test_mm_testnzc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testnzc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testnzc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testnzc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testnzc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testnzc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testnzc_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    seta %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone

define i32 @test_mm_testz_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.pd(<2 x double> %a0, <2 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnone

define i32 @test_mm256_testz_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestpd %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.pd.256(<4 x double>, <4 x double>) nounwind readnone

define i32 @test_mm_testz_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; CHECK-LABEL: test_mm_testz_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %xmm1, %xmm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.ps(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm256_testz_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vtestps %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.vtestz.ps.256(<8 x float>, <8 x float>) nounwind readnone

define i32 @test_mm256_testz_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
; CHECK-LABEL: test_mm256_testz_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    vptest %ymm1, %ymm0
; CHECK-NEXT:    sete %al
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.avx.ptestz.256(<4 x i64>, <4 x i64>) nounwind readnone

define <2 x double> @test_mm_undefined_pd() nounwind {
; CHECK-LABEL: test_mm_undefined_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <2 x double> undef
}

define <4 x double> @test_mm256_undefined_pd() nounwind {
; CHECK-LABEL: test_mm256_undefined_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x double> undef
}

define <8 x float> @test_mm256_undefined_ps() nounwind {
; CHECK-LABEL: test_mm256_undefined_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <8 x float> undef
}

define <4 x i64> @test_mm256_undefined_si256() nounwind {
; CHECK-LABEL: test_mm256_undefined_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  ret <4 x i64> undef
}
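
; NOTE: Returning 'undef' requires no code at all, so each of the
; _mm*_undefined_* tests above compiles to a bare ret.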

define <4 x double> @test_mm256_unpackhi_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpackhi_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpackhi_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15>
  ret <8 x float> %res
}

define <4 x double> @test_mm256_unpacklo_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x double> %a0, <4 x double> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_unpacklo_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_unpacklo_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13>
  ret <8 x float> %res
}
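
; NOTE: Like the shuffles, vunpckh/vunpckl interleave within each 128-bit
; lane, so the lo masks above pair source elements {0,1} and {4,5} while the
; hi masks pair {2,3} and {6,7}.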

define <4 x double> @test_mm256_xor_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <4 x double> %a0 to <4 x i64>
  %2 = bitcast <4 x double> %a1 to <4 x i64>
  %res = xor <4 x i64> %1, %2
  %bc = bitcast <4 x i64> %res to <4 x double>
  ret <4 x double> %bc
}

define <8 x float> @test_mm256_xor_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
; CHECK-LABEL: test_mm256_xor_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = bitcast <8 x float> %a0 to <8 x i32>
  %2 = bitcast <8 x float> %a1 to <8 x i32>
  %res = xor <8 x i32> %1, %2
  %bc = bitcast <8 x i32> %res to <8 x float>
  ret <8 x float> %bc
}
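
; NOTE: The FP xor intrinsics have no direct IR equivalent (IR 'xor' is
; integer-only), hence the bitcasts around the integer xor above; the
; backend folds the whole pattern back to a single vxorps.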

define void @test_mm256_zeroall() nounwind {
; CHECK-LABEL: test_mm256_zeroall:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vzeroall
; CHECK-NEXT:    ret{{[l|q]}}
  call void @llvm.x86.avx.vzeroall()
  ret void
}
declare void @llvm.x86.avx.vzeroall() nounwind readnone

define void @test_mm256_zeroupper() nounwind {
; CHECK-LABEL: test_mm256_zeroupper:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    ret{{[l|q]}}
  call void @llvm.x86.avx.vzeroupper()
  ret void
}
declare void @llvm.x86.avx.vzeroupper() nounwind readnone
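
; NOTE: _mm256_zeroall/_mm256_zeroupper are thin wrappers over their
; intrinsics and select the matching vzeroall/vzeroupper instructions
; directly.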

define <4 x double> @test_mm256_zextpd128_pd256(<2 x double> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextpd128_pd256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x double> %res
}

define <8 x float> @test_mm256_zextps128_ps256(<4 x float> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextps128_ps256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x float> %res
}

define <4 x i64> @test_mm256_zextsi128_si256(<2 x i64> %a0) nounwind {
; CHECK-LABEL: test_mm256_zextsi128_si256:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vmovaps %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i64> %res
}
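
; NOTE: The zext tests widen 128->256 by shuffling with zeroinitializer; the
; seemingly redundant 'vmovaps %xmm0, %xmm0' is not a no-op, since any
; VEX-encoded write to an xmm register clears bits 255:128 of the
; corresponding ymm register.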

!0 = !{i32 1}