; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx | FileCheck %s

; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
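; (Inserting into lane 0 only replaces the low 128 bits, which is the vblendpd pattern checked in test_x86_avx_vinsertf128_si_256_2 below.)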

define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double> %a0, <2 x double> %a1, i8 1)
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone

define <8 x float> @test_x86_avx_vinsertf128_ps_256_1(<8 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_ps_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %a0, <4 x float> %a1, i8 1)
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8) nounwind readnone

define <8 x i32> @test_x86_avx_vinsertf128_si_256_1(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 1)
  ret <8 x i32> %res
}

; Verify that high bits of the immediate are masked off. This should be the equivalent
; of a vinsertf128 $0 which should be optimized into a blend, so just check that it's
; not a vinsertf128 $1.
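; (The immediate here is i8 2 = 0b10; a 256-bit insert has only two lanes, so only bit 0 is used and 2 & 1 == 0.)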
define <8 x i32> @test_x86_avx_vinsertf128_si_256_2(<8 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: test_x86_avx_vinsertf128_si_256_2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    ## kill: %XMM1<def> %XMM1<kill> %YMM1<def>
; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
; CHECK-NEXT:    retl
  %res = call <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32> %a0, <4 x i32> %a1, i8 2)
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.vinsertf128.si.256(<8 x i32>, <4 x i32>, i8) nounwind readnone

; We don't check any vextractf128 variant with immediate 0 because that's just a move.
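; (Extracting lane 0 just returns the low 128 bits, a subregister copy; see the register 'kill' comment in test_x86_avx_extractf128_pd_256_2 below.)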

define <2 x double> @test_x86_avx_vextractf128_pd_256_1(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_pd_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 1)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone

define <4 x float> @test_x86_avx_vextractf128_ps_256_1(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_ps_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a0, i8 1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone

define <4 x i32> @test_x86_avx_vextractf128_si_256_1(<8 x i32> %a0) {
; CHECK-LABEL: test_x86_avx_vextractf128_si_256_1:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %a0, i8 1)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone

; Verify that high bits of the immediate are masked off. This should be the equivalent
; of a vextractf128 $0 which should be optimized away, so just check that it's
; not a vextractf128 of any kind.
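; (As with the insert case, 2 & 1 == 0, so only the implicit low-lane copy remains.)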
define <2 x double> @test_x86_avx_extractf128_pd_256_2(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_extractf128_pd_256_2:
; CHECK:       ## BB#0:
; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a0, i8 2)
  ret <2 x double> %res
}


define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
; CHECK-LABEL: test_x86_avx_blend_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
; CHECK-NEXT:    retl
  %res = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 7) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32) nounwind readnone


define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_blend_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; CHECK-NEXT:    retl
  %res = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone


define <8 x float> @test_x86_avx_dp_ps_256(<8 x float> %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_dp_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vdpps $7, %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float> %a0, <8 x float> %a1, i32 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.dp.ps.256(<8 x float>, <8 x float>, i32) nounwind readnone

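; The legacy psll.dq/psrl.dq intrinsics take their shift amount in bits, so the
; i32 8 below is a one-byte shift, as the byte shuffle in the CHECK line shows.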
define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psll_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpslldq {{.*#+}} xmm0 = zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone

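; As above, the i32 8 bit count becomes a one-byte right shift.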
define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
; CHECK-LABEL: test_x86_sse2_psrl_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 8) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone


define <2 x double> @test_x86_sse41_blendpd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blendpd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i8 2) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i8) nounwind readnone


define <4 x float> @test_x86_sse41_blendps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_blendps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
; CHECK-NEXT:    retl
  %res = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i8) nounwind readnone


define <8 x i16> @test_x86_sse41_pblendw(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_x86_sse41_pblendw:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6,7]
; CHECK-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i8 7) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i8) nounwind readnone


define <4 x i32> @test_x86_sse41_pmovsxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxbd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovsxbd(<16 x i8>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovsxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxbq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxbq(<16 x i8>) nounwind readnone


define <8 x i16> @test_x86_sse41_pmovsxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxbw:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovsxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxdq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxdq(<4 x i32>) nounwind readnone


define <4 x i32> @test_x86_sse41_pmovsxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxwd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovsxwd(<8 x i16>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovsxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovsxwq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovsxwq %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovsxwq(<8 x i16>) nounwind readnone


define <4 x i32> @test_x86_sse41_pmovzxbd(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovzxbq(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovzxbq(<16 x i8>) nounwind readnone


define <8 x i16> @test_x86_sse41_pmovzxbw(<16 x i8> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxbw:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; CHECK-NEXT:    retl
  %res = call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %a0) ; <<8 x i16>> [#uses=1]
  ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovzxdq(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxdq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovzxdq(<4 x i32>) nounwind readnone


define <4 x i32> @test_x86_sse41_pmovzxwd(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxwd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.sse41.pmovzxwd(<8 x i16>) nounwind readnone


define <2 x i64> @test_x86_sse41_pmovzxwq(<8 x i16> %a0) {
; CHECK-LABEL: test_x86_sse41_pmovzxwq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; CHECK-NEXT:    retl
  %res = call <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16> %a0) ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %res
}
declare <2 x i64> @llvm.x86.sse41.pmovzxwq(<8 x i16>) nounwind readnone


define <2 x double> @test_x86_sse2_cvtdq2pd(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtdq2pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtdq2pd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32> %a0) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtdq2pd(<4 x i32>) nounwind readnone


define <4 x double> @test_x86_avx_cvtdq2_pd_256(<4 x i32> %a0) {
; CHECK-LABEL: test_x86_avx_cvtdq2_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtdq2pd %xmm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32> %a0) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.cvtdq2.pd.256(<4 x i32>) nounwind readnone


define <2 x double> @test_x86_sse2_cvtps2pd(<4 x float> %a0) {
; CHECK-LABEL: test_x86_sse2_cvtps2pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtps2pd %xmm0, %xmm0
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float> %a0) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtps2pd(<4 x float>) nounwind readnone


define <4 x double> @test_x86_avx_cvt_ps2_pd_256(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx_cvt_ps2_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvtps2pd %xmm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float> %a0) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.cvt.ps2.pd.256(<4 x float>) nounwind readnone


define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_cvtt_pd2dq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttpd2dqy %ymm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double>) nounwind readnone


define <8 x i32> @test_x86_avx_cvtt_ps2dq_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_cvtt_ps2dq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vcvttps2dq %ymm0, %ymm0
; CHECK-NEXT:    retl
  %res = call <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float> %a0) ; <<8 x i32>> [#uses=1]
  ret <8 x i32> %res
}
declare <8 x i32> @llvm.x86.avx.cvtt.ps2dq.256(<8 x float>) nounwind readnone


define void @test_x86_sse2_storeu_dq(i8* %a0, <16 x i8> %a1) {
  ; add operation forces the execution domain.
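  ; (Staying in the integer domain is what makes the store below come out as vmovdqu rather than vmovups.)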
; CHECK-LABEL: test_x86_sse2_storeu_dq:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vpaddb LCPI34_0, %xmm0, %xmm0
; CHECK-NEXT:    vmovdqu %xmm0, (%eax)
; CHECK-NEXT:    retl
  %a2 = add <16 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  call void @llvm.x86.sse2.storeu.dq(i8* %a0, <16 x i8> %a2)
  ret void
}
declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind


define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
  ; fadd operation forces the execution domain.
; CHECK-LABEL: test_x86_sse2_storeu_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; CHECK-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
; CHECK-NEXT:    vmovupd %xmm0, (%eax)
; CHECK-NEXT:    retl
  %a2 = fadd <2 x double> %a1, <double 0x0, double 0x4200000000000000>
  call void @llvm.x86.sse2.storeu.pd(i8* %a0, <2 x double> %a2)
  ret void
}
declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind


define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse_storeu_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vmovups %xmm0, (%eax)
; CHECK-NEXT:    retl
  call void @llvm.x86.sse.storeu.ps(i8* %a0, <4 x float> %a1)
  ret void
}
declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind

define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) {
  ; FIXME: unfortunately the execution domain fix pass changes this to vmovups, and it's hard to force the integer domain with no 256-bit integer instructions.
  ; add operation forces the execution domain.
; CHECK-LABEL: test_x86_avx_storeu_dq_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
; CHECK-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; CHECK-NEXT:    vpaddb %xmm2, %xmm1, %xmm1
; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT:    vmovups %ymm0, (%eax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
  call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2)
  ret void
}
declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind


define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) {
  ; fadd operation forces the execution domain.
; CHECK-LABEL: test_x86_avx_storeu_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
; CHECK-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
; CHECK-NEXT:    vmovupd %ymm0, (%eax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
  call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2)
  ret void
}
declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind


define void @test_x86_avx_storeu_ps_256(i8* %a0, <8 x float> %a1) {
; CHECK-LABEL: test_x86_avx_storeu_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT:    vmovups %ymm0, (%eax)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retl
  call void @llvm.x86.avx.storeu.ps.256(i8* %a0, <8 x float> %a1)
  ret void
}
declare void @llvm.x86.avx.storeu.ps.256(i8*, <8 x float>) nounwind


define <2 x double> @test_x86_avx_vpermil_pd(<2 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_pd:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; CHECK-NEXT:    retl
  %res = call <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double> %a0, i8 1) ; <<2 x double>> [#uses=1]
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx.vpermil.pd(<2 x double>, i8) nounwind readnone


define <4 x double> @test_x86_avx_vpermil_pd_256(<4 x double> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_pd_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,1,3,2]
; CHECK-NEXT:    retl
  %res = call <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double> %a0, i8 7) ; <<4 x double>> [#uses=1]
  ret <4 x double> %res
}
declare <4 x double> @llvm.x86.avx.vpermil.pd.256(<4 x double>, i8) nounwind readnone


define <4 x float> @test_x86_avx_vpermil_ps(<4 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_ps:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,0,0]
; CHECK-NEXT:    retl
  %res = call <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float> %a0, i8 7) ; <<4 x float>> [#uses=1]
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx.vpermil.ps(<4 x float>, i8) nounwind readnone


define <8 x float> @test_x86_avx_vpermil_ps_256(<8 x float> %a0) {
; CHECK-LABEL: test_x86_avx_vpermil_ps_256:
; CHECK:       ## BB#0:
; CHECK-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[3,1,0,0,7,5,4,4]
; CHECK-NEXT:    retl
  %res = call <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float> %a0, i8 7) ; <<8 x float>> [#uses=1]
  ret <8 x float> %res
}
declare <8 x float> @llvm.x86.avx.vpermil.ps.256(<8 x float>, i8) nounwind readnone
    522