; (code-browser navigation header: Home | History | Annotate | Download | only in X86)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
      3 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
      4 ; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
      5 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
      6 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
      7 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
      8 
      9 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse41-builtins.c
     10 
; _mm_blend_epi16: immediate-controlled 16-bit blend. The shuffle mask takes
; words 1, 3 and 5 from %a1 and all remaining words from %a0.
define <2 x i64> @test_mm_blend_epi16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_blend_epi16:
; SSE:       # %bb.0:
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_epi16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7]
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  ; indices 9, 11, 13 address %arg1 lanes 1, 3, 5 (second operand starts at 8)
  %shuf = shufflevector <8 x i16> %arg0, <8 x i16> %arg1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
  %res = bitcast <8 x i16> %shuf to <2 x i64>
  ret <2 x i64> %res
}
     27 
; _mm_blend_pd: element 0 from %a0, element 1 from %a1 (shuffle index 3 is
; %a1 lane 1). Codegen selects the float-domain blendps form.
define <2 x double> @test_mm_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_blend_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %res = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %res
}
     41 
; _mm_blend_ps: takes lanes 1 and 2 from %a1 (shuffle indices 5, 6), lanes
; 0 and 3 from %a0.
define <4 x float> @test_mm_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_blend_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blend_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
; AVX-NEXT:    ret{{[l|q]}}
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
  ret <4 x float> %res
}
     55 
; _mm_blendv_epi8: variable per-byte blend; %a2 is the selector operand.
; The legacy SSE pblendvb implicitly uses xmm0 as the mask register, which
; forces the extra movdqa/movaps shuffling seen in the SSE checks; the AVX
; form encodes the mask as an explicit operand.
define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
; SSE-LABEL: test_mm_blendv_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
  %call = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %arg0, <16 x i8> %arg1, <16 x i8> %arg2)
  %res = bitcast <16 x i8> %call to <2 x i64>
  ret <2 x i64> %res
}
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
     77 
; _mm_blendv_pd: variable double-precision blend with %a2 as selector.
; Legacy SSE blendvpd implicitly reads the mask from xmm0 (hence the moves);
; AVX takes the mask as an explicit fourth operand.
define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE-LABEL: test_mm_blendv_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
     95 
; _mm_blendv_ps: variable single-precision blend with %a2 as selector.
; Same implicit-xmm0 constraint for the legacy SSE encoding as blendvpd.
define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE-LABEL: test_mm_blendv_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_blendv_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
    113 
; _mm_ceil_pd: packed-double round with immediate 2 (_MM_FROUND_CEIL,
; round toward +infinity).
define <2 x double> @test_mm_ceil_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_ceil_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundpd $2, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundpd $2, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
    128 
; _mm_ceil_ps: packed-float round with immediate 2 (_MM_FROUND_CEIL).
define <4 x float> @test_mm_ceil_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_ceil_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    roundps $2, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundps $2, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
    143 
; _mm_ceil_sd: scalar round (imm 2 = ceil) of the low lane of %a1; the upper
; lane of the result comes from %a0, matching the roundsd operand order.
define <2 x double> @test_mm_ceil_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_ceil_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundsd $2, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundsd $2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 2)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
    158 
; _mm_ceil_ss: scalar-float variant of the ceil round (imm 2); upper three
; lanes of the result come from %a0.
define <4 x float> @test_mm_ceil_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_ceil_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    roundss $2, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_ceil_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundss $2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 2)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
    173 
; _mm_cmpeq_epi64: 64-bit element equality, result sign-extended to all-ones
; per lane. AVX512VL compares into a mask register and expands it back with
; vpmovm2q, hence the separate AVX512 check block.
define <2 x i64> @test_mm_cmpeq_epi64(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_cmpeq_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqq %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX1-LABEL: test_mm_cmpeq_epi64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    ret{{[l|q]}}
;
; AVX512-LABEL: test_mm_cmpeq_epi64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0
; AVX512-NEXT:    vpmovm2q %k0, %xmm0
; AVX512-NEXT:    ret{{[l|q]}}
  %cmp = icmp eq <2 x i64> %a0, %a1
  %res = sext <2 x i1> %cmp to <2 x i64>
  ret <2 x i64> %res
}
    194 
; _mm_cvtepi8_epi16: sign-extend the low 8 bytes to 8 words (pmovsxbw),
; expressed as a low-half shufflevector followed by sext.
define <2 x i64> @test_mm_cvtepi8_epi16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbw %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbw %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sext = sext <8 x i8> %ext0 to <8 x i16>
  %res = bitcast <8 x i16> %sext to <2 x i64>
  ret <2 x i64> %res
}
    211 
; _mm_cvtepi8_epi32: sign-extend the low 4 bytes to 4 dwords (pmovsxbd).
define <2 x i64> @test_mm_cvtepi8_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbd %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = sext <4 x i8> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}
    228 
; _mm_cvtepi8_epi64: sign-extend the low 2 bytes to 2 qwords (pmovsxbq).
define <2 x i64> @test_mm_cvtepi8_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi8_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxbq %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi8_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxbq %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i8> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}
    244 
; _mm_cvtepi16_epi32: sign-extend the low 4 words to 4 dwords (pmovsxwd).
define <2 x i64> @test_mm_cvtepi16_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi16_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxwd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi16_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwd %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = sext <4 x i16> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}
    261 
; _mm_cvtepi16_epi64: sign-extend the low 2 words to 2 qwords (pmovsxwq).
define <2 x i64> @test_mm_cvtepi16_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi16_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxwq %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi16_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxwq %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i16> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}
    277 
; _mm_cvtepi32_epi64: sign-extend the low 2 dwords to 2 qwords (pmovsxdq).
define <2 x i64> @test_mm_cvtepi32_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepi32_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovsxdq %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepi32_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovsxdq %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sext = sext <2 x i32> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}
    293 
; _mm_cvtepu8_epi16: ZERO-extend the low 8 bytes to 8 words (pmovzxbw).
; NOTE(review): the value is named %sext but is a zext — misleading name,
; kept byte-identical to avoid perturbing the generated test.
define <2 x i64> @test_mm_cvtepu8_epi16(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %sext = zext <8 x i8> %ext0 to <8 x i16>
  %res = bitcast <8 x i16> %sext to <2 x i64>
  ret <2 x i64> %res
}
    310 
; _mm_cvtepu8_epi32: zero-extend the low 4 bytes to 4 dwords (pmovzxbd).
define <2 x i64> @test_mm_cvtepu8_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = zext <4 x i8> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}
    327 
; _mm_cvtepu8_epi64: zero-extend the low 2 bytes to 2 qwords (pmovzxbq).
define <2 x i64> @test_mm_cvtepu8_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu8_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu8_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext0 = shufflevector <16 x i8> %arg0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i8> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}
    343 
; _mm_cvtepu16_epi32: zero-extend the low 4 words to 4 dwords (pmovzxwd).
define <2 x i64> @test_mm_cvtepu16_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu16_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu16_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %sext = zext <4 x i16> %ext0 to <4 x i32>
  %res = bitcast <4 x i32> %sext to <2 x i64>
  ret <2 x i64> %res
}
    360 
; _mm_cvtepu16_epi64: zero-extend the low 2 words to 2 qwords (pmovzxwq).
define <2 x i64> @test_mm_cvtepu16_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu16_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu16_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %ext0 = shufflevector <8 x i16> %arg0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i16> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}
    376 
; _mm_cvtepu32_epi64: zero-extend the low 2 dwords to 2 qwords (pmovzxdq).
define <2 x i64> @test_mm_cvtepu32_epi64(<2 x i64> %a0) {
; SSE-LABEL: test_mm_cvtepu32_epi64:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_cvtepu32_epi64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext0 = shufflevector <4 x i32> %arg0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %sext = zext <2 x i32> %ext0 to <2 x i64>
  ret <2 x i64> %sext
}
    392 
; _mm_dp_pd: double-precision dot product, immediate 7 selects the
; multiply/broadcast mask passed straight through to dppd.
define <2 x double> @test_mm_dp_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_dp_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    dppd $7, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_dp_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
  ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
    407 
; _mm_dp_ps: single-precision dot product with immediate mask 7 (dpps).
define <4 x float> @test_mm_dp_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_dp_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    dpps $7, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_dp_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
    422 
; _mm_extract_epi8: extract byte 1 and zero-extend it to i32; lowered to
; pextrb plus an explicit movzbl of %al.
define i32 @test_mm_extract_epi8(<2 x i64> %a0) {
; SSE-LABEL: test_mm_extract_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    pextrb $1, %xmm0, %eax
; SSE-NEXT:    movzbl %al, %eax
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpextrb $1, %xmm0, %eax
; AVX-NEXT:    movzbl %al, %eax
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %ext = extractelement <16 x i8> %arg0, i32 1
  %res = zext i8 %ext to i32
  ret i32 %res
}
    440 
; _mm_extract_epi32: extract dword 1; codegen picks the float-domain
; extractps form rather than pextrd.
define i32 @test_mm_extract_epi32(<2 x i64> %a0) {
; SSE-LABEL: test_mm_extract_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    extractps $1, %xmm0, %eax
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractps $1, %xmm0, %eax
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %ext = extractelement <4 x i32> %arg0, i32 1
  ret i32 %ext
}
    455 
    456 define i64 @test_mm_extract_epi64(<2 x i64> %a0) {
    457 ; X86-SSE-LABEL: test_mm_extract_epi64:
    458 ; X86-SSE:       # %bb.0:
    459 ; X86-SSE-NEXT:    extractps $2, %xmm0, %eax
    460 ; X86-SSE-NEXT:    extractps $3, %xmm0, %edx
    461 ; X86-SSE-NEXT:    retl
    462 ;
    463 ; X86-AVX-LABEL: test_mm_extract_epi64:
    464 ; X86-AVX:       # %bb.0:
    465 ; X86-AVX-NEXT:    vextractps $2, %xmm0, %eax
    466 ; X86-AVX-NEXT:    vextractps $3, %xmm0, %edx
    467 ; X86-AVX-NEXT:    retl
    468 ;
    469 ; X64-SSE-LABEL: test_mm_extract_epi64:
    470 ; X64-SSE:       # %bb.0:
    471 ; X64-SSE-NEXT:    pextrq $1, %xmm0, %rax
    472 ; X64-SSE-NEXT:    retq
    473 ;
    474 ; X64-AVX-LABEL: test_mm_extract_epi64:
    475 ; X64-AVX:       # %bb.0:
    476 ; X64-AVX-NEXT:    vpextrq $1, %xmm0, %rax
    477 ; X64-AVX-NEXT:    retq
    478   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
    479   %ext = extractelement <2 x i64> %a0, i32 1
    480   ret i64 %ext
    481 }
    482 
; _mm_extract_ps: lane 1 bitcast to i32. Codegen prefers movshdup (dup of
; odd lanes) + movd over extractps here.
define i32 @test_mm_extract_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_extract_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_extract_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    ret{{[l|q]}}
  %ext = extractelement <4 x float> %a0, i32 1
  %bc = bitcast float %ext to i32
  ret i32 %bc
}
    499 
; _mm_floor_pd: packed-double round with immediate 1 (_MM_FROUND_FLOOR,
; round toward -infinity).
define <2 x double> @test_mm_floor_pd(<2 x double> %a0) {
; SSE-LABEL: test_mm_floor_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundpd $1, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundpd $1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 1)
  ret <2 x double> %res
}
    513 
; _mm_floor_ps: packed-float round with immediate 1 (_MM_FROUND_FLOOR).
define <4 x float> @test_mm_floor_ps(<4 x float> %a0) {
; SSE-LABEL: test_mm_floor_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    roundps $1, %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundps $1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 1)
  ret <4 x float> %res
}
    527 
; _mm_floor_sd: scalar floor (imm 1) of the low lane of %a1; upper lane of
; the result comes from %a0.
define <2 x double> @test_mm_floor_sd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test_mm_floor_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    roundsd $1, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundsd $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 1)
  ret <2 x double> %res
}
    541 
; _mm_floor_ss: scalar-float floor (imm 1); upper three lanes from %a0.
define <4 x float> @test_mm_floor_ss(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_floor_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    roundss $1, %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_floor_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vroundss $1, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 1)
  ret <4 x float> %res
}
    555 
; _mm_insert_epi8: insert %a1 into byte lane 1 (pinsrb). Both targets
; zero-extend the i8 argument into a full GPR first (from stack on x86-32,
; from %dil on x86-64).
define <2 x i64> @test_mm_insert_epi8(<2 x i64> %a0, i8 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    pinsrb $1, %eax, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_insert_epi8:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_insert_epi8:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    movzbl %dil, %eax
; X64-SSE-NEXT:    pinsrb $1, %eax, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_insert_epi8:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    movzbl %dil, %eax
; X64-AVX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %res = insertelement <16 x i8> %arg0, i8 %a1,i32 1
  %bc = bitcast <16 x i8> %res to <2 x i64>
  ret <2 x i64> %bc
}
    585 
; _mm_insert_epi32: insert %a1 into dword lane 1 (pinsrd); x86-32 folds the
; stack-argument load directly into the instruction.
define <2 x i64> @test_mm_insert_epi32(<2 x i64> %a0, i32 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pinsrd $1, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_insert_epi32:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_insert_epi32:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pinsrd $1, %edi, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_insert_epi32:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %res = insertelement <4 x i32> %arg0, i32 %a1,i32 1
  %bc = bitcast <4 x i32> %res to <2 x i64>
  ret <2 x i64> %bc
}
    611 
; _mm_insert_epi64: insert %a1 into qword lane 1. x86-32 has no pinsrq, so
; the i64 is inserted as two pinsrd halves (dword lanes 2 and 3); x86-64
; uses a single pinsrq from %rdi.
define <2 x i64> @test_mm_insert_epi64(<2 x i64> %a0, i64 %a1) {
; X86-SSE-LABEL: test_mm_insert_epi64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pinsrd $2, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    pinsrd $3, {{[0-9]+}}(%esp), %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: test_mm_insert_epi64:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: test_mm_insert_epi64:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    pinsrq $1, %rdi, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: test_mm_insert_epi64:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
; X64-AVX-NEXT:    retq
  %res = insertelement <2 x i64> %a0, i64 %a1,i32 1
  ret <2 x i64> %res
}
    637 
; _mm_insert_ps with immediate 4: per the insertps encoding, copies %a1
; lane 0 into result lane 0 and zeroes lane 2 (zmask bit 2 set).
define <4 x float> @test_mm_insert_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test_mm_insert_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_insert_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[1],zero,xmm0[3]
; AVX-NEXT:    ret{{[l|q]}}
  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 4)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
    652 
; _mm_max_epi8: signed byte max expressed as icmp sgt + select, which must
; fold to a single pmaxsb.
define <2 x i64> @test_mm_max_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxsb %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %cmp = icmp sgt <16 x i8> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
  %bc = bitcast <16 x i8> %sel to <2 x i64>
  ret <2 x i64> %bc
}
    670 
; _mm_max_epi32: signed dword max (icmp sgt + select -> pmaxsd).
define <2 x i64> @test_mm_max_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp sgt <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}
    688 
; _mm_max_epu16: unsigned word max (icmp ugt + select -> pmaxuw).
define <2 x i64> @test_mm_max_epu16(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epu16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxuw %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epu16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
  %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
  %cmp = icmp ugt <8 x i16> %arg0, %arg1
  %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
  %bc = bitcast <8 x i16> %sel to <2 x i64>
  ret <2 x i64> %bc
}
    706 
; _mm_max_epu32: unsigned dword max (icmp ugt + select -> pmaxud).
define <2 x i64> @test_mm_max_epu32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_max_epu32:
; SSE:       # %bb.0:
; SSE-NEXT:    pmaxud %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_max_epu32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp ugt <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}
    724 
; _mm_min_epi8: signed byte min (icmp slt + select -> pminsb).
define <2 x i64> @test_mm_min_epi8(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epi8:
; SSE:       # %bb.0:
; SSE-NEXT:    pminsb %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epi8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
  %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
  %cmp = icmp slt <16 x i8> %arg0, %arg1
  %sel = select <16 x i1> %cmp, <16 x i8> %arg0, <16 x i8> %arg1
  %bc = bitcast <16 x i8> %sel to <2 x i64>
  ret <2 x i64> %bc
}
    742 
; _mm_min_epi32: signed dword min (icmp slt + select -> pminsd).
define <2 x i64> @test_mm_min_epi32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: test_mm_min_epi32:
; SSE:       # %bb.0:
; SSE-NEXT:    pminsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mm_min_epi32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
  %cmp = icmp slt <4 x i32> %arg0, %arg1
  %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
  %bc = bitcast <4 x i32> %sel to <2 x i64>
  ret <2 x i64> %bc
}
    760 
        ; _mm_min_epu16 — unsigned 16-bit element-wise min; icmp ult + select lowers to (v)pminuw.
    761 define <2 x i64> @test_mm_min_epu16(<2 x i64> %a0, <2 x i64> %a1) {
    762 ; SSE-LABEL: test_mm_min_epu16:
    763 ; SSE:       # %bb.0:
    764 ; SSE-NEXT:    pminuw %xmm1, %xmm0
    765 ; SSE-NEXT:    ret{{[l|q]}}
    766 ;
    767 ; AVX-LABEL: test_mm_min_epu16:
    768 ; AVX:       # %bb.0:
    769 ; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0
    770 ; AVX-NEXT:    ret{{[l|q]}}
    771   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
    772   %arg1 = bitcast <2 x i64> %a1 to <8 x i16>
    773   %cmp = icmp ult <8 x i16> %arg0, %arg1
    774   %sel = select <8 x i1> %cmp, <8 x i16> %arg0, <8 x i16> %arg1
    775   %bc = bitcast <8 x i16> %sel to <2 x i64>
    776   ret <2 x i64> %bc
    777 }
    778 
        ; _mm_min_epu32 — unsigned 32-bit element-wise min; icmp ult + select lowers to (v)pminud.
    779 define <2 x i64> @test_mm_min_epu32(<2 x i64> %a0, <2 x i64> %a1) {
    780 ; SSE-LABEL: test_mm_min_epu32:
    781 ; SSE:       # %bb.0:
    782 ; SSE-NEXT:    pminud %xmm1, %xmm0
    783 ; SSE-NEXT:    ret{{[l|q]}}
    784 ;
    785 ; AVX-LABEL: test_mm_min_epu32:
    786 ; AVX:       # %bb.0:
    787 ; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0
    788 ; AVX-NEXT:    ret{{[l|q]}}
    789   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
    790   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
    791   %cmp = icmp ult <4 x i32> %arg0, %arg1
    792   %sel = select <4 x i1> %cmp, <4 x i32> %arg0, <4 x i32> %arg1
    793   %bc = bitcast <4 x i32> %sel to <2 x i64>
    794   ret <2 x i64> %bc
    795 }
    796 
        ; _mm_minpos_epu16 — horizontal minimum of the eight u16 lanes plus its index.
        ; No generic IR form exists for this, so it stays a target intrinsic call
        ; and lowers directly to (v)phminposuw.
    797 define <2 x i64> @test_mm_minpos_epu16(<2 x i64> %a0) {
    798 ; SSE-LABEL: test_mm_minpos_epu16:
    799 ; SSE:       # %bb.0:
    800 ; SSE-NEXT:    phminposuw %xmm0, %xmm0
    801 ; SSE-NEXT:    ret{{[l|q]}}
    802 ;
    803 ; AVX-LABEL: test_mm_minpos_epu16:
    804 ; AVX:       # %bb.0:
    805 ; AVX-NEXT:    vphminposuw %xmm0, %xmm0
    806 ; AVX-NEXT:    ret{{[l|q]}}
    807   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
    808   %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %arg0)
    809   %bc = bitcast <8 x i16> %res to <2 x i64>
    810   ret <2 x i64> %bc
    811 }
    812 declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
    813 
        ; _mm_mpsadbw_epu8 — multiple 8-bit sums of absolute differences. Kept as a
        ; target intrinsic because the immediate (here 1) selects the source block
        ; offsets; lowers to (v)mpsadbw with that immediate.
    814 define <2 x i64> @test_mm_mpsadbw_epu8(<2 x i64> %a0, <2 x i64> %a1) {
    815 ; SSE-LABEL: test_mm_mpsadbw_epu8:
    816 ; SSE:       # %bb.0:
    817 ; SSE-NEXT:    mpsadbw $1, %xmm1, %xmm0
    818 ; SSE-NEXT:    ret{{[l|q]}}
    819 ;
    820 ; AVX-LABEL: test_mm_mpsadbw_epu8:
    821 ; AVX:       # %bb.0:
    822 ; AVX-NEXT:    vmpsadbw $1, %xmm1, %xmm0, %xmm0
    823 ; AVX-NEXT:    ret{{[l|q]}}
    824   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
    825   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
    826   %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %arg0, <16 x i8> %arg1, i8 1)
    827   %bc = bitcast <8 x i16> %res to <2 x i64>
    828   ret <2 x i64> %bc
    829 }
    830 declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
    831 
        ; _mm_mul_epi32 — signed widening multiply of the even (low) 32-bit lane of
        ; each 64-bit element. The shl-32 / ashr-exact-32 pair sign-extends the low
        ; halves in place; the mul of those sign-extended i64s is the pattern isel
        ; recognizes as (v)pmuldq. With AVX512DQ+VL the shifts survive as
        ; vpsllq/vpsraq and the product becomes vpmullq instead.
    832 define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) {
    833 ; SSE-LABEL: test_mm_mul_epi32:
    834 ; SSE:       # %bb.0:
    835 ; SSE-NEXT:    psllq $32, %xmm0
    836 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
    837 ; SSE-NEXT:    psrad $31, %xmm0
    838 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
    839 ; SSE-NEXT:    psllq $32, %xmm1
    840 ; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
    841 ; SSE-NEXT:    psrad $31, %xmm1
    842 ; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
    843 ; SSE-NEXT:    pmuldq %xmm1, %xmm0
    844 ; SSE-NEXT:    ret{{[l|q]}}
    845 ;
    846 ; AVX1-LABEL: test_mm_mul_epi32:
    847 ; AVX1:       # %bb.0:
    848 ; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
    849 ; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm2
    850 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
    851 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
    852 ; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
    853 ; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
    854 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
    855 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
    856 ; AVX1-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0
    857 ; AVX1-NEXT:    ret{{[l|q]}}
    858 ;
    859 ; AVX512-LABEL: test_mm_mul_epi32:
    860 ; AVX512:       # %bb.0:
    861 ; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
    862 ; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
    863 ; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
    864 ; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
    865 ; AVX512-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
    866 ; AVX512-NEXT:    ret{{[l|q]}}
        ; shl by 32 then ashr exact by 32 == sign-extend the low 32 bits of each i64 lane.
    867   %A = shl <2 x i64> %a0, <i64 32, i64 32>
    868   %A1 = ashr exact <2 x i64> %A, <i64 32, i64 32>
    869   %B = shl <2 x i64> %a1, <i64 32, i64 32>
    870   %B1 = ashr exact <2 x i64> %B, <i64 32, i64 32>
    871   %res = mul nsw <2 x i64> %A1, %B1
    872   ret <2 x i64> %res
    873 }
    874 
        ; _mm_mullo_epi32 — low 32 bits of the per-lane product; a plain vector mul
        ; on <4 x i32> lowers to (v)pmulld.
    875 define <2 x i64> @test_mm_mullo_epi32(<2 x i64> %a0, <2 x i64> %a1) {
    876 ; SSE-LABEL: test_mm_mullo_epi32:
    877 ; SSE:       # %bb.0:
    878 ; SSE-NEXT:    pmulld %xmm1, %xmm0
    879 ; SSE-NEXT:    ret{{[l|q]}}
    880 ;
    881 ; AVX-LABEL: test_mm_mullo_epi32:
    882 ; AVX:       # %bb.0:
    883 ; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
    884 ; AVX-NEXT:    ret{{[l|q]}}
    885   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
    886   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
    887   %res = mul <4 x i32> %arg0, %arg1
    888   %bc = bitcast <4 x i32> %res to <2 x i64>
    889   ret <2 x i64> %bc
    890 }
    891 
        ; _mm_packus_epi32 — pack i32 lanes to u16 with unsigned saturation. The
        ; saturating pack has no generic IR equivalent here, so the target intrinsic
        ; is used and lowers to (v)packusdw.
    892 define <2 x i64> @test_mm_packus_epi32(<2 x i64> %a0, <2 x i64> %a1) {
    893 ; SSE-LABEL: test_mm_packus_epi32:
    894 ; SSE:       # %bb.0:
    895 ; SSE-NEXT:    packusdw %xmm1, %xmm0
    896 ; SSE-NEXT:    ret{{[l|q]}}
    897 ;
    898 ; AVX-LABEL: test_mm_packus_epi32:
    899 ; AVX:       # %bb.0:
    900 ; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
    901 ; AVX-NEXT:    ret{{[l|q]}}
    902   %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
    903   %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
    904   %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %arg0, <4 x i32> %arg1)
    905   %bc = bitcast <8 x i16> %res to <2 x i64>
    906   ret <2 x i64> %bc
    907 }
    908 declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
    909 
        ; _mm_round_pd — packed double rounding. Immediate 4 is the rounding-control
        ; value _MM_FROUND_CUR_DIRECTION (use the current MXCSR rounding mode), passed
        ; straight through to (v)roundpd. (Intrinsic declared elsewhere in the file.)
    910 define <2 x double> @test_mm_round_pd(<2 x double> %a0) {
    911 ; SSE-LABEL: test_mm_round_pd:
    912 ; SSE:       # %bb.0:
    913 ; SSE-NEXT:    roundpd $4, %xmm0, %xmm0
    914 ; SSE-NEXT:    ret{{[l|q]}}
    915 ;
    916 ; AVX-LABEL: test_mm_round_pd:
    917 ; AVX:       # %bb.0:
    918 ; AVX-NEXT:    vroundpd $4, %xmm0, %xmm0
    919 ; AVX-NEXT:    ret{{[l|q]}}
    920   %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 4)
    921   ret <2 x double> %res
    922 }
    923 
        ; _mm_round_ps — packed float rounding with immediate 4
        ; (_MM_FROUND_CUR_DIRECTION, i.e. current MXCSR mode) -> (v)roundps.
    924 define <4 x float> @test_mm_round_ps(<4 x float> %a0) {
    925 ; SSE-LABEL: test_mm_round_ps:
    926 ; SSE:       # %bb.0:
    927 ; SSE-NEXT:    roundps $4, %xmm0, %xmm0
    928 ; SSE-NEXT:    ret{{[l|q]}}
    929 ;
    930 ; AVX-LABEL: test_mm_round_ps:
    931 ; AVX:       # %bb.0:
    932 ; AVX-NEXT:    vroundps $4, %xmm0, %xmm0
    933 ; AVX-NEXT:    ret{{[l|q]}}
    934   %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 4)
    935   ret <4 x float> %res
    936 }
    937 
        ; _mm_round_sd — round the low double of %a1, upper lane taken from %a0;
        ; immediate 4 selects the current MXCSR rounding mode -> (v)roundsd.
    938 define <2 x double> @test_mm_round_sd(<2 x double> %a0, <2 x double> %a1) {
    939 ; SSE-LABEL: test_mm_round_sd:
    940 ; SSE:       # %bb.0:
    941 ; SSE-NEXT:    roundsd $4, %xmm1, %xmm0
    942 ; SSE-NEXT:    ret{{[l|q]}}
    943 ;
    944 ; AVX-LABEL: test_mm_round_sd:
    945 ; AVX:       # %bb.0:
    946 ; AVX-NEXT:    vroundsd $4, %xmm1, %xmm0, %xmm0
    947 ; AVX-NEXT:    ret{{[l|q]}}
    948   %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 4)
    949   ret <2 x double> %res
    950 }
    951 
        ; _mm_round_ss — round the low float of %a1, upper lanes taken from %a0;
        ; immediate 4 selects the current MXCSR rounding mode -> (v)roundss.
    952 define <4 x float> @test_mm_round_ss(<4 x float> %a0, <4 x float> %a1) {
    953 ; SSE-LABEL: test_mm_round_ss:
    954 ; SSE:       # %bb.0:
    955 ; SSE-NEXT:    roundss $4, %xmm1, %xmm0
    956 ; SSE-NEXT:    ret{{[l|q]}}
    957 ;
    958 ; AVX-LABEL: test_mm_round_ss:
    959 ; AVX:       # %bb.0:
    960 ; AVX-NEXT:    vroundss $4, %xmm1, %xmm0, %xmm0
    961 ; AVX-NEXT:    ret{{[l|q]}}
    962   %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 4)
    963   ret <4 x float> %res
    964 }
    965 
        ; _mm_stream_load_si128 — non-temporal 16-byte load, (v)movntdqa. The pointer
        ; is bitcast to i8* because that is the intrinsic's declared parameter type.
        ; 32-bit targets first load the pointer argument from the stack.
    966 define <2 x i64> @test_mm_stream_load_si128(<2 x i64>* %a0) {
    967 ; X86-SSE-LABEL: test_mm_stream_load_si128:
    968 ; X86-SSE:       # %bb.0:
    969 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    970 ; X86-SSE-NEXT:    movntdqa (%eax), %xmm0
    971 ; X86-SSE-NEXT:    retl
    972 ;
    973 ; X86-AVX-LABEL: test_mm_stream_load_si128:
    974 ; X86-AVX:       # %bb.0:
    975 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
    976 ; X86-AVX-NEXT:    vmovntdqa (%eax), %xmm0
    977 ; X86-AVX-NEXT:    retl
    978 ;
    979 ; X64-SSE-LABEL: test_mm_stream_load_si128:
    980 ; X64-SSE:       # %bb.0:
    981 ; X64-SSE-NEXT:    movntdqa (%rdi), %xmm0
    982 ; X64-SSE-NEXT:    retq
    983 ;
    984 ; X64-AVX-LABEL: test_mm_stream_load_si128:
    985 ; X64-AVX:       # %bb.0:
    986 ; X64-AVX-NEXT:    vmovntdqa (%rdi), %xmm0
    987 ; X64-AVX-NEXT:    retq
    988   %arg0 = bitcast <2 x i64>* %a0 to i8*
    989   %res = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %arg0)
    990   ret <2 x i64> %res
    991 }
    992 declare <2 x i64> @llvm.x86.sse41.movntdqa(i8*) nounwind readnone
    993 
        ; _mm_test_all_ones — ptestc against an all-ones mask; CF is set iff every bit
        ; of %a0 is set. The all-ones constant is materialized with pcmpeqd, and the
        ; i32 result comes from xor-zeroing eax then setb on the carry flag.
    994 define i32 @test_mm_test_all_ones(<2 x i64> %a0) {
    995 ; SSE-LABEL: test_mm_test_all_ones:
    996 ; SSE:       # %bb.0:
    997 ; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
    998 ; SSE-NEXT:    xorl %eax, %eax
    999 ; SSE-NEXT:    ptest %xmm1, %xmm0
   1000 ; SSE-NEXT:    setb %al
   1001 ; SSE-NEXT:    ret{{[l|q]}}
   1002 ;
   1003 ; AVX-LABEL: test_mm_test_all_ones:
   1004 ; AVX:       # %bb.0:
   1005 ; AVX-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
   1006 ; AVX-NEXT:    xorl %eax, %eax
   1007 ; AVX-NEXT:    vptest %xmm1, %xmm0
   1008 ; AVX-NEXT:    setb %al
   1009 ; AVX-NEXT:    ret{{[l|q]}}
   1010   %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> <i64 -1, i64 -1>)
   1011   ret i32 %res
   1012 }
   1013 declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
   1014 
        ; _mm_test_all_zeros — ptestz; ZF is set iff (a0 AND a1) is all zero, and the
        ; i32 result is produced by zeroing eax then sete on the zero flag.
   1015 define i32 @test_mm_test_all_zeros(<2 x i64> %a0, <2 x i64> %a1) {
   1016 ; SSE-LABEL: test_mm_test_all_zeros:
   1017 ; SSE:       # %bb.0:
   1018 ; SSE-NEXT:    xorl %eax, %eax
   1019 ; SSE-NEXT:    ptest %xmm1, %xmm0
   1020 ; SSE-NEXT:    sete %al
   1021 ; SSE-NEXT:    ret{{[l|q]}}
   1022 ;
   1023 ; AVX-LABEL: test_mm_test_all_zeros:
   1024 ; AVX:       # %bb.0:
   1025 ; AVX-NEXT:    xorl %eax, %eax
   1026 ; AVX-NEXT:    vptest %xmm1, %xmm0
   1027 ; AVX-NEXT:    sete %al
   1028 ; AVX-NEXT:    ret{{[l|q]}}
   1029   %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
   1030   ret i32 %res
   1031 }
   1032 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
   1033 
        ; _mm_test_mix_ones_zeros — ptestnzc; result is 1 when both ZF and CF are
        ; clear (a0 has some bits set and some clear under mask a1), hence seta.
   1034 define i32 @test_mm_test_mix_ones_zeros(<2 x i64> %a0, <2 x i64> %a1) {
   1035 ; SSE-LABEL: test_mm_test_mix_ones_zeros:
   1036 ; SSE:       # %bb.0:
   1037 ; SSE-NEXT:    xorl %eax, %eax
   1038 ; SSE-NEXT:    ptest %xmm1, %xmm0
   1039 ; SSE-NEXT:    seta %al
   1040 ; SSE-NEXT:    ret{{[l|q]}}
   1041 ;
   1042 ; AVX-LABEL: test_mm_test_mix_ones_zeros:
   1043 ; AVX:       # %bb.0:
   1044 ; AVX-NEXT:    xorl %eax, %eax
   1045 ; AVX-NEXT:    vptest %xmm1, %xmm0
   1046 ; AVX-NEXT:    seta %al
   1047 ; AVX-NEXT:    ret{{[l|q]}}
   1048   %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
   1049   ret i32 %res
   1050 }
   1051 declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
   1052 
        ; _mm_testc_si128 — same ptestc intrinsic as test_mm_test_all_ones but with a
        ; caller-supplied mask; returns the carry flag ((NOT a0) AND a1 all zero) via setb.
   1053 define i32 @test_mm_testc_si128(<2 x i64> %a0, <2 x i64> %a1) {
   1054 ; SSE-LABEL: test_mm_testc_si128:
   1055 ; SSE:       # %bb.0:
   1056 ; SSE-NEXT:    xorl %eax, %eax
   1057 ; SSE-NEXT:    ptest %xmm1, %xmm0
   1058 ; SSE-NEXT:    setb %al
   1059 ; SSE-NEXT:    ret{{[l|q]}}
   1060 ;
   1061 ; AVX-LABEL: test_mm_testc_si128:
   1062 ; AVX:       # %bb.0:
   1063 ; AVX-NEXT:    xorl %eax, %eax
   1064 ; AVX-NEXT:    vptest %xmm1, %xmm0
   1065 ; AVX-NEXT:    setb %al
   1066 ; AVX-NEXT:    ret{{[l|q]}}
   1067   %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
   1068   ret i32 %res
   1069 }
   1070 
        ; _mm_testnzc_si128 — ptestnzc with a caller-supplied mask; 1 iff both ZF and
        ; CF are clear after (v)ptest, materialized with seta.
   1071 define i32 @test_mm_testnzc_si128(<2 x i64> %a0, <2 x i64> %a1) {
   1072 ; SSE-LABEL: test_mm_testnzc_si128:
   1073 ; SSE:       # %bb.0:
   1074 ; SSE-NEXT:    xorl %eax, %eax
   1075 ; SSE-NEXT:    ptest %xmm1, %xmm0
   1076 ; SSE-NEXT:    seta %al
   1077 ; SSE-NEXT:    ret{{[l|q]}}
   1078 ;
   1079 ; AVX-LABEL: test_mm_testnzc_si128:
   1080 ; AVX:       # %bb.0:
   1081 ; AVX-NEXT:    xorl %eax, %eax
   1082 ; AVX-NEXT:    vptest %xmm1, %xmm0
   1083 ; AVX-NEXT:    seta %al
   1084 ; AVX-NEXT:    ret{{[l|q]}}
   1085   %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1)
   1086   ret i32 %res
   1087 }
   1088 
        ; _mm_testz_si128 — ptestz with a caller-supplied mask; 1 iff (a0 AND a1) is
        ; all zero (ZF set after (v)ptest), materialized with sete.
   1089 define i32 @test_mm_testz_si128(<2 x i64> %a0, <2 x i64> %a1) {
   1090 ; SSE-LABEL: test_mm_testz_si128:
   1091 ; SSE:       # %bb.0:
   1092 ; SSE-NEXT:    xorl %eax, %eax
   1093 ; SSE-NEXT:    ptest %xmm1, %xmm0
   1094 ; SSE-NEXT:    sete %al
   1095 ; SSE-NEXT:    ret{{[l|q]}}
   1096 ;
   1097 ; AVX-LABEL: test_mm_testz_si128:
   1098 ; AVX:       # %bb.0:
   1099 ; AVX-NEXT:    xorl %eax, %eax
   1100 ; AVX-NEXT:    vptest %xmm1, %xmm0
   1101 ; AVX-NEXT:    sete %al
   1102 ; AVX-NEXT:    ret{{[l|q]}}
   1103   %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1)
   1104   ret i32 %res
   1105 }
   1106 
   1106