; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefix=ALL --check-prefix=X32
; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64

; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c

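; Note: each test below models an SSE1 intrinsic using the IR that clang
; emits for it; for example, _mm_add_ps(a, b) is expressed as
;   %res = fadd <4 x float> %a0, %a1
; Both RUN lines use fast-isel, and the x86_64 run subtracts SSE2, so each
; target is restricted to SSE1 instructions when lowering these patterns.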
define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_add_ps:
; X32:       # BB#0:
; X32-NEXT:    addps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_add_ps:
; X64:       # BB#0:
; X64-NEXT:    addps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fadd <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_add_ss:
; X32:       # BB#0:
; X32-NEXT:    addss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_add_ss:
; X64:       # BB#0:
; X64-NEXT:    addss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fadd = fadd float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fadd, i32 0
  ret <4 x float> %res
}

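; Note: _mm_and_ps is modeled as an integer 'and' of bitcast <4 x i32> values.
; Without SSE2 the <4 x i32> type is not legal, so fast-isel cannot keep the
; operation in an XMM register; the checks below show the operands spilled to
; the stack, ANDed one 32-bit lane at a time in GPRs, and the result rebuilt
; with movss/unpcklps. The andnot and or tests scalarize the same way.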
define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_and_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_and_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT:    movq %rdx, %rsi
; X64-NEXT:    andl %eax, %edx
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdi
; X64-NEXT:    andl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %r8d, %edi
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %eax, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = and <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_andnot_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    notl %edx
; X32-NEXT:    notl %ecx
; X32-NEXT:    notl %esi
; X32-NEXT:    notl %eax
; X32-NEXT:    andl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    andl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_andnot_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdx
; X64-NEXT:    shrq $32, %rdx
; X64-NEXT:    movq %rax, %rsi
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdi
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    notl %eax
; X64-NEXT:    andl %edi, %eax
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    notl %ecx
; X64-NEXT:    andl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    notl %esi
; X64-NEXT:    notl %edx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %r8d, %edx
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl %edi, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
  %res = and <4 x i32> %not, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpeqps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpeq_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpeqps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp oeq <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpeq_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpeqss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpeq_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpeqss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone

define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpleps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpge_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpleps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ole <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpge_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpless %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpge_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpless %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpltps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpgt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpltps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp olt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpgt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpltss %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpgt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpltss %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmple_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpleps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmple_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpleps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ole <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmple_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpless %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmple_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpless %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpltps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmplt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpltps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp olt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmplt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpltss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmplt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpltss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpneqps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpneq_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpneqps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp une <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpneq_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpneqss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpneq_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpneqss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnleps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnge_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnleps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ugt <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnge_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnless %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnge_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnless %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnltps %xmm0, %xmm1
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpngt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnltps %xmm0, %xmm1
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp uge <4 x float> %a1, %a0
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpngt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnltss %xmm0, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpngt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnltss %xmm0, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
  %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnleps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnle_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnleps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ugt <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnle_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnless %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnle_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnless %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpnltps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnlt_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpnltps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp uge <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpnlt_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpnltss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpnlt_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpnltss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpordps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpord_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpordps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp ord <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpord_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpordss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpord_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpordss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_ps:
; X32:       # BB#0:
; X32-NEXT:    cmpunordps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpunord_ps:
; X64:       # BB#0:
; X64-NEXT:    cmpunordps %xmm1, %xmm0
; X64-NEXT:    retq
  %cmp = fcmp uno <4 x float> %a0, %a1
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %res = bitcast <4 x i32> %sext to <4 x float>
  ret <4 x float> %res
}

define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_cmpunord_ss:
; X32:       # BB#0:
; X32-NEXT:    cmpunordss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cmpunord_ss:
; X64:       # BB#0:
; X64-NEXT:    cmpunordss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
  ret <4 x float> %res
}

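; Note: comiss sets ZF, PF and CF from an ordered scalar compare, and an
; unordered result (NaN operand) sets all three. Equality therefore also has
; to check the parity flag: setnp+sete+andb for comieq, and the complementary
; setp+setne+orb for comineq. The relational forms only need seta/setae.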
define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comieq_ss:
; X32:       # BB#0:
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    setnp %al
; X32-NEXT:    sete %cl
; X32-NEXT:    andb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comieq_ss:
; X64:       # BB#0:
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    setnp %al
; X64-NEXT:    sete %cl
; X64-NEXT:    andb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comige_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comige_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comigt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comigt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comile_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm0, %xmm1
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comile_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm0, %xmm1
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comilt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    comiss %xmm0, %xmm1
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comilt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    comiss %xmm0, %xmm1
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_comineq_ss:
; X32:       # BB#0:
; X32-NEXT:    comiss %xmm1, %xmm0
; X32-NEXT:    setp %al
; X32-NEXT:    setne %cl
; X32-NEXT:    orb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_comineq_ss:
; X64:       # BB#0:
; X64-NEXT:    comiss %xmm1, %xmm0
; X64-NEXT:    setp %al
; X64-NEXT:    setne %cl
; X64-NEXT:    orb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone

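; Note: cvtss2si rounds according to the current MXCSR rounding mode, so the
; _mm_cvt_ss2si/_mm_cvtss_si32 tests call the @llvm.x86.sse.cvtss2si
; intrinsic, while the truncating _mm_cvttss_si* variants can be expressed as
; a plain fptosi, which selects cvttss2si.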
define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvt_ss2si:
; X32:       # BB#0:
; X32-NEXT:    cvtss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvt_ss2si:
; X64:       # BB#0:
; X64-NEXT:    cvtss2si %xmm0, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone

define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; X32-LABEL: test_mm_cvtsi32_ss:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    cvtsi2ssl %eax, %xmm1
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvtsi32_ss:
; X64:       # BB#0:
; X64-NEXT:    cvtsi2ssl %edi, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %cvt = sitofp i32 %a1 to float
  %res = insertelement <4 x float> %a0, float %cvt, i32 0
  ret <4 x float> %res
}

define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_f32:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movss %xmm0, (%esp)
; X32-NEXT:    flds (%esp)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvtss_f32:
; X64:       # BB#0:
; X64-NEXT:    retq
  %res = extractelement <4 x float> %a0, i32 0
  ret float %res
}

define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvtss_si32:
; X32:       # BB#0:
; X32-NEXT:    cvtss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvtss_si32:
; X64:       # BB#0:
; X64-NEXT:    cvtss2si %xmm0, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
  ret i32 %res
}

define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si:
; X32:       # BB#0:
; X32-NEXT:    cvttss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvttss_si:
; X64:       # BB#0:
; X64-NEXT:    cvttss2si %xmm0, %eax
; X64-NEXT:    retq
  %cvt = extractelement <4 x float> %a0, i32 0
  %res = fptosi float %cvt to i32
  ret i32 %res
}

define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_cvttss_si32:
; X32:       # BB#0:
; X32-NEXT:    cvttss2si %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_cvttss_si32:
; X64:       # BB#0:
; X64-NEXT:    cvttss2si %xmm0, %eax
; X64-NEXT:    retq
  %cvt = extractelement <4 x float> %a0, i32 0
  %res = fptosi float %cvt to i32
  ret i32 %res
}

define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_div_ps:
; X32:       # BB#0:
; X32-NEXT:    divps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_div_ps:
; X64:       # BB#0:
; X64-NEXT:    divps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fdiv <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_div_ss:
; X32:       # BB#0:
; X32-NEXT:    divss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_div_ss:
; X64:       # BB#0:
; X64-NEXT:    divss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fdiv = fdiv float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fdiv, i32 0
  ret <4 x float> %res
}

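; Note: the _MM_GET_* helpers all store MXCSR to a stack slot via stmxcsr and
; mask out the field of interest:
;   exception state = MXCSR & 0x003F (63)
;   exception mask  = MXCSR & 0x1F80 (8064)
;   rounding mode   = MXCSR & 0x6000 (24576)
;   flush-to-zero   = MXCSR & 0x8000 (32768)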
define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
; X32-LABEL: test_MM_GET_EXCEPTION_MASK:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $8064, %eax # imm = 0x1F80
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_EXCEPTION_MASK:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $8064, %eax # imm = 0x1F80
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 8064
  ret i32 %4
}
declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone

define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
; X32-LABEL: test_MM_GET_EXCEPTION_STATE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $63, %eax
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_EXCEPTION_STATE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $63, %eax
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 63
  ret i32 %4
}

define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
; X32-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $32768, %eax # imm = 0x8000
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $32768, %eax # imm = 0x8000
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 32768
  ret i32 %4
}

define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
; X32-LABEL: test_MM_GET_ROUNDING_MODE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    andl $24576, %eax # imm = 0x6000
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_GET_ROUNDING_MODE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    andl $24576, %eax # imm = 0x6000
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  %4 = and i32 %3, 24576
  ret i32 %4
}

define i32 @test_mm_getcsr() nounwind {
; X32-LABEL: test_mm_getcsr:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    leal (%esp), %eax
; X32-NEXT:    stmxcsr (%eax)
; X32-NEXT:    movl (%esp), %eax
; X32-NEXT:    popl %ecx
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_getcsr:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1, align 4
  ret i32 %3
}

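; Note: the load tests differ only in alignment and splatting. _mm_load_ps is
; a 16-byte aligned load (movaps), _mm_loadu_ps an unaligned load (align 1 in
; the IR, movups), _mm_load_ss a single-lane load that zeros lanes 1-3, and
; _mm_load_ps1/_mm_load1_ps load one float and splat it via shufps.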
define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movaps (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps (%rdi), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 16
  ret <4 x float> %res
}

define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ps1:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load_ps1:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
; X32-LABEL: test_mm_load_ss:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load_ss:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    retq
  %ld = load float, float* %a0, align 1
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_load1_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_load1_ps:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
  %ld = load float, float* %a0, align 4
  %res0 = insertelement <4 x float> undef, float %ld, i32 0
  %res1 = insertelement <4 x float> %res0, float %ld, i32 1
  %res2 = insertelement <4 x float> %res1, float %ld, i32 2
  %res3 = insertelement <4 x float> %res2, float %ld, i32 3
  ret <4 x float> %res3
}

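; Note: _mm_loadh_pi/_mm_loadl_pi take a __m64 pointer, modeled here as an
; x86_mmx* bitcast to <2 x float>*. On the SSE1-only x86_64 run the 8-byte
; load is bounced through a GPR and a stack slot before the two floats are
; reassembled, since fast-isel does not select the <2 x float> load directly
; here.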
define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-LABEL: test_mm_loadh_pi:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_loadh_pi:
; X64:       # BB#0:
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld  = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}

define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
; X32-LABEL: test_mm_loadl_pi:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X32-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X32-NEXT:    movaps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_loadl_pi:
; X64:       # BB#0:
; X64-NEXT:    movq (%rdi), %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; X64-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a1 to <2 x float>*
  %ld  = load <2 x float>, <2 x float>* %ptr
  %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
  %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_loadr_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movaps (%eax), %xmm0
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_loadr_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps (%rdi), %xmm0
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %ld = load <4 x float>, <4 x float>* %arg0, align 16
  %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %res
}

define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
; X32-LABEL: test_mm_loadu_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movups (%eax), %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_loadu_ps:
; X64:       # BB#0:
; X64-NEXT:    movups (%rdi), %xmm0
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %res = load <4 x float>, <4 x float>* %arg0, align 1
  ret <4 x float> %res
}

define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_max_ps:
; X32:       # BB#0:
; X32-NEXT:    maxps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_max_ps:
; X64:       # BB#0:
; X64-NEXT:    maxps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_max_ss:
; X32:       # BB#0:
; X32-NEXT:    maxss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_max_ss:
; X64:       # BB#0:
; X64-NEXT:    maxss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_min_ps:
; X32:       # BB#0:
; X32-NEXT:    minps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_min_ps:
; X64:       # BB#0:
; X64-NEXT:    minps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_min_ss:
; X32:       # BB#0:
; X32-NEXT:    minss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_min_ss:
; X64:       # BB#0:
; X64-NEXT:    minss %xmm1, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone

define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_move_ss:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_move_ss:
; X64:       # BB#0:
; X64-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_movehl_ps:
; X32:       # BB#0:
; X32-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movehl_ps:
; X64:       # BB#0:
; X64-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  ret <4 x float> %res
}

define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
; X32-LABEL: test_mm_movelh_ps:
; X32:       # BB#0:
; X32-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movelh_ps:
; X64:       # BB#0:
; X64-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ret <4 x float> %res
}

define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
; X32-LABEL: test_mm_movemask_ps:
; X32:       # BB#0:
; X32-NEXT:    movmskps %xmm0, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_movemask_ps:
; X64:       # BB#0:
; X64-NEXT:    movmskps %xmm0, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
  ret i32 %res
}
declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_mul_ps:
; X32:       # BB#0:
; X32-NEXT:    mulps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mul_ps:
; X64:       # BB#0:
; X64-NEXT:    mulps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fmul <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_mul_ss:
; X32:       # BB#0:
; X32-NEXT:    mulss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_mul_ss:
; X64:       # BB#0:
; X64-NEXT:    mulss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fmul = fmul float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fmul, i32 0
  ret <4 x float> %res
}

define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_or_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    orl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    orl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    orl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT:    orl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_or_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT:    movq %rdx, %rsi
; X64-NEXT:    orl %eax, %edx
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdi
; X64-NEXT:    orl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    orl %r8d, %edi
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    orl %eax, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = or <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

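; Note: @llvm.prefetch takes (address, rw, locality, cache type); rw=0 is a
; read prefetch and locality=0 means no temporal reuse, which selects the
; non-temporal prefetchnta form.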
define void @test_mm_prefetch(i8* %a0) {
; X32-LABEL: test_mm_prefetch:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    prefetchnta (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_prefetch:
; X64:       # BB#0:
; X64-NEXT:    prefetchnta (%rdi)
; X64-NEXT:    retq
  call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
  ret void
}
declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone

define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_rcp_ps:
; X32:       # BB#0:
; X32-NEXT:    rcpps %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_rcp_ps:
; X64:       # BB#0:
; X64-NEXT:    rcpps %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone

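; Note: the scalar rcp/rsqrt intrinsics only define lane 0 of their result,
; so the IR below rebuilds the vector from lane 0 of the intrinsic result
; plus lanes 1-3 of the original argument. Since rcpss/rsqrtss already
; preserve the upper lanes of the destination, a single instruction suffices.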
   1368 define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
   1369 ; X32-LABEL: test_mm_rcp_ss:
   1370 ; X32:       # BB#0:
   1371 ; X32-NEXT:    rcpss %xmm0, %xmm0
   1372 ; X32-NEXT:    retl
   1373 ;
   1374 ; X64-LABEL: test_mm_rcp_ss:
   1375 ; X64:       # BB#0:
   1376 ; X64-NEXT:    rcpss %xmm0, %xmm0
   1377 ; X64-NEXT:    retq
   1378   %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
   1379   %ext0 = extractelement <4 x float> %rcp, i32 0
   1380   %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
   1381   %ext1 = extractelement <4 x float> %a0, i32 1
   1382   %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
   1383   %ext2 = extractelement <4 x float> %a0, i32 2
   1384   %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
   1385   %ext3 = extractelement <4 x float> %a0, i32 3
   1386   %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
   1387   ret <4 x float> %ins3
   1388 }
   1389 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_rsqrt_ps:
; X32:       # BB#0:
; X32-NEXT:    rsqrtps %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_rsqrt_ps:
; X64:       # BB#0:
; X64-NEXT:    rsqrtps %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_rsqrt_ss:
; X32:       # BB#0:
; X32-NEXT:    rsqrtss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_rsqrt_ss:
; X64:       # BB#0:
; X64-NEXT:    rsqrtss %xmm0, %xmm0
; X64-NEXT:    retq
  %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
  %ext0 = extractelement <4 x float> %rsqrt, i32 0
  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
  %ext1 = extractelement <4 x float> %a0, i32 1
  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
  %ext2 = extractelement <4 x float> %a0, i32 2
  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
  %ext3 = extractelement <4 x float> %a0, i32 3
  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
  ret <4 x float> %ins3
}
declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
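; NOTE: Same single-lane pattern as test_mm_rcp_ss above, here for rsqrtss.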

define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_EXCEPTION_MASK:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    leal (%esp), %ecx
; X32-NEXT:    stmxcsr (%ecx)
; X32-NEXT:    movl (%esp), %edx
; X32-NEXT:    andl $-8065, %edx # imm = 0xE07F
; X32-NEXT:    orl %eax, %edx
; X32-NEXT:    movl %edx, (%esp)
; X32-NEXT:    ldmxcsr (%ecx)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_SET_EXCEPTION_MASK:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    andl $-8065, %ecx # imm = 0xE07F
; X64-NEXT:    orl %edi, %ecx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    ldmxcsr (%rax)
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -8065
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone
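; NOTE: This is a read-modify-write of MXCSR: -8065 (0xFFFFE07F) clears the
; six exception mask bits [12:7] before or'ing in the new mask.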

define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_EXCEPTION_STATE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    leal (%esp), %ecx
; X32-NEXT:    stmxcsr (%ecx)
; X32-NEXT:    movl (%esp), %edx
; X32-NEXT:    andl $-64, %edx
; X32-NEXT:    orl %eax, %edx
; X32-NEXT:    movl %edx, (%esp)
; X32-NEXT:    ldmxcsr (%ecx)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_SET_EXCEPTION_STATE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    andl $-64, %ecx
; X64-NEXT:    orl %edi, %ecx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    ldmxcsr (%rax)
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -64
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
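; NOTE: -64 (0xFFFFFFC0) clears the six exception status flags in MXCSR
; bits [5:0].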

define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    leal (%esp), %ecx
; X32-NEXT:    stmxcsr (%ecx)
; X32-NEXT:    movl (%esp), %edx
; X32-NEXT:    andl $-32769, %edx # imm = 0xFFFF7FFF
; X32-NEXT:    orl %eax, %edx
; X32-NEXT:    movl %edx, (%esp)
; X32-NEXT:    ldmxcsr (%ecx)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    andl $-32769, %ecx # imm = 0xFFFF7FFF
; X64-NEXT:    orl %edi, %ecx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    ldmxcsr (%rax)
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -32769
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
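; NOTE: -32769 (0xFFFF7FFF) clears the flush-to-zero control bit (MXCSR
; bit 15).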

define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X32-LABEL: test_mm_set_ps:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_set_ps:
; X64:       # BB#0:
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT:    movaps %xmm3, %xmm0
; X64-NEXT:    retq
  %res0  = insertelement <4 x float> undef, float %a3, i32 0
  %res1  = insertelement <4 x float> %res0, float %a2, i32 1
  %res2  = insertelement <4 x float> %res1, float %a1, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}
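; NOTE: _mm_set_ps takes its arguments from high element to low, so %a3
; lands in element 0; that is why the result is assembled in xmm3 on X64
; and moved to xmm0 at the end.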

define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
; X32-LABEL: test_mm_set_ps1:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_set_ps1:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
; X32-LABEL: test_MM_SET_ROUNDING_MODE:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    leal (%esp), %ecx
; X32-NEXT:    stmxcsr (%ecx)
; X32-NEXT:    movl (%esp), %edx
; X32-NEXT:    andl $-24577, %edx # imm = 0x9FFF
; X32-NEXT:    orl %eax, %edx
; X32-NEXT:    movl %edx, (%esp)
; X32-NEXT:    ldmxcsr (%ecx)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_SET_ROUNDING_MODE:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    stmxcsr (%rax)
; X64-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx
; X64-NEXT:    andl $-24577, %ecx # imm = 0x9FFF
; X64-NEXT:    orl %edi, %ecx
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    ldmxcsr (%rax)
; X64-NEXT:    retq
  %1 = alloca i32, align 4
  %2 = bitcast i32* %1 to i8*
  call void @llvm.x86.sse.stmxcsr(i8* %2)
  %3 = load i32, i32* %1
  %4 = and i32 %3, -24577
  %5 = or i32 %4, %a0
  store i32 %5, i32* %1
  call void @llvm.x86.sse.ldmxcsr(i8* %2)
  ret void
}
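; NOTE: -24577 (0xFFFF9FFF) clears the rounding control field (MXCSR bits
; [14:13]): 00 = nearest, 01 = down, 10 = up, 11 = toward zero.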

define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; X32-LABEL: test_mm_set_ss:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    xorps %xmm0, %xmm0
; X32-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_set_ss:
; X64:       # BB#0:
; X64-NEXT:    xorps %xmm1, %xmm1
; X64-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; X64-NEXT:    movaps %xmm1, %xmm0
; X64-NEXT:    retq
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float 0.0, i32 1
  %res2  = insertelement <4 x float> %res1, float 0.0, i32 2
  %res3  = insertelement <4 x float> %res2, float 0.0, i32 3
  ret <4 x float> %res3
}

define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
; X32-LABEL: test_mm_set1_ps:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_set1_ps:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    retq
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a0, i32 1
  %res2  = insertelement <4 x float> %res1, float %a0, i32 2
  %res3  = insertelement <4 x float> %res2, float %a0, i32 3
  ret <4 x float> %res3
}

define void @test_mm_setcsr(i32 %a0) nounwind {
; X32-LABEL: test_mm_setcsr:
; X32:       # BB#0:
; X32-NEXT:    pushl %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    leal (%esp), %ecx
; X32-NEXT:    movl %eax, (%esp)
; X32-NEXT:    ldmxcsr (%ecx)
; X32-NEXT:    popl %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_setcsr:
; X64:       # BB#0:
; X64-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    ldmxcsr (%rax)
; X64-NEXT:    retq
  %st = alloca i32, align 4
  store i32 %a0, i32* %st, align 4
  %bc = bitcast i32* %st to i8*
  call void @llvm.x86.sse.ldmxcsr(i8* %bc)
  ret void
}

define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; X32-LABEL: test_mm_setr_ps:
; X32:       # BB#0:
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_setr_ps:
; X64:       # BB#0:
; X64-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %res0  = insertelement <4 x float> undef, float %a0, i32 0
  %res1  = insertelement <4 x float> %res0, float %a1, i32 1
  %res2  = insertelement <4 x float> %res1, float %a2, i32 2
  %res3  = insertelement <4 x float> %res2, float %a3, i32 3
  ret <4 x float> %res3
}
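; NOTE: Unlike _mm_set_ps, the setr variant inserts in argument order (%a0
; into element 0), so the result is already assembled in xmm0 and no final
; register move is needed on X64.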

define <4 x float> @test_mm_setzero_ps() {
; X32-LABEL: test_mm_setzero_ps:
; X32:       # BB#0:
; X32-NEXT:    xorps %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_setzero_ps:
; X64:       # BB#0:
; X64-NEXT:    xorps %xmm0, %xmm0
; X64-NEXT:    retq
  ret <4 x float> zeroinitializer
}

define void @test_mm_sfence() nounwind {
; X32-LABEL: test_mm_sfence:
; X32:       # BB#0:
; X32-NEXT:    sfence
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sfence:
; X64:       # BB#0:
; X64-NEXT:    sfence
; X64-NEXT:    retq
  call void @llvm.x86.sse.sfence()
  ret void
}
declare void @llvm.x86.sse.sfence() nounwind readnone

define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_shuffle_ps:
; X32:       # BB#0:
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_shuffle_ps:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
  ret <4 x float> %res
}

define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
; X32-LABEL: test_mm_sqrt_ps:
; X32:       # BB#0:
; X32-NEXT:    sqrtps %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sqrt_ps:
; X64:       # BB#0:
; X64-NEXT:    sqrtps %xmm0, %xmm0
; X64-NEXT:    retq
  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
  ret <4 x float> %res
}
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone

define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
; X32-LABEL: test_mm_sqrt_ss:
; X32:       # BB#0:
; X32-NEXT:    sqrtss %xmm0, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sqrt_ss:
; X64:       # BB#0:
; X64-NEXT:    sqrtss %xmm0, %xmm0
; X64-NEXT:    retq
  %sqrt = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
  %ext0 = extractelement <4 x float> %sqrt, i32 0
  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
  %ext1 = extractelement <4 x float> %a0, i32 1
  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
  %ext2 = extractelement <4 x float> %a0, i32 2
  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
  %ext3 = extractelement <4 x float> %a0, i32 3
  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
  ret <4 x float> %ins3
}
declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone

define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movaps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_store_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16
  ret void
}

define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ps1:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    movaps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_store_ps1:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store_ss:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movss %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_store_ss:
; X64:       # BB#0:
; X64-NEXT:    movss %xmm0, (%rdi)
; X64-NEXT:    retq
  %ext = extractelement <4 x float> %a1, i32 0
  store float %ext, float* %a0, align 1
  ret void
}

define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_store1_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-NEXT:    movaps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_store1_ps:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_storeh_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $32, %esp
; X32-NEXT:    movl 8(%ebp), %eax
; X32-NEXT:    movaps %xmm0, (%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, 4(%eax)
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storeh_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq %rax, (%rdi)
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc  = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 1
  store i64 %ext, i64* %ptr
  ret void
}
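; NOTE: This stores the high 64 bits of the vector (float elements 2 and 3);
; under fast-isel the extract goes through a stack spill and a GPR load
; rather than a movhps store.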

define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_storel_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $32, %esp
; X32-NEXT:    movl 8(%ebp), %eax
; X32-NEXT:    movaps %xmm0, (%esp)
; X32-NEXT:    movl (%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, 4(%eax)
; X32-NEXT:    movl %ecx, (%eax)
; X32-NEXT:    movl %ebp, %esp
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storel_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq %rax, (%rdi)
; X64-NEXT:    retq
  %ptr = bitcast x86_mmx* %a0 to i64*
  %bc  = bitcast <4 x float> %a1 to <2 x i64>
  %ext = extractelement <2 x i64> %bc, i32 0
  store i64 %ext, i64* %ptr
  ret void
}
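; NOTE: The storel variant stores the low 64 bits (elements 0 and 1), again
; through the stack instead of a movlps store.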

define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_storer_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X32-NEXT:    movaps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storer_ps:
; X64:       # BB#0:
; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2,1,0]
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  store <4 x float> %shuf, <4 x float>* %arg0, align 16
  ret void
}

define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_storeu_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movups %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_storeu_ps:
; X64:       # BB#0:
; X64-NEXT:    movups %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 1
  ret void
}

define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
; X32-LABEL: test_mm_stream_ps:
; X32:       # BB#0:
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movntps %xmm0, (%eax)
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_stream_ps:
; X64:       # BB#0:
; X64-NEXT:    movntps %xmm0, (%rdi)
; X64-NEXT:    retq
  %arg0 = bitcast float* %a0 to <4 x float>*
  store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
  ret void
}
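; NOTE: The !nontemporal metadata (!0, defined at the bottom of the file) is
; what selects the streaming movntps store over a plain movaps.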

define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_sub_ps:
; X32:       # BB#0:
; X32-NEXT:    subps %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sub_ps:
; X64:       # BB#0:
; X64-NEXT:    subps %xmm1, %xmm0
; X64-NEXT:    retq
  %res = fsub <4 x float> %a0, %a1
  ret <4 x float> %res
}

define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_sub_ss:
; X32:       # BB#0:
; X32-NEXT:    subss %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_sub_ss:
; X64:       # BB#0:
; X64-NEXT:    subss %xmm1, %xmm0
; X64-NEXT:    retq
  %ext0 = extractelement <4 x float> %a0, i32 0
  %ext1 = extractelement <4 x float> %a1, i32 0
  %fsub = fsub float %ext0, %ext1
  %res = insertelement <4 x float> %a0, float %fsub, i32 0
  ret <4 x float> %res
}

define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
; X32-LABEL: test_MM_TRANSPOSE4_PS:
; X32:       # BB#0:
; X32-NEXT:    pushl %esi
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps (%esi), %xmm0
; X32-NEXT:    movaps (%edx), %xmm1
; X32-NEXT:    movaps (%ecx), %xmm2
; X32-NEXT:    movaps (%eax), %xmm3
; X32-NEXT:    movaps %xmm0, %xmm4
; X32-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X32-NEXT:    movaps %xmm2, %xmm5
; X32-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X32-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X32-NEXT:    movaps %xmm4, %xmm1
; X32-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; X32-NEXT:    movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
; X32-NEXT:    movaps %xmm0, %xmm3
; X32-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X32-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; X32-NEXT:    movaps %xmm1, (%esi)
; X32-NEXT:    movaps %xmm5, (%edx)
; X32-NEXT:    movaps %xmm3, (%ecx)
; X32-NEXT:    movaps %xmm2, (%eax)
; X32-NEXT:    popl %esi
; X32-NEXT:    retl
;
; X64-LABEL: test_MM_TRANSPOSE4_PS:
; X64:       # BB#0:
; X64-NEXT:    movaps (%rdi), %xmm0
; X64-NEXT:    movaps (%rsi), %xmm1
; X64-NEXT:    movaps (%rdx), %xmm2
; X64-NEXT:    movaps (%rcx), %xmm3
; X64-NEXT:    movaps %xmm0, %xmm4
; X64-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X64-NEXT:    movaps %xmm2, %xmm5
; X64-NEXT:    unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X64-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X64-NEXT:    movaps %xmm4, %xmm1
; X64-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; X64-NEXT:    movhlps {{.*#+}} xmm5 = xmm4[1],xmm5[1]
; X64-NEXT:    movaps %xmm0, %xmm3
; X64-NEXT:    movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X64-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; X64-NEXT:    movaps %xmm1, (%rdi)
; X64-NEXT:    movaps %xmm5, (%rsi)
; X64-NEXT:    movaps %xmm3, (%rdx)
; X64-NEXT:    movaps %xmm2, (%rcx)
; X64-NEXT:    retq
  %row0 = load <4 x float>, <4 x float>* %a0, align 16
  %row1 = load <4 x float>, <4 x float>* %a1, align 16
  %row2 = load <4 x float>, <4 x float>* %a2, align 16
  %row3 = load <4 x float>, <4 x float>* %a3, align 16
  %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
  store <4 x float> %res0, <4 x float>* %a0, align 16
  store <4 x float> %res1, <4 x float>* %a1, align 16
  store <4 x float> %res2, <4 x float>* %a2, align 16
  store <4 x float> %res3, <4 x float>* %a3, align 16
  ret void
}
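; NOTE: This is the classic 4x4 transpose: unpcklps/unpckhps interleave row
; pairs, then movlhps/movhlps recombine the 64-bit halves. The operand-swapped
; masks (<6,7,2,3>) for %res1/%res3 express the high-half merge in a form
; that maps directly onto movhlps.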

define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomieq_ss:
; X32:       # BB#0:
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    setnp %al
; X32-NEXT:    sete %cl
; X32-NEXT:    andb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomieq_ss:
; X64:       # BB#0:
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    setnp %al
; X64-NEXT:    sete %cl
; X64-NEXT:    andb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
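; NOTE: ucomiss sets PF when the operands are unordered (a NaN), so 'equal'
; must check both ZF=1 (sete) and PF=0 (setnp), combined with andb.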

define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomige_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomige_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomigt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomigt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomile_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm0, %xmm1
; X32-NEXT:    setae %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomile_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm0, %xmm1
; X64-NEXT:    setae %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomilt_ss:
; X32:       # BB#0:
; X32-NEXT:    xorl %eax, %eax
; X32-NEXT:    ucomiss %xmm0, %xmm1
; X32-NEXT:    seta %al
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomilt_ss:
; X64:       # BB#0:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    ucomiss %xmm0, %xmm1
; X64-NEXT:    seta %al
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone

define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_ucomineq_ss:
; X32:       # BB#0:
; X32-NEXT:    ucomiss %xmm1, %xmm0
; X32-NEXT:    setp %al
; X32-NEXT:    setne %cl
; X32-NEXT:    orb %al, %cl
; X32-NEXT:    movzbl %cl, %eax
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_ucomineq_ss:
; X64:       # BB#0:
; X64-NEXT:    ucomiss %xmm1, %xmm0
; X64-NEXT:    setp %al
; X64-NEXT:    setne %cl
; X64-NEXT:    orb %al, %cl
; X64-NEXT:    movzbl %cl, %eax
; X64-NEXT:    retq
  %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
  ret i32 %res
}
declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
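; NOTE: 'Not equal' is the inverse idiom: unordered compares count as not
; equal, so setne is or'ed with setp. The ge/gt/le/lt tests above need no
; parity check because seta/setae already treat unordered (CF=1) as false.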

define <4 x float> @test_mm_undefined_ps() {
; X32-LABEL: test_mm_undefined_ps:
; X32:       # BB#0:
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_undefined_ps:
; X64:       # BB#0:
; X64-NEXT:    retq
  ret <4 x float> undef
}

define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_unpackhi_ps:
; X32:       # BB#0:
; X32-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_unpackhi_ps:
; X64:       # BB#0:
; X64-NEXT:    unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %res
}

define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_unpacklo_ps:
; X32:       # BB#0:
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_unpacklo_ps:
; X64:       # BB#0:
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    retq
  %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %res
}

define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; X32-LABEL: test_mm_xor_ps:
; X32:       # BB#0:
; X32-NEXT:    pushl %ebp
; X32-NEXT:    movl %esp, %ebp
; X32-NEXT:    pushl %esi
; X32-NEXT:    andl $-16, %esp
; X32-NEXT:    subl $64, %esp
; X32-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %esi
; X32-NEXT:    movl %esi, (%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %edx
; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X32-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X32-NEXT:    leal -4(%ebp), %esp
; X32-NEXT:    popl %esi
; X32-NEXT:    popl %ebp
; X32-NEXT:    retl
;
; X64-LABEL: test_mm_xor_ps:
; X64:       # BB#0:
; X64-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %r8
; X64-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
; X64-NEXT:    movq %rdx, %rsi
; X64-NEXT:    xorl %eax, %edx
; X64-NEXT:    shrq $32, %rax
; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
; X64-NEXT:    movq %rcx, %rdi
; X64-NEXT:    xorl %r8d, %ecx
; X64-NEXT:    shrq $32, %r8
; X64-NEXT:    shrq $32, %rsi
; X64-NEXT:    shrq $32, %rdi
; X64-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
; X64-NEXT:    xorl %r8d, %edi
; X64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    xorl %eax, %esi
; X64-NEXT:    movl %esi, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; X64-NEXT:    retq
  %arg0 = bitcast <4 x float> %a0 to <4 x i32>
  %arg1 = bitcast <4 x float> %a1 to <4 x i32>
  %res = xor <4 x i32> %arg0, %arg1
  %bc = bitcast <4 x i32> %res to <4 x float>
  ret <4 x float> %bc
}

!0 = !{i32 1}