; Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE
      3 ; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
      4 ; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=i386-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
      5 ; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+sse,-sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE
      6 ; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
      7 ; RUN: llc -show-mc-encoding -fast-isel-sink-local-values < %s -fast-isel -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512
      8 
      9 ; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse-builtins.c
     10 
     11 define <4 x float> @test_mm_add_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Lane-wise single-precision add; the vector fadd should select addps / vaddps.
     12 ; SSE-LABEL: test_mm_add_ps:
     13 ; SSE:       # %bb.0:
     14 ; SSE-NEXT:    addps %xmm1, %xmm0 # encoding: [0x0f,0x58,0xc1]
     15 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     16 ;
     17 ; AVX1-LABEL: test_mm_add_ps:
     18 ; AVX1:       # %bb.0:
     19 ; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x58,0xc1]
     20 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     21 ;
     22 ; AVX512-LABEL: test_mm_add_ps:
     23 ; AVX512:       # %bb.0:
     24 ; AVX512-NEXT:    vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1]
     25 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     26   %res = fadd <4 x float> %a0, %a1
     27   ret <4 x float> %res
     28 }
     29 
     30 define <4 x float> @test_mm_add_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar add of element 0 only; elements 1-3 of the result are taken from %a0,
        ; matching the merge semantics of addss / vaddss.
     31 ; SSE-LABEL: test_mm_add_ss:
     32 ; SSE:       # %bb.0:
     33 ; SSE-NEXT:    addss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x58,0xc1]
     34 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     35 ;
     36 ; AVX1-LABEL: test_mm_add_ss:
     37 ; AVX1:       # %bb.0:
     38 ; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x58,0xc1]
     39 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     40 ;
     41 ; AVX512-LABEL: test_mm_add_ss:
     42 ; AVX512:       # %bb.0:
     43 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x58,0xc1]
     44 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     45   %ext0 = extractelement <4 x float> %a0, i32 0
     46   %ext1 = extractelement <4 x float> %a1, i32 0
     47   %fadd = fadd float %ext0, %ext1
     48   %res = insertelement <4 x float> %a0, float %fadd, i32 0
     49   ret <4 x float> %res
     50 }
     51 
     52 define <4 x float> @test_mm_and_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Bitwise AND of the two 128-bit values, expressed via bitcasts to <4 x i32>
        ; (integer `and` has no float form in IR); should still select the FP andps.
     53 ; SSE-LABEL: test_mm_and_ps:
     54 ; SSE:       # %bb.0:
     55 ; SSE-NEXT:    andps %xmm1, %xmm0 # encoding: [0x0f,0x54,0xc1]
     56 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     57 ;
     58 ; AVX1-LABEL: test_mm_and_ps:
     59 ; AVX1:       # %bb.0:
     60 ; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x54,0xc1]
     61 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     62 ;
     63 ; AVX512-LABEL: test_mm_and_ps:
     64 ; AVX512:       # %bb.0:
     65 ; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x54,0xc1]
     66 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     67   %arg0 = bitcast <4 x float> %a0 to <4 x i32>
     68   %arg1 = bitcast <4 x float> %a1 to <4 x i32>
     69   %res = and <4 x i32> %arg0, %arg1
     70   %bc = bitcast <4 x i32> %res to <4 x float>
     71   ret <4 x float> %bc
     72 }
     73 
     74 define <4 x float> @test_mm_andnot_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Computes (~%a0) & %a1 — note the first operand is the one inverted,
        ; matching the andnps operand convention.
     75 ; SSE-LABEL: test_mm_andnot_ps:
     76 ; SSE:       # %bb.0:
     77 ; SSE-NEXT:    andnps %xmm1, %xmm0 # encoding: [0x0f,0x55,0xc1]
     78 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     79 ;
     80 ; AVX1-LABEL: test_mm_andnot_ps:
     81 ; AVX1:       # %bb.0:
     82 ; AVX1-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x55,0xc1]
     83 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     84 ;
     85 ; AVX512-LABEL: test_mm_andnot_ps:
     86 ; AVX512:       # %bb.0:
     87 ; AVX512-NEXT:    vandnps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x55,0xc1]
     88 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
     89   %arg0 = bitcast <4 x float> %a0 to <4 x i32>
     90   %arg1 = bitcast <4 x float> %a1 to <4 x i32>
     91   %not = xor <4 x i32> %arg0, <i32 -1, i32 -1, i32 -1, i32 -1>
     92   %res = and <4 x i32> %not, %arg1
     93   %bc = bitcast <4 x i32> %res to <4 x float>
     94   ret <4 x float> %bc
     95 }
     96 
     97 define <4 x float> @test_mm_cmpeq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Ordered-equal compare; the sext turns the <4 x i1> result into the
        ; all-ones / all-zeros lane mask that cmpps produces. The avx512 run
        ; compares into a mask register and expands it with vpmovm2d.
     98 ; SSE-LABEL: test_mm_cmpeq_ps:
     99 ; SSE:       # %bb.0:
    100 ; SSE-NEXT:    cmpeqps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x00]
    101 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    102 ;
    103 ; AVX1-LABEL: test_mm_cmpeq_ps:
    104 ; AVX1:       # %bb.0:
    105 ; AVX1-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x00]
    106 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    107 ;
    108 ; AVX512-LABEL: test_mm_cmpeq_ps:
    109 ; AVX512:       # %bb.0:
    110 ; AVX512-NEXT:    vcmpeqps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x00]
    111 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    112 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    113   %cmp = fcmp oeq <4 x float> %a0, %a1
    114   %sext = sext <4 x i1> %cmp to <4 x i32>
    115   %res = bitcast <4 x i32> %sext to <4 x float>
    116   ret <4 x float> %res
    117 }
    118 
    119 define <4 x float> @test_mm_cmpeq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar compare through the cmp.ss intrinsic with predicate immediate 0 (EQ).
    120 ; SSE-LABEL: test_mm_cmpeq_ss:
    121 ; SSE:       # %bb.0:
    122 ; SSE-NEXT:    cmpeqss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x00]
    123 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    124 ;
    125 ; AVX-LABEL: test_mm_cmpeq_ss:
    126 ; AVX:       # %bb.0:
    127 ; AVX-NEXT:    vcmpeqss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x00]
    128 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    129   %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 0)
    130   ret <4 x float> %res
    131 }
    132 declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
    133 
    134 define <4 x float> @test_mm_cmpge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; a0 >= a1 is written as a1 <= a0 (fcmp ole with operands swapped), which is
        ; why the expected compares below take their operands in reversed order.
    135 ; SSE-LABEL: test_mm_cmpge_ps:
    136 ; SSE:       # %bb.0:
    137 ; SSE-NEXT:    cmpleps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x02]
    138 ; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
    139 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    140 ;
    141 ; AVX1-LABEL: test_mm_cmpge_ps:
    142 ; AVX1:       # %bb.0:
    143 ; AVX1-NEXT:    vcmpleps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x02]
    144 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    145 ;
    146 ; AVX512-LABEL: test_mm_cmpge_ps:
    147 ; AVX512:       # %bb.0:
    148 ; AVX512-NEXT:    vcmpleps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x02]
    149 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    150 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    151   %cmp = fcmp ole <4 x float> %a1, %a0
    152   %sext = sext <4 x i1> %cmp to <4 x i32>
    153   %res = bitcast <4 x i32> %sext to <4 x float>
    154   ret <4 x float> %res
    155 }
    156 
    157 define <4 x float> @test_mm_cmpge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar GE built from LE (imm 2) with operands swapped; the shufflevector
        ; then places the compare result into element 0 of %a0.
    158 ; SSE-LABEL: test_mm_cmpge_ss:
    159 ; SSE:       # %bb.0:
    160 ; SSE-NEXT:    cmpless %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x02]
    161 ; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
    162 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
    163 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    164 ;
    165 ; AVX-LABEL: test_mm_cmpge_ss:
    166 ; AVX:       # %bb.0:
    167 ; AVX-NEXT:    vcmpless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x02]
    168 ; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
    169 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
    170 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    171   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 2)
    172   %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
    173   ret <4 x float> %res
    174 }
    175 
    176 define <4 x float> @test_mm_cmpgt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; a0 > a1 is written as a1 < a0 (fcmp olt with operands swapped).
    177 ; SSE-LABEL: test_mm_cmpgt_ps:
    178 ; SSE:       # %bb.0:
    179 ; SSE-NEXT:    cmpltps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x01]
    180 ; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
    181 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    182 ;
    183 ; AVX1-LABEL: test_mm_cmpgt_ps:
    184 ; AVX1:       # %bb.0:
    185 ; AVX1-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x01]
    186 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    187 ;
    188 ; AVX512-LABEL: test_mm_cmpgt_ps:
    189 ; AVX512:       # %bb.0:
    190 ; AVX512-NEXT:    vcmpltps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x01]
    191 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    192 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    193   %cmp = fcmp olt <4 x float> %a1, %a0
    194   %sext = sext <4 x i1> %cmp to <4 x i32>
    195   %res = bitcast <4 x i32> %sext to <4 x float>
    196   ret <4 x float> %res
    197 }
    198 
    199 define <4 x float> @test_mm_cmpgt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar GT built from LT (imm 1) with operands swapped; the shuffle merges
        ; the compare result back into element 0 of %a0.
    200 ; SSE-LABEL: test_mm_cmpgt_ss:
    201 ; SSE:       # %bb.0:
    202 ; SSE-NEXT:    cmpltss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x01]
    203 ; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
    204 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
    205 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    206 ;
    207 ; AVX-LABEL: test_mm_cmpgt_ss:
    208 ; AVX:       # %bb.0:
    209 ; AVX-NEXT:    vcmpltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x01]
    210 ; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
    211 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
    212 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    213   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 1)
    214   %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
    215   ret <4 x float> %res
    216 }
    217 
    218 define <4 x float> @test_mm_cmple_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Ordered less-than-or-equal compare, sign-extended to a full lane mask.
    219 ; SSE-LABEL: test_mm_cmple_ps:
    220 ; SSE:       # %bb.0:
    221 ; SSE-NEXT:    cmpleps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x02]
    222 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    223 ;
    224 ; AVX1-LABEL: test_mm_cmple_ps:
    225 ; AVX1:       # %bb.0:
    226 ; AVX1-NEXT:    vcmpleps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x02]
    227 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    228 ;
    229 ; AVX512-LABEL: test_mm_cmple_ps:
    230 ; AVX512:       # %bb.0:
    231 ; AVX512-NEXT:    vcmpleps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
    232 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    233 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    234   %cmp = fcmp ole <4 x float> %a0, %a1
    235   %sext = sext <4 x i1> %cmp to <4 x i32>
    236   %res = bitcast <4 x i32> %sext to <4 x float>
    237   ret <4 x float> %res
    238 }
    239 
    240 define <4 x float> @test_mm_cmple_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar compare intrinsic with predicate immediate 2 (LE).
    241 ; SSE-LABEL: test_mm_cmple_ss:
    242 ; SSE:       # %bb.0:
    243 ; SSE-NEXT:    cmpless %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x02]
    244 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    245 ;
    246 ; AVX-LABEL: test_mm_cmple_ss:
    247 ; AVX:       # %bb.0:
    248 ; AVX-NEXT:    vcmpless %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x02]
    249 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    250   %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 2)
    251   ret <4 x float> %res
    252 }
    253 
    254 define <4 x float> @test_mm_cmplt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Ordered less-than compare, sign-extended to a full lane mask.
    255 ; SSE-LABEL: test_mm_cmplt_ps:
    256 ; SSE:       # %bb.0:
    257 ; SSE-NEXT:    cmpltps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x01]
    258 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    259 ;
    260 ; AVX1-LABEL: test_mm_cmplt_ps:
    261 ; AVX1:       # %bb.0:
    262 ; AVX1-NEXT:    vcmpltps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x01]
    263 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    264 ;
    265 ; AVX512-LABEL: test_mm_cmplt_ps:
    266 ; AVX512:       # %bb.0:
    267 ; AVX512-NEXT:    vcmpltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x01]
    268 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    269 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    270   %cmp = fcmp olt <4 x float> %a0, %a1
    271   %sext = sext <4 x i1> %cmp to <4 x i32>
    272   %res = bitcast <4 x i32> %sext to <4 x float>
    273   ret <4 x float> %res
    274 }
    275 
    276 define <4 x float> @test_mm_cmplt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar compare intrinsic with predicate immediate 1 (LT).
    277 ; SSE-LABEL: test_mm_cmplt_ss:
    278 ; SSE:       # %bb.0:
    279 ; SSE-NEXT:    cmpltss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x01]
    280 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    281 ;
    282 ; AVX-LABEL: test_mm_cmplt_ss:
    283 ; AVX:       # %bb.0:
    284 ; AVX-NEXT:    vcmpltss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x01]
    285 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    286   %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 1)
    287   ret <4 x float> %res
    288 }
    289 
    290 define <4 x float> @test_mm_cmpneq_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Not-equal uses the unordered predicate (fcmp une): NaN lanes compare true.
    291 ; SSE-LABEL: test_mm_cmpneq_ps:
    292 ; SSE:       # %bb.0:
    293 ; SSE-NEXT:    cmpneqps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x04]
    294 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    295 ;
    296 ; AVX1-LABEL: test_mm_cmpneq_ps:
    297 ; AVX1:       # %bb.0:
    298 ; AVX1-NEXT:    vcmpneqps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x04]
    299 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    300 ;
    301 ; AVX512-LABEL: test_mm_cmpneq_ps:
    302 ; AVX512:       # %bb.0:
    303 ; AVX512-NEXT:    vcmpneqps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x04]
    304 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    305 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    306   %cmp = fcmp une <4 x float> %a0, %a1
    307   %sext = sext <4 x i1> %cmp to <4 x i32>
    308   %res = bitcast <4 x i32> %sext to <4 x float>
    309   ret <4 x float> %res
    310 }
    311 
    312 define <4 x float> @test_mm_cmpneq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar compare intrinsic with predicate immediate 4 (NEQ, unordered).
    313 ; SSE-LABEL: test_mm_cmpneq_ss:
    314 ; SSE:       # %bb.0:
    315 ; SSE-NEXT:    cmpneqss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x04]
    316 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    317 ;
    318 ; AVX-LABEL: test_mm_cmpneq_ss:
    319 ; AVX:       # %bb.0:
    320 ; AVX-NEXT:    vcmpneqss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x04]
    321 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    322   %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 4)
    323   ret <4 x float> %res
    324 }
    325 
    326 define <4 x float> @test_mm_cmpnge_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; not(a0 >= a1) is written as a1 ugt a0 — unordered NLE with operands swapped,
        ; so NaN lanes compare true.
    327 ; SSE-LABEL: test_mm_cmpnge_ps:
    328 ; SSE:       # %bb.0:
    329 ; SSE-NEXT:    cmpnleps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x06]
    330 ; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
    331 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    332 ;
    333 ; AVX1-LABEL: test_mm_cmpnge_ps:
    334 ; AVX1:       # %bb.0:
    335 ; AVX1-NEXT:    vcmpnleps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x06]
    336 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    337 ;
    338 ; AVX512-LABEL: test_mm_cmpnge_ps:
    339 ; AVX512:       # %bb.0:
    340 ; AVX512-NEXT:    vcmpnleps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x06]
    341 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    342 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    343   %cmp = fcmp ugt <4 x float> %a1, %a0
    344   %sext = sext <4 x i1> %cmp to <4 x i32>
    345   %res = bitcast <4 x i32> %sext to <4 x float>
    346   ret <4 x float> %res
    347 }
    348 
    349 define <4 x float> @test_mm_cmpnge_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar NGE via NLE (imm 6) with operands swapped; the shuffle merges the
        ; compare result into element 0 of %a0.
    350 ; SSE-LABEL: test_mm_cmpnge_ss:
    351 ; SSE:       # %bb.0:
    352 ; SSE-NEXT:    cmpnless %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x06]
    353 ; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
    354 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
    355 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    356 ;
    357 ; AVX-LABEL: test_mm_cmpnge_ss:
    358 ; AVX:       # %bb.0:
    359 ; AVX-NEXT:    vcmpnless %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x06]
    360 ; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
    361 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
    362 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    363   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 6)
    364   %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
    365   ret <4 x float> %res
    366 }
    367 
    368 define <4 x float> @test_mm_cmpngt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; not(a0 > a1) is written as a1 uge a0 — unordered NLT with operands swapped.
    369 ; SSE-LABEL: test_mm_cmpngt_ps:
    370 ; SSE:       # %bb.0:
    371 ; SSE-NEXT:    cmpnltps %xmm0, %xmm1 # encoding: [0x0f,0xc2,0xc8,0x05]
    372 ; SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
    373 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    374 ;
    375 ; AVX1-LABEL: test_mm_cmpngt_ps:
    376 ; AVX1:       # %bb.0:
    377 ; AVX1-NEXT:    vcmpnltps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0xc2,0xc0,0x05]
    378 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    379 ;
    380 ; AVX512-LABEL: test_mm_cmpngt_ps:
    381 ; AVX512:       # %bb.0:
    382 ; AVX512-NEXT:    vcmpnltps %xmm0, %xmm1, %k0 # encoding: [0x62,0xf1,0x74,0x08,0xc2,0xc0,0x05]
    383 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    384 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    385   %cmp = fcmp uge <4 x float> %a1, %a0
    386   %sext = sext <4 x i1> %cmp to <4 x i32>
    387   %res = bitcast <4 x i32> %sext to <4 x float>
    388   ret <4 x float> %res
    389 }
    390 
    391 define <4 x float> @test_mm_cmpngt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar NGT via NLT (imm 5) with operands swapped; the shuffle merges the
        ; compare result into element 0 of %a0.
    392 ; SSE-LABEL: test_mm_cmpngt_ss:
    393 ; SSE:       # %bb.0:
    394 ; SSE-NEXT:    cmpnltss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0xc2,0xc8,0x05]
    395 ; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
    396 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
    397 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    398 ;
    399 ; AVX-LABEL: test_mm_cmpngt_ss:
    400 ; AVX:       # %bb.0:
    401 ; AVX-NEXT:    vcmpnltss %xmm0, %xmm1, %xmm1 # encoding: [0xc5,0xf2,0xc2,0xc8,0x05]
    402 ; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
    403 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
    404 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    405   %cmp = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a1, <4 x float> %a0, i8 5)
    406   %res = shufflevector <4 x float> %a0, <4 x float> %cmp, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
    407   ret <4 x float> %res
    408 }
    409 
    410 define <4 x float> @test_mm_cmpnle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; NLE is the complement of ordered LE, i.e. unordered GT (fcmp ugt).
    411 ; SSE-LABEL: test_mm_cmpnle_ps:
    412 ; SSE:       # %bb.0:
    413 ; SSE-NEXT:    cmpnleps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x06]
    414 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    415 ;
    416 ; AVX1-LABEL: test_mm_cmpnle_ps:
    417 ; AVX1:       # %bb.0:
    418 ; AVX1-NEXT:    vcmpnleps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x06]
    419 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    420 ;
    421 ; AVX512-LABEL: test_mm_cmpnle_ps:
    422 ; AVX512:       # %bb.0:
    423 ; AVX512-NEXT:    vcmpnleps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x06]
    424 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    425 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    426   %cmp = fcmp ugt <4 x float> %a0, %a1
    427   %sext = sext <4 x i1> %cmp to <4 x i32>
    428   %res = bitcast <4 x i32> %sext to <4 x float>
    429   ret <4 x float> %res
    430 }
    431 
    432 define <4 x float> @test_mm_cmpnle_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar compare intrinsic with predicate immediate 6 (NLE, unordered).
    433 ; SSE-LABEL: test_mm_cmpnle_ss:
    434 ; SSE:       # %bb.0:
    435 ; SSE-NEXT:    cmpnless %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x06]
    436 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    437 ;
    438 ; AVX-LABEL: test_mm_cmpnle_ss:
    439 ; AVX:       # %bb.0:
    440 ; AVX-NEXT:    vcmpnless %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x06]
    441 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    442   %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 6)
    443   ret <4 x float> %res
    444 }
    445 
    446 define <4 x float> @test_mm_cmpnlt_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; NLT is the complement of ordered LT, i.e. unordered GE (fcmp uge).
    447 ; SSE-LABEL: test_mm_cmpnlt_ps:
    448 ; SSE:       # %bb.0:
    449 ; SSE-NEXT:    cmpnltps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x05]
    450 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    451 ;
    452 ; AVX1-LABEL: test_mm_cmpnlt_ps:
    453 ; AVX1:       # %bb.0:
    454 ; AVX1-NEXT:    vcmpnltps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x05]
    455 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    456 ;
    457 ; AVX512-LABEL: test_mm_cmpnlt_ps:
    458 ; AVX512:       # %bb.0:
    459 ; AVX512-NEXT:    vcmpnltps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x05]
    460 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    461 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    462   %cmp = fcmp uge <4 x float> %a0, %a1
    463   %sext = sext <4 x i1> %cmp to <4 x i32>
    464   %res = bitcast <4 x i32> %sext to <4 x float>
    465   ret <4 x float> %res
    466 }
    467 
    468 define <4 x float> @test_mm_cmpnlt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar compare intrinsic with predicate immediate 5 (NLT, unordered).
    469 ; SSE-LABEL: test_mm_cmpnlt_ss:
    470 ; SSE:       # %bb.0:
    471 ; SSE-NEXT:    cmpnltss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x05]
    472 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    473 ;
    474 ; AVX-LABEL: test_mm_cmpnlt_ss:
    475 ; AVX:       # %bb.0:
    476 ; AVX-NEXT:    vcmpnltss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x05]
    477 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    478   %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 5)
    479   ret <4 x float> %res
    480 }
    481 
    482 define <4 x float> @test_mm_cmpord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Lane is true when neither input is NaN (fcmp ord, predicate immediate 7).
    483 ; SSE-LABEL: test_mm_cmpord_ps:
    484 ; SSE:       # %bb.0:
    485 ; SSE-NEXT:    cmpordps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x07]
    486 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    487 ;
    488 ; AVX1-LABEL: test_mm_cmpord_ps:
    489 ; AVX1:       # %bb.0:
    490 ; AVX1-NEXT:    vcmpordps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x07]
    491 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    492 ;
    493 ; AVX512-LABEL: test_mm_cmpord_ps:
    494 ; AVX512:       # %bb.0:
    495 ; AVX512-NEXT:    vcmpordps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x07]
    496 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    497 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    498   %cmp = fcmp ord <4 x float> %a0, %a1
    499   %sext = sext <4 x i1> %cmp to <4 x i32>
    500   %res = bitcast <4 x i32> %sext to <4 x float>
    501   ret <4 x float> %res
    502 }
    503 
    504 define <4 x float> @test_mm_cmpord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar compare intrinsic with predicate immediate 7 (ORD).
    505 ; SSE-LABEL: test_mm_cmpord_ss:
    506 ; SSE:       # %bb.0:
    507 ; SSE-NEXT:    cmpordss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x07]
    508 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    509 ;
    510 ; AVX-LABEL: test_mm_cmpord_ss:
    511 ; AVX:       # %bb.0:
    512 ; AVX-NEXT:    vcmpordss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x07]
    513 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    514   %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7)
    515   ret <4 x float> %res
    516 }
    517 
    518 define <4 x float> @test_mm_cmpunord_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Lane is true when either input is NaN (fcmp uno, predicate immediate 3).
    519 ; SSE-LABEL: test_mm_cmpunord_ps:
    520 ; SSE:       # %bb.0:
    521 ; SSE-NEXT:    cmpunordps %xmm1, %xmm0 # encoding: [0x0f,0xc2,0xc1,0x03]
    522 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    523 ;
    524 ; AVX1-LABEL: test_mm_cmpunord_ps:
    525 ; AVX1:       # %bb.0:
    526 ; AVX1-NEXT:    vcmpunordps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc2,0xc1,0x03]
    527 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    528 ;
    529 ; AVX512-LABEL: test_mm_cmpunord_ps:
    530 ; AVX512:       # %bb.0:
    531 ; AVX512-NEXT:    vcmpunordps %xmm1, %xmm0, %k0 # encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x03]
    532 ; AVX512-NEXT:    vpmovm2d %k0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
    533 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    534   %cmp = fcmp uno <4 x float> %a0, %a1
    535   %sext = sext <4 x i1> %cmp to <4 x i32>
    536   %res = bitcast <4 x i32> %sext to <4 x float>
    537   ret <4 x float> %res
    538 }
    539 
    540 define <4 x float> @test_mm_cmpunord_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Scalar compare intrinsic with predicate immediate 3 (UNORD).
    541 ; SSE-LABEL: test_mm_cmpunord_ss:
    542 ; SSE:       # %bb.0:
    543 ; SSE-NEXT:    cmpunordss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0xc2,0xc1,0x03]
    544 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    545 ;
    546 ; AVX-LABEL: test_mm_cmpunord_ss:
    547 ; AVX:       # %bb.0:
    548 ; AVX-NEXT:    vcmpunordss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0xc2,0xc1,0x03]
    549 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    550   %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 3)
    551   ret <4 x float> %res
    552 }
    553 
    554 define i32 @test_mm_comieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; Ordered scalar equality via comiss. Equality needs both ZF=1 and PF=0
        ; (PF flags an unordered result), hence the sete + setnp + andb sequence.
    555 ; SSE-LABEL: test_mm_comieq_ss:
    556 ; SSE:       # %bb.0:
    557 ; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
    558 ; SSE-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
    559 ; SSE-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
    560 ; SSE-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
    561 ; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
    562 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    563 ;
    564 ; AVX1-LABEL: test_mm_comieq_ss:
    565 ; AVX1:       # %bb.0:
    566 ; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
    567 ; AVX1-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
    568 ; AVX1-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
    569 ; AVX1-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
    570 ; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
    571 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    572 ;
    573 ; AVX512-LABEL: test_mm_comieq_ss:
    574 ; AVX512:       # %bb.0:
    575 ; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
    576 ; AVX512-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
    577 ; AVX512-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
    578 ; AVX512-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
    579 ; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
    580 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    581   %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
    582   ret i32 %res
    583 }
    584 declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
    585 
    586 define i32 @test_mm_comige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; GE via comiss; the result is materialized with setae (carry flag clear).
        ; The xor pre-zeroes eax without disturbing the flags set afterwards.
    587 ; SSE-LABEL: test_mm_comige_ss:
    588 ; SSE:       # %bb.0:
    589 ; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    590 ; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
    591 ; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
    592 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    593 ;
    594 ; AVX1-LABEL: test_mm_comige_ss:
    595 ; AVX1:       # %bb.0:
    596 ; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    597 ; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
    598 ; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
    599 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    600 ;
    601 ; AVX512-LABEL: test_mm_comige_ss:
    602 ; AVX512:       # %bb.0:
    603 ; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    604 ; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
    605 ; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
    606 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    607   %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1)
    608   ret i32 %res
    609 }
    610 declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
    611 
    612 define i32 @test_mm_comigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
        ; GT via comiss; the result is materialized with seta (above, i.e. CF=0 and ZF=0).
    613 ; SSE-LABEL: test_mm_comigt_ss:
    614 ; SSE:       # %bb.0:
    615 ; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    616 ; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
    617 ; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
    618 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    619 ;
    620 ; AVX1-LABEL: test_mm_comigt_ss:
    621 ; AVX1:       # %bb.0:
    622 ; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    623 ; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
    624 ; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
    625 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    626 ;
    627 ; AVX512-LABEL: test_mm_comigt_ss:
    628 ; AVX512:       # %bb.0:
    629 ; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    630 ; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
    631 ; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
    632 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    633   %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1)
    634   ret i32 %res
    635 }
    636 declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
    637 
    638 define i32 @test_mm_comile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; _mm_comile_ss: a0 <= a1 is expected with the comparison operands swapped
; ((v)comiss %xmm0, %xmm1) and setae, i.e. tested as a1 >= a0.
    639 ; SSE-LABEL: test_mm_comile_ss:
    640 ; SSE:       # %bb.0:
    641 ; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    642 ; SSE-NEXT:    comiss %xmm0, %xmm1 # encoding: [0x0f,0x2f,0xc8]
    643 ; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
    644 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    645 ;
    646 ; AVX1-LABEL: test_mm_comile_ss:
    647 ; AVX1:       # %bb.0:
    648 ; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    649 ; AVX1-NEXT:    vcomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2f,0xc8]
    650 ; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
    651 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    652 ;
    653 ; AVX512-LABEL: test_mm_comile_ss:
    654 ; AVX512:       # %bb.0:
    655 ; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    656 ; AVX512-NEXT:    vcomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
    657 ; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
    658 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    659   %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1)
    660   ret i32 %res
    661 }
    662 declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
    663 
    664 define i32 @test_mm_comilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; _mm_comilt_ss: a0 < a1 is expected with the comparison operands swapped
; ((v)comiss %xmm0, %xmm1) and seta, i.e. tested as a1 > a0.
    665 ; SSE-LABEL: test_mm_comilt_ss:
    666 ; SSE:       # %bb.0:
    667 ; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    668 ; SSE-NEXT:    comiss %xmm0, %xmm1 # encoding: [0x0f,0x2f,0xc8]
    669 ; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
    670 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    671 ;
    672 ; AVX1-LABEL: test_mm_comilt_ss:
    673 ; AVX1:       # %bb.0:
    674 ; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    675 ; AVX1-NEXT:    vcomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2f,0xc8]
    676 ; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
    677 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    678 ;
    679 ; AVX512-LABEL: test_mm_comilt_ss:
    680 ; AVX512:       # %bb.0:
    681 ; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
    682 ; AVX512-NEXT:    vcomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
    683 ; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
    684 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    685   %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1)
    686   ret i32 %res
    687 }
    688 declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
    689 
    690 define i32 @test_mm_comineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; _mm_comineq_ss: NE must also hold for unordered inputs, so the expected
; code ORs setp (PF set = unordered) with setne and zero-extends the byte
; result with movzbl.
    691 ; SSE-LABEL: test_mm_comineq_ss:
    692 ; SSE:       # %bb.0:
    693 ; SSE-NEXT:    comiss %xmm1, %xmm0 # encoding: [0x0f,0x2f,0xc1]
    694 ; SSE-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
    695 ; SSE-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
    696 ; SSE-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
    697 ; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
    698 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    699 ;
    700 ; AVX1-LABEL: test_mm_comineq_ss:
    701 ; AVX1:       # %bb.0:
    702 ; AVX1-NEXT:    vcomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2f,0xc1]
    703 ; AVX1-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
    704 ; AVX1-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
    705 ; AVX1-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
    706 ; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
    707 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    708 ;
    709 ; AVX512-LABEL: test_mm_comineq_ss:
    710 ; AVX512:       # %bb.0:
    711 ; AVX512-NEXT:    vcomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
    712 ; AVX512-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
    713 ; AVX512-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
    714 ; AVX512-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
    715 ; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
    716 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    717   %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1)
    718   ret i32 %res
    719 }
    720 declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
    721 
    722 define i32 @test_mm_cvt_ss2si(<4 x float> %a0) nounwind {
; _mm_cvt_ss2si: the rounding conversion intrinsic lowers to a single
; (v)cvtss2si %xmm0 -> %eax on every configuration.
    723 ; SSE-LABEL: test_mm_cvt_ss2si:
    724 ; SSE:       # %bb.0:
    725 ; SSE-NEXT:    cvtss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2d,0xc0]
    726 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    727 ;
    728 ; AVX1-LABEL: test_mm_cvt_ss2si:
    729 ; AVX1:       # %bb.0:
    730 ; AVX1-NEXT:    vcvtss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2d,0xc0]
    731 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    732 ;
    733 ; AVX512-LABEL: test_mm_cvt_ss2si:
    734 ; AVX512:       # %bb.0:
    735 ; AVX512-NEXT:    vcvtss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
    736 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    737   %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
    738   ret i32 %res
    739 }
    740 declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
    741 
    742 define <4 x float> @test_mm_cvtsi32_ss(<4 x float> %a0, i32 %a1) nounwind {
; _mm_cvtsi32_ss: i386 converts straight from the stack argument slot with a
; memory-operand cvtsi2ssl; x86-64 converts from the %edi argument register.
    743 ; X86-SSE-LABEL: test_mm_cvtsi32_ss:
    744 ; X86-SSE:       # %bb.0:
    745 ; X86-SSE-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x2a,0x44,0x24,0x04]
    746 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    747 ;
    748 ; X86-AVX1-LABEL: test_mm_cvtsi32_ss:
    749 ; X86-AVX1:       # %bb.0:
    750 ; X86-AVX1-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04]
    751 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    752 ;
    753 ; X86-AVX512-LABEL: test_mm_cvtsi32_ss:
    754 ; X86-AVX512:       # %bb.0:
    755 ; X86-AVX512-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0x44,0x24,0x04]
    756 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    757 ;
    758 ; X64-SSE-LABEL: test_mm_cvtsi32_ss:
    759 ; X64-SSE:       # %bb.0:
    760 ; X64-SSE-NEXT:    cvtsi2ssl %edi, %xmm0 # encoding: [0xf3,0x0f,0x2a,0xc7]
    761 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    762 ;
    763 ; X64-AVX1-LABEL: test_mm_cvtsi32_ss:
    764 ; X64-AVX1:       # %bb.0:
    765 ; X64-AVX1-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x2a,0xc7]
    766 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    767 ;
    768 ; X64-AVX512-LABEL: test_mm_cvtsi32_ss:
    769 ; X64-AVX512:       # %bb.0:
    770 ; X64-AVX512-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0xc7]
    771 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
    772   %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 %a1)
    773   ret <4 x float> %res
    774 }
    775 declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
    776 
    777 define float @test_mm_cvtss_f32(<4 x float> %a0) nounwind {
; _mm_cvtss_f32 (extract lane 0): on i386 the float return travels via the
; x87 stack, hence the store-to-stack + flds sequence; on x86-64 the value is
; already in %xmm0, so the function is a bare ret.
    778 ; X86-SSE-LABEL: test_mm_cvtss_f32:
    779 ; X86-SSE:       # %bb.0:
    780 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
    781 ; X86-SSE-NEXT:    movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24]
    782 ; X86-SSE-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
    783 ; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
    784 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    785 ;
    786 ; X86-AVX1-LABEL: test_mm_cvtss_f32:
    787 ; X86-AVX1:       # %bb.0:
    788 ; X86-AVX1-NEXT:    pushl %eax # encoding: [0x50]
    789 ; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24]
    790 ; X86-AVX1-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
    791 ; X86-AVX1-NEXT:    popl %eax # encoding: [0x58]
    792 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    793 ;
    794 ; X86-AVX512-LABEL: test_mm_cvtss_f32:
    795 ; X86-AVX512:       # %bb.0:
    796 ; X86-AVX512-NEXT:    pushl %eax # encoding: [0x50]
    797 ; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
    798 ; X86-AVX512-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
    799 ; X86-AVX512-NEXT:    popl %eax # encoding: [0x58]
    800 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    801 ;
    802 ; X64-LABEL: test_mm_cvtss_f32:
    803 ; X64:       # %bb.0:
    804 ; X64-NEXT:    retq # encoding: [0xc3]
    805   %res = extractelement <4 x float> %a0, i32 0
    806   ret float %res
    807 }
    808 
    809 define i32 @test_mm_cvtss_si32(<4 x float> %a0) nounwind {
; _mm_cvtss_si32: same intrinsic and lowering as test_mm_cvt_ss2si above;
; a single (v)cvtss2si %xmm0 -> %eax.
    810 ; SSE-LABEL: test_mm_cvtss_si32:
    811 ; SSE:       # %bb.0:
    812 ; SSE-NEXT:    cvtss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2d,0xc0]
    813 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    814 ;
    815 ; AVX1-LABEL: test_mm_cvtss_si32:
    816 ; AVX1:       # %bb.0:
    817 ; AVX1-NEXT:    vcvtss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2d,0xc0]
    818 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    819 ;
    820 ; AVX512-LABEL: test_mm_cvtss_si32:
    821 ; AVX512:       # %bb.0:
    822 ; AVX512-NEXT:    vcvtss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
    823 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    824   %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0)
    825   ret i32 %res
    826 }
    827 
    828 define i32 @test_mm_cvttss_si(<4 x float> %a0) nounwind {
; _mm_cvttss_si: the truncating conversion intrinsic lowers to a single
; (v)cvttss2si %xmm0 -> %eax.
    829 ; SSE-LABEL: test_mm_cvttss_si:
    830 ; SSE:       # %bb.0:
    831 ; SSE-NEXT:    cvttss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2c,0xc0]
    832 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    833 ;
    834 ; AVX1-LABEL: test_mm_cvttss_si:
    835 ; AVX1:       # %bb.0:
    836 ; AVX1-NEXT:    vcvttss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2c,0xc0]
    837 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    838 ;
    839 ; AVX512-LABEL: test_mm_cvttss_si:
    840 ; AVX512:       # %bb.0:
    841 ; AVX512-NEXT:    vcvttss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
    842 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    843   %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
    844   ret i32 %res
    845 }
    846 declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
    847 
    848 define i32 @test_mm_cvttss_si32(<4 x float> %a0) nounwind {
; _mm_cvttss_si32: same intrinsic and lowering as test_mm_cvttss_si above.
    849 ; SSE-LABEL: test_mm_cvttss_si32:
    850 ; SSE:       # %bb.0:
    851 ; SSE-NEXT:    cvttss2si %xmm0, %eax # encoding: [0xf3,0x0f,0x2c,0xc0]
    852 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    853 ;
    854 ; AVX1-LABEL: test_mm_cvttss_si32:
    855 ; AVX1:       # %bb.0:
    856 ; AVX1-NEXT:    vcvttss2si %xmm0, %eax # encoding: [0xc5,0xfa,0x2c,0xc0]
    857 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    858 ;
    859 ; AVX512-LABEL: test_mm_cvttss_si32:
    860 ; AVX512:       # %bb.0:
    861 ; AVX512-NEXT:    vcvttss2si %xmm0, %eax # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
    862 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    863   %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0)
    864   ret i32 %res
    865 }
    866 
    867 define <4 x float> @test_mm_div_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
; _mm_div_ps: a plain vector fdiv selects a single divps / vdivps.
    868 ; SSE-LABEL: test_mm_div_ps:
    869 ; SSE:       # %bb.0:
    870 ; SSE-NEXT:    divps %xmm1, %xmm0 # encoding: [0x0f,0x5e,0xc1]
    871 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    872 ;
    873 ; AVX1-LABEL: test_mm_div_ps:
    874 ; AVX1:       # %bb.0:
    875 ; AVX1-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5e,0xc1]
    876 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    877 ;
    878 ; AVX512-LABEL: test_mm_div_ps:
    879 ; AVX512:       # %bb.0:
    880 ; AVX512-NEXT:    vdivps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5e,0xc1]
    881 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    882   %res = fdiv <4 x float> %a0, %a1
    883   ret <4 x float> %res
    884 }
    885 
    886 define <4 x float> @test_mm_div_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
; _mm_div_ss: lane-0 division written as extract/extract/fdiv/insert must be
; recognized and selected as a single (v)divss; upper lanes come from %a0.
    887 ; SSE-LABEL: test_mm_div_ss:
    888 ; SSE:       # %bb.0:
    889 ; SSE-NEXT:    divss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5e,0xc1]
    890 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    891 ;
    892 ; AVX1-LABEL: test_mm_div_ss:
    893 ; AVX1:       # %bb.0:
    894 ; AVX1-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5e,0xc1]
    895 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    896 ;
    897 ; AVX512-LABEL: test_mm_div_ss:
    898 ; AVX512:       # %bb.0:
    899 ; AVX512-NEXT:    vdivss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5e,0xc1]
    900 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    901   %ext0 = extractelement <4 x float> %a0, i32 0
    902   %ext1 = extractelement <4 x float> %a1, i32 0
    903   %fdiv = fdiv float %ext0, %ext1
    904   %res = insertelement <4 x float> %a0, float %fdiv, i32 0
    905   ret <4 x float> %res
    906 }
    907 
    908 define i32 @test_MM_GET_EXCEPTION_MASK() nounwind {
; _MM_GET_EXCEPTION_MASK: (v)stmxcsr into a 4-byte stack slot, reload, and
; mask with 0x1F80 (the MXCSR exception-mask bits).
    909 ; X86-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
    910 ; X86-SSE:       # %bb.0:
    911 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
    912 ; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
    913 ; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
    914 ; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
    915 ; X86-SSE-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
    916 ; X86-SSE-NEXT:    # imm = 0x1F80
    917 ; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
    918 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    919 ;
    920 ; X86-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
    921 ; X86-AVX:       # %bb.0:
    922 ; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
    923 ; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
    924 ; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
    925 ; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
    926 ; X86-AVX-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
    927 ; X86-AVX-NEXT:    # imm = 0x1F80
    928 ; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
    929 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
    930 ;
    931 ; X64-SSE-LABEL: test_MM_GET_EXCEPTION_MASK:
    932 ; X64-SSE:       # %bb.0:
    933 ; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
    934 ; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
    935 ; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
    936 ; X64-SSE-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
    937 ; X64-SSE-NEXT:    # imm = 0x1F80
    938 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    939 ;
    940 ; X64-AVX-LABEL: test_MM_GET_EXCEPTION_MASK:
    941 ; X64-AVX:       # %bb.0:
    942 ; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
    943 ; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
    944 ; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
    945 ; X64-AVX-NEXT:    andl $8064, %eax # encoding: [0x25,0x80,0x1f,0x00,0x00]
    946 ; X64-AVX-NEXT:    # imm = 0x1F80
    947 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
    948   %1 = alloca i32, align 4
    949   %2 = bitcast i32* %1 to i8*
    950   call void @llvm.x86.sse.stmxcsr(i8* %2)
    951   %3 = load i32, i32* %1, align 4
    952   %4 = and i32 %3, 8064
    953   ret i32 %4
    954 }
    955 declare void @llvm.x86.sse.stmxcsr(i8*) nounwind readnone
    956 
    957 define i32 @test_MM_GET_EXCEPTION_STATE() nounwind {
; _MM_GET_EXCEPTION_STATE: (v)stmxcsr to a stack slot, reload, and mask with
; 63 (0x3F, the MXCSR exception-status flags).
    958 ; X86-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
    959 ; X86-SSE:       # %bb.0:
    960 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
    961 ; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
    962 ; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
    963 ; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
    964 ; X86-SSE-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
    965 ; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
    966 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    967 ;
    968 ; X86-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
    969 ; X86-AVX:       # %bb.0:
    970 ; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
    971 ; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
    972 ; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
    973 ; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
    974 ; X86-AVX-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
    975 ; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
    976 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
    977 ;
    978 ; X64-SSE-LABEL: test_MM_GET_EXCEPTION_STATE:
    979 ; X64-SSE:       # %bb.0:
    980 ; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
    981 ; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
    982 ; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
    983 ; X64-SSE-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
    984 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    985 ;
    986 ; X64-AVX-LABEL: test_MM_GET_EXCEPTION_STATE:
    987 ; X64-AVX:       # %bb.0:
    988 ; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
    989 ; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
    990 ; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
    991 ; X64-AVX-NEXT:    andl $63, %eax # encoding: [0x83,0xe0,0x3f]
    992 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
    993   %1 = alloca i32, align 4
    994   %2 = bitcast i32* %1 to i8*
    995   call void @llvm.x86.sse.stmxcsr(i8* %2)
    996   %3 = load i32, i32* %1, align 4
    997   %4 = and i32 %3, 63
    998   ret i32 %4
    999 }
   1000 
   1001 define i32 @test_MM_GET_FLUSH_ZERO_MODE() nounwind {
; _MM_GET_FLUSH_ZERO_MODE: (v)stmxcsr to a stack slot, reload, and mask with
; 0x8000 (the MXCSR flush-to-zero bit).
   1002 ; X86-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
   1003 ; X86-SSE:       # %bb.0:
   1004 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
   1005 ; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
   1006 ; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
   1007 ; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
   1008 ; X86-SSE-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
   1009 ; X86-SSE-NEXT:    # imm = 0x8000
   1010 ; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
   1011 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1012 ;
   1013 ; X86-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
   1014 ; X86-AVX:       # %bb.0:
   1015 ; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
   1016 ; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
   1017 ; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
   1018 ; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
   1019 ; X86-AVX-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
   1020 ; X86-AVX-NEXT:    # imm = 0x8000
   1021 ; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
   1022 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
   1023 ;
   1024 ; X64-SSE-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
   1025 ; X64-SSE:       # %bb.0:
   1026 ; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
   1027 ; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
   1028 ; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
   1029 ; X64-SSE-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
   1030 ; X64-SSE-NEXT:    # imm = 0x8000
   1031 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1032 ;
   1033 ; X64-AVX-LABEL: test_MM_GET_FLUSH_ZERO_MODE:
   1034 ; X64-AVX:       # %bb.0:
   1035 ; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
   1036 ; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
   1037 ; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
   1038 ; X64-AVX-NEXT:    andl $32768, %eax # encoding: [0x25,0x00,0x80,0x00,0x00]
   1039 ; X64-AVX-NEXT:    # imm = 0x8000
   1040 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
   1041   %1 = alloca i32, align 4
   1042   %2 = bitcast i32* %1 to i8*
   1043   call void @llvm.x86.sse.stmxcsr(i8* %2)
   1044   %3 = load i32, i32* %1, align 4
   1045   %4 = and i32 %3, 32768
   1046   ret i32 %4
   1047 }
   1048 
   1049 define i32 @test_MM_GET_ROUNDING_MODE() nounwind {
; _MM_GET_ROUNDING_MODE: (v)stmxcsr to a stack slot, reload, and mask with
; 0x6000 (the MXCSR rounding-control bits).
   1050 ; X86-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
   1051 ; X86-SSE:       # %bb.0:
   1052 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
   1053 ; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
   1054 ; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
   1055 ; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
   1056 ; X86-SSE-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
   1057 ; X86-SSE-NEXT:    # imm = 0x6000
   1058 ; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
   1059 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1060 ;
   1061 ; X86-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
   1062 ; X86-AVX:       # %bb.0:
   1063 ; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
   1064 ; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
   1065 ; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
   1066 ; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
   1067 ; X86-AVX-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
   1068 ; X86-AVX-NEXT:    # imm = 0x6000
   1069 ; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
   1070 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
   1071 ;
   1072 ; X64-SSE-LABEL: test_MM_GET_ROUNDING_MODE:
   1073 ; X64-SSE:       # %bb.0:
   1074 ; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
   1075 ; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
   1076 ; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
   1077 ; X64-SSE-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
   1078 ; X64-SSE-NEXT:    # imm = 0x6000
   1079 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1080 ;
   1081 ; X64-AVX-LABEL: test_MM_GET_ROUNDING_MODE:
   1082 ; X64-AVX:       # %bb.0:
   1083 ; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
   1084 ; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
   1085 ; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
   1086 ; X64-AVX-NEXT:    andl $24576, %eax # encoding: [0x25,0x00,0x60,0x00,0x00]
   1087 ; X64-AVX-NEXT:    # imm = 0x6000
   1088 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
   1089   %1 = alloca i32, align 4
   1090   %2 = bitcast i32* %1 to i8*
   1091   call void @llvm.x86.sse.stmxcsr(i8* %2)
   1092   %3 = load i32, i32* %1, align 4
   1093   %4 = and i32 %3, 24576
   1094   ret i32 %4
   1095 }
   1096 
   1097 define i32 @test_mm_getcsr() nounwind {
; _mm_getcsr: raw MXCSR read via (v)stmxcsr + reload, with no masking.
   1098 ; X86-SSE-LABEL: test_mm_getcsr:
   1099 ; X86-SSE:       # %bb.0:
   1100 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
   1101 ; X86-SSE-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
   1102 ; X86-SSE-NEXT:    stmxcsr (%eax) # encoding: [0x0f,0xae,0x18]
   1103 ; X86-SSE-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
   1104 ; X86-SSE-NEXT:    popl %ecx # encoding: [0x59]
   1105 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1106 ;
   1107 ; X86-AVX-LABEL: test_mm_getcsr:
   1108 ; X86-AVX:       # %bb.0:
   1109 ; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
   1110 ; X86-AVX-NEXT:    movl %esp, %eax # encoding: [0x89,0xe0]
   1111 ; X86-AVX-NEXT:    vstmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x18]
   1112 ; X86-AVX-NEXT:    movl (%esp), %eax # encoding: [0x8b,0x04,0x24]
   1113 ; X86-AVX-NEXT:    popl %ecx # encoding: [0x59]
   1114 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
   1115 ;
   1116 ; X64-SSE-LABEL: test_mm_getcsr:
   1117 ; X64-SSE:       # %bb.0:
   1118 ; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
   1119 ; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
   1120 ; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
   1121 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1122 ;
   1123 ; X64-AVX-LABEL: test_mm_getcsr:
   1124 ; X64-AVX:       # %bb.0:
   1125 ; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
   1126 ; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
   1127 ; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %eax # encoding: [0x8b,0x44,0x24,0xfc]
   1128 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
   1129   %1 = alloca i32, align 4
   1130   %2 = bitcast i32* %1 to i8*
   1131   call void @llvm.x86.sse.stmxcsr(i8* %2)
   1132   %3 = load i32, i32* %1, align 4
   1133   ret i32 %3
   1134 }
   1135 
   1136 define <4 x float> @test_mm_load_ps(float* %a0) nounwind {
; _mm_load_ps: a 16-byte-aligned vector load selects the aligned (v)movaps.
   1137 ; X86-SSE-LABEL: test_mm_load_ps:
   1138 ; X86-SSE:       # %bb.0:
   1139 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1140 ; X86-SSE-NEXT:    movaps (%eax), %xmm0 # encoding: [0x0f,0x28,0x00]
   1141 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1142 ;
   1143 ; X86-AVX1-LABEL: test_mm_load_ps:
   1144 ; X86-AVX1:       # %bb.0:
   1145 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1146 ; X86-AVX1-NEXT:    vmovaps (%eax), %xmm0 # encoding: [0xc5,0xf8,0x28,0x00]
   1147 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
   1148 ;
   1149 ; X86-AVX512-LABEL: test_mm_load_ps:
   1150 ; X86-AVX512:       # %bb.0:
   1151 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1152 ; X86-AVX512-NEXT:    vmovaps (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00]
   1153 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
   1154 ;
   1155 ; X64-SSE-LABEL: test_mm_load_ps:
   1156 ; X64-SSE:       # %bb.0:
   1157 ; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
   1158 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1159 ;
   1160 ; X64-AVX1-LABEL: test_mm_load_ps:
   1161 ; X64-AVX1:       # %bb.0:
   1162 ; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07]
   1163 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
   1164 ;
   1165 ; X64-AVX512-LABEL: test_mm_load_ps:
   1166 ; X64-AVX512:       # %bb.0:
   1167 ; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
   1168 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
   1169   %arg0 = bitcast float* %a0 to <4 x float>*
   1170   %res = load <4 x float>, <4 x float>* %arg0, align 16
   1171   ret <4 x float> %res
   1172 }
   1173 
   1174 define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
; _mm_load_ps1: scalar load splat to all four lanes. SSE expects
; movss + shufps $0; AVX targets fold the splat into a single vbroadcastss.
   1175 ; X86-SSE-LABEL: test_mm_load_ps1:
   1176 ; X86-SSE:       # %bb.0:
   1177 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1178 ; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
   1179 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
   1180 ; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
   1181 ; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
   1182 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1183 ;
   1184 ; X86-AVX1-LABEL: test_mm_load_ps1:
   1185 ; X86-AVX1:       # %bb.0:
   1186 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1187 ; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x00]
   1188 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
   1189 ;
   1190 ; X86-AVX512-LABEL: test_mm_load_ps1:
   1191 ; X86-AVX512:       # %bb.0:
   1192 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1193 ; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00]
   1194 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
   1195 ;
   1196 ; X64-SSE-LABEL: test_mm_load_ps1:
   1197 ; X64-SSE:       # %bb.0:
   1198 ; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
   1199 ; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
   1200 ; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
   1201 ; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
   1202 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1203 ;
   1204 ; X64-AVX1-LABEL: test_mm_load_ps1:
   1205 ; X64-AVX1:       # %bb.0:
   1206 ; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x07]
   1207 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
   1208 ;
   1209 ; X64-AVX512-LABEL: test_mm_load_ps1:
   1210 ; X64-AVX512:       # %bb.0:
   1211 ; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
   1212 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
   1213   %ld = load float, float* %a0, align 4
   1214   %res0 = insertelement <4 x float> undef, float %ld, i32 0
   1215   %res1 = insertelement <4 x float> %res0, float %ld, i32 1
   1216   %res2 = insertelement <4 x float> %res1, float %ld, i32 2
   1217   %res3 = insertelement <4 x float> %res2, float %ld, i32 3
   1218   ret <4 x float> %res3
   1219 }
   1220 
   1221 define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
; _mm_load_ss: an align-1 load into lane 0 with the upper lanes zeroed
; selects a single (v)movss from memory (movss tolerates unaligned pointers).
   1222 ; X86-SSE-LABEL: test_mm_load_ss:
   1223 ; X86-SSE:       # %bb.0:
   1224 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1225 ; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
   1226 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
   1227 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1228 ;
   1229 ; X86-AVX1-LABEL: test_mm_load_ss:
   1230 ; X86-AVX1:       # %bb.0:
   1231 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1232 ; X86-AVX1-NEXT:    vmovss (%eax), %xmm0 # encoding: [0xc5,0xfa,0x10,0x00]
   1233 ; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
   1234 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
   1235 ;
   1236 ; X86-AVX512-LABEL: test_mm_load_ss:
   1237 ; X86-AVX512:       # %bb.0:
   1238 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1239 ; X86-AVX512-NEXT:    vmovss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00]
   1240 ; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
   1241 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
   1242 ;
   1243 ; X64-SSE-LABEL: test_mm_load_ss:
   1244 ; X64-SSE:       # %bb.0:
   1245 ; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
   1246 ; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
   1247 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1248 ;
   1249 ; X64-AVX1-LABEL: test_mm_load_ss:
   1250 ; X64-AVX1:       # %bb.0:
   1251 ; X64-AVX1-NEXT:    vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
   1252 ; X64-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
   1253 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
   1254 ;
   1255 ; X64-AVX512-LABEL: test_mm_load_ss:
   1256 ; X64-AVX512:       # %bb.0:
   1257 ; X64-AVX512-NEXT:    vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
   1258 ; X64-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
   1259 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
   1260   %ld = load float, float* %a0, align 1
   1261   %res0 = insertelement <4 x float> undef, float %ld, i32 0
   1262   %res1 = insertelement <4 x float> %res0, float 0.0, i32 1
   1263   %res2 = insertelement <4 x float> %res1, float 0.0, i32 2
   1264   %res3 = insertelement <4 x float> %res2, float 0.0, i32 3
   1265   ret <4 x float> %res3
   1266 }
   1267 
   1268 define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
   1269 ; X86-SSE-LABEL: test_mm_load1_ps:
   1270 ; X86-SSE:       # %bb.0:
   1271 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1272 ; X86-SSE-NEXT:    movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
   1273 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
   1274 ; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
   1275 ; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
   1276 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1277 ;
   1278 ; X86-AVX1-LABEL: test_mm_load1_ps:
   1279 ; X86-AVX1:       # %bb.0:
   1280 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1281 ; X86-AVX1-NEXT:    vbroadcastss (%eax), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x00]
   1282 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
   1283 ;
   1284 ; X86-AVX512-LABEL: test_mm_load1_ps:
   1285 ; X86-AVX512:       # %bb.0:
   1286 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1287 ; X86-AVX512-NEXT:    vbroadcastss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x00]
   1288 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
   1289 ;
   1290 ; X64-SSE-LABEL: test_mm_load1_ps:
   1291 ; X64-SSE:       # %bb.0:
   1292 ; X64-SSE-NEXT:    movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
   1293 ; X64-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
   1294 ; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
   1295 ; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
   1296 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1297 ;
   1298 ; X64-AVX1-LABEL: test_mm_load1_ps:
   1299 ; X64-AVX1:       # %bb.0:
   1300 ; X64-AVX1-NEXT:    vbroadcastss (%rdi), %xmm0 # encoding: [0xc4,0xe2,0x79,0x18,0x07]
   1301 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
   1302 ;
   1303 ; X64-AVX512-LABEL: test_mm_load1_ps:
   1304 ; X64-AVX512:       # %bb.0:
   1305 ; X64-AVX512-NEXT:    vbroadcastss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x07]
   1306 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
        ; _mm_load1_ps pattern: one float loaded and inserted into all four
        ; lanes. SSE selects movss+shufps $0 (splat); AVX configs fold the
        ; load+splat into a single vbroadcastss.
   1307   %ld = load float, float* %a0, align 4
   1308   %res0 = insertelement <4 x float> undef, float %ld, i32 0
   1309   %res1 = insertelement <4 x float> %res0, float %ld, i32 1
   1310   %res2 = insertelement <4 x float> %res1, float %ld, i32 2
   1311   %res3 = insertelement <4 x float> %res2, float %ld, i32 3
   1312   ret <4 x float> %res3
   1313 }
   1314 
   1315 define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
   1316 ; X86-SSE-LABEL: test_mm_loadh_pi:
   1317 ; X86-SSE:       # %bb.0:
   1318 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1319 ; X86-SSE-NEXT:    movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08]
   1320 ; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
   1321 ; X86-SSE-NEXT:    movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
   1322 ; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
   1323 ; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
   1324 ; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   1325 ; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
   1326 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
   1327 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1328 ;
   1329 ; X86-AVX1-LABEL: test_mm_loadh_pi:
   1330 ; X86-AVX1:       # %bb.0:
   1331 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1332 ; X86-AVX1-NEXT:    vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
   1333 ; X86-AVX1-NEXT:    # xmm1 = mem[0],zero
   1334 ; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
   1335 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
   1336 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
   1337 ;
   1338 ; X86-AVX512-LABEL: test_mm_loadh_pi:
   1339 ; X86-AVX512:       # %bb.0:
   1340 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1341 ; X86-AVX512-NEXT:    vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
   1342 ; X86-AVX512-NEXT:    # xmm1 = mem[0],zero
   1343 ; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
   1344 ; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
   1345 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
   1346 ;
   1347 ; X64-SSE-LABEL: test_mm_loadh_pi:
   1348 ; X64-SSE:       # %bb.0:
   1349 ; X64-SSE-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
   1350 ; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
   1351 ; X64-SSE-NEXT:    shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
   1352 ; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
   1353 ; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
   1354 ; X64-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
   1355 ; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
   1356 ; X64-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
   1357 ; X64-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
   1358 ; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   1359 ; X64-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
   1360 ; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
   1361 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1362 ;
   1363 ; X64-AVX1-LABEL: test_mm_loadh_pi:
   1364 ; X64-AVX1:       # %bb.0:
   1365 ; X64-AVX1-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x16,0x07]
   1366 ; X64-AVX1-NEXT:    # xmm0 = xmm0[0],mem[0]
   1367 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
   1368 ;
   1369 ; X64-AVX512-LABEL: test_mm_loadh_pi:
   1370 ; X64-AVX512:       # %bb.0:
   1371 ; X64-AVX512-NEXT:    vmovhpd (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x16,0x07]
   1372 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0],mem[0]
   1373 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
        ; _mm_loadh_pi pattern: load <2 x float> via the x86_mmx* operand and
        ; place it in lanes 2-3 of %a0 (shuffle mask <0,1,4,5>). Note fast-isel
        ; on the SSE configs builds the pair with scalar movss loads +
        ; unpcklps/movlhps instead of a single movhps.
   1374   %ptr = bitcast x86_mmx* %a1 to <2 x float>*
   1375   %ld  = load <2 x float>, <2 x float>* %ptr
   1376   %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   1377   %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   1378   ret <4 x float> %res
   1379 }
   1380 
   1381 define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
   1382 ; X86-SSE-LABEL: test_mm_loadl_pi:
   1383 ; X86-SSE:       # %bb.0:
   1384 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1385 ; X86-SSE-NEXT:    movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08]
   1386 ; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
   1387 ; X86-SSE-NEXT:    movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
   1388 ; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
   1389 ; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
   1390 ; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   1391 ; X86-SSE-NEXT:    shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4]
   1392 ; X86-SSE-NEXT:    # xmm1 = xmm1[0,1],xmm0[2,3]
   1393 ; X86-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
   1394 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1395 ;
   1396 ; X86-AVX1-LABEL: test_mm_loadl_pi:
   1397 ; X86-AVX1:       # %bb.0:
   1398 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1399 ; X86-AVX1-NEXT:    vmovsd (%eax), %xmm1 # encoding: [0xc5,0xfb,0x10,0x08]
   1400 ; X86-AVX1-NEXT:    # xmm1 = mem[0],zero
   1401 ; X86-AVX1-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
   1402 ; X86-AVX1-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
   1403 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
   1404 ;
   1405 ; X86-AVX512-LABEL: test_mm_loadl_pi:
   1406 ; X86-AVX512:       # %bb.0:
   1407 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1408 ; X86-AVX512-NEXT:    vmovsd (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x08]
   1409 ; X86-AVX512-NEXT:    # xmm1 = mem[0],zero
   1410 ; X86-AVX512-NEXT:    vblendps $3, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
   1411 ; X86-AVX512-NEXT:    # xmm0 = xmm1[0,1],xmm0[2,3]
   1412 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
   1413 ;
   1414 ; X64-SSE-LABEL: test_mm_loadl_pi:
   1415 ; X64-SSE:       # %bb.0:
   1416 ; X64-SSE-NEXT:    movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
   1417 ; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
   1418 ; X64-SSE-NEXT:    shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
   1419 ; X64-SSE-NEXT:    movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
   1420 ; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
   1421 ; X64-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
   1422 ; X64-SSE-NEXT:    movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
   1423 ; X64-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
   1424 ; X64-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
   1425 ; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
   1426 ; X64-SSE-NEXT:    shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4]
   1427 ; X64-SSE-NEXT:    # xmm1 = xmm1[0,1],xmm0[2,3]
   1428 ; X64-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
   1429 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1430 ;
   1431 ; X64-AVX1-LABEL: test_mm_loadl_pi:
   1432 ; X64-AVX1:       # %bb.0:
   1433 ; X64-AVX1-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x12,0x07]
   1434 ; X64-AVX1-NEXT:    # xmm0 = mem[0],xmm0[1]
   1435 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
   1436 ;
   1437 ; X64-AVX512-LABEL: test_mm_loadl_pi:
   1438 ; X64-AVX512:       # %bb.0:
   1439 ; X64-AVX512-NEXT:    vmovlpd (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x12,0x07]
   1440 ; X64-AVX512-NEXT:    # xmm0 = mem[0],xmm0[1]
   1441 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
        ; _mm_loadl_pi pattern: same <2 x float> load as loadh_pi above, but the
        ; pair replaces lanes 0-1 of %a0 (shuffle mask <4,5,2,3>). X64-AVX
        ; configs fold it to vmovlpd; fast-isel SSE again uses scalar loads.
   1442   %ptr = bitcast x86_mmx* %a1 to <2 x float>*
   1443   %ld  = load <2 x float>, <2 x float>* %ptr
   1444   %ext = shufflevector <2 x float> %ld, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   1445   %res = shufflevector <4 x float> %a0, <4 x float> %ext, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
   1446   ret <4 x float> %res
   1447 }
   1448 
   1449 define <4 x float> @test_mm_loadr_ps(float* %a0) nounwind {
   1450 ; X86-SSE-LABEL: test_mm_loadr_ps:
   1451 ; X86-SSE:       # %bb.0:
   1452 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1453 ; X86-SSE-NEXT:    movaps (%eax), %xmm0 # encoding: [0x0f,0x28,0x00]
   1454 ; X86-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
   1455 ; X86-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
   1456 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1457 ;
   1458 ; X86-AVX1-LABEL: test_mm_loadr_ps:
   1459 ; X86-AVX1:       # %bb.0:
   1460 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1461 ; X86-AVX1-NEXT:    vpermilps $27, (%eax), %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b]
   1462 ; X86-AVX1-NEXT:    # xmm0 = mem[3,2,1,0]
   1463 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
   1464 ;
   1465 ; X86-AVX512-LABEL: test_mm_loadr_ps:
   1466 ; X86-AVX512:       # %bb.0:
   1467 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1468 ; X86-AVX512-NEXT:    vpermilps $27, (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x00,0x1b]
   1469 ; X86-AVX512-NEXT:    # xmm0 = mem[3,2,1,0]
   1470 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
   1471 ;
   1472 ; X64-SSE-LABEL: test_mm_loadr_ps:
   1473 ; X64-SSE:       # %bb.0:
   1474 ; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
   1475 ; X64-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
   1476 ; X64-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
   1477 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1478 ;
   1479 ; X64-AVX1-LABEL: test_mm_loadr_ps:
   1480 ; X64-AVX1:       # %bb.0:
   1481 ; X64-AVX1-NEXT:    vpermilps $27, (%rdi), %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b]
   1482 ; X64-AVX1-NEXT:    # xmm0 = mem[3,2,1,0]
   1483 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
   1484 ;
   1485 ; X64-AVX512-LABEL: test_mm_loadr_ps:
   1486 ; X64-AVX512:       # %bb.0:
   1487 ; X64-AVX512-NEXT:    vpermilps $27, (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0x07,0x1b]
   1488 ; X64-AVX512-NEXT:    # xmm0 = mem[3,2,1,0]
   1489 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
        ; _mm_loadr_ps pattern: aligned (16) vector load followed by an
        ; element reversal (<3,2,1,0>). SSE: movaps+shufps $27; AVX folds the
        ; load into vpermilps $27 with a memory operand.
   1490   %arg0 = bitcast float* %a0 to <4 x float>*
   1491   %ld = load <4 x float>, <4 x float>* %arg0, align 16
   1492   %res = shufflevector <4 x float> %ld, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   1493   ret <4 x float> %res
   1494 }
   1495 
   1496 define <4 x float> @test_mm_loadu_ps(float* %a0) nounwind {
   1497 ; X86-SSE-LABEL: test_mm_loadu_ps:
   1498 ; X86-SSE:       # %bb.0:
   1499 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1500 ; X86-SSE-NEXT:    movups (%eax), %xmm0 # encoding: [0x0f,0x10,0x00]
   1501 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1502 ;
   1503 ; X86-AVX1-LABEL: test_mm_loadu_ps:
   1504 ; X86-AVX1:       # %bb.0:
   1505 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1506 ; X86-AVX1-NEXT:    vmovups (%eax), %xmm0 # encoding: [0xc5,0xf8,0x10,0x00]
   1507 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
   1508 ;
   1509 ; X86-AVX512-LABEL: test_mm_loadu_ps:
   1510 ; X86-AVX512:       # %bb.0:
   1511 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1512 ; X86-AVX512-NEXT:    vmovups (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x00]
   1513 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
   1514 ;
   1515 ; X64-SSE-LABEL: test_mm_loadu_ps:
   1516 ; X64-SSE:       # %bb.0:
   1517 ; X64-SSE-NEXT:    movups (%rdi), %xmm0 # encoding: [0x0f,0x10,0x07]
   1518 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1519 ;
   1520 ; X64-AVX1-LABEL: test_mm_loadu_ps:
   1521 ; X64-AVX1:       # %bb.0:
   1522 ; X64-AVX1-NEXT:    vmovups (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x10,0x07]
   1523 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
   1524 ;
   1525 ; X64-AVX512-LABEL: test_mm_loadu_ps:
   1526 ; X64-AVX512:       # %bb.0:
   1527 ; X64-AVX512-NEXT:    vmovups (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x10,0x07]
   1528 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
        ; _mm_loadu_ps pattern: align-1 vector load must use the unaligned
        ; (v)movups form, never movaps.
   1529   %arg0 = bitcast float* %a0 to <4 x float>*
   1530   %res = load <4 x float>, <4 x float>* %arg0, align 1
   1531   ret <4 x float> %res
   1532 }
   1533 
   1534 define <4 x float> @test_mm_max_ps(<4 x float> %a0, <4 x float> %a1) {
   1535 ; SSE-LABEL: test_mm_max_ps:
   1536 ; SSE:       # %bb.0:
   1537 ; SSE-NEXT:    maxps %xmm1, %xmm0 # encoding: [0x0f,0x5f,0xc1]
   1538 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1539 ;
   1540 ; AVX1-LABEL: test_mm_max_ps:
   1541 ; AVX1:       # %bb.0:
   1542 ; AVX1-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5f,0xc1]
   1543 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1544 ;
   1545 ; AVX512-LABEL: test_mm_max_ps:
   1546 ; AVX512:       # %bb.0:
   1547 ; AVX512-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
   1548 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_max_ps: the target intrinsic maps 1:1 onto (v)maxps.
   1549   %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
   1550   ret <4 x float> %res
   1551 }
   1552 declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
   1553 
   1554 define <4 x float> @test_mm_max_ss(<4 x float> %a0, <4 x float> %a1) {
   1555 ; SSE-LABEL: test_mm_max_ss:
   1556 ; SSE:       # %bb.0:
   1557 ; SSE-NEXT:    maxss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5f,0xc1]
   1558 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1559 ;
   1560 ; AVX1-LABEL: test_mm_max_ss:
   1561 ; AVX1:       # %bb.0:
   1562 ; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5f,0xc1]
   1563 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1564 ;
   1565 ; AVX512-LABEL: test_mm_max_ss:
   1566 ; AVX512:       # %bb.0:
   1567 ; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1]
   1568 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_max_ss: scalar variant of the intrinsic above; selects (v)maxss.
   1569   %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
   1570   ret <4 x float> %res
   1571 }
   1572 declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
   1573 
   1574 define <4 x float> @test_mm_min_ps(<4 x float> %a0, <4 x float> %a1) {
   1575 ; SSE-LABEL: test_mm_min_ps:
   1576 ; SSE:       # %bb.0:
   1577 ; SSE-NEXT:    minps %xmm1, %xmm0 # encoding: [0x0f,0x5d,0xc1]
   1578 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1579 ;
   1580 ; AVX1-LABEL: test_mm_min_ps:
   1581 ; AVX1:       # %bb.0:
   1582 ; AVX1-NEXT:    vminps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5d,0xc1]
   1583 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1584 ;
   1585 ; AVX512-LABEL: test_mm_min_ps:
   1586 ; AVX512:       # %bb.0:
   1587 ; AVX512-NEXT:    vminps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
   1588 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_min_ps: the target intrinsic maps 1:1 onto (v)minps.
   1589   %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
   1590   ret <4 x float> %res
   1591 }
   1592 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
   1593 
   1594 define <4 x float> @test_mm_min_ss(<4 x float> %a0, <4 x float> %a1) {
   1595 ; SSE-LABEL: test_mm_min_ss:
   1596 ; SSE:       # %bb.0:
   1597 ; SSE-NEXT:    minss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5d,0xc1]
   1598 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1599 ;
   1600 ; AVX1-LABEL: test_mm_min_ss:
   1601 ; AVX1:       # %bb.0:
   1602 ; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5d,0xc1]
   1603 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1604 ;
   1605 ; AVX512-LABEL: test_mm_min_ss:
   1606 ; AVX512:       # %bb.0:
   1607 ; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1]
   1608 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_min_ss: scalar variant of the intrinsic above; selects (v)minss.
   1609   %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
   1610   ret <4 x float> %res
   1611 }
   1612 declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
   1613 
   1614 define <4 x float> @test_mm_move_ss(<4 x float> %a0, <4 x float> %a1) {
   1615 ; SSE-LABEL: test_mm_move_ss:
   1616 ; SSE:       # %bb.0:
   1617 ; SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
   1618 ; SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
   1619 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1620 ;
   1621 ; AVX-LABEL: test_mm_move_ss:
   1622 ; AVX:       # %bb.0:
   1623 ; AVX-NEXT:    vblendps $1, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
   1624 ; AVX-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
   1625 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_move_ss pattern: shuffle mask <4,1,2,3> takes lane 0 from %a1 and
        ; lanes 1-3 from %a0. SSE: reg-reg movss; AVX: vblendps $1.
   1626   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
   1627   ret <4 x float> %res
   1628 }
   1629 
   1630 define <4 x float> @test_mm_movehl_ps(<4 x float> %a0, <4 x float> %a1) {
   1631 ; SSE-LABEL: test_mm_movehl_ps:
   1632 ; SSE:       # %bb.0:
   1633 ; SSE-NEXT:    movhlps %xmm1, %xmm0 # encoding: [0x0f,0x12,0xc1]
   1634 ; SSE-NEXT:    # xmm0 = xmm1[1],xmm0[1]
   1635 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1636 ;
   1637 ; AVX1-LABEL: test_mm_movehl_ps:
   1638 ; AVX1:       # %bb.0:
   1639 ; AVX1-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf1,0x15,0xc0]
   1640 ; AVX1-NEXT:    # xmm0 = xmm1[1],xmm0[1]
   1641 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1642 ;
   1643 ; AVX512-LABEL: test_mm_movehl_ps:
   1644 ; AVX512:       # %bb.0:
   1645 ; AVX512-NEXT:    vunpckhpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x15,0xc0]
   1646 ; AVX512-NEXT:    # xmm0 = xmm1[1],xmm0[1]
   1647 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_movehl_ps pattern: mask <6,7,2,3> = high half of %a1 into the low
        ; half of %a0. SSE: movhlps; AVX: equivalent vunpckhpd with operands
        ; swapped.
   1648   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
   1649   ret <4 x float> %res
   1650 }
   1651 
   1652 define <4 x float> @test_mm_movelh_ps(<4 x float> %a0, <4 x float> %a1) {
   1653 ; SSE-LABEL: test_mm_movelh_ps:
   1654 ; SSE:       # %bb.0:
   1655 ; SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
   1656 ; SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
   1657 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1658 ;
   1659 ; AVX1-LABEL: test_mm_movelh_ps:
   1660 ; AVX1:       # %bb.0:
   1661 ; AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1]
   1662 ; AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0]
   1663 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1664 ;
   1665 ; AVX512-LABEL: test_mm_movelh_ps:
   1666 ; AVX512:       # %bb.0:
   1667 ; AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1]
   1668 ; AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0]
   1669 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_movelh_ps pattern: mask <0,1,4,5> = low half of %a0 followed by
        ; low half of %a1. All configs select (v)movlhps.
   1670   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   1671   ret <4 x float> %res
   1672 }
   1673 
   1674 define i32 @test_mm_movemask_ps(<4 x float> %a0) nounwind {
   1675 ; SSE-LABEL: test_mm_movemask_ps:
   1676 ; SSE:       # %bb.0:
   1677 ; SSE-NEXT:    movmskps %xmm0, %eax # encoding: [0x0f,0x50,0xc0]
   1678 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1679 ;
   1680 ; AVX-LABEL: test_mm_movemask_ps:
   1681 ; AVX:       # %bb.0:
   1682 ; AVX-NEXT:    vmovmskps %xmm0, %eax # encoding: [0xc5,0xf8,0x50,0xc0]
   1683 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_movemask_ps: the movmsk intrinsic maps 1:1 onto (v)movmskps.
   1684   %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
   1685   ret i32 %res
   1686 }
   1687 declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
   1688 
   1689 define <4 x float> @test_mm_mul_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
   1690 ; SSE-LABEL: test_mm_mul_ps:
   1691 ; SSE:       # %bb.0:
   1692 ; SSE-NEXT:    mulps %xmm1, %xmm0 # encoding: [0x0f,0x59,0xc1]
   1693 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1694 ;
   1695 ; AVX1-LABEL: test_mm_mul_ps:
   1696 ; AVX1:       # %bb.0:
   1697 ; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x59,0xc1]
   1698 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1699 ;
   1700 ; AVX512-LABEL: test_mm_mul_ps:
   1701 ; AVX512:       # %bb.0:
   1702 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x59,0xc1]
   1703 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_mul_ps: plain IR fmul on <4 x float>; selects (v)mulps.
   1704   %res = fmul <4 x float> %a0, %a1
   1705   ret <4 x float> %res
   1706 }
   1707 
   1708 define <4 x float> @test_mm_mul_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
   1709 ; SSE-LABEL: test_mm_mul_ss:
   1710 ; SSE:       # %bb.0:
   1711 ; SSE-NEXT:    mulss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x59,0xc1]
   1712 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1713 ;
   1714 ; AVX1-LABEL: test_mm_mul_ss:
   1715 ; AVX1:       # %bb.0:
   1716 ; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x59,0xc1]
   1717 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1718 ;
   1719 ; AVX512-LABEL: test_mm_mul_ss:
   1720 ; AVX512:       # %bb.0:
   1721 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x59,0xc1]
   1722 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_mul_ss pattern: extract lane 0 of both operands, scalar fmul,
        ; reinsert into %a0's lane 0 — recognized as a single (v)mulss.
   1723   %ext0 = extractelement <4 x float> %a0, i32 0
   1724   %ext1 = extractelement <4 x float> %a1, i32 0
   1725   %fmul = fmul float %ext0, %ext1
   1726   %res = insertelement <4 x float> %a0, float %fmul, i32 0
   1727   ret <4 x float> %res
   1728 }
   1729 
   1730 define <4 x float> @test_mm_or_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
   1731 ; SSE-LABEL: test_mm_or_ps:
   1732 ; SSE:       # %bb.0:
   1733 ; SSE-NEXT:    orps %xmm1, %xmm0 # encoding: [0x0f,0x56,0xc1]
   1734 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1735 ;
   1736 ; AVX1-LABEL: test_mm_or_ps:
   1737 ; AVX1:       # %bb.0:
   1738 ; AVX1-NEXT:    vorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x56,0xc1]
   1739 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1740 ;
   1741 ; AVX512-LABEL: test_mm_or_ps:
   1742 ; AVX512:       # %bb.0:
   1743 ; AVX512-NEXT:    vorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x56,0xc1]
   1744 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_or_ps pattern: bitwise `or` on the <4 x i32> bitcast of the float
        ; operands, bitcast back — selected as the FP-domain (v)orps.
   1745   %arg0 = bitcast <4 x float> %a0 to <4 x i32>
   1746   %arg1 = bitcast <4 x float> %a1 to <4 x i32>
   1747   %res = or <4 x i32> %arg0, %arg1
   1748   %bc = bitcast <4 x i32> %res to <4 x float>
   1749   ret <4 x float> %bc
   1750 }
   1751 
   1752 define void @test_mm_prefetch(i8* %a0) {
   1753 ; X86-LABEL: test_mm_prefetch:
   1754 ; X86:       # %bb.0:
   1755 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
   1756 ; X86-NEXT:    prefetchnta (%eax) # encoding: [0x0f,0x18,0x00]
   1757 ; X86-NEXT:    retl # encoding: [0xc3]
   1758 ;
   1759 ; X64-LABEL: test_mm_prefetch:
   1760 ; X64:       # %bb.0:
   1761 ; X64-NEXT:    prefetchnta (%rdi) # encoding: [0x0f,0x18,0x07]
   1762 ; X64-NEXT:    retq # encoding: [0xc3]
        ; llvm.prefetch(addr, rw=0 (read), locality=0, cache type=1 (data))
        ; selects the non-temporal prefetchnta hint.
   1763   call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
   1764   ret void
   1765 }
   1766 declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) nounwind readnone
   1767 
   1768 define <4 x float> @test_mm_rcp_ps(<4 x float> %a0) {
   1769 ; SSE-LABEL: test_mm_rcp_ps:
   1770 ; SSE:       # %bb.0:
   1771 ; SSE-NEXT:    rcpps %xmm0, %xmm0 # encoding: [0x0f,0x53,0xc0]
   1772 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1773 ;
   1774 ; AVX-LABEL: test_mm_rcp_ps:
   1775 ; AVX:       # %bb.0:
   1776 ; AVX-NEXT:    vrcpps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x53,0xc0]
   1777 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_rcp_ps: the rcp intrinsic maps 1:1 onto (v)rcpps.
   1778   %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
   1779   ret <4 x float> %res
   1780 }
   1781 declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
   1782 
   1783 define <4 x float> @test_mm_rcp_ss(<4 x float> %a0) {
   1784 ; SSE-LABEL: test_mm_rcp_ss:
   1785 ; SSE:       # %bb.0:
   1786 ; SSE-NEXT:    rcpss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x53,0xc0]
   1787 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1788 ;
   1789 ; AVX-LABEL: test_mm_rcp_ss:
   1790 ; AVX:       # %bb.0:
   1791 ; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x53,0xc0]
   1792 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_rcp_ss: scalar variant; selects (v)rcpss.
   1793   %rcp = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0)
   1794   ret <4 x float> %rcp
   1795 }
   1796 declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
   1797 
   1798 define <4 x float> @test_mm_rsqrt_ps(<4 x float> %a0) {
   1799 ; SSE-LABEL: test_mm_rsqrt_ps:
   1800 ; SSE:       # %bb.0:
   1801 ; SSE-NEXT:    rsqrtps %xmm0, %xmm0 # encoding: [0x0f,0x52,0xc0]
   1802 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1803 ;
   1804 ; AVX-LABEL: test_mm_rsqrt_ps:
   1805 ; AVX:       # %bb.0:
   1806 ; AVX-NEXT:    vrsqrtps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x52,0xc0]
   1807 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_rsqrt_ps: the rsqrt intrinsic maps 1:1 onto (v)rsqrtps.
   1808   %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
   1809   ret <4 x float> %res
   1810 }
   1811 declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
   1812 
   1813 define <4 x float> @test_mm_rsqrt_ss(<4 x float> %a0) {
   1814 ; SSE-LABEL: test_mm_rsqrt_ss:
   1815 ; SSE:       # %bb.0:
   1816 ; SSE-NEXT:    rsqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x52,0xc0]
   1817 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   1818 ;
   1819 ; AVX-LABEL: test_mm_rsqrt_ss:
   1820 ; AVX:       # %bb.0:
   1821 ; AVX-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x52,0xc0]
   1822 ; AVX-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
        ; _mm_rsqrt_ss: scalar variant; selects (v)rsqrtss.
   1823   %rsqrt = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0)
   1824   ret <4 x float> %rsqrt
   1825 }
   1826 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
   1827 
   1828 define void @test_MM_SET_EXCEPTION_MASK(i32 %a0) nounwind {
   1829 ; X86-SSE-LABEL: test_MM_SET_EXCEPTION_MASK:
   1830 ; X86-SSE:       # %bb.0:
   1831 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
   1832 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
   1833 ; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
   1834 ; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
   1835 ; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
   1836 ; X86-SSE-NEXT:    andl $-8065, %edx # encoding: [0x81,0xe2,0x7f,0xe0,0xff,0xff]
   1837 ; X86-SSE-NEXT:    # imm = 0xE07F
   1838 ; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
   1839 ; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
   1840 ; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
   1841 ; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
   1842 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1843 ;
   1844 ; X86-AVX-LABEL: test_MM_SET_EXCEPTION_MASK:
   1845 ; X86-AVX:       # %bb.0:
   1846 ; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
   1847 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
   1848 ; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
   1849 ; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
   1850 ; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
   1851 ; X86-AVX-NEXT:    andl $-8065, %edx # encoding: [0x81,0xe2,0x7f,0xe0,0xff,0xff]
   1852 ; X86-AVX-NEXT:    # imm = 0xE07F
   1853 ; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
   1854 ; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
   1855 ; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
   1856 ; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
   1857 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
   1858 ;
   1859 ; X64-SSE-LABEL: test_MM_SET_EXCEPTION_MASK:
   1860 ; X64-SSE:       # %bb.0:
   1861 ; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
   1862 ; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
   1863 ; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
   1864 ; X64-SSE-NEXT:    andl $-8065, %ecx # encoding: [0x81,0xe1,0x7f,0xe0,0xff,0xff]
   1865 ; X64-SSE-NEXT:    # imm = 0xE07F
   1866 ; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
   1867 ; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
   1868 ; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
   1869 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1870 ;
   1871 ; X64-AVX-LABEL: test_MM_SET_EXCEPTION_MASK:
   1872 ; X64-AVX:       # %bb.0:
   1873 ; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
   1874 ; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
   1875 ; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
   1876 ; X64-AVX-NEXT:    andl $-8065, %ecx # encoding: [0x81,0xe1,0x7f,0xe0,0xff,0xff]
   1877 ; X64-AVX-NEXT:    # imm = 0xE07F
   1878 ; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
   1879 ; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
   1880 ; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
   1881 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
        ; _MM_SET_EXCEPTION_MASK pattern: store MXCSR to a stack slot via
        ; (v)stmxcsr, clear the exception-mask field (-8065 == 0xFFFFE07F ==
        ; ~0x1F80, i.e. bits 7-12), OR in the new mask, reload via (v)ldmxcsr.
   1882   %1 = alloca i32, align 4
   1883   %2 = bitcast i32* %1 to i8*
   1884   call void @llvm.x86.sse.stmxcsr(i8* %2)
   1885   %3 = load i32, i32* %1
   1886   %4 = and i32 %3, -8065
   1887   %5 = or i32 %4, %a0
   1888   store i32 %5, i32* %1
   1889   call void @llvm.x86.sse.ldmxcsr(i8* %2)
   1890   ret void
   1891 }
   1892 declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind readnone
   1893 
   1894 define void @test_MM_SET_EXCEPTION_STATE(i32 %a0) nounwind {
   1895 ; X86-SSE-LABEL: test_MM_SET_EXCEPTION_STATE:
   1896 ; X86-SSE:       # %bb.0:
   1897 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
   1898 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
   1899 ; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
   1900 ; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
   1901 ; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
   1902 ; X86-SSE-NEXT:    andl $-64, %edx # encoding: [0x83,0xe2,0xc0]
   1903 ; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
   1904 ; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
   1905 ; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
   1906 ; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
   1907 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   1908 ;
   1909 ; X86-AVX-LABEL: test_MM_SET_EXCEPTION_STATE:
   1910 ; X86-AVX:       # %bb.0:
   1911 ; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
   1912 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
   1913 ; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
   1914 ; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
   1915 ; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
   1916 ; X86-AVX-NEXT:    andl $-64, %edx # encoding: [0x83,0xe2,0xc0]
   1917 ; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
   1918 ; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
   1919 ; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
   1920 ; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
   1921 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
   1922 ;
   1923 ; X64-SSE-LABEL: test_MM_SET_EXCEPTION_STATE:
   1924 ; X64-SSE:       # %bb.0:
   1925 ; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
   1926 ; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
   1927 ; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
   1928 ; X64-SSE-NEXT:    andl $-64, %ecx # encoding: [0x83,0xe1,0xc0]
   1929 ; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
   1930 ; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
   1931 ; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
   1932 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   1933 ;
   1934 ; X64-AVX-LABEL: test_MM_SET_EXCEPTION_STATE:
   1935 ; X64-AVX:       # %bb.0:
   1936 ; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
   1937 ; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
   1938 ; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
   1939 ; X64-AVX-NEXT:    andl $-64, %ecx # encoding: [0x83,0xe1,0xc0]
   1940 ; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
   1941 ; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
   1942 ; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
   1943 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
   1944   %1 = alloca i32, align 4
   1945   %2 = bitcast i32* %1 to i8*
   1946   call void @llvm.x86.sse.stmxcsr(i8* %2)
   1947   %3 = load i32, i32* %1
   1948   %4 = and i32 %3, -64
   1949   %5 = or i32 %4, %a0
   1950   store i32 %5, i32* %1
   1951   call void @llvm.x86.sse.ldmxcsr(i8* %2)
   1952   ret void
   1953 }
   1954 
    1955 define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
; Same MXCSR read-modify-write shape as the other _MM_SET_* tests, but the
; mask is -32769 (~0x8000): clears bit 15 before OR-ing in %a0 (the
; flush-to-zero control, per the _MM_SET_FLUSH_ZERO_MODE intrinsic).
; CHECK lines are autogenerated by update_llc_test_checks.py.
    1956 ; X86-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
    1957 ; X86-SSE:       # %bb.0:
    1958 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
    1959 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
    1960 ; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
    1961 ; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
    1962 ; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
    1963 ; X86-SSE-NEXT:    andl $-32769, %edx # encoding: [0x81,0xe2,0xff,0x7f,0xff,0xff]
    1964 ; X86-SSE-NEXT:    # imm = 0xFFFF7FFF
    1965 ; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
    1966 ; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
    1967 ; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
    1968 ; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
    1969 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    1970 ;
    1971 ; X86-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
    1972 ; X86-AVX:       # %bb.0:
    1973 ; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
    1974 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
    1975 ; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
    1976 ; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
    1977 ; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
    1978 ; X86-AVX-NEXT:    andl $-32769, %edx # encoding: [0x81,0xe2,0xff,0x7f,0xff,0xff]
    1979 ; X86-AVX-NEXT:    # imm = 0xFFFF7FFF
    1980 ; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
    1981 ; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
    1982 ; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
    1983 ; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
    1984 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
    1985 ;
    1986 ; X64-SSE-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
    1987 ; X64-SSE:       # %bb.0:
    1988 ; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
    1989 ; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
    1990 ; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
    1991 ; X64-SSE-NEXT:    andl $-32769, %ecx # encoding: [0x81,0xe1,0xff,0x7f,0xff,0xff]
    1992 ; X64-SSE-NEXT:    # imm = 0xFFFF7FFF
    1993 ; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
    1994 ; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
    1995 ; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
    1996 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    1997 ;
    1998 ; X64-AVX-LABEL: test_MM_SET_FLUSH_ZERO_MODE:
    1999 ; X64-AVX:       # %bb.0:
    2000 ; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
    2001 ; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
    2002 ; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
    2003 ; X64-AVX-NEXT:    andl $-32769, %ecx # encoding: [0x81,0xe1,0xff,0x7f,0xff,0xff]
    2004 ; X64-AVX-NEXT:    # imm = 0xFFFF7FFF
    2005 ; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
    2006 ; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
    2007 ; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
    2008 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
; IR: stmxcsr -> and ~0x8000 -> or %a0 -> ldmxcsr.
    2009   %1 = alloca i32, align 4
    2010   %2 = bitcast i32* %1 to i8*
    2011   call void @llvm.x86.sse.stmxcsr(i8* %2)
    2012   %3 = load i32, i32* %1
    2013   %4 = and i32 %3, -32769
    2014   %5 = or i32 %4, %a0
    2015   store i32 %5, i32* %1
    2016   call void @llvm.x86.sse.ldmxcsr(i8* %2)
    2017   ret void
    2018 }
   2019 
    2020 define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
; Builds a <4 x float> with the arguments in REVERSED lane order
; (%a3 -> lane 0 ... %a0 -> lane 3), mirroring _mm_set_ps semantics, and
; checks the fast-isel lowering: unpcklps+movlhps on SSE, a vinsertps
; chain on AVX/AVX-512. CHECK lines autogenerated by update_llc_test_checks.py.
    2021 ; X86-SSE-LABEL: test_mm_set_ps:
    2022 ; X86-SSE:       # %bb.0:
    2023 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
    2024 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2025 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
    2026 ; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
    2027 ; X86-SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
    2028 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
    2029 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08]
    2030 ; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
    2031 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x04]
    2032 ; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
    2033 ; X86-SSE-NEXT:    unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
    2034 ; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
    2035 ; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
    2036 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
    2037 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2038 ;
    2039 ; X86-AVX1-LABEL: test_mm_set_ps:
    2040 ; X86-AVX1:       # %bb.0:
    2041 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
    2042 ; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2043 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
    2044 ; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
    2045 ; X86-AVX1-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
    2046 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
    2047 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
    2048 ; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
    2049 ; X86-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
    2050 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
    2051 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
    2052 ; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
    2053 ; X86-AVX1-NEXT:    vinsertps $48, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
    2054 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0,1,2],xmm1[0]
    2055 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2056 ;
    2057 ; X86-AVX512-LABEL: test_mm_set_ps:
    2058 ; X86-AVX512:       # %bb.0:
    2059 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
    2060 ; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2061 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
    2062 ; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
    2063 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
    2064 ; X86-AVX512-NEXT:    # xmm2 = mem[0],zero,zero,zero
    2065 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
    2066 ; X86-AVX512-NEXT:    # xmm3 = mem[0],zero,zero,zero
    2067 ; X86-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
    2068 ; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
    2069 ; X86-AVX512-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
    2070 ; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
    2071 ; X86-AVX512-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
    2072 ; X86-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
    2073 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2074 ;
    2075 ; X64-SSE-LABEL: test_mm_set_ps:
    2076 ; X64-SSE:       # %bb.0:
    2077 ; X64-SSE-NEXT:    unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
    2078 ; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
    2079 ; X64-SSE-NEXT:    unpcklps %xmm2, %xmm3 # encoding: [0x0f,0x14,0xda]
    2080 ; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
    2081 ; X64-SSE-NEXT:    movlhps %xmm1, %xmm3 # encoding: [0x0f,0x16,0xd9]
    2082 ; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm1[0]
    2083 ; X64-SSE-NEXT:    movaps %xmm3, %xmm0 # encoding: [0x0f,0x28,0xc3]
    2084 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2085 ;
    2086 ; X64-AVX1-LABEL: test_mm_set_ps:
    2087 ; X64-AVX1:       # %bb.0:
    2088 ; X64-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
    2089 ; X64-AVX1-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    2090 ; X64-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
    2091 ; X64-AVX1-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    2092 ; X64-AVX1-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
    2093 ; X64-AVX1-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
    2094 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2095 ;
    2096 ; X64-AVX512-LABEL: test_mm_set_ps:
    2097 ; X64-AVX512:       # %bb.0:
    2098 ; X64-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
    2099 ; X64-AVX512-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
    2100 ; X64-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
    2101 ; X64-AVX512-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
    2102 ; X64-AVX512-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
    2103 ; X64-AVX512-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
    2104 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
; IR: note the arguments are inserted highest-first (a3 into lane 0).
    2105   %res0  = insertelement <4 x float> undef, float %a3, i32 0
    2106   %res1  = insertelement <4 x float> %res0, float %a2, i32 1
    2107   %res2  = insertelement <4 x float> %res1, float %a1, i32 2
    2108   %res3  = insertelement <4 x float> %res2, float %a0, i32 3
    2109   ret <4 x float> %res3
    2110 }
   2111 
    2112 define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
; Splats %a0 into all four lanes (the _mm_set_ps1 pattern). Expected
; lowering: shufps $0 on SSE, vpermilps $0 on AVX1, vbroadcastss on AVX-512.
; CHECK lines autogenerated by update_llc_test_checks.py.
    2113 ; X86-SSE-LABEL: test_mm_set_ps1:
    2114 ; X86-SSE:       # %bb.0:
    2115 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
    2116 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2117 ; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
    2118 ; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2119 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2120 ;
    2121 ; X86-AVX1-LABEL: test_mm_set_ps1:
    2122 ; X86-AVX1:       # %bb.0:
    2123 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
    2124 ; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2125 ; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
    2126 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2127 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2128 ;
    2129 ; X86-AVX512-LABEL: test_mm_set_ps1:
    2130 ; X86-AVX512:       # %bb.0:
    2131 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
    2132 ; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2133 ; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
    2134 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2135 ;
    2136 ; X64-SSE-LABEL: test_mm_set_ps1:
    2137 ; X64-SSE:       # %bb.0:
    2138 ; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
    2139 ; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2140 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2141 ;
    2142 ; X64-AVX1-LABEL: test_mm_set_ps1:
    2143 ; X64-AVX1:       # %bb.0:
    2144 ; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
    2145 ; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2146 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2147 ;
    2148 ; X64-AVX512-LABEL: test_mm_set_ps1:
    2149 ; X64-AVX512:       # %bb.0:
    2150 ; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
    2151 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
; IR: insert the same scalar into every lane; identical to test_mm_set1_ps.
    2152   %res0  = insertelement <4 x float> undef, float %a0, i32 0
    2153   %res1  = insertelement <4 x float> %res0, float %a0, i32 1
    2154   %res2  = insertelement <4 x float> %res1, float %a0, i32 2
    2155   %res3  = insertelement <4 x float> %res2, float %a0, i32 3
    2156   ret <4 x float> %res3
    2157 }
   2158 
    2159 define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
; MXCSR read-modify-write with mask -24577 (~0x6000): clears bits 13-14
; before OR-ing in %a0 (the rounding-control field, per the
; _MM_SET_ROUNDING_MODE intrinsic). CHECK lines autogenerated by
; update_llc_test_checks.py.
    2160 ; X86-SSE-LABEL: test_MM_SET_ROUNDING_MODE:
    2161 ; X86-SSE:       # %bb.0:
    2162 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
    2163 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
    2164 ; X86-SSE-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
    2165 ; X86-SSE-NEXT:    stmxcsr (%ecx) # encoding: [0x0f,0xae,0x19]
    2166 ; X86-SSE-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
    2167 ; X86-SSE-NEXT:    andl $-24577, %edx # encoding: [0x81,0xe2,0xff,0x9f,0xff,0xff]
    2168 ; X86-SSE-NEXT:    # imm = 0x9FFF
    2169 ; X86-SSE-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
    2170 ; X86-SSE-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
    2171 ; X86-SSE-NEXT:    ldmxcsr (%ecx) # encoding: [0x0f,0xae,0x11]
    2172 ; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
    2173 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2174 ;
    2175 ; X86-AVX-LABEL: test_MM_SET_ROUNDING_MODE:
    2176 ; X86-AVX:       # %bb.0:
    2177 ; X86-AVX-NEXT:    pushl %eax # encoding: [0x50]
    2178 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08]
    2179 ; X86-AVX-NEXT:    movl %esp, %ecx # encoding: [0x89,0xe1]
    2180 ; X86-AVX-NEXT:    vstmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x19]
    2181 ; X86-AVX-NEXT:    movl (%esp), %edx # encoding: [0x8b,0x14,0x24]
    2182 ; X86-AVX-NEXT:    andl $-24577, %edx # encoding: [0x81,0xe2,0xff,0x9f,0xff,0xff]
    2183 ; X86-AVX-NEXT:    # imm = 0x9FFF
    2184 ; X86-AVX-NEXT:    orl %eax, %edx # encoding: [0x09,0xc2]
    2185 ; X86-AVX-NEXT:    movl %edx, (%esp) # encoding: [0x89,0x14,0x24]
    2186 ; X86-AVX-NEXT:    vldmxcsr (%ecx) # encoding: [0xc5,0xf8,0xae,0x11]
    2187 ; X86-AVX-NEXT:    popl %eax # encoding: [0x58]
    2188 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
    2189 ;
    2190 ; X64-SSE-LABEL: test_MM_SET_ROUNDING_MODE:
    2191 ; X64-SSE:       # %bb.0:
    2192 ; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
    2193 ; X64-SSE-NEXT:    stmxcsr (%rax) # encoding: [0x0f,0xae,0x18]
    2194 ; X64-SSE-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
    2195 ; X64-SSE-NEXT:    andl $-24577, %ecx # encoding: [0x81,0xe1,0xff,0x9f,0xff,0xff]
    2196 ; X64-SSE-NEXT:    # imm = 0x9FFF
    2197 ; X64-SSE-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
    2198 ; X64-SSE-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
    2199 ; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
    2200 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2201 ;
    2202 ; X64-AVX-LABEL: test_MM_SET_ROUNDING_MODE:
    2203 ; X64-AVX:       # %bb.0:
    2204 ; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
    2205 ; X64-AVX-NEXT:    vstmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x18]
    2206 ; X64-AVX-NEXT:    movl -{{[0-9]+}}(%rsp), %ecx # encoding: [0x8b,0x4c,0x24,0xfc]
    2207 ; X64-AVX-NEXT:    andl $-24577, %ecx # encoding: [0x81,0xe1,0xff,0x9f,0xff,0xff]
    2208 ; X64-AVX-NEXT:    # imm = 0x9FFF
    2209 ; X64-AVX-NEXT:    orl %edi, %ecx # encoding: [0x09,0xf9]
    2210 ; X64-AVX-NEXT:    movl %ecx, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x4c,0x24,0xfc]
    2211 ; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
    2212 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
; IR: stmxcsr -> and ~0x6000 -> or %a0 -> ldmxcsr.
    2213   %1 = alloca i32, align 4
    2214   %2 = bitcast i32* %1 to i8*
    2215   call void @llvm.x86.sse.stmxcsr(i8* %2)
    2216   %3 = load i32, i32* %1
    2217   %4 = and i32 %3, -24577
    2218   %5 = or i32 %4, %a0
    2219   store i32 %5, i32* %1
    2220   call void @llvm.x86.sse.ldmxcsr(i8* %2)
    2221   ret void
    2222 }
   2223 
    2224 define <4 x float> @test_mm_set_ss(float %a0) nounwind {
; Puts %a0 in lane 0 and explicit 0.0 in lanes 1-3 (the _mm_set_ss pattern).
; Expected lowering: xorps + movss merge on SSE, vxorps + vblendps $1 on AVX.
; CHECK lines autogenerated by update_llc_test_checks.py.
    2225 ; X86-SSE-LABEL: test_mm_set_ss:
    2226 ; X86-SSE:       # %bb.0:
    2227 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
    2228 ; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
    2229 ; X86-SSE-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
    2230 ; X86-SSE-NEXT:    movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
    2231 ; X86-SSE-NEXT:    # xmm0 = xmm1[0],xmm0[1,2,3]
    2232 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2233 ;
    2234 ; X86-AVX1-LABEL: test_mm_set_ss:
    2235 ; X86-AVX1:       # %bb.0:
    2236 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
    2237 ; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2238 ; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
    2239 ; X86-AVX1-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
    2240 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
    2241 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2242 ;
    2243 ; X86-AVX512-LABEL: test_mm_set_ss:
    2244 ; X86-AVX512:       # %bb.0:
    2245 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
    2246 ; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2247 ; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
    2248 ; X86-AVX512-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
    2249 ; X86-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
    2250 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2251 ;
    2252 ; X64-SSE-LABEL: test_mm_set_ss:
    2253 ; X64-SSE:       # %bb.0:
    2254 ; X64-SSE-NEXT:    xorps %xmm1, %xmm1 # encoding: [0x0f,0x57,0xc9]
    2255 ; X64-SSE-NEXT:    movss %xmm0, %xmm1 # encoding: [0xf3,0x0f,0x10,0xc8]
    2256 ; X64-SSE-NEXT:    # xmm1 = xmm0[0],xmm1[1,2,3]
    2257 ; X64-SSE-NEXT:    movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
    2258 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2259 ;
    2260 ; X64-AVX-LABEL: test_mm_set_ss:
    2261 ; X64-AVX:       # %bb.0:
    2262 ; X64-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
    2263 ; X64-AVX-NEXT:    vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
    2264 ; X64-AVX-NEXT:    # xmm0 = xmm0[0],xmm1[1,2,3]
    2265 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
; IR: scalar in lane 0, upper three lanes zeroed explicitly.
    2266   %res0  = insertelement <4 x float> undef, float %a0, i32 0
    2267   %res1  = insertelement <4 x float> %res0, float 0.0, i32 1
    2268   %res2  = insertelement <4 x float> %res1, float 0.0, i32 2
    2269   %res3  = insertelement <4 x float> %res2, float 0.0, i32 3
    2270   ret <4 x float> %res3
    2271 }
   2272 
    2273 define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
; Splat of %a0 to all four lanes (_mm_set1_ps); same IR and expected
; lowering as test_mm_set_ps1 (shufps / vpermilps / vbroadcastss).
; CHECK lines autogenerated by update_llc_test_checks.py.
    2274 ; X86-SSE-LABEL: test_mm_set1_ps:
    2275 ; X86-SSE:       # %bb.0:
    2276 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
    2277 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2278 ; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
    2279 ; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2280 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2281 ;
    2282 ; X86-AVX1-LABEL: test_mm_set1_ps:
    2283 ; X86-AVX1:       # %bb.0:
    2284 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
    2285 ; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2286 ; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
    2287 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2288 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2289 ;
    2290 ; X86-AVX512-LABEL: test_mm_set1_ps:
    2291 ; X86-AVX512:       # %bb.0:
    2292 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
    2293 ; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2294 ; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
    2295 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2296 ;
    2297 ; X64-SSE-LABEL: test_mm_set1_ps:
    2298 ; X64-SSE:       # %bb.0:
    2299 ; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
    2300 ; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2301 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2302 ;
    2303 ; X64-AVX1-LABEL: test_mm_set1_ps:
    2304 ; X64-AVX1:       # %bb.0:
    2305 ; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
    2306 ; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2307 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2308 ;
    2309 ; X64-AVX512-LABEL: test_mm_set1_ps:
    2310 ; X64-AVX512:       # %bb.0:
    2311 ; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
    2312 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
; IR: insert the same scalar into every lane.
    2313   %res0  = insertelement <4 x float> undef, float %a0, i32 0
    2314   %res1  = insertelement <4 x float> %res0, float %a0, i32 1
    2315   %res2  = insertelement <4 x float> %res1, float %a0, i32 2
    2316   %res3  = insertelement <4 x float> %res2, float %a0, i32 3
    2317   ret <4 x float> %res3
    2318 }
   2319 
    2320 define void @test_mm_setcsr(i32 %a0) nounwind {
; Stores %a0 to a stack slot and loads it into MXCSR via the sse.ldmxcsr
; intrinsic (_mm_setcsr). On X86 the incoming arg slot itself is reused
; (lea of the arg address, no local store); on X64 the reg arg is spilled
; first. CHECK lines autogenerated by update_llc_test_checks.py.
    2321 ; X86-SSE-LABEL: test_mm_setcsr:
    2322 ; X86-SSE:       # %bb.0:
    2323 ; X86-SSE-NEXT:    leal {{[0-9]+}}(%esp), %eax # encoding: [0x8d,0x44,0x24,0x04]
    2324 ; X86-SSE-NEXT:    ldmxcsr (%eax) # encoding: [0x0f,0xae,0x10]
    2325 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2326 ;
    2327 ; X86-AVX-LABEL: test_mm_setcsr:
    2328 ; X86-AVX:       # %bb.0:
    2329 ; X86-AVX-NEXT:    leal {{[0-9]+}}(%esp), %eax # encoding: [0x8d,0x44,0x24,0x04]
    2330 ; X86-AVX-NEXT:    vldmxcsr (%eax) # encoding: [0xc5,0xf8,0xae,0x10]
    2331 ; X86-AVX-NEXT:    retl # encoding: [0xc3]
    2332 ;
    2333 ; X64-SSE-LABEL: test_mm_setcsr:
    2334 ; X64-SSE:       # %bb.0:
    2335 ; X64-SSE-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x7c,0x24,0xfc]
    2336 ; X64-SSE-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
    2337 ; X64-SSE-NEXT:    ldmxcsr (%rax) # encoding: [0x0f,0xae,0x10]
    2338 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2339 ;
    2340 ; X64-AVX-LABEL: test_mm_setcsr:
    2341 ; X64-AVX:       # %bb.0:
    2342 ; X64-AVX-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x7c,0x24,0xfc]
    2343 ; X64-AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8d,0x44,0x24,0xfc]
    2344 ; X64-AVX-NEXT:    vldmxcsr (%rax) # encoding: [0xc5,0xf8,0xae,0x10]
    2345 ; X64-AVX-NEXT:    retq # encoding: [0xc3]
; IR: spill %a0 and hand its address to ldmxcsr.
    2346   %st = alloca i32, align 4
    2347   store i32 %a0, i32* %st, align 4
    2348   %bc = bitcast i32* %st to i8*
    2349   call void @llvm.x86.sse.ldmxcsr(i8* %bc)
    2350   ret void
    2351 }
   2352 
   2353 define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
   2354 ; X86-SSE-LABEL: test_mm_setr_ps:
   2355 ; X86-SSE:       # %bb.0:
   2356 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
   2357 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
   2358 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
   2359 ; X86-SSE-NEXT:    # xmm1 = mem[0],zero,zero,zero
   2360 ; X86-SSE-NEXT:    unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
   2361 ; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   2362 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
   2363 ; X86-SSE-NEXT:    # xmm2 = mem[0],zero,zero,zero
   2364 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
   2365 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
   2366 ; X86-SSE-NEXT:    unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2]
   2367 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
   2368 ; X86-SSE-NEXT:    movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
   2369 ; X86-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0]
   2370 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   2371 ;
   2372 ; X86-AVX1-LABEL: test_mm_setr_ps:
   2373 ; X86-AVX1:       # %bb.0:
   2374 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
   2375 ; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
   2376 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
   2377 ; X86-AVX1-NEXT:    # xmm1 = mem[0],zero,zero,zero
   2378 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
   2379 ; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
   2380 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
   2381 ; X86-AVX1-NEXT:    # xmm3 = mem[0],zero,zero,zero
   2382 ; X86-AVX1-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
   2383 ; X86-AVX1-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
   2384 ; X86-AVX1-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
   2385 ; X86-AVX1-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
   2386 ; X86-AVX1-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
   2387 ; X86-AVX1-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
   2388 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
   2389 ;
   2390 ; X86-AVX512-LABEL: test_mm_setr_ps:
   2391 ; X86-AVX512:       # %bb.0:
   2392 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
   2393 ; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
   2394 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
   2395 ; X86-AVX512-NEXT:    # xmm1 = mem[0],zero,zero,zero
   2396 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08]
   2397 ; X86-AVX512-NEXT:    # xmm2 = mem[0],zero,zero,zero
   2398 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04]
   2399 ; X86-AVX512-NEXT:    # xmm3 = mem[0],zero,zero,zero
   2400 ; X86-AVX512-NEXT:    vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10]
   2401 ; X86-AVX512-NEXT:    # xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
   2402 ; X86-AVX512-NEXT:    vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20]
   2403 ; X86-AVX512-NEXT:    # xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
   2404 ; X86-AVX512-NEXT:    vinsertps $48, %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x71,0x21,0xc0,0x30]
   2405 ; X86-AVX512-NEXT:    # xmm0 = xmm1[0,1,2],xmm0[0]
   2406 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
   2407 ;
   2408 ; X64-SSE-LABEL: test_mm_setr_ps:
   2409 ; X64-SSE:       # %bb.0:
   2410 ; X64-SSE-NEXT:    unpcklps %xmm3, %xmm2 # encoding: [0x0f,0x14,0xd3]
   2411 ; X64-SSE-NEXT:    # xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
   2412 ; X64-SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
   2413 ; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   2414 ; X64-SSE-NEXT:    movlhps %xmm2, %xmm0 # encoding: [0x0f,0x16,0xc2]
   2415 ; X64-SSE-NEXT:    # xmm0 = xmm0[0],xmm2[0]
   2416 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   2417 ;
   2418 ; X64-AVX1-LABEL: test_mm_setr_ps:
   2419 ; X64-AVX1:       # %bb.0:
   2420 ; X64-AVX1-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
   2421 ; X64-AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
   2422 ; X64-AVX1-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
   2423 ; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
   2424 ; X64-AVX1-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
   2425 ; X64-AVX1-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
   2426 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
   2427 ;
   2428 ; X64-AVX512-LABEL: test_mm_setr_ps:
   2429 ; X64-AVX512:       # %bb.0:
   2430 ; X64-AVX512-NEXT:    vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
   2431 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
   2432 ; X64-AVX512-NEXT:    vinsertps $32, %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc2,0x20]
   2433 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
   2434 ; X64-AVX512-NEXT:    vinsertps $48, %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc3,0x30]
   2435 ; X64-AVX512-NEXT:    # xmm0 = xmm0[0,1,2],xmm3[0]
   2436 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
   2437   %res0  = insertelement <4 x float> undef, float %a0, i32 0
   2438   %res1  = insertelement <4 x float> %res0, float %a1, i32 1
   2439   %res2  = insertelement <4 x float> %res1, float %a2, i32 2
   2440   %res3  = insertelement <4 x float> %res2, float %a3, i32 3
   2441   ret <4 x float> %res3
   2442 }
   2443 
    2444 define <4 x float> @test_mm_setzero_ps() {
    2445 ; SSE-LABEL: test_mm_setzero_ps:
    2446 ; SSE:       # %bb.0:
    2447 ; SSE-NEXT:    xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
    2448 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    2449 ;
    2450 ; AVX1-LABEL: test_mm_setzero_ps:
    2451 ; AVX1:       # %bb.0:
    2452 ; AVX1-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0]
    2453 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    2454 ;
    2455 ; AVX512-LABEL: test_mm_setzero_ps:
    2456 ; AVX512:       # %bb.0:
    2457 ; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc0]
    2458 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
         ; _mm_setzero_ps: every target materializes the zero vector with the self-xor idiom
         ; ((v)xorps reg,reg), per the autogenerated checks above.
    2459   ret <4 x float> zeroinitializer
    2460 }
   2461 
    2462 define void @test_mm_sfence() nounwind {
    2463 ; CHECK-LABEL: test_mm_sfence:
    2464 ; CHECK:       # %bb.0:
    2465 ; CHECK-NEXT:    sfence # encoding: [0x0f,0xae,0xf8]
    2466 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
         ; _mm_sfence maps 1:1 onto the SFENCE instruction via the llvm.x86.sse.sfence
         ; intrinsic, identically for every RUN configuration (single CHECK prefix).
    2467   call void @llvm.x86.sse.sfence()
    2468   ret void
    2469 }
   2470 declare void @llvm.x86.sse.sfence() nounwind readnone
   2471 
    2472 define <4 x float> @test_mm_shuffle_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
    2473 ; SSE-LABEL: test_mm_shuffle_ps:
    2474 ; SSE:       # %bb.0:
    2475 ; SSE-NEXT:    shufps $0, %xmm1, %xmm0 # encoding: [0x0f,0xc6,0xc1,0x00]
    2476 ; SSE-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
    2477 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    2478 ;
    2479 ; AVX1-LABEL: test_mm_shuffle_ps:
    2480 ; AVX1:       # %bb.0:
    2481 ; AVX1-NEXT:    vshufps $0, %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc1,0x00]
    2482 ; AVX1-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
    2483 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    2484 ;
    2485 ; AVX512-LABEL: test_mm_shuffle_ps:
    2486 ; AVX512:       # %bb.0:
    2487 ; AVX512-NEXT:    vshufps $0, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc1,0x00]
    2488 ; AVX512-NEXT:    # xmm0 = xmm0[0,0],xmm1[0,0]
    2489 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
         ; Shuffle mask <0,0,4,4> = lane 0 of %a0 twice then lane 0 of %a1 twice,
         ; which lowers to a single (v)shufps with immediate 0.
    2490   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
    2491   ret <4 x float> %res
    2492 }
   2493 
    2494 define <4 x float> @test_mm_sqrt_ps(<4 x float> %a0) {
    2495 ; SSE-LABEL: test_mm_sqrt_ps:
    2496 ; SSE:       # %bb.0:
    2497 ; SSE-NEXT:    sqrtps %xmm0, %xmm0 # encoding: [0x0f,0x51,0xc0]
    2498 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    2499 ;
    2500 ; AVX1-LABEL: test_mm_sqrt_ps:
    2501 ; AVX1:       # %bb.0:
    2502 ; AVX1-NEXT:    vsqrtps %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x51,0xc0]
    2503 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    2504 ;
    2505 ; AVX512-LABEL: test_mm_sqrt_ps:
    2506 ; AVX512:       # %bb.0:
    2507 ; AVX512-NEXT:    vsqrtps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
    2508 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
         ; Full-width llvm.sqrt.v4f32 lowers to a single (v)sqrtps on all targets.
    2509   %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
    2510   ret <4 x float> %res
    2511 }
   2512 declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
   2513 
    2514 define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
    2515 ; SSE-LABEL: test_mm_sqrt_ss:
    2516 ; SSE:       # %bb.0:
    2517 ; SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
    2518 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    2519 ;
    2520 ; AVX1-LABEL: test_mm_sqrt_ss:
    2521 ; AVX1:       # %bb.0:
    2522 ; AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
    2523 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
    2524 ;
    2525 ; AVX512-LABEL: test_mm_sqrt_ss:
    2526 ; AVX512:       # %bb.0:
    2527 ; AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
    2528 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
         ; Scalar sqrt of lane 0 reinserted into %a0: the extract/call/insert pattern
         ; folds to one (v)sqrtss, with lanes 1-3 of %a0 passing through untouched.
    2529   %ext = extractelement <4 x float> %a0, i32 0
    2530   %sqrt = call float @llvm.sqrt.f32(float %ext)
    2531   %ins = insertelement <4 x float> %a0, float %sqrt, i32 0
    2532   ret <4 x float> %ins
    2533 }
   2534 declare float @llvm.sqrt.f32(float) nounwind readnone
   2535 
    2536 define float @test_mm_sqrt_ss_scalar(float %a0) {
    2537 ; X86-SSE-LABEL: test_mm_sqrt_ss_scalar:
    2538 ; X86-SSE:       # %bb.0:
    2539 ; X86-SSE-NEXT:    pushl %eax # encoding: [0x50]
    2540 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
    2541 ; X86-SSE-NEXT:    movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08]
    2542 ; X86-SSE-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2543 ; X86-SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
    2544 ; X86-SSE-NEXT:    movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24]
    2545 ; X86-SSE-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
    2546 ; X86-SSE-NEXT:    popl %eax # encoding: [0x58]
    2547 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
    2548 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2549 ;
    2550 ; X86-AVX1-LABEL: test_mm_sqrt_ss_scalar:
    2551 ; X86-AVX1:       # %bb.0:
    2552 ; X86-AVX1-NEXT:    pushl %eax # encoding: [0x50]
    2553 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
    2554 ; X86-AVX1-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
    2555 ; X86-AVX1-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2556 ; X86-AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
    2557 ; X86-AVX1-NEXT:    vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24]
    2558 ; X86-AVX1-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
    2559 ; X86-AVX1-NEXT:    popl %eax # encoding: [0x58]
    2560 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
    2561 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2562 ;
    2563 ; X86-AVX512-LABEL: test_mm_sqrt_ss_scalar:
    2564 ; X86-AVX512:       # %bb.0:
    2565 ; X86-AVX512-NEXT:    pushl %eax # encoding: [0x50]
    2566 ; X86-AVX512-NEXT:    .cfi_def_cfa_offset 8
    2567 ; X86-AVX512-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
    2568 ; X86-AVX512-NEXT:    # xmm0 = mem[0],zero,zero,zero
    2569 ; X86-AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
    2570 ; X86-AVX512-NEXT:    vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24]
    2571 ; X86-AVX512-NEXT:    flds (%esp) # encoding: [0xd9,0x04,0x24]
    2572 ; X86-AVX512-NEXT:    popl %eax # encoding: [0x58]
    2573 ; X86-AVX512-NEXT:    .cfi_def_cfa_offset 4
    2574 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2575 ;
    2576 ; X64-SSE-LABEL: test_mm_sqrt_ss_scalar:
    2577 ; X64-SSE:       # %bb.0:
    2578 ; X64-SSE-NEXT:    sqrtss %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x51,0xc0]
    2579 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2580 ;
    2581 ; X64-AVX1-LABEL: test_mm_sqrt_ss_scalar:
    2582 ; X64-AVX1:       # %bb.0:
    2583 ; X64-AVX1-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0]
    2584 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2585 ;
    2586 ; X64-AVX512-LABEL: test_mm_sqrt_ss_scalar:
    2587 ; X64-AVX512:       # %bb.0:
    2588 ; X64-AVX512-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0]
    2589 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
         ; Bare scalar float sqrt. X86 (i386) checks show the float arg loaded from the
         ; stack and the result returned through memory + flds (x87 return convention);
         ; X64 checks collapse to a single (v)sqrtss on the register argument.
    2590   %sqrt = call float @llvm.sqrt.f32(float %a0)
    2591   ret float %sqrt
    2592 }
   2593 
    2594 define void @test_mm_store_ps(float *%a0, <4 x float> %a1) {
    2595 ; X86-SSE-LABEL: test_mm_store_ps:
    2596 ; X86-SSE:       # %bb.0:
    2597 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2598 ; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
    2599 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2600 ;
    2601 ; X86-AVX1-LABEL: test_mm_store_ps:
    2602 ; X86-AVX1:       # %bb.0:
    2603 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2604 ; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
    2605 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2606 ;
    2607 ; X86-AVX512-LABEL: test_mm_store_ps:
    2608 ; X86-AVX512:       # %bb.0:
    2609 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2610 ; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
    2611 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2612 ;
    2613 ; X64-SSE-LABEL: test_mm_store_ps:
    2614 ; X64-SSE:       # %bb.0:
    2615 ; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
    2616 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2617 ;
    2618 ; X64-AVX1-LABEL: test_mm_store_ps:
    2619 ; X64-AVX1:       # %bb.0:
    2620 ; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
    2621 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2622 ;
    2623 ; X64-AVX512-LABEL: test_mm_store_ps:
    2624 ; X64-AVX512:       # %bb.0:
    2625 ; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
    2626 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
         ; 16-byte-aligned vector store (align 16) selects the aligned (v)movaps form.
    2627   %arg0 = bitcast float* %a0 to <4 x float>*
    2628   store <4 x float> %a1, <4 x float>* %arg0, align 16
    2629   ret void
    2630 }
   2631 
    2632 define void @test_mm_store_ps1(float *%a0, <4 x float> %a1) {
    2633 ; X86-SSE-LABEL: test_mm_store_ps1:
    2634 ; X86-SSE:       # %bb.0:
    2635 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2636 ; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
    2637 ; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2638 ; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
    2639 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2640 ;
    2641 ; X86-AVX1-LABEL: test_mm_store_ps1:
    2642 ; X86-AVX1:       # %bb.0:
    2643 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2644 ; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
    2645 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2646 ; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
    2647 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2648 ;
    2649 ; X86-AVX512-LABEL: test_mm_store_ps1:
    2650 ; X86-AVX512:       # %bb.0:
    2651 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2652 ; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
    2653 ; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
    2654 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2655 ;
    2656 ; X64-SSE-LABEL: test_mm_store_ps1:
    2657 ; X64-SSE:       # %bb.0:
    2658 ; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
    2659 ; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2660 ; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
    2661 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2662 ;
    2663 ; X64-AVX1-LABEL: test_mm_store_ps1:
    2664 ; X64-AVX1:       # %bb.0:
    2665 ; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
    2666 ; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2667 ; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
    2668 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2669 ;
    2670 ; X64-AVX512-LABEL: test_mm_store_ps1:
    2671 ; X64-AVX512:       # %bb.0:
    2672 ; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
    2673 ; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
    2674 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
         ; Zero-mask shufflevector splats lane 0 (shufps / vpermilps / vbroadcastss per
         ; target), followed by an aligned (v)movaps store.
    2675   %arg0 = bitcast float* %a0 to <4 x float>*
    2676   %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
    2677   store <4 x float> %shuf, <4 x float>* %arg0, align 16
    2678   ret void
    2679 }
   2680 
    2681 define void @test_mm_store_ss(float *%a0, <4 x float> %a1) {
    2682 ; X86-SSE-LABEL: test_mm_store_ss:
    2683 ; X86-SSE:       # %bb.0:
    2684 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2685 ; X86-SSE-NEXT:    movss %xmm0, (%eax) # encoding: [0xf3,0x0f,0x11,0x00]
    2686 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2687 ;
    2688 ; X86-AVX1-LABEL: test_mm_store_ss:
    2689 ; X86-AVX1:       # %bb.0:
    2690 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2691 ; X86-AVX1-NEXT:    vmovss %xmm0, (%eax) # encoding: [0xc5,0xfa,0x11,0x00]
    2692 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2693 ;
    2694 ; X86-AVX512-LABEL: test_mm_store_ss:
    2695 ; X86-AVX512:       # %bb.0:
    2696 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2697 ; X86-AVX512-NEXT:    vmovss %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x00]
    2698 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2699 ;
    2700 ; X64-SSE-LABEL: test_mm_store_ss:
    2701 ; X64-SSE:       # %bb.0:
    2702 ; X64-SSE-NEXT:    movss %xmm0, (%rdi) # encoding: [0xf3,0x0f,0x11,0x07]
    2703 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2704 ;
    2705 ; X64-AVX1-LABEL: test_mm_store_ss:
    2706 ; X64-AVX1:       # %bb.0:
    2707 ; X64-AVX1-NEXT:    vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
    2708 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2709 ;
    2710 ; X64-AVX512-LABEL: test_mm_store_ss:
    2711 ; X64-AVX512:       # %bb.0:
    2712 ; X64-AVX512-NEXT:    vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
    2713 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
         ; Extract + store of lane 0 only (align 1) lowers to a scalar (v)movss store.
    2714   %ext = extractelement <4 x float> %a1, i32 0
    2715   store float %ext, float* %a0, align 1
    2716   ret void
    2717 }
   2718 
    2719 define void @test_mm_store1_ps(float *%a0, <4 x float> %a1) {
    2720 ; X86-SSE-LABEL: test_mm_store1_ps:
    2721 ; X86-SSE:       # %bb.0:
    2722 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2723 ; X86-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
    2724 ; X86-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2725 ; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
    2726 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2727 ;
    2728 ; X86-AVX1-LABEL: test_mm_store1_ps:
    2729 ; X86-AVX1:       # %bb.0:
    2730 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2731 ; X86-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
    2732 ; X86-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2733 ; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
    2734 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2735 ;
    2736 ; X86-AVX512-LABEL: test_mm_store1_ps:
    2737 ; X86-AVX512:       # %bb.0:
    2738 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2739 ; X86-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
    2740 ; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
    2741 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2742 ;
    2743 ; X64-SSE-LABEL: test_mm_store1_ps:
    2744 ; X64-SSE:       # %bb.0:
    2745 ; X64-SSE-NEXT:    shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
    2746 ; X64-SSE-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2747 ; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
    2748 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2749 ;
    2750 ; X64-AVX1-LABEL: test_mm_store1_ps:
    2751 ; X64-AVX1:       # %bb.0:
    2752 ; X64-AVX1-NEXT:    vpermilps $0, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x00]
    2753 ; X64-AVX1-NEXT:    # xmm0 = xmm0[0,0,0,0]
    2754 ; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
    2755 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2756 ;
    2757 ; X64-AVX512-LABEL: test_mm_store1_ps:
    2758 ; X64-AVX512:       # %bb.0:
    2759 ; X64-AVX512-NEXT:    vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
    2760 ; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
    2761 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
         ; _mm_store1_ps uses the same IR as test_mm_store_ps1: splat lane 0 via a
         ; zero-mask shuffle, then an aligned (v)movaps store.
    2762   %arg0 = bitcast float* %a0 to <4 x float>*
    2763   %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
    2764   store <4 x float> %shuf, <4 x float>* %arg0, align 16
    2765   ret void
    2766 }
   2767 
    2768 define void @test_mm_storeh_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
    2769 ; X86-SSE-LABEL: test_mm_storeh_ps:
    2770 ; X86-SSE:       # %bb.0:
    2771 ; X86-SSE-NEXT:    pushl %ebp # encoding: [0x55]
    2772 ; X86-SSE-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
    2773 ; X86-SSE-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
    2774 ; X86-SSE-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
    2775 ; X86-SSE-NEXT:    movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
    2776 ; X86-SSE-NEXT:    movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
    2777 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08]
    2778 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
    2779 ; X86-SSE-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
    2780 ; X86-SSE-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
    2781 ; X86-SSE-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
    2782 ; X86-SSE-NEXT:    popl %ebp # encoding: [0x5d]
    2783 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2784 ;
    2785 ; X86-AVX1-LABEL: test_mm_storeh_ps:
    2786 ; X86-AVX1:       # %bb.0:
    2787 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2788 ; X86-AVX1-NEXT:    vmovhpd %xmm0, (%eax) # encoding: [0xc5,0xf9,0x17,0x00]
    2789 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2790 ;
    2791 ; X86-AVX512-LABEL: test_mm_storeh_ps:
    2792 ; X86-AVX512:       # %bb.0:
    2793 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2794 ; X86-AVX512-NEXT:    vmovhpd %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x17,0x00]
    2795 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2796 ;
    2797 ; X64-SSE-LABEL: test_mm_storeh_ps:
    2798 ; X64-SSE:       # %bb.0:
    2799 ; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
    2800 ; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xf0]
    2801 ; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
    2802 ; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
    2802 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2803 ;
    2804 ; X64-AVX1-LABEL: test_mm_storeh_ps:
    2805 ; X64-AVX1:       # %bb.0:
    2806 ; X64-AVX1-NEXT:    vpextrq $1, %xmm0, %rax # encoding: [0xc4,0xe3,0xf9,0x16,0xc0,0x01]
    2807 ; X64-AVX1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
    2808 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2809 ;
    2810 ; X64-AVX512-LABEL: test_mm_storeh_ps:
    2811 ; X64-AVX512:       # %bb.0:
    2812 ; X64-AVX512-NEXT:    vpextrq $1, %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xf9,0x16,0xc0,0x01]
    2813 ; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
    2814 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
         ; Stores the high 64 bits of %a1 (i64 element 1 after bitcast, i.e. float
         ; lanes 2-3): vmovhpd / vpextrq on AVX, a stack round-trip on the SSE targets.
    2815   %ptr = bitcast x86_mmx* %a0 to i64*
    2816   %bc  = bitcast <4 x float> %a1 to <2 x i64>
    2817   %ext = extractelement <2 x i64> %bc, i32 1
    2818   store i64 %ext, i64* %ptr
    2819   ret void
    2820 }
   2821 
    2822 define void @test_mm_storel_ps(x86_mmx *%a0, <4 x float> %a1) nounwind {
    2823 ; X86-SSE-LABEL: test_mm_storel_ps:
    2824 ; X86-SSE:       # %bb.0:
    2825 ; X86-SSE-NEXT:    pushl %ebp # encoding: [0x55]
    2826 ; X86-SSE-NEXT:    movl %esp, %ebp # encoding: [0x89,0xe5]
    2827 ; X86-SSE-NEXT:    andl $-16, %esp # encoding: [0x83,0xe4,0xf0]
    2828 ; X86-SSE-NEXT:    subl $32, %esp # encoding: [0x83,0xec,0x20]
    2829 ; X86-SSE-NEXT:    movl 8(%ebp), %eax # encoding: [0x8b,0x45,0x08]
    2830 ; X86-SSE-NEXT:    movaps %xmm0, (%esp) # encoding: [0x0f,0x29,0x04,0x24]
    2831 ; X86-SSE-NEXT:    movl (%esp), %ecx # encoding: [0x8b,0x0c,0x24]
    2832 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04]
    2833 ; X86-SSE-NEXT:    movl %edx, 4(%eax) # encoding: [0x89,0x50,0x04]
    2834 ; X86-SSE-NEXT:    movl %ecx, (%eax) # encoding: [0x89,0x08]
    2835 ; X86-SSE-NEXT:    movl %ebp, %esp # encoding: [0x89,0xec]
    2836 ; X86-SSE-NEXT:    popl %ebp # encoding: [0x5d]
    2837 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2838 ;
    2839 ; X86-AVX1-LABEL: test_mm_storel_ps:
    2840 ; X86-AVX1:       # %bb.0:
    2841 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2842 ; X86-AVX1-NEXT:    vmovlps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x13,0x00]
    2843 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2844 ;
    2845 ; X86-AVX512-LABEL: test_mm_storel_ps:
    2846 ; X86-AVX512:       # %bb.0:
    2847 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2848 ; X86-AVX512-NEXT:    vmovlps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x13,0x00]
    2849 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2850 ;
    2851 ; X64-SSE-LABEL: test_mm_storel_ps:
    2852 ; X64-SSE:       # %bb.0:
    2853 ; X64-SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
    2854 ; X64-SSE-NEXT:    movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
    2855 ; X64-SSE-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
    2856 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2857 ;
    2858 ; X64-AVX1-LABEL: test_mm_storel_ps:
    2859 ; X64-AVX1:       # %bb.0:
    2860 ; X64-AVX1-NEXT:    vmovq %xmm0, %rax # encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
    2861 ; X64-AVX1-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
    2862 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2863 ;
    2864 ; X64-AVX512-LABEL: test_mm_storel_ps:
    2865 ; X64-AVX512:       # %bb.0:
    2866 ; X64-AVX512-NEXT:    vmovq %xmm0, %rax # EVEX TO VEX Compression encoding: [0xc4,0xe1,0xf9,0x7e,0xc0]
    2867 ; X64-AVX512-NEXT:    movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
    2868 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
         ; Stores the low 64 bits of %a1 (i64 element 0 after bitcast, i.e. float
         ; lanes 0-1): vmovlps / vmovq on AVX, a stack round-trip on the SSE targets.
    2869   %ptr = bitcast x86_mmx* %a0 to i64*
    2870   %bc  = bitcast <4 x float> %a1 to <2 x i64>
    2871   %ext = extractelement <2 x i64> %bc, i32 0
    2872   store i64 %ext, i64* %ptr
    2873   ret void
    2874 }
   2875 
    2876 define void @test_mm_storer_ps(float *%a0, <4 x float> %a1) {
    2877 ; X86-SSE-LABEL: test_mm_storer_ps:
    2878 ; X86-SSE:       # %bb.0:
    2879 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2880 ; X86-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
    2881 ; X86-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
    2882 ; X86-SSE-NEXT:    movaps %xmm0, (%eax) # encoding: [0x0f,0x29,0x00]
    2883 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2884 ;
    2885 ; X86-AVX1-LABEL: test_mm_storer_ps:
    2886 ; X86-AVX1:       # %bb.0:
    2887 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2888 ; X86-AVX1-NEXT:    vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
    2889 ; X86-AVX1-NEXT:    # xmm0 = xmm0[3,2,1,0]
    2890 ; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
    2891 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2892 ;
    2893 ; X86-AVX512-LABEL: test_mm_storer_ps:
    2894 ; X86-AVX512:       # %bb.0:
    2895 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2896 ; X86-AVX512-NEXT:    vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
    2897 ; X86-AVX512-NEXT:    # xmm0 = xmm0[3,2,1,0]
    2898 ; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
    2899 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2900 ;
    2901 ; X64-SSE-LABEL: test_mm_storer_ps:
    2902 ; X64-SSE:       # %bb.0:
    2903 ; X64-SSE-NEXT:    shufps $27, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x1b]
    2904 ; X64-SSE-NEXT:    # xmm0 = xmm0[3,2,1,0]
    2905 ; X64-SSE-NEXT:    movaps %xmm0, (%rdi) # encoding: [0x0f,0x29,0x07]
    2906 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2907 ;
    2908 ; X64-AVX1-LABEL: test_mm_storer_ps:
    2909 ; X64-AVX1:       # %bb.0:
    2910 ; X64-AVX1-NEXT:    vpermilps $27, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
    2911 ; X64-AVX1-NEXT:    # xmm0 = xmm0[3,2,1,0]
    2912 ; X64-AVX1-NEXT:    vmovaps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x29,0x07]
    2913 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2914 ;
    2915 ; X64-AVX512-LABEL: test_mm_storer_ps:
    2916 ; X64-AVX512:       # %bb.0:
    2917 ; X64-AVX512-NEXT:    vpermilps $27, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x1b]
    2918 ; X64-AVX512-NEXT:    # xmm0 = xmm0[3,2,1,0]
    2919 ; X64-AVX512-NEXT:    vmovaps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x07]
    2920 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
         ; Reversal mask <3,2,1,0> becomes (v)shufps / vpermilps imm 27 (0b00011011),
         ; followed by an aligned (v)movaps store.
    2921   %arg0 = bitcast float* %a0 to <4 x float>*
    2922   %shuf = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
    2923   store <4 x float> %shuf, <4 x float>* %arg0, align 16
    2924   ret void
    2925 }
   2926 
    2927 define void @test_mm_storeu_ps(float *%a0, <4 x float> %a1) {
    2928 ; X86-SSE-LABEL: test_mm_storeu_ps:
    2929 ; X86-SSE:       # %bb.0:
    2930 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2931 ; X86-SSE-NEXT:    movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00]
    2932 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2933 ;
    2934 ; X86-AVX1-LABEL: test_mm_storeu_ps:
    2935 ; X86-AVX1:       # %bb.0:
    2936 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2937 ; X86-AVX1-NEXT:    vmovups %xmm0, (%eax) # encoding: [0xc5,0xf8,0x11,0x00]
    2938 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2939 ;
    2940 ; X86-AVX512-LABEL: test_mm_storeu_ps:
    2941 ; X86-AVX512:       # %bb.0:
    2942 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2943 ; X86-AVX512-NEXT:    vmovups %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x00]
    2944 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2945 ;
    2946 ; X64-SSE-LABEL: test_mm_storeu_ps:
    2947 ; X64-SSE:       # %bb.0:
    2948 ; X64-SSE-NEXT:    movups %xmm0, (%rdi) # encoding: [0x0f,0x11,0x07]
    2949 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2950 ;
    2951 ; X64-AVX1-LABEL: test_mm_storeu_ps:
    2952 ; X64-AVX1:       # %bb.0:
    2953 ; X64-AVX1-NEXT:    vmovups %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x11,0x07]
    2954 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2955 ;
    2956 ; X64-AVX512-LABEL: test_mm_storeu_ps:
    2957 ; X64-AVX512:       # %bb.0:
    2958 ; X64-AVX512-NEXT:    vmovups %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x11,0x07]
    2959 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
         ; align 1 on the store forces the unaligned (v)movups form (contrast with
         ; test_mm_store_ps, which uses align 16 and gets (v)movaps).
    2960   %arg0 = bitcast float* %a0 to <4 x float>*
    2961   store <4 x float> %a1, <4 x float>* %arg0, align 1
    2962   ret void
    2963 }
   2964 
    2965 define void @test_mm_stream_ps(float *%a0, <4 x float> %a1) {
    2966 ; X86-SSE-LABEL: test_mm_stream_ps:
    2967 ; X86-SSE:       # %bb.0:
    2968 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2969 ; X86-SSE-NEXT:    movntps %xmm0, (%eax) # encoding: [0x0f,0x2b,0x00]
    2970 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
    2971 ;
    2972 ; X86-AVX1-LABEL: test_mm_stream_ps:
    2973 ; X86-AVX1:       # %bb.0:
    2974 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2975 ; X86-AVX1-NEXT:    vmovntps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x2b,0x00]
    2976 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
    2977 ;
    2978 ; X86-AVX512-LABEL: test_mm_stream_ps:
    2979 ; X86-AVX512:       # %bb.0:
    2980 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
    2981 ; X86-AVX512-NEXT:    vmovntps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x00]
    2982 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
    2983 ;
    2984 ; X64-SSE-LABEL: test_mm_stream_ps:
    2985 ; X64-SSE:       # %bb.0:
    2986 ; X64-SSE-NEXT:    movntps %xmm0, (%rdi) # encoding: [0x0f,0x2b,0x07]
    2987 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
    2988 ;
    2989 ; X64-AVX1-LABEL: test_mm_stream_ps:
    2990 ; X64-AVX1:       # %bb.0:
    2991 ; X64-AVX1-NEXT:    vmovntps %xmm0, (%rdi) # encoding: [0xc5,0xf8,0x2b,0x07]
    2992 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
    2993 ;
    2994 ; X64-AVX512-LABEL: test_mm_stream_ps:
    2995 ; X64-AVX512:       # %bb.0:
    2996 ; X64-AVX512-NEXT:    vmovntps %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2b,0x07]
    2997 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
         ; The !nontemporal metadata on an aligned store selects the streaming
         ; (v)movntps instruction.
    2998   %arg0 = bitcast float* %a0 to <4 x float>*
    2999   store <4 x float> %a1, <4 x float>* %arg0, align 16, !nontemporal !0
    3000   ret void
    3001 }
   3002 
; _mm_sub_ps: packed single-precision subtract, lowered as a plain vector
; fsub -> subps (SSE) / vsubps (AVX; EVEX-to-VEX compressed for AVX-512).
   3003 define <4 x float> @test_mm_sub_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
   3004 ; SSE-LABEL: test_mm_sub_ps:
   3005 ; SSE:       # %bb.0:
   3006 ; SSE-NEXT:    subps %xmm1, %xmm0 # encoding: [0x0f,0x5c,0xc1]
   3007 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3008 ;
   3009 ; AVX1-LABEL: test_mm_sub_ps:
   3010 ; AVX1:       # %bb.0:
   3011 ; AVX1-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x5c,0xc1]
   3012 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3013 ;
   3014 ; AVX512-LABEL: test_mm_sub_ps:
   3015 ; AVX512:       # %bb.0:
   3016 ; AVX512-NEXT:    vsubps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5c,0xc1]
   3017 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3018   %res = fsub <4 x float> %a0, %a1
   3019   ret <4 x float> %res
   3020 }
   3021 
; _mm_sub_ss: scalar subtract of lane 0 only; lanes 1-3 of %a0 pass through
; unchanged (extract lane 0 from both operands, fsub, reinsert into %a0),
; matching the subss/vsubss semantics.
   3022 define <4 x float> @test_mm_sub_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
   3023 ; SSE-LABEL: test_mm_sub_ss:
   3024 ; SSE:       # %bb.0:
   3025 ; SSE-NEXT:    subss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x5c,0xc1]
   3026 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3027 ;
   3028 ; AVX1-LABEL: test_mm_sub_ss:
   3029 ; AVX1:       # %bb.0:
   3030 ; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x5c,0xc1]
   3031 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3032 ;
   3033 ; AVX512-LABEL: test_mm_sub_ss:
   3034 ; AVX512:       # %bb.0:
   3035 ; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5c,0xc1]
   3036 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3037   %ext0 = extractelement <4 x float> %a0, i32 0
   3038   %ext1 = extractelement <4 x float> %a1, i32 0
   3039   %fsub = fsub float %ext0, %ext1
   3040   %res = insertelement <4 x float> %a0, float %fsub, i32 0
   3041   ret <4 x float> %res
   3042 }
   3043 
; _MM_TRANSPOSE4_PS macro: in-place transpose of a 4x4 float matrix whose rows
; live at %a0..%a3. Two shuffle stages: unpcklps/unpckhps interleave row pairs,
; then movlhps/movhlps (or unpckhpd with 3-operand AVX) recombine 64-bit halves
; into the transposed rows, which are stored back over the inputs.
   3044 define void @test_MM_TRANSPOSE4_PS(<4 x float>* %a0, <4 x float>* %a1, <4 x float>* %a2, <4 x float>* %a3) nounwind {
   3045 ; X86-SSE-LABEL: test_MM_TRANSPOSE4_PS:
   3046 ; X86-SSE:       # %bb.0:
   3047 ; X86-SSE-NEXT:    pushl %esi # encoding: [0x56]
   3048 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
   3049 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
   3050 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
   3051 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
   3052 ; X86-SSE-NEXT:    movaps (%esi), %xmm0 # encoding: [0x0f,0x28,0x06]
   3053 ; X86-SSE-NEXT:    movaps (%edx), %xmm1 # encoding: [0x0f,0x28,0x0a]
   3054 ; X86-SSE-NEXT:    movaps (%ecx), %xmm2 # encoding: [0x0f,0x28,0x11]
   3055 ; X86-SSE-NEXT:    movaps (%eax), %xmm3 # encoding: [0x0f,0x28,0x18]
   3056 ; X86-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
   3057 ; X86-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
   3058 ; X86-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
   3059 ; X86-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
   3060 ; X86-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
   3061 ; X86-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
   3062 ; X86-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
   3063 ; X86-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   3064 ; X86-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
   3065 ; X86-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
   3066 ; X86-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
   3067 ; X86-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
   3068 ; X86-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
   3069 ; X86-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
   3070 ; X86-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
   3071 ; X86-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
   3072 ; X86-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
   3073 ; X86-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
   3074 ; X86-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
   3075 ; X86-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
   3076 ; X86-SSE-NEXT:    movaps %xmm1, (%esi) # encoding: [0x0f,0x29,0x0e]
   3077 ; X86-SSE-NEXT:    movaps %xmm5, (%edx) # encoding: [0x0f,0x29,0x2a]
   3078 ; X86-SSE-NEXT:    movaps %xmm3, (%ecx) # encoding: [0x0f,0x29,0x19]
   3079 ; X86-SSE-NEXT:    movaps %xmm2, (%eax) # encoding: [0x0f,0x29,0x10]
   3080 ; X86-SSE-NEXT:    popl %esi # encoding: [0x5e]
   3081 ; X86-SSE-NEXT:    retl # encoding: [0xc3]
   3082 ;
   3083 ; X86-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
   3084 ; X86-AVX1:       # %bb.0:
   3085 ; X86-AVX1-NEXT:    pushl %esi # encoding: [0x56]
   3086 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
   3087 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
   3088 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
   3089 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
   3090 ; X86-AVX1-NEXT:    vmovaps (%esi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x06]
   3091 ; X86-AVX1-NEXT:    vmovaps (%edx), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0a]
   3092 ; X86-AVX1-NEXT:    vmovaps (%ecx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x11]
   3093 ; X86-AVX1-NEXT:    vmovaps (%eax), %xmm3 # encoding: [0xc5,0xf8,0x28,0x18]
   3094 ; X86-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
   3095 ; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   3096 ; X86-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
   3097 ; X86-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
   3098 ; X86-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
   3099 ; X86-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   3100 ; X86-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
   3101 ; X86-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
   3102 ; X86-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
   3103 ; X86-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
   3104 ; X86-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
   3105 ; X86-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
   3106 ; X86-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
   3107 ; X86-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
   3108 ; X86-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
   3109 ; X86-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
   3110 ; X86-AVX1-NEXT:    vmovaps %xmm2, (%esi) # encoding: [0xc5,0xf8,0x29,0x16]
   3111 ; X86-AVX1-NEXT:    vmovaps %xmm3, (%edx) # encoding: [0xc5,0xf8,0x29,0x1a]
   3112 ; X86-AVX1-NEXT:    vmovaps %xmm4, (%ecx) # encoding: [0xc5,0xf8,0x29,0x21]
   3113 ; X86-AVX1-NEXT:    vmovaps %xmm0, (%eax) # encoding: [0xc5,0xf8,0x29,0x00]
   3114 ; X86-AVX1-NEXT:    popl %esi # encoding: [0x5e]
   3115 ; X86-AVX1-NEXT:    retl # encoding: [0xc3]
   3116 ;
   3117 ; X86-AVX512-LABEL: test_MM_TRANSPOSE4_PS:
   3118 ; X86-AVX512:       # %bb.0:
   3119 ; X86-AVX512-NEXT:    pushl %esi # encoding: [0x56]
   3120 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x14]
   3121 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x10]
   3122 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c]
   3123 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08]
   3124 ; X86-AVX512-NEXT:    vmovaps (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x06]
   3125 ; X86-AVX512-NEXT:    vmovaps (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0a]
   3126 ; X86-AVX512-NEXT:    vmovaps (%ecx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x11]
   3127 ; X86-AVX512-NEXT:    vmovaps (%eax), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x18]
   3128 ; X86-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
   3129 ; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   3130 ; X86-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
   3131 ; X86-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
   3132 ; X86-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
   3133 ; X86-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   3134 ; X86-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
   3135 ; X86-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
   3136 ; X86-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
   3137 ; X86-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
   3138 ; X86-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
   3139 ; X86-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
   3140 ; X86-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
   3141 ; X86-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
   3142 ; X86-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
   3143 ; X86-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
   3144 ; X86-AVX512-NEXT:    vmovaps %xmm2, (%esi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x16]
   3145 ; X86-AVX512-NEXT:    vmovaps %xmm3, (%edx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1a]
   3146 ; X86-AVX512-NEXT:    vmovaps %xmm4, (%ecx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x21]
   3147 ; X86-AVX512-NEXT:    vmovaps %xmm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x00]
   3148 ; X86-AVX512-NEXT:    popl %esi # encoding: [0x5e]
   3149 ; X86-AVX512-NEXT:    retl # encoding: [0xc3]
   3150 ;
   3151 ; X64-SSE-LABEL: test_MM_TRANSPOSE4_PS:
   3152 ; X64-SSE:       # %bb.0:
   3153 ; X64-SSE-NEXT:    movaps (%rdi), %xmm0 # encoding: [0x0f,0x28,0x07]
   3154 ; X64-SSE-NEXT:    movaps (%rsi), %xmm1 # encoding: [0x0f,0x28,0x0e]
   3155 ; X64-SSE-NEXT:    movaps (%rdx), %xmm2 # encoding: [0x0f,0x28,0x12]
   3156 ; X64-SSE-NEXT:    movaps (%rcx), %xmm3 # encoding: [0x0f,0x28,0x19]
   3157 ; X64-SSE-NEXT:    movaps %xmm0, %xmm4 # encoding: [0x0f,0x28,0xe0]
   3158 ; X64-SSE-NEXT:    unpcklps %xmm1, %xmm4 # encoding: [0x0f,0x14,0xe1]
   3159 ; X64-SSE-NEXT:    # xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
   3160 ; X64-SSE-NEXT:    movaps %xmm2, %xmm5 # encoding: [0x0f,0x28,0xea]
   3161 ; X64-SSE-NEXT:    unpcklps %xmm3, %xmm5 # encoding: [0x0f,0x14,0xeb]
   3162 ; X64-SSE-NEXT:    # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
   3163 ; X64-SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
   3164 ; X64-SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   3165 ; X64-SSE-NEXT:    unpckhps %xmm3, %xmm2 # encoding: [0x0f,0x15,0xd3]
   3166 ; X64-SSE-NEXT:    # xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
   3167 ; X64-SSE-NEXT:    movaps %xmm4, %xmm1 # encoding: [0x0f,0x28,0xcc]
   3168 ; X64-SSE-NEXT:    movlhps %xmm5, %xmm1 # encoding: [0x0f,0x16,0xcd]
   3169 ; X64-SSE-NEXT:    # xmm1 = xmm1[0],xmm5[0]
   3170 ; X64-SSE-NEXT:    movhlps %xmm4, %xmm5 # encoding: [0x0f,0x12,0xec]
   3171 ; X64-SSE-NEXT:    # xmm5 = xmm4[1],xmm5[1]
   3172 ; X64-SSE-NEXT:    movaps %xmm0, %xmm3 # encoding: [0x0f,0x28,0xd8]
   3173 ; X64-SSE-NEXT:    movlhps %xmm2, %xmm3 # encoding: [0x0f,0x16,0xda]
   3174 ; X64-SSE-NEXT:    # xmm3 = xmm3[0],xmm2[0]
   3175 ; X64-SSE-NEXT:    movhlps %xmm0, %xmm2 # encoding: [0x0f,0x12,0xd0]
   3176 ; X64-SSE-NEXT:    # xmm2 = xmm0[1],xmm2[1]
   3177 ; X64-SSE-NEXT:    movaps %xmm1, (%rdi) # encoding: [0x0f,0x29,0x0f]
   3178 ; X64-SSE-NEXT:    movaps %xmm5, (%rsi) # encoding: [0x0f,0x29,0x2e]
   3179 ; X64-SSE-NEXT:    movaps %xmm3, (%rdx) # encoding: [0x0f,0x29,0x1a]
   3180 ; X64-SSE-NEXT:    movaps %xmm2, (%rcx) # encoding: [0x0f,0x29,0x11]
   3181 ; X64-SSE-NEXT:    retq # encoding: [0xc3]
   3182 ;
   3183 ; X64-AVX1-LABEL: test_MM_TRANSPOSE4_PS:
   3184 ; X64-AVX1:       # %bb.0:
   3185 ; X64-AVX1-NEXT:    vmovaps (%rdi), %xmm0 # encoding: [0xc5,0xf8,0x28,0x07]
   3186 ; X64-AVX1-NEXT:    vmovaps (%rsi), %xmm1 # encoding: [0xc5,0xf8,0x28,0x0e]
   3187 ; X64-AVX1-NEXT:    vmovaps (%rdx), %xmm2 # encoding: [0xc5,0xf8,0x28,0x12]
   3188 ; X64-AVX1-NEXT:    vmovaps (%rcx), %xmm3 # encoding: [0xc5,0xf8,0x28,0x19]
   3189 ; X64-AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x14,0xe1]
   3190 ; X64-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   3191 ; X64-AVX1-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # encoding: [0xc5,0xe8,0x14,0xeb]
   3192 ; X64-AVX1-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
   3193 ; X64-AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
   3194 ; X64-AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   3195 ; X64-AVX1-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # encoding: [0xc5,0xe8,0x15,0xcb]
   3196 ; X64-AVX1-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
   3197 ; X64-AVX1-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # encoding: [0xc5,0xd8,0x16,0xd5]
   3198 ; X64-AVX1-NEXT:    # xmm2 = xmm4[0],xmm5[0]
   3199 ; X64-AVX1-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # encoding: [0xc5,0xd9,0x15,0xdd]
   3200 ; X64-AVX1-NEXT:    # xmm3 = xmm4[1],xmm5[1]
   3201 ; X64-AVX1-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # encoding: [0xc5,0xf8,0x16,0xe1]
   3202 ; X64-AVX1-NEXT:    # xmm4 = xmm0[0],xmm1[0]
   3203 ; X64-AVX1-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x15,0xc1]
   3204 ; X64-AVX1-NEXT:    # xmm0 = xmm0[1],xmm1[1]
   3205 ; X64-AVX1-NEXT:    vmovaps %xmm2, (%rdi) # encoding: [0xc5,0xf8,0x29,0x17]
   3206 ; X64-AVX1-NEXT:    vmovaps %xmm3, (%rsi) # encoding: [0xc5,0xf8,0x29,0x1e]
   3207 ; X64-AVX1-NEXT:    vmovaps %xmm4, (%rdx) # encoding: [0xc5,0xf8,0x29,0x22]
   3208 ; X64-AVX1-NEXT:    vmovaps %xmm0, (%rcx) # encoding: [0xc5,0xf8,0x29,0x01]
   3209 ; X64-AVX1-NEXT:    retq # encoding: [0xc3]
   3210 ;
   3211 ; X64-AVX512-LABEL: test_MM_TRANSPOSE4_PS:
   3212 ; X64-AVX512:       # %bb.0:
   3213 ; X64-AVX512-NEXT:    vmovaps (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x07]
   3214 ; X64-AVX512-NEXT:    vmovaps (%rsi), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x0e]
   3215 ; X64-AVX512-NEXT:    vmovaps (%rdx), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x12]
   3216 ; X64-AVX512-NEXT:    vmovaps (%rcx), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x19]
   3217 ; X64-AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xe1]
   3218 ; X64-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   3219 ; X64-AVX512-NEXT:    vunpcklps %xmm3, %xmm2, %xmm5 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x14,0xeb]
   3220 ; X64-AVX512-NEXT:    # xmm5 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
   3221 ; X64-AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
   3222 ; X64-AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   3223 ; X64-AVX512-NEXT:    vunpckhps %xmm3, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x15,0xcb]
   3224 ; X64-AVX512-NEXT:    # xmm1 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
   3225 ; X64-AVX512-NEXT:    vmovlhps %xmm5, %xmm4, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xd8,0x16,0xd5]
   3226 ; X64-AVX512-NEXT:    # xmm2 = xmm4[0],xmm5[0]
   3227 ; X64-AVX512-NEXT:    vunpckhpd %xmm5, %xmm4, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0x15,0xdd]
   3228 ; X64-AVX512-NEXT:    # xmm3 = xmm4[1],xmm5[1]
   3229 ; X64-AVX512-NEXT:    vmovlhps %xmm1, %xmm0, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xe1]
   3230 ; X64-AVX512-NEXT:    # xmm4 = xmm0[0],xmm1[0]
   3231 ; X64-AVX512-NEXT:    vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1]
   3232 ; X64-AVX512-NEXT:    # xmm0 = xmm0[1],xmm1[1]
   3233 ; X64-AVX512-NEXT:    vmovaps %xmm2, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x17]
   3234 ; X64-AVX512-NEXT:    vmovaps %xmm3, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x1e]
   3235 ; X64-AVX512-NEXT:    vmovaps %xmm4, (%rdx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x22]
   3236 ; X64-AVX512-NEXT:    vmovaps %xmm0, (%rcx) # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x29,0x01]
   3237 ; X64-AVX512-NEXT:    retq # encoding: [0xc3]
   3238   %row0 = load <4 x float>, <4 x float>* %a0, align 16
   3239   %row1 = load <4 x float>, <4 x float>* %a1, align 16
   3240   %row2 = load <4 x float>, <4 x float>* %a2, align 16
   3241   %row3 = load <4 x float>, <4 x float>* %a3, align 16
   3242   %tmp0 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   3243   %tmp2 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   3244   %tmp1 = shufflevector <4 x float> %row0, <4 x float> %row1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   3245   %tmp3 = shufflevector <4 x float> %row2, <4 x float> %row3, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   3246   %res0 = shufflevector <4 x float> %tmp0, <4 x float> %tmp2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   3247   %res1 = shufflevector <4 x float> %tmp2, <4 x float> %tmp0, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
   3248   %res2 = shufflevector <4 x float> %tmp1, <4 x float> %tmp3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   3249   %res3 = shufflevector <4 x float> %tmp3, <4 x float> %tmp1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
   3250   store <4 x float> %res0, <4 x float>* %a0, align 16
   3251   store <4 x float> %res1, <4 x float>* %a1, align 16
   3252   store <4 x float> %res2, <4 x float>* %a2, align 16
   3253   store <4 x float> %res3, <4 x float>* %a3, align 16
   3254   ret void
   3255 }
   3256 
; _mm_ucomieq_ss: unordered scalar compare. ucomiss sets ZF/PF/CF; an
; unordered result (NaN operand) also sets ZF, so "equal" must additionally
; require PF clear — hence the sete + setnp + andb sequence before zero-extend.
   3257 define i32 @test_mm_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
   3258 ; SSE-LABEL: test_mm_ucomieq_ss:
   3259 ; SSE:       # %bb.0:
   3260 ; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
   3261 ; SSE-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
   3262 ; SSE-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
   3263 ; SSE-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
   3264 ; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
   3265 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3266 ;
   3267 ; AVX1-LABEL: test_mm_ucomieq_ss:
   3268 ; AVX1:       # %bb.0:
   3269 ; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
   3270 ; AVX1-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
   3271 ; AVX1-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
   3272 ; AVX1-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
   3273 ; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
   3274 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3275 ;
   3276 ; AVX512-LABEL: test_mm_ucomieq_ss:
   3277 ; AVX512:       # %bb.0:
   3278 ; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
   3279 ; AVX512-NEXT:    setnp %al # encoding: [0x0f,0x9b,0xc0]
   3280 ; AVX512-NEXT:    sete %cl # encoding: [0x0f,0x94,0xc1]
   3281 ; AVX512-NEXT:    andb %al, %cl # encoding: [0x20,0xc1]
   3282 ; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
   3283 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3284   %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
   3285   ret i32 %res
   3286 }
   3287 declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
   3288 
; _mm_ucomige_ss: a0 >= a1 maps directly onto CF==0 after ucomiss (setae);
; eax is pre-zeroed so the setcc byte is the whole i32 result.
   3289 define i32 @test_mm_ucomige_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
   3290 ; SSE-LABEL: test_mm_ucomige_ss:
   3291 ; SSE:       # %bb.0:
   3292 ; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3293 ; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
   3294 ; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
   3295 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3296 ;
   3297 ; AVX1-LABEL: test_mm_ucomige_ss:
   3298 ; AVX1:       # %bb.0:
   3299 ; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3300 ; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
   3301 ; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
   3302 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3303 ;
   3304 ; AVX512-LABEL: test_mm_ucomige_ss:
   3305 ; AVX512:       # %bb.0:
   3306 ; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3307 ; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
   3308 ; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
   3309 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3310   %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1)
   3311   ret i32 %res
   3312 }
   3313 declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
   3314 
; _mm_ucomigt_ss: a0 > a1 is CF==0 && ZF==0 after ucomiss, i.e. seta.
   3315 define i32 @test_mm_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
   3316 ; SSE-LABEL: test_mm_ucomigt_ss:
   3317 ; SSE:       # %bb.0:
   3318 ; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3319 ; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
   3320 ; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
   3321 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3322 ;
   3323 ; AVX1-LABEL: test_mm_ucomigt_ss:
   3324 ; AVX1:       # %bb.0:
   3325 ; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3326 ; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
   3327 ; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
   3328 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3329 ;
   3330 ; AVX512-LABEL: test_mm_ucomigt_ss:
   3331 ; AVX512:       # %bb.0:
   3332 ; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3333 ; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
   3334 ; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
   3335 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3336   %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1)
   3337   ret i32 %res
   3338 }
   3339 declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
   3340 
; _mm_ucomile_ss: a0 <= a1 is implemented with the operands swapped
; (ucomiss %xmm0, %xmm1 compares a1 against a0) followed by setae, since
; ucomiss only exposes the "above" direction cleanly via CF.
   3341 define i32 @test_mm_ucomile_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
   3342 ; SSE-LABEL: test_mm_ucomile_ss:
   3343 ; SSE:       # %bb.0:
   3344 ; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3345 ; SSE-NEXT:    ucomiss %xmm0, %xmm1 # encoding: [0x0f,0x2e,0xc8]
   3346 ; SSE-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
   3347 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3348 ;
   3349 ; AVX1-LABEL: test_mm_ucomile_ss:
   3350 ; AVX1:       # %bb.0:
   3351 ; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3352 ; AVX1-NEXT:    vucomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2e,0xc8]
   3353 ; AVX1-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
   3354 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3355 ;
   3356 ; AVX512-LABEL: test_mm_ucomile_ss:
   3357 ; AVX512:       # %bb.0:
   3358 ; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3359 ; AVX512-NEXT:    vucomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
   3360 ; AVX512-NEXT:    setae %al # encoding: [0x0f,0x93,0xc0]
   3361 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3362   %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1)
   3363   ret i32 %res
   3364 }
   3365 declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
   3366 
; _mm_ucomilt_ss: a0 < a1 with swapped operands (a1 compared against a0)
; followed by seta — the mirror of the ucomile lowering above it.
   3367 define i32 @test_mm_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
   3368 ; SSE-LABEL: test_mm_ucomilt_ss:
   3369 ; SSE:       # %bb.0:
   3370 ; SSE-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3371 ; SSE-NEXT:    ucomiss %xmm0, %xmm1 # encoding: [0x0f,0x2e,0xc8]
   3372 ; SSE-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
   3373 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3374 ;
   3375 ; AVX1-LABEL: test_mm_ucomilt_ss:
   3376 ; AVX1:       # %bb.0:
   3377 ; AVX1-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3378 ; AVX1-NEXT:    vucomiss %xmm0, %xmm1 # encoding: [0xc5,0xf8,0x2e,0xc8]
   3379 ; AVX1-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
   3380 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3381 ;
   3382 ; AVX512-LABEL: test_mm_ucomilt_ss:
   3383 ; AVX512:       # %bb.0:
   3384 ; AVX512-NEXT:    xorl %eax, %eax # encoding: [0x31,0xc0]
   3385 ; AVX512-NEXT:    vucomiss %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
   3386 ; AVX512-NEXT:    seta %al # encoding: [0x0f,0x97,0xc0]
   3387 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3388   %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1)
   3389   ret i32 %res
   3390 }
   3391 declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
   3392 
; _mm_ucomineq_ss: "not equal" is true for the unordered (NaN) case too,
; so the result is setne OR'd with setp — the dual of the ucomieq lowering.
   3393 define i32 @test_mm_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) nounwind {
   3394 ; SSE-LABEL: test_mm_ucomineq_ss:
   3395 ; SSE:       # %bb.0:
   3396 ; SSE-NEXT:    ucomiss %xmm1, %xmm0 # encoding: [0x0f,0x2e,0xc1]
   3397 ; SSE-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
   3398 ; SSE-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
   3399 ; SSE-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
   3400 ; SSE-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
   3401 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3402 ;
   3403 ; AVX1-LABEL: test_mm_ucomineq_ss:
   3404 ; AVX1:       # %bb.0:
   3405 ; AVX1-NEXT:    vucomiss %xmm1, %xmm0 # encoding: [0xc5,0xf8,0x2e,0xc1]
   3406 ; AVX1-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
   3407 ; AVX1-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
   3408 ; AVX1-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
   3409 ; AVX1-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
   3410 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3411 ;
   3412 ; AVX512-LABEL: test_mm_ucomineq_ss:
   3413 ; AVX512:       # %bb.0:
   3414 ; AVX512-NEXT:    vucomiss %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
   3415 ; AVX512-NEXT:    setp %al # encoding: [0x0f,0x9a,0xc0]
   3416 ; AVX512-NEXT:    setne %cl # encoding: [0x0f,0x95,0xc1]
   3417 ; AVX512-NEXT:    orb %al, %cl # encoding: [0x08,0xc1]
   3418 ; AVX512-NEXT:    movzbl %cl, %eax # encoding: [0x0f,0xb6,0xc1]
   3419 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3420   %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1)
   3421   ret i32 %res
   3422 }
   3423 declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
   3424 
; _mm_undefined_ps: returning undef must generate no instructions at all —
; the checks confirm the function body is just a ret.
   3425 define <4 x float> @test_mm_undefined_ps() {
   3426 ; CHECK-LABEL: test_mm_undefined_ps:
   3427 ; CHECK:       # %bb.0:
   3428 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3429   ret <4 x float> undef
   3430 }
   3431 
; _mm_unpackhi_ps: shuffle mask <2,6,3,7> interleaves the high halves of the
; two vectors, which should match a single unpckhps/vunpckhps.
   3432 define <4 x float> @test_mm_unpackhi_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
   3433 ; SSE-LABEL: test_mm_unpackhi_ps:
   3434 ; SSE:       # %bb.0:
   3435 ; SSE-NEXT:    unpckhps %xmm1, %xmm0 # encoding: [0x0f,0x15,0xc1]
   3436 ; SSE-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   3437 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3438 ;
   3439 ; AVX1-LABEL: test_mm_unpackhi_ps:
   3440 ; AVX1:       # %bb.0:
   3441 ; AVX1-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x15,0xc1]
   3442 ; AVX1-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   3443 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3444 ;
   3445 ; AVX512-LABEL: test_mm_unpackhi_ps:
   3446 ; AVX512:       # %bb.0:
   3447 ; AVX512-NEXT:    vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1]
   3448 ; AVX512-NEXT:    # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   3449 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3450   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
   3451   ret <4 x float> %res
   3452 }
   3453 
; _mm_unpacklo_ps: shuffle mask <0,4,1,5> interleaves the low halves of the
; two vectors, which should match a single unpcklps/vunpcklps.
   3454 define <4 x float> @test_mm_unpacklo_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
   3455 ; SSE-LABEL: test_mm_unpacklo_ps:
   3456 ; SSE:       # %bb.0:
   3457 ; SSE-NEXT:    unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
   3458 ; SSE-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   3459 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3460 ;
   3461 ; AVX1-LABEL: test_mm_unpacklo_ps:
   3462 ; AVX1:       # %bb.0:
   3463 ; AVX1-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x14,0xc1]
   3464 ; AVX1-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   3465 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3466 ;
   3467 ; AVX512-LABEL: test_mm_unpacklo_ps:
   3468 ; AVX512:       # %bb.0:
   3469 ; AVX512-NEXT:    vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1]
   3470 ; AVX512-NEXT:    # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
   3471 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3472   %res = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
   3473   ret <4 x float> %res
   3474 }
   3475 
; _mm_xor_ps: bitwise xor expressed on <4 x i32> via bitcasts (LLVM has no
; float xor); selection still produces the FP-domain xorps/vxorps.
   3476 define <4 x float> @test_mm_xor_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
   3477 ; SSE-LABEL: test_mm_xor_ps:
   3478 ; SSE:       # %bb.0:
   3479 ; SSE-NEXT:    xorps %xmm1, %xmm0 # encoding: [0x0f,0x57,0xc1]
   3480 ; SSE-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3481 ;
   3482 ; AVX1-LABEL: test_mm_xor_ps:
   3483 ; AVX1:       # %bb.0:
   3484 ; AVX1-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc1]
   3485 ; AVX1-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3486 ;
   3487 ; AVX512-LABEL: test_mm_xor_ps:
   3488 ; AVX512:       # %bb.0:
   3489 ; AVX512-NEXT:    vxorps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x57,0xc1]
   3490 ; AVX512-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
   3491   %arg0 = bitcast <4 x float> %a0 to <4 x i32>
   3492   %arg1 = bitcast <4 x float> %a1 to <4 x i32>
   3493   %res = xor <4 x i32> %arg0, %arg1
   3494   %bc = bitcast <4 x i32> %res to <4 x float>
   3495   ret <4 x float> %bc
   3496 }
   3497 
; Metadata node referenced by the !nontemporal stores in this file; the
; i32 1 payload marks the store as non-temporal per the LLVM LangRef.
   3498 !0 = !{i32 1}
   3499