; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE2,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X86,SSE,X86-SSE,SSE41,X86-SSE41
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX1,X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X86,AVX,X86-AVX,AVX512,X86-AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE2,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,X64,SSE,X64-SSE,SSE41,X64-SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX1,X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,X64,AVX,X64-AVX,AVX512,X64-AVX512

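; Each RUN line layers FileCheck prefixes from generic to specific (CHECK,
; then X86/X64 for the target arch, SSE/AVX for the ISA family, and finally
; the exact feature level), so each assembly sequence only needs to be
; checked once, under the broadest prefix shared by the configurations
; that produce it.
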
; Ensure that the backend no longer emits unnecessary vector insert
; instructions immediately after SSE scalar fp instructions
; like addss or mulss.
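;
; For example (an illustrative sketch, not one of the generated cases),
; the scalar pattern below should lower to a single addss, with no
; trailing insert/shuffle to merge lane 0 back into %a:
;
;   %x = extractelement <4 x float> %a, i32 0
;   %y = extractelement <4 x float> %b, i32 0
;   %s = fadd float %x, %y
;   %r = insertelement <4 x float> %a, float %s, i32 0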

define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %3 = insertelement <4 x float> %a, float %add, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %sub = fsub float %2, %1
  %3 = insertelement <4 x float> %a, float %sub, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %mul = fmul float %2, %1
  %3 = insertelement <4 x float> %a, float %mul, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %div = fdiv float %2, %1
  %3 = insertelement <4 x float> %a, float %div, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_sqrt_ss(<4 x float> %a) {
; SSE-LABEL: test_sqrt_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sqrt_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = call float @llvm.sqrt.f32(float %1)
  %3 = insertelement <4 x float> %a, float %2, i32 0
  ret <4 x float> %3
}
declare float @llvm.sqrt.f32(float)

define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %add = fadd double %2, %1
  %3 = insertelement <2 x double> %a, double %add, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %sub = fsub double %2, %1
  %3 = insertelement <2 x double> %a, double %sub, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %mul = fmul double %2, %1
  %3 = insertelement <2 x double> %a, double %mul, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %b, i32 0
  %2 = extractelement <2 x double> %a, i32 0
  %div = fdiv double %2, %1
  %3 = insertelement <2 x double> %a, double %div, i32 0
  ret <2 x double> %3
}

define <2 x double> @test_sqrt_sd(<2 x double> %a) {
; SSE-LABEL: test_sqrt_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_sqrt_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = call double @llvm.sqrt.f64(double %1)
  %3 = insertelement <2 x double> %a, double %2, i32 0
  ret <2 x double> %3
}
declare double @llvm.sqrt.f64(double)

define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %add = fadd float %1, %2
  %3 = insertelement <4 x float> %b, float %add, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %sub = fsub float %2, %1
  %3 = insertelement <4 x float> %b, float %sub, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %mul = fmul float %1, %2
  %3 = insertelement <4 x float> %b, float %mul, i32 0
  ret <4 x float> %3
}

define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test2_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %a, i32 0
  %2 = extractelement <4 x float> %b, i32 0
  %div = fdiv float %2, %1
  %3 = insertelement <4 x float> %b, float %div, i32 0
  ret <4 x float> %3
}

define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %add = fadd double %1, %2
  %3 = insertelement <2 x double> %b, double %add, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %sub = fsub double %2, %1
  %3 = insertelement <2 x double> %b, double %sub, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %mul = fmul double %1, %2
  %3 = insertelement <2 x double> %b, double %mul, i32 0
  ret <2 x double> %3
}

define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test2_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test2_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <2 x double> %a, i32 0
  %2 = extractelement <2 x double> %b, i32 0
  %div = fdiv double %2, %1
  %3 = insertelement <2 x double> %b, double %div, i32 0
  ret <2 x double> %3
}

define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %add = fadd float %2, %1
  %add2 = fadd float %2, %add
  %3 = insertelement <4 x float> %a, float %add2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm1, %xmm2
; SSE-NEXT:    subss %xmm2, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %sub = fsub float %2, %1
  %sub2 = fsub float %2, %sub
  %3 = insertelement <4 x float> %a, float %sub2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %mul = fmul float %2, %1
  %mul2 = fmul float %2, %mul
  %3 = insertelement <4 x float> %a, float %mul2, i32 0
  ret <4 x float> %3
}

define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: test_multiple_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    divss %xmm1, %xmm2
; SSE-NEXT:    divss %xmm2, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: test_multiple_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm1
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = extractelement <4 x float> %b, i32 0
  %2 = extractelement <4 x float> %a, i32 0
  %div = fdiv float %2, %1
  %div2 = fdiv float %2, %div
  %3 = insertelement <4 x float> %a, float %div2, i32 0
  ret <4 x float> %3
}

; With SSE4.1 or greater, the shuffles in the following tests may
; be lowered to X86Blendi nodes.
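;
; In these tests the scalar operand arrives via insertelement into undef,
; and the shufflevector mask (<0,5,6,7> for floats, <0,3> for doubles)
; keeps lane 0 of the result and the upper lanes of %a; that lane-0 merge
; is the shuffle that SSE4.1+ can express as an X86Blendi node.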

define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_add_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    addss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_add_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_add_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_add_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fadd float %b, %ext
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_sub_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    subss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_sub_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_sub_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    subss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_sub_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fsub float %ext, %b
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_mul_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    mulss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_mul_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_mul_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    mulss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_mul_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fmul float %b, %ext
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
; X86-SSE-LABEL: blend_div_ss:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    divss %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_div_ss:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_div_ss:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    divss %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_div_ss:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <4 x float> %a, i32 0
  %op = fdiv float %ext, %b
  %ins = insertelement <4 x float> undef, float %op, i32 0
  %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %shuf
}

define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_add_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    addsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_add_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_add_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    addsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_add_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fadd double %b, %ext
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_sub_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    subsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_sub_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_sub_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    subsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_sub_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fsub double %ext, %b
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_mul_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    mulsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_mul_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_mul_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    mulsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_mul_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fmul double %b, %ext
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
; X86-SSE-LABEL: blend_div_sd:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    divsd %xmm1, %xmm0
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: blend_div_sd:
; X86-AVX:       # %bb.0:
; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: blend_div_sd:
; X64-SSE:       # %bb.0:
; X64-SSE-NEXT:    divsd %xmm1, %xmm0
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: blend_div_sd:
; X64-AVX:       # %bb.0:
; X64-AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    retq

  %ext = extractelement <2 x double> %a, i32 0
  %op = fdiv double %ext, %b
  %ins = insertelement <2 x double> undef, double %op, i32 0
  %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuf
}

; Ensure that the backend selects SSE/AVX scalar fp instructions
; from a packed fp instruction plus a vector insert.
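;
; For example, in insert_test_add_ss below the packed fadd computes all
; four lanes, but the shuffle keeps only lane 0 of the result (and lanes
; 1-3 of %a), so the backend can select a single addss instead of addps
; plus a blend.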

define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %a, %b
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %a, %b
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <4 x float> @insert_test2_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test2_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test2_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test2_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test2_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test2_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test2_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

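; The insert_test3 and insert_test4 variants express the same lane-0 merge
; as a select with a constant <i1 false, i1 true, ...> mask: the false lane
; (lane 0) takes the arithmetic result and the true lanes keep the
; unmodified vector, so the same scalar instructions should be selected.
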
define <4 x float> @insert_test3_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test3_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test3_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %a, %b
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
  ret <4 x float> %2
}

define <2 x double> @insert_test3_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test3_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test3_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test3_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %a, %b
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
  ret <2 x double> %2
}

define <4 x float> @insert_test4_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_sub_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <4 x float> @insert_test4_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test4_div_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    divss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %b, %a
  %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
  ret <4 x float> %2
}

define <2 x double> @insert_test4_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_sub_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    subsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

define <2 x double> @insert_test4_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test4_div_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test4_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %b, %a
  %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
  ret <2 x double> %2
}

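; The insert_test5 variants commute the packed operation (%b op %a). For
; the commutative ops (fadd, fmul) this still folds to a single scalar
; instruction, but for fsub and fdiv the operand order matters, so the
; packed op is kept and lane 0 is merged with movss/blendps instead.
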
define <4 x float> @insert_test5_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test5_add_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    addss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_add_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test5_sub_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test5_sub_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_sub_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    subps %xmm0, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_sub_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test5_mul_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test5_mul_ss:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_mul_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <4 x float> @insert_test5_div_ss(<4 x float> %a, <4 x float> %b) {
; SSE2-LABEL: insert_test5_div_ss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    divps %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_div_ss:
; SSE41:       # %bb.0:
; SSE41-NEXT:    divps %xmm0, %xmm1
; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_div_ss:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <4 x float> %b, %a
  %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %2
}

define <2 x double> @insert_test5_add_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test5_add_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    addsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_add_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fadd <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test5_sub_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test5_sub_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subpd %xmm0, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_sub_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    subpd %xmm0, %xmm1
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_sub_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fsub <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test5_mul_sd(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: insert_test5_mul_sd:
; SSE:       # %bb.0:
; SSE-NEXT:    mulsd %xmm1, %xmm0
; SSE-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_mul_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fmul <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

define <2 x double> @insert_test5_div_sd(<2 x double> %a, <2 x double> %b) {
; SSE2-LABEL: insert_test5_div_sd:
; SSE2:       # %bb.0:
; SSE2-NEXT:    divpd %xmm0, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    ret{{[l|q]}}
;
; SSE41-LABEL: insert_test5_div_sd:
; SSE41:       # %bb.0:
; SSE41-NEXT:    divpd %xmm0, %xmm1
; SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE41-NEXT:    ret{{[l|q]}}
;
; AVX-LABEL: insert_test5_div_sd:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT:    ret{{[l|q]}}
  %1 = fdiv <2 x double> %b, %a
  %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %2
}

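; The masked tests exercise the AVX-512 masked scalar forms: with +avx512f
; the whole pattern selects to a single vaddss/vaddsd under a %k1
; write-mask, while pre-AVX512 targets fall back to a test of the mask bit,
; a branch around the scalar op, and a lane-0 blend.
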
define <4 x float> @add_ss_mask(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
; X86-SSE2-LABEL: add_ss_mask:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    jne .LBB70_1
; X86-SSE2-NEXT:  # %bb.2:
; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-SSE2-NEXT:    retl
; X86-SSE2-NEXT:  .LBB70_1:
; X86-SSE2-NEXT:    addss %xmm0, %xmm1
; X86-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE2-NEXT:    retl
;
; X86-SSE41-LABEL: add_ss_mask:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE41-NEXT:    jne .LBB70_1
; X86-SSE41-NEXT:  # %bb.2:
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-SSE41-NEXT:    retl
; X86-SSE41-NEXT:  .LBB70_1:
; X86-SSE41-NEXT:    addss %xmm0, %xmm1
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X86-SSE41-NEXT:    retl
;
; X86-AVX1-LABEL: add_ss_mask:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    je .LBB70_2
; X86-AVX1-NEXT:  # %bb.1:
; X86-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT:  .LBB70_2:
; X86-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: add_ss_mask:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-AVX512-NEXT:    kmovw %eax, %k1
; X86-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
; X86-AVX512-NEXT:    vmovaps %xmm2, %xmm0
; X86-AVX512-NEXT:    retl
;
; X64-SSE2-LABEL: add_ss_mask:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    testb $1, %dil
; X64-SSE2-NEXT:    jne .LBB70_1
; X64-SSE2-NEXT:  # %bb.2:
; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-SSE2-NEXT:    retq
; X64-SSE2-NEXT:  .LBB70_1:
; X64-SSE2-NEXT:    addss %xmm0, %xmm1
; X64-SSE2-NEXT:    movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE2-NEXT:    retq
;
; X64-SSE41-LABEL: add_ss_mask:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    testb $1, %dil
; X64-SSE41-NEXT:    jne .LBB70_1
; X64-SSE41-NEXT:  # %bb.2:
; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-SSE41-NEXT:    retq
; X64-SSE41-NEXT:  .LBB70_1:
; X64-SSE41-NEXT:    addss %xmm0, %xmm1
; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-SSE41-NEXT:    retq
;
; X64-AVX1-LABEL: add_ss_mask:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    testb $1, %dil
; X64-AVX1-NEXT:    je .LBB70_2
; X64-AVX1-NEXT:  # %bb.1:
; X64-AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT:  .LBB70_2:
; X64-AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: add_ss_mask:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    kmovw %edi, %k1
; X64-AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm2 {%k1}
; X64-AVX512-NEXT:    vmovaps %xmm2, %xmm0
; X64-AVX512-NEXT:    retq
  %1 = extractelement <4 x float> %a, i64 0
  %2 = extractelement <4 x float> %b, i64 0
  %3 = fadd float %1, %2
  %4 = extractelement <4 x float> %c, i32 0
  %5 = bitcast i8 %mask to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, float %3, float %4
  %8 = insertelement <4 x float> %a, float %7, i64 0
  ret <4 x float> %8
}

define <2 x double> @add_sd_mask(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
; X86-SSE2-LABEL: add_sd_mask:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE2-NEXT:    jne .LBB71_1
; X86-SSE2-NEXT:  # %bb.2:
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X86-SSE2-NEXT:    retl
; X86-SSE2-NEXT:  .LBB71_1:
; X86-SSE2-NEXT:    addsd %xmm0, %xmm1
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT:    retl
;
; X86-SSE41-LABEL: add_sd_mask:
; X86-SSE41:       # %bb.0:
; X86-SSE41-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-SSE41-NEXT:    jne .LBB71_1
; X86-SSE41-NEXT:  # %bb.2:
; X86-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; X86-SSE41-NEXT:    retl
; X86-SSE41-NEXT:  .LBB71_1:
; X86-SSE41-NEXT:    addsd %xmm0, %xmm1
; X86-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE41-NEXT:    retl
;
; X86-AVX1-LABEL: add_sd_mask:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    testb $1, {{[0-9]+}}(%esp)
; X86-AVX1-NEXT:    je .LBB71_2
; X86-AVX1-NEXT:  # %bb.1:
; X86-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
; X86-AVX1-NEXT:  .LBB71_2:
; X86-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X86-AVX1-NEXT:    retl
;
; X86-AVX512-LABEL: add_sd_mask:
; X86-AVX512:       # %bb.0:
; X86-AVX512-NEXT:    movb {{[0-9]+}}(%esp), %al
; X86-AVX512-NEXT:    kmovw %eax, %k1
; X86-AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; X86-AVX512-NEXT:    vmovapd %xmm2, %xmm0
; X86-AVX512-NEXT:    retl
;
; X64-SSE2-LABEL: add_sd_mask:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    testb $1, %dil
; X64-SSE2-NEXT:    jne .LBB71_1
; X64-SSE2-NEXT:  # %bb.2:
; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X64-SSE2-NEXT:    retq
; X64-SSE2-NEXT:  .LBB71_1:
; X64-SSE2-NEXT:    addsd %xmm0, %xmm1
; X64-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-SSE2-NEXT:    retq
;
; X64-SSE41-LABEL: add_sd_mask:
; X64-SSE41:       # %bb.0:
; X64-SSE41-NEXT:    testb $1, %dil
; X64-SSE41-NEXT:    jne .LBB71_1
; X64-SSE41-NEXT:  # %bb.2:
; X64-SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; X64-SSE41-NEXT:    retq
; X64-SSE41-NEXT:  .LBB71_1:
; X64-SSE41-NEXT:    addsd %xmm0, %xmm1
; X64-SSE41-NEXT:    blendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X64-SSE41-NEXT:    retq
;
; X64-AVX1-LABEL: add_sd_mask:
; X64-AVX1:       # %bb.0:
; X64-AVX1-NEXT:    testb $1, %dil
; X64-AVX1-NEXT:    je .LBB71_2
; X64-AVX1-NEXT:  # %bb.1:
; X64-AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
; X64-AVX1-NEXT:  .LBB71_2:
; X64-AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
; X64-AVX1-NEXT:    retq
;
; X64-AVX512-LABEL: add_sd_mask:
; X64-AVX512:       # %bb.0:
; X64-AVX512-NEXT:    kmovw %edi, %k1
; X64-AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm2 {%k1}
; X64-AVX512-NEXT:    vmovapd %xmm2, %xmm0
; X64-AVX512-NEXT:    retq
  %1 = extractelement <2 x double> %a, i64 0
  %2 = extractelement <2 x double> %b, i64 0
  %3 = fadd double %1, %2
  %4 = extractelement <2 x double> %c, i32 0
  %5 = bitcast i8 %mask to <8 x i1>
  %6 = extractelement <8 x i1> %5, i64 0
  %7 = select i1 %6, double %3, double %4
  %8 = insertelement <2 x double> %a, double %7, i64 0
  ret <2 x double> %8
}