; (removed code-browser navigation chrome that was scraped into the file)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+sse3 | FileCheck %s --check-prefix=SSE
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s --check-prefix=AVX
      4 
      5 ; Verify that we correctly generate 'addsub' instructions from
      6 ; a sequence of vector extracts + float add/sub + vector inserts.
      7 
define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test1:
; SSE:       # BB#0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
; Full addsub pattern: even lanes (0, 2) compute A - B and odd lanes (1, 3)
; compute A + B, so the whole extract/insert sequence must fold to a single
; ADDSUBPS.
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}
     36 
define <4 x float> @test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test2:
; SSE:       # BB#0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
; Partial pattern: only lane 2 (sub) and lane 3 (add) are defined; the other
; lanes stay undef, which is still compatible with selecting ADDSUBPS.
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 2
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  ret <4 x float> %vecinsert2
}
     57 
define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test3:
; SSE:       # BB#0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
; Lane 0 is a sub and lane 3 is an add. The fadd is written with its operands
; commuted (%4 + %3); since fadd is commutative this must not defeat the
; ADDSUB recognition.
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add = fadd float %4, %3
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 3
  ret <4 x float> %vecinsert2
}
     78 
define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test4:
; SSE:       # BB#0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
; Only lane 2 (sub) and lane 1 (add) are defined; undef lanes are don't-care,
; so ADDSUBPS is still the expected selection.
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 1
  %4 = extractelement <4 x float> %B, i32 1
  %add = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add, i32 1
  ret <4 x float> %vecinsert2
}
     99 
define <4 x float> @test5(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test5:
; SSE:       # BB#0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
; Only lane 0 (sub) and lane 1 (add) are defined - the minimal adjacent
; even/odd pair - and must still fold to ADDSUBPS.
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub2 = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 1
  %4 = extractelement <4 x float> %B, i32 1
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub2, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 1
  ret <4 x float> %vecinsert2
}
    120 
define <4 x float> @test6(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test6:
; SSE:       # BB#0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
; Same lane pattern as @test1 (sub in lanes 0/2, add in lanes 1/3); expected
; to fold to a single ADDSUBPS.
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}
    149 
define <4 x double> @test7(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: test7:
; SSE:       # BB#0:
; SSE-NEXT:    addsubpd %xmm2, %xmm0
; SSE-NEXT:    addsubpd %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test7:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
; 256-bit v4f64 version of the full pattern: AVX folds to one ymm vaddsubpd,
; while SSE legalizes the wide vector into two 128-bit addsubpd instructions.
  %1 = extractelement <4 x double> %A, i32 0
  %2 = extractelement <4 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <4 x double> %A, i32 2
  %4 = extractelement <4 x double> %B, i32 2
  %sub2 = fsub double %3, %4
  %5 = extractelement <4 x double> %A, i32 1
  %6 = extractelement <4 x double> %B, i32 1
  %add = fadd double %5, %6
  %7 = extractelement <4 x double> %A, i32 3
  %8 = extractelement <4 x double> %B, i32 3
  %add2 = fadd double %7, %8
  %vecinsert1 = insertelement <4 x double> undef, double %add, i32 1
  %vecinsert2 = insertelement <4 x double> %vecinsert1, double %add2, i32 3
  %vecinsert3 = insertelement <4 x double> %vecinsert2, double %sub, i32 0
  %vecinsert4 = insertelement <4 x double> %vecinsert3, double %sub2, i32 2
  ret <4 x double> %vecinsert4
}
    179 
define <2 x double> @test8(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: test8:
; SSE:       # BB#0:
; SSE-NEXT:    addsubpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test8:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
; Minimal v2f64 pattern: lane 0 is a sub, lane 1 an add -> single ADDSUBPD.
  %1 = extractelement <2 x double> %A, i32 0
  %2 = extractelement <2 x double> %B, i32 0
  %sub = fsub double %1, %2
  %3 = extractelement <2 x double> %A, i32 1
  %4 = extractelement <2 x double> %B, i32 1
  %add = fadd double %3, %4
  %vecinsert1 = insertelement <2 x double> undef, double %sub, i32 0
  %vecinsert2 = insertelement <2 x double> %vecinsert1, double %add, i32 1
  ret <2 x double> %vecinsert2
}
    200 
define <8 x float> @test9(<8 x float> %A, <8 x float> %B) {
; SSE-LABEL: test9:
; SSE:       # BB#0:
; SSE-NEXT:    addsubps %xmm2, %xmm0
; SSE-NEXT:    addsubps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test9:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT:    retq
; v8f32 full pattern: sub in every even lane (0, 2, 4, 6), add in every odd
; lane (1, 3, 5, 7). AVX folds to one ymm vaddsubps; SSE splits into two
; 128-bit addsubps.
  %1 = extractelement <8 x float> %A, i32 0
  %2 = extractelement <8 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <8 x float> %A, i32 2
  %4 = extractelement <8 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <8 x float> %A, i32 1
  %6 = extractelement <8 x float> %B, i32 1
  %add = fadd float %5, %6
  %7 = extractelement <8 x float> %A, i32 3
  %8 = extractelement <8 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %9 = extractelement <8 x float> %A, i32 4
  %10 = extractelement <8 x float> %B, i32 4
  %sub3 = fsub float %9, %10
  %11 = extractelement <8 x float> %A, i32 6
  %12 = extractelement <8 x float> %B, i32 6
  %sub4 = fsub float %11, %12
  %13 = extractelement <8 x float> %A, i32 5
  %14 = extractelement <8 x float> %B, i32 5
  %add3 = fadd float %13, %14
  %15 = extractelement <8 x float> %A, i32 7
  %16 = extractelement <8 x float> %B, i32 7
  %add4 = fadd float %15, %16
  %vecinsert1 = insertelement <8 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <8 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <8 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <8 x float> %vecinsert3, float %sub2, i32 2
  %vecinsert5 = insertelement <8 x float> %vecinsert4, float %add3, i32 5
  %vecinsert6 = insertelement <8 x float> %vecinsert5, float %add4, i32 7
  %vecinsert7 = insertelement <8 x float> %vecinsert6, float %sub3, i32 4
  %vecinsert8 = insertelement <8 x float> %vecinsert7, float %sub4, i32 6
  ret <8 x float> %vecinsert8
}
    246 
    247 ; Verify that we don't generate addsub instruction for the following
    248 ; functions.
    249 
define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test10:
; SSE:       # BB#0:
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test10:
; AVX:       # BB#0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
; Negative test: only a lone fsub into lane 0 - there is no add half, so the
; expected output is a plain scalar subss, not addsub.
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  ret <4 x float> %vecinsert1
}
    266 
define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test11:
; SSE:       # BB#0:
; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT:    retq
;
; AVX-LABEL: test11:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; AVX-NEXT:    retq
; Negative test: a lone fsub into lane 2 only - no add lane, so no addsub.
  %1 = extractelement <4 x float> %A, i32 2
  %2 = extractelement <4 x float> %B, i32 2
  %sub = fsub float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 2
  ret <4 x float> %vecinsert1
}
    289 
define <4 x float> @test12(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test12:
; SSE:       # BB#0:
; SSE-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    movsldup {{.*#+}} xmm0 = xmm1[0,0,2,2]
; SSE-NEXT:    retq
;
; AVX-LABEL: test12:
; AVX:       # BB#0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; AVX-NEXT:    retq
; Negative test: a lone fadd into lane 1 only - no sub lane, so no addsub.
  %1 = extractelement <4 x float> %A, i32 1
  %2 = extractelement <4 x float> %B, i32 1
  %add = fadd float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  ret <4 x float> %vecinsert1
}
    312 
define <4 x float> @test13(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test13:
; SSE:       # BB#0:
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1,2,0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test13:
; AVX:       # BB#0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
; AVX-NEXT:    retq
; Negative test: a lone fadd into lane 3 only - no sub lane, so no addsub.
  %1 = extractelement <4 x float> %A, i32 3
  %2 = extractelement <4 x float> %B, i32 3
  %add = fadd float %1, %2
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 3
  ret <4 x float> %vecinsert1
}
    336 
define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test14:
; SSE:       # BB#0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm1, %xmm2
; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1,0]
; SSE-NEXT:    shufpd {{.*#+}} xmm1 = xmm1[1,0]
; SSE-NEXT:    subss %xmm1, %xmm0
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,1,1,3]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test14:
; AVX:       # BB#0:
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
; AVX-NEXT:    retq
; Negative test: two fsubs (lanes 0 and 2) and no fadd - the add half of the
; pattern is missing, so addsub must not be formed.
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, %2
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %sub, i32 0
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %sub2, i32 2
  ret <4 x float> %vecinsert2
}
    368 
define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test15:
; SSE:       # BB#0:
; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; SSE-NEXT:    addss %xmm3, %xmm2
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0,2,1]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test15:
; AVX:       # BB#0:
; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovsldup {{.*#+}} xmm1 = xmm2[0,0,2,2]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    retq
; Negative test: two fadds (lanes 1 and 3) and no fsub - the sub half of the
; pattern is missing, so addsub must not be formed.
  %1 = extractelement <4 x float> %A, i32 1
  %2 = extractelement <4 x float> %B, i32 1
  %add = fadd float %1, %2
  %3 = extractelement <4 x float> %A, i32 3
  %4 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %3, %4
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  ret <4 x float> %vecinsert2
}
    404 
define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test16:
; SSE:       # BB#0:
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    subss %xmm0, %xmm2
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    shufpd {{.*#+}} xmm3 = xmm3[1,0]
; SSE-NEXT:    movapd %xmm1, %xmm4
; SSE-NEXT:    shufpd {{.*#+}} xmm4 = xmm4[1,0]
; SSE-NEXT:    subss %xmm4, %xmm3
; SSE-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT:    addss %xmm0, %xmm4
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; SSE-NEXT:    addss %xmm0, %xmm1
; SSE-NEXT:    unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test16:
; AVX:       # BB#0:
; AVX-NEXT:    vsubss %xmm0, %xmm0, %xmm2
; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT:    vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX-NEXT:    vaddss %xmm0, %xmm4, %xmm4
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    retq
; Negative test: the lane-0 fsub and lane-1 fadd use an undef RHS instead of
; the matching element of %B, so per the CHECK lines above no addsub is
; formed and scalar code is emitted. %2 and %6 are deliberately dead
; extracts, kept so the IR shape mirrors the positive tests.
  %1 = extractelement <4 x float> %A, i32 0
  %2 = extractelement <4 x float> %B, i32 0
  %sub = fsub float %1, undef
  %3 = extractelement <4 x float> %A, i32 2
  %4 = extractelement <4 x float> %B, i32 2
  %sub2 = fsub float %3, %4
  %5 = extractelement <4 x float> %A, i32 1
  %6 = extractelement <4 x float> %B, i32 1
  %add = fadd float %5, undef
  %7 = extractelement <4 x float> %A, i32 3
  %8 = extractelement <4 x float> %B, i32 3
  %add2 = fadd float %7, %8
  %vecinsert1 = insertelement <4 x float> undef, float %add, i32 1
  %vecinsert2 = insertelement <4 x float> %vecinsert1, float %add2, i32 3
  %vecinsert3 = insertelement <4 x float> %vecinsert2, float %sub, i32 0
  %vecinsert4 = insertelement <4 x float> %vecinsert3, float %sub2, i32 2
  ret <4 x float> %vecinsert4
}
    459 
define <2 x float> @test_v2f32(<2 x float> %v0, <2 x float> %v1) {
; SSE-LABEL: test_v2f32:
; SSE:       # BB#0:
; SSE-NEXT:    addsubps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f32:
; AVX:       # BB#0:
; AVX-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
; v2f32 case: lane 0 is a sub and lane 1 an add, with the fadd written with
; commuted operands (%v5 + %v4). Per the CHECK lines this still selects a
; single addsubps.
  %v2 = extractelement <2 x float> %v0, i32 0
  %v3 = extractelement <2 x float> %v1, i32 0
  %v4 = extractelement <2 x float> %v0, i32 1
  %v5 = extractelement <2 x float> %v1, i32 1
  %sub = fsub float %v2, %v3
  %add = fadd float %v5, %v4
  %res0 = insertelement <2 x float> undef, float %sub, i32 0
  %res1 = insertelement <2 x float> %res0, float %add, i32 1
  ret <2 x float> %res1
}
    480