Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
      8 
      9 ;
     10 ; vXf32 (accum)
     11 ;
     12 
     13 define float @test_v2f32(float %a0, <2 x float> %a1) {
     14 ; SSE2-LABEL: test_v2f32:
     15 ; SSE2:       # %bb.0:
     16 ; SSE2-NEXT:    addss %xmm1, %xmm0
     17 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
     18 ; SSE2-NEXT:    addss %xmm1, %xmm0
     19 ; SSE2-NEXT:    retq
     20 ;
     21 ; SSE41-LABEL: test_v2f32:
     22 ; SSE41:       # %bb.0:
     23 ; SSE41-NEXT:    addss %xmm1, %xmm0
     24 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
     25 ; SSE41-NEXT:    addss %xmm1, %xmm0
     26 ; SSE41-NEXT:    retq
     27 ;
     28 ; AVX-LABEL: test_v2f32:
     29 ; AVX:       # %bb.0:
     30 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
     31 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
     32 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
     33 ; AVX-NEXT:    retq
     34 ;
     35 ; AVX512-LABEL: test_v2f32:
     36 ; AVX512:       # %bb.0:
     37 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
     38 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
     39 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
     40 ; AVX512-NEXT:    retq
     41   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float %a0, <2 x float> %a1)
     42   ret float %1
     43 }
     44 
     45 define float @test_v4f32(float %a0, <4 x float> %a1) {
     46 ; SSE2-LABEL: test_v4f32:
     47 ; SSE2:       # %bb.0:
     48 ; SSE2-NEXT:    addss %xmm1, %xmm0
     49 ; SSE2-NEXT:    movaps %xmm1, %xmm2
     50 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
     51 ; SSE2-NEXT:    addss %xmm2, %xmm0
     52 ; SSE2-NEXT:    movaps %xmm1, %xmm2
     53 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
     54 ; SSE2-NEXT:    addss %xmm2, %xmm0
     55 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
     56 ; SSE2-NEXT:    addss %xmm1, %xmm0
     57 ; SSE2-NEXT:    retq
     58 ;
     59 ; SSE41-LABEL: test_v4f32:
     60 ; SSE41:       # %bb.0:
     61 ; SSE41-NEXT:    addss %xmm1, %xmm0
     62 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
     63 ; SSE41-NEXT:    addss %xmm2, %xmm0
     64 ; SSE41-NEXT:    movaps %xmm1, %xmm2
     65 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
     66 ; SSE41-NEXT:    addss %xmm2, %xmm0
     67 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
     68 ; SSE41-NEXT:    addss %xmm1, %xmm0
     69 ; SSE41-NEXT:    retq
     70 ;
     71 ; AVX-LABEL: test_v4f32:
     72 ; AVX:       # %bb.0:
     73 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
     74 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
     75 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
     76 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
     77 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
     78 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
     79 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
     80 ; AVX-NEXT:    retq
     81 ;
     82 ; AVX512-LABEL: test_v4f32:
     83 ; AVX512:       # %bb.0:
     84 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
     85 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
     86 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
     87 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
     88 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
     89 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
     90 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
     91 ; AVX512-NEXT:    retq
     92   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float %a0, <4 x float> %a1)
     93   ret float %1
     94 }
     95 
     96 define float @test_v8f32(float %a0, <8 x float> %a1) {
     97 ; SSE2-LABEL: test_v8f32:
     98 ; SSE2:       # %bb.0:
     99 ; SSE2-NEXT:    addss %xmm1, %xmm0
    100 ; SSE2-NEXT:    movaps %xmm1, %xmm3
    101 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[2,3]
    102 ; SSE2-NEXT:    addss %xmm3, %xmm0
    103 ; SSE2-NEXT:    movaps %xmm1, %xmm3
    104 ; SSE2-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
    105 ; SSE2-NEXT:    addss %xmm3, %xmm0
    106 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    107 ; SSE2-NEXT:    addss %xmm1, %xmm0
    108 ; SSE2-NEXT:    addss %xmm2, %xmm0
    109 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    110 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
    111 ; SSE2-NEXT:    addss %xmm1, %xmm0
    112 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    113 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    114 ; SSE2-NEXT:    addss %xmm1, %xmm0
    115 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    116 ; SSE2-NEXT:    addss %xmm2, %xmm0
    117 ; SSE2-NEXT:    retq
    118 ;
    119 ; SSE41-LABEL: test_v8f32:
    120 ; SSE41:       # %bb.0:
    121 ; SSE41-NEXT:    addss %xmm1, %xmm0
    122 ; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
    123 ; SSE41-NEXT:    addss %xmm3, %xmm0
    124 ; SSE41-NEXT:    movaps %xmm1, %xmm3
    125 ; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
    126 ; SSE41-NEXT:    addss %xmm3, %xmm0
    127 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    128 ; SSE41-NEXT:    addss %xmm1, %xmm0
    129 ; SSE41-NEXT:    addss %xmm2, %xmm0
    130 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    131 ; SSE41-NEXT:    addss %xmm1, %xmm0
    132 ; SSE41-NEXT:    movaps %xmm2, %xmm1
    133 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    134 ; SSE41-NEXT:    addss %xmm1, %xmm0
    135 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    136 ; SSE41-NEXT:    addss %xmm2, %xmm0
    137 ; SSE41-NEXT:    retq
    138 ;
    139 ; AVX-LABEL: test_v8f32:
    140 ; AVX:       # %bb.0:
    141 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    142 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    143 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    144 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    145 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    146 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    147 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    148 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
    149 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    150 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    151 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    152 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    153 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    154 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    155 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    156 ; AVX-NEXT:    vzeroupper
    157 ; AVX-NEXT:    retq
    158 ;
    159 ; AVX512-LABEL: test_v8f32:
    160 ; AVX512:       # %bb.0:
    161 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    162 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    163 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    164 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    165 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    166 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    167 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    168 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
    169 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    170 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    171 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    172 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    173 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    174 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    175 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    176 ; AVX512-NEXT:    vzeroupper
    177 ; AVX512-NEXT:    retq
    178   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float %a0, <8 x float> %a1)
    179   ret float %1
    180 }
    181 
    182 define float @test_v16f32(float %a0, <16 x float> %a1) {
    183 ; SSE2-LABEL: test_v16f32:
    184 ; SSE2:       # %bb.0:
    185 ; SSE2-NEXT:    addss %xmm1, %xmm0
    186 ; SSE2-NEXT:    movaps %xmm1, %xmm5
    187 ; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[2,3]
    188 ; SSE2-NEXT:    addss %xmm5, %xmm0
    189 ; SSE2-NEXT:    movaps %xmm1, %xmm5
    190 ; SSE2-NEXT:    movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
    191 ; SSE2-NEXT:    addss %xmm5, %xmm0
    192 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    193 ; SSE2-NEXT:    addss %xmm1, %xmm0
    194 ; SSE2-NEXT:    addss %xmm2, %xmm0
    195 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    196 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
    197 ; SSE2-NEXT:    addss %xmm1, %xmm0
    198 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    199 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    200 ; SSE2-NEXT:    addss %xmm1, %xmm0
    201 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    202 ; SSE2-NEXT:    addss %xmm2, %xmm0
    203 ; SSE2-NEXT:    addss %xmm3, %xmm0
    204 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    205 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
    206 ; SSE2-NEXT:    addss %xmm1, %xmm0
    207 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    208 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    209 ; SSE2-NEXT:    addss %xmm1, %xmm0
    210 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    211 ; SSE2-NEXT:    addss %xmm3, %xmm0
    212 ; SSE2-NEXT:    addss %xmm4, %xmm0
    213 ; SSE2-NEXT:    movaps %xmm4, %xmm1
    214 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[2,3]
    215 ; SSE2-NEXT:    addss %xmm1, %xmm0
    216 ; SSE2-NEXT:    movaps %xmm4, %xmm1
    217 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
    218 ; SSE2-NEXT:    addss %xmm1, %xmm0
    219 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
    220 ; SSE2-NEXT:    addss %xmm4, %xmm0
    221 ; SSE2-NEXT:    retq
    222 ;
    223 ; SSE41-LABEL: test_v16f32:
    224 ; SSE41:       # %bb.0:
    225 ; SSE41-NEXT:    addss %xmm1, %xmm0
    226 ; SSE41-NEXT:    movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
    227 ; SSE41-NEXT:    addss %xmm5, %xmm0
    228 ; SSE41-NEXT:    movaps %xmm1, %xmm5
    229 ; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
    230 ; SSE41-NEXT:    addss %xmm5, %xmm0
    231 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    232 ; SSE41-NEXT:    addss %xmm1, %xmm0
    233 ; SSE41-NEXT:    addss %xmm2, %xmm0
    234 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    235 ; SSE41-NEXT:    addss %xmm1, %xmm0
    236 ; SSE41-NEXT:    movaps %xmm2, %xmm1
    237 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    238 ; SSE41-NEXT:    addss %xmm1, %xmm0
    239 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    240 ; SSE41-NEXT:    addss %xmm2, %xmm0
    241 ; SSE41-NEXT:    addss %xmm3, %xmm0
    242 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
    243 ; SSE41-NEXT:    addss %xmm1, %xmm0
    244 ; SSE41-NEXT:    movaps %xmm3, %xmm1
    245 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    246 ; SSE41-NEXT:    addss %xmm1, %xmm0
    247 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    248 ; SSE41-NEXT:    addss %xmm3, %xmm0
    249 ; SSE41-NEXT:    addss %xmm4, %xmm0
    250 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
    251 ; SSE41-NEXT:    addss %xmm1, %xmm0
    252 ; SSE41-NEXT:    movaps %xmm4, %xmm1
    253 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
    254 ; SSE41-NEXT:    addss %xmm1, %xmm0
    255 ; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
    256 ; SSE41-NEXT:    addss %xmm4, %xmm0
    257 ; SSE41-NEXT:    retq
    258 ;
    259 ; AVX-LABEL: test_v16f32:
    260 ; AVX:       # %bb.0:
    261 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    262 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
    263 ; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
    264 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
    265 ; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
    266 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
    267 ; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
    268 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
    269 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    270 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
    271 ; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
    272 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
    273 ; AVX-NEXT:    vaddss %xmm3, %xmm0, %xmm0
    274 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    275 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    276 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    277 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    278 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    279 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
    280 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    281 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
    282 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    283 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
    284 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    285 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    286 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    287 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    288 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    289 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    290 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    291 ; AVX-NEXT:    vzeroupper
    292 ; AVX-NEXT:    retq
    293 ;
    294 ; AVX512-LABEL: test_v16f32:
    295 ; AVX512:       # %bb.0:
    296 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    297 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    298 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    299 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    300 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    301 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    302 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    303 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
    304 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    305 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    306 ; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
    307 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    308 ; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
    309 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    310 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    311 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
    312 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    313 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    314 ; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
    315 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    316 ; AVX512-NEXT:    vaddss %xmm3, %xmm0, %xmm0
    317 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    318 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    319 ; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
    320 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    321 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    322 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    323 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    324 ; AVX512-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    325 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    326 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    327 ; AVX512-NEXT:    vzeroupper
    328 ; AVX512-NEXT:    retq
    329   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float %a0, <16 x float> %a1)
    330   ret float %1
    331 }
    332 
    333 ;
    334 ; vXf32 (zero)
    335 ;
    336 
    337 define float @test_v2f32_zero(<2 x float> %a0) {
    338 ; SSE2-LABEL: test_v2f32_zero:
    339 ; SSE2:       # %bb.0:
    340 ; SSE2-NEXT:    xorps %xmm1, %xmm1
    341 ; SSE2-NEXT:    addss %xmm0, %xmm1
    342 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
    343 ; SSE2-NEXT:    addss %xmm1, %xmm0
    344 ; SSE2-NEXT:    retq
    345 ;
    346 ; SSE41-LABEL: test_v2f32_zero:
    347 ; SSE41:       # %bb.0:
    348 ; SSE41-NEXT:    xorps %xmm1, %xmm1
    349 ; SSE41-NEXT:    addss %xmm0, %xmm1
    350 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
    351 ; SSE41-NEXT:    addss %xmm1, %xmm0
    352 ; SSE41-NEXT:    retq
    353 ;
    354 ; AVX-LABEL: test_v2f32_zero:
    355 ; AVX:       # %bb.0:
    356 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    357 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
    358 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
    359 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    360 ; AVX-NEXT:    retq
    361 ;
    362 ; AVX512-LABEL: test_v2f32_zero:
    363 ; AVX512:       # %bb.0:
    364 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    365 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
    366 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
    367 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    368 ; AVX512-NEXT:    retq
    369   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float 0.0, <2 x float> %a0)
    370   ret float %1
    371 }
    372 
    373 define float @test_v4f32_zero(<4 x float> %a0) {
    374 ; SSE2-LABEL: test_v4f32_zero:
    375 ; SSE2:       # %bb.0:
    376 ; SSE2-NEXT:    xorps %xmm1, %xmm1
    377 ; SSE2-NEXT:    addss %xmm0, %xmm1
    378 ; SSE2-NEXT:    movaps %xmm0, %xmm2
    379 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
    380 ; SSE2-NEXT:    addss %xmm1, %xmm2
    381 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    382 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    383 ; SSE2-NEXT:    addss %xmm2, %xmm1
    384 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    385 ; SSE2-NEXT:    addss %xmm1, %xmm0
    386 ; SSE2-NEXT:    retq
    387 ;
    388 ; SSE41-LABEL: test_v4f32_zero:
    389 ; SSE41:       # %bb.0:
    390 ; SSE41-NEXT:    xorps %xmm1, %xmm1
    391 ; SSE41-NEXT:    addss %xmm0, %xmm1
    392 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    393 ; SSE41-NEXT:    addss %xmm1, %xmm2
    394 ; SSE41-NEXT:    movaps %xmm0, %xmm1
    395 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    396 ; SSE41-NEXT:    addss %xmm2, %xmm1
    397 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    398 ; SSE41-NEXT:    addss %xmm1, %xmm0
    399 ; SSE41-NEXT:    retq
    400 ;
    401 ; AVX-LABEL: test_v4f32_zero:
    402 ; AVX:       # %bb.0:
    403 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    404 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
    405 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    406 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    407 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    408 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    409 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    410 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    411 ; AVX-NEXT:    retq
    412 ;
    413 ; AVX512-LABEL: test_v4f32_zero:
    414 ; AVX512:       # %bb.0:
    415 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    416 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
    417 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    418 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    419 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    420 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    421 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    422 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    423 ; AVX512-NEXT:    retq
    424   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float 0.0, <4 x float> %a0)
    425   ret float %1
    426 }
    427 
    428 define float @test_v8f32_zero(<8 x float> %a0) {
    429 ; SSE2-LABEL: test_v8f32_zero:
    430 ; SSE2:       # %bb.0:
    431 ; SSE2-NEXT:    xorps %xmm2, %xmm2
    432 ; SSE2-NEXT:    addss %xmm0, %xmm2
    433 ; SSE2-NEXT:    movaps %xmm0, %xmm3
    434 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm0[2,3]
    435 ; SSE2-NEXT:    addss %xmm2, %xmm3
    436 ; SSE2-NEXT:    movaps %xmm0, %xmm2
    437 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
    438 ; SSE2-NEXT:    addss %xmm3, %xmm2
    439 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    440 ; SSE2-NEXT:    addss %xmm2, %xmm0
    441 ; SSE2-NEXT:    addss %xmm1, %xmm0
    442 ; SSE2-NEXT:    movaps %xmm1, %xmm2
    443 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
    444 ; SSE2-NEXT:    addss %xmm2, %xmm0
    445 ; SSE2-NEXT:    movaps %xmm1, %xmm2
    446 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    447 ; SSE2-NEXT:    addss %xmm2, %xmm0
    448 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    449 ; SSE2-NEXT:    addss %xmm1, %xmm0
    450 ; SSE2-NEXT:    retq
    451 ;
    452 ; SSE41-LABEL: test_v8f32_zero:
    453 ; SSE41:       # %bb.0:
    454 ; SSE41-NEXT:    xorps %xmm2, %xmm2
    455 ; SSE41-NEXT:    addss %xmm0, %xmm2
    456 ; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
    457 ; SSE41-NEXT:    addss %xmm2, %xmm3
    458 ; SSE41-NEXT:    movaps %xmm0, %xmm2
    459 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
    460 ; SSE41-NEXT:    addss %xmm3, %xmm2
    461 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    462 ; SSE41-NEXT:    addss %xmm2, %xmm0
    463 ; SSE41-NEXT:    addss %xmm1, %xmm0
    464 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    465 ; SSE41-NEXT:    addss %xmm2, %xmm0
    466 ; SSE41-NEXT:    movaps %xmm1, %xmm2
    467 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    468 ; SSE41-NEXT:    addss %xmm2, %xmm0
    469 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    470 ; SSE41-NEXT:    addss %xmm1, %xmm0
    471 ; SSE41-NEXT:    retq
    472 ;
    473 ; AVX-LABEL: test_v8f32_zero:
    474 ; AVX:       # %bb.0:
    475 ; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    476 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm1
    477 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    478 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    479 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    480 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    481 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    482 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    483 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
    484 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm1
    485 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    486 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    487 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    488 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    489 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    490 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    491 ; AVX-NEXT:    vzeroupper
    492 ; AVX-NEXT:    retq
    493 ;
    494 ; AVX512-LABEL: test_v8f32_zero:
    495 ; AVX512:       # %bb.0:
    496 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    497 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
    498 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    499 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    500 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    501 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    502 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    503 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    504 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
    505 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
    506 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    507 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    508 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    509 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    510 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    511 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    512 ; AVX512-NEXT:    vzeroupper
    513 ; AVX512-NEXT:    retq
    514   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float 0.0, <8 x float> %a0)
    515   ret float %1
    516 }
    517 
    518 define float @test_v16f32_zero(<16 x float> %a0) {
    519 ; SSE2-LABEL: test_v16f32_zero:
    520 ; SSE2:       # %bb.0:
    521 ; SSE2-NEXT:    xorps %xmm4, %xmm4
    522 ; SSE2-NEXT:    addss %xmm0, %xmm4
    523 ; SSE2-NEXT:    movaps %xmm0, %xmm5
    524 ; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm0[2,3]
    525 ; SSE2-NEXT:    addss %xmm4, %xmm5
    526 ; SSE2-NEXT:    movaps %xmm0, %xmm4
    527 ; SSE2-NEXT:    movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
    528 ; SSE2-NEXT:    addss %xmm5, %xmm4
    529 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    530 ; SSE2-NEXT:    addss %xmm4, %xmm0
    531 ; SSE2-NEXT:    addss %xmm1, %xmm0
    532 ; SSE2-NEXT:    movaps %xmm1, %xmm4
    533 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3]
    534 ; SSE2-NEXT:    addss %xmm4, %xmm0
    535 ; SSE2-NEXT:    movaps %xmm1, %xmm4
    536 ; SSE2-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
    537 ; SSE2-NEXT:    addss %xmm4, %xmm0
    538 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    539 ; SSE2-NEXT:    addss %xmm1, %xmm0
    540 ; SSE2-NEXT:    addss %xmm2, %xmm0
    541 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    542 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
    543 ; SSE2-NEXT:    addss %xmm1, %xmm0
    544 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    545 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    546 ; SSE2-NEXT:    addss %xmm1, %xmm0
    547 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    548 ; SSE2-NEXT:    addss %xmm2, %xmm0
    549 ; SSE2-NEXT:    addss %xmm3, %xmm0
    550 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    551 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
    552 ; SSE2-NEXT:    addss %xmm1, %xmm0
    553 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    554 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    555 ; SSE2-NEXT:    addss %xmm1, %xmm0
    556 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    557 ; SSE2-NEXT:    addss %xmm3, %xmm0
    558 ; SSE2-NEXT:    retq
    559 ;
    560 ; SSE41-LABEL: test_v16f32_zero:
    561 ; SSE41:       # %bb.0:
    562 ; SSE41-NEXT:    xorps %xmm4, %xmm4
    563 ; SSE41-NEXT:    addss %xmm0, %xmm4
    564 ; SSE41-NEXT:    movshdup {{.*#+}} xmm5 = xmm0[1,1,3,3]
    565 ; SSE41-NEXT:    addss %xmm4, %xmm5
    566 ; SSE41-NEXT:    movaps %xmm0, %xmm4
    567 ; SSE41-NEXT:    movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
    568 ; SSE41-NEXT:    addss %xmm5, %xmm4
    569 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    570 ; SSE41-NEXT:    addss %xmm4, %xmm0
    571 ; SSE41-NEXT:    addss %xmm1, %xmm0
    572 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
    573 ; SSE41-NEXT:    addss %xmm4, %xmm0
    574 ; SSE41-NEXT:    movaps %xmm1, %xmm4
    575 ; SSE41-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
    576 ; SSE41-NEXT:    addss %xmm4, %xmm0
    577 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    578 ; SSE41-NEXT:    addss %xmm1, %xmm0
    579 ; SSE41-NEXT:    addss %xmm2, %xmm0
    580 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    581 ; SSE41-NEXT:    addss %xmm1, %xmm0
    582 ; SSE41-NEXT:    movaps %xmm2, %xmm1
    583 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    584 ; SSE41-NEXT:    addss %xmm1, %xmm0
    585 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    586 ; SSE41-NEXT:    addss %xmm2, %xmm0
    587 ; SSE41-NEXT:    addss %xmm3, %xmm0
    588 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
    589 ; SSE41-NEXT:    addss %xmm1, %xmm0
    590 ; SSE41-NEXT:    movaps %xmm3, %xmm1
    591 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    592 ; SSE41-NEXT:    addss %xmm1, %xmm0
    593 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    594 ; SSE41-NEXT:    addss %xmm3, %xmm0
    595 ; SSE41-NEXT:    retq
    596 ;
    597 ; AVX-LABEL: test_v16f32_zero:
    598 ; AVX:       # %bb.0:
    599 ; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
    600 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm2
    601 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
    602 ; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
    603 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    604 ; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
    605 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
    606 ; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
    607 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
    608 ; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm2
    609 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
    610 ; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
    611 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    612 ; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
    613 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    614 ; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm0
    615 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    616 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    617 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    618 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    619 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    620 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    621 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    622 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
    623 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    624 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    625 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    626 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    627 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    628 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    629 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    630 ; AVX-NEXT:    vzeroupper
    631 ; AVX-NEXT:    retq
    632 ;
    633 ; AVX512-LABEL: test_v16f32_zero:
    634 ; AVX512:       # %bb.0:
    635 ; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
    636 ; AVX512-NEXT:    vaddss %xmm1, %xmm0, %xmm1
    637 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    638 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    639 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    640 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    641 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    642 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    643 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
    644 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    645 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    646 ; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
    647 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    648 ; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
    649 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    650 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    651 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
    652 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    653 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    654 ; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
    655 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    656 ; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
    657 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    658 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    659 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
    660 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
    661 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    662 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    663 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    664 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    665 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    666 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    667 ; AVX512-NEXT:    vzeroupper
    668 ; AVX512-NEXT:    retq
    669   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float 0.0, <16 x float> %a0)
    670   ret float %1
    671 }
    672 
    673 ;
    674 ; vXf32 (undef)
    675 ;
    676 
    677 define float @test_v2f32_undef(<2 x float> %a0) {
    678 ; SSE2-LABEL: test_v2f32_undef:
    679 ; SSE2:       # %bb.0:
    680 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
    681 ; SSE2-NEXT:    addss {{.*}}(%rip), %xmm0
    682 ; SSE2-NEXT:    retq
    683 ;
    684 ; SSE41-LABEL: test_v2f32_undef:
    685 ; SSE41:       # %bb.0:
    686 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
    687 ; SSE41-NEXT:    addss {{.*}}(%rip), %xmm0
    688 ; SSE41-NEXT:    retq
    689 ;
    690 ; AVX-LABEL: test_v2f32_undef:
    691 ; AVX:       # %bb.0:
    692 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
    693 ; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
    694 ; AVX-NEXT:    retq
    695 ;
    696 ; AVX512-LABEL: test_v2f32_undef:
    697 ; AVX512:       # %bb.0:
    698 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
    699 ; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm0, %xmm0
    700 ; AVX512-NEXT:    retq
    701   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float undef, <2 x float> %a0)
    702   ret float %1
    703 }
    704 
    705 define float @test_v4f32_undef(<4 x float> %a0) {
    706 ; SSE2-LABEL: test_v4f32_undef:
    707 ; SSE2:       # %bb.0:
    708 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    709 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
    710 ; SSE2-NEXT:    addss {{.*}}(%rip), %xmm1
    711 ; SSE2-NEXT:    movaps %xmm0, %xmm2
    712 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
    713 ; SSE2-NEXT:    addss %xmm1, %xmm2
    714 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    715 ; SSE2-NEXT:    addss %xmm2, %xmm0
    716 ; SSE2-NEXT:    retq
    717 ;
    718 ; SSE41-LABEL: test_v4f32_undef:
    719 ; SSE41:       # %bb.0:
    720 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    721 ; SSE41-NEXT:    addss {{.*}}(%rip), %xmm1
    722 ; SSE41-NEXT:    movaps %xmm0, %xmm2
    723 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
    724 ; SSE41-NEXT:    addss %xmm1, %xmm2
    725 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    726 ; SSE41-NEXT:    addss %xmm2, %xmm0
    727 ; SSE41-NEXT:    retq
    728 ;
    729 ; AVX-LABEL: test_v4f32_undef:
    730 ; AVX:       # %bb.0:
    731 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    732 ; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
    733 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    734 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    735 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    736 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    737 ; AVX-NEXT:    retq
    738 ;
    739 ; AVX512-LABEL: test_v4f32_undef:
    740 ; AVX512:       # %bb.0:
    741 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    742 ; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
    743 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    744 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    745 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    746 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    747 ; AVX512-NEXT:    retq
    748   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> %a0)
    749   ret float %1
    750 }
    751 
    752 define float @test_v8f32_undef(<8 x float> %a0) {
    753 ; SSE2-LABEL: test_v8f32_undef:
    754 ; SSE2:       # %bb.0:
    755 ; SSE2-NEXT:    movaps %xmm0, %xmm2
    756 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
    757 ; SSE2-NEXT:    addss {{.*}}(%rip), %xmm2
    758 ; SSE2-NEXT:    movaps %xmm0, %xmm3
    759 ; SSE2-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
    760 ; SSE2-NEXT:    addss %xmm2, %xmm3
    761 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    762 ; SSE2-NEXT:    addss %xmm3, %xmm0
    763 ; SSE2-NEXT:    addss %xmm1, %xmm0
    764 ; SSE2-NEXT:    movaps %xmm1, %xmm2
    765 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
    766 ; SSE2-NEXT:    addss %xmm2, %xmm0
    767 ; SSE2-NEXT:    movaps %xmm1, %xmm2
    768 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    769 ; SSE2-NEXT:    addss %xmm2, %xmm0
    770 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    771 ; SSE2-NEXT:    addss %xmm1, %xmm0
    772 ; SSE2-NEXT:    retq
    773 ;
    774 ; SSE41-LABEL: test_v8f32_undef:
    775 ; SSE41:       # %bb.0:
    776 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    777 ; SSE41-NEXT:    addss {{.*}}(%rip), %xmm2
    778 ; SSE41-NEXT:    movaps %xmm0, %xmm3
    779 ; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
    780 ; SSE41-NEXT:    addss %xmm2, %xmm3
    781 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    782 ; SSE41-NEXT:    addss %xmm3, %xmm0
    783 ; SSE41-NEXT:    addss %xmm1, %xmm0
    784 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    785 ; SSE41-NEXT:    addss %xmm2, %xmm0
    786 ; SSE41-NEXT:    movaps %xmm1, %xmm2
    787 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    788 ; SSE41-NEXT:    addss %xmm2, %xmm0
    789 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    790 ; SSE41-NEXT:    addss %xmm1, %xmm0
    791 ; SSE41-NEXT:    retq
    792 ;
    793 ; AVX-LABEL: test_v8f32_undef:
    794 ; AVX:       # %bb.0:
    795 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    796 ; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
    797 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    798 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    799 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    800 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    801 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
    802 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm1
    803 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    804 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    805 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    806 ; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    807 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    808 ; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    809 ; AVX-NEXT:    vzeroupper
    810 ; AVX-NEXT:    retq
    811 ;
    812 ; AVX512-LABEL: test_v8f32_undef:
    813 ; AVX512:       # %bb.0:
    814 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    815 ; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
    816 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    817 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    818 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    819 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    820 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
    821 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
    822 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    823 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    824 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    825 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    826 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    827 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    828 ; AVX512-NEXT:    vzeroupper
    829 ; AVX512-NEXT:    retq
    830   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float undef, <8 x float> %a0)
    831   ret float %1
    832 }
    833 
    834 define float @test_v16f32_undef(<16 x float> %a0) {
    835 ; SSE2-LABEL: test_v16f32_undef:
    836 ; SSE2:       # %bb.0:
    837 ; SSE2-NEXT:    movaps %xmm0, %xmm4
    838 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3]
    839 ; SSE2-NEXT:    addss {{.*}}(%rip), %xmm4
    840 ; SSE2-NEXT:    movaps %xmm0, %xmm5
    841 ; SSE2-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
    842 ; SSE2-NEXT:    addss %xmm4, %xmm5
    843 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    844 ; SSE2-NEXT:    addss %xmm5, %xmm0
    845 ; SSE2-NEXT:    addss %xmm1, %xmm0
    846 ; SSE2-NEXT:    movaps %xmm1, %xmm4
    847 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3]
    848 ; SSE2-NEXT:    addss %xmm4, %xmm0
    849 ; SSE2-NEXT:    movaps %xmm1, %xmm4
    850 ; SSE2-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
    851 ; SSE2-NEXT:    addss %xmm4, %xmm0
    852 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    853 ; SSE2-NEXT:    addss %xmm1, %xmm0
    854 ; SSE2-NEXT:    addss %xmm2, %xmm0
    855 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    856 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
    857 ; SSE2-NEXT:    addss %xmm1, %xmm0
    858 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    859 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    860 ; SSE2-NEXT:    addss %xmm1, %xmm0
    861 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    862 ; SSE2-NEXT:    addss %xmm2, %xmm0
    863 ; SSE2-NEXT:    addss %xmm3, %xmm0
    864 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    865 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
    866 ; SSE2-NEXT:    addss %xmm1, %xmm0
    867 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    868 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    869 ; SSE2-NEXT:    addss %xmm1, %xmm0
    870 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    871 ; SSE2-NEXT:    addss %xmm3, %xmm0
    872 ; SSE2-NEXT:    retq
    873 ;
    874 ; SSE41-LABEL: test_v16f32_undef:
    875 ; SSE41:       # %bb.0:
    876 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
    877 ; SSE41-NEXT:    addss {{.*}}(%rip), %xmm4
    878 ; SSE41-NEXT:    movaps %xmm0, %xmm5
    879 ; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
    880 ; SSE41-NEXT:    addss %xmm4, %xmm5
    881 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    882 ; SSE41-NEXT:    addss %xmm5, %xmm0
    883 ; SSE41-NEXT:    addss %xmm1, %xmm0
    884 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
    885 ; SSE41-NEXT:    addss %xmm4, %xmm0
    886 ; SSE41-NEXT:    movaps %xmm1, %xmm4
    887 ; SSE41-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
    888 ; SSE41-NEXT:    addss %xmm4, %xmm0
    889 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    890 ; SSE41-NEXT:    addss %xmm1, %xmm0
    891 ; SSE41-NEXT:    addss %xmm2, %xmm0
    892 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    893 ; SSE41-NEXT:    addss %xmm1, %xmm0
    894 ; SSE41-NEXT:    movaps %xmm2, %xmm1
    895 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    896 ; SSE41-NEXT:    addss %xmm1, %xmm0
    897 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    898 ; SSE41-NEXT:    addss %xmm2, %xmm0
    899 ; SSE41-NEXT:    addss %xmm3, %xmm0
    900 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
    901 ; SSE41-NEXT:    addss %xmm1, %xmm0
    902 ; SSE41-NEXT:    movaps %xmm3, %xmm1
    903 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    904 ; SSE41-NEXT:    addss %xmm1, %xmm0
    905 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    906 ; SSE41-NEXT:    addss %xmm3, %xmm0
    907 ; SSE41-NEXT:    retq
    908 ;
    909 ; AVX-LABEL: test_v16f32_undef:
    910 ; AVX:       # %bb.0:
    911 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    912 ; AVX-NEXT:    vaddss {{.*}}(%rip), %xmm2, %xmm2
    913 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    914 ; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
    915 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
    916 ; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
    917 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
    918 ; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm2
    919 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
    920 ; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
    921 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    922 ; AVX-NEXT:    vaddss %xmm3, %xmm2, %xmm2
    923 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    924 ; AVX-NEXT:    vaddss %xmm0, %xmm2, %xmm0
    925 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    926 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    927 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    928 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    929 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    930 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    931 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    932 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
    933 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    934 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    935 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    936 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    937 ; AVX-NEXT:    vaddss %xmm2, %xmm0, %xmm0
    938 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    939 ; AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
    940 ; AVX-NEXT:    vzeroupper
    941 ; AVX-NEXT:    retq
    942 ;
    943 ; AVX512-LABEL: test_v16f32_undef:
    944 ; AVX512:       # %bb.0:
    945 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    946 ; AVX512-NEXT:    vaddss {{.*}}(%rip), %xmm1, %xmm1
    947 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    948 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    949 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    950 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    951 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
    952 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    953 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    954 ; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
    955 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    956 ; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
    957 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    958 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    959 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
    960 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    961 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    962 ; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
    963 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    964 ; AVX512-NEXT:    vaddss %xmm3, %xmm1, %xmm1
    965 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    966 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    967 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
    968 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm1
    969 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    970 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    971 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    972 ; AVX512-NEXT:    vaddss %xmm2, %xmm1, %xmm1
    973 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    974 ; AVX512-NEXT:    vaddss %xmm0, %xmm1, %xmm0
    975 ; AVX512-NEXT:    vzeroupper
    976 ; AVX512-NEXT:    retq
    977   %1 = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float undef, <16 x float> %a0)
    978   ret float %1
    979 }
    980 
    981 ;
    982 ; vXf64 (accum)
    983 ;
    984 
    985 define double @test_v2f64(double %a0, <2 x double> %a1) {
    986 ; SSE-LABEL: test_v2f64:
    987 ; SSE:       # %bb.0:
    988 ; SSE-NEXT:    addsd %xmm1, %xmm0
    989 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
    990 ; SSE-NEXT:    addsd %xmm1, %xmm0
    991 ; SSE-NEXT:    retq
    992 ;
    993 ; AVX-LABEL: test_v2f64:
    994 ; AVX:       # %bb.0:
    995 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
    996 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
    997 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
    998 ; AVX-NEXT:    retq
    999 ;
   1000 ; AVX512-LABEL: test_v2f64:
   1001 ; AVX512:       # %bb.0:
   1002 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1003 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1004 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1005 ; AVX512-NEXT:    retq
   1006   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double %a0, <2 x double> %a1)
   1007   ret double %1
   1008 }
   1009 
   1010 define double @test_v4f64(double %a0, <4 x double> %a1) {
   1011 ; SSE-LABEL: test_v4f64:
   1012 ; SSE:       # %bb.0:
   1013 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1014 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1015 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1016 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1017 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1018 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1019 ; SSE-NEXT:    retq
   1020 ;
   1021 ; AVX-LABEL: test_v4f64:
   1022 ; AVX:       # %bb.0:
   1023 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1024 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1025 ; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1026 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1027 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1028 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1029 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1030 ; AVX-NEXT:    vzeroupper
   1031 ; AVX-NEXT:    retq
   1032 ;
   1033 ; AVX512-LABEL: test_v4f64:
   1034 ; AVX512:       # %bb.0:
   1035 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1036 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1037 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1038 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1039 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1040 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1041 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1042 ; AVX512-NEXT:    vzeroupper
   1043 ; AVX512-NEXT:    retq
   1044   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double %a0, <4 x double> %a1)
   1045   ret double %1
   1046 }
   1047 
   1048 define double @test_v8f64(double %a0, <8 x double> %a1) {
   1049 ; SSE-LABEL: test_v8f64:
   1050 ; SSE:       # %bb.0:
   1051 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1052 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1053 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1054 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1055 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1056 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1057 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1058 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1059 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1060 ; SSE-NEXT:    addsd %xmm4, %xmm0
   1061 ; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
   1062 ; SSE-NEXT:    addsd %xmm4, %xmm0
   1063 ; SSE-NEXT:    retq
   1064 ;
   1065 ; AVX-LABEL: test_v8f64:
   1066 ; AVX:       # %bb.0:
   1067 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1068 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
   1069 ; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
   1070 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1071 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1072 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1073 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1074 ; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1075 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1076 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1077 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1078 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1079 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1080 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1081 ; AVX-NEXT:    vzeroupper
   1082 ; AVX-NEXT:    retq
   1083 ;
   1084 ; AVX512-LABEL: test_v8f64:
   1085 ; AVX512:       # %bb.0:
   1086 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1087 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1088 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1089 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1090 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1091 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1092 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1093 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
   1094 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1095 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1096 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1097 ; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
   1098 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1099 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1100 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1101 ; AVX512-NEXT:    vzeroupper
   1102 ; AVX512-NEXT:    retq
   1103   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double %a0, <8 x double> %a1)
   1104   ret double %1
   1105 }
   1106 
   1107 define double @test_v16f64(double %a0, <16 x double> %a1) {
   1108 ; SSE-LABEL: test_v16f64:
   1109 ; SSE:       # %bb.0:
   1110 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
   1111 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1112 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1113 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1114 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1115 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1116 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1117 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1118 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1119 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1120 ; SSE-NEXT:    addsd %xmm4, %xmm0
   1121 ; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
   1122 ; SSE-NEXT:    addsd %xmm4, %xmm0
   1123 ; SSE-NEXT:    addsd %xmm5, %xmm0
   1124 ; SSE-NEXT:    movhlps {{.*#+}} xmm5 = xmm5[1,1]
   1125 ; SSE-NEXT:    addsd %xmm5, %xmm0
   1126 ; SSE-NEXT:    addsd %xmm6, %xmm0
   1127 ; SSE-NEXT:    movhlps {{.*#+}} xmm6 = xmm6[1,1]
   1128 ; SSE-NEXT:    addsd %xmm6, %xmm0
   1129 ; SSE-NEXT:    addsd %xmm7, %xmm0
   1130 ; SSE-NEXT:    movhlps {{.*#+}} xmm7 = xmm7[1,1]
   1131 ; SSE-NEXT:    addsd %xmm7, %xmm0
   1132 ; SSE-NEXT:    addsd %xmm8, %xmm0
   1133 ; SSE-NEXT:    movhlps {{.*#+}} xmm8 = xmm8[1,1]
   1134 ; SSE-NEXT:    addsd %xmm8, %xmm0
   1135 ; SSE-NEXT:    retq
   1136 ;
   1137 ; AVX-LABEL: test_v16f64:
   1138 ; AVX:       # %bb.0:
   1139 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1140 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
   1141 ; AVX-NEXT:    vaddsd %xmm5, %xmm0, %xmm0
   1142 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1143 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1144 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1145 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1146 ; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1147 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1148 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1149 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1150 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1151 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1152 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1153 ; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
   1154 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
   1155 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1156 ; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
   1157 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1158 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1159 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1160 ; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
   1161 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm4[1,0]
   1162 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1163 ; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm1
   1164 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1165 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1166 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1167 ; AVX-NEXT:    vzeroupper
   1168 ; AVX-NEXT:    retq
   1169 ;
   1170 ; AVX512-LABEL: test_v16f64:
   1171 ; AVX512:       # %bb.0:
   1172 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1173 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
   1174 ; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
   1175 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm3
   1176 ; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
   1177 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1178 ; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
   1179 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm3
   1180 ; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
   1181 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1182 ; AVX512-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
   1183 ; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
   1184 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1185 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1186 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1187 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1188 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1189 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1190 ; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1191 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1192 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1193 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1194 ; AVX512-NEXT:    vextractf32x4 $2, %zmm2, %xmm1
   1195 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1196 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1197 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1198 ; AVX512-NEXT:    vextractf32x4 $3, %zmm2, %xmm1
   1199 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1200 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1201 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1202 ; AVX512-NEXT:    vzeroupper
   1203 ; AVX512-NEXT:    retq
   1204   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double %a0, <16 x double> %a1)
   1205   ret double %1
   1206 }
   1207 
   1208 ;
   1209 ; vXf64 (zero)
   1210 ;
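; These tests pass +0.0 as the scalar accumulator. Without reassociation the
; reduction must stay strictly ordered, and an fadd with +0.0 is not a removable
; identity (e.g. +0.0 + -0.0 yields +0.0), so each lowering below materializes
; the zero (xorpd/vxorpd) and performs the first scalar add explicitly.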
   1211 
   1212 define double @test_v2f64_zero(<2 x double> %a0) {
   1213 ; SSE-LABEL: test_v2f64_zero:
   1214 ; SSE:       # %bb.0:
   1215 ; SSE-NEXT:    xorpd %xmm1, %xmm1
   1216 ; SSE-NEXT:    addsd %xmm0, %xmm1
   1217 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1218 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1219 ; SSE-NEXT:    retq
   1220 ;
   1221 ; AVX-LABEL: test_v2f64_zero:
   1222 ; AVX:       # %bb.0:
   1223 ; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
   1224 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
   1225 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1226 ; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
   1227 ; AVX-NEXT:    retq
   1228 ;
   1229 ; AVX512-LABEL: test_v2f64_zero:
   1230 ; AVX512:       # %bb.0:
   1231 ; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
   1232 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
   1233 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1234 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
   1235 ; AVX512-NEXT:    retq
   1236   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double 0.0, <2 x double> %a0)
   1237   ret double %1
   1238 }
   1239 
   1240 define double @test_v4f64_zero(<4 x double> %a0) {
   1241 ; SSE-LABEL: test_v4f64_zero:
   1242 ; SSE:       # %bb.0:
   1243 ; SSE-NEXT:    xorpd %xmm2, %xmm2
   1244 ; SSE-NEXT:    addsd %xmm0, %xmm2
   1245 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1246 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1247 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1248 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1249 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1250 ; SSE-NEXT:    retq
   1251 ;
   1252 ; AVX-LABEL: test_v4f64_zero:
   1253 ; AVX:       # %bb.0:
   1254 ; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
   1255 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
   1256 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
   1257 ; AVX-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1258 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1259 ; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
   1260 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1261 ; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
   1262 ; AVX-NEXT:    vzeroupper
   1263 ; AVX-NEXT:    retq
   1264 ;
   1265 ; AVX512-LABEL: test_v4f64_zero:
   1266 ; AVX512:       # %bb.0:
   1267 ; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
   1268 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
   1269 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
   1270 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1271 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1272 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
   1273 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1274 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
   1275 ; AVX512-NEXT:    vzeroupper
   1276 ; AVX512-NEXT:    retq
   1277   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double 0.0, <4 x double> %a0)
   1278   ret double %1
   1279 }
   1280 
   1281 define double @test_v8f64_zero(<8 x double> %a0) {
   1282 ; SSE-LABEL: test_v8f64_zero:
   1283 ; SSE:       # %bb.0:
   1284 ; SSE-NEXT:    xorpd %xmm4, %xmm4
   1285 ; SSE-NEXT:    addsd %xmm0, %xmm4
   1286 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1287 ; SSE-NEXT:    addsd %xmm4, %xmm0
   1288 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1289 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1290 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1291 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1292 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1293 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1294 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1295 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1296 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1297 ; SSE-NEXT:    retq
   1298 ;
   1299 ; AVX-LABEL: test_v8f64_zero:
   1300 ; AVX:       # %bb.0:
   1301 ; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
   1302 ; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
   1303 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
   1304 ; AVX-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
   1305 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1306 ; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
   1307 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1308 ; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
   1309 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1310 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1311 ; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1312 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1313 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1314 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1315 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1316 ; AVX-NEXT:    vzeroupper
   1317 ; AVX-NEXT:    retq
   1318 ;
   1319 ; AVX512-LABEL: test_v8f64_zero:
   1320 ; AVX512:       # %bb.0:
   1321 ; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
   1322 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm1
   1323 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
   1324 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1325 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1326 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1327 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1328 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1329 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
   1330 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1331 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1332 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1333 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
   1334 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
   1335 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1336 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
   1337 ; AVX512-NEXT:    vzeroupper
   1338 ; AVX512-NEXT:    retq
   1339   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double 0.0, <8 x double> %a0)
   1340   ret double %1
   1341 }
   1342 
   1343 define double @test_v16f64_zero(<16 x double> %a0) {
   1344 ; SSE-LABEL: test_v16f64_zero:
   1345 ; SSE:       # %bb.0:
   1346 ; SSE-NEXT:    xorpd %xmm8, %xmm8
   1347 ; SSE-NEXT:    addsd %xmm0, %xmm8
   1348 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1349 ; SSE-NEXT:    addsd %xmm8, %xmm0
   1350 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1351 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1352 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1353 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1354 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1355 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1356 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1357 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1358 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1359 ; SSE-NEXT:    addsd %xmm4, %xmm0
   1360 ; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
   1361 ; SSE-NEXT:    addsd %xmm4, %xmm0
   1362 ; SSE-NEXT:    addsd %xmm5, %xmm0
   1363 ; SSE-NEXT:    movhlps {{.*#+}} xmm5 = xmm5[1,1]
   1364 ; SSE-NEXT:    addsd %xmm5, %xmm0
   1365 ; SSE-NEXT:    addsd %xmm6, %xmm0
   1366 ; SSE-NEXT:    movhlps {{.*#+}} xmm6 = xmm6[1,1]
   1367 ; SSE-NEXT:    addsd %xmm6, %xmm0
   1368 ; SSE-NEXT:    addsd %xmm7, %xmm0
   1369 ; SSE-NEXT:    movhlps {{.*#+}} xmm7 = xmm7[1,1]
   1370 ; SSE-NEXT:    addsd %xmm7, %xmm0
   1371 ; SSE-NEXT:    retq
   1372 ;
   1373 ; AVX-LABEL: test_v16f64_zero:
   1374 ; AVX:       # %bb.0:
   1375 ; AVX-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
   1376 ; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm4
   1377 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm0[1,0]
   1378 ; AVX-NEXT:    vaddsd %xmm5, %xmm4, %xmm4
   1379 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1380 ; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
   1381 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1382 ; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
   1383 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1384 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
   1385 ; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
   1386 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1387 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1388 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1389 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1390 ; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1391 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1392 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1393 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1394 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1395 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1396 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1397 ; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
   1398 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
   1399 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1400 ; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
   1401 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1402 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1403 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1404 ; AVX-NEXT:    vzeroupper
   1405 ; AVX-NEXT:    retq
   1406 ;
   1407 ; AVX512-LABEL: test_v16f64_zero:
   1408 ; AVX512:       # %bb.0:
   1409 ; AVX512-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
   1410 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm2
   1411 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
   1412 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
   1413 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1414 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
   1415 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1416 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
   1417 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
   1418 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
   1419 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1420 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
   1421 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
   1422 ; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
   1423 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1424 ; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
   1425 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1426 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1427 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1428 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1429 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1430 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1431 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1432 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
   1433 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1434 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1435 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1436 ; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
   1437 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1438 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1439 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1440 ; AVX512-NEXT:    vzeroupper
   1441 ; AVX512-NEXT:    retq
   1442   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double 0.0, <16 x double> %a0)
   1443   ret double %1
   1444 }
   1445 
   1446 ;
   1447 ; vXf64 (undef)
   1448 ;
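; Here the scalar accumulator is undef, so the backend is free to fold the
; first fadd involving it; the lowerings below collapse that add into a single
; constant-pool operand ({{.*}}(%rip)) and then continue the ordered chain
; over the remaining lanes.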
   1449 
   1450 define double @test_v2f64_undef(<2 x double> %a0) {
   1451 ; SSE-LABEL: test_v2f64_undef:
   1452 ; SSE:       # %bb.0:
   1453 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1454 ; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
   1455 ; SSE-NEXT:    retq
   1456 ;
   1457 ; AVX-LABEL: test_v2f64_undef:
   1458 ; AVX:       # %bb.0:
   1459 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1460 ; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
   1461 ; AVX-NEXT:    retq
   1462 ;
   1463 ; AVX512-LABEL: test_v2f64_undef:
   1464 ; AVX512:       # %bb.0:
   1465 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1466 ; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm0, %xmm0
   1467 ; AVX512-NEXT:    retq
   1468   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double undef, <2 x double> %a0)
   1469   ret double %1
   1470 }
   1471 
   1472 define double @test_v4f64_undef(<4 x double> %a0) {
   1473 ; SSE-LABEL: test_v4f64_undef:
   1474 ; SSE:       # %bb.0:
   1475 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1476 ; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
   1477 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1478 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1479 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1480 ; SSE-NEXT:    retq
   1481 ;
   1482 ; AVX-LABEL: test_v4f64_undef:
   1483 ; AVX:       # %bb.0:
   1484 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1485 ; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm1, %xmm1
   1486 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1487 ; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
   1488 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1489 ; AVX-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
   1490 ; AVX-NEXT:    vzeroupper
   1491 ; AVX-NEXT:    retq
   1492 ;
   1493 ; AVX512-LABEL: test_v4f64_undef:
   1494 ; AVX512:       # %bb.0:
   1495 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1496 ; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm1, %xmm1
   1497 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1498 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
   1499 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1500 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
   1501 ; AVX512-NEXT:    vzeroupper
   1502 ; AVX512-NEXT:    retq
   1503   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double undef, <4 x double> %a0)
   1504   ret double %1
   1505 }
   1506 
   1507 define double @test_v8f64_undef(<8 x double> %a0) {
   1508 ; SSE-LABEL: test_v8f64_undef:
   1509 ; SSE:       # %bb.0:
   1510 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1511 ; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
   1512 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1513 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1514 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1515 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1516 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1517 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1518 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1519 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1520 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1521 ; SSE-NEXT:    retq
   1522 ;
   1523 ; AVX-LABEL: test_v8f64_undef:
   1524 ; AVX:       # %bb.0:
   1525 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
   1526 ; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm2, %xmm2
   1527 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1528 ; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
   1529 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1530 ; AVX-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
   1531 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1532 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1533 ; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1534 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1535 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1536 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1537 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1538 ; AVX-NEXT:    vzeroupper
   1539 ; AVX-NEXT:    retq
   1540 ;
   1541 ; AVX512-LABEL: test_v8f64_undef:
   1542 ; AVX512:       # %bb.0:
   1543 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1544 ; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm1, %xmm1
   1545 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1546 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1547 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1548 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1549 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
   1550 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1551 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1552 ; AVX512-NEXT:    vaddsd %xmm2, %xmm1, %xmm1
   1553 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
   1554 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm1
   1555 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1556 ; AVX512-NEXT:    vaddsd %xmm0, %xmm1, %xmm0
   1557 ; AVX512-NEXT:    vzeroupper
   1558 ; AVX512-NEXT:    retq
   1559   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double undef, <8 x double> %a0)
   1560   ret double %1
   1561 }
   1562 
   1563 define double @test_v16f64_undef(<16 x double> %a0) {
   1564 ; SSE-LABEL: test_v16f64_undef:
   1565 ; SSE:       # %bb.0:
   1566 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1567 ; SSE-NEXT:    addsd {{.*}}(%rip), %xmm0
   1568 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1569 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1570 ; SSE-NEXT:    addsd %xmm1, %xmm0
   1571 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1572 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1573 ; SSE-NEXT:    addsd %xmm2, %xmm0
   1574 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1575 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1576 ; SSE-NEXT:    addsd %xmm3, %xmm0
   1577 ; SSE-NEXT:    addsd %xmm4, %xmm0
   1578 ; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
   1579 ; SSE-NEXT:    addsd %xmm4, %xmm0
   1580 ; SSE-NEXT:    addsd %xmm5, %xmm0
   1581 ; SSE-NEXT:    movhlps {{.*#+}} xmm5 = xmm5[1,1]
   1582 ; SSE-NEXT:    addsd %xmm5, %xmm0
   1583 ; SSE-NEXT:    addsd %xmm6, %xmm0
   1584 ; SSE-NEXT:    movhlps {{.*#+}} xmm6 = xmm6[1,1]
   1585 ; SSE-NEXT:    addsd %xmm6, %xmm0
   1586 ; SSE-NEXT:    addsd %xmm7, %xmm0
   1587 ; SSE-NEXT:    movhlps {{.*#+}} xmm7 = xmm7[1,1]
   1588 ; SSE-NEXT:    addsd %xmm7, %xmm0
   1589 ; SSE-NEXT:    retq
   1590 ;
   1591 ; AVX-LABEL: test_v16f64_undef:
   1592 ; AVX:       # %bb.0:
   1593 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
   1594 ; AVX-NEXT:    vaddsd {{.*}}(%rip), %xmm4, %xmm4
   1595 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1596 ; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm4
   1597 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1598 ; AVX-NEXT:    vaddsd %xmm0, %xmm4, %xmm0
   1599 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1600 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
   1601 ; AVX-NEXT:    vaddsd %xmm4, %xmm0, %xmm0
   1602 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1603 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1604 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1605 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1606 ; AVX-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1607 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1608 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1609 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1610 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1611 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1612 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1613 ; AVX-NEXT:    vaddsd %xmm3, %xmm0, %xmm0
   1614 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
   1615 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1616 ; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
   1617 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1618 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1619 ; AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1620 ; AVX-NEXT:    vzeroupper
   1621 ; AVX-NEXT:    retq
   1622 ;
   1623 ; AVX512-LABEL: test_v16f64_undef:
   1624 ; AVX512:       # %bb.0:
   1625 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
   1626 ; AVX512-NEXT:    vaddsd {{.*}}(%rip), %xmm2, %xmm2
   1627 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1628 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
   1629 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1630 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
   1631 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
   1632 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
   1633 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1634 ; AVX512-NEXT:    vaddsd %xmm3, %xmm2, %xmm2
   1635 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
   1636 ; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm2
   1637 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1638 ; AVX512-NEXT:    vaddsd %xmm0, %xmm2, %xmm0
   1639 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1640 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1641 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1642 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1643 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1644 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1645 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1646 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
   1647 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1648 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1649 ; AVX512-NEXT:    vaddsd %xmm2, %xmm0, %xmm0
   1650 ; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
   1651 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1652 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1653 ; AVX512-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
   1654 ; AVX512-NEXT:    vzeroupper
   1655 ; AVX512-NEXT:    retq
   1656   %1 = call double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double undef, <16 x double> %a0)
   1657   ret double %1
   1658 }
   1659 
   1660 declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v2f32(float, <2 x float>)
   1661 declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)
   1662 declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v8f32(float, <8 x float>)
   1663 declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v16f32(float, <16 x float>)
   1664 
   1665 declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v2f64(double, <2 x double>)
   1666 declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v4f64(double, <4 x double>)
   1667 declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v8f64(double, <8 x double>)
   1668 declare double @llvm.experimental.vector.reduce.fadd.f64.f64.v16f64(double, <16 x double>)
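
; For reference, a strictly ordered reduction such as the v2f64 cases above is
; equivalent to the scalar chain sketched below (illustrative only, with
; placeholder values %acc and %v; it is not compiled as part of this test):
;
;   %e0 = extractelement <2 x double> %v, i32 0
;   %s0 = fadd double %acc, %e0
;   %e1 = extractelement <2 x double> %v, i32 1
;   %s1 = fadd double %s0, %e1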
   1669