Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
      8 
      9 ;
     10 ; vXf32 (accum)
     11 ;
     12 
     13 define float @test_v2f32(float %a0, <2 x float> %a1) {
     14 ; SSE2-LABEL: test_v2f32:
     15 ; SSE2:       # %bb.0:
     16 ; SSE2-NEXT:    mulss %xmm1, %xmm0
     17 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
     18 ; SSE2-NEXT:    mulss %xmm1, %xmm0
     19 ; SSE2-NEXT:    retq
     20 ;
     21 ; SSE41-LABEL: test_v2f32:
     22 ; SSE41:       # %bb.0:
     23 ; SSE41-NEXT:    mulss %xmm1, %xmm0
     24 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
     25 ; SSE41-NEXT:    mulss %xmm1, %xmm0
     26 ; SSE41-NEXT:    retq
     27 ;
     28 ; AVX-LABEL: test_v2f32:
     29 ; AVX:       # %bb.0:
     30 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
     31 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
     32 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
     33 ; AVX-NEXT:    retq
     34 ;
     35 ; AVX512-LABEL: test_v2f32:
     36 ; AVX512:       # %bb.0:
     37 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
     38 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm1[1,1,3,3]
     39 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
     40 ; AVX512-NEXT:    retq
     41   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
     42   ret float %1
     43 }
     44 
     45 define float @test_v4f32(float %a0, <4 x float> %a1) {
     46 ; SSE2-LABEL: test_v4f32:
     47 ; SSE2:       # %bb.0:
     48 ; SSE2-NEXT:    mulss %xmm1, %xmm0
     49 ; SSE2-NEXT:    movaps %xmm1, %xmm2
     50 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
     51 ; SSE2-NEXT:    mulss %xmm2, %xmm0
     52 ; SSE2-NEXT:    movaps %xmm1, %xmm2
     53 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
     54 ; SSE2-NEXT:    mulss %xmm2, %xmm0
     55 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
     56 ; SSE2-NEXT:    mulss %xmm1, %xmm0
     57 ; SSE2-NEXT:    retq
     58 ;
     59 ; SSE41-LABEL: test_v4f32:
     60 ; SSE41:       # %bb.0:
     61 ; SSE41-NEXT:    mulss %xmm1, %xmm0
     62 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
     63 ; SSE41-NEXT:    mulss %xmm2, %xmm0
     64 ; SSE41-NEXT:    movaps %xmm1, %xmm2
     65 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
     66 ; SSE41-NEXT:    mulss %xmm2, %xmm0
     67 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
     68 ; SSE41-NEXT:    mulss %xmm1, %xmm0
     69 ; SSE41-NEXT:    retq
     70 ;
     71 ; AVX-LABEL: test_v4f32:
     72 ; AVX:       # %bb.0:
     73 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
     74 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
     75 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
     76 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
     77 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
     78 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
     79 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
     80 ; AVX-NEXT:    retq
     81 ;
     82 ; AVX512-LABEL: test_v4f32:
     83 ; AVX512:       # %bb.0:
     84 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
     85 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
     86 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
     87 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
     88 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
     89 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
     90 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
     91 ; AVX512-NEXT:    retq
     92   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
     93   ret float %1
     94 }
     95 
     96 define float @test_v8f32(float %a0, <8 x float> %a1) {
     97 ; SSE2-LABEL: test_v8f32:
     98 ; SSE2:       # %bb.0:
     99 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    100 ; SSE2-NEXT:    movaps %xmm1, %xmm3
    101 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[2,3]
    102 ; SSE2-NEXT:    mulss %xmm3, %xmm0
    103 ; SSE2-NEXT:    movaps %xmm1, %xmm3
    104 ; SSE2-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
    105 ; SSE2-NEXT:    mulss %xmm3, %xmm0
    106 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    107 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    108 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    109 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    110 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
    111 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    112 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    113 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    114 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    115 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    116 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    117 ; SSE2-NEXT:    retq
    118 ;
    119 ; SSE41-LABEL: test_v8f32:
    120 ; SSE41:       # %bb.0:
    121 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    122 ; SSE41-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
    123 ; SSE41-NEXT:    mulss %xmm3, %xmm0
    124 ; SSE41-NEXT:    movaps %xmm1, %xmm3
    125 ; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
    126 ; SSE41-NEXT:    mulss %xmm3, %xmm0
    127 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    128 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    129 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    130 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    131 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    132 ; SSE41-NEXT:    movaps %xmm2, %xmm1
    133 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    134 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    135 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    136 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    137 ; SSE41-NEXT:    retq
    138 ;
    139 ; AVX-LABEL: test_v8f32:
    140 ; AVX:       # %bb.0:
    141 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    142 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    143 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    144 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    145 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    146 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    147 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    148 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
    149 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    150 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    151 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    152 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    153 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    154 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    155 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    156 ; AVX-NEXT:    vzeroupper
    157 ; AVX-NEXT:    retq
    158 ;
    159 ; AVX512-LABEL: test_v8f32:
    160 ; AVX512:       # %bb.0:
    161 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    162 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    163 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    164 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    165 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    166 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    167 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    168 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
    169 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    170 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    171 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    172 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    173 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    174 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    175 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    176 ; AVX512-NEXT:    vzeroupper
    177 ; AVX512-NEXT:    retq
    178   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
    179   ret float %1
    180 }
    181 
    182 define float @test_v16f32(float %a0, <16 x float> %a1) {
    183 ; SSE2-LABEL: test_v16f32:
    184 ; SSE2:       # %bb.0:
    185 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    186 ; SSE2-NEXT:    movaps %xmm1, %xmm5
    187 ; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1],xmm1[2,3]
    188 ; SSE2-NEXT:    mulss %xmm5, %xmm0
    189 ; SSE2-NEXT:    movaps %xmm1, %xmm5
    190 ; SSE2-NEXT:    movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
    191 ; SSE2-NEXT:    mulss %xmm5, %xmm0
    192 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    193 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    194 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    195 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    196 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
    197 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    198 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    199 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    200 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    201 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    202 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    203 ; SSE2-NEXT:    mulss %xmm3, %xmm0
    204 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    205 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
    206 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    207 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    208 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    209 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    210 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    211 ; SSE2-NEXT:    mulss %xmm3, %xmm0
    212 ; SSE2-NEXT:    mulss %xmm4, %xmm0
    213 ; SSE2-NEXT:    movaps %xmm4, %xmm1
    214 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[2,3]
    215 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    216 ; SSE2-NEXT:    movaps %xmm4, %xmm1
    217 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
    218 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    219 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
    220 ; SSE2-NEXT:    mulss %xmm4, %xmm0
    221 ; SSE2-NEXT:    retq
    222 ;
    223 ; SSE41-LABEL: test_v16f32:
    224 ; SSE41:       # %bb.0:
    225 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    226 ; SSE41-NEXT:    movshdup {{.*#+}} xmm5 = xmm1[1,1,3,3]
    227 ; SSE41-NEXT:    mulss %xmm5, %xmm0
    228 ; SSE41-NEXT:    movaps %xmm1, %xmm5
    229 ; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm1[1],xmm5[1]
    230 ; SSE41-NEXT:    mulss %xmm5, %xmm0
    231 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    232 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    233 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    234 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    235 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    236 ; SSE41-NEXT:    movaps %xmm2, %xmm1
    237 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    238 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    239 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    240 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    241 ; SSE41-NEXT:    mulss %xmm3, %xmm0
    242 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
    243 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    244 ; SSE41-NEXT:    movaps %xmm3, %xmm1
    245 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    246 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    247 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    248 ; SSE41-NEXT:    mulss %xmm3, %xmm0
    249 ; SSE41-NEXT:    mulss %xmm4, %xmm0
    250 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm4[1,1,3,3]
    251 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    252 ; SSE41-NEXT:    movaps %xmm4, %xmm1
    253 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm4[1],xmm1[1]
    254 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    255 ; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[3,1,2,3]
    256 ; SSE41-NEXT:    mulss %xmm4, %xmm0
    257 ; SSE41-NEXT:    retq
    258 ;
    259 ; AVX-LABEL: test_v16f32:
    260 ; AVX:       # %bb.0:
    261 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    262 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
    263 ; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
    264 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
    265 ; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
    266 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
    267 ; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
    268 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
    269 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    270 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
    271 ; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
    272 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
    273 ; AVX-NEXT:    vmulss %xmm3, %xmm0, %xmm0
    274 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    275 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    276 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    277 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    278 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    279 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
    280 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    281 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm2[3,1,2,3]
    282 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    283 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
    284 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    285 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    286 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    287 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    288 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    289 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    290 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    291 ; AVX-NEXT:    vzeroupper
    292 ; AVX-NEXT:    retq
    293 ;
    294 ; AVX512-LABEL: test_v16f32:
    295 ; AVX512:       # %bb.0:
    296 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    297 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    298 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    299 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    300 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    301 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    302 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    303 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
    304 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    305 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    306 ; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
    307 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    308 ; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
    309 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    310 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    311 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
    312 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    313 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    314 ; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
    315 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    316 ; AVX512-NEXT:    vmulss %xmm3, %xmm0, %xmm0
    317 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    318 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    319 ; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
    320 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    321 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    322 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    323 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    324 ; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    325 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    326 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    327 ; AVX512-NEXT:    vzeroupper
    328 ; AVX512-NEXT:    retq
    329   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
    330   ret float %1
    331 }
    332 
    333 ;
    334 ; vXf32 (one)
    335 ;
    336 
    337 define float @test_v2f32_one(<2 x float> %a0) {
    338 ; SSE2-LABEL: test_v2f32_one:
    339 ; SSE2:       # %bb.0:
    340 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    341 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
    342 ; SSE2-NEXT:    mulss %xmm0, %xmm1
    343 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    344 ; SSE2-NEXT:    retq
    345 ;
    346 ; SSE41-LABEL: test_v2f32_one:
    347 ; SSE41:       # %bb.0:
    348 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    349 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    350 ; SSE41-NEXT:    retq
    351 ;
    352 ; AVX-LABEL: test_v2f32_one:
    353 ; AVX:       # %bb.0:
    354 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    355 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    356 ; AVX-NEXT:    retq
    357 ;
    358 ; AVX512-LABEL: test_v2f32_one:
    359 ; AVX512:       # %bb.0:
    360 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    361 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    362 ; AVX512-NEXT:    retq
    363   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
    364   ret float %1
    365 }
    366 
    367 define float @test_v4f32_one(<4 x float> %a0) {
    368 ; SSE2-LABEL: test_v4f32_one:
    369 ; SSE2:       # %bb.0:
    370 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    371 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
    372 ; SSE2-NEXT:    mulss %xmm0, %xmm1
    373 ; SSE2-NEXT:    movaps %xmm0, %xmm2
    374 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
    375 ; SSE2-NEXT:    mulss %xmm1, %xmm2
    376 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    377 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    378 ; SSE2-NEXT:    retq
    379 ;
    380 ; SSE41-LABEL: test_v4f32_one:
    381 ; SSE41:       # %bb.0:
    382 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    383 ; SSE41-NEXT:    mulss %xmm0, %xmm1
    384 ; SSE41-NEXT:    movaps %xmm0, %xmm2
    385 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
    386 ; SSE41-NEXT:    mulss %xmm1, %xmm2
    387 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    388 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    389 ; SSE41-NEXT:    retq
    390 ;
    391 ; AVX-LABEL: test_v4f32_one:
    392 ; AVX:       # %bb.0:
    393 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    394 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
    395 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    396 ; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    397 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    398 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
    399 ; AVX-NEXT:    retq
    400 ;
    401 ; AVX512-LABEL: test_v4f32_one:
    402 ; AVX512:       # %bb.0:
    403 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    404 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm1
    405 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    406 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    407 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    408 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
    409 ; AVX512-NEXT:    retq
    410   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
    411   ret float %1
    412 }
    413 
    414 define float @test_v8f32_one(<8 x float> %a0) {
    415 ; SSE2-LABEL: test_v8f32_one:
    416 ; SSE2:       # %bb.0:
    417 ; SSE2-NEXT:    movaps %xmm0, %xmm2
    418 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
    419 ; SSE2-NEXT:    mulss %xmm0, %xmm2
    420 ; SSE2-NEXT:    movaps %xmm0, %xmm3
    421 ; SSE2-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
    422 ; SSE2-NEXT:    mulss %xmm2, %xmm3
    423 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    424 ; SSE2-NEXT:    mulss %xmm3, %xmm0
    425 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    426 ; SSE2-NEXT:    movaps %xmm1, %xmm2
    427 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
    428 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    429 ; SSE2-NEXT:    movaps %xmm1, %xmm2
    430 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    431 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    432 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    433 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    434 ; SSE2-NEXT:    retq
    435 ;
    436 ; SSE41-LABEL: test_v8f32_one:
    437 ; SSE41:       # %bb.0:
    438 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    439 ; SSE41-NEXT:    mulss %xmm0, %xmm2
    440 ; SSE41-NEXT:    movaps %xmm0, %xmm3
    441 ; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
    442 ; SSE41-NEXT:    mulss %xmm2, %xmm3
    443 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    444 ; SSE41-NEXT:    mulss %xmm3, %xmm0
    445 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    446 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    447 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    448 ; SSE41-NEXT:    movaps %xmm1, %xmm2
    449 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    450 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    451 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    452 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    453 ; SSE41-NEXT:    retq
    454 ;
    455 ; AVX-LABEL: test_v8f32_one:
    456 ; AVX:       # %bb.0:
    457 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    458 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm1
    459 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    460 ; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    461 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    462 ; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    463 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
    464 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm1
    465 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    466 ; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    467 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    468 ; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    469 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    470 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
    471 ; AVX-NEXT:    vzeroupper
    472 ; AVX-NEXT:    retq
    473 ;
    474 ; AVX512-LABEL: test_v8f32_one:
    475 ; AVX512:       # %bb.0:
    476 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    477 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm1
    478 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    479 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    480 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    481 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    482 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
    483 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
    484 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    485 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    486 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    487 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    488 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    489 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
    490 ; AVX512-NEXT:    vzeroupper
    491 ; AVX512-NEXT:    retq
    492   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
    493   ret float %1
    494 }
    495 
    496 define float @test_v16f32_one(<16 x float> %a0) {
    497 ; SSE2-LABEL: test_v16f32_one:
    498 ; SSE2:       # %bb.0:
    499 ; SSE2-NEXT:    movaps %xmm0, %xmm4
    500 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3]
    501 ; SSE2-NEXT:    mulss %xmm0, %xmm4
    502 ; SSE2-NEXT:    movaps %xmm0, %xmm5
    503 ; SSE2-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
    504 ; SSE2-NEXT:    mulss %xmm4, %xmm5
    505 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    506 ; SSE2-NEXT:    mulss %xmm5, %xmm0
    507 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    508 ; SSE2-NEXT:    movaps %xmm1, %xmm4
    509 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3]
    510 ; SSE2-NEXT:    mulss %xmm4, %xmm0
    511 ; SSE2-NEXT:    movaps %xmm1, %xmm4
    512 ; SSE2-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
    513 ; SSE2-NEXT:    mulss %xmm4, %xmm0
    514 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    515 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    516 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    517 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    518 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
    519 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    520 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    521 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    522 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    523 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    524 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    525 ; SSE2-NEXT:    mulss %xmm3, %xmm0
    526 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    527 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
    528 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    529 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    530 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    531 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    532 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    533 ; SSE2-NEXT:    mulss %xmm3, %xmm0
    534 ; SSE2-NEXT:    retq
    535 ;
    536 ; SSE41-LABEL: test_v16f32_one:
    537 ; SSE41:       # %bb.0:
    538 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
    539 ; SSE41-NEXT:    mulss %xmm0, %xmm4
    540 ; SSE41-NEXT:    movaps %xmm0, %xmm5
    541 ; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
    542 ; SSE41-NEXT:    mulss %xmm4, %xmm5
    543 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    544 ; SSE41-NEXT:    mulss %xmm5, %xmm0
    545 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    546 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
    547 ; SSE41-NEXT:    mulss %xmm4, %xmm0
    548 ; SSE41-NEXT:    movaps %xmm1, %xmm4
    549 ; SSE41-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
    550 ; SSE41-NEXT:    mulss %xmm4, %xmm0
    551 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    552 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    553 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    554 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    555 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    556 ; SSE41-NEXT:    movaps %xmm2, %xmm1
    557 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    558 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    559 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    560 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    561 ; SSE41-NEXT:    mulss %xmm3, %xmm0
    562 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
    563 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    564 ; SSE41-NEXT:    movaps %xmm3, %xmm1
    565 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    566 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    567 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    568 ; SSE41-NEXT:    mulss %xmm3, %xmm0
    569 ; SSE41-NEXT:    retq
    570 ;
    571 ; AVX-LABEL: test_v16f32_one:
    572 ; AVX:       # %bb.0:
    573 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    574 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm2
    575 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    576 ; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
    577 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
    578 ; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
    579 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
    580 ; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm2
    581 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
    582 ; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
    583 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    584 ; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
    585 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    586 ; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm0
    587 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    588 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    589 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    590 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    591 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    592 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    593 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    594 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
    595 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    596 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    597 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    598 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    599 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    600 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    601 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    602 ; AVX-NEXT:    vzeroupper
    603 ; AVX-NEXT:    retq
    604 ;
    605 ; AVX512-LABEL: test_v16f32_one:
    606 ; AVX512:       # %bb.0:
    607 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    608 ; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm1
    609 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    610 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    611 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    612 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    613 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
    614 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    615 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    616 ; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
    617 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    618 ; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
    619 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    620 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    621 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
    622 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    623 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    624 ; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
    625 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    626 ; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
    627 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    628 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    629 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
    630 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
    631 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    632 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    633 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    634 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    635 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    636 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
    637 ; AVX512-NEXT:    vzeroupper
    638 ; AVX512-NEXT:    retq
    639   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
    640   ret float %1
    641 }
    642 
    643 ;
    644 ; vXf32 (undef)
    645 ;
    646 
    647 define float @test_v2f32_undef(<2 x float> %a0) {
    648 ; SSE2-LABEL: test_v2f32_undef:
    649 ; SSE2:       # %bb.0:
    650 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
    651 ; SSE2-NEXT:    mulss {{.*}}(%rip), %xmm0
    652 ; SSE2-NEXT:    retq
    653 ;
    654 ; SSE41-LABEL: test_v2f32_undef:
    655 ; SSE41:       # %bb.0:
    656 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
    657 ; SSE41-NEXT:    mulss {{.*}}(%rip), %xmm0
    658 ; SSE41-NEXT:    retq
    659 ;
    660 ; AVX-LABEL: test_v2f32_undef:
    661 ; AVX:       # %bb.0:
    662 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
    663 ; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
    664 ; AVX-NEXT:    retq
    665 ;
    666 ; AVX512-LABEL: test_v2f32_undef:
    667 ; AVX512:       # %bb.0:
    668 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
    669 ; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
    670 ; AVX512-NEXT:    retq
    671   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
    672   ret float %1
    673 }
    674 
    675 define float @test_v4f32_undef(<4 x float> %a0) {
    676 ; SSE2-LABEL: test_v4f32_undef:
    677 ; SSE2:       # %bb.0:
    678 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    679 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
    680 ; SSE2-NEXT:    mulss {{.*}}(%rip), %xmm1
    681 ; SSE2-NEXT:    movaps %xmm0, %xmm2
    682 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
    683 ; SSE2-NEXT:    mulss %xmm1, %xmm2
    684 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    685 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    686 ; SSE2-NEXT:    retq
    687 ;
    688 ; SSE41-LABEL: test_v4f32_undef:
    689 ; SSE41:       # %bb.0:
    690 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    691 ; SSE41-NEXT:    mulss {{.*}}(%rip), %xmm1
    692 ; SSE41-NEXT:    movaps %xmm0, %xmm2
    693 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
    694 ; SSE41-NEXT:    mulss %xmm1, %xmm2
    695 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    696 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    697 ; SSE41-NEXT:    retq
    698 ;
    699 ; AVX-LABEL: test_v4f32_undef:
    700 ; AVX:       # %bb.0:
    701 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    702 ; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm1
    703 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    704 ; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    705 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    706 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
    707 ; AVX-NEXT:    retq
    708 ;
    709 ; AVX512-LABEL: test_v4f32_undef:
    710 ; AVX512:       # %bb.0:
    711 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    712 ; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm1
    713 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    714 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    715 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    716 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
    717 ; AVX512-NEXT:    retq
    718   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
    719   ret float %1
    720 }
    721 
    722 define float @test_v8f32_undef(<8 x float> %a0) {
    723 ; SSE2-LABEL: test_v8f32_undef:
    724 ; SSE2:       # %bb.0:
    725 ; SSE2-NEXT:    movaps %xmm0, %xmm2
    726 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[2,3]
    727 ; SSE2-NEXT:    mulss {{.*}}(%rip), %xmm2
    728 ; SSE2-NEXT:    movaps %xmm0, %xmm3
    729 ; SSE2-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
    730 ; SSE2-NEXT:    mulss %xmm2, %xmm3
    731 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    732 ; SSE2-NEXT:    mulss %xmm3, %xmm0
    733 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    734 ; SSE2-NEXT:    movaps %xmm1, %xmm2
    735 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[2,3]
    736 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    737 ; SSE2-NEXT:    movaps %xmm1, %xmm2
    738 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    739 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    740 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    741 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    742 ; SSE2-NEXT:    retq
    743 ;
    744 ; SSE41-LABEL: test_v8f32_undef:
    745 ; SSE41:       # %bb.0:
    746 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    747 ; SSE41-NEXT:    mulss {{.*}}(%rip), %xmm2
    748 ; SSE41-NEXT:    movaps %xmm0, %xmm3
    749 ; SSE41-NEXT:    movhlps {{.*#+}} xmm3 = xmm0[1],xmm3[1]
    750 ; SSE41-NEXT:    mulss %xmm2, %xmm3
    751 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    752 ; SSE41-NEXT:    mulss %xmm3, %xmm0
    753 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    754 ; SSE41-NEXT:    movshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    755 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    756 ; SSE41-NEXT:    movaps %xmm1, %xmm2
    757 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    758 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    759 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    760 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    761 ; SSE41-NEXT:    retq
    762 ;
    763 ; AVX-LABEL: test_v8f32_undef:
    764 ; AVX:       # %bb.0:
    765 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    766 ; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm1
    767 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    768 ; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    769 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    770 ; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    771 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
    772 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm1
    773 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    774 ; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    775 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    776 ; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    777 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    778 ; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
    779 ; AVX-NEXT:    vzeroupper
    780 ; AVX-NEXT:    retq
    781 ;
    782 ; AVX512-LABEL: test_v8f32_undef:
    783 ; AVX512:       # %bb.0:
    784 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    785 ; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm1
    786 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    787 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    788 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    789 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    790 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
    791 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
    792 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    793 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    794 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    795 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    796 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    797 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
    798 ; AVX512-NEXT:    vzeroupper
    799 ; AVX512-NEXT:    retq
    800   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
    801   ret float %1
    802 }
    803 
    804 define float @test_v16f32_undef(<16 x float> %a0) {
    805 ; SSE2-LABEL: test_v16f32_undef:
    806 ; SSE2:       # %bb.0:
    807 ; SSE2-NEXT:    movaps %xmm0, %xmm4
    808 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm0[2,3]
    809 ; SSE2-NEXT:    mulss {{.*}}(%rip), %xmm4
    810 ; SSE2-NEXT:    movaps %xmm0, %xmm5
    811 ; SSE2-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
    812 ; SSE2-NEXT:    mulss %xmm4, %xmm5
    813 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    814 ; SSE2-NEXT:    mulss %xmm5, %xmm0
    815 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    816 ; SSE2-NEXT:    movaps %xmm1, %xmm4
    817 ; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[2,3]
    818 ; SSE2-NEXT:    mulss %xmm4, %xmm0
    819 ; SSE2-NEXT:    movaps %xmm1, %xmm4
    820 ; SSE2-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
    821 ; SSE2-NEXT:    mulss %xmm4, %xmm0
    822 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    823 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    824 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    825 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    826 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm2[2,3]
    827 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    828 ; SSE2-NEXT:    movaps %xmm2, %xmm1
    829 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    830 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    831 ; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    832 ; SSE2-NEXT:    mulss %xmm2, %xmm0
    833 ; SSE2-NEXT:    mulss %xmm3, %xmm0
    834 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    835 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm3[2,3]
    836 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    837 ; SSE2-NEXT:    movaps %xmm3, %xmm1
    838 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    839 ; SSE2-NEXT:    mulss %xmm1, %xmm0
    840 ; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    841 ; SSE2-NEXT:    mulss %xmm3, %xmm0
    842 ; SSE2-NEXT:    retq
    843 ;
    844 ; SSE41-LABEL: test_v16f32_undef:
    845 ; SSE41:       # %bb.0:
    846 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
    847 ; SSE41-NEXT:    mulss {{.*}}(%rip), %xmm4
    848 ; SSE41-NEXT:    movaps %xmm0, %xmm5
    849 ; SSE41-NEXT:    movhlps {{.*#+}} xmm5 = xmm0[1],xmm5[1]
    850 ; SSE41-NEXT:    mulss %xmm4, %xmm5
    851 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    852 ; SSE41-NEXT:    mulss %xmm5, %xmm0
    853 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    854 ; SSE41-NEXT:    movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
    855 ; SSE41-NEXT:    mulss %xmm4, %xmm0
    856 ; SSE41-NEXT:    movaps %xmm1, %xmm4
    857 ; SSE41-NEXT:    movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
    858 ; SSE41-NEXT:    mulss %xmm4, %xmm0
    859 ; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    860 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    861 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    862 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
    863 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    864 ; SSE41-NEXT:    movaps %xmm2, %xmm1
    865 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm2[1],xmm1[1]
    866 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    867 ; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    868 ; SSE41-NEXT:    mulss %xmm2, %xmm0
    869 ; SSE41-NEXT:    mulss %xmm3, %xmm0
    870 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm3[1,1,3,3]
    871 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    872 ; SSE41-NEXT:    movaps %xmm3, %xmm1
    873 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
    874 ; SSE41-NEXT:    mulss %xmm1, %xmm0
    875 ; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
    876 ; SSE41-NEXT:    mulss %xmm3, %xmm0
    877 ; SSE41-NEXT:    retq
    878 ;
    879 ; AVX-LABEL: test_v16f32_undef:
    880 ; AVX:       # %bb.0:
    881 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    882 ; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm2, %xmm2
    883 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    884 ; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
    885 ; AVX-NEXT:    vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
    886 ; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
    887 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
    888 ; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm2
    889 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
    890 ; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
    891 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm0[1,0]
    892 ; AVX-NEXT:    vmulss %xmm3, %xmm2, %xmm2
    893 ; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    894 ; AVX-NEXT:    vmulss %xmm0, %xmm2, %xmm0
    895 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    896 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    897 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    898 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    899 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    900 ; AVX-NEXT:    vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3]
    901 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    902 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
    903 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    904 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
    905 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    906 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    907 ; AVX-NEXT:    vmulss %xmm2, %xmm0, %xmm0
    908 ; AVX-NEXT:    vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3]
    909 ; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
    910 ; AVX-NEXT:    vzeroupper
    911 ; AVX-NEXT:    retq
    912 ;
    913 ; AVX512-LABEL: test_v16f32_undef:
    914 ; AVX512:       # %bb.0:
    915 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    916 ; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm1, %xmm1
    917 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    918 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    919 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[3,1,2,3]
    920 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    921 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
    922 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    923 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    924 ; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
    925 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    926 ; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
    927 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    928 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    929 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
    930 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    931 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3]
    932 ; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
    933 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm2[1,0]
    934 ; AVX512-NEXT:    vmulss %xmm3, %xmm1, %xmm1
    935 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm2 = xmm2[3,1,2,3]
    936 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    937 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
    938 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm1
    939 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
    940 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    941 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
    942 ; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
    943 ; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3]
    944 ; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
    945 ; AVX512-NEXT:    vzeroupper
    946 ; AVX512-NEXT:    retq
    947   %1 = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
    948   ret float %1
    949 }
    950 
    951 ;
    952 ; vXf64 (accum)
    953 ;
    954 
    955 define double @test_v2f64(double %a0, <2 x double> %a1) {
    956 ; SSE-LABEL: test_v2f64:
    957 ; SSE:       # %bb.0:
    958 ; SSE-NEXT:    mulsd %xmm1, %xmm0
    959 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
    960 ; SSE-NEXT:    mulsd %xmm1, %xmm0
    961 ; SSE-NEXT:    retq
    962 ;
    963 ; AVX-LABEL: test_v2f64:
    964 ; AVX:       # %bb.0:
    965 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
    966 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
    967 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
    968 ; AVX-NEXT:    retq
    969 ;
    970 ; AVX512-LABEL: test_v2f64:
    971 ; AVX512:       # %bb.0:
    972 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
    973 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
    974 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
    975 ; AVX512-NEXT:    retq
    976   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
    977   ret double %1
    978 }
    979 
    980 define double @test_v4f64(double %a0, <4 x double> %a1) {
    981 ; SSE-LABEL: test_v4f64:
    982 ; SSE:       # %bb.0:
    983 ; SSE-NEXT:    mulsd %xmm1, %xmm0
    984 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
    985 ; SSE-NEXT:    mulsd %xmm1, %xmm0
    986 ; SSE-NEXT:    mulsd %xmm2, %xmm0
    987 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
    988 ; SSE-NEXT:    mulsd %xmm2, %xmm0
    989 ; SSE-NEXT:    retq
    990 ;
    991 ; AVX-LABEL: test_v4f64:
    992 ; AVX:       # %bb.0:
    993 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
    994 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
    995 ; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
    996 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
    997 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
    998 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
    999 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1000 ; AVX-NEXT:    vzeroupper
   1001 ; AVX-NEXT:    retq
   1002 ;
   1003 ; AVX512-LABEL: test_v4f64:
   1004 ; AVX512:       # %bb.0:
   1005 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1006 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1007 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1008 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1009 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1010 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1011 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1012 ; AVX512-NEXT:    vzeroupper
   1013 ; AVX512-NEXT:    retq
   1014   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
   1015   ret double %1
   1016 }
   1017 
   1018 define double @test_v8f64(double %a0, <8 x double> %a1) {
   1019 ; SSE-LABEL: test_v8f64:
   1020 ; SSE:       # %bb.0:
   1021 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1022 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1023 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1024 ; SSE-NEXT:    mulsd %xmm2, %xmm0
   1025 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1026 ; SSE-NEXT:    mulsd %xmm2, %xmm0
   1027 ; SSE-NEXT:    mulsd %xmm3, %xmm0
   1028 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1029 ; SSE-NEXT:    mulsd %xmm3, %xmm0
   1030 ; SSE-NEXT:    mulsd %xmm4, %xmm0
   1031 ; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
   1032 ; SSE-NEXT:    mulsd %xmm4, %xmm0
   1033 ; SSE-NEXT:    retq
   1034 ;
   1035 ; AVX-LABEL: test_v8f64:
   1036 ; AVX:       # %bb.0:
   1037 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1038 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
   1039 ; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
   1040 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1041 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1042 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1043 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1044 ; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1045 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1046 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1047 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1048 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1049 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1050 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1051 ; AVX-NEXT:    vzeroupper
   1052 ; AVX-NEXT:    retq
   1053 ;
   1054 ; AVX512-LABEL: test_v8f64:
   1055 ; AVX512:       # %bb.0:
   1056 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1057 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1058 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1059 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1060 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1061 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1062 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1063 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
   1064 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1065 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1066 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1067 ; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
   1068 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1069 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1070 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1071 ; AVX512-NEXT:    vzeroupper
   1072 ; AVX512-NEXT:    retq
   1073   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
   1074   ret double %1
   1075 }
   1076 
   1077 define double @test_v16f64(double %a0, <16 x double> %a1) {
   1078 ; SSE-LABEL: test_v16f64:
   1079 ; SSE:       # %bb.0:
   1080 ; SSE-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
   1081 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1082 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1083 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1084 ; SSE-NEXT:    mulsd %xmm2, %xmm0
   1085 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1086 ; SSE-NEXT:    mulsd %xmm2, %xmm0
   1087 ; SSE-NEXT:    mulsd %xmm3, %xmm0
   1088 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1089 ; SSE-NEXT:    mulsd %xmm3, %xmm0
   1090 ; SSE-NEXT:    mulsd %xmm4, %xmm0
   1091 ; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
   1092 ; SSE-NEXT:    mulsd %xmm4, %xmm0
   1093 ; SSE-NEXT:    mulsd %xmm5, %xmm0
   1094 ; SSE-NEXT:    movhlps {{.*#+}} xmm5 = xmm5[1,1]
   1095 ; SSE-NEXT:    mulsd %xmm5, %xmm0
   1096 ; SSE-NEXT:    mulsd %xmm6, %xmm0
   1097 ; SSE-NEXT:    movhlps {{.*#+}} xmm6 = xmm6[1,1]
   1098 ; SSE-NEXT:    mulsd %xmm6, %xmm0
   1099 ; SSE-NEXT:    mulsd %xmm7, %xmm0
   1100 ; SSE-NEXT:    movhlps {{.*#+}} xmm7 = xmm7[1,1]
   1101 ; SSE-NEXT:    mulsd %xmm7, %xmm0
   1102 ; SSE-NEXT:    mulsd %xmm8, %xmm0
   1103 ; SSE-NEXT:    movhlps {{.*#+}} xmm8 = xmm8[1,1]
   1104 ; SSE-NEXT:    mulsd %xmm8, %xmm0
   1105 ; SSE-NEXT:    retq
   1106 ;
   1107 ; AVX-LABEL: test_v16f64:
   1108 ; AVX:       # %bb.0:
   1109 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1110 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
   1111 ; AVX-NEXT:    vmulsd %xmm5, %xmm0, %xmm0
   1112 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1113 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1114 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1115 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1116 ; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1117 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1118 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1119 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1120 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1121 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1122 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1123 ; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
   1124 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
   1125 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1126 ; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
   1127 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1128 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1129 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1130 ; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
   1131 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm4[1,0]
   1132 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1133 ; AVX-NEXT:    vextractf128 $1, %ymm4, %xmm1
   1134 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1135 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1136 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1137 ; AVX-NEXT:    vzeroupper
   1138 ; AVX-NEXT:    retq
   1139 ;
   1140 ; AVX512-LABEL: test_v16f64:
   1141 ; AVX512:       # %bb.0:
   1142 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1143 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
   1144 ; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
   1145 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm3
   1146 ; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
   1147 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1148 ; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
   1149 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm3
   1150 ; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
   1151 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1152 ; AVX512-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
   1153 ; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
   1154 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1155 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1156 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1157 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1158 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1159 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1160 ; AVX512-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1161 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1162 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1163 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1164 ; AVX512-NEXT:    vextractf32x4 $2, %zmm2, %xmm1
   1165 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1166 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1167 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1168 ; AVX512-NEXT:    vextractf32x4 $3, %zmm2, %xmm1
   1169 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1170 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1171 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1172 ; AVX512-NEXT:    vzeroupper
   1173 ; AVX512-NEXT:    retq
   1174   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
   1175   ret double %1
   1176 }
   1177 
   1178 ;
   1179 ; vXf64 (one)
   1180 ;
   1181 
   1182 define double @test_v2f64_one(<2 x double> %a0) {
   1183 ; SSE-LABEL: test_v2f64_one:
   1184 ; SSE:       # %bb.0:
   1185 ; SSE-NEXT:    movaps %xmm0, %xmm1
   1186 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
   1187 ; SSE-NEXT:    mulsd %xmm0, %xmm1
   1188 ; SSE-NEXT:    movapd %xmm1, %xmm0
   1189 ; SSE-NEXT:    retq
   1190 ;
   1191 ; AVX-LABEL: test_v2f64_one:
   1192 ; AVX:       # %bb.0:
   1193 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1194 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1195 ; AVX-NEXT:    retq
   1196 ;
   1197 ; AVX512-LABEL: test_v2f64_one:
   1198 ; AVX512:       # %bb.0:
   1199 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1200 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1201 ; AVX512-NEXT:    retq
   1202   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
   1203   ret double %1
   1204 }
   1205 
   1206 define double @test_v4f64_one(<4 x double> %a0) {
   1207 ; SSE-LABEL: test_v4f64_one:
   1208 ; SSE:       # %bb.0:
   1209 ; SSE-NEXT:    movaps %xmm0, %xmm2
   1210 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
   1211 ; SSE-NEXT:    mulsd %xmm0, %xmm2
   1212 ; SSE-NEXT:    mulsd %xmm1, %xmm2
   1213 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1214 ; SSE-NEXT:    mulsd %xmm1, %xmm2
   1215 ; SSE-NEXT:    movapd %xmm2, %xmm0
   1216 ; SSE-NEXT:    retq
   1217 ;
   1218 ; AVX-LABEL: test_v4f64_one:
   1219 ; AVX:       # %bb.0:
   1220 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1221 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
   1222 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1223 ; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
   1224 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1225 ; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
   1226 ; AVX-NEXT:    vzeroupper
   1227 ; AVX-NEXT:    retq
   1228 ;
   1229 ; AVX512-LABEL: test_v4f64_one:
   1230 ; AVX512:       # %bb.0:
   1231 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1232 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
   1233 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1234 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
   1235 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1236 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
   1237 ; AVX512-NEXT:    vzeroupper
   1238 ; AVX512-NEXT:    retq
   1239   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
   1240   ret double %1
   1241 }
   1242 
   1243 define double @test_v8f64_one(<8 x double> %a0) {
   1244 ; SSE-LABEL: test_v8f64_one:
   1245 ; SSE:       # %bb.0:
   1246 ; SSE-NEXT:    movaps %xmm0, %xmm4
   1247 ; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm0[1],xmm4[1]
   1248 ; SSE-NEXT:    mulsd %xmm0, %xmm4
   1249 ; SSE-NEXT:    mulsd %xmm1, %xmm4
   1250 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1251 ; SSE-NEXT:    mulsd %xmm1, %xmm4
   1252 ; SSE-NEXT:    mulsd %xmm2, %xmm4
   1253 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1254 ; SSE-NEXT:    mulsd %xmm2, %xmm4
   1255 ; SSE-NEXT:    mulsd %xmm3, %xmm4
   1256 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1257 ; SSE-NEXT:    mulsd %xmm3, %xmm4
   1258 ; SSE-NEXT:    movapd %xmm4, %xmm0
   1259 ; SSE-NEXT:    retq
   1260 ;
   1261 ; AVX-LABEL: test_v8f64_one:
   1262 ; AVX:       # %bb.0:
   1263 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
   1264 ; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm2
   1265 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1266 ; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
   1267 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1268 ; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
   1269 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1270 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1271 ; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1272 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1273 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1274 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1275 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1276 ; AVX-NEXT:    vzeroupper
   1277 ; AVX-NEXT:    retq
   1278 ;
   1279 ; AVX512-LABEL: test_v8f64_one:
   1280 ; AVX512:       # %bb.0:
   1281 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1282 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm1
   1283 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1284 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
   1285 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1286 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
   1287 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
   1288 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
   1289 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1290 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
   1291 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
   1292 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
   1293 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1294 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
   1295 ; AVX512-NEXT:    vzeroupper
   1296 ; AVX512-NEXT:    retq
   1297   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
   1298   ret double %1
   1299 }
   1300 
   1301 define double @test_v16f64_one(<16 x double> %a0) {
   1302 ; SSE-LABEL: test_v16f64_one:
   1303 ; SSE:       # %bb.0:
   1304 ; SSE-NEXT:    movaps %xmm0, %xmm8
   1305 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1306 ; SSE-NEXT:    mulsd %xmm8, %xmm0
   1307 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1308 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1309 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1310 ; SSE-NEXT:    mulsd %xmm2, %xmm0
   1311 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1312 ; SSE-NEXT:    mulsd %xmm2, %xmm0
   1313 ; SSE-NEXT:    mulsd %xmm3, %xmm0
   1314 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1315 ; SSE-NEXT:    mulsd %xmm3, %xmm0
   1316 ; SSE-NEXT:    mulsd %xmm4, %xmm0
   1317 ; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
   1318 ; SSE-NEXT:    mulsd %xmm4, %xmm0
   1319 ; SSE-NEXT:    mulsd %xmm5, %xmm0
   1320 ; SSE-NEXT:    movhlps {{.*#+}} xmm5 = xmm5[1,1]
   1321 ; SSE-NEXT:    mulsd %xmm5, %xmm0
   1322 ; SSE-NEXT:    mulsd %xmm6, %xmm0
   1323 ; SSE-NEXT:    movhlps {{.*#+}} xmm6 = xmm6[1,1]
   1324 ; SSE-NEXT:    mulsd %xmm6, %xmm0
   1325 ; SSE-NEXT:    mulsd %xmm7, %xmm0
   1326 ; SSE-NEXT:    movhlps {{.*#+}} xmm7 = xmm7[1,1]
   1327 ; SSE-NEXT:    mulsd %xmm7, %xmm0
   1328 ; SSE-NEXT:    retq
   1329 ;
   1330 ; AVX-LABEL: test_v16f64_one:
   1331 ; AVX:       # %bb.0:
   1332 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
   1333 ; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm4
   1334 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1335 ; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm4
   1336 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1337 ; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm0
   1338 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1339 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
   1340 ; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
   1341 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1342 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1343 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1344 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1345 ; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1346 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1347 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1348 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1349 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1350 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1351 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1352 ; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
   1353 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
   1354 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1355 ; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
   1356 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1357 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1358 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1359 ; AVX-NEXT:    vzeroupper
   1360 ; AVX-NEXT:    retq
   1361 ;
   1362 ; AVX512-LABEL: test_v16f64_one:
   1363 ; AVX512:       # %bb.0:
   1364 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
   1365 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm2
   1366 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1367 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
   1368 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1369 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
   1370 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
   1371 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
   1372 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1373 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
   1374 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
   1375 ; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
   1376 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1377 ; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
   1378 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1379 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1380 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1381 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1382 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1383 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1384 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1385 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
   1386 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1387 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1388 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1389 ; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
   1390 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1391 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1392 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1393 ; AVX512-NEXT:    vzeroupper
   1394 ; AVX512-NEXT:    retq
   1395   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
   1396   ret double %1
   1397 }
   1398 
   1399 ;
   1400 ; vXf64 (undef)
   1401 ;
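; These tests pass undef as the scalar accumulator. The leading undef * element
; step appears to fold to a constant, so each chain below opens with a
; constant-pool operand (mulsd/vmulsd {{.*}}(%rip)) instead of a multiply by an
; incoming accumulator register; the remaining elements are then multiplied in
; order as in the other groups.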
   1402 
   1403 define double @test_v2f64_undef(<2 x double> %a0) {
   1404 ; SSE-LABEL: test_v2f64_undef:
   1405 ; SSE:       # %bb.0:
   1406 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1407 ; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm0
   1408 ; SSE-NEXT:    retq
   1409 ;
   1410 ; AVX-LABEL: test_v2f64_undef:
   1411 ; AVX:       # %bb.0:
   1412 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1413 ; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm0, %xmm0
   1414 ; AVX-NEXT:    retq
   1415 ;
   1416 ; AVX512-LABEL: test_v2f64_undef:
   1417 ; AVX512:       # %bb.0:
   1418 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1419 ; AVX512-NEXT:    vmulsd {{.*}}(%rip), %xmm0, %xmm0
   1420 ; AVX512-NEXT:    retq
   1421   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
   1422   ret double %1
   1423 }
   1424 
   1425 define double @test_v4f64_undef(<4 x double> %a0) {
   1426 ; SSE-LABEL: test_v4f64_undef:
   1427 ; SSE:       # %bb.0:
   1428 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1429 ; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm0
   1430 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1431 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1432 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1433 ; SSE-NEXT:    retq
   1434 ;
   1435 ; AVX-LABEL: test_v4f64_undef:
   1436 ; AVX:       # %bb.0:
   1437 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1438 ; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm1, %xmm1
   1439 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1440 ; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
   1441 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1442 ; AVX-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
   1443 ; AVX-NEXT:    vzeroupper
   1444 ; AVX-NEXT:    retq
   1445 ;
   1446 ; AVX512-LABEL: test_v4f64_undef:
   1447 ; AVX512:       # %bb.0:
   1448 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1449 ; AVX512-NEXT:    vmulsd {{.*}}(%rip), %xmm1, %xmm1
   1450 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1451 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
   1452 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1453 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
   1454 ; AVX512-NEXT:    vzeroupper
   1455 ; AVX512-NEXT:    retq
   1456   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
   1457   ret double %1
   1458 }
   1459 
   1460 define double @test_v8f64_undef(<8 x double> %a0) {
   1461 ; SSE-LABEL: test_v8f64_undef:
   1462 ; SSE:       # %bb.0:
   1463 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1464 ; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm0
   1465 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1466 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1467 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1468 ; SSE-NEXT:    mulsd %xmm2, %xmm0
   1469 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1470 ; SSE-NEXT:    mulsd %xmm2, %xmm0
   1471 ; SSE-NEXT:    mulsd %xmm3, %xmm0
   1472 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1473 ; SSE-NEXT:    mulsd %xmm3, %xmm0
   1474 ; SSE-NEXT:    retq
   1475 ;
   1476 ; AVX-LABEL: test_v8f64_undef:
   1477 ; AVX:       # %bb.0:
   1478 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
   1479 ; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm2, %xmm2
   1480 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1481 ; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
   1482 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1483 ; AVX-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
   1484 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1485 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1486 ; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1487 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1488 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1489 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1490 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1491 ; AVX-NEXT:    vzeroupper
   1492 ; AVX-NEXT:    retq
   1493 ;
   1494 ; AVX512-LABEL: test_v8f64_undef:
   1495 ; AVX512:       # %bb.0:
   1496 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
   1497 ; AVX512-NEXT:    vmulsd {{.*}}(%rip), %xmm1, %xmm1
   1498 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1499 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
   1500 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1501 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
   1502 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm2
   1503 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
   1504 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1505 ; AVX512-NEXT:    vmulsd %xmm2, %xmm1, %xmm1
   1506 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
   1507 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm1
   1508 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1509 ; AVX512-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
   1510 ; AVX512-NEXT:    vzeroupper
   1511 ; AVX512-NEXT:    retq
   1512   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
   1513   ret double %1
   1514 }
   1515 
   1516 define double @test_v16f64_undef(<16 x double> %a0) {
   1517 ; SSE-LABEL: test_v16f64_undef:
   1518 ; SSE:       # %bb.0:
   1519 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
   1520 ; SSE-NEXT:    mulsd {{.*}}(%rip), %xmm0
   1521 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1522 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
   1523 ; SSE-NEXT:    mulsd %xmm1, %xmm0
   1524 ; SSE-NEXT:    mulsd %xmm2, %xmm0
   1525 ; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
   1526 ; SSE-NEXT:    mulsd %xmm2, %xmm0
   1527 ; SSE-NEXT:    mulsd %xmm3, %xmm0
   1528 ; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
   1529 ; SSE-NEXT:    mulsd %xmm3, %xmm0
   1530 ; SSE-NEXT:    mulsd %xmm4, %xmm0
   1531 ; SSE-NEXT:    movhlps {{.*#+}} xmm4 = xmm4[1,1]
   1532 ; SSE-NEXT:    mulsd %xmm4, %xmm0
   1533 ; SSE-NEXT:    mulsd %xmm5, %xmm0
   1534 ; SSE-NEXT:    movhlps {{.*#+}} xmm5 = xmm5[1,1]
   1535 ; SSE-NEXT:    mulsd %xmm5, %xmm0
   1536 ; SSE-NEXT:    mulsd %xmm6, %xmm0
   1537 ; SSE-NEXT:    movhlps {{.*#+}} xmm6 = xmm6[1,1]
   1538 ; SSE-NEXT:    mulsd %xmm6, %xmm0
   1539 ; SSE-NEXT:    mulsd %xmm7, %xmm0
   1540 ; SSE-NEXT:    movhlps {{.*#+}} xmm7 = xmm7[1,1]
   1541 ; SSE-NEXT:    mulsd %xmm7, %xmm0
   1542 ; SSE-NEXT:    retq
   1543 ;
   1544 ; AVX-LABEL: test_v16f64_undef:
   1545 ; AVX:       # %bb.0:
   1546 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm0[1,0]
   1547 ; AVX-NEXT:    vmulsd {{.*}}(%rip), %xmm4, %xmm4
   1548 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm0
   1549 ; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm4
   1550 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1551 ; AVX-NEXT:    vmulsd %xmm0, %xmm4, %xmm0
   1552 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1553 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
   1554 ; AVX-NEXT:    vmulsd %xmm4, %xmm0, %xmm0
   1555 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm1
   1556 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1557 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1558 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1559 ; AVX-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1560 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm2[1,0]
   1561 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1562 ; AVX-NEXT:    vextractf128 $1, %ymm2, %xmm1
   1563 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1564 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1565 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1566 ; AVX-NEXT:    vmulsd %xmm3, %xmm0, %xmm0
   1567 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm3[1,0]
   1568 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1569 ; AVX-NEXT:    vextractf128 $1, %ymm3, %xmm1
   1570 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1571 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1572 ; AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1573 ; AVX-NEXT:    vzeroupper
   1574 ; AVX-NEXT:    retq
   1575 ;
   1576 ; AVX512-LABEL: test_v16f64_undef:
   1577 ; AVX512:       # %bb.0:
   1578 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
   1579 ; AVX512-NEXT:    vmulsd {{.*}}(%rip), %xmm2, %xmm2
   1580 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm3
   1581 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
   1582 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1583 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
   1584 ; AVX512-NEXT:    vextractf32x4 $2, %zmm0, %xmm3
   1585 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
   1586 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
   1587 ; AVX512-NEXT:    vmulsd %xmm3, %xmm2, %xmm2
   1588 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
   1589 ; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm2
   1590 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
   1591 ; AVX512-NEXT:    vmulsd %xmm0, %xmm2, %xmm0
   1592 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1593 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
   1594 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1595 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm2
   1596 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1597 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1598 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1599 ; AVX512-NEXT:    vextractf32x4 $2, %zmm1, %xmm2
   1600 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1601 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm2[1,0]
   1602 ; AVX512-NEXT:    vmulsd %xmm2, %xmm0, %xmm0
   1603 ; AVX512-NEXT:    vextractf32x4 $3, %zmm1, %xmm1
   1604 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1605 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
   1606 ; AVX512-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
   1607 ; AVX512-NEXT:    vzeroupper
   1608 ; AVX512-NEXT:    retq
   1609   %1 = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
   1610   ret double %1
   1611 }
   1612 
   1613 declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
   1614 declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
   1615 declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
   1616 declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)
   1617 
   1618 declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
   1619 declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
   1620 declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
   1621 declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
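; Usage sketch for the declarations above (illustrative only, not exercised by
; the RUN lines): the first operand is the scalar start value and the second is
; the vector to reduce, e.g.
;   %r = call double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(
;            double %acc, <4 x double> %v)
; Without fast-math flags the reduction is ordered, i.e.
;   (((%acc * %v[0]) * %v[1]) * %v[2]) * %v[3]
; which matches the strictly serial mulsd/vmulsd chains checked above.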
   1622