; (web-viewer scrape header, kept as a comment so the file parses: Home | History | Annotate | Download | only in X86)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
      8 
      9 ;
     10 ; vXf32 (accum)
     11 ;
     12 
; Fast fmul reduction of <2 x float> %a1 into scalar accumulator %a0. CHECK lines are autogenerated - do not hand-edit.
     13 define float @test_v2f32(float %a0, <2 x float> %a1) {
     14 ; SSE2-LABEL: test_v2f32:
     15 ; SSE2:       # %bb.0:
     16 ; SSE2-NEXT:    movaps %xmm1, %xmm0
     17 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
     18 ; SSE2-NEXT:    mulps %xmm1, %xmm0
     19 ; SSE2-NEXT:    retq
     20 ;
     21 ; SSE41-LABEL: test_v2f32:
     22 ; SSE41:       # %bb.0:
     23 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
     24 ; SSE41-NEXT:    mulps %xmm1, %xmm0
     25 ; SSE41-NEXT:    retq
     26 ;
     27 ; AVX-LABEL: test_v2f32:
     28 ; AVX:       # %bb.0:
     29 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
     30 ; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
     31 ; AVX-NEXT:    retq
     32 ;
     33 ; AVX512-LABEL: test_v2f32:
     34 ; AVX512:       # %bb.0:
     35 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
     36 ; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
     37 ; AVX512-NEXT:    retq
     38   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float %a0, <2 x float> %a1)
     39   ret float %1
     40 }
     41 
; Fast fmul reduction of <4 x float> %a1 into scalar accumulator %a0. CHECK lines are autogenerated - do not hand-edit.
     42 define float @test_v4f32(float %a0, <4 x float> %a1) {
     43 ; SSE2-LABEL: test_v4f32:
     44 ; SSE2:       # %bb.0:
     45 ; SSE2-NEXT:    movaps %xmm1, %xmm2
     46 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
     47 ; SSE2-NEXT:    mulps %xmm1, %xmm2
     48 ; SSE2-NEXT:    movaps %xmm2, %xmm0
     49 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
     50 ; SSE2-NEXT:    mulps %xmm2, %xmm0
     51 ; SSE2-NEXT:    retq
     52 ;
     53 ; SSE41-LABEL: test_v4f32:
     54 ; SSE41:       # %bb.0:
     55 ; SSE41-NEXT:    movaps %xmm1, %xmm2
     56 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
     57 ; SSE41-NEXT:    mulps %xmm1, %xmm2
     58 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
     59 ; SSE41-NEXT:    mulps %xmm2, %xmm0
     60 ; SSE41-NEXT:    retq
     61 ;
     62 ; AVX-LABEL: test_v4f32:
     63 ; AVX:       # %bb.0:
     64 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
     65 ; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
     66 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
     67 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
     68 ; AVX-NEXT:    retq
     69 ;
     70 ; AVX512-LABEL: test_v4f32:
     71 ; AVX512:       # %bb.0:
     72 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
     73 ; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
     74 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
     75 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
     76 ; AVX512-NEXT:    retq
     77   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float %a0, <4 x float> %a1)
     78   ret float %1
     79 }
     80 
; Fast fmul reduction of <8 x float> %a1 into scalar accumulator %a0. CHECK lines are autogenerated - do not hand-edit.
     81 define float @test_v8f32(float %a0, <8 x float> %a1) {
     82 ; SSE2-LABEL: test_v8f32:
     83 ; SSE2:       # %bb.0:
     84 ; SSE2-NEXT:    mulps %xmm2, %xmm1
     85 ; SSE2-NEXT:    movaps %xmm1, %xmm2
     86 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
     87 ; SSE2-NEXT:    mulps %xmm1, %xmm2
     88 ; SSE2-NEXT:    movaps %xmm2, %xmm0
     89 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
     90 ; SSE2-NEXT:    mulps %xmm2, %xmm0
     91 ; SSE2-NEXT:    retq
     92 ;
     93 ; SSE41-LABEL: test_v8f32:
     94 ; SSE41:       # %bb.0:
     95 ; SSE41-NEXT:    mulps %xmm2, %xmm1
     96 ; SSE41-NEXT:    movaps %xmm1, %xmm2
     97 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
     98 ; SSE41-NEXT:    mulps %xmm1, %xmm2
     99 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
    100 ; SSE41-NEXT:    mulps %xmm2, %xmm0
    101 ; SSE41-NEXT:    retq
    102 ;
    103 ; AVX-LABEL: test_v8f32:
    104 ; AVX:       # %bb.0:
    105 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
    106 ; AVX-NEXT:    vmulps %ymm0, %ymm1, %ymm0
    107 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    108 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    109 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    110 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    111 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    112 ; AVX-NEXT:    vzeroupper
    113 ; AVX-NEXT:    retq
    114 ;
    115 ; AVX512-LABEL: test_v8f32:
    116 ; AVX512:       # %bb.0:
    117 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
    118 ; AVX512-NEXT:    vmulps %ymm0, %ymm1, %ymm0
    119 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    120 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    121 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    122 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    123 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    124 ; AVX512-NEXT:    vzeroupper
    125 ; AVX512-NEXT:    retq
    126   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float %a0, <8 x float> %a1)
    127   ret float %1
    128 }
    129 
; Fast fmul reduction of <16 x float> %a1 into scalar accumulator %a0. CHECK lines are autogenerated - do not hand-edit.
    130 define float @test_v16f32(float %a0, <16 x float> %a1) {
    131 ; SSE2-LABEL: test_v16f32:
    132 ; SSE2:       # %bb.0:
    133 ; SSE2-NEXT:    mulps %xmm4, %xmm2
    134 ; SSE2-NEXT:    mulps %xmm3, %xmm1
    135 ; SSE2-NEXT:    mulps %xmm2, %xmm1
    136 ; SSE2-NEXT:    movaps %xmm1, %xmm2
    137 ; SSE2-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    138 ; SSE2-NEXT:    mulps %xmm1, %xmm2
    139 ; SSE2-NEXT:    movaps %xmm2, %xmm0
    140 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm2[2,3]
    141 ; SSE2-NEXT:    mulps %xmm2, %xmm0
    142 ; SSE2-NEXT:    retq
    143 ;
    144 ; SSE41-LABEL: test_v16f32:
    145 ; SSE41:       # %bb.0:
    146 ; SSE41-NEXT:    mulps %xmm4, %xmm2
    147 ; SSE41-NEXT:    mulps %xmm3, %xmm1
    148 ; SSE41-NEXT:    mulps %xmm2, %xmm1
    149 ; SSE41-NEXT:    movaps %xmm1, %xmm2
    150 ; SSE41-NEXT:    movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
    151 ; SSE41-NEXT:    mulps %xmm1, %xmm2
    152 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
    153 ; SSE41-NEXT:    mulps %xmm2, %xmm0
    154 ; SSE41-NEXT:    retq
    155 ;
    156 ; AVX-LABEL: test_v16f32:
    157 ; AVX:       # %bb.0:
    158 ; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm0
    159 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    160 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    161 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    162 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    163 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    164 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    165 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    166 ; AVX-NEXT:    vzeroupper
    167 ; AVX-NEXT:    retq
    168 ;
    169 ; AVX512-LABEL: test_v16f32:
    170 ; AVX512:       # %bb.0:
    171 ; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
    172 ; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
    173 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    174 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    175 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    176 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    177 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    178 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    179 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    180 ; AVX512-NEXT:    vzeroupper
    181 ; AVX512-NEXT:    retq
    182   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float %a0, <16 x float> %a1)
    183   ret float %1
    184 }
    185 
    186 ;
    187 ; vXf32 (one) - accumulator is the multiplicative identity 1.0; the "_zero" suffix in the test names is legacy
    188 ;
    189 
; Fast fmul reduction of <2 x float> %a0 with constant accumulator 1.0. CHECK lines are autogenerated - do not hand-edit.
    190 define float @test_v2f32_zero(<2 x float> %a0) {
    191 ; SSE2-LABEL: test_v2f32_zero:
    192 ; SSE2:       # %bb.0:
    193 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    194 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
    195 ; SSE2-NEXT:    mulps %xmm0, %xmm1
    196 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    197 ; SSE2-NEXT:    retq
    198 ;
    199 ; SSE41-LABEL: test_v2f32_zero:
    200 ; SSE41:       # %bb.0:
    201 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    202 ; SSE41-NEXT:    mulps %xmm1, %xmm0
    203 ; SSE41-NEXT:    retq
    204 ;
    205 ; AVX-LABEL: test_v2f32_zero:
    206 ; AVX:       # %bb.0:
    207 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    208 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    209 ; AVX-NEXT:    retq
    210 ;
    211 ; AVX512-LABEL: test_v2f32_zero:
    212 ; AVX512:       # %bb.0:
    213 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    214 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    215 ; AVX512-NEXT:    retq
    216   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float 1.0, <2 x float> %a0)
    217   ret float %1
    218 }
    219 
; Fast fmul reduction of <4 x float> %a0 with constant accumulator 1.0. CHECK lines are autogenerated - do not hand-edit.
    220 define float @test_v4f32_zero(<4 x float> %a0) {
    221 ; SSE2-LABEL: test_v4f32_zero:
    222 ; SSE2:       # %bb.0:
    223 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    224 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    225 ; SSE2-NEXT:    mulps %xmm0, %xmm1
    226 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    227 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
    228 ; SSE2-NEXT:    mulps %xmm1, %xmm0
    229 ; SSE2-NEXT:    retq
    230 ;
    231 ; SSE41-LABEL: test_v4f32_zero:
    232 ; SSE41:       # %bb.0:
    233 ; SSE41-NEXT:    movaps %xmm0, %xmm1
    234 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    235 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    236 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
    237 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    238 ; SSE41-NEXT:    movaps %xmm1, %xmm0
    239 ; SSE41-NEXT:    retq
    240 ;
    241 ; AVX-LABEL: test_v4f32_zero:
    242 ; AVX:       # %bb.0:
    243 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    244 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    245 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    246 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    247 ; AVX-NEXT:    retq
    248 ;
    249 ; AVX512-LABEL: test_v4f32_zero:
    250 ; AVX512:       # %bb.0:
    251 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    252 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    253 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    254 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    255 ; AVX512-NEXT:    retq
    256   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float 1.0, <4 x float> %a0)
    257   ret float %1
    258 }
    259 
; Fast fmul reduction of <8 x float> %a0 with constant accumulator 1.0. CHECK lines are autogenerated - do not hand-edit.
    260 define float @test_v8f32_zero(<8 x float> %a0) {
    261 ; SSE2-LABEL: test_v8f32_zero:
    262 ; SSE2:       # %bb.0:
    263 ; SSE2-NEXT:    mulps %xmm1, %xmm0
    264 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    265 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    266 ; SSE2-NEXT:    mulps %xmm0, %xmm1
    267 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    268 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
    269 ; SSE2-NEXT:    mulps %xmm1, %xmm0
    270 ; SSE2-NEXT:    retq
    271 ;
    272 ; SSE41-LABEL: test_v8f32_zero:
    273 ; SSE41:       # %bb.0:
    274 ; SSE41-NEXT:    mulps %xmm1, %xmm0
    275 ; SSE41-NEXT:    movaps %xmm0, %xmm1
    276 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    277 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    278 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
    279 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    280 ; SSE41-NEXT:    movaps %xmm1, %xmm0
    281 ; SSE41-NEXT:    retq
    282 ;
    283 ; AVX-LABEL: test_v8f32_zero:
    284 ; AVX:       # %bb.0:
    285 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    286 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    287 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    288 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    289 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    290 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    291 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    292 ; AVX-NEXT:    vzeroupper
    293 ; AVX-NEXT:    retq
    294 ;
    295 ; AVX512-LABEL: test_v8f32_zero:
    296 ; AVX512:       # %bb.0:
    297 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    298 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    299 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    300 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    301 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    302 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    303 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    304 ; AVX512-NEXT:    vzeroupper
    305 ; AVX512-NEXT:    retq
    306   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float 1.0, <8 x float> %a0)
    307   ret float %1
    308 }
    309 
; Fast fmul reduction of <16 x float> %a0 with constant accumulator 1.0. CHECK lines are autogenerated - do not hand-edit.
    310 define float @test_v16f32_zero(<16 x float> %a0) {
    311 ; SSE2-LABEL: test_v16f32_zero:
    312 ; SSE2:       # %bb.0:
    313 ; SSE2-NEXT:    mulps %xmm3, %xmm1
    314 ; SSE2-NEXT:    mulps %xmm2, %xmm0
    315 ; SSE2-NEXT:    mulps %xmm1, %xmm0
    316 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    317 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    318 ; SSE2-NEXT:    mulps %xmm0, %xmm1
    319 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    320 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
    321 ; SSE2-NEXT:    mulps %xmm1, %xmm0
    322 ; SSE2-NEXT:    retq
    323 ;
    324 ; SSE41-LABEL: test_v16f32_zero:
    325 ; SSE41:       # %bb.0:
    326 ; SSE41-NEXT:    mulps %xmm3, %xmm1
    327 ; SSE41-NEXT:    mulps %xmm2, %xmm0
    328 ; SSE41-NEXT:    mulps %xmm1, %xmm0
    329 ; SSE41-NEXT:    movaps %xmm0, %xmm1
    330 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    331 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    332 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
    333 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    334 ; SSE41-NEXT:    movaps %xmm1, %xmm0
    335 ; SSE41-NEXT:    retq
    336 ;
    337 ; AVX-LABEL: test_v16f32_zero:
    338 ; AVX:       # %bb.0:
    339 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    340 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    341 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    342 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    343 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    344 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    345 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    346 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    347 ; AVX-NEXT:    vzeroupper
    348 ; AVX-NEXT:    retq
    349 ;
    350 ; AVX512-LABEL: test_v16f32_zero:
    351 ; AVX512:       # %bb.0:
    352 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
    353 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    354 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    355 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    356 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    357 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    358 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    359 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    360 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    361 ; AVX512-NEXT:    vzeroupper
    362 ; AVX512-NEXT:    retq
    363   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float 1.0, <16 x float> %a0)
    364   ret float %1
    365 }
    366 
    367 ;
    368 ; vXf32 (undef)
    369 ;
    370 
; Fast fmul reduction of <2 x float> %a0 with undef accumulator. CHECK lines are autogenerated - do not hand-edit.
    371 define float @test_v2f32_undef(<2 x float> %a0) {
    372 ; SSE2-LABEL: test_v2f32_undef:
    373 ; SSE2:       # %bb.0:
    374 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    375 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
    376 ; SSE2-NEXT:    mulps %xmm0, %xmm1
    377 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    378 ; SSE2-NEXT:    retq
    379 ;
    380 ; SSE41-LABEL: test_v2f32_undef:
    381 ; SSE41:       # %bb.0:
    382 ; SSE41-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    383 ; SSE41-NEXT:    mulps %xmm1, %xmm0
    384 ; SSE41-NEXT:    retq
    385 ;
    386 ; AVX-LABEL: test_v2f32_undef:
    387 ; AVX:       # %bb.0:
    388 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    389 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    390 ; AVX-NEXT:    retq
    391 ;
    392 ; AVX512-LABEL: test_v2f32_undef:
    393 ; AVX512:       # %bb.0:
    394 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    395 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    396 ; AVX512-NEXT:    retq
    397   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float undef, <2 x float> %a0)
    398   ret float %1
    399 }
    400 
; Fast fmul reduction of <4 x float> %a0 with undef accumulator. CHECK lines are autogenerated - do not hand-edit.
    401 define float @test_v4f32_undef(<4 x float> %a0) {
    402 ; SSE2-LABEL: test_v4f32_undef:
    403 ; SSE2:       # %bb.0:
    404 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    405 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    406 ; SSE2-NEXT:    mulps %xmm0, %xmm1
    407 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    408 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
    409 ; SSE2-NEXT:    mulps %xmm1, %xmm0
    410 ; SSE2-NEXT:    retq
    411 ;
    412 ; SSE41-LABEL: test_v4f32_undef:
    413 ; SSE41:       # %bb.0:
    414 ; SSE41-NEXT:    movaps %xmm0, %xmm1
    415 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    416 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    417 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
    418 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    419 ; SSE41-NEXT:    movaps %xmm1, %xmm0
    420 ; SSE41-NEXT:    retq
    421 ;
    422 ; AVX-LABEL: test_v4f32_undef:
    423 ; AVX:       # %bb.0:
    424 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    425 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    426 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    427 ; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    428 ; AVX-NEXT:    retq
    429 ;
    430 ; AVX512-LABEL: test_v4f32_undef:
    431 ; AVX512:       # %bb.0:
    432 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    433 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    434 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    435 ; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
    436 ; AVX512-NEXT:    retq
    437   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> %a0)
    438   ret float %1
    439 }
    440 
; Fast fmul reduction of <8 x float> %a0 with undef accumulator. CHECK lines are autogenerated - do not hand-edit.
    441 define float @test_v8f32_undef(<8 x float> %a0) {
    442 ; SSE2-LABEL: test_v8f32_undef:
    443 ; SSE2:       # %bb.0:
    444 ; SSE2-NEXT:    mulps %xmm1, %xmm0
    445 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    446 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    447 ; SSE2-NEXT:    mulps %xmm0, %xmm1
    448 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    449 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
    450 ; SSE2-NEXT:    mulps %xmm1, %xmm0
    451 ; SSE2-NEXT:    retq
    452 ;
    453 ; SSE41-LABEL: test_v8f32_undef:
    454 ; SSE41:       # %bb.0:
    455 ; SSE41-NEXT:    mulps %xmm1, %xmm0
    456 ; SSE41-NEXT:    movaps %xmm0, %xmm1
    457 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    458 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    459 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
    460 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    461 ; SSE41-NEXT:    movaps %xmm1, %xmm0
    462 ; SSE41-NEXT:    retq
    463 ;
    464 ; AVX-LABEL: test_v8f32_undef:
    465 ; AVX:       # %bb.0:
    466 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    467 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    468 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    469 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    470 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    471 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    472 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    473 ; AVX-NEXT:    vzeroupper
    474 ; AVX-NEXT:    retq
    475 ;
    476 ; AVX512-LABEL: test_v8f32_undef:
    477 ; AVX512:       # %bb.0:
    478 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    479 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    480 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    481 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    482 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    483 ; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    484 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    485 ; AVX512-NEXT:    vzeroupper
    486 ; AVX512-NEXT:    retq
    487   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float undef, <8 x float> %a0)
    488   ret float %1
    489 }
    490 
; Fast fmul reduction of <16 x float> %a0 with undef accumulator. CHECK lines are autogenerated - do not hand-edit.
    491 define float @test_v16f32_undef(<16 x float> %a0) {
    492 ; SSE2-LABEL: test_v16f32_undef:
    493 ; SSE2:       # %bb.0:
    494 ; SSE2-NEXT:    mulps %xmm3, %xmm1
    495 ; SSE2-NEXT:    mulps %xmm2, %xmm0
    496 ; SSE2-NEXT:    mulps %xmm1, %xmm0
    497 ; SSE2-NEXT:    movaps %xmm0, %xmm1
    498 ; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    499 ; SSE2-NEXT:    mulps %xmm0, %xmm1
    500 ; SSE2-NEXT:    movaps %xmm1, %xmm0
    501 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[2,3]
    502 ; SSE2-NEXT:    mulps %xmm1, %xmm0
    503 ; SSE2-NEXT:    retq
    504 ;
    505 ; SSE41-LABEL: test_v16f32_undef:
    506 ; SSE41:       # %bb.0:
    507 ; SSE41-NEXT:    mulps %xmm3, %xmm1
    508 ; SSE41-NEXT:    mulps %xmm2, %xmm0
    509 ; SSE41-NEXT:    mulps %xmm1, %xmm0
    510 ; SSE41-NEXT:    movaps %xmm0, %xmm1
    511 ; SSE41-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    512 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    513 ; SSE41-NEXT:    movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
    514 ; SSE41-NEXT:    mulps %xmm0, %xmm1
    515 ; SSE41-NEXT:    movaps %xmm1, %xmm0
    516 ; SSE41-NEXT:    retq
    517 ;
    518 ; AVX-LABEL: test_v16f32_undef:
    519 ; AVX:       # %bb.0:
    520 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    521 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    522 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    523 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    524 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    525 ; AVX-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    526 ; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
    527 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    528 ; AVX-NEXT:    vzeroupper
    529 ; AVX-NEXT:    retq
    530 ;
    531 ; AVX512-LABEL: test_v16f32_undef:
    532 ; AVX512:       # %bb.0:
    533 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
    534 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    535 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    536 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    537 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    538 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    539 ; AVX512-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
    540 ; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
    541 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    542 ; AVX512-NEXT:    vzeroupper
    543 ; AVX512-NEXT:    retq
    544   %1 = call fast float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float undef, <16 x float> %a0)
    545   ret float %1
    546 }
    547 
    548 ;
    549 ; vXf64 (accum)
    550 ;
    551 
; Fast fmul reduction of <2 x double> %a1 into scalar accumulator %a0. CHECK lines are autogenerated - do not hand-edit.
    552 define double @test_v2f64(double %a0, <2 x double> %a1) {
    553 ; SSE-LABEL: test_v2f64:
    554 ; SSE:       # %bb.0:
    555 ; SSE-NEXT:    movaps %xmm1, %xmm0
    556 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
    557 ; SSE-NEXT:    mulpd %xmm1, %xmm0
    558 ; SSE-NEXT:    retq
    559 ;
    560 ; AVX-LABEL: test_v2f64:
    561 ; AVX:       # %bb.0:
    562 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
    563 ; AVX-NEXT:    vmulpd %xmm0, %xmm1, %xmm0
    564 ; AVX-NEXT:    retq
    565 ;
    566 ; AVX512-LABEL: test_v2f64:
    567 ; AVX512:       # %bb.0:
    568 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
    569 ; AVX512-NEXT:    vmulpd %xmm0, %xmm1, %xmm0
    570 ; AVX512-NEXT:    retq
    571   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double %a0, <2 x double> %a1)
    572   ret double %1
    573 }
    574 
; Fast fmul reduction of <4 x double> %a1 into scalar accumulator %a0. CHECK lines are autogenerated - do not hand-edit.
    575 define double @test_v4f64(double %a0, <4 x double> %a1) {
    576 ; SSE-LABEL: test_v4f64:
    577 ; SSE:       # %bb.0:
    578 ; SSE-NEXT:    mulpd %xmm2, %xmm1
    579 ; SSE-NEXT:    movapd %xmm1, %xmm0
    580 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
    581 ; SSE-NEXT:    mulpd %xmm1, %xmm0
    582 ; SSE-NEXT:    retq
    583 ;
    584 ; AVX-LABEL: test_v4f64:
    585 ; AVX:       # %bb.0:
    586 ; AVX-NEXT:    vextractf128 $1, %ymm1, %xmm0
    587 ; AVX-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
    588 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    589 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
    590 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    591 ; AVX-NEXT:    vzeroupper
    592 ; AVX-NEXT:    retq
    593 ;
    594 ; AVX512-LABEL: test_v4f64:
    595 ; AVX512:       # %bb.0:
    596 ; AVX512-NEXT:    vextractf128 $1, %ymm1, %xmm0
    597 ; AVX512-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
    598 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    599 ; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
    600 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    601 ; AVX512-NEXT:    vzeroupper
    602 ; AVX512-NEXT:    retq
    603   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double %a0, <4 x double> %a1)
    604   ret double %1
    605 }
    606 
; Fast fmul reduction of <8 x double> %a1 into scalar accumulator %a0. CHECK lines are autogenerated - do not hand-edit.
    607 define double @test_v8f64(double %a0, <8 x double> %a1) {
    608 ; SSE-LABEL: test_v8f64:
    609 ; SSE:       # %bb.0:
    610 ; SSE-NEXT:    mulpd %xmm4, %xmm2
    611 ; SSE-NEXT:    mulpd %xmm3, %xmm1
    612 ; SSE-NEXT:    mulpd %xmm2, %xmm1
    613 ; SSE-NEXT:    movapd %xmm1, %xmm0
    614 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
    615 ; SSE-NEXT:    mulpd %xmm1, %xmm0
    616 ; SSE-NEXT:    retq
    617 ;
    618 ; AVX-LABEL: test_v8f64:
    619 ; AVX:       # %bb.0:
    620 ; AVX-NEXT:    vmulpd %ymm2, %ymm1, %ymm0
    621 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    622 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
    623 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    624 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
    625 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    626 ; AVX-NEXT:    vzeroupper
    627 ; AVX-NEXT:    retq
    628 ;
    629 ; AVX512-LABEL: test_v8f64:
    630 ; AVX512:       # %bb.0:
    631 ; AVX512-NEXT:    vextractf64x4 $1, %zmm1, %ymm0
    632 ; AVX512-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
    633 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    634 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
    635 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    636 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
    637 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    638 ; AVX512-NEXT:    vzeroupper
    639 ; AVX512-NEXT:    retq
    640   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double %a0, <8 x double> %a1)
    641   ret double %1
    642 }
    643 
; Fast fmul reduction of <16 x double> %a1 into scalar accumulator %a0 (SSE passes the last vector on the stack). CHECK lines are autogenerated - do not hand-edit.
    644 define double @test_v16f64(double %a0, <16 x double> %a1) {
    645 ; SSE-LABEL: test_v16f64:
    646 ; SSE:       # %bb.0:
    647 ; SSE-NEXT:    mulpd %xmm6, %xmm2
    648 ; SSE-NEXT:    mulpd %xmm7, %xmm3
    649 ; SSE-NEXT:    mulpd %xmm5, %xmm1
    650 ; SSE-NEXT:    mulpd %xmm3, %xmm1
    651 ; SSE-NEXT:    mulpd {{[0-9]+}}(%rsp), %xmm4
    652 ; SSE-NEXT:    mulpd %xmm2, %xmm4
    653 ; SSE-NEXT:    mulpd %xmm1, %xmm4
    654 ; SSE-NEXT:    movapd %xmm4, %xmm0
    655 ; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm4[1],xmm0[1]
    656 ; SSE-NEXT:    mulpd %xmm4, %xmm0
    657 ; SSE-NEXT:    retq
    658 ;
    659 ; AVX-LABEL: test_v16f64:
    660 ; AVX:       # %bb.0:
    661 ; AVX-NEXT:    vmulpd %ymm4, %ymm2, %ymm0
    662 ; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
    663 ; AVX-NEXT:    vmulpd %ymm0, %ymm1, %ymm0
    664 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    665 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
    666 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    667 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
    668 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    669 ; AVX-NEXT:    vzeroupper
    670 ; AVX-NEXT:    retq
    671 ;
    672 ; AVX512-LABEL: test_v16f64:
    673 ; AVX512:       # %bb.0:
    674 ; AVX512-NEXT:    vmulpd %zmm2, %zmm1, %zmm0
    675 ; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
    676 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
    677 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    678 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
    679 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    680 ; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
    681 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
    682 ; AVX512-NEXT:    vzeroupper
    683 ; AVX512-NEXT:    retq
    684   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double %a0, <16 x double> %a1)
    685   ret double %1
    686 }
    687 
    688 ;
    689 ; vXf64 (one) - accumulator is the multiplicative identity 1.0; the "_zero" suffix in the test names is legacy
    690 ;
    691 
; Fast fmul reduction of <2 x double> %a0 with constant accumulator 1.0. CHECK lines are autogenerated - do not hand-edit.
    692 define double @test_v2f64_zero(<2 x double> %a0) {
    693 ; SSE-LABEL: test_v2f64_zero:
    694 ; SSE:       # %bb.0:
    695 ; SSE-NEXT:    movaps %xmm0, %xmm1
    696 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    697 ; SSE-NEXT:    mulpd %xmm0, %xmm1
    698 ; SSE-NEXT:    movapd %xmm1, %xmm0
    699 ; SSE-NEXT:    retq
    700 ;
    701 ; AVX-LABEL: test_v2f64_zero:
    702 ; AVX:       # %bb.0:
    703 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    704 ; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
    705 ; AVX-NEXT:    retq
    706 ;
    707 ; AVX512-LABEL: test_v2f64_zero:
    708 ; AVX512:       # %bb.0:
    709 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    710 ; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
    711 ; AVX512-NEXT:    retq
    712   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double 1.0, <2 x double> %a0)
    713   ret double %1
    714 }
    715 
; Fast fmul reduction of <4 x double> %a0 with constant accumulator 1.0. CHECK lines are autogenerated - do not hand-edit.
    716 define double @test_v4f64_zero(<4 x double> %a0) {
    717 ; SSE-LABEL: test_v4f64_zero:
    718 ; SSE:       # %bb.0:
    719 ; SSE-NEXT:    mulpd %xmm1, %xmm0
    720 ; SSE-NEXT:    movapd %xmm0, %xmm1
    721 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
    722 ; SSE-NEXT:    mulpd %xmm0, %xmm1
    723 ; SSE-NEXT:    movapd %xmm1, %xmm0
    724 ; SSE-NEXT:    retq
    725 ;
    726 ; AVX-LABEL: test_v4f64_zero:
    727 ; AVX:       # %bb.0:
    728 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
    729 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
    730 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    731 ; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
    732 ; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    733 ; AVX-NEXT:    vzeroupper
    734 ; AVX-NEXT:    retq
    735 ;
    736 ; AVX512-LABEL: test_v4f64_zero:
    737 ; AVX512:       # %bb.0:
    738 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
    739 ; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
    740 ; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
    741 ; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
    742 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
    743 ; AVX512-NEXT:    vzeroupper
    744 ; AVX512-NEXT:    retq
    745   %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double 1.0, <4 x double> %a0)
    746   ret double %1
    747 }
    748 
; Product reduction of <8 x double> with the 1.0 identity accumulator
; (the "_zero" suffix refers to the no-op start value, not literal zero).
; SSE combines the four xmm inputs pairwise before the in-register tree;
; the AVX512 checks show full-width zmm multiplies even for the narrower
; tail steps (upper lanes are don't-care) -- captured codegen, not a bug here.
define double @test_v8f64_zero(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double 1.0, <8 x double> %a0)
  ret double %1
}
    786 
; Product reduction of <16 x double> (eight xmm / four ymm / two zmm inputs)
; with the 1.0 identity accumulator. The checks pin a full multiply tree:
; combine register halves first, then shuffle+multiply down to the scalar.
define double @test_v16f64_zero(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_zero:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double 1.0, <16 x double> %a0)
  ret double %1
}
    830 
    831 ;
    832 ; vXf64 (undef)
    833 ;
    834 
; Product reduction of <2 x double> with an undef start value; under 'fast'
; math the accumulator is elided, so the expected code matches the identity
; (1.0) variant: one high-to-low shuffle plus one multiply.
define double @test_v2f64_undef(<2 x double> %a0) {
; SSE-LABEL: test_v2f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v2f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v2f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double undef, <2 x double> %a0)
  ret double %1
}
    858 
; Product reduction of <4 x double> with an undef start value; the checked
; output is identical to the 1.0-accumulator variant above -- the undef
; accumulator contributes no code under 'fast' math.
define double @test_v4f64_undef(<4 x double> %a0) {
; SSE-LABEL: test_v4f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v4f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double undef, <4 x double> %a0)
  ret double %1
}
    891 
; Product reduction of <8 x double> with an undef start value; expected
; code matches test_v8f64_zero since 'fast' math drops the accumulator.
define double @test_v8f64_undef(<8 x double> %a0) {
; SSE-LABEL: test_v8f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    movapd %xmm0, %xmm1
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double undef, <8 x double> %a0)
  ret double %1
}
    929 
; Product reduction of <16 x double> with an undef start value; expected
; code matches test_v16f64_zero -- 'fast' math elides the accumulator.
define double @test_v16f64_undef(<16 x double> %a0) {
; SSE-LABEL: test_v16f64_undef:
; SSE:       # %bb.0:
; SSE-NEXT:    mulpd %xmm6, %xmm2
; SSE-NEXT:    mulpd %xmm4, %xmm0
; SSE-NEXT:    mulpd %xmm2, %xmm0
; SSE-NEXT:    mulpd %xmm7, %xmm3
; SSE-NEXT:    mulpd %xmm5, %xmm1
; SSE-NEXT:    mulpd %xmm3, %xmm1
; SSE-NEXT:    mulpd %xmm0, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; SSE-NEXT:    mulpd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v16f64_undef:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
; AVX-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
; AVX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v16f64_undef:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call fast double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double undef, <16 x double> %a0)
  ret double %1
}
    973 
; Declarations of the experimental fmul reduction intrinsics used above.
; The scalar first operand is the start/accumulator value folded into the
; vector product (elided by the tests' 'fast' flag when it is 1.0 or undef).
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v2f32(float, <2 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float, <4 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v8f32(float, <8 x float>)
declare float @llvm.experimental.vector.reduce.fmul.f32.f32.v16f32(float, <16 x float>)

declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v2f64(double, <2 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v4f64(double, <4 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v8f64(double, <8 x double>)
declare double @llvm.experimental.vector.reduce.fmul.f64.f64.v16f64(double, <16 x double>)
    983