; X86 codegen tests for 64-bit vector multiply reduction (llvm/test/CodeGen/X86)
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
      4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
      6 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512BWVL
      8 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512DQ
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL --check-prefix=AVX512DQVL
     10 
     11 ;
     12 ; vXi64
     13 ;
     14 
      15 define i64 @test_v2i64(<2 x i64> %a0) {
      16 ; SSE-LABEL: test_v2i64:
      17 ; SSE:       # %bb.0:
      18 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
      19 ; SSE-NEXT:    movdqa %xmm0, %xmm2
      20 ; SSE-NEXT:    psrlq $32, %xmm2
      21 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
      22 ; SSE-NEXT:    movdqa %xmm1, %xmm3
      23 ; SSE-NEXT:    psrlq $32, %xmm3
      24 ; SSE-NEXT:    pmuludq %xmm0, %xmm3
      25 ; SSE-NEXT:    paddq %xmm2, %xmm3
      26 ; SSE-NEXT:    psllq $32, %xmm3
      27 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
      28 ; SSE-NEXT:    paddq %xmm3, %xmm0
      29 ; SSE-NEXT:    movq %xmm0, %rax
      30 ; SSE-NEXT:    retq
      31 ;
      32 ; AVX-LABEL: test_v2i64:
      33 ; AVX:       # %bb.0:
      34 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
      35 ; AVX-NEXT:    vpsrlq $32, %xmm0, %xmm2
      36 ; AVX-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
      37 ; AVX-NEXT:    vpsrlq $32, %xmm1, %xmm3
      38 ; AVX-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
      39 ; AVX-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
      40 ; AVX-NEXT:    vpsllq $32, %xmm2, %xmm2
      41 ; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
      42 ; AVX-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
      43 ; AVX-NEXT:    vmovq %xmm0, %rax
      44 ; AVX-NEXT:    retq
      45 ;
      46 ; AVX512BW-LABEL: test_v2i64:
      47 ; AVX512BW:       # %bb.0:
      48 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
      49 ; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm2
      50 ; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
      51 ; AVX512BW-NEXT:    vpsrlq $32, %xmm1, %xmm3
      52 ; AVX512BW-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
      53 ; AVX512BW-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
      54 ; AVX512BW-NEXT:    vpsllq $32, %xmm2, %xmm2
      55 ; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
      56 ; AVX512BW-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
      57 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
      58 ; AVX512BW-NEXT:    retq
      59 ;
      60 ; AVX512BWVL-LABEL: test_v2i64:
      61 ; AVX512BWVL:       # %bb.0:
      62 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
      63 ; AVX512BWVL-NEXT:    vpsrlq $32, %xmm0, %xmm2
      64 ; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
      65 ; AVX512BWVL-NEXT:    vpsrlq $32, %xmm1, %xmm3
      66 ; AVX512BWVL-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
      67 ; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
      68 ; AVX512BWVL-NEXT:    vpsllq $32, %xmm2, %xmm2
      69 ; AVX512BWVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
      70 ; AVX512BWVL-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
      71 ; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
      72 ; AVX512BWVL-NEXT:    retq
      73 ;
      74 ; AVX512DQ-LABEL: test_v2i64:
      75 ; AVX512DQ:       # %bb.0:
      76 ; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
      77 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
      78 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
      79 ; AVX512DQ-NEXT:    vmovq %xmm0, %rax
      80 ; AVX512DQ-NEXT:    vzeroupper
      81 ; AVX512DQ-NEXT:    retq
      82 ;
      83 ; AVX512DQVL-LABEL: test_v2i64:
      84 ; AVX512DQVL:       # %bb.0:
      85 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
      86 ; AVX512DQVL-NEXT:    vpmullq %xmm1, %xmm0, %xmm0
      87 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
      88 ; AVX512DQVL-NEXT:    retq
; Multiply-reduce the two i64 lanes to a scalar. The CHECK lines above are
; autogenerated (update_llc_test_checks.py) -- do not hand-edit them.
; Targets without AVX512DQ have no 64-bit vector multiply, so the codegen
; synthesizes it from 32x32->64 pmuludq partial products:
;   lo*lo + ((hi*lo + lo*hi) << 32)
; while AVX512DQ/DQVL can use vpmullq directly (DQ without VL must widen to
; zmm, hence the kill/vzeroupper).
      89   %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a0)
      90   ret i64 %1
      91 }
     92 
      93 define i64 @test_v4i64(<4 x i64> %a0) {
      94 ; SSE-LABEL: test_v4i64:
      95 ; SSE:       # %bb.0:
      96 ; SSE-NEXT:    movdqa %xmm0, %xmm2
      97 ; SSE-NEXT:    psrlq $32, %xmm2
      98 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
      99 ; SSE-NEXT:    movdqa %xmm1, %xmm3
     100 ; SSE-NEXT:    psrlq $32, %xmm3
     101 ; SSE-NEXT:    pmuludq %xmm0, %xmm3
     102 ; SSE-NEXT:    paddq %xmm2, %xmm3
     103 ; SSE-NEXT:    psllq $32, %xmm3
     104 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
     105 ; SSE-NEXT:    paddq %xmm3, %xmm0
     106 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     107 ; SSE-NEXT:    movdqa %xmm0, %xmm2
     108 ; SSE-NEXT:    psrlq $32, %xmm2
     109 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
     110 ; SSE-NEXT:    movdqa %xmm1, %xmm3
     111 ; SSE-NEXT:    psrlq $32, %xmm3
     112 ; SSE-NEXT:    pmuludq %xmm0, %xmm3
     113 ; SSE-NEXT:    paddq %xmm2, %xmm3
     114 ; SSE-NEXT:    psllq $32, %xmm3
     115 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
     116 ; SSE-NEXT:    paddq %xmm3, %xmm0
     117 ; SSE-NEXT:    movq %xmm0, %rax
     118 ; SSE-NEXT:    retq
     119 ;
     120 ; AVX1-LABEL: test_v4i64:
     121 ; AVX1:       # %bb.0:
     122 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
     123 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
     124 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
     125 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
     126 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
     127 ; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
     128 ; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
     129 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
     130 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
     131 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     132 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
     133 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
     134 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
     135 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
     136 ; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
     137 ; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
     138 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
     139 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
     140 ; AVX1-NEXT:    vmovq %xmm0, %rax
     141 ; AVX1-NEXT:    vzeroupper
     142 ; AVX1-NEXT:    retq
     143 ;
     144 ; AVX2-LABEL: test_v4i64:
     145 ; AVX2:       # %bb.0:
     146 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
     147 ; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
     148 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
     149 ; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
     150 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
     151 ; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
     152 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
     153 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     154 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     155 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     156 ; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
     157 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
     158 ; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
     159 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
     160 ; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
     161 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
     162 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     163 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     164 ; AVX2-NEXT:    vmovq %xmm0, %rax
     165 ; AVX2-NEXT:    vzeroupper
     166 ; AVX2-NEXT:    retq
     167 ;
     168 ; AVX512BW-LABEL: test_v4i64:
     169 ; AVX512BW:       # %bb.0:
     170 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
     171 ; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm2
     172 ; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
     173 ; AVX512BW-NEXT:    vpsrlq $32, %ymm1, %ymm3
     174 ; AVX512BW-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
     175 ; AVX512BW-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
     176 ; AVX512BW-NEXT:    vpsllq $32, %ymm2, %ymm2
     177 ; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     178 ; AVX512BW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     179 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     180 ; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm2
     181 ; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
     182 ; AVX512BW-NEXT:    vpsrlq $32, %ymm1, %ymm3
     183 ; AVX512BW-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
     184 ; AVX512BW-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
     185 ; AVX512BW-NEXT:    vpsllq $32, %ymm2, %ymm2
     186 ; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     187 ; AVX512BW-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     188 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
     189 ; AVX512BW-NEXT:    vzeroupper
     190 ; AVX512BW-NEXT:    retq
     191 ;
     192 ; AVX512BWVL-LABEL: test_v4i64:
     193 ; AVX512BWVL:       # %bb.0:
     194 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
     195 ; AVX512BWVL-NEXT:    vpsrlq $32, %ymm0, %ymm2
     196 ; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
     197 ; AVX512BWVL-NEXT:    vpsrlq $32, %ymm1, %ymm3
     198 ; AVX512BWVL-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
     199 ; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
     200 ; AVX512BWVL-NEXT:    vpsllq $32, %ymm2, %ymm2
     201 ; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     202 ; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     203 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     204 ; AVX512BWVL-NEXT:    vpsrlq $32, %ymm0, %ymm2
     205 ; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
     206 ; AVX512BWVL-NEXT:    vpsrlq $32, %ymm1, %ymm3
     207 ; AVX512BWVL-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
     208 ; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
     209 ; AVX512BWVL-NEXT:    vpsllq $32, %ymm2, %ymm2
     210 ; AVX512BWVL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     211 ; AVX512BWVL-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     212 ; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
     213 ; AVX512BWVL-NEXT:    vzeroupper
     214 ; AVX512BWVL-NEXT:    retq
     215 ;
     216 ; AVX512DQ-LABEL: test_v4i64:
     217 ; AVX512DQ:       # %bb.0:
     218 ; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
     219 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
     220 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
     221 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     222 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
     223 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; NOTE(review): the line above at "222" appears once in this extraction; the
; checks here follow the generator's output verbatim.
     224 ; AVX512DQ-NEXT:    vmovq %xmm0, %rax
     225 ; AVX512DQ-NEXT:    vzeroupper
     226 ; AVX512DQ-NEXT:    retq
     227 ;
     228 ; AVX512DQVL-LABEL: test_v4i64:
     229 ; AVX512DQVL:       # %bb.0:
     230 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
     231 ; AVX512DQVL-NEXT:    vpmullq %ymm1, %ymm0, %ymm0
     232 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     233 ; AVX512DQVL-NEXT:    vpmullq %ymm1, %ymm0, %ymm0
     234 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
     235 ; AVX512DQVL-NEXT:    vzeroupper
     236 ; AVX512DQVL-NEXT:    retq
; Multiply-reduce four i64 lanes: one halving step (upper 128 * lower 128)
; followed by the 2-lane reduction. CHECK lines above are autogenerated
; (update_llc_test_checks.py) -- regenerate rather than hand-edit.
; Non-DQ targets expand each 64-bit multiply via pmuludq partial products;
; DQ targets use vpmullq (DQ without VL widens to zmm first).
     237   %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64> %a0)
     238   ret i64 %1
     239 }
    239 
     240 define i64 @test_v8i64(<8 x i64> %a0) {
     241 ; SSE-LABEL: test_v8i64:
     242 ; SSE:       # %bb.0:
     243 ; SSE-NEXT:    movdqa %xmm1, %xmm4
     244 ; SSE-NEXT:    psrlq $32, %xmm4
     245 ; SSE-NEXT:    pmuludq %xmm3, %xmm4
     246 ; SSE-NEXT:    movdqa %xmm3, %xmm5
     247 ; SSE-NEXT:    psrlq $32, %xmm5
     248 ; SSE-NEXT:    pmuludq %xmm1, %xmm5
     249 ; SSE-NEXT:    paddq %xmm4, %xmm5
     250 ; SSE-NEXT:    psllq $32, %xmm5
     251 ; SSE-NEXT:    pmuludq %xmm3, %xmm1
     252 ; SSE-NEXT:    paddq %xmm5, %xmm1
     253 ; SSE-NEXT:    movdqa %xmm0, %xmm3
     254 ; SSE-NEXT:    psrlq $32, %xmm3
     255 ; SSE-NEXT:    pmuludq %xmm2, %xmm3
     256 ; SSE-NEXT:    movdqa %xmm2, %xmm4
     257 ; SSE-NEXT:    psrlq $32, %xmm4
     258 ; SSE-NEXT:    pmuludq %xmm0, %xmm4
     259 ; SSE-NEXT:    paddq %xmm3, %xmm4
     260 ; SSE-NEXT:    psllq $32, %xmm4
     261 ; SSE-NEXT:    pmuludq %xmm2, %xmm0
     262 ; SSE-NEXT:    paddq %xmm4, %xmm0
     263 ; SSE-NEXT:    movdqa %xmm0, %xmm2
     264 ; SSE-NEXT:    psrlq $32, %xmm2
     265 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
     266 ; SSE-NEXT:    movdqa %xmm1, %xmm3
     267 ; SSE-NEXT:    psrlq $32, %xmm3
     268 ; SSE-NEXT:    pmuludq %xmm0, %xmm3
     269 ; SSE-NEXT:    paddq %xmm2, %xmm3
     270 ; SSE-NEXT:    psllq $32, %xmm3
     271 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
     272 ; SSE-NEXT:    paddq %xmm3, %xmm0
     273 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     274 ; SSE-NEXT:    movdqa %xmm0, %xmm2
     275 ; SSE-NEXT:    psrlq $32, %xmm2
     276 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
     277 ; SSE-NEXT:    movdqa %xmm1, %xmm3
     278 ; SSE-NEXT:    psrlq $32, %xmm3
     279 ; SSE-NEXT:    pmuludq %xmm0, %xmm3
     280 ; SSE-NEXT:    paddq %xmm2, %xmm3
     281 ; SSE-NEXT:    psllq $32, %xmm3
     282 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
     283 ; SSE-NEXT:    paddq %xmm3, %xmm0
     284 ; SSE-NEXT:    movq %xmm0, %rax
     285 ; SSE-NEXT:    retq
     286 ;
     287 ; AVX1-LABEL: test_v8i64:
     288 ; AVX1:       # %bb.0:
     289 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
     290 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
     291 ; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm4
     292 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm4, %xmm4
     293 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm5
     294 ; AVX1-NEXT:    vpmuludq %xmm5, %xmm3, %xmm5
     295 ; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
     296 ; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
     297 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm2
     298 ; AVX1-NEXT:    vpaddq %xmm4, %xmm2, %xmm2
     299 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
     300 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm3, %xmm3
     301 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
     302 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm0, %xmm4
     303 ; AVX1-NEXT:    vpaddq %xmm3, %xmm4, %xmm3
     304 ; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
     305 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
     306 ; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
     307 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm1
     308 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
     309 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm3
     310 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
     311 ; AVX1-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
     312 ; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
     313 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
     314 ; AVX1-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
     315 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     316 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
     317 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
     318 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
     319 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
     320 ; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
     321 ; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
     322 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
     323 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
     324 ; AVX1-NEXT:    vmovq %xmm0, %rax
     325 ; AVX1-NEXT:    vzeroupper
     326 ; AVX1-NEXT:    retq
     327 ;
     328 ; AVX2-LABEL: test_v8i64:
     329 ; AVX2:       # %bb.0:
     330 ; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
     331 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
     332 ; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
     333 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
     334 ; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
     335 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
     336 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     337 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     338 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
     339 ; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
     340 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
     341 ; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
     342 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
     343 ; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
     344 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
     345 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     346 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     347 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     348 ; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
     349 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
     350 ; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
     351 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
     352 ; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
     353 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
     354 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
     355 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
     356 ; AVX2-NEXT:    vmovq %xmm0, %rax
     357 ; AVX2-NEXT:    vzeroupper
     358 ; AVX2-NEXT:    retq
     359 ;
     360 ; AVX512BW-LABEL: test_v8i64:
     361 ; AVX512BW:       # %bb.0:
     362 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
     363 ; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
     364 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
     365 ; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
     366 ; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
     367 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
     368 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
     369 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
     370 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
     371 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
     372 ; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
     373 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
     374 ; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
     375 ; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
     376 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
     377 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
     378 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
     379 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
     380 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     381 ; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
     382 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
     383 ; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
     384 ; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
     385 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
     386 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
     387 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
     388 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
     389 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
     390 ; AVX512BW-NEXT:    vzeroupper
     391 ; AVX512BW-NEXT:    retq
     392 ;
     393 ; AVX512BWVL-LABEL: test_v8i64:
     394 ; AVX512BWVL:       # %bb.0:
     395 ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
     396 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
     397 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
     398 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
     399 ; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
     400 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
     401 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
     402 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
     403 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
     404 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
     405 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
     406 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
     407 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
     408 ; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
     409 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
     410 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
     411 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
     412 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
     413 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     414 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
     415 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
     416 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
     417 ; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
     418 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
     419 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
     420 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
     421 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
     422 ; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
     423 ; AVX512BWVL-NEXT:    vzeroupper
     424 ; AVX512BWVL-NEXT:    retq
     425 ;
     426 ; AVX512DQ-LABEL: test_v8i64:
     427 ; AVX512DQ:       # %bb.0:
     428 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
     429 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
     430 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
     431 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
     432 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     433 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
     434 ; AVX512DQ-NEXT:    vmovq %xmm0, %rax
     435 ; AVX512DQ-NEXT:    vzeroupper
     436 ; AVX512DQ-NEXT:    retq
     437 ;
     438 ; AVX512DQVL-LABEL: test_v8i64:
     439 ; AVX512DQVL:       # %bb.0:
     440 ; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
     441 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
     442 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
     443 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
     444 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
     445 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
     446 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
     447 ; AVX512DQVL-NEXT:    vzeroupper
     448 ; AVX512DQVL-NEXT:    retq
; Multiply-reduce eight i64 lanes: halve 512->256->128, then the final 2-lane
; step. CHECK lines above are autogenerated (update_llc_test_checks.py) --
; regenerate rather than hand-edit. Non-DQ targets expand each 64-bit lane
; multiply into pmuludq 32x32->64 partial products; DQ targets reduce with
; three vpmullq steps.
     449   %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64> %a0)
     450   ret i64 %1
     451 }
    452 
    453 define i64 @test_v16i64(<16 x i64> %a0) {
    454 ; SSE-LABEL: test_v16i64:
    455 ; SSE:       # %bb.0:
    456 ; SSE-NEXT:    movdqa %xmm2, %xmm8
    457 ; SSE-NEXT:    psrlq $32, %xmm8
    458 ; SSE-NEXT:    pmuludq %xmm6, %xmm8
    459 ; SSE-NEXT:    movdqa %xmm6, %xmm9
    460 ; SSE-NEXT:    psrlq $32, %xmm9
    461 ; SSE-NEXT:    pmuludq %xmm2, %xmm9
    462 ; SSE-NEXT:    paddq %xmm8, %xmm9
    463 ; SSE-NEXT:    psllq $32, %xmm9
    464 ; SSE-NEXT:    pmuludq %xmm6, %xmm2
    465 ; SSE-NEXT:    paddq %xmm9, %xmm2
    466 ; SSE-NEXT:    movdqa %xmm0, %xmm8
    467 ; SSE-NEXT:    psrlq $32, %xmm8
    468 ; SSE-NEXT:    pmuludq %xmm4, %xmm8
    469 ; SSE-NEXT:    movdqa %xmm4, %xmm6
    470 ; SSE-NEXT:    psrlq $32, %xmm6
    471 ; SSE-NEXT:    pmuludq %xmm0, %xmm6
    472 ; SSE-NEXT:    paddq %xmm8, %xmm6
    473 ; SSE-NEXT:    psllq $32, %xmm6
    474 ; SSE-NEXT:    pmuludq %xmm4, %xmm0
    475 ; SSE-NEXT:    paddq %xmm6, %xmm0
    476 ; SSE-NEXT:    movdqa %xmm3, %xmm4
    477 ; SSE-NEXT:    psrlq $32, %xmm4
    478 ; SSE-NEXT:    pmuludq %xmm7, %xmm4
    479 ; SSE-NEXT:    movdqa %xmm7, %xmm6
    480 ; SSE-NEXT:    psrlq $32, %xmm6
    481 ; SSE-NEXT:    pmuludq %xmm3, %xmm6
    482 ; SSE-NEXT:    paddq %xmm4, %xmm6
    483 ; SSE-NEXT:    psllq $32, %xmm6
    484 ; SSE-NEXT:    pmuludq %xmm7, %xmm3
    485 ; SSE-NEXT:    paddq %xmm6, %xmm3
    486 ; SSE-NEXT:    movdqa %xmm1, %xmm4
    487 ; SSE-NEXT:    psrlq $32, %xmm4
    488 ; SSE-NEXT:    pmuludq %xmm5, %xmm4
    489 ; SSE-NEXT:    movdqa %xmm5, %xmm6
    490 ; SSE-NEXT:    psrlq $32, %xmm6
    491 ; SSE-NEXT:    pmuludq %xmm1, %xmm6
    492 ; SSE-NEXT:    paddq %xmm4, %xmm6
    493 ; SSE-NEXT:    psllq $32, %xmm6
    494 ; SSE-NEXT:    pmuludq %xmm5, %xmm1
    495 ; SSE-NEXT:    paddq %xmm6, %xmm1
    496 ; SSE-NEXT:    movdqa %xmm1, %xmm4
    497 ; SSE-NEXT:    psrlq $32, %xmm4
    498 ; SSE-NEXT:    pmuludq %xmm3, %xmm4
    499 ; SSE-NEXT:    movdqa %xmm3, %xmm5
    500 ; SSE-NEXT:    psrlq $32, %xmm5
    501 ; SSE-NEXT:    pmuludq %xmm1, %xmm5
    502 ; SSE-NEXT:    paddq %xmm4, %xmm5
    503 ; SSE-NEXT:    psllq $32, %xmm5
    504 ; SSE-NEXT:    pmuludq %xmm3, %xmm1
    505 ; SSE-NEXT:    paddq %xmm5, %xmm1
    506 ; SSE-NEXT:    movdqa %xmm0, %xmm3
    507 ; SSE-NEXT:    psrlq $32, %xmm3
    508 ; SSE-NEXT:    pmuludq %xmm2, %xmm3
    509 ; SSE-NEXT:    movdqa %xmm2, %xmm4
    510 ; SSE-NEXT:    psrlq $32, %xmm4
    511 ; SSE-NEXT:    pmuludq %xmm0, %xmm4
    512 ; SSE-NEXT:    paddq %xmm3, %xmm4
    513 ; SSE-NEXT:    psllq $32, %xmm4
    514 ; SSE-NEXT:    pmuludq %xmm2, %xmm0
    515 ; SSE-NEXT:    paddq %xmm4, %xmm0
    516 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    517 ; SSE-NEXT:    psrlq $32, %xmm2
    518 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
    519 ; SSE-NEXT:    movdqa %xmm1, %xmm3
    520 ; SSE-NEXT:    psrlq $32, %xmm3
    521 ; SSE-NEXT:    pmuludq %xmm0, %xmm3
    522 ; SSE-NEXT:    paddq %xmm2, %xmm3
    523 ; SSE-NEXT:    psllq $32, %xmm3
    524 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
    525 ; SSE-NEXT:    paddq %xmm3, %xmm0
    526 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    527 ; SSE-NEXT:    movdqa %xmm0, %xmm2
    528 ; SSE-NEXT:    psrlq $32, %xmm2
    529 ; SSE-NEXT:    pmuludq %xmm1, %xmm2
    530 ; SSE-NEXT:    movdqa %xmm1, %xmm3
    531 ; SSE-NEXT:    psrlq $32, %xmm3
    532 ; SSE-NEXT:    pmuludq %xmm0, %xmm3
    533 ; SSE-NEXT:    paddq %xmm2, %xmm3
    534 ; SSE-NEXT:    psllq $32, %xmm3
    535 ; SSE-NEXT:    pmuludq %xmm1, %xmm0
    536 ; SSE-NEXT:    paddq %xmm3, %xmm0
    537 ; SSE-NEXT:    movq %xmm0, %rax
    538 ; SSE-NEXT:    retq
    539 ;
    540 ; AVX1-LABEL: test_v16i64:
    541 ; AVX1:       # %bb.0:
    542 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
    543 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm4
    544 ; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
    545 ; AVX1-NEXT:    vpmuludq %xmm5, %xmm1, %xmm5
    546 ; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
    547 ; AVX1-NEXT:    vpsllq $32, %xmm4, %xmm4
    548 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm5
    549 ; AVX1-NEXT:    vpaddq %xmm4, %xmm5, %xmm4
    550 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm5
    551 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm5
    552 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
    553 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm6
    554 ; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
    555 ; AVX1-NEXT:    vpsllq $32, %xmm5, %xmm5
    556 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm6
    557 ; AVX1-NEXT:    vpaddq %xmm5, %xmm6, %xmm5
    558 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
    559 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
    560 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm6
    561 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm6, %xmm6
    562 ; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm7
    563 ; AVX1-NEXT:    vpmuludq %xmm7, %xmm1, %xmm7
    564 ; AVX1-NEXT:    vpaddq %xmm6, %xmm7, %xmm6
    565 ; AVX1-NEXT:    vpsllq $32, %xmm6, %xmm6
    566 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm1
    567 ; AVX1-NEXT:    vpaddq %xmm6, %xmm1, %xmm1
    568 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm2
    569 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
    570 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
    571 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm3, %xmm3
    572 ; AVX1-NEXT:    vpsrlq $32, %xmm2, %xmm6
    573 ; AVX1-NEXT:    vpmuludq %xmm6, %xmm0, %xmm6
    574 ; AVX1-NEXT:    vpaddq %xmm3, %xmm6, %xmm3
    575 ; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
    576 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
    577 ; AVX1-NEXT:    vpaddq %xmm3, %xmm0, %xmm0
    578 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
    579 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
    580 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
    581 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
    582 ; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
    583 ; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
    584 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
    585 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
    586 ; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm1
    587 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm1, %xmm1
    588 ; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm2
    589 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm5, %xmm2
    590 ; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
    591 ; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
    592 ; AVX1-NEXT:    vpmuludq %xmm4, %xmm5, %xmm2
    593 ; AVX1-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
    594 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm2
    595 ; AVX1-NEXT:    vpmuludq %xmm0, %xmm2, %xmm2
    596 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm3
    597 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm1, %xmm3
    598 ; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
    599 ; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
    600 ; AVX1-NEXT:    vpmuludq %xmm0, %xmm1, %xmm0
    601 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
    602 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    603 ; AVX1-NEXT:    vpsrlq $32, %xmm0, %xmm2
    604 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm2, %xmm2
    605 ; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
    606 ; AVX1-NEXT:    vpmuludq %xmm3, %xmm0, %xmm3
    607 ; AVX1-NEXT:    vpaddq %xmm2, %xmm3, %xmm2
    608 ; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
    609 ; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
    610 ; AVX1-NEXT:    vpaddq %xmm2, %xmm0, %xmm0
    611 ; AVX1-NEXT:    vmovq %xmm0, %rax
    612 ; AVX1-NEXT:    vzeroupper
    613 ; AVX1-NEXT:    retq
    614 ;
    615 ; AVX2-LABEL: test_v16i64:
    616 ; AVX2:       # %bb.0:
    617 ; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm4
    618 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm4, %ymm4
    619 ; AVX2-NEXT:    vpsrlq $32, %ymm3, %ymm5
    620 ; AVX2-NEXT:    vpmuludq %ymm5, %ymm1, %ymm5
    621 ; AVX2-NEXT:    vpaddq %ymm4, %ymm5, %ymm4
    622 ; AVX2-NEXT:    vpsllq $32, %ymm4, %ymm4
    623 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm1, %ymm1
    624 ; AVX2-NEXT:    vpaddq %ymm4, %ymm1, %ymm1
    625 ; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm3
    626 ; AVX2-NEXT:    vpmuludq %ymm2, %ymm3, %ymm3
    627 ; AVX2-NEXT:    vpsrlq $32, %ymm2, %ymm4
    628 ; AVX2-NEXT:    vpmuludq %ymm4, %ymm0, %ymm4
    629 ; AVX2-NEXT:    vpaddq %ymm3, %ymm4, %ymm3
    630 ; AVX2-NEXT:    vpsllq $32, %ymm3, %ymm3
    631 ; AVX2-NEXT:    vpmuludq %ymm2, %ymm0, %ymm0
    632 ; AVX2-NEXT:    vpaddq %ymm3, %ymm0, %ymm0
    633 ; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
    634 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
    635 ; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
    636 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
    637 ; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
    638 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
    639 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
    640 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    641 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
    642 ; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
    643 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
    644 ; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
    645 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
    646 ; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
    647 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
    648 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
    649 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    650 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    651 ; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm2
    652 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm2, %ymm2
    653 ; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm3
    654 ; AVX2-NEXT:    vpmuludq %ymm3, %ymm0, %ymm3
    655 ; AVX2-NEXT:    vpaddq %ymm2, %ymm3, %ymm2
    656 ; AVX2-NEXT:    vpsllq $32, %ymm2, %ymm2
    657 ; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
    658 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
    659 ; AVX2-NEXT:    vmovq %xmm0, %rax
    660 ; AVX2-NEXT:    vzeroupper
    661 ; AVX2-NEXT:    retq
    662 ;
    663 ; AVX512BW-LABEL: test_v16i64:
    664 ; AVX512BW:       # %bb.0:
    665 ; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
    666 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
    667 ; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
    668 ; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
    669 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
    670 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
    671 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
    672 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    673 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    674 ; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
    675 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
    676 ; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
    677 ; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
    678 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
    679 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
    680 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
    681 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    682 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
    683 ; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
    684 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
    685 ; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
    686 ; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
    687 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
    688 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
    689 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
    690 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    691 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    692 ; AVX512BW-NEXT:    vpsrlq $32, %zmm0, %zmm2
    693 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
    694 ; AVX512BW-NEXT:    vpsrlq $32, %zmm1, %zmm3
    695 ; AVX512BW-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
    696 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
    697 ; AVX512BW-NEXT:    vpsllq $32, %zmm2, %zmm2
    698 ; AVX512BW-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
    699 ; AVX512BW-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    700 ; AVX512BW-NEXT:    vmovq %xmm0, %rax
    701 ; AVX512BW-NEXT:    vzeroupper
    702 ; AVX512BW-NEXT:    retq
    703 ;
    704 ; AVX512BWVL-LABEL: test_v16i64:
    705 ; AVX512BWVL:       # %bb.0:
    706 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
    707 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
    708 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
    709 ; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
    710 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
    711 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
    712 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
    713 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    714 ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    715 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
    716 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
    717 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
    718 ; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
    719 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
    720 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
    721 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
    722 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    723 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    724 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
    725 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
    726 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
    727 ; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
    728 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
    729 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
    730 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
    731 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    732 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    733 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm0, %zmm2
    734 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm2, %zmm2
    735 ; AVX512BWVL-NEXT:    vpsrlq $32, %zmm1, %zmm3
    736 ; AVX512BWVL-NEXT:    vpmuludq %zmm3, %zmm0, %zmm3
    737 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm3, %zmm2
    738 ; AVX512BWVL-NEXT:    vpsllq $32, %zmm2, %zmm2
    739 ; AVX512BWVL-NEXT:    vpmuludq %zmm1, %zmm0, %zmm0
    740 ; AVX512BWVL-NEXT:    vpaddq %zmm2, %zmm0, %zmm0
    741 ; AVX512BWVL-NEXT:    vmovq %xmm0, %rax
    742 ; AVX512BWVL-NEXT:    vzeroupper
    743 ; AVX512BWVL-NEXT:    retq
    744 ;
    745 ; AVX512DQ-LABEL: test_v16i64:
    746 ; AVX512DQ:       # %bb.0:
    747 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
    748 ; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    749 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
    750 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
    751 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
    752 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    753 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
    754 ; AVX512DQ-NEXT:    vmovq %xmm0, %rax
    755 ; AVX512DQ-NEXT:    vzeroupper
    756 ; AVX512DQ-NEXT:    retq
    757 ;
    758 ; AVX512DQVL-LABEL: test_v16i64:
    759 ; AVX512DQVL:       # %bb.0:
    760 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
    761 ; AVX512DQVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
    762 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
    763 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
    764 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
    765 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    766 ; AVX512DQVL-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
    767 ; AVX512DQVL-NEXT:    vmovq %xmm0, %rax
    768 ; AVX512DQVL-NEXT:    vzeroupper
    769 ; AVX512DQVL-NEXT:    retq
    770   %1 = call i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64> %a0)
    771   ret i64 %1
    772 }
    773 
    774 ;
    775 ; vXi32
    776 ;
    777 
define i32 @test_v4i32(<4 x i32> %a0) {
; Product-reduce all 4 i32 lanes into a scalar.
; SSE2 has no 32-bit vector multiply, so codegen expands via pmuludq+shuffles;
; SSE4.1/AVX/AVX512 use pmulld on successively halved vectors.
; NOTE: CHECK lines are autogenerated — regenerate with update_llc_test_checks.py,
; do not hand-edit them.
; SSE2-LABEL: test_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX-LABEL: test_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a0)
  ret i32 %1
}
    823 
define i32 @test_v8i32(<8 x i32> %a0) {
; Product-reduce 8 x i32: first multiply the two 128-bit halves together,
; then reduce within the remaining 128-bit vector.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py.
; SSE2-LABEL: test_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v8i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32> %a0)
  ret i32 %1
}
    895 
define i32 @test_v16i32(<16 x i32> %a0) {
; Product-reduce 16 x i32 (two YMM / four XMM / one ZMM of input, depending on
; target): pairwise-multiply the input registers, then halve down to a scalar.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py.
; SSE2-LABEL: test_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld %xmm3, %xmm1
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    movd %xmm0, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmulld %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32> %a0)
  ret i32 %1
}
    989 
define i32 @test_v32i32(<32 x i32> %a0) {
; Product-reduce 32 x i32 — the widest i32 case; the input spans eight XMM
; (SSE), four YMM (AVX) or two ZMM (AVX512) registers which are combined
; pairwise before the in-register reduction.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py.
; SSE2-LABEL: test_v32i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm8, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm7, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm5, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm3, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT:    pmuludq %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm1, %eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v32i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld %xmm6, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm7, %xmm3
; SSE41-NEXT:    pmulld %xmm5, %xmm1
; SSE41-NEXT:    pmulld %xmm3, %xmm1
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE41-NEXT:    pmulld %xmm0, %xmm1
; SSE41-NEXT:    movd %xmm1, %eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v32i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmulld %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmulld %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpmulld %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v32i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32> %a0)
  ret i32 %1
}
   1124 
   1125 ;
   1126 ; vXi16
   1127 ;
   1128 
define i16 @test_v8i16(<8 x i16> %a0) {
; Product-reduce 8 x i16 using pmullw; the final step shifts the high i16
; into place with psrld $16 before the last multiply, then truncates
; eax -> ax ("kill" comment marks the implicit subregister truncation).
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py.
; SSE-LABEL: test_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovd %xmm0, %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; AVX512-LABEL: test_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16> %a0)
  ret i16 %1
}
   1169 
define i16 @test_v16i16(<16 x i16> %a0) {
; Product-reduce 16 x i16: combine the two 128-bit halves with pmullw, then
; reduce within 128 bits as in the v8i16 case.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py.
; SSE-LABEL: test_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16> %a0)
  ret i16 %1
}
   1232 
define i16 @test_v32i16(<32 x i16> %a0) {
; Product-reduce 32 x i16. This case distinguishes the AVX512 sub-targets:
; BW variants keep a 512-bit vpmullw, DQ variants (no BWI 512-bit i16 mul)
; split into 256-bit vpmullw ops.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py.
; SSE-LABEL: test_v32i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw %xmm3, %xmm1
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    psrld $16, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movd %xmm1, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v32i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v32i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v32i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovd %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v32i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16> %a0)
  ret i16 %1
}
   1352 
; Reduce a <64 x i16> vector to a single i16 by multiplying all lanes
; together via llvm.experimental.vector.reduce.mul.  Spans 8 xmm / 4 ymm /
; 2 zmm input registers depending on target width.
; NOTE: every CHECK line below is autogenerated by
; utils/update_llc_test_checks.py -- regenerate with that script rather
; than editing the expected assembly by hand.
define i16 @test_v64i16(<64 x i16> %a0) {
; SSE-LABEL: test_v64i16:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw %xmm6, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm7, %xmm3
; SSE-NEXT:    pmullw %xmm5, %xmm1
; SSE-NEXT:    pmullw %xmm3, %xmm1
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE-NEXT:    pmullw %xmm0, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    psrld $16, %xmm0
; SSE-NEXT:    pmullw %xmm1, %xmm0
; SSE-NEXT:    movd %xmm0, %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX1-LABEL: test_v64i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v64i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vmovd %xmm0, %eax
; AVX2-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v64i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vmovd %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v64i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vmovd %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v64i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vmovd %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v64i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmullw %ymm3, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vmovd %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16> %a0)
  ret i16 %1
}
   1490 
   1491 ;
   1492 ; vXi8
   1493 ;
   1494 
; Reduce a <16 x i8> vector to a single i8 by multiplying all lanes
; together via llvm.experimental.vector.reduce.mul.  There is no pmullb,
; so targets widen i8 lanes to i16 (punpck/pmovzx/pmovsx), pmullw, and
; truncate back between the halving steps.
; NOTE: every CHECK line below is autogenerated by
; utils/update_llc_test_checks.py -- regenerate with that script rather
; than editing the expected assembly by hand.
define i8 @test_v16i8(<16 x i8> %a0) {
; SSE2-LABEL: test_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pmullw %xmm1, %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,2,3,3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    psrldq {{.*#+}} xmm2 = xmm2[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
; SSE2-NEXT:    pmullw %xmm0, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pmullw %xmm0, %xmm2
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    packuswb %xmm3, %xmm2
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    # kill: def $al killed $al killed $eax
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE41-NEXT:    pmullw %xmm3, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE41-NEXT:    pmullw %xmm3, %xmm0
; SSE41-NEXT:    pand %xmm1, %xmm0
; SSE41-NEXT:    packuswb %xmm2, %xmm0
; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SSE41-NEXT:    psrlw $8, %xmm0
; SSE41-NEXT:    pmullw %xmm1, %xmm0
; SSE41-NEXT:    pextrb $0, %xmm0, %eax
; SSE41-NEXT:    # kill: def $al killed $al killed $eax
; SSE41-NEXT:    retq
;
; AVX1-LABEL: test_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
; AVX1-NEXT:    # kill: def $al killed $al killed $eax
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
; AVX2-NEXT:    # kill: def $al killed $al killed $eax
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512BW-LABEL: test_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: test_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512BWVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512DQ-LABEL: test_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512DQVL-LABEL: test_v16i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
  %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8> %a0)
  ret i8 %1
}
   1743 
   1744 define i8 @test_v32i8(<32 x i8> %a0) {
   1745 ; SSE2-LABEL: test_v32i8:
   1746 ; SSE2:       # %bb.0:
   1747 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
   1748 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
   1749 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   1750 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
   1751 ; SSE2-NEXT:    pmullw %xmm2, %xmm3
   1752 ; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   1753 ; SSE2-NEXT:    pand %xmm2, %xmm3
   1754 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1755 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1756 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
   1757 ; SSE2-NEXT:    pand %xmm2, %xmm0
   1758 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
   1759 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
   1760 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
   1761 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1762 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
   1763 ; SSE2-NEXT:    pand %xmm2, %xmm0
   1764 ; SSE2-NEXT:    pxor %xmm3, %xmm3
   1765 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
   1766 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
   1767 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1768 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1769 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   1770 ; SSE2-NEXT:    pand %xmm2, %xmm1
   1771 ; SSE2-NEXT:    packuswb %xmm3, %xmm1
   1772 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1773 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1774 ; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
   1775 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1776 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   1777 ; SSE2-NEXT:    pand %xmm2, %xmm1
   1778 ; SSE2-NEXT:    packuswb %xmm3, %xmm1
   1779 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   1780 ; SSE2-NEXT:    psrlw $8, %xmm0
   1781 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
   1782 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
   1783 ; SSE2-NEXT:    pmullw %xmm0, %xmm3
   1784 ; SSE2-NEXT:    pand %xmm2, %xmm3
   1785 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   1786 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   1787 ; SSE2-NEXT:    pand %xmm2, %xmm1
   1788 ; SSE2-NEXT:    packuswb %xmm3, %xmm1
   1789 ; SSE2-NEXT:    movd %xmm1, %eax
   1790 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
   1791 ; SSE2-NEXT:    retq
   1792 ;
   1793 ; SSE41-LABEL: test_v32i8:
   1794 ; SSE41:       # %bb.0:
   1795 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   1796 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1797 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1798 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1799 ; SSE41-NEXT:    pmullw %xmm1, %xmm0
   1800 ; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
   1801 ; SSE41-NEXT:    pand %xmm1, %xmm0
   1802 ; SSE41-NEXT:    pmullw %xmm2, %xmm3
   1803 ; SSE41-NEXT:    pand %xmm1, %xmm3
   1804 ; SSE41-NEXT:    packuswb %xmm0, %xmm3
   1805 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
   1806 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1807 ; SSE41-NEXT:    pmullw %xmm0, %xmm3
   1808 ; SSE41-NEXT:    pand %xmm1, %xmm3
   1809 ; SSE41-NEXT:    pxor %xmm0, %xmm0
   1810 ; SSE41-NEXT:    packuswb %xmm0, %xmm3
   1811 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
   1812 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1813 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
   1814 ; SSE41-NEXT:    pmullw %xmm2, %xmm3
   1815 ; SSE41-NEXT:    pand %xmm1, %xmm3
   1816 ; SSE41-NEXT:    packuswb %xmm0, %xmm3
   1817 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
   1818 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1819 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,2,3]
   1820 ; SSE41-NEXT:    pmullw %xmm2, %xmm3
   1821 ; SSE41-NEXT:    pand %xmm1, %xmm3
   1822 ; SSE41-NEXT:    packuswb %xmm0, %xmm3
   1823 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
   1824 ; SSE41-NEXT:    psrlw $8, %xmm3
   1825 ; SSE41-NEXT:    pmullw %xmm0, %xmm3
   1826 ; SSE41-NEXT:    pextrb $0, %xmm3, %eax
   1827 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
   1828 ; SSE41-NEXT:    retq
   1829 ;
   1830 ; AVX1-LABEL: test_v32i8:
   1831 ; AVX1:       # %bb.0:
   1832 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
   1833 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1834 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1835 ; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
   1836 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255]
   1837 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
   1838 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   1839 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1840 ; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
   1841 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
   1842 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
   1843 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   1844 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1845 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1846 ; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
   1847 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
   1848 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1849 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   1850 ; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
   1851 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
   1852 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
   1853 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
   1854 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1855 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1856 ; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
   1857 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
   1858 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1859 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   1860 ; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
   1861 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
   1862 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
   1863 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm2
   1864 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1865 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1866 ; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
   1867 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
   1868 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1869 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   1870 ; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
   1871 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
   1872 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
   1873 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm2
   1874 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1875 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   1876 ; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
   1877 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm3
   1878 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1879 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   1880 ; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
   1881 ; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
   1882 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
   1883 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
   1884 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
   1885 ; AVX1-NEXT:    vzeroupper
   1886 ; AVX1-NEXT:    retq
   1887 ;
   1888 ; AVX2-LABEL: test_v32i8:
   1889 ; AVX2:       # %bb.0:
   1890 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1891 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   1892 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   1893 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   1894 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1895 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   1896 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1897 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1898 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1899 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1900 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   1901 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   1902 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   1903 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1904 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1905 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1906 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1907 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1908 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   1909 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   1910 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   1911 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1912 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1913 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1914 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1915 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
   1916 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   1917 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   1918 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   1919 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1920 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   1921 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   1922 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   1923 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
   1924 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   1925 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   1926 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   1927 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
   1928 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
   1929 ; AVX2-NEXT:    vzeroupper
   1930 ; AVX2-NEXT:    retq
   1931 ;
   1932 ; AVX512BW-LABEL: test_v32i8:
   1933 ; AVX512BW:       # %bb.0:
   1934 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1935 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   1936 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   1937 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   1938 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   1939 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1940 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   1941 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   1942 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   1943 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   1944 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1945 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   1946 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   1947 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   1948 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   1949 ; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
   1950 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   1951 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   1952 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   1953 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   1954 ; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
   1955 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   1956 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   1957 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   1958 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   1959 ; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
   1960 ; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
   1961 ; AVX512BW-NEXT:    vzeroupper
   1962 ; AVX512BW-NEXT:    retq
   1963 ;
   1964 ; AVX512BWVL-LABEL: test_v32i8:
   1965 ; AVX512BWVL:       # %bb.0:
   1966 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1967 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   1968 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   1969 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   1970 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   1971 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1972 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   1973 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   1974 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   1975 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   1976 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1977 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   1978 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   1979 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   1980 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   1981 ; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
   1982 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   1983 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   1984 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   1985 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   1986 ; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
   1987 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   1988 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   1989 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   1990 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   1991 ; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
   1992 ; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
   1993 ; AVX512BWVL-NEXT:    vzeroupper
   1994 ; AVX512BWVL-NEXT:    retq
   1995 ;
   1996 ; AVX512DQ-LABEL: test_v32i8:
   1997 ; AVX512DQ:       # %bb.0:
   1998 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1999 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   2000 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2001 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2002 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2003 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2004 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2005 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2006 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   2007 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2008 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2009 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2010 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2011 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2012 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   2013 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2014 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2015 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2016 ; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
   2017 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2018 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   2019 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2020 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2021 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2022 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2023 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2024 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   2025 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2026 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2027 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2028 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
   2029 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
   2030 ; AVX512DQ-NEXT:    vzeroupper
   2031 ; AVX512DQ-NEXT:    retq
   2032 ;
   2033 ; AVX512DQVL-LABEL: test_v32i8:
   2034 ; AVX512DQVL:       # %bb.0:
   2035 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2036 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   2037 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2038 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2039 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2040 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2041 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2042 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2043 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   2044 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2045 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2046 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2047 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2048 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2049 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   2050 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2051 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2052 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2053 ; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
   2054 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2055 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   2056 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2057 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2058 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2059 ; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2060 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2061 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   2062 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2063 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2064 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2065 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
   2066 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
   2067 ; AVX512DQVL-NEXT:    vzeroupper
   2068 ; AVX512DQVL-NEXT:    retq
   2069   %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8> %a0)
   2070   ret i8 %1
   2071 }
   2072 
   2073 define i8 @test_v64i8(<64 x i8> %a0) {
   2074 ; SSE2-LABEL: test_v64i8:
   2075 ; SSE2:       # %bb.0:
   2076 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   2077 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
   2078 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2079 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
   2080 ; SSE2-NEXT:    pmullw %xmm4, %xmm5
   2081 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
   2082 ; SSE2-NEXT:    pand %xmm4, %xmm5
   2083 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
   2084 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2085 ; SSE2-NEXT:    pmullw %xmm2, %xmm0
   2086 ; SSE2-NEXT:    pand %xmm4, %xmm0
   2087 ; SSE2-NEXT:    packuswb %xmm5, %xmm0
   2088 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
   2089 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
   2090 ; SSE2-NEXT:    movdqa %xmm1, %xmm5
   2091 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
   2092 ; SSE2-NEXT:    pmullw %xmm2, %xmm5
   2093 ; SSE2-NEXT:    pand %xmm4, %xmm5
   2094 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
   2095 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2096 ; SSE2-NEXT:    pmullw %xmm3, %xmm1
   2097 ; SSE2-NEXT:    pand %xmm4, %xmm1
   2098 ; SSE2-NEXT:    packuswb %xmm5, %xmm1
   2099 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
   2100 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
   2101 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   2102 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
   2103 ; SSE2-NEXT:    pmullw %xmm2, %xmm3
   2104 ; SSE2-NEXT:    pand %xmm4, %xmm3
   2105 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2106 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2107 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
   2108 ; SSE2-NEXT:    pand %xmm4, %xmm0
   2109 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
   2110 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
   2111 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
   2112 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2113 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
   2114 ; SSE2-NEXT:    pand %xmm4, %xmm0
   2115 ; SSE2-NEXT:    pxor %xmm2, %xmm2
   2116 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
   2117 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
   2118 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2119 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2120 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   2121 ; SSE2-NEXT:    pand %xmm4, %xmm1
   2122 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
   2123 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   2124 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2125 ; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
   2126 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2127 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   2128 ; SSE2-NEXT:    pand %xmm4, %xmm1
   2129 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
   2130 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   2131 ; SSE2-NEXT:    psrlw $8, %xmm0
   2132 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
   2133 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
   2134 ; SSE2-NEXT:    pmullw %xmm0, %xmm2
   2135 ; SSE2-NEXT:    pand %xmm4, %xmm2
   2136 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2137 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   2138 ; SSE2-NEXT:    pand %xmm4, %xmm1
   2139 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
   2140 ; SSE2-NEXT:    movd %xmm1, %eax
   2141 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
   2142 ; SSE2-NEXT:    retq
   2143 ;
   2144 ; SSE41-LABEL: test_v64i8:
   2145 ; SSE41:       # %bb.0:
   2146 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   2147 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2148 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2149 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2150 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
   2151 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   2152 ; SSE41-NEXT:    pand %xmm2, %xmm0
   2153 ; SSE41-NEXT:    pmullw %xmm5, %xmm4
   2154 ; SSE41-NEXT:    pand %xmm2, %xmm4
   2155 ; SSE41-NEXT:    packuswb %xmm0, %xmm4
   2156 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
   2157 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2158 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2159 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2160 ; SSE41-NEXT:    pmullw %xmm3, %xmm1
   2161 ; SSE41-NEXT:    pand %xmm2, %xmm1
   2162 ; SSE41-NEXT:    pmullw %xmm0, %xmm5
   2163 ; SSE41-NEXT:    pand %xmm2, %xmm5
   2164 ; SSE41-NEXT:    packuswb %xmm1, %xmm5
   2165 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
   2166 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2167 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
   2168 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2169 ; SSE41-NEXT:    pmullw %xmm5, %xmm4
   2170 ; SSE41-NEXT:    pand %xmm2, %xmm4
   2171 ; SSE41-NEXT:    pmullw %xmm0, %xmm1
   2172 ; SSE41-NEXT:    pand %xmm2, %xmm1
   2173 ; SSE41-NEXT:    packuswb %xmm4, %xmm1
   2174 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2175 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2176 ; SSE41-NEXT:    pmullw %xmm0, %xmm1
   2177 ; SSE41-NEXT:    pand %xmm2, %xmm1
   2178 ; SSE41-NEXT:    pxor %xmm0, %xmm0
   2179 ; SSE41-NEXT:    packuswb %xmm0, %xmm1
   2180 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2181 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2182 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
   2183 ; SSE41-NEXT:    pmullw %xmm3, %xmm1
   2184 ; SSE41-NEXT:    pand %xmm2, %xmm1
   2185 ; SSE41-NEXT:    packuswb %xmm0, %xmm1
   2186 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2187 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2188 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
   2189 ; SSE41-NEXT:    pmullw %xmm3, %xmm1
   2190 ; SSE41-NEXT:    pand %xmm2, %xmm1
   2191 ; SSE41-NEXT:    packuswb %xmm0, %xmm1
   2192 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2193 ; SSE41-NEXT:    psrlw $8, %xmm1
   2194 ; SSE41-NEXT:    pmullw %xmm0, %xmm1
   2195 ; SSE41-NEXT:    pextrb $0, %xmm1, %eax
   2196 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
   2197 ; SSE41-NEXT:    retq
   2198 ;
   2199 ; AVX1-LABEL: test_v64i8:
   2200 ; AVX1:       # %bb.0:
   2201 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2202 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2203 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm3
   2204 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
   2205 ; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
   2206 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2207 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2208 ; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
   2209 ; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
   2210 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm4, %xmm3
   2211 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
   2212 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2213 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2214 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2215 ; AVX1-NEXT:    vpmullw %xmm4, %xmm5, %xmm4
   2216 ; AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
   2217 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2218 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2219 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2220 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
   2221 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm0, %xmm0
   2222 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2223 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2224 ; AVX1-NEXT:    vpmullw %xmm1, %xmm4, %xmm1
   2225 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
   2226 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2227 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
   2228 ; AVX1-NEXT:    vpmullw %xmm0, %xmm3, %xmm0
   2229 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
   2230 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
   2231 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2232 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2233 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2234 ; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
   2235 ; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
   2236 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2237 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2238 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2239 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
   2240 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
   2241 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2242 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2243 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2244 ; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
   2245 ; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
   2246 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2247 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2248 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2249 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
   2250 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
   2251 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
   2252 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2253 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2254 ; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
   2255 ; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
   2256 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2257 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2258 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2259 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
   2260 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
   2261 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2262 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2263 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2264 ; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
   2265 ; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
   2266 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2267 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2268 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2269 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
   2270 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
   2271 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
   2272 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
   2273 ; AVX1-NEXT:    vzeroupper
   2274 ; AVX1-NEXT:    retq
   2275 ;
   2276 ; AVX2-LABEL: test_v64i8:
   2277 ; AVX2:       # %bb.0:
   2278 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm2
   2279 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm3
   2280 ; AVX2-NEXT:    vpmullw %ymm2, %ymm3, %ymm3
   2281 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm4
   2282 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2283 ; AVX2-NEXT:    vpshufb %xmm2, %xmm4, %xmm4
   2284 ; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm3
   2285 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
   2286 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
   2287 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2288 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
   2289 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2290 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2291 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2292 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   2293 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   2294 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2295 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2296 ; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm1
   2297 ; AVX2-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
   2298 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2299 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   2300 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   2301 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2302 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2303 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2304 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2305 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2306 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2307 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   2308 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   2309 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2310 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2311 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2312 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2313 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2314 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2315 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   2316 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   2317 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2318 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
   2319 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2320 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2321 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2322 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2323 ; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
   2324 ; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
   2325 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2326 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2327 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2328 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2329 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2330 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
   2331 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
   2332 ; AVX2-NEXT:    vzeroupper
   2333 ; AVX2-NEXT:    retq
   2334 ;
   2335 ; AVX512BW-LABEL: test_v64i8:
   2336 ; AVX512BW:       # %bb.0:
   2337 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   2338 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2339 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2340 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2341 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2342 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2343 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2344 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2345 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2346 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2347 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2348 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2349 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2350 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2351 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2352 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2353 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2354 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2355 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2356 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2357 ; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
   2358 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2359 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2360 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2361 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2362 ; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2363 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2364 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2365 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2366 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2367 ; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
   2368 ; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
   2369 ; AVX512BW-NEXT:    vzeroupper
   2370 ; AVX512BW-NEXT:    retq
   2371 ;
   2372 ; AVX512BWVL-LABEL: test_v64i8:
   2373 ; AVX512BWVL:       # %bb.0:
   2374 ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
   2375 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2376 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2377 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2378 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2379 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2380 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2381 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2382 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2383 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2384 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2385 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2386 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2387 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2388 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2389 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2390 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2391 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2392 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2393 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2394 ; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
   2395 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2396 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2397 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2398 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2399 ; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2400 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2401 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2402 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2403 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2404 ; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
   2405 ; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
   2406 ; AVX512BWVL-NEXT:    vzeroupper
   2407 ; AVX512BWVL-NEXT:    retq
   2408 ;
   2409 ; AVX512DQ-LABEL: test_v64i8:
   2410 ; AVX512DQ:       # %bb.0:
   2411 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm2
   2412 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm3
   2413 ; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
   2414 ; AVX512DQ-NEXT:    vpmovsxwd %ymm2, %zmm2
   2415 ; AVX512DQ-NEXT:    vpmovdb %zmm2, %xmm2
   2416 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm1
   2417 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   2418 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
   2419 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2420 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2421 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2422 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2423 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2424 ; AVX512DQ-NEXT:    vpmovsxbw %xmm2, %ymm1
   2425 ; AVX512DQ-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
   2426 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2427 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2428 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2429 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2430 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   2431 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2432 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2433 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2434 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2435 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2436 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   2437 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2438 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2439 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2440 ; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
   2441 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2442 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   2443 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2444 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2445 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2446 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2447 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   2448 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   2449 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2450 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   2451 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   2452 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
   2453 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
   2454 ; AVX512DQ-NEXT:    vzeroupper
   2455 ; AVX512DQ-NEXT:    retq
   2456 ;
   2457 ; AVX512DQVL-LABEL: test_v64i8:
   2458 ; AVX512DQVL:       # %bb.0:
   2459 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm2
   2460 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm3
   2461 ; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm3, %ymm2
   2462 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm2, %zmm2
   2463 ; AVX512DQVL-NEXT:    vpmovdb %zmm2, %xmm2
   2464 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm1, %xmm1
   2465 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   2466 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm0
   2467 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2468 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2469 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2470 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2471 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2472 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm2, %ymm1
   2473 ; AVX512DQVL-NEXT:    vpmullw %ymm0, %ymm1, %ymm0
   2474 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2475 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2476 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2477 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2478 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   2479 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2480 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2481 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2482 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2483 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2484 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   2485 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2486 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2487 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2488 ; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
   2489 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2490 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   2491 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2492 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2493 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2494 ; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2495 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   2496 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   2497 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2498 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   2499 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   2500 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
   2501 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
   2502 ; AVX512DQVL-NEXT:    vzeroupper
   2503 ; AVX512DQVL-NEXT:    retq
   2504   %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8> %a0)
   2505   ret i8 %1
   2506 }
   2507 
   2508 define i8 @test_v128i8(<128 x i8> %a0) {
   2509 ; SSE2-LABEL: test_v128i8:
   2510 ; SSE2:       # %bb.0:
   2511 ; SSE2-NEXT:    movdqa %xmm5, %xmm8
   2512 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
   2513 ; SSE2-NEXT:    movdqa %xmm1, %xmm9
   2514 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
   2515 ; SSE2-NEXT:    pmullw %xmm8, %xmm9
   2516 ; SSE2-NEXT:    movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
   2517 ; SSE2-NEXT:    pand %xmm8, %xmm9
   2518 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
   2519 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2520 ; SSE2-NEXT:    pmullw %xmm5, %xmm1
   2521 ; SSE2-NEXT:    pand %xmm8, %xmm1
   2522 ; SSE2-NEXT:    packuswb %xmm9, %xmm1
   2523 ; SSE2-NEXT:    movdqa %xmm7, %xmm9
   2524 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
   2525 ; SSE2-NEXT:    movdqa %xmm3, %xmm5
   2526 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
   2527 ; SSE2-NEXT:    pmullw %xmm9, %xmm5
   2528 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2529 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
   2530 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
   2531 ; SSE2-NEXT:    pmullw %xmm7, %xmm3
   2532 ; SSE2-NEXT:    pand %xmm8, %xmm3
   2533 ; SSE2-NEXT:    packuswb %xmm5, %xmm3
   2534 ; SSE2-NEXT:    movdqa %xmm4, %xmm5
   2535 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
   2536 ; SSE2-NEXT:    movdqa %xmm0, %xmm7
   2537 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
   2538 ; SSE2-NEXT:    pmullw %xmm5, %xmm7
   2539 ; SSE2-NEXT:    pand %xmm8, %xmm7
   2540 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
   2541 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2542 ; SSE2-NEXT:    pmullw %xmm4, %xmm0
   2543 ; SSE2-NEXT:    pand %xmm8, %xmm0
   2544 ; SSE2-NEXT:    packuswb %xmm7, %xmm0
   2545 ; SSE2-NEXT:    movdqa %xmm6, %xmm4
   2546 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
   2547 ; SSE2-NEXT:    movdqa %xmm2, %xmm5
   2548 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
   2549 ; SSE2-NEXT:    pmullw %xmm4, %xmm5
   2550 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2551 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
   2552 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
   2553 ; SSE2-NEXT:    pmullw %xmm6, %xmm2
   2554 ; SSE2-NEXT:    pand %xmm8, %xmm2
   2555 ; SSE2-NEXT:    packuswb %xmm5, %xmm2
   2556 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
   2557 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
   2558 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
   2559 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
   2560 ; SSE2-NEXT:    pmullw %xmm4, %xmm5
   2561 ; SSE2-NEXT:    pand %xmm8, %xmm5
   2562 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
   2563 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2564 ; SSE2-NEXT:    pmullw %xmm2, %xmm0
   2565 ; SSE2-NEXT:    pand %xmm8, %xmm0
   2566 ; SSE2-NEXT:    packuswb %xmm5, %xmm0
   2567 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
   2568 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
   2569 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
   2570 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
   2571 ; SSE2-NEXT:    pmullw %xmm2, %xmm4
   2572 ; SSE2-NEXT:    pand %xmm8, %xmm4
   2573 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
   2574 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2575 ; SSE2-NEXT:    pmullw %xmm3, %xmm1
   2576 ; SSE2-NEXT:    pand %xmm8, %xmm1
   2577 ; SSE2-NEXT:    packuswb %xmm4, %xmm1
   2578 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
   2579 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
   2580 ; SSE2-NEXT:    movdqa %xmm0, %xmm3
   2581 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
   2582 ; SSE2-NEXT:    pmullw %xmm2, %xmm3
   2583 ; SSE2-NEXT:    pand %xmm8, %xmm3
   2584 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2585 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2586 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
   2587 ; SSE2-NEXT:    pand %xmm8, %xmm0
   2588 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
   2589 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
   2590 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
   2591 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2592 ; SSE2-NEXT:    pmullw %xmm1, %xmm0
   2593 ; SSE2-NEXT:    pand %xmm8, %xmm0
   2594 ; SSE2-NEXT:    pxor %xmm2, %xmm2
   2595 ; SSE2-NEXT:    packuswb %xmm2, %xmm0
   2596 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,2,3,3]
   2597 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2598 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2599 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   2600 ; SSE2-NEXT:    pand %xmm8, %xmm1
   2601 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
   2602 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   2603 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2604 ; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zero,zero
   2605 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2606 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   2607 ; SSE2-NEXT:    pand %xmm8, %xmm1
   2608 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
   2609 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
   2610 ; SSE2-NEXT:    psrlw $8, %xmm0
   2611 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
   2612 ; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
   2613 ; SSE2-NEXT:    pmullw %xmm0, %xmm2
   2614 ; SSE2-NEXT:    pand %xmm8, %xmm2
   2615 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
   2616 ; SSE2-NEXT:    pmullw %xmm0, %xmm1
   2617 ; SSE2-NEXT:    pand %xmm8, %xmm1
   2618 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
   2619 ; SSE2-NEXT:    movd %xmm1, %eax
   2620 ; SSE2-NEXT:    # kill: def $al killed $al killed $eax
   2621 ; SSE2-NEXT:    retq
   2622 ;
   2623 ; SSE41-LABEL: test_v128i8:
   2624 ; SSE41:       # %bb.0:
   2625 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm9 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
   2626 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2627 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm8 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2628 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2629 ; SSE41-NEXT:    pmullw %xmm5, %xmm1
   2630 ; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
   2631 ; SSE41-NEXT:    pand %xmm5, %xmm1
   2632 ; SSE41-NEXT:    pmullw %xmm9, %xmm8
   2633 ; SSE41-NEXT:    pand %xmm5, %xmm8
   2634 ; SSE41-NEXT:    packuswb %xmm1, %xmm8
   2635 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm9 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero
   2636 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2637 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
   2638 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2639 ; SSE41-NEXT:    pmullw %xmm7, %xmm3
   2640 ; SSE41-NEXT:    pand %xmm5, %xmm3
   2641 ; SSE41-NEXT:    pmullw %xmm9, %xmm1
   2642 ; SSE41-NEXT:    pand %xmm5, %xmm1
   2643 ; SSE41-NEXT:    packuswb %xmm3, %xmm1
   2644 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
   2645 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2646 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2647 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2648 ; SSE41-NEXT:    pmullw %xmm4, %xmm0
   2649 ; SSE41-NEXT:    pand %xmm5, %xmm0
   2650 ; SSE41-NEXT:    pmullw %xmm7, %xmm3
   2651 ; SSE41-NEXT:    pand %xmm5, %xmm3
   2652 ; SSE41-NEXT:    packuswb %xmm0, %xmm3
   2653 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
   2654 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2655 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   2656 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2657 ; SSE41-NEXT:    pmullw %xmm6, %xmm2
   2658 ; SSE41-NEXT:    pand %xmm5, %xmm2
   2659 ; SSE41-NEXT:    pmullw %xmm0, %xmm4
   2660 ; SSE41-NEXT:    pand %xmm5, %xmm4
   2661 ; SSE41-NEXT:    packuswb %xmm2, %xmm4
   2662 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
   2663 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2664 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
   2665 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2666 ; SSE41-NEXT:    pmullw %xmm4, %xmm3
   2667 ; SSE41-NEXT:    pand %xmm5, %xmm3
   2668 ; SSE41-NEXT:    pmullw %xmm2, %xmm0
   2669 ; SSE41-NEXT:    pand %xmm5, %xmm0
   2670 ; SSE41-NEXT:    packuswb %xmm3, %xmm0
   2671 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2672 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2673 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
   2674 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2675 ; SSE41-NEXT:    pmullw %xmm1, %xmm8
   2676 ; SSE41-NEXT:    pand %xmm5, %xmm8
   2677 ; SSE41-NEXT:    pmullw %xmm2, %xmm3
   2678 ; SSE41-NEXT:    pand %xmm5, %xmm3
   2679 ; SSE41-NEXT:    packuswb %xmm8, %xmm3
   2680 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
   2681 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2682 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2683 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2684 ; SSE41-NEXT:    pmullw %xmm3, %xmm0
   2685 ; SSE41-NEXT:    pand %xmm5, %xmm0
   2686 ; SSE41-NEXT:    pmullw %xmm1, %xmm2
   2687 ; SSE41-NEXT:    pand %xmm5, %xmm2
   2688 ; SSE41-NEXT:    packuswb %xmm0, %xmm2
   2689 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   2690 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2691 ; SSE41-NEXT:    pmullw %xmm0, %xmm2
   2692 ; SSE41-NEXT:    pand %xmm5, %xmm2
   2693 ; SSE41-NEXT:    pxor %xmm0, %xmm0
   2694 ; SSE41-NEXT:    packuswb %xmm0, %xmm2
   2695 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   2696 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2697 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
   2698 ; SSE41-NEXT:    pmullw %xmm1, %xmm2
   2699 ; SSE41-NEXT:    pand %xmm5, %xmm2
   2700 ; SSE41-NEXT:    packuswb %xmm0, %xmm2
   2701 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   2702 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   2703 ; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3]
   2704 ; SSE41-NEXT:    pmullw %xmm1, %xmm2
   2705 ; SSE41-NEXT:    pand %xmm5, %xmm2
   2706 ; SSE41-NEXT:    packuswb %xmm0, %xmm2
   2707 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   2708 ; SSE41-NEXT:    psrlw $8, %xmm2
   2709 ; SSE41-NEXT:    pmullw %xmm0, %xmm2
   2710 ; SSE41-NEXT:    pextrb $0, %xmm2, %eax
   2711 ; SSE41-NEXT:    # kill: def $al killed $al killed $eax
   2712 ; SSE41-NEXT:    retq
   2713 ;
   2714 ; AVX1-LABEL: test_v128i8:
   2715 ; AVX1:       # %bb.0:
   2716 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
   2717 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2718 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
   2719 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2720 ; AVX1-NEXT:    vpmullw %xmm4, %xmm7, %xmm7
   2721 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
   2722 ; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
   2723 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
   2724 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
   2725 ; AVX1-NEXT:    vpmullw %xmm5, %xmm6, %xmm5
   2726 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
   2727 ; AVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm8
   2728 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
   2729 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm9 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2730 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
   2731 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2732 ; AVX1-NEXT:    vpmullw %xmm9, %xmm7, %xmm7
   2733 ; AVX1-NEXT:    vpand %xmm4, %xmm7, %xmm7
   2734 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm6 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
   2735 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm5 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero
   2736 ; AVX1-NEXT:    vpmullw %xmm6, %xmm5, %xmm5
   2737 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
   2738 ; AVX1-NEXT:    vpackuswb %xmm7, %xmm5, %xmm6
   2739 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2740 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm7 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2741 ; AVX1-NEXT:    vpmullw %xmm5, %xmm7, %xmm5
   2742 ; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
   2743 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
   2744 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2745 ; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
   2746 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   2747 ; AVX1-NEXT:    vpackuswb %xmm5, %xmm0, %xmm0
   2748 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2749 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm5 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2750 ; AVX1-NEXT:    vpmullw %xmm2, %xmm5, %xmm2
   2751 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2752 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
   2753 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2754 ; AVX1-NEXT:    vpmullw %xmm3, %xmm1, %xmm1
   2755 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   2756 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
   2757 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2758 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2759 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
   2760 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2761 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2762 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2763 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2764 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   2765 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   2766 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2767 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2768 ; AVX1-NEXT:    vpmullw %xmm1, %xmm2, %xmm1
   2769 ; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
   2770 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm6[0],zero,xmm6[1],zero,xmm6[2],zero,xmm6[3],zero,xmm6[4],zero,xmm6[5],zero,xmm6[6],zero,xmm6[7],zero
   2771 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero
   2772 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
   2773 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2774 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm2, %xmm1
   2775 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2776 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2777 ; AVX1-NEXT:    vpmullw %xmm2, %xmm3, %xmm2
   2778 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2779 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2780 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2781 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2782 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   2783 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   2784 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2785 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2786 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2787 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
   2788 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2789 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2790 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2791 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2792 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   2793 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   2794 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2795 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2796 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2797 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
   2798 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2799 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2800 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2801 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2802 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   2803 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   2804 ; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
   2805 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2806 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2807 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
   2808 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2809 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2810 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2811 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2812 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   2813 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   2814 ; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2815 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2816 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
   2817 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
   2818 ; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
   2819 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   2820 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   2821 ; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
   2822 ; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
   2823 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
   2824 ; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
   2825 ; AVX1-NEXT:    # kill: def $al killed $al killed $eax
   2826 ; AVX1-NEXT:    vzeroupper
   2827 ; AVX1-NEXT:    retq
   2828 ;
   2829 ; AVX2-LABEL: test_v128i8:
   2830 ; AVX2:       # %bb.0:
   2831 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm4
   2832 ; AVX2-NEXT:    vpmovsxbw %xmm4, %ymm4
   2833 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm5
   2834 ; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm5
   2835 ; AVX2-NEXT:    vpmullw %ymm4, %ymm5, %ymm5
   2836 ; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
   2837 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
   2838 ; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
   2839 ; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
   2840 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm6[0]
   2841 ; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm6
   2842 ; AVX2-NEXT:    vpmovsxbw %xmm6, %ymm6
   2843 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm7
   2844 ; AVX2-NEXT:    vpmovsxbw %xmm7, %ymm7
   2845 ; AVX2-NEXT:    vpmullw %ymm6, %ymm7, %ymm6
   2846 ; AVX2-NEXT:    vextracti128 $1, %ymm6, %xmm7
   2847 ; AVX2-NEXT:    vpshufb %xmm4, %xmm7, %xmm7
   2848 ; AVX2-NEXT:    vpshufb %xmm4, %xmm6, %xmm6
   2849 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
   2850 ; AVX2-NEXT:    vpmovsxbw %xmm2, %ymm2
   2851 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2852 ; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
   2853 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
   2854 ; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   2855 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   2856 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
   2857 ; AVX2-NEXT:    vpmovsxbw %xmm3, %ymm2
   2858 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2859 ; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
   2860 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2861 ; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   2862 ; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
   2863 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   2864 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2865 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2866 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2867 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2868 ; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
   2869 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   2870 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2871 ; AVX2-NEXT:    vpmovsxbw %xmm6, %ymm1
   2872 ; AVX2-NEXT:    vpmovsxbw %xmm5, %ymm2
   2873 ; AVX2-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
   2874 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
   2875 ; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
   2876 ; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
   2877 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   2878 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2879 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2880 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2881 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2882 ; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
   2883 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   2884 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2885 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2886 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2887 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2888 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2889 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2890 ; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
   2891 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   2892 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2893 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2894 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2895 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2896 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2897 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2898 ; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
   2899 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   2900 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2901 ; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
   2902 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2903 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2904 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2905 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2906 ; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
   2907 ; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
   2908 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
   2909 ; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2910 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
   2911 ; AVX2-NEXT:    vpmovsxbw %xmm1, %ymm1
   2912 ; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   2913 ; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
   2914 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
   2915 ; AVX2-NEXT:    vzeroupper
   2916 ; AVX2-NEXT:    retq
   2917 ;
   2918 ; AVX512BW-LABEL: test_v128i8:
   2919 ; AVX512BW:       # %bb.0:
   2920 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm2
   2921 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm3
   2922 ; AVX512BW-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
   2923 ; AVX512BW-NEXT:    vpmovwb %zmm2, %ymm2
   2924 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
   2925 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2926 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
   2927 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2928 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2929 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2930 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2931 ; AVX512BW-NEXT:    vpmovsxbw %ymm2, %zmm1
   2932 ; AVX512BW-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
   2933 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2934 ; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2935 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2936 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2937 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2938 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2939 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2940 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2941 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2942 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2943 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2944 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2945 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2946 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2947 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2948 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2949 ; AVX512BW-NEXT:    vpsrld $16, %xmm0, %xmm1
   2950 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2951 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2952 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2953 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2954 ; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm1
   2955 ; AVX512BW-NEXT:    vpmovsxbw %ymm0, %zmm0
   2956 ; AVX512BW-NEXT:    vpmovsxbw %ymm1, %zmm1
   2957 ; AVX512BW-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2958 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
   2959 ; AVX512BW-NEXT:    vpextrb $0, %xmm0, %eax
   2960 ; AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
   2961 ; AVX512BW-NEXT:    vzeroupper
   2962 ; AVX512BW-NEXT:    retq
   2963 ;
   2964 ; AVX512BWVL-LABEL: test_v128i8:
   2965 ; AVX512BWVL:       # %bb.0:
   2966 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm2
   2967 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm3
   2968 ; AVX512BWVL-NEXT:    vpmullw %zmm2, %zmm3, %zmm2
   2969 ; AVX512BWVL-NEXT:    vpmovwb %zmm2, %ymm2
   2970 ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
   2971 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2972 ; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
   2973 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2974 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2975 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2976 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2977 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm2, %zmm1
   2978 ; AVX512BWVL-NEXT:    vpmullw %zmm0, %zmm1, %zmm0
   2979 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2980 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2981 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2982 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2983 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2984 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2985 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   2986 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2987 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2988 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2989 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2990 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   2991 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2992 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2993 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2994 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   2995 ; AVX512BWVL-NEXT:    vpsrld $16, %xmm0, %xmm1
   2996 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   2997 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   2998 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   2999 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   3000 ; AVX512BWVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
   3001 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm0, %zmm0
   3002 ; AVX512BWVL-NEXT:    vpmovsxbw %ymm1, %zmm1
   3003 ; AVX512BWVL-NEXT:    vpmullw %zmm1, %zmm0, %zmm0
   3004 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
   3005 ; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
   3006 ; AVX512BWVL-NEXT:    # kill: def $al killed $al killed $eax
   3007 ; AVX512BWVL-NEXT:    vzeroupper
   3008 ; AVX512BWVL-NEXT:    retq
   3009 ;
   3010 ; AVX512DQ-LABEL: test_v128i8:
   3011 ; AVX512DQ:       # %bb.0:
   3012 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm4
   3013 ; AVX512DQ-NEXT:    vpmovsxbw %xmm4, %ymm4
   3014 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm5
   3015 ; AVX512DQ-NEXT:    vpmovsxbw %xmm5, %ymm5
   3016 ; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
   3017 ; AVX512DQ-NEXT:    vpmovsxwd %ymm4, %zmm4
   3018 ; AVX512DQ-NEXT:    vpmovdb %zmm4, %xmm4
   3019 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm3, %xmm5
   3020 ; AVX512DQ-NEXT:    vpmovsxbw %xmm5, %ymm5
   3021 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm1, %xmm6
   3022 ; AVX512DQ-NEXT:    vpmovsxbw %xmm6, %ymm6
   3023 ; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm6, %ymm5
   3024 ; AVX512DQ-NEXT:    vpmovsxwd %ymm5, %zmm5
   3025 ; AVX512DQ-NEXT:    vpmovdb %zmm5, %xmm5
   3026 ; AVX512DQ-NEXT:    vpmovsxbw %xmm2, %ymm2
   3027 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   3028 ; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
   3029 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   3030 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   3031 ; AVX512DQ-NEXT:    vpmovsxbw %xmm3, %ymm2
   3032 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   3033 ; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
   3034 ; AVX512DQ-NEXT:    vpmovsxwd %ymm1, %zmm1
   3035 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
   3036 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   3037 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   3038 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3039 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   3040 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   3041 ; AVX512DQ-NEXT:    vpmovsxbw %xmm5, %ymm1
   3042 ; AVX512DQ-NEXT:    vpmovsxbw %xmm4, %ymm2
   3043 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
   3044 ; AVX512DQ-NEXT:    vpmovsxwd %ymm1, %zmm1
   3045 ; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
   3046 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   3047 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   3048 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3049 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   3050 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   3051 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   3052 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   3053 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   3054 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3055 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   3056 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   3057 ; AVX512DQ-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   3058 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   3059 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   3060 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3061 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   3062 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   3063 ; AVX512DQ-NEXT:    vpsrld $16, %xmm0, %xmm1
   3064 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   3065 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   3066 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3067 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   3068 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   3069 ; AVX512DQ-NEXT:    vpsrlw $8, %xmm0, %xmm1
   3070 ; AVX512DQ-NEXT:    vpmovsxbw %xmm0, %ymm0
   3071 ; AVX512DQ-NEXT:    vpmovsxbw %xmm1, %ymm1
   3072 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3073 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
   3074 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
   3075 ; AVX512DQ-NEXT:    vpextrb $0, %xmm0, %eax
   3076 ; AVX512DQ-NEXT:    # kill: def $al killed $al killed $eax
   3077 ; AVX512DQ-NEXT:    vzeroupper
   3078 ; AVX512DQ-NEXT:    retq
   3079 ;
   3080 ; AVX512DQVL-LABEL: test_v128i8:
   3081 ; AVX512DQVL:       # %bb.0:
   3082 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm2, %xmm4
   3083 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm4, %ymm4
   3084 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm0, %xmm5
   3085 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm5, %ymm5
   3086 ; AVX512DQVL-NEXT:    vpmullw %ymm4, %ymm5, %ymm4
   3087 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm4, %zmm4
   3088 ; AVX512DQVL-NEXT:    vpmovdb %zmm4, %xmm4
   3089 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm3, %xmm5
   3090 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm5, %ymm5
   3091 ; AVX512DQVL-NEXT:    vextracti128 $1, %ymm1, %xmm6
   3092 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm6, %ymm6
   3093 ; AVX512DQVL-NEXT:    vpmullw %ymm5, %ymm6, %ymm5
   3094 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm5, %zmm5
   3095 ; AVX512DQVL-NEXT:    vpmovdb %zmm5, %xmm5
   3096 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm2, %ymm2
   3097 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   3098 ; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
   3099 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   3100 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   3101 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm3, %ymm2
   3102 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   3103 ; AVX512DQVL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
   3104 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm1, %zmm1
   3105 ; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
   3106 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   3107 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   3108 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3109 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   3110 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   3111 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm5, %ymm1
   3112 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm4, %ymm2
   3113 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm2, %ymm1
   3114 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm1, %zmm1
   3115 ; AVX512DQVL-NEXT:    vpmovdb %zmm1, %xmm1
   3116 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   3117 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   3118 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3119 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   3120 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   3121 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   3122 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   3123 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   3124 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3125 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   3126 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   3127 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   3128 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   3129 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   3130 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3131 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   3132 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   3133 ; AVX512DQVL-NEXT:    vpsrld $16, %xmm0, %xmm1
   3134 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   3135 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   3136 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3137 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   3138 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   3139 ; AVX512DQVL-NEXT:    vpsrlw $8, %xmm0, %xmm1
   3140 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm0, %ymm0
   3141 ; AVX512DQVL-NEXT:    vpmovsxbw %xmm1, %ymm1
   3142 ; AVX512DQVL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
   3143 ; AVX512DQVL-NEXT:    vpmovsxwd %ymm0, %zmm0
   3144 ; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
   3145 ; AVX512DQVL-NEXT:    vpextrb $0, %xmm0, %eax
   3146 ; AVX512DQVL-NEXT:    # kill: def $al killed $al killed $eax
   3147 ; AVX512DQVL-NEXT:    vzeroupper
   3148 ; AVX512DQVL-NEXT:    retq
   3149   %1 = call i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8> %a0)
   3150   ret i8 %1
   3151 }
   3152 
; Declarations of the experimental vector-reduce multiply intrinsics exercised
; by the test functions above, grouped by scalar element type (i64/i32/i16/i8)
; and vector width. These are resolved by the backend's reduction lowering.

declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v8i64(<8 x i64>)
declare i64 @llvm.experimental.vector.reduce.mul.i64.v16i64(<16 x i64>)

declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.i32.v8i32(<8 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.i32.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.mul.i32.v32i32(<32 x i32>)

declare i16 @llvm.experimental.vector.reduce.mul.i16.v8i16(<8 x i16>)
declare i16 @llvm.experimental.vector.reduce.mul.i16.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.mul.i16.v32i16(<32 x i16>)
declare i16 @llvm.experimental.vector.reduce.mul.i16.v64i16(<64 x i16>)

declare i8 @llvm.experimental.vector.reduce.mul.i8.v16i8(<16 x i8>)
declare i8 @llvm.experimental.vector.reduce.mul.i8.v32i8(<32 x i8>)
declare i8 @llvm.experimental.vector.reduce.mul.i8.v64i8(<64 x i8>)
declare i8 @llvm.experimental.vector.reduce.mul.i8.v128i8(<128 x i8>)
   3172