; (code-browser page header, kept as a comment so the file remains parseable)
; Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK32,SLM32
      3 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=silvermont | FileCheck %s --check-prefixes=CHECK64,SLM64
      4 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK32,SLOW32
      5 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2,+slow-pmulld | FileCheck %s --check-prefixes=CHECK64,SLOW64
      6 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-32
      7 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE4-64
      8 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-32,AVX2-32
      9 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-64,AVX2-64
     10 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512DQ-32
     11 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512DQ-64
     12 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-32,AVX512-32,AVX512BW-32
     13 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX-64,AVX512-64,AVX512BW-64
     14 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-32,AVX512-32,KNL-32
     15 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefixes=AVX-64,AVX512-64,KNL-64
     16 
     17 ; Make sure that the slow-pmulld feature can be used without SSE4.1.
     18 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=silvermont -mattr=-sse4.1
     19 
     20 define <4 x i32> @test_mul_v4i32_v4i8(<4 x i8> %A) {
        ; zext <4 x i8> -> <4 x i32>, then multiply by splat(18778).
        ; CHECK lines are autogenerated (do not hand-edit; regenerate with
        ; utils/update_llc_test_checks.py): all SSE/AVX prefixes fold the
        ; zext+mul into pand + pmaddwd; KNL instead broadcasts the constant
        ; and uses vpmulld.
     21 ; CHECK32-LABEL: test_mul_v4i32_v4i8:
     22 ; CHECK32:       # %bb.0:
     23 ; CHECK32-NEXT:    pand {{\.LCPI.*}}, %xmm0
     24 ; CHECK32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
     25 ; CHECK32-NEXT:    retl
     26 ;
     27 ; CHECK64-LABEL: test_mul_v4i32_v4i8:
     28 ; CHECK64:       # %bb.0:
     29 ; CHECK64-NEXT:    pand {{.*}}(%rip), %xmm0
     30 ; CHECK64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
     31 ; CHECK64-NEXT:    retq
     32 ;
     33 ; SSE4-32-LABEL: test_mul_v4i32_v4i8:
     34 ; SSE4-32:       # %bb.0:
     35 ; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
     36 ; SSE4-32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
     37 ; SSE4-32-NEXT:    retl
     38 ;
     39 ; SSE4-64-LABEL: test_mul_v4i32_v4i8:
     40 ; SSE4-64:       # %bb.0:
     41 ; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
     42 ; SSE4-64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
     43 ; SSE4-64-NEXT:    retq
     44 ;
     45 ; AVX2-32-LABEL: test_mul_v4i32_v4i8:
     46 ; AVX2-32:       # %bb.0:
     47 ; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
     48 ; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
     49 ; AVX2-32-NEXT:    retl
     50 ;
     51 ; AVX2-64-LABEL: test_mul_v4i32_v4i8:
     52 ; AVX2-64:       # %bb.0:
     53 ; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
     54 ; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
     55 ; AVX2-64-NEXT:    retq
     56 ;
     57 ; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8:
     58 ; AVX512DQ-32:       # %bb.0:
     59 ; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
     60 ; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
     61 ; AVX512DQ-32-NEXT:    retl
     62 ;
     63 ; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8:
     64 ; AVX512DQ-64:       # %bb.0:
     65 ; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
     66 ; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
     67 ; AVX512DQ-64-NEXT:    retq
     68 ;
     69 ; AVX512BW-32-LABEL: test_mul_v4i32_v4i8:
     70 ; AVX512BW-32:       # %bb.0:
     71 ; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
     72 ; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
     73 ; AVX512BW-32-NEXT:    retl
     74 ;
     75 ; AVX512BW-64-LABEL: test_mul_v4i32_v4i8:
     76 ; AVX512BW-64:       # %bb.0:
     77 ; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
     78 ; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
     79 ; AVX512BW-64-NEXT:    retq
     80 ;
     81 ; KNL-32-LABEL: test_mul_v4i32_v4i8:
     82 ; KNL-32:       # %bb.0:
     83 ; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
     84 ; KNL-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
     85 ; KNL-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
     86 ; KNL-32-NEXT:    retl
     87 ;
     88 ; KNL-64-LABEL: test_mul_v4i32_v4i8:
     89 ; KNL-64:       # %bb.0:
     90 ; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
     91 ; KNL-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
     92 ; KNL-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
     93 ; KNL-64-NEXT:    retq
        ; nuw/nsw hold: max product is 255 * 18778, well inside i32.
     94   %z = zext <4 x i8> %A to <4 x i32>
     95   %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
     96   ret <4 x i32> %m
     97 }
     98 
     99 define <8 x i32> @test_mul_v8i32_v8i8(<8 x i8> %A) {
        ; zext <8 x i8> -> <8 x i32>, multiply by splat(18778).
        ; Autogenerated CHECKs: SLM/SLOW (slow-pmulld) expand via
        ; pmullw/pmulhw + punpck{l,h}wd (note SLM schedules pmullw before
        ; pmulhw, SLOW the reverse); SSE4 splits into two pmaddwd halves;
        ; AVX2/AVX512 widen to ymm and use vpmaddwd; KNL uses vpmulld.
    100 ; SLM32-LABEL: test_mul_v8i32_v8i8:
    101 ; SLM32:       # %bb.0:
    102 ; SLM32-NEXT:    movdqa %xmm0, %xmm1
    103 ; SLM32-NEXT:    pand {{\.LCPI.*}}, %xmm1
    104 ; SLM32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
    105 ; SLM32-NEXT:    movdqa %xmm1, %xmm2
    106 ; SLM32-NEXT:    pmullw %xmm0, %xmm1
    107 ; SLM32-NEXT:    pmulhw %xmm0, %xmm2
    108 ; SLM32-NEXT:    movdqa %xmm1, %xmm0
    109 ; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    110 ; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    111 ; SLM32-NEXT:    retl
    112 ;
    113 ; SLM64-LABEL: test_mul_v8i32_v8i8:
    114 ; SLM64:       # %bb.0:
    115 ; SLM64-NEXT:    movdqa %xmm0, %xmm1
    116 ; SLM64-NEXT:    pand {{.*}}(%rip), %xmm1
    117 ; SLM64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
    118 ; SLM64-NEXT:    movdqa %xmm1, %xmm2
    119 ; SLM64-NEXT:    pmullw %xmm0, %xmm1
    120 ; SLM64-NEXT:    pmulhw %xmm0, %xmm2
    121 ; SLM64-NEXT:    movdqa %xmm1, %xmm0
    122 ; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    123 ; SLM64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    124 ; SLM64-NEXT:    retq
    125 ;
    126 ; SLOW32-LABEL: test_mul_v8i32_v8i8:
    127 ; SLOW32:       # %bb.0:
    128 ; SLOW32-NEXT:    movdqa %xmm0, %xmm1
    129 ; SLOW32-NEXT:    pand {{\.LCPI.*}}, %xmm1
    130 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
    131 ; SLOW32-NEXT:    movdqa %xmm1, %xmm2
    132 ; SLOW32-NEXT:    pmulhw %xmm0, %xmm2
    133 ; SLOW32-NEXT:    pmullw %xmm0, %xmm1
    134 ; SLOW32-NEXT:    movdqa %xmm1, %xmm0
    135 ; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    136 ; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    137 ; SLOW32-NEXT:    retl
    138 ;
    139 ; SLOW64-LABEL: test_mul_v8i32_v8i8:
    140 ; SLOW64:       # %bb.0:
    141 ; SLOW64-NEXT:    movdqa %xmm0, %xmm1
    142 ; SLOW64-NEXT:    pand {{.*}}(%rip), %xmm1
    143 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
    144 ; SLOW64-NEXT:    movdqa %xmm1, %xmm2
    145 ; SLOW64-NEXT:    pmulhw %xmm0, %xmm2
    146 ; SLOW64-NEXT:    pmullw %xmm0, %xmm1
    147 ; SLOW64-NEXT:    movdqa %xmm1, %xmm0
    148 ; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    149 ; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    150 ; SLOW64-NEXT:    retq
    151 ;
    152 ; SSE4-32-LABEL: test_mul_v8i32_v8i8:
    153 ; SSE4-32:       # %bb.0:
    154 ; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
    155 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    156 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    157 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    158 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
    159 ; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm0
    160 ; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm1
    161 ; SSE4-32-NEXT:    retl
    162 ;
    163 ; SSE4-64-LABEL: test_mul_v8i32_v8i8:
    164 ; SSE4-64:       # %bb.0:
    165 ; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
    166 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    167 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    168 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    169 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
    170 ; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm0
    171 ; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm1
    172 ; SSE4-64-NEXT:    retq
    173 ;
    174 ; AVX2-32-LABEL: test_mul_v8i32_v8i8:
    175 ; AVX2-32:       # %bb.0:
    176 ; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    177 ; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    178 ; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
    179 ; AVX2-32-NEXT:    retl
    180 ;
    181 ; AVX2-64-LABEL: test_mul_v8i32_v8i8:
    182 ; AVX2-64:       # %bb.0:
    183 ; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    184 ; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    185 ; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
    186 ; AVX2-64-NEXT:    retq
    187 ;
    188 ; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8:
    189 ; AVX512DQ-32:       # %bb.0:
    190 ; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    191 ; AVX512DQ-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    192 ; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
    193 ; AVX512DQ-32-NEXT:    retl
    194 ;
    195 ; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8:
    196 ; AVX512DQ-64:       # %bb.0:
    197 ; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    198 ; AVX512DQ-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    199 ; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
    200 ; AVX512DQ-64-NEXT:    retq
    201 ;
    202 ; AVX512BW-32-LABEL: test_mul_v8i32_v8i8:
    203 ; AVX512BW-32:       # %bb.0:
    204 ; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    205 ; AVX512BW-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    206 ; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
    207 ; AVX512BW-32-NEXT:    retl
    208 ;
    209 ; AVX512BW-64-LABEL: test_mul_v8i32_v8i8:
    210 ; AVX512BW-64:       # %bb.0:
    211 ; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    212 ; AVX512BW-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    213 ; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
    214 ; AVX512BW-64-NEXT:    retq
    215 ;
    216 ; KNL-32-LABEL: test_mul_v8i32_v8i8:
    217 ; KNL-32:       # %bb.0:
    218 ; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    219 ; KNL-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    220 ; KNL-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
    221 ; KNL-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
    222 ; KNL-32-NEXT:    retl
    223 ;
    224 ; KNL-64-LABEL: test_mul_v8i32_v8i8:
    225 ; KNL-64:       # %bb.0:
    226 ; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    227 ; KNL-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    228 ; KNL-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
    229 ; KNL-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
    230 ; KNL-64-NEXT:    retq
        ; nuw/nsw hold: max product is 255 * 18778, well inside i32.
    231   %z = zext <8 x i8> %A to <8 x i32>
    232   %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
    233   ret <8 x i32> %m
    234 }
    235 
    236 define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) {
        ; zext <16 x i8> -> <16 x i32>, multiply by splat(18778).
        ; Autogenerated CHECKs: SLM/SLOW expand across four xmm results with
        ; pmullw/pmulhw + punpck{l,h}wd; SSE4 does four pmaddwd quarters;
        ; AVX2 does two vpmaddwd ymm halves; AVX512BW uses a single zmm
        ; vpmaddwd; AVX512DQ/KNL use vpmulld with a {1to16} broadcast operand.
    237 ; SLM32-LABEL: test_mul_v16i32_v16i8:
    238 ; SLM32:       # %bb.0:
    239 ; SLM32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
    240 ; SLM32-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    241 ; SLM32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    242 ; SLM32-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    243 ; SLM32-NEXT:    movdqa %xmm1, %xmm4
    244 ; SLM32-NEXT:    movdqa %xmm3, %xmm5
    245 ; SLM32-NEXT:    pmullw %xmm2, %xmm1
    246 ; SLM32-NEXT:    pmullw %xmm2, %xmm3
    247 ; SLM32-NEXT:    pmulhw %xmm2, %xmm4
    248 ; SLM32-NEXT:    pmulhw %xmm2, %xmm5
    249 ; SLM32-NEXT:    movdqa %xmm1, %xmm0
    250 ; SLM32-NEXT:    movdqa %xmm3, %xmm2
    251 ; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
    252 ; SLM32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
    253 ; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
    254 ; SLM32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
    255 ; SLM32-NEXT:    retl
    256 ;
    257 ; SLM64-LABEL: test_mul_v16i32_v16i8:
    258 ; SLM64:       # %bb.0:
    259 ; SLM64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
    260 ; SLM64-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    261 ; SLM64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    262 ; SLM64-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    263 ; SLM64-NEXT:    movdqa %xmm1, %xmm4
    264 ; SLM64-NEXT:    movdqa %xmm3, %xmm5
    265 ; SLM64-NEXT:    pmullw %xmm2, %xmm1
    266 ; SLM64-NEXT:    pmullw %xmm2, %xmm3
    267 ; SLM64-NEXT:    pmulhw %xmm2, %xmm4
    268 ; SLM64-NEXT:    pmulhw %xmm2, %xmm5
    269 ; SLM64-NEXT:    movdqa %xmm1, %xmm0
    270 ; SLM64-NEXT:    movdqa %xmm3, %xmm2
    271 ; SLM64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
    272 ; SLM64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
    273 ; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
    274 ; SLM64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
    275 ; SLM64-NEXT:    retq
    276 ;
    277 ; SLOW32-LABEL: test_mul_v16i32_v16i8:
    278 ; SLOW32:       # %bb.0:
    279 ; SLOW32-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    280 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
    281 ; SLOW32-NEXT:    movdqa %xmm1, %xmm3
    282 ; SLOW32-NEXT:    pmulhw %xmm2, %xmm3
    283 ; SLOW32-NEXT:    pmullw %xmm2, %xmm1
    284 ; SLOW32-NEXT:    movdqa %xmm1, %xmm4
    285 ; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
    286 ; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
    287 ; SLOW32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    288 ; SLOW32-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    289 ; SLOW32-NEXT:    movdqa %xmm3, %xmm0
    290 ; SLOW32-NEXT:    pmulhw %xmm2, %xmm0
    291 ; SLOW32-NEXT:    pmullw %xmm2, %xmm3
    292 ; SLOW32-NEXT:    movdqa %xmm3, %xmm2
    293 ; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
    294 ; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
    295 ; SLOW32-NEXT:    movdqa %xmm4, %xmm0
    296 ; SLOW32-NEXT:    retl
    297 ;
    298 ; SLOW64-LABEL: test_mul_v16i32_v16i8:
    299 ; SLOW64:       # %bb.0:
    300 ; SLOW64-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    301 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
    302 ; SLOW64-NEXT:    movdqa %xmm1, %xmm3
    303 ; SLOW64-NEXT:    pmulhw %xmm2, %xmm3
    304 ; SLOW64-NEXT:    pmullw %xmm2, %xmm1
    305 ; SLOW64-NEXT:    movdqa %xmm1, %xmm4
    306 ; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
    307 ; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
    308 ; SLOW64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
    309 ; SLOW64-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    310 ; SLOW64-NEXT:    movdqa %xmm3, %xmm0
    311 ; SLOW64-NEXT:    pmulhw %xmm2, %xmm0
    312 ; SLOW64-NEXT:    pmullw %xmm2, %xmm3
    313 ; SLOW64-NEXT:    movdqa %xmm3, %xmm2
    314 ; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
    315 ; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
    316 ; SLOW64-NEXT:    movdqa %xmm4, %xmm0
    317 ; SLOW64-NEXT:    retq
    318 ;
    319 ; SSE4-32-LABEL: test_mul_v16i32_v16i8:
    320 ; SSE4-32:       # %bb.0:
    321 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    322 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    323 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    324 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    325 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    326 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    327 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    328 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
    329 ; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm0
    330 ; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm1
    331 ; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm2
    332 ; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm3
    333 ; SSE4-32-NEXT:    retl
    334 ;
    335 ; SSE4-64-LABEL: test_mul_v16i32_v16i8:
    336 ; SSE4-64:       # %bb.0:
    337 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    338 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    339 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    340 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    341 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    342 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    343 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    344 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
    345 ; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm0
    346 ; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm1
    347 ; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm2
    348 ; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm3
    349 ; SSE4-64-NEXT:    retq
    350 ;
    351 ; AVX2-32-LABEL: test_mul_v16i32_v16i8:
    352 ; AVX2-32:       # %bb.0:
    353 ; AVX2-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    354 ; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
    355 ; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
    356 ; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
    357 ; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
    358 ; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
    359 ; AVX2-32-NEXT:    retl
    360 ;
    361 ; AVX2-64-LABEL: test_mul_v16i32_v16i8:
    362 ; AVX2-64:       # %bb.0:
    363 ; AVX2-64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    364 ; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
    365 ; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
    366 ; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
    367 ; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
    368 ; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
    369 ; AVX2-64-NEXT:    retq
    370 ;
    371 ; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8:
    372 ; AVX512DQ-32:       # %bb.0:
    373 ; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
    374 ; AVX512DQ-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
    375 ; AVX512DQ-32-NEXT:    retl
    376 ;
    377 ; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8:
    378 ; AVX512DQ-64:       # %bb.0:
    379 ; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
    380 ; AVX512DQ-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
    381 ; AVX512DQ-64-NEXT:    retq
    382 ;
    383 ; AVX512BW-32-LABEL: test_mul_v16i32_v16i8:
    384 ; AVX512BW-32:       # %bb.0:
    385 ; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
    386 ; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
    387 ; AVX512BW-32-NEXT:    retl
    388 ;
    389 ; AVX512BW-64-LABEL: test_mul_v16i32_v16i8:
    390 ; AVX512BW-64:       # %bb.0:
    391 ; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
    392 ; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
    393 ; AVX512BW-64-NEXT:    retq
    394 ;
    395 ; KNL-32-LABEL: test_mul_v16i32_v16i8:
    396 ; KNL-32:       # %bb.0:
    397 ; KNL-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
    398 ; KNL-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
    399 ; KNL-32-NEXT:    retl
    400 ;
    401 ; KNL-64-LABEL: test_mul_v16i32_v16i8:
    402 ; KNL-64:       # %bb.0:
    403 ; KNL-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
    404 ; KNL-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
    405 ; KNL-64-NEXT:    retq
        ; nuw/nsw hold: max product is 255 * 18778, well inside i32.
    406   %z = zext <16 x i8> %A to <16 x i32>
    407   %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
    408   ret <16 x i32> %m
    409 }
    410 
    411 define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) {
        ; zext <4 x i16> -> <4 x i32>, multiply by splat(18778).
        ; Autogenerated CHECKs: i16 sources can't use pmaddwd (operands are
        ; treated as signed i16 there), so SLM/SLOW use pmullw + pmulhuw +
        ; punpcklwd, while SSE4 zeroes the odd words with pblendw and uses
        ; pmulld; AVX does the same with a broadcast constant.
    412 ; SLM32-LABEL: test_mul_v4i32_v4i16:
    413 ; SLM32:       # %bb.0:
    414 ; SLM32-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    415 ; SLM32-NEXT:    movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
    416 ; SLM32-NEXT:    movdqa %xmm0, %xmm2
    417 ; SLM32-NEXT:    pmullw %xmm1, %xmm0
    418 ; SLM32-NEXT:    pmulhuw %xmm1, %xmm2
    419 ; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    420 ; SLM32-NEXT:    retl
    421 ;
    422 ; SLM64-LABEL: test_mul_v4i32_v4i16:
    423 ; SLM64:       # %bb.0:
    424 ; SLM64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    425 ; SLM64-NEXT:    movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
    426 ; SLM64-NEXT:    movdqa %xmm0, %xmm2
    427 ; SLM64-NEXT:    pmullw %xmm1, %xmm0
    428 ; SLM64-NEXT:    pmulhuw %xmm1, %xmm2
    429 ; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    430 ; SLM64-NEXT:    retq
    431 ;
    432 ; SLOW32-LABEL: test_mul_v4i32_v4i16:
    433 ; SLOW32:       # %bb.0:
    434 ; SLOW32-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    435 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
    436 ; SLOW32-NEXT:    movdqa %xmm0, %xmm2
    437 ; SLOW32-NEXT:    pmulhuw %xmm1, %xmm2
    438 ; SLOW32-NEXT:    pmullw %xmm1, %xmm0
    439 ; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    440 ; SLOW32-NEXT:    retl
    441 ;
    442 ; SLOW64-LABEL: test_mul_v4i32_v4i16:
    443 ; SLOW64:       # %bb.0:
    444 ; SLOW64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
    445 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u>
    446 ; SLOW64-NEXT:    movdqa %xmm0, %xmm2
    447 ; SLOW64-NEXT:    pmulhuw %xmm1, %xmm2
    448 ; SLOW64-NEXT:    pmullw %xmm1, %xmm0
    449 ; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    450 ; SLOW64-NEXT:    retq
    451 ;
    452 ; SSE4-32-LABEL: test_mul_v4i32_v4i16:
    453 ; SSE4-32:       # %bb.0:
    454 ; SSE4-32-NEXT:    pxor %xmm1, %xmm1
    455 ; SSE4-32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
    456 ; SSE4-32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
    457 ; SSE4-32-NEXT:    retl
    458 ;
    459 ; SSE4-64-LABEL: test_mul_v4i32_v4i16:
    460 ; SSE4-64:       # %bb.0:
    461 ; SSE4-64-NEXT:    pxor %xmm1, %xmm1
    462 ; SSE4-64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
    463 ; SSE4-64-NEXT:    pmulld {{.*}}(%rip), %xmm0
    464 ; SSE4-64-NEXT:    retq
    465 ;
    466 ; AVX-32-LABEL: test_mul_v4i32_v4i16:
    467 ; AVX-32:       # %bb.0:
    468 ; AVX-32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    469 ; AVX-32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
    470 ; AVX-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
    471 ; AVX-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
    472 ; AVX-32-NEXT:    retl
    473 ;
    474 ; AVX-64-LABEL: test_mul_v4i32_v4i16:
    475 ; AVX-64:       # %bb.0:
    476 ; AVX-64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
    477 ; AVX-64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
    478 ; AVX-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
    479 ; AVX-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
    480 ; AVX-64-NEXT:    retq
        ; NOTE(review): nsw relies on max product 65535 * 18778 < 2^31 — holds.
    481   %z = zext <4 x i16> %A to <4 x i32>
    482   %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
    483   ret <4 x i32> %m
    484 }
    485 
    486 define <8 x i32> @test_mul_v8i32_v8i16(<8 x i16> %A) {
        ; Zero-extends 8 x i16 to 8 x i32 and multiplies by a splat of 18778.
        ; The autogenerated check lines pin the per-target lowering: slow-pmulld
        ; targets widen via pmullw/pmulhuw + punpck, SSE4.1 uses pmulld on two
        ; zero-extended halves, and AVX targets use a single 256-bit vpmulld.
    487 ; SLM32-LABEL: test_mul_v8i32_v8i16:
    488 ; SLM32:       # %bb.0:
    489 ; SLM32-NEXT:    movdqa %xmm0, %xmm1
    490 ; SLM32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
    491 ; SLM32-NEXT:    movdqa %xmm1, %xmm2
    492 ; SLM32-NEXT:    pmullw %xmm0, %xmm1
    493 ; SLM32-NEXT:    pmulhuw %xmm0, %xmm2
    494 ; SLM32-NEXT:    movdqa %xmm1, %xmm0
    495 ; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    496 ; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    497 ; SLM32-NEXT:    retl
    498 ;
    499 ; SLM64-LABEL: test_mul_v8i32_v8i16:
    500 ; SLM64:       # %bb.0:
    501 ; SLM64-NEXT:    movdqa %xmm0, %xmm1
    502 ; SLM64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
    503 ; SLM64-NEXT:    movdqa %xmm1, %xmm2
    504 ; SLM64-NEXT:    pmullw %xmm0, %xmm1
    505 ; SLM64-NEXT:    pmulhuw %xmm0, %xmm2
    506 ; SLM64-NEXT:    movdqa %xmm1, %xmm0
    507 ; SLM64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    508 ; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    509 ; SLM64-NEXT:    retq
    510 ;
    511 ; SLOW32-LABEL: test_mul_v8i32_v8i16:
    512 ; SLOW32:       # %bb.0:
    513 ; SLOW32-NEXT:    movdqa %xmm0, %xmm1
    514 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
    515 ; SLOW32-NEXT:    movdqa %xmm1, %xmm2
    516 ; SLOW32-NEXT:    pmulhuw %xmm0, %xmm2
    517 ; SLOW32-NEXT:    pmullw %xmm0, %xmm1
    518 ; SLOW32-NEXT:    movdqa %xmm1, %xmm0
    519 ; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    520 ; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    521 ; SLOW32-NEXT:    retl
    522 ;
    523 ; SLOW64-LABEL: test_mul_v8i32_v8i16:
    524 ; SLOW64:       # %bb.0:
    525 ; SLOW64-NEXT:    movdqa %xmm0, %xmm1
    526 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
    527 ; SLOW64-NEXT:    movdqa %xmm1, %xmm2
    528 ; SLOW64-NEXT:    pmulhuw %xmm0, %xmm2
    529 ; SLOW64-NEXT:    pmullw %xmm0, %xmm1
    530 ; SLOW64-NEXT:    movdqa %xmm1, %xmm0
    531 ; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    532 ; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    533 ; SLOW64-NEXT:    retq
    534 ;
    535 ; SSE4-32-LABEL: test_mul_v8i32_v8i16:
    536 ; SSE4-32:       # %bb.0:
    537 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    538 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    539 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    540 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
    541 ; SSE4-32-NEXT:    pmulld %xmm2, %xmm0
    542 ; SSE4-32-NEXT:    pmulld %xmm2, %xmm1
    543 ; SSE4-32-NEXT:    retl
    544 ;
    545 ; SSE4-64-LABEL: test_mul_v8i32_v8i16:
    546 ; SSE4-64:       # %bb.0:
    547 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    548 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    549 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    550 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
    551 ; SSE4-64-NEXT:    pmulld %xmm2, %xmm0
    552 ; SSE4-64-NEXT:    pmulld %xmm2, %xmm1
    553 ; SSE4-64-NEXT:    retq
    554 ;
    555 ; AVX-32-LABEL: test_mul_v8i32_v8i16:
    556 ; AVX-32:       # %bb.0:
    557 ; AVX-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    558 ; AVX-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
    559 ; AVX-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
    560 ; AVX-32-NEXT:    retl
    561 ;
    562 ; AVX-64-LABEL: test_mul_v8i32_v8i16:
    563 ; AVX-64:       # %bb.0:
    564 ; AVX-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    565 ; AVX-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
    566 ; AVX-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
    567 ; AVX-64-NEXT:    retq
    568   %z = zext <8 x i16> %A to <8 x i32>
    569   %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
    570   ret <8 x i32> %m
    571 }
    572 
    573 define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
        ; Zero-extends 16 x i16 to 16 x i32 and multiplies by a splat of 18778.
        ; Exercises the widest 16-bit case: slow-pmulld targets split into two
        ; pmullw/pmulhuw + punpck pairs, SSE4.1 needs four pmulld ops, AVX2 two
        ; 256-bit vpmulld ops, and AVX-512 a single zmm vpmulld with a broadcast
        ; memory operand.
    574 ; SLM32-LABEL: test_mul_v16i32_v16i16:
    575 ; SLM32:       # %bb.0:
    576 ; SLM32-NEXT:    movdqa %xmm1, %xmm3
    577 ; SLM32-NEXT:    movdqa %xmm0, %xmm1
    578 ; SLM32-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
    579 ; SLM32-NEXT:    movdqa %xmm1, %xmm2
    580 ; SLM32-NEXT:    movdqa %xmm3, %xmm4
    581 ; SLM32-NEXT:    pmullw %xmm0, %xmm1
    582 ; SLM32-NEXT:    pmulhuw %xmm0, %xmm2
    583 ; SLM32-NEXT:    pmullw %xmm0, %xmm3
    584 ; SLM32-NEXT:    pmulhuw %xmm0, %xmm4
    585 ; SLM32-NEXT:    movdqa %xmm1, %xmm0
    586 ; SLM32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    587 ; SLM32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    588 ; SLM32-NEXT:    movdqa %xmm3, %xmm2
    589 ; SLM32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
    590 ; SLM32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
    591 ; SLM32-NEXT:    retl
    592 ;
    593 ; SLM64-LABEL: test_mul_v16i32_v16i16:
    594 ; SLM64:       # %bb.0:
    595 ; SLM64-NEXT:    movdqa %xmm1, %xmm3
    596 ; SLM64-NEXT:    movdqa %xmm0, %xmm1
    597 ; SLM64-NEXT:    movdqa {{.*#+}} xmm0 = [18778,18778,18778,18778,18778,18778,18778,18778]
    598 ; SLM64-NEXT:    movdqa %xmm1, %xmm2
    599 ; SLM64-NEXT:    movdqa %xmm3, %xmm4
    600 ; SLM64-NEXT:    pmullw %xmm0, %xmm1
    601 ; SLM64-NEXT:    pmulhuw %xmm0, %xmm2
    602 ; SLM64-NEXT:    pmullw %xmm0, %xmm3
    603 ; SLM64-NEXT:    pmulhuw %xmm0, %xmm4
    604 ; SLM64-NEXT:    movdqa %xmm1, %xmm0
    605 ; SLM64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    606 ; SLM64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    607 ; SLM64-NEXT:    movdqa %xmm3, %xmm2
    608 ; SLM64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
    609 ; SLM64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
    610 ; SLM64-NEXT:    retq
    611 ;
    612 ; SLOW32-LABEL: test_mul_v16i32_v16i16:
    613 ; SLOW32:       # %bb.0:
    614 ; SLOW32-NEXT:    movdqa %xmm1, %xmm3
    615 ; SLOW32-NEXT:    movdqa %xmm0, %xmm1
    616 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
    617 ; SLOW32-NEXT:    movdqa %xmm0, %xmm4
    618 ; SLOW32-NEXT:    pmulhuw %xmm2, %xmm4
    619 ; SLOW32-NEXT:    pmullw %xmm2, %xmm1
    620 ; SLOW32-NEXT:    movdqa %xmm1, %xmm0
    621 ; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
    622 ; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
    623 ; SLOW32-NEXT:    movdqa %xmm3, %xmm4
    624 ; SLOW32-NEXT:    pmulhuw %xmm2, %xmm4
    625 ; SLOW32-NEXT:    pmullw %xmm2, %xmm3
    626 ; SLOW32-NEXT:    movdqa %xmm3, %xmm2
    627 ; SLOW32-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
    628 ; SLOW32-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
    629 ; SLOW32-NEXT:    retl
    630 ;
    631 ; SLOW64-LABEL: test_mul_v16i32_v16i16:
    632 ; SLOW64:       # %bb.0:
    633 ; SLOW64-NEXT:    movdqa %xmm1, %xmm3
    634 ; SLOW64-NEXT:    movdqa %xmm0, %xmm1
    635 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
    636 ; SLOW64-NEXT:    movdqa %xmm0, %xmm4
    637 ; SLOW64-NEXT:    pmulhuw %xmm2, %xmm4
    638 ; SLOW64-NEXT:    pmullw %xmm2, %xmm1
    639 ; SLOW64-NEXT:    movdqa %xmm1, %xmm0
    640 ; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
    641 ; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
    642 ; SLOW64-NEXT:    movdqa %xmm3, %xmm4
    643 ; SLOW64-NEXT:    pmulhuw %xmm2, %xmm4
    644 ; SLOW64-NEXT:    pmullw %xmm2, %xmm3
    645 ; SLOW64-NEXT:    movdqa %xmm3, %xmm2
    646 ; SLOW64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
    647 ; SLOW64-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
    648 ; SLOW64-NEXT:    retq
    649 ;
    650 ; SSE4-32-LABEL: test_mul_v16i32_v16i16:
    651 ; SSE4-32:       # %bb.0:
    652 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
    653 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
    654 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
    655 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
    656 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    657 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    658 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
    659 ; SSE4-32-NEXT:    pmulld %xmm1, %xmm0
    660 ; SSE4-32-NEXT:    pmulld %xmm1, %xmm2
    661 ; SSE4-32-NEXT:    pmulld %xmm1, %xmm4
    662 ; SSE4-32-NEXT:    pmulld %xmm1, %xmm3
    663 ; SSE4-32-NEXT:    movdqa %xmm4, %xmm1
    664 ; SSE4-32-NEXT:    retl
    665 ;
    666 ; SSE4-64-LABEL: test_mul_v16i32_v16i16:
    667 ; SSE4-64:       # %bb.0:
    668 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
    669 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
    670 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
    671 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
    672 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    673 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    674 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
    675 ; SSE4-64-NEXT:    pmulld %xmm1, %xmm0
    676 ; SSE4-64-NEXT:    pmulld %xmm1, %xmm2
    677 ; SSE4-64-NEXT:    pmulld %xmm1, %xmm4
    678 ; SSE4-64-NEXT:    pmulld %xmm1, %xmm3
    679 ; SSE4-64-NEXT:    movdqa %xmm4, %xmm1
    680 ; SSE4-64-NEXT:    retq
    681 ;
    682 ; AVX2-32-LABEL: test_mul_v16i32_v16i16:
    683 ; AVX2-32:       # %bb.0:
    684 ; AVX2-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
    685 ; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    686 ; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    687 ; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
    688 ; AVX2-32-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
    689 ; AVX2-32-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
    690 ; AVX2-32-NEXT:    retl
    691 ;
    692 ; AVX2-64-LABEL: test_mul_v16i32_v16i16:
    693 ; AVX2-64:       # %bb.0:
    694 ; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
    695 ; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
    696 ; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    697 ; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
    698 ; AVX2-64-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
    699 ; AVX2-64-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
    700 ; AVX2-64-NEXT:    retq
    701 ;
    702 ; AVX512-32-LABEL: test_mul_v16i32_v16i16:
    703 ; AVX512-32:       # %bb.0:
    704 ; AVX512-32-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
    705 ; AVX512-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
    706 ; AVX512-32-NEXT:    retl
    707 ;
    708 ; AVX512-64-LABEL: test_mul_v16i32_v16i16:
    709 ; AVX512-64:       # %bb.0:
    710 ; AVX512-64-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
    711 ; AVX512-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
    712 ; AVX512-64-NEXT:    retq
    713   %z = zext <16 x i16> %A to <16 x i32>
    714   %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
    715   ret <16 x i32> %m
    716 }
    717 
    718 ;
    719 ; MinSize Tests
    720 ;
    721 
    722 define <4 x i32> @test_mul_v4i32_v4i8_minsize(<4 x i8> %A) minsize {
        ; Under the `minsize` attribute, the zext-from-i8 + splat multiply should
        ; be lowered as a masking `pand` followed by `pmaddwd` against a constant
        ; pool (smaller encoding than pmulld); KNL is the exception and keeps
        ; vpmulld with a broadcast constant.
    723 ; CHECK32-LABEL: test_mul_v4i32_v4i8_minsize:
    724 ; CHECK32:       # %bb.0:
    725 ; CHECK32-NEXT:    pand {{\.LCPI.*}}, %xmm0
    726 ; CHECK32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
    727 ; CHECK32-NEXT:    retl
    728 ;
    729 ; CHECK64-LABEL: test_mul_v4i32_v4i8_minsize:
    730 ; CHECK64:       # %bb.0:
    731 ; CHECK64-NEXT:    pand {{.*}}(%rip), %xmm0
    732 ; CHECK64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
    733 ; CHECK64-NEXT:    retq
    734 ;
    735 ; SSE4-32-LABEL: test_mul_v4i32_v4i8_minsize:
    736 ; SSE4-32:       # %bb.0:
    737 ; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
    738 ; SSE4-32-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm0
    739 ; SSE4-32-NEXT:    retl
    740 ;
    741 ; SSE4-64-LABEL: test_mul_v4i32_v4i8_minsize:
    742 ; SSE4-64:       # %bb.0:
    743 ; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
    744 ; SSE4-64-NEXT:    pmaddwd {{.*}}(%rip), %xmm0
    745 ; SSE4-64-NEXT:    retq
    746 ;
    747 ; AVX2-32-LABEL: test_mul_v4i32_v4i8_minsize:
    748 ; AVX2-32:       # %bb.0:
    749 ; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    750 ; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
    751 ; AVX2-32-NEXT:    retl
    752 ;
    753 ; AVX2-64-LABEL: test_mul_v4i32_v4i8_minsize:
    754 ; AVX2-64:       # %bb.0:
    755 ; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    756 ; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
    757 ; AVX2-64-NEXT:    retq
    758 ;
    759 ; AVX512DQ-32-LABEL: test_mul_v4i32_v4i8_minsize:
    760 ; AVX512DQ-32:       # %bb.0:
    761 ; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    762 ; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
    763 ; AVX512DQ-32-NEXT:    retl
    764 ;
    765 ; AVX512DQ-64-LABEL: test_mul_v4i32_v4i8_minsize:
    766 ; AVX512DQ-64:       # %bb.0:
    767 ; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    768 ; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
    769 ; AVX512DQ-64-NEXT:    retq
    770 ;
    771 ; AVX512BW-32-LABEL: test_mul_v4i32_v4i8_minsize:
    772 ; AVX512BW-32:       # %bb.0:
    773 ; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    774 ; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
    775 ; AVX512BW-32-NEXT:    retl
    776 ;
    777 ; AVX512BW-64-LABEL: test_mul_v4i32_v4i8_minsize:
    778 ; AVX512BW-64:       # %bb.0:
    779 ; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    780 ; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %xmm0, %xmm0
    781 ; AVX512BW-64-NEXT:    retq
    782 ;
    783 ; KNL-32-LABEL: test_mul_v4i32_v4i8_minsize:
    784 ; KNL-32:       # %bb.0:
    785 ; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    786 ; KNL-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
    787 ; KNL-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
    788 ; KNL-32-NEXT:    retl
    789 ;
    790 ; KNL-64-LABEL: test_mul_v4i32_v4i8_minsize:
    791 ; KNL-64:       # %bb.0:
    792 ; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    793 ; KNL-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
    794 ; KNL-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
    795 ; KNL-64-NEXT:    retq
    796   %z = zext <4 x i8> %A to <4 x i32>
    797   %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
    798   ret <4 x i32> %m
    799 }
    800 
    801 define <8 x i32> @test_mul_v8i32_v8i8_minsize(<8 x i8> %A) minsize {
        ; `minsize` variant of the 8 x i8 -> 8 x i32 widening multiply by 18778.
        ; SSE targets mask with pand, zero-extend both halves, and use pmaddwd
        ; with a shared register constant; AVX2/AVX-512 widen to ymm and use a
        ; single vpmaddwd; KNL again falls back to vpmulld with a broadcast.
    802 ; SLM32-LABEL: test_mul_v8i32_v8i8_minsize:
    803 ; SLM32:       # %bb.0:
    804 ; SLM32-NEXT:    pand {{\.LCPI.*}}, %xmm0
    805 ; SLM32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
    806 ; SLM32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    807 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    808 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    809 ; SLM32-NEXT:    pmaddwd %xmm2, %xmm0
    810 ; SLM32-NEXT:    pmaddwd %xmm2, %xmm1
    811 ; SLM32-NEXT:    retl
    812 ;
    813 ; SLM64-LABEL: test_mul_v8i32_v8i8_minsize:
    814 ; SLM64:       # %bb.0:
    815 ; SLM64-NEXT:    pand {{.*}}(%rip), %xmm0
    816 ; SLM64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
    817 ; SLM64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    818 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    819 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    820 ; SLM64-NEXT:    pmaddwd %xmm2, %xmm0
    821 ; SLM64-NEXT:    pmaddwd %xmm2, %xmm1
    822 ; SLM64-NEXT:    retq
    823 ;
    824 ; SLOW32-LABEL: test_mul_v8i32_v8i8_minsize:
    825 ; SLOW32:       # %bb.0:
    826 ; SLOW32-NEXT:    pand {{\.LCPI.*}}, %xmm0
    827 ; SLOW32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    828 ; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    829 ; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    830 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
    831 ; SLOW32-NEXT:    pmaddwd %xmm2, %xmm0
    832 ; SLOW32-NEXT:    pmaddwd %xmm2, %xmm1
    833 ; SLOW32-NEXT:    retl
    834 ;
    835 ; SLOW64-LABEL: test_mul_v8i32_v8i8_minsize:
    836 ; SLOW64:       # %bb.0:
    837 ; SLOW64-NEXT:    pand {{.*}}(%rip), %xmm0
    838 ; SLOW64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    839 ; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    840 ; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    841 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
    842 ; SLOW64-NEXT:    pmaddwd %xmm2, %xmm0
    843 ; SLOW64-NEXT:    pmaddwd %xmm2, %xmm1
    844 ; SLOW64-NEXT:    retq
    845 ;
    846 ; SSE4-32-LABEL: test_mul_v8i32_v8i8_minsize:
    847 ; SSE4-32:       # %bb.0:
    848 ; SSE4-32-NEXT:    pand {{\.LCPI.*}}, %xmm0
    849 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    850 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    851 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    852 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
    853 ; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm0
    854 ; SSE4-32-NEXT:    pmaddwd %xmm2, %xmm1
    855 ; SSE4-32-NEXT:    retl
    856 ;
    857 ; SSE4-64-LABEL: test_mul_v8i32_v8i8_minsize:
    858 ; SSE4-64:       # %bb.0:
    859 ; SSE4-64-NEXT:    pand {{.*}}(%rip), %xmm0
    860 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    861 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
    862 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
    863 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
    864 ; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm0
    865 ; SSE4-64-NEXT:    pmaddwd %xmm2, %xmm1
    866 ; SSE4-64-NEXT:    retq
    867 ;
    868 ; AVX2-32-LABEL: test_mul_v8i32_v8i8_minsize:
    869 ; AVX2-32:       # %bb.0:
    870 ; AVX2-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    871 ; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    872 ; AVX2-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
    873 ; AVX2-32-NEXT:    retl
    874 ;
    875 ; AVX2-64-LABEL: test_mul_v8i32_v8i8_minsize:
    876 ; AVX2-64:       # %bb.0:
    877 ; AVX2-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    878 ; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    879 ; AVX2-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
    880 ; AVX2-64-NEXT:    retq
    881 ;
    882 ; AVX512DQ-32-LABEL: test_mul_v8i32_v8i8_minsize:
    883 ; AVX512DQ-32:       # %bb.0:
    884 ; AVX512DQ-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    885 ; AVX512DQ-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    886 ; AVX512DQ-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
    887 ; AVX512DQ-32-NEXT:    retl
    888 ;
    889 ; AVX512DQ-64-LABEL: test_mul_v8i32_v8i8_minsize:
    890 ; AVX512DQ-64:       # %bb.0:
    891 ; AVX512DQ-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    892 ; AVX512DQ-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    893 ; AVX512DQ-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
    894 ; AVX512DQ-64-NEXT:    retq
    895 ;
    896 ; AVX512BW-32-LABEL: test_mul_v8i32_v8i8_minsize:
    897 ; AVX512BW-32:       # %bb.0:
    898 ; AVX512BW-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    899 ; AVX512BW-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    900 ; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %ymm0, %ymm0
    901 ; AVX512BW-32-NEXT:    retl
    902 ;
    903 ; AVX512BW-64-LABEL: test_mul_v8i32_v8i8_minsize:
    904 ; AVX512BW-64:       # %bb.0:
    905 ; AVX512BW-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    906 ; AVX512BW-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    907 ; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %ymm0, %ymm0
    908 ; AVX512BW-64-NEXT:    retq
    909 ;
    910 ; KNL-32-LABEL: test_mul_v8i32_v8i8_minsize:
    911 ; KNL-32:       # %bb.0:
    912 ; KNL-32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
    913 ; KNL-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    914 ; KNL-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
    915 ; KNL-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
    916 ; KNL-32-NEXT:    retl
    917 ;
    918 ; KNL-64-LABEL: test_mul_v8i32_v8i8_minsize:
    919 ; KNL-64:       # %bb.0:
    920 ; KNL-64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
    921 ; KNL-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
    922 ; KNL-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
    923 ; KNL-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
    924 ; KNL-64-NEXT:    retq
    925   %z = zext <8 x i8> %A to <8 x i32>
    926   %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
    927   ret <8 x i32> %m
    928 }
    929 
    930 define <16 x i32> @test_mul_v16i32_v16i8_minsize(<16 x i8> %A) minsize {
    931 ; SLM32-LABEL: test_mul_v16i32_v16i8_minsize:
    932 ; SLM32:       # %bb.0:
    933 ; SLM32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    934 ; SLM32-NEXT:    movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
    935 ; SLM32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
    936 ; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    937 ; SLM32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    938 ; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    939 ; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    940 ; SLM32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
    941 ; SLM32-NEXT:    pmaddwd %xmm5, %xmm0
    942 ; SLM32-NEXT:    pmaddwd %xmm5, %xmm1
    943 ; SLM32-NEXT:    pmaddwd %xmm5, %xmm2
    944 ; SLM32-NEXT:    pmaddwd %xmm5, %xmm3
    945 ; SLM32-NEXT:    retl
    946 ;
    947 ; SLM64-LABEL: test_mul_v16i32_v16i8_minsize:
    948 ; SLM64:       # %bb.0:
    949 ; SLM64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    950 ; SLM64-NEXT:    movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
    951 ; SLM64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
    952 ; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    953 ; SLM64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    954 ; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    955 ; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    956 ; SLM64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
    957 ; SLM64-NEXT:    pmaddwd %xmm5, %xmm0
    958 ; SLM64-NEXT:    pmaddwd %xmm5, %xmm1
    959 ; SLM64-NEXT:    pmaddwd %xmm5, %xmm2
    960 ; SLM64-NEXT:    pmaddwd %xmm5, %xmm3
    961 ; SLM64-NEXT:    retq
    962 ;
    963 ; SLOW32-LABEL: test_mul_v16i32_v16i8_minsize:
    964 ; SLOW32:       # %bb.0:
    965 ; SLOW32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    966 ; SLOW32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    967 ; SLOW32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    968 ; SLOW32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    969 ; SLOW32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    970 ; SLOW32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    971 ; SLOW32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    972 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
    973 ; SLOW32-NEXT:    pmaddwd %xmm4, %xmm0
    974 ; SLOW32-NEXT:    pmaddwd %xmm4, %xmm1
    975 ; SLOW32-NEXT:    pmaddwd %xmm4, %xmm2
    976 ; SLOW32-NEXT:    pmaddwd %xmm4, %xmm3
    977 ; SLOW32-NEXT:    retl
    978 ;
    979 ; SLOW64-LABEL: test_mul_v16i32_v16i8_minsize:
    980 ; SLOW64:       # %bb.0:
    981 ; SLOW64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    982 ; SLOW64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    983 ; SLOW64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
    984 ; SLOW64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    985 ; SLOW64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
    986 ; SLOW64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    987 ; SLOW64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
    988 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
    989 ; SLOW64-NEXT:    pmaddwd %xmm4, %xmm0
    990 ; SLOW64-NEXT:    pmaddwd %xmm4, %xmm1
    991 ; SLOW64-NEXT:    pmaddwd %xmm4, %xmm2
    992 ; SLOW64-NEXT:    pmaddwd %xmm4, %xmm3
    993 ; SLOW64-NEXT:    retq
    994 ;
    995 ; SSE4-32-LABEL: test_mul_v16i32_v16i8_minsize:
    996 ; SSE4-32:       # %bb.0:
    997 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
    998 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
    999 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1000 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
   1001 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1002 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
   1003 ; SSE4-32-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
   1004 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
   1005 ; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm0
   1006 ; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm1
   1007 ; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm2
   1008 ; SSE4-32-NEXT:    pmaddwd %xmm4, %xmm3
   1009 ; SSE4-32-NEXT:    retl
   1010 ;
   1011 ; SSE4-64-LABEL: test_mul_v16i32_v16i8_minsize:
   1012 ; SSE4-64:       # %bb.0:
   1013 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
   1014 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
   1015 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1016 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
   1017 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
   1018 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
   1019 ; SSE4-64-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
   1020 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm4 = [18778,18778,18778,18778]
   1021 ; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm0
   1022 ; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm1
   1023 ; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm2
   1024 ; SSE4-64-NEXT:    pmaddwd %xmm4, %xmm3
   1025 ; SSE4-64-NEXT:    retq
   1026 ;
   1027 ; AVX2-32-LABEL: test_mul_v16i32_v16i8_minsize:
   1028 ; AVX2-32:       # %bb.0:
   1029 ; AVX2-32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1030 ; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
   1031 ; AVX2-32-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
   1032 ; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
   1033 ; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
   1034 ; AVX2-32-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
   1035 ; AVX2-32-NEXT:    retl
   1036 ;
   1037 ; AVX2-64-LABEL: test_mul_v16i32_v16i8_minsize:
   1038 ; AVX2-64:       # %bb.0:
   1039 ; AVX2-64-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1040 ; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
   1041 ; AVX2-64-NEXT:    vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
   1042 ; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
   1043 ; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
   1044 ; AVX2-64-NEXT:    vpmaddwd %ymm2, %ymm1, %ymm1
   1045 ; AVX2-64-NEXT:    retq
   1046 ;
   1047 ; AVX512DQ-32-LABEL: test_mul_v16i32_v16i8_minsize:
   1048 ; AVX512DQ-32:       # %bb.0:
   1049 ; AVX512DQ-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
   1050 ; AVX512DQ-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
   1051 ; AVX512DQ-32-NEXT:    retl
   1052 ;
   1053 ; AVX512DQ-64-LABEL: test_mul_v16i32_v16i8_minsize:
   1054 ; AVX512DQ-64:       # %bb.0:
   1055 ; AVX512DQ-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
   1056 ; AVX512DQ-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
   1057 ; AVX512DQ-64-NEXT:    retq
   1058 ;
   1059 ; AVX512BW-32-LABEL: test_mul_v16i32_v16i8_minsize:
   1060 ; AVX512BW-32:       # %bb.0:
   1061 ; AVX512BW-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
   1062 ; AVX512BW-32-NEXT:    vpmaddwd {{\.LCPI.*}}, %zmm0, %zmm0
   1063 ; AVX512BW-32-NEXT:    retl
   1064 ;
   1065 ; AVX512BW-64-LABEL: test_mul_v16i32_v16i8_minsize:
   1066 ; AVX512BW-64:       # %bb.0:
   1067 ; AVX512BW-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
   1068 ; AVX512BW-64-NEXT:    vpmaddwd {{.*}}(%rip), %zmm0, %zmm0
   1069 ; AVX512BW-64-NEXT:    retq
   1070 ;
   1071 ; KNL-32-LABEL: test_mul_v16i32_v16i8_minsize:
   1072 ; KNL-32:       # %bb.0:
   1073 ; KNL-32-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
   1074 ; KNL-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
   1075 ; KNL-32-NEXT:    retl
   1076 ;
   1077 ; KNL-64-LABEL: test_mul_v16i32_v16i8_minsize:
   1078 ; KNL-64:       # %bb.0:
   1079 ; KNL-64-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
   1080 ; KNL-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
   1081 ; KNL-64-NEXT:    retq
   1082   %z = zext <16 x i8> %A to <16 x i32>
   1083   %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
   1084   ret <16 x i32> %m
   1085 }
   1086 
; Multiply a zero-extended <4 x i16> by the splat constant 18778 under the
; `minsize` attribute. With minsize, codegen is expected to emit a single
; pmulld/vpmulld against a constant operand even on CPUs with slow pmulld
; (SLM/SLOW prefixes), because the pmaddwd expansion used elsewhere in this
; file is larger. CHECK lines are autogenerated by update_llc_test_checks.py;
; regenerate rather than hand-edit.
   1087 define <4 x i32> @test_mul_v4i32_v4i16_minsize(<4 x i16> %A) minsize {
   1088 ; CHECK32-LABEL: test_mul_v4i32_v4i16_minsize:
   1089 ; CHECK32:       # %bb.0:
   1090 ; CHECK32-NEXT:    pxor %xmm1, %xmm1
   1091 ; CHECK32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
   1092 ; CHECK32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
   1093 ; CHECK32-NEXT:    retl
   1094 ;
   1095 ; CHECK64-LABEL: test_mul_v4i32_v4i16_minsize:
   1096 ; CHECK64:       # %bb.0:
   1097 ; CHECK64-NEXT:    pxor %xmm1, %xmm1
   1098 ; CHECK64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
   1099 ; CHECK64-NEXT:    pmulld {{.*}}(%rip), %xmm0
   1100 ; CHECK64-NEXT:    retq
   1101 ;
   1102 ; SSE4-32-LABEL: test_mul_v4i32_v4i16_minsize:
   1103 ; SSE4-32:       # %bb.0:
   1104 ; SSE4-32-NEXT:    pxor %xmm1, %xmm1
   1105 ; SSE4-32-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
   1106 ; SSE4-32-NEXT:    pmulld {{\.LCPI.*}}, %xmm0
   1107 ; SSE4-32-NEXT:    retl
   1108 ;
   1109 ; SSE4-64-LABEL: test_mul_v4i32_v4i16_minsize:
   1110 ; SSE4-64:       # %bb.0:
   1111 ; SSE4-64-NEXT:    pxor %xmm1, %xmm1
   1112 ; SSE4-64-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
   1113 ; SSE4-64-NEXT:    pmulld {{.*}}(%rip), %xmm0
   1114 ; SSE4-64-NEXT:    retq
   1115 ;
   1116 ; AVX-32-LABEL: test_mul_v4i32_v4i16_minsize:
   1117 ; AVX-32:       # %bb.0:
   1118 ; AVX-32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1119 ; AVX-32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
   1120 ; AVX-32-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
   1121 ; AVX-32-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   1122 ; AVX-32-NEXT:    retl
   1123 ;
   1124 ; AVX-64-LABEL: test_mul_v4i32_v4i16_minsize:
   1125 ; AVX-64:       # %bb.0:
   1126 ; AVX-64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
   1127 ; AVX-64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
   1128 ; AVX-64-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [18778,18778,18778,18778]
   1129 ; AVX-64-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
   1130 ; AVX-64-NEXT:    retq
; Widen the i16 lanes to i32, then multiply by the splat; nuw/nsw hold because
; max u16 * 18778 fits in 32 bits.
   1131   %z = zext <4 x i16> %A to <4 x i32>
   1132   %m = mul nuw nsw <4 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778>
   1133   ret <4 x i32> %m
   1134 }
   1135 
; Same multiply-by-18778 pattern for <8 x i16> -> <8 x i32> under `minsize`.
; SSE targets split the vector into two halves (pshufd + pmovzxwd) and issue
; two pmulld with a shared constant register; AVX targets use a single
; 256-bit vpmovzxwd + vpmulld with a broadcast constant. CHECK lines are
; autogenerated by update_llc_test_checks.py; regenerate rather than hand-edit.
   1136 define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
   1137 ; SLM32-LABEL: test_mul_v8i32_v8i16_minsize:
   1138 ; SLM32:       # %bb.0:
   1139 ; SLM32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
   1140 ; SLM32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1141 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1142 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1143 ; SLM32-NEXT:    pmulld %xmm2, %xmm0
   1144 ; SLM32-NEXT:    pmulld %xmm2, %xmm1
   1145 ; SLM32-NEXT:    retl
   1146 ;
   1147 ; SLM64-LABEL: test_mul_v8i32_v8i16_minsize:
   1148 ; SLM64:       # %bb.0:
   1149 ; SLM64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
   1150 ; SLM64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1151 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1152 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1153 ; SLM64-NEXT:    pmulld %xmm2, %xmm0
   1154 ; SLM64-NEXT:    pmulld %xmm2, %xmm1
   1155 ; SLM64-NEXT:    retq
   1156 ;
   1157 ; SLOW32-LABEL: test_mul_v8i32_v8i16_minsize:
   1158 ; SLOW32:       # %bb.0:
   1159 ; SLOW32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1160 ; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1161 ; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1162 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
   1163 ; SLOW32-NEXT:    pmulld %xmm2, %xmm0
   1164 ; SLOW32-NEXT:    pmulld %xmm2, %xmm1
   1165 ; SLOW32-NEXT:    retl
   1166 ;
   1167 ; SLOW64-LABEL: test_mul_v8i32_v8i16_minsize:
   1168 ; SLOW64:       # %bb.0:
   1169 ; SLOW64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1170 ; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1171 ; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1172 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
   1173 ; SLOW64-NEXT:    pmulld %xmm2, %xmm0
   1174 ; SLOW64-NEXT:    pmulld %xmm2, %xmm1
   1175 ; SLOW64-NEXT:    retq
   1176 ;
   1177 ; SSE4-32-LABEL: test_mul_v8i32_v8i16_minsize:
   1178 ; SSE4-32:       # %bb.0:
   1179 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1180 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1181 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1182 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
   1183 ; SSE4-32-NEXT:    pmulld %xmm2, %xmm0
   1184 ; SSE4-32-NEXT:    pmulld %xmm2, %xmm1
   1185 ; SSE4-32-NEXT:    retl
   1186 ;
   1187 ; SSE4-64-LABEL: test_mul_v8i32_v8i16_minsize:
   1188 ; SSE4-64:       # %bb.0:
   1189 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
   1190 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1191 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1192 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm2 = [18778,18778,18778,18778]
   1193 ; SSE4-64-NEXT:    pmulld %xmm2, %xmm0
   1194 ; SSE4-64-NEXT:    pmulld %xmm2, %xmm1
   1195 ; SSE4-64-NEXT:    retq
   1196 ;
   1197 ; AVX-32-LABEL: test_mul_v8i32_v8i16_minsize:
   1198 ; AVX-32:       # %bb.0:
   1199 ; AVX-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1200 ; AVX-32-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
   1201 ; AVX-32-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
   1202 ; AVX-32-NEXT:    retl
   1203 ;
   1204 ; AVX-64-LABEL: test_mul_v8i32_v8i16_minsize:
   1205 ; AVX-64:       # %bb.0:
   1206 ; AVX-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1207 ; AVX-64-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778]
   1208 ; AVX-64-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
   1209 ; AVX-64-NEXT:    retq
; Widen the i16 lanes to i32, then multiply by the splat; nuw/nsw hold because
; max u16 * 18778 fits in 32 bits.
   1210   %z = zext <8 x i16> %A to <8 x i32>
   1211   %m = mul nuw nsw <8 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
   1212   ret <8 x i32> %m
   1213 }
   1214 
; Widest variant: <16 x i16> -> <16 x i32> multiply by splat 18778 under
; `minsize`. SSE targets split into four 128-bit quarters sharing one constant
; register (note the trailing movdqa to restore the xmm0/xmm1 return pair);
; AVX2 splits into two 256-bit halves; AVX-512 targets use one 512-bit
; vpmovzxwd + vpmulld, with 32-bit targets loading the broadcast constant from
; a constant-pool label and 64-bit targets using a RIP-relative {1to16}
; embedded broadcast. CHECK lines are autogenerated by
; update_llc_test_checks.py; regenerate rather than hand-edit.
   1215 define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
   1216 ; SLM32-LABEL: test_mul_v16i32_v16i16_minsize:
   1217 ; SLM32:       # %bb.0:
   1218 ; SLM32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
   1219 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1220 ; SLM32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   1221 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1222 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1223 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1224 ; SLM32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
   1225 ; SLM32-NEXT:    pmulld %xmm1, %xmm4
   1226 ; SLM32-NEXT:    pmulld %xmm1, %xmm0
   1227 ; SLM32-NEXT:    pmulld %xmm1, %xmm2
   1228 ; SLM32-NEXT:    pmulld %xmm1, %xmm3
   1229 ; SLM32-NEXT:    movdqa %xmm4, %xmm1
   1230 ; SLM32-NEXT:    retl
   1231 ;
   1232 ; SLM64-LABEL: test_mul_v16i32_v16i16_minsize:
   1233 ; SLM64:       # %bb.0:
   1234 ; SLM64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
   1235 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1236 ; SLM64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   1237 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1238 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1239 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1240 ; SLM64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
   1241 ; SLM64-NEXT:    pmulld %xmm1, %xmm4
   1242 ; SLM64-NEXT:    pmulld %xmm1, %xmm0
   1243 ; SLM64-NEXT:    pmulld %xmm1, %xmm2
   1244 ; SLM64-NEXT:    pmulld %xmm1, %xmm3
   1245 ; SLM64-NEXT:    movdqa %xmm4, %xmm1
   1246 ; SLM64-NEXT:    retq
   1247 ;
   1248 ; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
   1249 ; SLOW32:       # %bb.0:
   1250 ; SLOW32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
   1251 ; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1252 ; SLOW32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   1253 ; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1254 ; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1255 ; SLOW32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1256 ; SLOW32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
   1257 ; SLOW32-NEXT:    pmulld %xmm1, %xmm0
   1258 ; SLOW32-NEXT:    pmulld %xmm1, %xmm2
   1259 ; SLOW32-NEXT:    pmulld %xmm1, %xmm4
   1260 ; SLOW32-NEXT:    pmulld %xmm1, %xmm3
   1261 ; SLOW32-NEXT:    movdqa %xmm4, %xmm1
   1262 ; SLOW32-NEXT:    retl
   1263 ;
   1264 ; SLOW64-LABEL: test_mul_v16i32_v16i16_minsize:
   1265 ; SLOW64:       # %bb.0:
   1266 ; SLOW64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
   1267 ; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1268 ; SLOW64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   1269 ; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1270 ; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1271 ; SLOW64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1272 ; SLOW64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
   1273 ; SLOW64-NEXT:    pmulld %xmm1, %xmm0
   1274 ; SLOW64-NEXT:    pmulld %xmm1, %xmm2
   1275 ; SLOW64-NEXT:    pmulld %xmm1, %xmm4
   1276 ; SLOW64-NEXT:    pmulld %xmm1, %xmm3
   1277 ; SLOW64-NEXT:    movdqa %xmm4, %xmm1
   1278 ; SLOW64-NEXT:    retq
   1279 ;
   1280 ; SSE4-32-LABEL: test_mul_v16i32_v16i16_minsize:
   1281 ; SSE4-32:       # %bb.0:
   1282 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
   1283 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1284 ; SSE4-32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   1285 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1286 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1287 ; SSE4-32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1288 ; SSE4-32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
   1289 ; SSE4-32-NEXT:    pmulld %xmm1, %xmm0
   1290 ; SSE4-32-NEXT:    pmulld %xmm1, %xmm2
   1291 ; SSE4-32-NEXT:    pmulld %xmm1, %xmm4
   1292 ; SSE4-32-NEXT:    pmulld %xmm1, %xmm3
   1293 ; SSE4-32-NEXT:    movdqa %xmm4, %xmm1
   1294 ; SSE4-32-NEXT:    retl
   1295 ;
   1296 ; SSE4-64-LABEL: test_mul_v16i32_v16i16_minsize:
   1297 ; SSE4-64:       # %bb.0:
   1298 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
   1299 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1300 ; SSE4-64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   1301 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
   1302 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1303 ; SSE4-64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
   1304 ; SSE4-64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
   1305 ; SSE4-64-NEXT:    pmulld %xmm1, %xmm0
   1306 ; SSE4-64-NEXT:    pmulld %xmm1, %xmm2
   1307 ; SSE4-64-NEXT:    pmulld %xmm1, %xmm4
   1308 ; SSE4-64-NEXT:    pmulld %xmm1, %xmm3
   1309 ; SSE4-64-NEXT:    movdqa %xmm4, %xmm1
   1310 ; SSE4-64-NEXT:    retq
   1311 ;
   1312 ; AVX2-32-LABEL: test_mul_v16i32_v16i16_minsize:
   1313 ; AVX2-32:       # %bb.0:
   1314 ; AVX2-32-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1315 ; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   1316 ; AVX2-32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1317 ; AVX2-32-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
   1318 ; AVX2-32-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
   1319 ; AVX2-32-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
   1320 ; AVX2-32-NEXT:    retl
   1321 ;
   1322 ; AVX2-64-LABEL: test_mul_v16i32_v16i16_minsize:
   1323 ; AVX2-64:       # %bb.0:
   1324 ; AVX2-64-NEXT:    vextracti128 $1, %ymm0, %xmm1
   1325 ; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
   1326 ; AVX2-64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
   1327 ; AVX2-64-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [18778,18778,18778,18778,18778,18778,18778,18778]
   1328 ; AVX2-64-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
   1329 ; AVX2-64-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
   1330 ; AVX2-64-NEXT:    retq
   1331 ;
   1332 ; AVX512-32-LABEL: test_mul_v16i32_v16i16_minsize:
   1333 ; AVX512-32:       # %bb.0:
   1334 ; AVX512-32-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
   1335 ; AVX512-32-NEXT:    vpmulld {{\.LCPI.*}}{1to16}, %zmm0, %zmm0
   1336 ; AVX512-32-NEXT:    retl
   1337 ;
   1338 ; AVX512-64-LABEL: test_mul_v16i32_v16i16_minsize:
   1339 ; AVX512-64:       # %bb.0:
   1340 ; AVX512-64-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
   1341 ; AVX512-64-NEXT:    vpmulld {{.*}}(%rip){1to16}, %zmm0, %zmm0
   1342 ; AVX512-64-NEXT:    retq
; Widen the i16 lanes to i32, then multiply by the splat; nuw/nsw hold because
; max u16 * 18778 fits in 32 bits.
   1343   %z = zext <16 x i16> %A to <16 x i32>
   1344   %m = mul nuw nsw <16 x i32> %z, <i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778, i32 18778>
   1345   ret <16 x i32> %m
   1346 }
   1347