; LLVM regression test (CodeGen/X86): checks that zext'd sub-dword vector
; multiplies are shrunk to narrow multiplies (pmullw/pmaddwd) by llc.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2

      9 @c = external global i32*, align 8
     10 
     11 ; %val1 = load <2 x i8>
     12 ; %op1 = zext<2 x i32> %val1
     13 ; %val2 = load <2 x i8>
     14 ; %op2 = zext<2 x i32> %val2
     15 ; %rst = mul <2 x i32> %op1, %op2
     16 ;
     17 define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
     18 ; X86-SSE-LABEL: mul_2xi8:
     19 ; X86-SSE:       # %bb.0: # %entry
     20 ; X86-SSE-NEXT:    pushl %esi
     21 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
     22 ; X86-SSE-NEXT:    .cfi_offset %esi, -8
     23 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
     24 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
     25 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
     26 ; X86-SSE-NEXT:    movl c, %esi
     27 ; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
     28 ; X86-SSE-NEXT:    movd %edx, %xmm0
     29 ; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
     30 ; X86-SSE-NEXT:    movd %eax, %xmm1
     31 ; X86-SSE-NEXT:    pxor %xmm2, %xmm2
     32 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
     33 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
     34 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
     35 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
     36 ; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
     37 ; X86-SSE-NEXT:    popl %esi
     38 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
     39 ; X86-SSE-NEXT:    retl
     40 ;
     41 ; X86-AVX-LABEL: mul_2xi8:
     42 ; X86-AVX:       # %bb.0: # %entry
     43 ; X86-AVX-NEXT:    pushl %esi
     44 ; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
     45 ; X86-AVX-NEXT:    .cfi_offset %esi, -8
     46 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
     47 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
     48 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
     49 ; X86-AVX-NEXT:    movl c, %esi
     50 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
     51 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
     52 ; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
     53 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
     54 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
     55 ; X86-AVX-NEXT:    popl %esi
     56 ; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
     57 ; X86-AVX-NEXT:    retl
     58 ;
     59 ; X64-SSE-LABEL: mul_2xi8:
     60 ; X64-SSE:       # %bb.0: # %entry
     61 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
     62 ; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
     63 ; X64-SSE-NEXT:    movd %ecx, %xmm0
     64 ; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
     65 ; X64-SSE-NEXT:    movd %ecx, %xmm1
     66 ; X64-SSE-NEXT:    pxor %xmm2, %xmm2
     67 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
     68 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
     69 ; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
     70 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
     71 ; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
     72 ; X64-SSE-NEXT:    retq
     73 ;
     74 ; X64-AVX-LABEL: mul_2xi8:
     75 ; X64-AVX:       # %bb.0: # %entry
     76 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
     77 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
     78 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
     79 ; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
     80 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
     81 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
     82 ; X64-AVX-NEXT:    retq
     83 entry:
     84   %pre = load i32*, i32** @c
     85   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
     86   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
     87   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
     88   %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
     89   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
     90   %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
     91   %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
     92   %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
     93   %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
     94   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
     95   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
     96   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
     97   ret void
     98 }
     99 
    100 ; %val1 = load <4 x i8>
    101 ; %op1 = zext<4 x i32> %val1
    102 ; %val2 = load <4 x i8>
    103 ; %op2 = zext<4 x i32> %val2
    104 ; %rst = mul <4 x i32> %op1, %op2
    105 ;
    106 define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
    107 ; X86-SSE-LABEL: mul_4xi8:
    108 ; X86-SSE:       # %bb.0: # %entry
    109 ; X86-SSE-NEXT:    pushl %esi
    110 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
    111 ; X86-SSE-NEXT:    .cfi_offset %esi, -8
    112 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    113 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    114 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
    115 ; X86-SSE-NEXT:    movl c, %esi
    116 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    117 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
    118 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    119 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    120 ; X86-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
    121 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
    122 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
    123 ; X86-SSE-NEXT:    pmaddwd %xmm0, %xmm2
    124 ; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
    125 ; X86-SSE-NEXT:    popl %esi
    126 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
    127 ; X86-SSE-NEXT:    retl
    128 ;
    129 ; X86-AVX-LABEL: mul_4xi8:
    130 ; X86-AVX:       # %bb.0: # %entry
    131 ; X86-AVX-NEXT:    pushl %esi
    132 ; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
    133 ; X86-AVX-NEXT:    .cfi_offset %esi, -8
    134 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
    135 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    136 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
    137 ; X86-AVX-NEXT:    movl c, %esi
    138 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    139 ; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    140 ; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
    141 ; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
    142 ; X86-AVX-NEXT:    popl %esi
    143 ; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
    144 ; X86-AVX-NEXT:    retl
    145 ;
    146 ; X64-SSE-LABEL: mul_4xi8:
    147 ; X64-SSE:       # %bb.0: # %entry
    148 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
    149 ; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    150 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
    151 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    152 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    153 ; X64-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
    154 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
    155 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
    156 ; X64-SSE-NEXT:    pmaddwd %xmm0, %xmm2
    157 ; X64-SSE-NEXT:    movdqu %xmm2, (%rax,%rdx,4)
    158 ; X64-SSE-NEXT:    retq
    159 ;
    160 ; X64-AVX-LABEL: mul_4xi8:
    161 ; X64-AVX:       # %bb.0: # %entry
    162 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
    163 ; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    164 ; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    165 ; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
    166 ; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
    167 ; X64-AVX-NEXT:    retq
    168 entry:
    169   %pre = load i32*, i32** @c
    170   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    171   %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
    172   %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
    173   %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
    174   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    175   %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
    176   %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
    177   %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
    178   %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
    179   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    180   %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
    181   store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
    182   ret void
    183 }
    184 
    185 ; %val1 = load <8 x i8>
    186 ; %op1 = zext<8 x i32> %val1
    187 ; %val2 = load <8 x i8>
    188 ; %op2 = zext<8 x i32> %val2
    189 ; %rst = mul <8 x i32> %op1, %op2
    190 ;
    191 define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
    192 ; X86-SSE-LABEL: mul_8xi8:
    193 ; X86-SSE:       # %bb.0: # %entry
    194 ; X86-SSE-NEXT:    pushl %esi
    195 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
    196 ; X86-SSE-NEXT:    .cfi_offset %esi, -8
    197 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    198 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    199 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
    200 ; X86-SSE-NEXT:    movl c, %esi
    201 ; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    202 ; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
    203 ; X86-SSE-NEXT:    pxor %xmm2, %xmm2
    204 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
    205 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    206 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
    207 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
    208 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    209 ; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    210 ; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
    211 ; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
    212 ; X86-SSE-NEXT:    popl %esi
    213 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
    214 ; X86-SSE-NEXT:    retl
    215 ;
    216 ; X86-AVX1-LABEL: mul_8xi8:
    217 ; X86-AVX1:       # %bb.0: # %entry
    218 ; X86-AVX1-NEXT:    pushl %esi
    219 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
    220 ; X86-AVX1-NEXT:    .cfi_offset %esi, -8
    221 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
    222 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    223 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
    224 ; X86-AVX1-NEXT:    movl c, %esi
    225 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    226 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    227 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    228 ; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
    229 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    230 ; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
    231 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    232 ; X86-AVX1-NEXT:    vmovups %ymm0, (%esi,%ecx,4)
    233 ; X86-AVX1-NEXT:    popl %esi
    234 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
    235 ; X86-AVX1-NEXT:    vzeroupper
    236 ; X86-AVX1-NEXT:    retl
    237 ;
    238 ; X86-AVX2-LABEL: mul_8xi8:
    239 ; X86-AVX2:       # %bb.0: # %entry
    240 ; X86-AVX2-NEXT:    pushl %esi
    241 ; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
    242 ; X86-AVX2-NEXT:    .cfi_offset %esi, -8
    243 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
    244 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    245 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
    246 ; X86-AVX2-NEXT:    movl c, %esi
    247 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    248 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    249 ; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
    250 ; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
    251 ; X86-AVX2-NEXT:    popl %esi
    252 ; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
    253 ; X86-AVX2-NEXT:    vzeroupper
    254 ; X86-AVX2-NEXT:    retl
    255 ;
    256 ; X64-SSE-LABEL: mul_8xi8:
    257 ; X64-SSE:       # %bb.0: # %entry
    258 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
    259 ; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    260 ; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
    261 ; X64-SSE-NEXT:    pxor %xmm2, %xmm2
    262 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
    263 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    264 ; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
    265 ; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
    266 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    267 ; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    268 ; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
    269 ; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
    270 ; X64-SSE-NEXT:    retq
    271 ;
    272 ; X64-AVX1-LABEL: mul_8xi8:
    273 ; X64-AVX1:       # %bb.0: # %entry
    274 ; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
    275 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    276 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    277 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    278 ; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
    279 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    280 ; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
    281 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    282 ; X64-AVX1-NEXT:    vmovups %ymm0, (%rax,%rdx,4)
    283 ; X64-AVX1-NEXT:    vzeroupper
    284 ; X64-AVX1-NEXT:    retq
    285 ;
    286 ; X64-AVX2-LABEL: mul_8xi8:
    287 ; X64-AVX2:       # %bb.0: # %entry
    288 ; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
    289 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    290 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    291 ; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
    292 ; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
    293 ; X64-AVX2-NEXT:    vzeroupper
    294 ; X64-AVX2-NEXT:    retq
    295 entry:
    296   %pre = load i32*, i32** @c
    297   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    298   %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
    299   %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
    300   %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
    301   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    302   %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
    303   %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
    304   %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
    305   %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
    306   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    307   %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
    308   store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
    309   ret void
    310 }
    311 
    312 ; %val1 = load <16 x i8>
    313 ; %op1 = zext<16 x i32> %val1
    314 ; %val2 = load <16 x i8>
    315 ; %op2 = zext<16 x i32> %val2
    316 ; %rst = mul <16 x i32> %op1, %op2
    317 ;
    318 define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
    319 ; X86-SSE-LABEL: mul_16xi8:
    320 ; X86-SSE:       # %bb.0: # %entry
    321 ; X86-SSE-NEXT:    pushl %esi
    322 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
    323 ; X86-SSE-NEXT:    .cfi_offset %esi, -8
    324 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
    325 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    326 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
    327 ; X86-SSE-NEXT:    movl c, %esi
    328 ; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
    329 ; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
    330 ; X86-SSE-NEXT:    pxor %xmm2, %xmm2
    331 ; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
    332 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
    333 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
    334 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
    335 ; X86-SSE-NEXT:    pmullw %xmm3, %xmm4
    336 ; X86-SSE-NEXT:    movdqa %xmm4, %xmm3
    337 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
    338 ; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
    339 ; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
    340 ; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
    341 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
    342 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
    343 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    344 ; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    345 ; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
    346 ; X86-SSE-NEXT:    movdqu %xmm0, 32(%esi,%ecx,4)
    347 ; X86-SSE-NEXT:    movdqu %xmm4, 16(%esi,%ecx,4)
    348 ; X86-SSE-NEXT:    movdqu %xmm3, (%esi,%ecx,4)
    349 ; X86-SSE-NEXT:    popl %esi
    350 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
    351 ; X86-SSE-NEXT:    retl
    352 ;
    353 ; X86-AVX1-LABEL: mul_16xi8:
    354 ; X86-AVX1:       # %bb.0: # %entry
    355 ; X86-AVX1-NEXT:    pushl %esi
    356 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
    357 ; X86-AVX1-NEXT:    .cfi_offset %esi, -8
    358 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
    359 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    360 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
    361 ; X86-AVX1-NEXT:    movl c, %esi
    362 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    363 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    364 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    365 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    366 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    367 ; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
    368 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    369 ; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
    370 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    371 ; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
    372 ; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    373 ; X86-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
    374 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
    375 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    376 ; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
    377 ; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
    378 ; X86-AVX1-NEXT:    popl %esi
    379 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
    380 ; X86-AVX1-NEXT:    vzeroupper
    381 ; X86-AVX1-NEXT:    retl
    382 ;
    383 ; X86-AVX2-LABEL: mul_16xi8:
    384 ; X86-AVX2:       # %bb.0: # %entry
    385 ; X86-AVX2-NEXT:    pushl %esi
    386 ; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
    387 ; X86-AVX2-NEXT:    .cfi_offset %esi, -8
    388 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
    389 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
    390 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
    391 ; X86-AVX2-NEXT:    movl c, %esi
    392 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    393 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    394 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    395 ; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
    396 ; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    397 ; X86-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
    398 ; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
    399 ; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
    400 ; X86-AVX2-NEXT:    popl %esi
    401 ; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
    402 ; X86-AVX2-NEXT:    vzeroupper
    403 ; X86-AVX2-NEXT:    retl
    404 ;
    405 ; X64-SSE-LABEL: mul_16xi8:
    406 ; X64-SSE:       # %bb.0: # %entry
    407 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
    408 ; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
    409 ; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
    410 ; X64-SSE-NEXT:    pxor %xmm2, %xmm2
    411 ; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
    412 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
    413 ; X64-SSE-NEXT:    movdqa %xmm1, %xmm4
    414 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
    415 ; X64-SSE-NEXT:    pmullw %xmm3, %xmm4
    416 ; X64-SSE-NEXT:    movdqa %xmm4, %xmm3
    417 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
    418 ; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
    419 ; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
    420 ; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
    421 ; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
    422 ; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
    423 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    424 ; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    425 ; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
    426 ; X64-SSE-NEXT:    movdqu %xmm0, 32(%rax,%rdx,4)
    427 ; X64-SSE-NEXT:    movdqu %xmm4, 16(%rax,%rdx,4)
    428 ; X64-SSE-NEXT:    movdqu %xmm3, (%rax,%rdx,4)
    429 ; X64-SSE-NEXT:    retq
    430 ;
    431 ; X64-AVX1-LABEL: mul_16xi8:
    432 ; X64-AVX1:       # %bb.0: # %entry
    433 ; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
    434 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    435 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    436 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    437 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    438 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    439 ; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
    440 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    441 ; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
    442 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    443 ; X64-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
    444 ; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
    445 ; X64-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
    446 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
    447 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
    448 ; X64-AVX1-NEXT:    vmovups %ymm0, 32(%rax,%rdx,4)
    449 ; X64-AVX1-NEXT:    vmovups %ymm2, (%rax,%rdx,4)
    450 ; X64-AVX1-NEXT:    vzeroupper
    451 ; X64-AVX1-NEXT:    retq
    452 ;
    453 ; X64-AVX2-LABEL: mul_16xi8:
    454 ; X64-AVX2:       # %bb.0: # %entry
    455 ; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
    456 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    457 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    458 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    459 ; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
    460 ; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
    461 ; X64-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
    462 ; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
    463 ; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
    464 ; X64-AVX2-NEXT:    vzeroupper
    465 ; X64-AVX2-NEXT:    retq
    466 entry:
    467   %pre = load i32*, i32** @c
    468   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    469   %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
    470   %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
    471   %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
    472   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    473   %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
    474   %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
    475   %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
    476   %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
    477   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    478   %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
    479   store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
    480   ret void
    481 }
    482 
    483 ; %val1 = load <2 x i16>
    484 ; %op1 = zext<2 x i32> %val1
    485 ; %val2 = load <2 x i16>
    486 ; %op2 = zext<2 x i32> %val2
    487 ; %rst = mul <2 x i32> %op1, %op2
    488 ;
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Assertions below were autogenerated by utils/update_llc_test_checks.py (see
; file header); regenerate with that script rather than hand-editing.
; X86-SSE-LABEL: mul_2xi16:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi16:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX-NEXT:    .cfi_offset %esi, -8
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    movl c, %esi
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi16:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi16:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT:    retq
entry:
  ; @c holds the base pointer of the i32 output array; products are stored
  ; to c[index + i].
  %pre = load i32*, i32** @c
  ; Two i16 lanes from a[index..], loaded align 1 (unaligned widening load).
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  ; Same pattern for the second operand from b[index..].
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  ; Widened multiply; the CHECK lines above pin how llc lowers zext+mul
  ; per target (pmulhuw/pmullw+punpcklwd on SSE2, vpmovzxwd+vpmulld on AVX).
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
    567 
    568 ; %val1 = load <4 x i16>
    569 ; %op1 = zext<4 x i32> %val1
    570 ; %val2 = load <4 x i16>
    571 ; %op2 = zext<4 x i32> %val2
    572 ; %rst = mul <4 x i32> %op1, %op2
    573 ;
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Assertions below were autogenerated by utils/update_llc_test_checks.py (see
; file header); regenerate with that script rather than hand-editing.
; X86-SSE-LABEL: mul_4xi16:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_4xi16:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX-NEXT:    .cfi_offset %esi, -8
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    movl c, %esi
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_4xi16:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_4xi16:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT:    retq
entry:
  ; @c holds the base pointer of the i32 output array; products are stored
  ; to c[index + i].
  %pre = load i32*, i32** @c
  ; Four i16 lanes from a[index..], loaded align 1 (unaligned widening load).
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
  ; Same pattern for the second operand from b[index..].
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
  ; Widened multiply; CHECK lines pin the per-target lowering
  ; (pmulhuw/pmullw+punpcklwd on SSE2, vpmovzxwd+vpmulld on AVX).
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}
    648 
    649 ; %val1 = load <8 x i16>
    650 ; %op1 = zext<8 x i32> %val1
    651 ; %val2 = load <8 x i16>
    652 ; %op2 = zext<8 x i32> %val2
    653 ; %rst = mul <8 x i32> %op1, %op2
    654 ;
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Assertions below were autogenerated by utils/update_llc_test_checks.py (see
; file header); regenerate with that script rather than hand-editing.
; X86-SSE-LABEL: mul_8xi16:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: mul_8xi16:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    pushl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX1-NEXT:    .cfi_offset %esi, -8
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT:    movl c, %esi
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    vmovups %ymm0, (%esi,%ecx,4)
; X86-AVX1-NEXT:    popl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: mul_8xi16:
; X86-AVX2:       # %bb.0: # %entry
; X86-AVX2-NEXT:    pushl %esi
; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX2-NEXT:    .cfi_offset %esi, -8
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT:    movl c, %esi
; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
; X86-AVX2-NEXT:    popl %esi
; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X64-SSE-LABEL: mul_8xi16:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: mul_8xi16:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm2, %xmm0
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovups %ymm0, (%rax,%rdx,4)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: mul_8xi16:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
entry:
  ; @c holds the base pointer of the i32 output array; products are stored
  ; to c[index + i].
  %pre = load i32*, i32** @c
  ; Eight i16 lanes from a[index..], loaded align 1 (unaligned widening load).
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
  %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
  %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
  ; Same pattern for the second operand from b[index..].
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
  %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
  %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
  ; Widened multiply; CHECK lines pin the per-target lowering, including the
  ; AVX1 128-bit split (two vpmulld + vinsertf128) vs the single AVX2 vpmulld.
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}
    773 
    774 ; %val1 = load <16 x i16>
    775 ; %op1 = zext<16 x i32> %val1
    776 ; %val2 = load <16 x i16>
    777 ; %op2 = zext<16 x i32> %val2
    778 ; %rst = mul <16 x i32> %op1, %op2
    779 ;
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Assertions below were autogenerated by utils/update_llc_test_checks.py (see
; file header); regenerate with that script rather than hand-editing.
; X86-SSE-LABEL: mul_16xi16:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm4
; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm4
; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X86-SSE-NEXT:    movdqu %xmm3, 48(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm1, 32(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm2, 16(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: mul_16xi16:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    pushl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX1-NEXT:    .cfi_offset %esi, -8
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT:    movl c, %esi
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
; X86-AVX1-NEXT:    popl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: mul_16xi16:
; X86-AVX2:       # %bb.0: # %entry
; X86-AVX2-NEXT:    pushl %esi
; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX2-NEXT:    .cfi_offset %esi, -8
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT:    movl c, %esi
; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
; X86-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
; X86-AVX2-NEXT:    popl %esi
; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X64-SSE-LABEL: mul_16xi16:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm4
; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm4
; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X64-SSE-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: mul_16xi16:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovups %ymm0, 32(%rax,%rdx,4)
; X64-AVX1-NEXT:    vmovups %ymm2, (%rax,%rdx,4)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: mul_16xi16:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
entry:
  ; @c holds the base pointer of the i32 output array; products are stored
  ; to c[index + i].
  %pre = load i32*, i32** @c
  ; Sixteen i16 lanes from a[index..], loaded align 1 (unaligned widening load).
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
  ; Same pattern for the second operand from b[index..].
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
  ; Widened multiply; CHECK lines pin the split into two 16-byte halves on
  ; SSE2, four 128-bit vpmulld on AVX1, and two 256-bit vpmulld on AVX2.
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}
    942 
    943 ; %val1 = load <2 x i8>
    944 ; %op1 = sext<2 x i32> %val1
    945 ; %val2 = load <2 x i8>
    946 ; %op2 = sext<2 x i32> %val2
    947 ; %rst = mul <2 x i32> %op1, %op2
    948 ;
define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Assertions below were autogenerated by utils/update_llc_test_checks.py (see
; file header); regenerate with that script rather than hand-editing.
; X86-SSE-LABEL: mul_2xi8_sext:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
; X86-SSE-NEXT:    movd %edx, %xmm0
; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
; X86-SSE-NEXT:    movd %eax, %xmm1
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    psraw $8, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    psraw $8, %xmm1
; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    psrad $16, %xmm0
; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8_sext:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX-NEXT:    .cfi_offset %esi, -8
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    movl c, %esi
; X86-AVX-NEXT:    vpmovsxbq (%edx,%ecx), %xmm0
; X86-AVX-NEXT:    vpmovsxbq (%eax,%ecx), %xmm1
; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8_sext:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm1
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT:    psraw $8, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT:    psraw $8, %xmm1
; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    psrad $16, %xmm0
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8_sext:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rdx), %xmm0
; X64-AVX-NEXT:    vpmovsxbq (%rsi,%rdx), %xmm1
; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT:    retq
entry:
  ; @c holds the base pointer of the i32 output array; products are stored
  ; to c[index + i].
  %pre = load i32*, i32** @c
  ; Two i8 lanes from a[index..], loaded align 1 and sign-extended
  ; (contrast with the zext variants above).
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  ; Same pattern for the second operand from b[index..].
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
  ; NOTE(review): nuw on a product of sign-extended operands looks unusual,
  ; but the flags are part of the original test input — do not change without
  ; regenerating the CHECK lines.
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
   1035 
   1036 ; %val1 = load <2 x i8>
   1037 ; %op1 = sext<2 x i32> %val1
   1038 ; %val2 = load <2 x i8>
   1039 ; %op2 = zext<2 x i32> %val2
   1040 ; %rst = mul <2 x i32> %op1, %op2
   1041 ;
   1042 define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; NOTE(review): the CHECK lines below are autogenerated by utils/update_llc_test_checks.py
; (see the note at the top of the file) — regenerate them with that script rather than
; editing by hand, or the assertions will drift from real llc output.
   1043 ; X86-SSE-LABEL: mul_2xi8_sext_zext:
   1044 ; X86-SSE:       # %bb.0: # %entry
   1045 ; X86-SSE-NEXT:    pushl %esi
   1046 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
   1047 ; X86-SSE-NEXT:    .cfi_offset %esi, -8
   1048 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1049 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1050 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1051 ; X86-SSE-NEXT:    movl c, %esi
   1052 ; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
   1053 ; X86-SSE-NEXT:    movd %edx, %xmm0
   1054 ; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
   1055 ; X86-SSE-NEXT:    movd %eax, %xmm1
   1056 ; X86-SSE-NEXT:    pxor %xmm2, %xmm2
   1057 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
   1058 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1059 ; X86-SSE-NEXT:    psraw $8, %xmm0
   1060 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
   1061 ; X86-SSE-NEXT:    pmulhw %xmm0, %xmm2
   1062 ; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
   1063 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
   1064 ; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
   1065 ; X86-SSE-NEXT:    popl %esi
   1066 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
   1067 ; X86-SSE-NEXT:    retl
   1068 ;
   1069 ; X86-AVX-LABEL: mul_2xi8_sext_zext:
   1070 ; X86-AVX:       # %bb.0: # %entry
   1071 ; X86-AVX-NEXT:    pushl %esi
   1072 ; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
   1073 ; X86-AVX-NEXT:    .cfi_offset %esi, -8
   1074 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1075 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1076 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1077 ; X86-AVX-NEXT:    movl c, %esi
   1078 ; X86-AVX-NEXT:    vpmovsxbq (%edx,%ecx), %xmm0
   1079 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
   1080 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   1081 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1082 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
   1083 ; X86-AVX-NEXT:    popl %esi
   1084 ; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
   1085 ; X86-AVX-NEXT:    retl
   1086 ;
   1087 ; X64-SSE-LABEL: mul_2xi8_sext_zext:
   1088 ; X64-SSE:       # %bb.0: # %entry
   1089 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
   1090 ; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
   1091 ; X64-SSE-NEXT:    movd %ecx, %xmm0
   1092 ; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
   1093 ; X64-SSE-NEXT:    movd %ecx, %xmm1
   1094 ; X64-SSE-NEXT:    pxor %xmm2, %xmm2
   1095 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
   1096 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1097 ; X64-SSE-NEXT:    psraw $8, %xmm0
   1098 ; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
   1099 ; X64-SSE-NEXT:    pmulhw %xmm0, %xmm2
   1100 ; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
   1101 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
   1102 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
   1103 ; X64-SSE-NEXT:    retq
   1104 ;
   1105 ; X64-AVX-LABEL: mul_2xi8_sext_zext:
   1106 ; X64-AVX:       # %bb.0: # %entry
   1107 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
   1108 ; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rdx), %xmm0
   1109 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
   1110 ; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   1111 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1112 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
   1113 ; X64-AVX-NEXT:    retq
; IR under test: load <2 x i8> from %a (sext to <2 x i32>) and from %b (zext to
; <2 x i32>), multiply the widened vectors, and store the <2 x i32> product into
; the i32 array loaded from @c, all indexed by %index.
   1114 entry:
   1115   %pre = load i32*, i32** @c
   1116   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
   1117   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
   1118   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
   1119   %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
   1120   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
   1121   %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
   1122   %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
   1123   %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
   1124   %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
   1125   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
   1126   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
   1127   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
   1128   ret void
   1129 }
   1130 
   1131 ; %val1 = load <2 x i16>
   1132 ; %op1 = sext<2 x i32> %val1
   1133 ; %val2 = load <2 x i16>
   1134 ; %op2 = sext<2 x i32> %val2
   1135 ; %rst = mul <2 x i32> %op1, %op2
   1136 ;
   1137 define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py —
; regenerate with the script instead of hand-editing.
   1138 ; X86-SSE-LABEL: mul_2xi16_sext:
   1139 ; X86-SSE:       # %bb.0: # %entry
   1140 ; X86-SSE-NEXT:    pushl %esi
   1141 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
   1142 ; X86-SSE-NEXT:    .cfi_offset %esi, -8
   1143 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1144 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1145 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1146 ; X86-SSE-NEXT:    movl c, %esi
   1147 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1148 ; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1149 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
   1150 ; X86-SSE-NEXT:    pmulhw %xmm0, %xmm2
   1151 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
   1152 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
   1153 ; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
   1154 ; X86-SSE-NEXT:    popl %esi
   1155 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
   1156 ; X86-SSE-NEXT:    retl
   1157 ;
   1158 ; X86-AVX-LABEL: mul_2xi16_sext:
   1159 ; X86-AVX:       # %bb.0: # %entry
   1160 ; X86-AVX-NEXT:    pushl %esi
   1161 ; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
   1162 ; X86-AVX-NEXT:    .cfi_offset %esi, -8
   1163 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1164 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1165 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1166 ; X86-AVX-NEXT:    movl c, %esi
   1167 ; X86-AVX-NEXT:    vpmovsxwq (%edx,%ecx), %xmm0
   1168 ; X86-AVX-NEXT:    vpmovsxwq (%eax,%ecx), %xmm1
   1169 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   1170 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1171 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
   1172 ; X86-AVX-NEXT:    popl %esi
   1173 ; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
   1174 ; X86-AVX-NEXT:    retl
   1175 ;
   1176 ; X64-SSE-LABEL: mul_2xi16_sext:
   1177 ; X64-SSE:       # %bb.0: # %entry
   1178 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
   1179 ; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1180 ; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1181 ; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
   1182 ; X64-SSE-NEXT:    pmulhw %xmm0, %xmm2
   1183 ; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
   1184 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
   1185 ; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
   1186 ; X64-SSE-NEXT:    retq
   1187 ;
   1188 ; X64-AVX-LABEL: mul_2xi16_sext:
   1189 ; X64-AVX:       # %bb.0: # %entry
   1190 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
   1191 ; X64-AVX-NEXT:    vpmovsxwq (%rdi,%rdx), %xmm0
   1192 ; X64-AVX-NEXT:    vpmovsxwq (%rsi,%rdx), %xmm1
   1193 ; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   1194 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1195 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
   1196 ; X64-AVX-NEXT:    retq
; IR under test: load two <2 x i16> vectors (from %a and %b at %index), sign-extend
; both to <2 x i32>, multiply, and store the product into the i32 array from @c.
   1197 entry:
   1198   %pre = load i32*, i32** @c
   1199   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
   1200   %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
   1201   %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
   1202   %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
   1203   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
   1204   %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
   1205   %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
   1206   %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
   1207   %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
   1208   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
   1209   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
   1210   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
   1211   ret void
   1212 }
   1213 
   1214 ; %val1 = load <2 x i16>
   1215 ; %op1 = sext<2 x i32> %val1
   1216 ; %val2 = load <2 x i16>
   1217 ; %op2 = zext<2 x i32> %val2
   1218 ; %rst = mul <2 x i32> %op1, %op2
   1219 ;
   1220 define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py —
; regenerate with the script instead of hand-editing.
   1221 ; X86-SSE-LABEL: mul_2xi16_sext_zext:
   1222 ; X86-SSE:       # %bb.0: # %entry
   1223 ; X86-SSE-NEXT:    pushl %esi
   1224 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
   1225 ; X86-SSE-NEXT:    .cfi_offset %esi, -8
   1226 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1227 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1228 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1229 ; X86-SSE-NEXT:    movl c, %esi
   1230 ; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1231 ; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
   1232 ; X86-SSE-NEXT:    psrad $16, %xmm0
   1233 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
   1234 ; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1235 ; X86-SSE-NEXT:    pxor %xmm2, %xmm2
   1236 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
   1237 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
   1238 ; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
   1239 ; X86-SSE-NEXT:    psrlq $32, %xmm2
   1240 ; X86-SSE-NEXT:    pmuludq %xmm0, %xmm2
   1241 ; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
   1242 ; X86-SSE-NEXT:    psrlq $32, %xmm3
   1243 ; X86-SSE-NEXT:    pmuludq %xmm1, %xmm3
   1244 ; X86-SSE-NEXT:    paddq %xmm2, %xmm3
   1245 ; X86-SSE-NEXT:    psllq $32, %xmm3
   1246 ; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
   1247 ; X86-SSE-NEXT:    paddq %xmm3, %xmm1
   1248 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
   1249 ; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
   1250 ; X86-SSE-NEXT:    popl %esi
   1251 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
   1252 ; X86-SSE-NEXT:    retl
   1253 ;
   1254 ; X86-AVX-LABEL: mul_2xi16_sext_zext:
   1255 ; X86-AVX:       # %bb.0: # %entry
   1256 ; X86-AVX-NEXT:    pushl %esi
   1257 ; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
   1258 ; X86-AVX-NEXT:    .cfi_offset %esi, -8
   1259 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1260 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1261 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1262 ; X86-AVX-NEXT:    movl c, %esi
   1263 ; X86-AVX-NEXT:    vpmovsxwq (%edx,%ecx), %xmm0
   1264 ; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1265 ; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1266 ; X86-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
   1267 ; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   1268 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1269 ; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
   1270 ; X86-AVX-NEXT:    popl %esi
   1271 ; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
   1272 ; X86-AVX-NEXT:    retl
   1273 ;
   1274 ; X64-SSE-LABEL: mul_2xi16_sext_zext:
   1275 ; X64-SSE:       # %bb.0: # %entry
   1276 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
   1277 ; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
   1278 ; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
   1279 ; X64-SSE-NEXT:    psrad $16, %xmm0
   1280 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
   1281 ; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1282 ; X64-SSE-NEXT:    pxor %xmm2, %xmm2
   1283 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
   1284 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
   1285 ; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
   1286 ; X64-SSE-NEXT:    psrlq $32, %xmm2
   1287 ; X64-SSE-NEXT:    pmuludq %xmm0, %xmm2
   1288 ; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
   1289 ; X64-SSE-NEXT:    psrlq $32, %xmm3
   1290 ; X64-SSE-NEXT:    pmuludq %xmm1, %xmm3
   1291 ; X64-SSE-NEXT:    paddq %xmm2, %xmm3
   1292 ; X64-SSE-NEXT:    psllq $32, %xmm3
   1293 ; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
   1294 ; X64-SSE-NEXT:    paddq %xmm3, %xmm1
   1295 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
   1296 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
   1297 ; X64-SSE-NEXT:    retq
   1298 ;
   1299 ; X64-AVX-LABEL: mul_2xi16_sext_zext:
   1300 ; X64-AVX:       # %bb.0: # %entry
   1301 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
   1302 ; X64-AVX-NEXT:    vpmovsxwq (%rdi,%rdx), %xmm0
   1303 ; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
   1304 ; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
   1305 ; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
   1306 ; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
   1307 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1308 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
   1309 ; X64-AVX-NEXT:    retq
; IR under test: load <2 x i16> from %a (sext to <2 x i32>) and from %b (zext to
; <2 x i32>), multiply, and store the product into the i32 array from @c.
   1310 entry:
   1311   %pre = load i32*, i32** @c
   1312   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
   1313   %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
   1314   %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
   1315   %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
   1316   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
   1317   %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
   1318   %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
   1319   %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
   1320   %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
   1321   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
   1322   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
   1323   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
   1324   ret void
   1325 }
   1326 
   1327 ; %val1 = load <16 x i16>
   1328 ; %op1 = sext<16 x i32> %val1
   1329 ; %val2 = load <16 x i16>
   1330 ; %op2 = sext<16 x i32> %val2
   1331 ; %rst = mul <16 x i32> %op1, %op2
   1332 ;
   1333 define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py —
; regenerate with the script instead of hand-editing. This function has separate
; AVX1/AVX2 expectations (128-bit vs 256-bit integer ops).
   1334 ; X86-SSE-LABEL: mul_16xi16_sext:
   1335 ; X86-SSE:       # %bb.0: # %entry
   1336 ; X86-SSE-NEXT:    pushl %esi
   1337 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
   1338 ; X86-SSE-NEXT:    .cfi_offset %esi, -8
   1339 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1340 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1341 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1342 ; X86-SSE-NEXT:    movl c, %esi
   1343 ; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
   1344 ; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
   1345 ; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
   1346 ; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
   1347 ; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
   1348 ; X86-SSE-NEXT:    pmulhw %xmm0, %xmm4
   1349 ; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
   1350 ; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
   1351 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
   1352 ; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
   1353 ; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
   1354 ; X86-SSE-NEXT:    pmulhw %xmm1, %xmm4
   1355 ; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
   1356 ; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
   1357 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
   1358 ; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
   1359 ; X86-SSE-NEXT:    movdqu %xmm3, 48(%esi,%ecx,4)
   1360 ; X86-SSE-NEXT:    movdqu %xmm1, 32(%esi,%ecx,4)
   1361 ; X86-SSE-NEXT:    movdqu %xmm2, 16(%esi,%ecx,4)
   1362 ; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
   1363 ; X86-SSE-NEXT:    popl %esi
   1364 ; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
   1365 ; X86-SSE-NEXT:    retl
   1366 ;
   1367 ; X86-AVX1-LABEL: mul_16xi16_sext:
   1368 ; X86-AVX1:       # %bb.0: # %entry
   1369 ; X86-AVX1-NEXT:    pushl %esi
   1370 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
   1371 ; X86-AVX1-NEXT:    .cfi_offset %esi, -8
   1372 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1373 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1374 ; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1375 ; X86-AVX1-NEXT:    movl c, %esi
   1376 ; X86-AVX1-NEXT:    vpmovsxwd 16(%edx,%ecx), %xmm0
   1377 ; X86-AVX1-NEXT:    vpmovsxwd 24(%edx,%ecx), %xmm1
   1378 ; X86-AVX1-NEXT:    vpmovsxwd (%edx,%ecx), %xmm2
   1379 ; X86-AVX1-NEXT:    vpmovsxwd 8(%edx,%ecx), %xmm3
   1380 ; X86-AVX1-NEXT:    vpmovsxwd 16(%eax,%ecx), %xmm4
   1381 ; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
   1382 ; X86-AVX1-NEXT:    vpmovsxwd 24(%eax,%ecx), %xmm4
   1383 ; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
   1384 ; X86-AVX1-NEXT:    vpmovsxwd (%eax,%ecx), %xmm4
   1385 ; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
   1386 ; X86-AVX1-NEXT:    vpmovsxwd 8(%eax,%ecx), %xmm4
   1387 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
   1388 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
   1389 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1390 ; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
   1391 ; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
   1392 ; X86-AVX1-NEXT:    popl %esi
   1393 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
   1394 ; X86-AVX1-NEXT:    vzeroupper
   1395 ; X86-AVX1-NEXT:    retl
   1396 ;
   1397 ; X86-AVX2-LABEL: mul_16xi16_sext:
   1398 ; X86-AVX2:       # %bb.0: # %entry
   1399 ; X86-AVX2-NEXT:    pushl %esi
   1400 ; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
   1401 ; X86-AVX2-NEXT:    .cfi_offset %esi, -8
   1402 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1403 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1404 ; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
   1405 ; X86-AVX2-NEXT:    movl c, %esi
   1406 ; X86-AVX2-NEXT:    vpmovsxwd 16(%edx,%ecx), %ymm0
   1407 ; X86-AVX2-NEXT:    vpmovsxwd (%edx,%ecx), %ymm1
   1408 ; X86-AVX2-NEXT:    vpmovsxwd 16(%eax,%ecx), %ymm2
   1409 ; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
   1410 ; X86-AVX2-NEXT:    vpmovsxwd (%eax,%ecx), %ymm2
   1411 ; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
   1412 ; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
   1413 ; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
   1414 ; X86-AVX2-NEXT:    popl %esi
   1415 ; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
   1416 ; X86-AVX2-NEXT:    vzeroupper
   1417 ; X86-AVX2-NEXT:    retl
   1418 ;
   1419 ; X64-SSE-LABEL: mul_16xi16_sext:
   1420 ; X64-SSE:       # %bb.0: # %entry
   1421 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
   1422 ; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
   1423 ; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
   1424 ; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
   1425 ; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
   1426 ; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
   1427 ; X64-SSE-NEXT:    pmulhw %xmm0, %xmm4
   1428 ; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
   1429 ; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
   1430 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
   1431 ; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
   1432 ; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
   1433 ; X64-SSE-NEXT:    pmulhw %xmm1, %xmm4
   1434 ; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
   1435 ; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
   1436 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
   1437 ; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
   1438 ; X64-SSE-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
   1439 ; X64-SSE-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
   1440 ; X64-SSE-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
   1441 ; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
   1442 ; X64-SSE-NEXT:    retq
   1443 ;
   1444 ; X64-AVX1-LABEL: mul_16xi16_sext:
   1445 ; X64-AVX1:       # %bb.0: # %entry
   1446 ; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
   1447 ; X64-AVX1-NEXT:    vpmovsxwd 16(%rdi,%rdx), %xmm0
   1448 ; X64-AVX1-NEXT:    vpmovsxwd 24(%rdi,%rdx), %xmm1
   1449 ; X64-AVX1-NEXT:    vpmovsxwd (%rdi,%rdx), %xmm2
   1450 ; X64-AVX1-NEXT:    vpmovsxwd 8(%rdi,%rdx), %xmm3
   1451 ; X64-AVX1-NEXT:    vpmovsxwd 16(%rsi,%rdx), %xmm4
   1452 ; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
   1453 ; X64-AVX1-NEXT:    vpmovsxwd 24(%rsi,%rdx), %xmm4
   1454 ; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
   1455 ; X64-AVX1-NEXT:    vpmovsxwd (%rsi,%rdx), %xmm4
   1456 ; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
   1457 ; X64-AVX1-NEXT:    vpmovsxwd 8(%rsi,%rdx), %xmm4
   1458 ; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
   1459 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
   1460 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
   1461 ; X64-AVX1-NEXT:    vmovups %ymm0, 32(%rax,%rdx,4)
   1462 ; X64-AVX1-NEXT:    vmovups %ymm2, (%rax,%rdx,4)
   1463 ; X64-AVX1-NEXT:    vzeroupper
   1464 ; X64-AVX1-NEXT:    retq
   1465 ;
   1466 ; X64-AVX2-LABEL: mul_16xi16_sext:
   1467 ; X64-AVX2:       # %bb.0: # %entry
   1468 ; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
   1469 ; X64-AVX2-NEXT:    vpmovsxwd 16(%rdi,%rdx), %ymm0
   1470 ; X64-AVX2-NEXT:    vpmovsxwd (%rdi,%rdx), %ymm1
   1471 ; X64-AVX2-NEXT:    vpmovsxwd 16(%rsi,%rdx), %ymm2
   1472 ; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
   1473 ; X64-AVX2-NEXT:    vpmovsxwd (%rsi,%rdx), %ymm2
   1474 ; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
   1475 ; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
   1476 ; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
   1477 ; X64-AVX2-NEXT:    vzeroupper
   1478 ; X64-AVX2-NEXT:    retq
; IR under test: load two <16 x i16> vectors (from %a and %b at %index), sign-extend
; both to <16 x i32>, multiply, and store the 64-byte product into the i32 array
; loaded from @c.
   1479 entry:
   1480   %pre = load i32*, i32** @c
   1481   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
   1482   %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
   1483   %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
   1484   %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
   1485   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
   1486   %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
   1487   %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
   1488   %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
   1489   %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
   1490   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
   1491   %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
   1492   store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
   1493   ret void
   1494 }
   1495 
   1496 ; %val = load <2 x i8>
   1497 ; %op1 = zext<2 x i32> %val
   1498 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
   1499 ; %rst = mul <2 x i32> %op1, %op2
   1500 ;
   1501 define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py —
; regenerate with the script instead of hand-editing.
   1502 ; X86-SSE-LABEL: mul_2xi8_varconst1:
   1503 ; X86-SSE:       # %bb.0: # %entry
   1504 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1505 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1506 ; X86-SSE-NEXT:    movl c, %edx
   1507 ; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
   1508 ; X86-SSE-NEXT:    movd %ecx, %xmm0
   1509 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
   1510 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1511 ; X86-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
   1512 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1513 ; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
   1514 ; X86-SSE-NEXT:    retl
   1515 ;
   1516 ; X86-AVX-LABEL: mul_2xi8_varconst1:
   1517 ; X86-AVX:       # %bb.0: # %entry
   1518 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1519 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1520 ; X86-AVX-NEXT:    movl c, %edx
   1521 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
   1522 ; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
   1523 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1524 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
   1525 ; X86-AVX-NEXT:    retl
   1526 ;
   1527 ; X64-SSE-LABEL: mul_2xi8_varconst1:
   1528 ; X64-SSE:       # %bb.0: # %entry
   1529 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
   1530 ; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
   1531 ; X64-SSE-NEXT:    movd %ecx, %xmm0
   1532 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
   1533 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1534 ; X64-SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
   1535 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
   1536 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
   1537 ; X64-SSE-NEXT:    retq
   1538 ;
   1539 ; X64-AVX-LABEL: mul_2xi8_varconst1:
   1540 ; X64-AVX:       # %bb.0: # %entry
   1541 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
   1542 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
   1543 ; X64-AVX-NEXT:    movl $255, %ecx
   1544 ; X64-AVX-NEXT:    vmovq %rcx, %xmm1
   1545 ; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
   1546 ; X64-AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
   1547 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1548 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
   1549 ; X64-AVX-NEXT:    retq
; IR under test: zext a <2 x i8> load to <2 x i32> and multiply by the constant
; vector <0, 255> (both lanes fit in an unsigned byte), storing through @c.
   1550 entry:
   1551   %pre = load i32*, i32** @c
   1552   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
   1553   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
   1554   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
   1555   %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
   1556   %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
   1557   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
   1558   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
   1559   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
   1560   ret void
   1561 }
   1562 
   1563 ; %val = load <2 x i8>
   1564 ; %op1 = sext<2 x i32> %val
   1565 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
   1566 ; %rst = mul <2 x i32> %op1, %op2
   1567 ;
   1568 define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py —
; regenerate with the script instead of hand-editing.
   1569 ; X86-SSE-LABEL: mul_2xi8_varconst2:
   1570 ; X86-SSE:       # %bb.0: # %entry
   1571 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1572 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1573 ; X86-SSE-NEXT:    movl c, %edx
   1574 ; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
   1575 ; X86-SSE-NEXT:    movd %ecx, %xmm0
   1576 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1577 ; X86-SSE-NEXT:    psraw $8, %xmm0
   1578 ; X86-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
   1579 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1580 ; X86-SSE-NEXT:    psrad $16, %xmm0
   1581 ; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
   1582 ; X86-SSE-NEXT:    retl
   1583 ;
   1584 ; X86-AVX-LABEL: mul_2xi8_varconst2:
   1585 ; X86-AVX:       # %bb.0: # %entry
   1586 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1587 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1588 ; X86-AVX-NEXT:    movl c, %edx
   1589 ; X86-AVX-NEXT:    vpmovsxbq (%ecx,%eax), %xmm0
   1590 ; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
   1591 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1592 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
   1593 ; X86-AVX-NEXT:    retl
   1594 ;
   1595 ; X64-SSE-LABEL: mul_2xi8_varconst2:
   1596 ; X64-SSE:       # %bb.0: # %entry
   1597 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
   1598 ; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
   1599 ; X64-SSE-NEXT:    movd %ecx, %xmm0
   1600 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
   1601 ; X64-SSE-NEXT:    psraw $8, %xmm0
   1602 ; X64-SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
   1603 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
   1604 ; X64-SSE-NEXT:    psrad $16, %xmm0
   1605 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
   1606 ; X64-SSE-NEXT:    retq
   1607 ;
   1608 ; X64-AVX-LABEL: mul_2xi8_varconst2:
   1609 ; X64-AVX:       # %bb.0: # %entry
   1610 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
   1611 ; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rsi), %xmm0
   1612 ; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
   1613 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1614 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
   1615 ; X64-AVX-NEXT:    retq
; IR under test: sext a <2 x i8> load to <2 x i32> and multiply by the constant
; vector <-128, 127> (both lanes fit in a signed byte), storing through @c.
   1616 entry:
   1617   %pre = load i32*, i32** @c
   1618   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
   1619   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
   1620   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
   1621   %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
   1622   %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
   1623   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
   1624   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
   1625   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
   1626   ret void
   1627 }
   1628 
   1629 ; %val = load <2 x i8>
   1630 ; %op1 = zext<2 x i32> %val
   1631 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
   1632 ; %rst = mul <2 x i32> %op1, %op2
   1633 ;
   1634 define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
; NOTE(review): CHECK lines are autogenerated by utils/update_llc_test_checks.py —
; regenerate with the script instead of hand-editing.
   1635 ; X86-SSE-LABEL: mul_2xi8_varconst3:
   1636 ; X86-SSE:       # %bb.0: # %entry
   1637 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1638 ; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1639 ; X86-SSE-NEXT:    movl c, %edx
   1640 ; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
   1641 ; X86-SSE-NEXT:    movd %ecx, %xmm0
   1642 ; X86-SSE-NEXT:    pxor %xmm1, %xmm1
   1643 ; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1644 ; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
   1645 ; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
   1646 ; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
   1647 ; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
   1648 ; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
   1649 ; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
   1650 ; X86-SSE-NEXT:    retl
   1651 ;
   1652 ; X86-AVX-LABEL: mul_2xi8_varconst3:
   1653 ; X86-AVX:       # %bb.0: # %entry
   1654 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
   1655 ; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
   1656 ; X86-AVX-NEXT:    movl c, %edx
   1657 ; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
   1658 ; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
   1659 ; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1660 ; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
   1661 ; X86-AVX-NEXT:    retl
   1662 ;
   1663 ; X64-SSE-LABEL: mul_2xi8_varconst3:
   1664 ; X64-SSE:       # %bb.0: # %entry
   1665 ; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
   1666 ; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
   1667 ; X64-SSE-NEXT:    movd %ecx, %xmm0
   1668 ; X64-SSE-NEXT:    pxor %xmm1, %xmm1
   1669 ; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
   1670 ; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
   1671 ; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
   1672 ; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
   1673 ; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
   1674 ; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
   1675 ; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
   1676 ; X64-SSE-NEXT:    retq
   1677 ;
   1678 ; X64-AVX-LABEL: mul_2xi8_varconst3:
   1679 ; X64-AVX:       # %bb.0: # %entry
   1680 ; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
   1681 ; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
   1682 ; X64-AVX-NEXT:    movl $256, %ecx # imm = 0x100
   1683 ; X64-AVX-NEXT:    vmovq %rcx, %xmm1
   1684 ; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
   1685 ; X64-AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
   1686 ; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
   1687 ; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
   1688 ; X64-AVX-NEXT:    retq
; IR under test: zext a <2 x i8> load to <2 x i32> and multiply by the constant
; vector <0, 256> — 256 does not fit in a byte, exercising the wider-constant path.
   1689 entry:
   1690   %pre = load i32*, i32** @c
   1691   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
   1692   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
   1693   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
   1694   %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
   1695   %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
   1696   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
   1697   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
   1698   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
   1699   ret void
   1700 }
   1701 
   1702 ; %val = load <2 x i8>
   1703 ; %op1 = zext<2 x i32> %val
   1704 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
   1705 ; %rst = mul <2 x i32> %op1, %op2
   1706 ;
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst4:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT:    movd %ecx, %xmm0
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8_varconst4:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8_varconst4:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8_varconst4:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  ; Multiply a zero-extended <2 x i8> load by constants at the edges of the
  ; (-1 ~ 255) range and store the <2 x i32> result into c[index..index+1].
  %pre = load i32*, i32** @c                                ; destination base pointer
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index     ; &a[index]
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1      ; two adjacent bytes, unaligned
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  ; NOTE(review): nuw with the -1 multiplicand wraps for any nonzero lane —
  ; presumably intentional here, since the test only checks codegen.
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4        ; write both i32 lanes
  ret void
}
   1771 
   1772 ; %val = load <2 x i8>
   1773 ; %op1 = sext<2 x i32> %val
   1774 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
   1775 ; %rst = mul <2 x i32> %op1, %op2
   1776 ;
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst5:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT:    movd %ecx, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    psraw $8, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8_varconst5:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovsxbq (%ecx,%eax), %xmm0
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8_varconst5:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT:    psraw $8, %xmm0
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8_varconst5:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  ; Multiply a sign-extended <2 x i8> load by <-129, 127> (one constant just
  ; below the i8 range, one at its top) and store as <2 x i32>.
  %pre = load i32*, i32** @c                                ; destination base pointer
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index     ; &a[index]
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1      ; two adjacent bytes, unaligned
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>             ; signed widening
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4        ; write both i32 lanes
  ret void
}
   1841 
   1842 ; %val = load <2 x i8>
   1843 ; %op1 = sext<2 x i32> %val
   1844 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
   1845 ; %rst = mul <2 x i32> %op1, %op2
   1846 ;
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst6:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT:    movd %ecx, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    psraw $8, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8_varconst6:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovsxbq (%ecx,%eax), %xmm0
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8_varconst6:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT:    psraw $8, %xmm0
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8_varconst6:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  ; Multiply a sign-extended <2 x i8> load by <-128, 128> (i8 minimum and one
  ; past the i8 maximum) and store as <2 x i32>.
  %pre = load i32*, i32** @c                                ; destination base pointer
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index     ; &a[index]
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1      ; two adjacent bytes, unaligned
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>             ; signed widening
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4        ; write both i32 lanes
  ret void
}
   1911 
   1912 ; %val = load <2 x i16>
   1913 ; %op1 = zext<2 x i32> %val
   1914 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
   1915 ; %rst = mul <2 x i32> %op1, %op2
   1916 ;
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst1:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi16_varconst1:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi16_varconst1:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi16_varconst1:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-AVX-NEXT:    movl $65535, %ecx # imm = 0xFFFF
; X64-AVX-NEXT:    vmovq %rcx, %xmm1
; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  ; Multiply a zero-extended <2 x i16> load by <0, 65535> (the endpoints of
  ; the unsigned i16 range) and store as <2 x i32>.
  %pre = load i32*, i32** @c                                ; destination base pointer
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index     ; &a[index], byte-addressed
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1    ; two adjacent i16s, unaligned
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4        ; write both i32 lanes
  ret void
}
   1982 
   1983 ; %val = load <2 x i16>
   1984 ; %op1 = sext<2 x i32> %val
   1985 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
   1986 ; %rst = mul <2 x i32> %op1, %op2
   1987 ;
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst2:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi16_varconst2:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovsxwq (%ecx,%eax), %xmm0
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi16_varconst2:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi16_varconst2:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovsxwq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  ; Multiply a sign-extended <2 x i16> load by <-32768, 32767> (the endpoints
  ; of the signed i16 range) and store as <2 x i32>.
  %pre = load i32*, i32** @c                                ; destination base pointer
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index     ; &a[index], byte-addressed
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1    ; two adjacent i16s, unaligned
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>            ; signed widening
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4        ; write both i32 lanes
  ret void
}
   2046 
   2047 ; %val = load <2 x i16>
   2048 ; %op1 = zext<2 x i32> %val
   2049 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
   2050 ; %rst = mul <2 x i32> %op1, %op2
   2051 ;
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst3:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,65536,0]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X86-SSE-NEXT:    psrlq $32, %xmm0
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE-NEXT:    psllq $32, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi16_varconst3:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi16_varconst3:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT:    movl $65536, %ecx # imm = 0x10000
; X64-SSE-NEXT:    movq %rcx, %xmm1
; X64-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    psrlq $32, %xmm0
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi16_varconst3:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-AVX-NEXT:    movl $65536, %ecx # imm = 0x10000
; X64-AVX-NEXT:    vmovq %rcx, %xmm1
; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  ; Multiply a zero-extended <2 x i16> load by <0, 65536>; the second constant
  ; is one past the unsigned i16 maximum, so a 16-bit multiply cannot be used.
  %pre = load i32*, i32** @c                                ; destination base pointer
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index     ; &a[index], byte-addressed
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1    ; two adjacent i16s, unaligned
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4        ; write both i32 lanes
  ret void
}
   2131 
   2132 ; %val = load <2 x i16>
   2133 ; %op1 = sext<2 x i32> %val
   2134 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768)
   2135 ; %rst = mul <2 x i32> %op1, %op2
   2136 ;
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst4:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT:    psrad $16, %xmm0
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,32768,0]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X86-SSE-NEXT:    psrlq $32, %xmm0
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE-NEXT:    psllq $32, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi16_varconst4:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovsxwq (%ecx,%eax), %xmm0
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi16_varconst4:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT:    psrad $16, %xmm0
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT:    movl $32768, %ecx # imm = 0x8000
; X64-SSE-NEXT:    movq %rcx, %xmm1
; X64-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    psrlq $32, %xmm0
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi16_varconst4:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovsxwq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT:    movl $32768, %ecx # imm = 0x8000
; X64-AVX-NEXT:    vmovq %rcx, %xmm1
; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  ; Multiply a sign-extended <2 x i16> load by <0, 32768>; the second constant
  ; is one past the signed i16 maximum, so a signed 16-bit multiply cannot be used.
  %pre = load i32*, i32** @c                                ; destination base pointer
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index     ; &a[index], byte-addressed
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1    ; two adjacent i16s, unaligned
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>            ; signed widening
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4        ; write both i32 lanes
  ret void
}
   2212 
   2213 ;
   2214 ; Illegal Types
   2215 ;
   2216 
   2217 define void @PR34947() {
   2218 ; X86-SSE-LABEL: PR34947:
   2219 ; X86-SSE:       # %bb.0:
   2220 ; X86-SSE-NEXT:    movdqa (%eax), %xmm0
   2221 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2222 ; X86-SSE-NEXT:    movd %xmm1, %ecx
   2223 ; X86-SSE-NEXT:    xorl %eax, %eax
   2224 ; X86-SSE-NEXT:    xorl %edx, %edx
   2225 ; X86-SSE-NEXT:    divl %ecx
   2226 ; X86-SSE-NEXT:    movd %edx, %xmm1
   2227 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   2228 ; X86-SSE-NEXT:    movd %xmm2, %ecx
   2229 ; X86-SSE-NEXT:    xorl %eax, %eax
   2230 ; X86-SSE-NEXT:    xorl %edx, %edx
   2231 ; X86-SSE-NEXT:    divl %ecx
   2232 ; X86-SSE-NEXT:    movd %edx, %xmm2
   2233 ; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
   2234 ; X86-SSE-NEXT:    movd %xmm0, %ecx
   2235 ; X86-SSE-NEXT:    xorl %eax, %eax
   2236 ; X86-SSE-NEXT:    xorl %edx, %edx
   2237 ; X86-SSE-NEXT:    divl %ecx
   2238 ; X86-SSE-NEXT:    movd %edx, %xmm1
   2239 ; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
   2240 ; X86-SSE-NEXT:    movd %xmm0, %ecx
   2241 ; X86-SSE-NEXT:    xorl %eax, %eax
   2242 ; X86-SSE-NEXT:    xorl %edx, %edx
   2243 ; X86-SSE-NEXT:    divl %ecx
   2244 ; X86-SSE-NEXT:    movd %edx, %xmm0
   2245 ; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   2246 ; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   2247 ; X86-SSE-NEXT:    xorl %eax, %eax
   2248 ; X86-SSE-NEXT:    xorl %edx, %edx
   2249 ; X86-SSE-NEXT:    divl (%eax)
   2250 ; X86-SSE-NEXT:    movd %edx, %xmm0
   2251 ; X86-SSE-NEXT:    pmaddwd {{\.LCPI.*}}, %xmm1
   2252 ; X86-SSE-NEXT:    movl $8199, %eax # imm = 0x2007
   2253 ; X86-SSE-NEXT:    movd %eax, %xmm2
   2254 ; X86-SSE-NEXT:    pmuludq %xmm0, %xmm2
   2255 ; X86-SSE-NEXT:    movd %xmm2, (%eax)
   2256 ; X86-SSE-NEXT:    movdqa %xmm1, (%eax)
   2257 ; X86-SSE-NEXT:    retl
   2258 ;
   2259 ; X86-AVX1-LABEL: PR34947:
   2260 ; X86-AVX1:       # %bb.0:
   2261 ; X86-AVX1-NEXT:    pushl %ebp
   2262 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
   2263 ; X86-AVX1-NEXT:    pushl %ebx
   2264 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 12
   2265 ; X86-AVX1-NEXT:    pushl %edi
   2266 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 16
   2267 ; X86-AVX1-NEXT:    pushl %esi
   2268 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 20
   2269 ; X86-AVX1-NEXT:    subl $16, %esp
   2270 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 36
   2271 ; X86-AVX1-NEXT:    .cfi_offset %esi, -20
   2272 ; X86-AVX1-NEXT:    .cfi_offset %edi, -16
   2273 ; X86-AVX1-NEXT:    .cfi_offset %ebx, -12
   2274 ; X86-AVX1-NEXT:    .cfi_offset %ebp, -8
   2275 ; X86-AVX1-NEXT:    vmovdqa (%eax), %ymm0
   2276 ; X86-AVX1-NEXT:    xorl %eax, %eax
   2277 ; X86-AVX1-NEXT:    xorl %edx, %edx
   2278 ; X86-AVX1-NEXT:    divl (%eax)
   2279 ; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
   2280 ; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
   2281 ; X86-AVX1-NEXT:    xorl %eax, %eax
   2282 ; X86-AVX1-NEXT:    xorl %edx, %edx
   2283 ; X86-AVX1-NEXT:    divl %ecx
   2284 ; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
   2285 ; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
   2286 ; X86-AVX1-NEXT:    xorl %eax, %eax
   2287 ; X86-AVX1-NEXT:    xorl %edx, %edx
   2288 ; X86-AVX1-NEXT:    divl %ecx
   2289 ; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
   2290 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
   2291 ; X86-AVX1-NEXT:    xorl %eax, %eax
   2292 ; X86-AVX1-NEXT:    xorl %edx, %edx
   2293 ; X86-AVX1-NEXT:    divl %ecx
   2294 ; X86-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
   2295 ; X86-AVX1-NEXT:    vmovd %xmm0, %ecx
   2296 ; X86-AVX1-NEXT:    xorl %eax, %eax
   2297 ; X86-AVX1-NEXT:    xorl %edx, %edx
   2298 ; X86-AVX1-NEXT:    divl %ecx
   2299 ; X86-AVX1-NEXT:    movl %edx, %ebp
   2300 ; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2301 ; X86-AVX1-NEXT:    xorl %eax, %eax
   2302 ; X86-AVX1-NEXT:    xorl %edx, %edx
   2303 ; X86-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
   2304 ; X86-AVX1-NEXT:    divl %ecx
   2305 ; X86-AVX1-NEXT:    movl %edx, %ecx
   2306 ; X86-AVX1-NEXT:    xorl %eax, %eax
   2307 ; X86-AVX1-NEXT:    xorl %edx, %edx
   2308 ; X86-AVX1-NEXT:    vpextrd $2, %xmm0, %esi
   2309 ; X86-AVX1-NEXT:    divl %esi
   2310 ; X86-AVX1-NEXT:    movl %edx, %esi
   2311 ; X86-AVX1-NEXT:    xorl %eax, %eax
   2312 ; X86-AVX1-NEXT:    xorl %edx, %edx
   2313 ; X86-AVX1-NEXT:    vpextrd $1, %xmm0, %edi
   2314 ; X86-AVX1-NEXT:    divl %edi
   2315 ; X86-AVX1-NEXT:    movl %edx, %edi
   2316 ; X86-AVX1-NEXT:    xorl %eax, %eax
   2317 ; X86-AVX1-NEXT:    xorl %edx, %edx
   2318 ; X86-AVX1-NEXT:    vmovd %xmm0, %ebx
   2319 ; X86-AVX1-NEXT:    divl %ebx
   2320 ; X86-AVX1-NEXT:    vmovd %edx, %xmm0
   2321 ; X86-AVX1-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
   2322 ; X86-AVX1-NEXT:    vpinsrd $2, %esi, %xmm0, %xmm0
   2323 ; X86-AVX1-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
   2324 ; X86-AVX1-NEXT:    vmovd %ebp, %xmm1
   2325 ; X86-AVX1-NEXT:    vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload
   2326 ; X86-AVX1-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
   2327 ; X86-AVX1-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload
   2328 ; X86-AVX1-NEXT:    vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload
   2329 ; X86-AVX1-NEXT:    # xmm2 = mem[0],zero,zero,zero
   2330 ; X86-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
   2331 ; X86-AVX1-NEXT:    vmovd %eax, %xmm3
   2332 ; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199]
   2333 ; X86-AVX1-NEXT:    vpmaddwd %xmm4, %xmm0, %xmm0
   2334 ; X86-AVX1-NEXT:    vpmaddwd %xmm4, %xmm1, %xmm1
   2335 ; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
   2336 ; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm2, %xmm1
   2337 ; X86-AVX1-NEXT:    vmovd %xmm1, (%eax)
   2338 ; X86-AVX1-NEXT:    vmovaps %ymm0, (%eax)
   2339 ; X86-AVX1-NEXT:    addl $16, %esp
   2340 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 20
   2341 ; X86-AVX1-NEXT:    popl %esi
   2342 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 16
   2343 ; X86-AVX1-NEXT:    popl %edi
   2344 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 12
   2345 ; X86-AVX1-NEXT:    popl %ebx
   2346 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
   2347 ; X86-AVX1-NEXT:    popl %ebp
   2348 ; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
   2349 ; X86-AVX1-NEXT:    vzeroupper
   2350 ; X86-AVX1-NEXT:    retl
   2351 ;
   2352 ; X86-AVX2-LABEL: PR34947:
   2353 ; X86-AVX2:       # %bb.0:
   2354 ; X86-AVX2-NEXT:    pushl %esi
   2355 ; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
   2356 ; X86-AVX2-NEXT:    .cfi_offset %esi, -8
   2357 ; X86-AVX2-NEXT:    vmovdqa (%eax), %ymm0
   2358 ; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2359 ; X86-AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
   2360 ; X86-AVX2-NEXT:    xorl %eax, %eax
   2361 ; X86-AVX2-NEXT:    xorl %edx, %edx
   2362 ; X86-AVX2-NEXT:    divl %ecx
   2363 ; X86-AVX2-NEXT:    movl %edx, %ecx
   2364 ; X86-AVX2-NEXT:    vmovd %xmm1, %esi
   2365 ; X86-AVX2-NEXT:    xorl %eax, %eax
   2366 ; X86-AVX2-NEXT:    xorl %edx, %edx
   2367 ; X86-AVX2-NEXT:    divl %esi
   2368 ; X86-AVX2-NEXT:    vmovd %edx, %xmm2
   2369 ; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
   2370 ; X86-AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
   2371 ; X86-AVX2-NEXT:    xorl %eax, %eax
   2372 ; X86-AVX2-NEXT:    xorl %edx, %edx
   2373 ; X86-AVX2-NEXT:    divl %ecx
   2374 ; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
   2375 ; X86-AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
   2376 ; X86-AVX2-NEXT:    xorl %eax, %eax
   2377 ; X86-AVX2-NEXT:    xorl %edx, %edx
   2378 ; X86-AVX2-NEXT:    divl %ecx
   2379 ; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm1
   2380 ; X86-AVX2-NEXT:    vpextrd $1, %xmm0, %ecx
   2381 ; X86-AVX2-NEXT:    xorl %eax, %eax
   2382 ; X86-AVX2-NEXT:    xorl %edx, %edx
   2383 ; X86-AVX2-NEXT:    divl %ecx
   2384 ; X86-AVX2-NEXT:    movl %edx, %ecx
   2385 ; X86-AVX2-NEXT:    vmovd %xmm0, %esi
   2386 ; X86-AVX2-NEXT:    xorl %eax, %eax
   2387 ; X86-AVX2-NEXT:    xorl %edx, %edx
   2388 ; X86-AVX2-NEXT:    divl %esi
   2389 ; X86-AVX2-NEXT:    vmovd %edx, %xmm2
   2390 ; X86-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
   2391 ; X86-AVX2-NEXT:    vpextrd $2, %xmm0, %ecx
   2392 ; X86-AVX2-NEXT:    xorl %eax, %eax
   2393 ; X86-AVX2-NEXT:    xorl %edx, %edx
   2394 ; X86-AVX2-NEXT:    divl %ecx
   2395 ; X86-AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
   2396 ; X86-AVX2-NEXT:    vpextrd $3, %xmm0, %ecx
   2397 ; X86-AVX2-NEXT:    xorl %eax, %eax
   2398 ; X86-AVX2-NEXT:    xorl %edx, %edx
   2399 ; X86-AVX2-NEXT:    divl %ecx
   2400 ; X86-AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
   2401 ; X86-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   2402 ; X86-AVX2-NEXT:    xorl %eax, %eax
   2403 ; X86-AVX2-NEXT:    xorl %edx, %edx
   2404 ; X86-AVX2-NEXT:    divl (%eax)
   2405 ; X86-AVX2-NEXT:    vmovd %edx, %xmm1
   2406 ; X86-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
   2407 ; X86-AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
   2408 ; X86-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
   2409 ; X86-AVX2-NEXT:    vmovd %eax, %xmm2
   2410 ; X86-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
   2411 ; X86-AVX2-NEXT:    vmovd %xmm1, (%eax)
   2412 ; X86-AVX2-NEXT:    vmovdqa %ymm0, (%eax)
   2413 ; X86-AVX2-NEXT:    popl %esi
   2414 ; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
   2415 ; X86-AVX2-NEXT:    vzeroupper
   2416 ; X86-AVX2-NEXT:    retl
   2417 ;
   2418 ; X64-SSE-LABEL: PR34947:
   2419 ; X64-SSE:       # %bb.0:
   2420 ; X64-SSE-NEXT:    movdqa (%rax), %xmm0
   2421 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
   2422 ; X64-SSE-NEXT:    movd %xmm1, %ecx
   2423 ; X64-SSE-NEXT:    xorl %eax, %eax
   2424 ; X64-SSE-NEXT:    xorl %edx, %edx
   2425 ; X64-SSE-NEXT:    divl %ecx
   2426 ; X64-SSE-NEXT:    movd %edx, %xmm1
   2427 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
   2428 ; X64-SSE-NEXT:    movd %xmm2, %ecx
   2429 ; X64-SSE-NEXT:    xorl %eax, %eax
   2430 ; X64-SSE-NEXT:    xorl %edx, %edx
   2431 ; X64-SSE-NEXT:    divl %ecx
   2432 ; X64-SSE-NEXT:    movd %edx, %xmm2
   2433 ; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
   2434 ; X64-SSE-NEXT:    movd %xmm0, %ecx
   2435 ; X64-SSE-NEXT:    xorl %eax, %eax
   2436 ; X64-SSE-NEXT:    xorl %edx, %edx
   2437 ; X64-SSE-NEXT:    divl %ecx
   2438 ; X64-SSE-NEXT:    movd %edx, %xmm1
   2439 ; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
   2440 ; X64-SSE-NEXT:    movd %xmm0, %ecx
   2441 ; X64-SSE-NEXT:    xorl %eax, %eax
   2442 ; X64-SSE-NEXT:    xorl %edx, %edx
   2443 ; X64-SSE-NEXT:    divl %ecx
   2444 ; X64-SSE-NEXT:    movd %edx, %xmm0
   2445 ; X64-SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
   2446 ; X64-SSE-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
   2447 ; X64-SSE-NEXT:    xorl %eax, %eax
   2448 ; X64-SSE-NEXT:    xorl %edx, %edx
   2449 ; X64-SSE-NEXT:    divl (%rax)
   2450 ; X64-SSE-NEXT:    movd %edx, %xmm0
   2451 ; X64-SSE-NEXT:    pmaddwd {{.*}}(%rip), %xmm1
   2452 ; X64-SSE-NEXT:    movl $8199, %eax # imm = 0x2007
   2453 ; X64-SSE-NEXT:    movd %eax, %xmm2
   2454 ; X64-SSE-NEXT:    pmuludq %xmm0, %xmm2
   2455 ; X64-SSE-NEXT:    movd %xmm2, (%rax)
   2456 ; X64-SSE-NEXT:    movdqa %xmm1, (%rax)
   2457 ; X64-SSE-NEXT:    retq
   2458 ;
   2459 ; X64-AVX1-LABEL: PR34947:
   2460 ; X64-AVX1:       # %bb.0:
   2461 ; X64-AVX1-NEXT:    pushq %rbp
   2462 ; X64-AVX1-NEXT:    .cfi_def_cfa_offset 16
   2463 ; X64-AVX1-NEXT:    pushq %rbx
   2464 ; X64-AVX1-NEXT:    .cfi_def_cfa_offset 24
   2465 ; X64-AVX1-NEXT:    .cfi_offset %rbx, -24
   2466 ; X64-AVX1-NEXT:    .cfi_offset %rbp, -16
   2467 ; X64-AVX1-NEXT:    vmovdqa (%rax), %ymm0
   2468 ; X64-AVX1-NEXT:    xorl %eax, %eax
   2469 ; X64-AVX1-NEXT:    xorl %edx, %edx
   2470 ; X64-AVX1-NEXT:    divl (%rax)
   2471 ; X64-AVX1-NEXT:    movl %edx, %r8d
   2472 ; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
   2473 ; X64-AVX1-NEXT:    xorl %eax, %eax
   2474 ; X64-AVX1-NEXT:    xorl %edx, %edx
   2475 ; X64-AVX1-NEXT:    divl %ecx
   2476 ; X64-AVX1-NEXT:    movl %edx, %r9d
   2477 ; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
   2478 ; X64-AVX1-NEXT:    xorl %eax, %eax
   2479 ; X64-AVX1-NEXT:    xorl %edx, %edx
   2480 ; X64-AVX1-NEXT:    divl %ecx
   2481 ; X64-AVX1-NEXT:    movl %edx, %r10d
   2482 ; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %ecx
   2483 ; X64-AVX1-NEXT:    xorl %eax, %eax
   2484 ; X64-AVX1-NEXT:    xorl %edx, %edx
   2485 ; X64-AVX1-NEXT:    divl %ecx
   2486 ; X64-AVX1-NEXT:    movl %edx, %r11d
   2487 ; X64-AVX1-NEXT:    vmovd %xmm0, %ecx
   2488 ; X64-AVX1-NEXT:    xorl %eax, %eax
   2489 ; X64-AVX1-NEXT:    xorl %edx, %edx
   2490 ; X64-AVX1-NEXT:    divl %ecx
   2491 ; X64-AVX1-NEXT:    movl %edx, %esi
   2492 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
   2493 ; X64-AVX1-NEXT:    vpextrd $3, %xmm0, %ecx
   2494 ; X64-AVX1-NEXT:    xorl %eax, %eax
   2495 ; X64-AVX1-NEXT:    xorl %edx, %edx
   2496 ; X64-AVX1-NEXT:    divl %ecx
   2497 ; X64-AVX1-NEXT:    movl %edx, %edi
   2498 ; X64-AVX1-NEXT:    vpextrd $2, %xmm0, %ecx
   2499 ; X64-AVX1-NEXT:    xorl %eax, %eax
   2500 ; X64-AVX1-NEXT:    xorl %edx, %edx
   2501 ; X64-AVX1-NEXT:    divl %ecx
   2502 ; X64-AVX1-NEXT:    movl %edx, %ecx
   2503 ; X64-AVX1-NEXT:    vpextrd $1, %xmm0, %ebx
   2504 ; X64-AVX1-NEXT:    xorl %eax, %eax
   2505 ; X64-AVX1-NEXT:    xorl %edx, %edx
   2506 ; X64-AVX1-NEXT:    divl %ebx
   2507 ; X64-AVX1-NEXT:    movl %edx, %ebx
   2508 ; X64-AVX1-NEXT:    vmovd %xmm0, %ebp
   2509 ; X64-AVX1-NEXT:    xorl %eax, %eax
   2510 ; X64-AVX1-NEXT:    xorl %edx, %edx
   2511 ; X64-AVX1-NEXT:    divl %ebp
   2512 ; X64-AVX1-NEXT:    vmovd %edx, %xmm0
   2513 ; X64-AVX1-NEXT:    vpinsrd $1, %ebx, %xmm0, %xmm0
   2514 ; X64-AVX1-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
   2515 ; X64-AVX1-NEXT:    vpinsrd $3, %edi, %xmm0, %xmm0
   2516 ; X64-AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199]
   2517 ; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
   2518 ; X64-AVX1-NEXT:    vmovd %esi, %xmm2
   2519 ; X64-AVX1-NEXT:    vpinsrd $1, %r11d, %xmm2, %xmm2
   2520 ; X64-AVX1-NEXT:    vpinsrd $2, %r10d, %xmm2, %xmm2
   2521 ; X64-AVX1-NEXT:    vpinsrd $3, %r9d, %xmm2, %xmm2
   2522 ; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
   2523 ; X64-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
   2524 ; X64-AVX1-NEXT:    vmovd %r8d, %xmm1
   2525 ; X64-AVX1-NEXT:    movl $8199, %eax # imm = 0x2007
   2526 ; X64-AVX1-NEXT:    vmovd %eax, %xmm2
   2527 ; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm1, %xmm1
   2528 ; X64-AVX1-NEXT:    vmovd %xmm1, (%rax)
   2529 ; X64-AVX1-NEXT:    vmovaps %ymm0, (%rax)
   2530 ; X64-AVX1-NEXT:    popq %rbx
   2531 ; X64-AVX1-NEXT:    .cfi_def_cfa_offset 16
   2532 ; X64-AVX1-NEXT:    popq %rbp
   2533 ; X64-AVX1-NEXT:    .cfi_def_cfa_offset 8
   2534 ; X64-AVX1-NEXT:    vzeroupper
   2535 ; X64-AVX1-NEXT:    retq
   2536 ;
   2537 ; X64-AVX2-LABEL: PR34947:
   2538 ; X64-AVX2:       # %bb.0:
   2539 ; X64-AVX2-NEXT:    vmovdqa (%rax), %ymm0
   2540 ; X64-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
   2541 ; X64-AVX2-NEXT:    vpextrd $1, %xmm1, %ecx
   2542 ; X64-AVX2-NEXT:    xorl %eax, %eax
   2543 ; X64-AVX2-NEXT:    xorl %edx, %edx
   2544 ; X64-AVX2-NEXT:    divl %ecx
   2545 ; X64-AVX2-NEXT:    movl %edx, %ecx
   2546 ; X64-AVX2-NEXT:    vmovd %xmm1, %esi
   2547 ; X64-AVX2-NEXT:    xorl %eax, %eax
   2548 ; X64-AVX2-NEXT:    xorl %edx, %edx
   2549 ; X64-AVX2-NEXT:    divl %esi
   2550 ; X64-AVX2-NEXT:    vmovd %edx, %xmm2
   2551 ; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
   2552 ; X64-AVX2-NEXT:    vpextrd $2, %xmm1, %ecx
   2553 ; X64-AVX2-NEXT:    xorl %eax, %eax
   2554 ; X64-AVX2-NEXT:    xorl %edx, %edx
   2555 ; X64-AVX2-NEXT:    divl %ecx
   2556 ; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
   2557 ; X64-AVX2-NEXT:    vpextrd $3, %xmm1, %ecx
   2558 ; X64-AVX2-NEXT:    xorl %eax, %eax
   2559 ; X64-AVX2-NEXT:    xorl %edx, %edx
   2560 ; X64-AVX2-NEXT:    divl %ecx
   2561 ; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm1
   2562 ; X64-AVX2-NEXT:    vpextrd $1, %xmm0, %ecx
   2563 ; X64-AVX2-NEXT:    xorl %eax, %eax
   2564 ; X64-AVX2-NEXT:    xorl %edx, %edx
   2565 ; X64-AVX2-NEXT:    divl %ecx
   2566 ; X64-AVX2-NEXT:    movl %edx, %ecx
   2567 ; X64-AVX2-NEXT:    vmovd %xmm0, %esi
   2568 ; X64-AVX2-NEXT:    xorl %eax, %eax
   2569 ; X64-AVX2-NEXT:    xorl %edx, %edx
   2570 ; X64-AVX2-NEXT:    divl %esi
   2571 ; X64-AVX2-NEXT:    vmovd %edx, %xmm2
   2572 ; X64-AVX2-NEXT:    vpinsrd $1, %ecx, %xmm2, %xmm2
   2573 ; X64-AVX2-NEXT:    vpextrd $2, %xmm0, %ecx
   2574 ; X64-AVX2-NEXT:    xorl %eax, %eax
   2575 ; X64-AVX2-NEXT:    xorl %edx, %edx
   2576 ; X64-AVX2-NEXT:    divl %ecx
   2577 ; X64-AVX2-NEXT:    vpinsrd $2, %edx, %xmm2, %xmm2
   2578 ; X64-AVX2-NEXT:    vpextrd $3, %xmm0, %ecx
   2579 ; X64-AVX2-NEXT:    xorl %eax, %eax
   2580 ; X64-AVX2-NEXT:    xorl %edx, %edx
   2581 ; X64-AVX2-NEXT:    divl %ecx
   2582 ; X64-AVX2-NEXT:    vpinsrd $3, %edx, %xmm2, %xmm0
   2583 ; X64-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
   2584 ; X64-AVX2-NEXT:    xorl %eax, %eax
   2585 ; X64-AVX2-NEXT:    xorl %edx, %edx
   2586 ; X64-AVX2-NEXT:    divl (%rax)
   2587 ; X64-AVX2-NEXT:    vmovd %edx, %xmm1
   2588 ; X64-AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199]
   2589 ; X64-AVX2-NEXT:    vpmaddwd %ymm2, %ymm0, %ymm0
   2590 ; X64-AVX2-NEXT:    movl $8199, %eax # imm = 0x2007
   2591 ; X64-AVX2-NEXT:    vmovd %eax, %xmm2
   2592 ; X64-AVX2-NEXT:    vpmulld %ymm2, %ymm1, %ymm1
   2593 ; X64-AVX2-NEXT:    vmovd %xmm1, (%rax)
   2594 ; X64-AVX2-NEXT:    vmovdqa %ymm0, (%rax)
   2595 ; X64-AVX2-NEXT:    vzeroupper
   2596 ; X64-AVX2-NEXT:    retq
   2597   %tmp = load <9 x i32>, <9 x i32>* undef, align 64
   2598   %rem = urem <9 x i32> zeroinitializer, %tmp
   2599   %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem
   2600   store <9 x i32> %mul, <9 x i32>* undef, align 64
   2601   ret void
   2602 }
   2603