; Home | History | Annotate | Download | only in X86
      1 ; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
      2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s
      3 
      4 @c = external global i32*, align 8
      5 
      6 ; %val1 = load <2 x i8>
      7 ; %op1 = zext<2 x i32> %val1
      8 ; %val2 = load <2 x i8>
      9 ; %op2 = zext<2 x i32> %val2
     10 ; %rst = mul <2 x i32> %op1, %op2
     11 ;
     12 define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Both operands are zero-extended from i8, so every 32-bit product fits in
; 16 bits: a single pmullw suffices, and punpcklwd against the zero register
; (xmm2) rebuilds the all-zero high halves of the i32 results. The two bytes
; per operand are loaded with a single movzwl.
     13 ; CHECK-LABEL: mul_2xi8:
     14 ; CHECK:       # BB#0: # %entry
     15 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
     16 ; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
     17 ; CHECK-NEXT:    movd %ecx, %xmm0
     18 ; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
     19 ; CHECK-NEXT:    movd %ecx, %xmm1
     20 ; CHECK-NEXT:    pxor %xmm2, %xmm2
     21 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
     22 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
     23 ; CHECK-NEXT:    pmullw %xmm0, %xmm1
     24 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
     25 ; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
     26 ; CHECK-NEXT:    retq
; IR: load <2 x i8> from %a and %b at %index, zext to <2 x i32>, multiply,
; and store the <2 x i32> result through the pointer loaded from @c.
     27 entry:
     28   %pre = load i32*, i32** @c
     29   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
     30   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
     31   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
     32   %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
     33   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
     34   %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
     35   %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
     36   %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
     37   %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
     38   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
     39   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
     40   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
     41   ret void
     42 }
     43 
     44 ; %val1 = load <4 x i8>
     45 ; %op1 = zext<4 x i32> %val1
     46 ; %val2 = load <4 x i8>
     47 ; %op2 = zext<4 x i32> %val2
     48 ; %rst = mul <4 x i32> %op1, %op2
     49 ;
     50 define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Same zext-i8 pattern as mul_2xi8, but with 4 lanes: a dword (movd) load per
; operand feeds the widen / pmullw / zero-interleave sequence, and the full
; <4 x i32> result is stored with one movdqu.
     51 ; CHECK-LABEL: mul_4xi8:
     52 ; CHECK:       # BB#0: # %entry
     53 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
     54 ; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
     55 ; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
     56 ; CHECK-NEXT:    pxor %xmm2, %xmm2
     57 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
     58 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
     59 ; CHECK-NEXT:    pmullw %xmm0, %xmm1
     60 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
     61 ; CHECK-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
     62 ; CHECK-NEXT:    retq
     63 entry:
     64   %pre = load i32*, i32** @c
     65   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
     66   %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
     67   %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
     68   %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
     69   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
     70   %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
     71   %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
     72   %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
     73   %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
     74   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
     75   %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
     76   store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
     77   ret void
     78 }
     79 
     80 ; %val1 = load <8 x i8>
     81 ; %op1 = zext<8 x i32> %val1
     82 ; %val2 = load <8 x i8>
     83 ; %op2 = zext<8 x i32> %val2
     84 ; %rst = mul <8 x i32> %op1, %op2
     85 ;
     86 define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; 8 lanes of zext i8: one pmullw computes all eight 16-bit products, then
; punpcklwd/punpckhwd against zero split them into two <4 x i32> halves that
; are stored as two 16-byte movdqu stores (high half first).
     87 ; CHECK-LABEL: mul_8xi8:
     88 ; CHECK:       # BB#0: # %entry
     89 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
     90 ; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
     91 ; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
     92 ; CHECK-NEXT:    pxor %xmm2, %xmm2
     93 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
     94 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
     95 ; CHECK-NEXT:    pmullw %xmm0, %xmm1
     96 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
     97 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
     98 ; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
     99 ; CHECK-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
    100 ; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
    101 ; CHECK-NEXT:    retq
    102 entry:
    103   %pre = load i32*, i32** @c
    104   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    105   %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
    106   %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
    107   %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
    108   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    109   %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
    110   %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
    111   %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
    112   %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
    113   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    114   %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
    115   store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
    116   ret void
    117 }
    118 
    119 ; %val1 = load <16 x i8>
    120 ; %op1 = zext<16 x i32> %val1
    121 ; %val2 = load <16 x i8>
    122 ; %op2 = zext<16 x i32> %val2
    123 ; %rst = mul <16 x i32> %op1, %op2
    124 ;
    125 define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; 16 lanes of zext i8: the low byte-halves (punpcklbw) and high byte-halves
; (punpckhbw) each get their own pmullw, and each 8x16-bit product vector is
; split with punpcklwd/punpckhwd into <4 x i32> quarters — four movdqu stores.
    126 ; CHECK-LABEL: mul_16xi8:
    127 ; CHECK:       # BB#0: # %entry
    128 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    129 ; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
    130 ; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm1
    131 ; CHECK-NEXT:    pxor %xmm2, %xmm2
    132 ; CHECK-NEXT:    movdqa %xmm0, %xmm3
    133 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
    134 ; CHECK-NEXT:    movdqa %xmm1, %xmm4
    135 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
    136 ; CHECK-NEXT:    pmullw %xmm3, %xmm4
    137 ; CHECK-NEXT:    movdqa %xmm4, %xmm3
    138 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
    139 ; CHECK-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
    140 ; CHECK-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
    141 ; CHECK-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
    142 ; CHECK-NEXT:    pmullw %xmm0, %xmm1
    143 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
    144 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    145 ; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    146 ; CHECK-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
    147 ; CHECK-NEXT:    movdqu %xmm0, 32(%rax,%rdx,4)
    148 ; CHECK-NEXT:    movdqu %xmm4, 16(%rax,%rdx,4)
    149 ; CHECK-NEXT:    movdqu %xmm3, (%rax,%rdx,4)
    150 ; CHECK-NEXT:    retq
    151 entry:
    152   %pre = load i32*, i32** @c
    153   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    154   %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
    155   %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
    156   %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
    157   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    158   %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
    159   %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
    160   %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
    161   %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
    162   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    163   %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
    164   store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
    165   ret void
    166 }
    167 
    168 ; %val1 = load <2 x i16>
    169 ; %op1 = zext<2 x i32> %val1
    170 ; %val2 = load <2 x i16>
    171 ; %op2 = zext<2 x i32> %val2
    172 ; %rst = mul <2 x i32> %op1, %op2
    173 ;
    174 define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; zext-i16 products need all 32 bits, so the codegen pairs pmullw (low 16
; bits) with pmulhuw (unsigned high 16 bits) and interleaves the two halves
; with punpcklwd to form the i32 results.
    175 ; CHECK-LABEL: mul_2xi16:
    176 ; CHECK:       # BB#0: # %entry
    177 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    178 ; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    179 ; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
    180 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
    181 ; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
    182 ; CHECK-NEXT:    pmullw %xmm0, %xmm1
    183 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
    184 ; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
    185 ; CHECK-NEXT:    retq
    186 entry:
    187   %pre = load i32*, i32** @c
    188   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    189   %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
    190   %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
    191   %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
    192   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    193   %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
    194   %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
    195   %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
    196   %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
    197   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    198   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
    199   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
    200   ret void
    201 }
    202 
    203 ; %val1 = load <4 x i16>
    204 ; %op1 = zext<4 x i32> %val1
    205 ; %val2 = load <4 x i16>
    206 ; %op2 = zext<4 x i32> %val2
    207 ; %rst = mul <4 x i32> %op1, %op2
    208 ;
    209 define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Same pmullw+pmulhuw low/high pairing as mul_2xi16, with qword (movq) loads
; for the four i16 lanes and a single movdqu store of the <4 x i32> result.
    210 ; CHECK-LABEL: mul_4xi16:
    211 ; CHECK:       # BB#0: # %entry
    212 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    213 ; CHECK-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
    214 ; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
    215 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
    216 ; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
    217 ; CHECK-NEXT:    pmullw %xmm0, %xmm1
    218 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
    219 ; CHECK-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
    220 ; CHECK-NEXT:    retq
    221 entry:
    222   %pre = load i32*, i32** @c
    223   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    224   %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
    225   %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
    226   %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
    227   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    228   %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
    229   %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
    230   %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
    231   %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
    232   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    233   %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
    234   store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
    235   ret void
    236 }
    237 
    238 ; %val1 = load <8 x i16>
    239 ; %op1 = zext<8 x i32> %val1
    240 ; %val2 = load <8 x i16>
    241 ; %op2 = zext<8 x i32> %val2
    242 ; %rst = mul <8 x i32> %op1, %op2
    243 ;
    244 define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Eight zext-i16 lanes: pmullw+pmulhuw give low/high product halves, and
; punpcklwd/punpckhwd interleave them into two <4 x i32> vectors stored with
; two movdqu stores.
    245 ; CHECK-LABEL: mul_8xi16:
    246 ; CHECK:       # BB#0: # %entry
    247 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    248 ; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
    249 ; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm1
    250 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
    251 ; CHECK-NEXT:    pmulhuw %xmm0, %xmm2
    252 ; CHECK-NEXT:    pmullw %xmm0, %xmm1
    253 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
    254 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    255 ; CHECK-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    256 ; CHECK-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
    257 ; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
    258 ; CHECK-NEXT:    retq
    259 entry:
    260   %pre = load i32*, i32** @c
    261   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    262   %tmp7 = bitcast i8* %tmp6 to <8 x i16>*
    263   %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1
    264   %tmp8 = zext <8 x i16> %wide.load to <8 x i32>
    265   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    266   %tmp11 = bitcast i8* %tmp10 to <8 x i16>*
    267   %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1
    268   %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32>
    269   %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
    270   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    271   %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
    272   store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
    273   ret void
    274 }
    275 
    276 ; %val1 = load <16 x i16>
    277 ; %op1 = zext<16 x i32> %val1
    278 ; %val2 = load <16 x i16>
    279 ; %op2 = zext<16 x i32> %val2
    280 ; %rst = mul <16 x i32> %op1, %op2
    281 ;
    282 define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Sixteen zext-i16 lanes processed as two 8-lane halves: each half runs the
; pmullw+pmulhuw / punpcklwd+punpckhwd pattern of mul_8xi16, producing four
; <4 x i32> vectors stored with four movdqu stores.
    283 ; CHECK-LABEL: mul_16xi16:
    284 ; CHECK:       # BB#0: # %entry
    285 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    286 ; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
    287 ; CHECK-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
    288 ; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm2
    289 ; CHECK-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
    290 ; CHECK-NEXT:    movdqa %xmm2, %xmm4
    291 ; CHECK-NEXT:    pmulhuw %xmm0, %xmm4
    292 ; CHECK-NEXT:    pmullw %xmm0, %xmm2
    293 ; CHECK-NEXT:    movdqa %xmm2, %xmm0
    294 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
    295 ; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
    296 ; CHECK-NEXT:    movdqa %xmm3, %xmm4
    297 ; CHECK-NEXT:    pmulhuw %xmm1, %xmm4
    298 ; CHECK-NEXT:    pmullw %xmm1, %xmm3
    299 ; CHECK-NEXT:    movdqa %xmm3, %xmm1
    300 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
    301 ; CHECK-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
    302 ; CHECK-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
    303 ; CHECK-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
    304 ; CHECK-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
    305 ; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
    306 ; CHECK-NEXT:    retq
    307 entry:
    308   %pre = load i32*, i32** @c
    309   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    310   %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
    311   %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
    312   %tmp8 = zext <16 x i16> %wide.load to <16 x i32>
    313   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    314   %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
    315   %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
    316   %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32>
    317   %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
    318   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    319   %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
    320   store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
    321   ret void
    322 }
    323 
    324 ; %val1 = load <2 x i8>
    325 ; %op1 = sext<2 x i32> %val1
    326 ; %val2 = load <2 x i8>
    327 ; %op2 = sext<2 x i32> %val2
    328 ; %rst = mul <2 x i32> %op1, %op2
    329 ;
    330 define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Both operands sign-extended from i8: punpcklbw-with-self plus psraw $8
; produces the sext i16 lanes; the 16-bit products are then placed in the
; high word of each dword (punpcklwd) and psrad $16 sign-extends them to i32.
    331 ; CHECK-LABEL: mul_2xi8_sext:
    332 ; CHECK:       # BB#0: # %entry
    333 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    334 ; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
    335 ; CHECK-NEXT:    movd %ecx, %xmm0
    336 ; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
    337 ; CHECK-NEXT:    movd %ecx, %xmm1
    338 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    339 ; CHECK-NEXT:    psraw $8, %xmm0
    340 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    341 ; CHECK-NEXT:    psraw $8, %xmm1
    342 ; CHECK-NEXT:    pmullw %xmm0, %xmm1
    343 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    344 ; CHECK-NEXT:    psrad $16, %xmm0
    345 ; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
    346 ; CHECK-NEXT:    retq
    347 entry:
    348   %pre = load i32*, i32** @c
    349   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    350   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
    351   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
    352   %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
    353   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    354   %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
    355   %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
    356   %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32>
    357   %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
    358   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    359   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
    360   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
    361   ret void
    362 }
    363 
    364 ; %val1 = load <2 x i8>
    365 ; %op1 = sext<2 x i32> %val1
    366 ; %val2 = load <2 x i8>
    367 ; %op2 = zext<2 x i32> %val2
    368 ; %rst = mul <2 x i32> %op1, %op2
    369 ;
    370 define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Mixed sext x zext i8: the zext operand is non-negative as an i16, so a
; signed widening multiply (pmullw low half + pmulhw high half) yields the
; exact 32-bit products, interleaved with punpcklwd.
    371 ; CHECK-LABEL: mul_2xi8_sext_zext:
    372 ; CHECK:       # BB#0: # %entry
    373 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    374 ; CHECK-NEXT:    movzwl (%rdi,%rdx), %ecx
    375 ; CHECK-NEXT:    movd %ecx, %xmm0
    376 ; CHECK-NEXT:    movzwl (%rsi,%rdx), %ecx
    377 ; CHECK-NEXT:    movd %ecx, %xmm1
    378 ; CHECK-NEXT:    pxor %xmm2, %xmm2
    379 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
    380 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    381 ; CHECK-NEXT:    psraw $8, %xmm0
    382 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
    383 ; CHECK-NEXT:    pmulhw %xmm0, %xmm2
    384 ; CHECK-NEXT:    pmullw %xmm1, %xmm0
    385 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    386 ; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
    387 ; CHECK-NEXT:    retq
    388 entry:
    389   %pre = load i32*, i32** @c
    390   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    391   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
    392   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
    393   %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
    394   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    395   %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
    396   %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
    397   %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
    398   %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
    399   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    400   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
    401   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
    402   ret void
    403 }
    404 
    405 ; %val1 = load <2 x i16>
    406 ; %op1 = sext<2 x i32> %val1
    407 ; %val2 = load <2 x i16>
    408 ; %op2 = sext<2 x i32> %val2
    409 ; %rst = mul <2 x i32> %op1, %op2
    410 ;
    411 define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Both operands sign-extended from i16: the signed pmullw+pmulhw pair gives
; the full 32-bit products directly, interleaved with punpcklwd (the signed
; twin of the pmulhuw pattern in mul_2xi16).
    412 ; CHECK-LABEL: mul_2xi16_sext:
    413 ; CHECK:       # BB#0: # %entry
    414 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    415 ; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    416 ; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
    417 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
    418 ; CHECK-NEXT:    pmulhw %xmm0, %xmm2
    419 ; CHECK-NEXT:    pmullw %xmm0, %xmm1
    420 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
    421 ; CHECK-NEXT:    movq %xmm1, (%rax,%rdx,4)
    422 ; CHECK-NEXT:    retq
    423 entry:
    424   %pre = load i32*, i32** @c
    425   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    426   %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
    427   %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
    428   %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
    429   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    430   %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
    431   %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
    432   %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32>
    433   %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
    434   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    435   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
    436   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
    437   ret void
    438 }
    439 
    440 ; %val1 = load <2 x i16>
    441 ; %op1 = sext<2 x i32> %val1
    442 ; %val2 = load <2 x i16>
    443 ; %op2 = zext<2 x i32> %val2
    444 ; %rst = mul <2 x i32> %op1, %op2
    445 ;
    446 define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Mixed sext x zext i16: neither pmulhw nor pmulhuw matches the mixed
; signedness here, so the codegen widens both operands to 64-bit lanes and
; synthesizes the multiply from pmuludq plus psrlq/psllq/paddq partial
; products, then compacts back to 32-bit lanes with pshufd.
    447 ; CHECK-LABEL: mul_2xi16_sext_zext:
    448 ; CHECK:       # BB#0: # %entry
    449 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    450 ; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
    451 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
    452 ; CHECK-NEXT:    psrad $16, %xmm0
    453 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
    454 ; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
    455 ; CHECK-NEXT:    pxor %xmm2, %xmm2
    456 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
    457 ; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
    458 ; CHECK-NEXT:    movdqa %xmm1, %xmm2
    459 ; CHECK-NEXT:    pmuludq %xmm0, %xmm2
    460 ; CHECK-NEXT:    movdqa %xmm0, %xmm3
    461 ; CHECK-NEXT:    psrlq $32, %xmm3
    462 ; CHECK-NEXT:    pmuludq %xmm1, %xmm3
    463 ; CHECK-NEXT:    psllq $32, %xmm3
    464 ; CHECK-NEXT:    paddq %xmm2, %xmm3
    465 ; CHECK-NEXT:    psrlq $32, %xmm1
    466 ; CHECK-NEXT:    pmuludq %xmm0, %xmm1
    467 ; CHECK-NEXT:    psllq $32, %xmm1
    468 ; CHECK-NEXT:    paddq %xmm3, %xmm1
    469 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
    470 ; CHECK-NEXT:    movq %xmm0, (%rax,%rdx,4)
    471 ; CHECK-NEXT:    retq
    472 entry:
    473   %pre = load i32*, i32** @c
    474   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    475   %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
    476   %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
    477   %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
    478   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    479   %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
    480   %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
    481   %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
    482   %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
    483   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    484   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
    485   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
    486   ret void
    487 }
    488 
    489 ; %val1 = load <16 x i16>
    490 ; %op1 = sext<16 x i32> %val1
    491 ; %val2 = load <16 x i16>
    492 ; %op2 = sext<16 x i32> %val2
    493 ; %rst = mul <16 x i32> %op1, %op2
    494 ;
    495 define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; Sixteen sext-i16 lanes: identical structure to mul_16xi16 but using the
; signed high-half multiply (pmulhw) instead of pmulhuw for each 8-lane half;
; four <4 x i32> results are stored with four movdqu stores.
    496 ; CHECK-LABEL: mul_16xi16_sext:
    497 ; CHECK:       # BB#0: # %entry
    498 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    499 ; CHECK-NEXT:    movdqu (%rdi,%rdx), %xmm0
    500 ; CHECK-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
    501 ; CHECK-NEXT:    movdqu (%rsi,%rdx), %xmm2
    502 ; CHECK-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
    503 ; CHECK-NEXT:    movdqa %xmm2, %xmm4
    504 ; CHECK-NEXT:    pmulhw %xmm0, %xmm4
    505 ; CHECK-NEXT:    pmullw %xmm0, %xmm2
    506 ; CHECK-NEXT:    movdqa %xmm2, %xmm0
    507 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
    508 ; CHECK-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
    509 ; CHECK-NEXT:    movdqa %xmm3, %xmm4
    510 ; CHECK-NEXT:    pmulhw %xmm1, %xmm4
    511 ; CHECK-NEXT:    pmullw %xmm1, %xmm3
    512 ; CHECK-NEXT:    movdqa %xmm3, %xmm1
    513 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
    514 ; CHECK-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
    515 ; CHECK-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
    516 ; CHECK-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
    517 ; CHECK-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
    518 ; CHECK-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
    519 ; CHECK-NEXT:    retq
    520 entry:
    521   %pre = load i32*, i32** @c
    522   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    523   %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
    524   %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
    525   %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
    526   %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
    527   %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
    528   %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
    529   %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
    530   %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
    531   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    532   %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
    533   store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
    534   ret void
    535 }
    536 
    537 ; %val = load <2 x i8>
    538 ; %op1 = zext<2 x i32> %val
    539 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
    540 ; %rst = mul <2 x i32> %op1, %op2
    541 ;
    542 define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; zext i8 times constants that stay in the unsigned 8-bit range (0, 255):
; products fit in 16 bits, so a single pmullw against a constant-pool vector
; plus a zero interleave is enough — no high-half multiply is emitted.
    543 ; CHECK-LABEL: mul_2xi8_varconst1:
    544 ; CHECK:       # BB#0: # %entry
    545 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    546 ; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
    547 ; CHECK-NEXT:    movd %ecx, %xmm0
    548 ; CHECK-NEXT:    pxor %xmm1, %xmm1
    549 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    550 ; CHECK-NEXT:    pmullw {{.*}}(%rip), %xmm0
    551 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
    552 ; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
    553 ; CHECK-NEXT:    retq
    554 entry:
    555   %pre = load i32*, i32** @c
    556   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    557   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
    558   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
    559   %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
    560   %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
    561   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    562   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
    563   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
    564   ret void
    565 }
    566 
    567 ; %val = load <2 x i8>
    568 ; %op1 = sext<2 x i32> %val
    569 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
    570 ; %rst = mul <2 x i32> %op1, %op2
    571 ;
    572 define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; sext i8 times constants within the signed 8-bit range (-128, 127): the
; 16-bit signed products fit, so pmullw against the constant pool is enough;
; punpcklwd-with-self plus psrad $16 sign-extends the results to i32.
    573 ; CHECK-LABEL: mul_2xi8_varconst2:
    574 ; CHECK:       # BB#0: # %entry
    575 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    576 ; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
    577 ; CHECK-NEXT:    movd %ecx, %xmm0
    578 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
    579 ; CHECK-NEXT:    psraw $8, %xmm0
    580 ; CHECK-NEXT:    pmullw {{.*}}(%rip), %xmm0
    581 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
    582 ; CHECK-NEXT:    psrad $16, %xmm0
    583 ; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
    584 ; CHECK-NEXT:    retq
    585 entry:
    586   %pre = load i32*, i32** @c
    587   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    588   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
    589   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
    590   %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
    591   %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
    592   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    593   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
    594   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
    595   ret void
    596 }
    597 
    598 ; %val = load <2 x i8>
    599 ; %op1 = zext<2 x i32> %val
    600 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
    601 ; %rst = mul <2 x i32> %op1, %op2
    602 ;
    603 define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; zext i8 times a constant (256) that exceeds the unsigned 8-bit range:
; the product no longer fits in 16 bits, so both pmullw and pmulhw are
; emitted against a materialized <0,256,...> vector and interleaved.
    604 ; CHECK-LABEL: mul_2xi8_varconst3:
    605 ; CHECK:       # BB#0: # %entry
    606 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
    607 ; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
    608 ; CHECK-NEXT:    movd %ecx, %xmm0
    609 ; CHECK-NEXT:    pxor %xmm1, %xmm1
    610 ; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
    611 ; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
    612 ; CHECK-NEXT:    movdqa %xmm0, %xmm2
    613 ; CHECK-NEXT:    pmulhw %xmm1, %xmm2
    614 ; CHECK-NEXT:    pmullw %xmm1, %xmm0
    615 ; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
    616 ; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
    617 ; CHECK-NEXT:    retq
    618 entry:
    619   %pre = load i32*, i32** @c
    620   %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
    621   %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
    622   %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
    623   %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
    624   %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
    625   %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
    626   %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
    627   store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
    628   ret void
    629 }
    630 
; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within [-1, 255]
; %rst = mul <2 x i32> %op1, %op2
;
; -1 does not fit in an unsigned 8-bit lane, so the constants are kept in
; 16-bit lanes (-1 becomes 65535 in the CHECK constant pool below) and the
; zero-extended words are multiplied with a pmullw/pmulhw pair.
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst4:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
    663 
; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within [-129, 127]
; %rst = mul <2 x i32> %op1, %op2
;
; -129 is below the signed 8-bit minimum but fits in a signed 16-bit lane
; (it appears as 65407 in the CHECK constant below), so the bytes are
; sign-extended to words (punpcklbw + psraw $8) and multiplied with a
; signed pmullw/pmulhw pair.
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst5:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
    696 
; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within [-128, 128]
; %rst = mul <2 x i32> %op1, %op2
;
; 128 is above the signed 8-bit maximum but fits in a signed 16-bit lane
; (the CHECK constant below is <65408,128>, i.e. {-128, 128} as words), so
; the lowering sign-extends bytes to words (punpcklbw + psraw $8) and uses
; a signed pmullw/pmulhw pair.
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi8_varconst6:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movzwl (%rdi,%rsi), %ecx
; CHECK-NEXT:    movd %ecx, %xmm0
; CHECK-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-NEXT:    psraw $8, %xmm0
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
    729 
; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within [0, 65535]
; %rst = mul <2 x i32> %op1, %op2
;
; Both constants fit in an unsigned 16-bit lane, so per the CHECK lines the
; loaded words are multiplied directly with an unsigned pmullw/pmulhuw pair
; and the 32-bit products are recombined with punpcklwd.
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst1:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhuw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
    759 
; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within [-32768, 32767]
; %rst = mul <2 x i32> %op1, %op2
;
; Both constants fit in a signed 16-bit lane (-32768 appears as 32768 in the
; CHECK constant pool below), so the loaded words are multiplied directly
; with a signed pmullw/pmulhw pair.
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst2:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmulhw %xmm1, %xmm2
; CHECK-NEXT:    pmullw %xmm1, %xmm0
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
    789 
; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within [0, 65536]
; %rst = mul <2 x i32> %op1, %op2
;
; 65536 does not fit in any 16-bit lane, so the word-multiply shortcut is
; not possible: per the CHECK lines the values are widened and multiplied
; with pmuludq, combining low/high 32-bit partial products via
; psrlq/psllq/paddq before narrowing back with pshufd.
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst3:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movl $65536, %ecx # imm = 0x10000
; CHECK-NEXT:    movd %rcx, %xmm1
; CHECK-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmuludq %xmm1, %xmm2
; CHECK-NEXT:    psrlq $32, %xmm0
; CHECK-NEXT:    pmuludq %xmm1, %xmm0
; CHECK-NEXT:    psllq $32, %xmm0
; CHECK-NEXT:    paddq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536> 
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
    827 
; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within [0, 32768]
; %rst = mul <2 x i32> %op1, %op2
;
; 32768 is above the signed 16-bit maximum, so no signed word multiply is
; possible: per the CHECK lines the input is sign-extended to 32-bit
; (punpcklwd + psrad $16) and multiplied with pmuludq, recombining the
; partial products via psrlq/psllq/paddq before narrowing with pshufd.
define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
; CHECK-LABEL: mul_2xi16_varconst4:
; CHECK:       # BB#0: # %entry
; CHECK-NEXT:    movq {{.*}}(%rip), %rax
; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; CHECK-NEXT:    psrad $16, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; CHECK-NEXT:    movl $32768, %ecx # imm = 0x8000
; CHECK-NEXT:    movd %rcx, %xmm1
; CHECK-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; CHECK-NEXT:    movdqa %xmm0, %xmm2
; CHECK-NEXT:    pmuludq %xmm1, %xmm2
; CHECK-NEXT:    psrlq $32, %xmm0
; CHECK-NEXT:    pmuludq %xmm1, %xmm0
; CHECK-NEXT:    psllq $32, %xmm0
; CHECK-NEXT:    paddq %xmm2, %xmm0
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; CHECK-NEXT:    movq %xmm0, (%rax,%rsi,4)
; CHECK-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}
    865