; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-SSE
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86 --check-prefix=X86-AVX --check-prefix=X86-AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX --check-prefix=X64-AVX2

; NOTE(review): Tests x86/x86-64 lowering of multiplies whose operands are
; zero-extended narrow vector loads (the "shrink vector mul" patterns), for
; SSE2, AVX1 and AVX2. The CHECK lines below are autogenerated; do not edit
; them by hand — regenerate with utils/update_llc_test_checks.py.

; Results are stored through this external i32* at (%pre + %index).
@c = external global i32*, align 8

; %val1 = load <2 x i8>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i8>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_2xi8:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movzwl (%edx,%ecx), %edx
; X86-SSE-NEXT:    movd %edx, %xmm0
; X86-SSE-NEXT:    movzwl (%eax,%ecx), %eax
; X86-SSE-NEXT:    movd %eax, %xmm1
; X86-SSE-NEXT:    pxor %xmm2, %xmm2
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX-NEXT:    .cfi_offset %esi, -8
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    movl c, %esi
; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rdx), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    movzwl (%rsi,%rdx), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm1
; X64-SSE-NEXT:    pxor %xmm2, %xmm2
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i8>*
  %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1
  %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i8>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i8>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_4xi8:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X86-SSE-NEXT:    pmaddwd %xmm0, %xmm2
; X86-SSE-NEXT:    movdqu %xmm2, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_4xi8:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX-NEXT:    .cfi_offset %esi, -8
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    movl c, %esi
; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_4xi8:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; X64-SSE-NEXT:    pmaddwd %xmm0, %xmm2
; X64-SSE-NEXT:    movdqu %xmm2, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_4xi8:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX-NEXT:    vpmaddwd %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i8>*
  %wide.load = load <4 x i8>, <4 x i8>* %tmp7, align 1
  %tmp8 = zext <4 x i8> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i8>*
  %wide.load17 = load <4 x i8>, <4 x i8>* %tmp11, align 1
  %tmp12 = zext <4 x i8> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i8>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i8>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_8xi8:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    pxor %xmm2, %xmm2
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: mul_8xi8:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    pushl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX1-NEXT:    .cfi_offset %esi, -8
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT:    movl c, %esi
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    vmovups %ymm0, (%esi,%ecx,4)
; X86-AVX1-NEXT:    popl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: mul_8xi8:
; X86-AVX2:       # %bb.0: # %entry
; X86-AVX2-NEXT:    pushl %esi
; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX2-NEXT:    .cfi_offset %esi, -8
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT:    movl c, %esi
; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT:    vmovdqu %ymm0, (%esi,%ecx,4)
; X86-AVX2-NEXT:    popl %esi
; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X64-SSE-LABEL: mul_8xi8:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE-NEXT:    pxor %xmm2, %xmm2
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT:    movdqu %xmm1, 16(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: mul_8xi8:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm2, %xmm0
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm2, %xmm1
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovups %ymm0, (%rax,%rdx,4)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: mul_8xi8:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT:    vmovdqu %ymm0, (%rax,%rdx,4)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <8 x i8>*
  %wide.load = load <8 x i8>, <8 x i8>* %tmp7, align 1
  %tmp8 = zext <8 x i8> %wide.load to <8 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <8 x i8>*
  %wide.load17 = load <8 x i8>, <8 x i8>* %tmp11, align 1
  %tmp12 = zext <8 x i8> %wide.load17 to <8 x i32>
  %tmp13 = mul nuw nsw <8 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <8 x i32>*
  store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <16 x i8>
; %op1 = zext<16 x i32> %val1
; %val2 = load <16 x i8>
; %op2 = zext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_16xi8:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
; X86-SSE-NEXT:    pxor %xmm2, %xmm2
; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X86-SSE-NEXT:    movdqa %xmm1, %xmm4
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X86-SSE-NEXT:    pmullw %xmm3, %xmm4
; X86-SSE-NEXT:    movdqa %xmm4, %xmm3
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT:    movdqu %xmm1, 48(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm0, 32(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm4, 16(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm3, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: mul_16xi8:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    pushl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX1-NEXT:    .cfi_offset %esi, -8
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT:    movl c, %esi
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X86-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
; X86-AVX1-NEXT:    popl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: mul_16xi8:
; X86-AVX2:       # %bb.0: # %entry
; X86-AVX2-NEXT:    pushl %esi
; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX2-NEXT:    .cfi_offset %esi, -8
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT:    movl c, %esi
; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
; X86-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X86-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
; X86-AVX2-NEXT:    popl %esi
; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X64-SSE-LABEL: mul_16xi8:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm1
; X64-SSE-NEXT:    pxor %xmm2, %xmm2
; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X64-SSE-NEXT:    movdqa %xmm1, %xmm4
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X64-SSE-NEXT:    pmullw %xmm3, %xmm4
; X64-SSE-NEXT:    movdqa %xmm4, %xmm3
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; X64-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
; X64-SSE-NEXT:    movdqa %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X64-SSE-NEXT:    movdqu %xmm1, 48(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm0, 32(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm4, 16(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm3, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: mul_16xi8:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmaddwd %xmm0, %xmm4, %xmm0
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmaddwd %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
; X64-AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovups %ymm0, 32(%rax,%rdx,4)
; X64-AVX1-NEXT:    vmovups %ymm2, (%rax,%rdx,4)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: mul_16xi8:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT:    vpmaddwd %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
; X64-AVX2-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  %wide.load = load <16 x i8>, <16 x i8>* %tmp7, align 1
  %tmp8 = zext <16 x i8> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i8>*
  %wide.load17 = load <16 x i8>, <16 x i8>* %tmp11, align 1
  %tmp12 = zext <16 x i8> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <2 x i16>
; %op1 = zext<2 x i32> %val1
; %val2 = load <2 x i16>
; %op2 = zext<2 x i32> %val2
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_2xi16:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi16:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX-NEXT:    .cfi_offset %esi, -8
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    movl c, %esi
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi16:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi16:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <4 x i16>
; %op1 = zext<4 x i32> %val1
; %val2 = load <4 x i16>
; %op2 = zext<4 x i32> %val2
; %rst = mul <4 x i32> %op1, %op2
;
define void @mul_4xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_4xi16:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X86-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT:    movdqu %xmm1, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_4xi16:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX-NEXT:    .cfi_offset %esi, -8
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    movl c, %esi
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vmovdqu %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_4xi16:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
; X64-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X64-SSE-NEXT:    pmullw %xmm0, %xmm1
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT:    movdqu %xmm1, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_4xi16:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vmovdqu %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <4 x i16>*
  %wide.load = load <4 x i16>, <4 x i16>* %tmp7, align 1
  %tmp8 = zext <4 x i16> %wide.load to <4 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <4 x i16>*
  %wide.load17 = load <4 x i16>, <4 x i16>* %tmp11, align 1
  %tmp12 = zext <4 x i16> %wide.load17 to <4 x i32>
  %tmp13 = mul nuw nsw <4 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <4 x i32>*
  store <4 x i32> %tmp13, <4 x i32>* %tmp15, align 4
  ret void
}

; %val1 = load <8 x i16>
; %op1 = zext<8 x i32> %val1
; %val2 = load <8 x i16>
; %op2 = zext<8 x i32> %val2
; %rst = mul <8 x i32> %op1, %op2
;
define void @mul_8xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_8xi16:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm2
; X86-SSE-NEXT:    pmullw %xmm0, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; X86-SSE-NEXT:    movdqu %xmm1, 16(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: mul_8xi16:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    pushl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX1-NEXT:    .cfi_offset %esi, -8
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT:    movl c, %esi
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; X86-AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 =
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 690 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 691 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 692 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 693 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 694 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 695 ; X86-AVX1-NEXT: vmovups %ymm0, (%esi,%ecx,4) 696 ; X86-AVX1-NEXT: popl %esi 697 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 698 ; X86-AVX1-NEXT: vzeroupper 699 ; X86-AVX1-NEXT: retl 700 ; 701 ; X86-AVX2-LABEL: mul_8xi16: 702 ; X86-AVX2: # %bb.0: # %entry 703 ; X86-AVX2-NEXT: pushl %esi 704 ; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 705 ; X86-AVX2-NEXT: .cfi_offset %esi, -8 706 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax 707 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 708 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 709 ; X86-AVX2-NEXT: movl c, %esi 710 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 711 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 712 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 713 ; X86-AVX2-NEXT: vmovdqu %ymm0, (%esi,%ecx,4) 714 ; X86-AVX2-NEXT: popl %esi 715 ; X86-AVX2-NEXT: .cfi_def_cfa_offset 4 716 ; X86-AVX2-NEXT: vzeroupper 717 ; X86-AVX2-NEXT: retl 718 ; 719 ; X64-SSE-LABEL: mul_8xi16: 720 ; X64-SSE: # %bb.0: # %entry 721 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 722 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 723 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm1 724 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2 725 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm2 726 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1 727 ; X64-SSE-NEXT: movdqa %xmm1, %xmm0 728 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 729 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = 
xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 730 ; X64-SSE-NEXT: movdqu %xmm1, 16(%rax,%rdx,4) 731 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) 732 ; X64-SSE-NEXT: retq 733 ; 734 ; X64-AVX1-LABEL: mul_8xi16: 735 ; X64-AVX1: # %bb.0: # %entry 736 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax 737 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 738 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 739 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 740 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm2, %xmm0 741 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 742 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm2, %xmm1 743 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 744 ; X64-AVX1-NEXT: vmovups %ymm0, (%rax,%rdx,4) 745 ; X64-AVX1-NEXT: vzeroupper 746 ; X64-AVX1-NEXT: retq 747 ; 748 ; X64-AVX2-LABEL: mul_8xi16: 749 ; X64-AVX2: # %bb.0: # %entry 750 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax 751 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 752 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 753 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm1, %ymm0 754 ; X64-AVX2-NEXT: vmovdqu %ymm0, (%rax,%rdx,4) 755 ; X64-AVX2-NEXT: vzeroupper 756 ; X64-AVX2-NEXT: retq 757 entry: 758 %pre = load i32*, i32** @c 759 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 760 %tmp7 = bitcast i8* %tmp6 to <8 x i16>* 761 %wide.load = load <8 x i16>, <8 x i16>* %tmp7, align 1 762 %tmp8 = zext <8 x i16> %wide.load to <8 x i32> 763 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 764 %tmp11 = bitcast i8* %tmp10 to <8 x i16>* 765 %wide.load17 = load <8 x i16>, <8 x i16>* %tmp11, align 1 766 %tmp12 = zext <8 x i16> %wide.load17 to <8 x i32> 767 %tmp13 = 
mul nuw nsw <8 x i32> %tmp12, %tmp8 768 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 769 %tmp15 = bitcast i32* %tmp14 to <8 x i32>* 770 store <8 x i32> %tmp13, <8 x i32>* %tmp15, align 4 771 ret void 772 } 773 774 ; %val1 = load <16 x i16> 775 ; %op1 = zext<16 x i32> %val1 776 ; %val2 = load <16 x i16> 777 ; %op2 = zext<16 x i32> %val2 778 ; %rst = mul <16 x i32> %op1, %op2 779 ; 780 define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { 781 ; X86-SSE-LABEL: mul_16xi16: 782 ; X86-SSE: # %bb.0: # %entry 783 ; X86-SSE-NEXT: pushl %esi 784 ; X86-SSE-NEXT: .cfi_def_cfa_offset 8 785 ; X86-SSE-NEXT: .cfi_offset %esi, -8 786 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 787 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 788 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 789 ; X86-SSE-NEXT: movl c, %esi 790 ; X86-SSE-NEXT: movdqu (%edx,%ecx), %xmm0 791 ; X86-SSE-NEXT: movdqu 16(%edx,%ecx), %xmm1 792 ; X86-SSE-NEXT: movdqu (%eax,%ecx), %xmm2 793 ; X86-SSE-NEXT: movdqu 16(%eax,%ecx), %xmm3 794 ; X86-SSE-NEXT: movdqa %xmm2, %xmm4 795 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 796 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2 797 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0 798 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 799 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 800 ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 801 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 802 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3 803 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1 804 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] 805 ; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 806 ; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) 807 ; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) 808 ; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) 809 ; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) 
810 ; X86-SSE-NEXT: popl %esi 811 ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 812 ; X86-SSE-NEXT: retl 813 ; 814 ; X86-AVX1-LABEL: mul_16xi16: 815 ; X86-AVX1: # %bb.0: # %entry 816 ; X86-AVX1-NEXT: pushl %esi 817 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 818 ; X86-AVX1-NEXT: .cfi_offset %esi, -8 819 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax 820 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx 821 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx 822 ; X86-AVX1-NEXT: movl c, %esi 823 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 824 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 825 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 826 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 827 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 828 ; X86-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 829 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 830 ; X86-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 831 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 832 ; X86-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 833 ; X86-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 834 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 835 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 836 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 837 ; X86-AVX1-NEXT: vmovups %ymm0, 32(%esi,%ecx,4) 838 ; X86-AVX1-NEXT: vmovups %ymm2, (%esi,%ecx,4) 839 ; X86-AVX1-NEXT: popl %esi 840 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 841 ; X86-AVX1-NEXT: vzeroupper 842 ; X86-AVX1-NEXT: retl 843 ; 844 ; X86-AVX2-LABEL: mul_16xi16: 845 ; X86-AVX2: # %bb.0: # %entry 846 ; X86-AVX2-NEXT: pushl %esi 847 ; X86-AVX2-NEXT: .cfi_def_cfa_offset 8 848 ; X86-AVX2-NEXT: .cfi_offset %esi, -8 849 ; X86-AVX2-NEXT: movl 
{{[0-9]+}}(%esp), %eax 850 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx 851 ; X86-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx 852 ; X86-AVX2-NEXT: movl c, %esi 853 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 854 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 855 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 856 ; X86-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 857 ; X86-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 858 ; X86-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 859 ; X86-AVX2-NEXT: vmovdqu %ymm0, 32(%esi,%ecx,4) 860 ; X86-AVX2-NEXT: vmovdqu %ymm1, (%esi,%ecx,4) 861 ; X86-AVX2-NEXT: popl %esi 862 ; X86-AVX2-NEXT: .cfi_def_cfa_offset 4 863 ; X86-AVX2-NEXT: vzeroupper 864 ; X86-AVX2-NEXT: retl 865 ; 866 ; X64-SSE-LABEL: mul_16xi16: 867 ; X64-SSE: # %bb.0: # %entry 868 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 869 ; X64-SSE-NEXT: movdqu (%rdi,%rdx), %xmm0 870 ; X64-SSE-NEXT: movdqu 16(%rdi,%rdx), %xmm1 871 ; X64-SSE-NEXT: movdqu (%rsi,%rdx), %xmm2 872 ; X64-SSE-NEXT: movdqu 16(%rsi,%rdx), %xmm3 873 ; X64-SSE-NEXT: movdqa %xmm2, %xmm4 874 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 875 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 876 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 877 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 878 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] 879 ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 880 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 881 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 882 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 883 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] 884 ; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] 885 ; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) 886 ; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) 887 ; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) 888 ; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) 889 ; X64-SSE-NEXT: retq 890 ; 891 ; X64-AVX1-LABEL: mul_16xi16: 892 ; X64-AVX1: # %bb.0: # %entry 893 ; X64-AVX1-NEXT: movq {{.*}}(%rip), %rax 894 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 895 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 896 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 897 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 898 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 899 ; X64-AVX1-NEXT: vpmulld %xmm0, %xmm4, %xmm0 900 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 901 ; X64-AVX1-NEXT: vpmulld %xmm1, %xmm4, %xmm1 902 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 903 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2 904 ; X64-AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero 905 ; X64-AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm3 906 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 907 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 908 ; X64-AVX1-NEXT: vmovups %ymm0, 32(%rax,%rdx,4) 909 ; X64-AVX1-NEXT: vmovups %ymm2, (%rax,%rdx,4) 910 ; X64-AVX1-NEXT: vzeroupper 911 ; X64-AVX1-NEXT: retq 912 ; 913 ; X64-AVX2-LABEL: mul_16xi16: 914 ; X64-AVX2: # %bb.0: # %entry 915 ; X64-AVX2-NEXT: movq {{.*}}(%rip), %rax 916 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 917 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 918 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 919 ; X64-AVX2-NEXT: vpmulld %ymm0, %ymm2, %ymm0 920 ; X64-AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero 921 ; X64-AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1 922 ; X64-AVX2-NEXT: vmovdqu %ymm0, 32(%rax,%rdx,4) 923 ; X64-AVX2-NEXT: vmovdqu %ymm1, (%rax,%rdx,4) 924 ; X64-AVX2-NEXT: vzeroupper 925 ; X64-AVX2-NEXT: retq 926 entry: 927 %pre = load i32*, i32** @c 928 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 929 %tmp7 = bitcast i8* %tmp6 to <16 x i16>* 930 %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1 931 %tmp8 = zext <16 x i16> %wide.load to <16 x i32> 932 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 933 %tmp11 = bitcast i8* %tmp10 to <16 x i16>* 934 %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1 935 %tmp12 = zext <16 x i16> %wide.load17 to <16 x i32> 936 %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8 937 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 938 %tmp15 = bitcast i32* %tmp14 to <16 x i32>* 939 store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4 940 ret void 941 } 942 943 ; %val1 = load <2 x i8> 944 ; %op1 = sext<2 x i32> %val1 945 ; %val2 = load <2 x i8> 946 ; %op2 = sext<2 x i32> %val2 947 ; %rst = mul <2 x i32> %op1, %op2 948 ; 949 define void @mul_2xi8_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { 950 ; X86-SSE-LABEL: mul_2xi8_sext: 951 ; X86-SSE: # %bb.0: # %entry 952 ; X86-SSE-NEXT: pushl %esi 953 ; X86-SSE-NEXT: .cfi_def_cfa_offset 8 954 ; X86-SSE-NEXT: .cfi_offset %esi, -8 955 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), 
%eax 956 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 957 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 958 ; X86-SSE-NEXT: movl c, %esi 959 ; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx 960 ; X86-SSE-NEXT: movd %edx, %xmm0 961 ; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax 962 ; X86-SSE-NEXT: movd %eax, %xmm1 963 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 964 ; X86-SSE-NEXT: psraw $8, %xmm0 965 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 966 ; X86-SSE-NEXT: psraw $8, %xmm1 967 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 968 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 969 ; X86-SSE-NEXT: psrad $16, %xmm0 970 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) 971 ; X86-SSE-NEXT: popl %esi 972 ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 973 ; X86-SSE-NEXT: retl 974 ; 975 ; X86-AVX-LABEL: mul_2xi8_sext: 976 ; X86-AVX: # %bb.0: # %entry 977 ; X86-AVX-NEXT: pushl %esi 978 ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 979 ; X86-AVX-NEXT: .cfi_offset %esi, -8 980 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 981 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 982 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 983 ; X86-AVX-NEXT: movl c, %esi 984 ; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 985 ; X86-AVX-NEXT: vpmovsxbq (%eax,%ecx), %xmm1 986 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 987 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 988 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 989 ; X86-AVX-NEXT: popl %esi 990 ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 991 ; X86-AVX-NEXT: retl 992 ; 993 ; X64-SSE-LABEL: mul_2xi8_sext: 994 ; X64-SSE: # %bb.0: # %entry 995 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 996 ; X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx 997 ; X64-SSE-NEXT: movd %ecx, %xmm0 998 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx 999 ; X64-SSE-NEXT: movd %ecx, %xmm1 1000 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1001 ; X64-SSE-NEXT: psraw $8, %xmm0 1002 ; 
X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1003 ; X64-SSE-NEXT: psraw $8, %xmm1 1004 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1 1005 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 1006 ; X64-SSE-NEXT: psrad $16, %xmm0 1007 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) 1008 ; X64-SSE-NEXT: retq 1009 ; 1010 ; X64-AVX-LABEL: mul_2xi8_sext: 1011 ; X64-AVX: # %bb.0: # %entry 1012 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1013 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 1014 ; X64-AVX-NEXT: vpmovsxbq (%rsi,%rdx), %xmm1 1015 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1016 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1017 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 1018 ; X64-AVX-NEXT: retq 1019 entry: 1020 %pre = load i32*, i32** @c 1021 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1022 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 1023 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 1024 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 1025 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 1026 %tmp11 = bitcast i8* %tmp10 to <2 x i8>* 1027 %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 1028 %tmp12 = sext <2 x i8> %wide.load17 to <2 x i32> 1029 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 1030 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1031 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1032 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1033 ret void 1034 } 1035 1036 ; %val1 = load <2 x i8> 1037 ; %op1 = sext<2 x i32> %val1 1038 ; %val2 = load <2 x i8> 1039 ; %op2 = zext<2 x i32> %val2 1040 ; %rst = mul <2 x i32> %op1, %op2 1041 ; 1042 define void @mul_2xi8_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { 1043 ; X86-SSE-LABEL: mul_2xi8_sext_zext: 1044 ; X86-SSE: # %bb.0: # %entry 1045 ; X86-SSE-NEXT: pushl %esi 1046 ; X86-SSE-NEXT: .cfi_def_cfa_offset 8 1047 ; X86-SSE-NEXT: .cfi_offset %esi, -8 1048 ; X86-SSE-NEXT: movl 
{{[0-9]+}}(%esp), %eax 1049 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1050 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 1051 ; X86-SSE-NEXT: movl c, %esi 1052 ; X86-SSE-NEXT: movzwl (%edx,%ecx), %edx 1053 ; X86-SSE-NEXT: movd %edx, %xmm0 1054 ; X86-SSE-NEXT: movzwl (%eax,%ecx), %eax 1055 ; X86-SSE-NEXT: movd %eax, %xmm1 1056 ; X86-SSE-NEXT: pxor %xmm2, %xmm2 1057 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1058 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1059 ; X86-SSE-NEXT: psraw $8, %xmm0 1060 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2 1061 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 1062 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 1063 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1064 ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) 1065 ; X86-SSE-NEXT: popl %esi 1066 ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 1067 ; X86-SSE-NEXT: retl 1068 ; 1069 ; X86-AVX-LABEL: mul_2xi8_sext_zext: 1070 ; X86-AVX: # %bb.0: # %entry 1071 ; X86-AVX-NEXT: pushl %esi 1072 ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 1073 ; X86-AVX-NEXT: .cfi_offset %esi, -8 1074 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1075 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1076 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 1077 ; X86-AVX-NEXT: movl c, %esi 1078 ; X86-AVX-NEXT: vpmovsxbq (%edx,%ecx), %xmm0 1079 ; X86-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 1080 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1081 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1082 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 1083 ; X86-AVX-NEXT: popl %esi 1084 ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 1085 ; X86-AVX-NEXT: retl 1086 ; 1087 ; X64-SSE-LABEL: mul_2xi8_sext_zext: 1088 ; X64-SSE: # %bb.0: # %entry 1089 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1090 ; 
X64-SSE-NEXT: movzwl (%rdi,%rdx), %ecx 1091 ; X64-SSE-NEXT: movd %ecx, %xmm0 1092 ; X64-SSE-NEXT: movzwl (%rsi,%rdx), %ecx 1093 ; X64-SSE-NEXT: movd %ecx, %xmm1 1094 ; X64-SSE-NEXT: pxor %xmm2, %xmm2 1095 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] 1096 ; X64-SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 1097 ; X64-SSE-NEXT: psraw $8, %xmm0 1098 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2 1099 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 1100 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0 1101 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 1102 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) 1103 ; X64-SSE-NEXT: retq 1104 ; 1105 ; X64-AVX-LABEL: mul_2xi8_sext_zext: 1106 ; X64-AVX: # %bb.0: # %entry 1107 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1108 ; X64-AVX-NEXT: vpmovsxbq (%rdi,%rdx), %xmm0 1109 ; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero 1110 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1111 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1112 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 1113 ; X64-AVX-NEXT: retq 1114 entry: 1115 %pre = load i32*, i32** @c 1116 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1117 %tmp7 = bitcast i8* %tmp6 to <2 x i8>* 1118 %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1 1119 %tmp8 = sext <2 x i8> %wide.load to <2 x i32> 1120 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 1121 %tmp11 = bitcast i8* %tmp10 to <2 x i8>* 1122 %wide.load17 = load <2 x i8>, <2 x i8>* %tmp11, align 1 1123 %tmp12 = zext <2 x i8> %wide.load17 to <2 x i32> 1124 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 1125 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1126 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1127 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1128 ret 
void 1129 } 1130 1131 ; %val1 = load <2 x i16> 1132 ; %op1 = sext<2 x i32> %val1 1133 ; %val2 = load <2 x i16> 1134 ; %op2 = sext<2 x i32> %val2 1135 ; %rst = mul <2 x i32> %op1, %op2 1136 ; 1137 define void @mul_2xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { 1138 ; X86-SSE-LABEL: mul_2xi16_sext: 1139 ; X86-SSE: # %bb.0: # %entry 1140 ; X86-SSE-NEXT: pushl %esi 1141 ; X86-SSE-NEXT: .cfi_def_cfa_offset 8 1142 ; X86-SSE-NEXT: .cfi_offset %esi, -8 1143 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 1144 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 1145 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %edx 1146 ; X86-SSE-NEXT: movl c, %esi 1147 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1148 ; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1149 ; X86-SSE-NEXT: movdqa %xmm1, %xmm2 1150 ; X86-SSE-NEXT: pmulhw %xmm0, %xmm2 1151 ; X86-SSE-NEXT: pmullw %xmm0, %xmm1 1152 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1153 ; X86-SSE-NEXT: movq %xmm1, (%esi,%ecx,4) 1154 ; X86-SSE-NEXT: popl %esi 1155 ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 1156 ; X86-SSE-NEXT: retl 1157 ; 1158 ; X86-AVX-LABEL: mul_2xi16_sext: 1159 ; X86-AVX: # %bb.0: # %entry 1160 ; X86-AVX-NEXT: pushl %esi 1161 ; X86-AVX-NEXT: .cfi_def_cfa_offset 8 1162 ; X86-AVX-NEXT: .cfi_offset %esi, -8 1163 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 1164 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 1165 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx 1166 ; X86-AVX-NEXT: movl c, %esi 1167 ; X86-AVX-NEXT: vpmovsxwq (%edx,%ecx), %xmm0 1168 ; X86-AVX-NEXT: vpmovsxwq (%eax,%ecx), %xmm1 1169 ; X86-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1170 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1171 ; X86-AVX-NEXT: vmovq %xmm0, (%esi,%ecx,4) 1172 ; X86-AVX-NEXT: popl %esi 1173 ; X86-AVX-NEXT: .cfi_def_cfa_offset 4 1174 ; X86-AVX-NEXT: retl 1175 ; 1176 ; X64-SSE-LABEL: mul_2xi16_sext: 1177 ; X64-SSE: # %bb.0: # %entry 1178 ; 
X64-SSE-NEXT: movq {{.*}}(%rip), %rax 1179 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 1180 ; X64-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1181 ; X64-SSE-NEXT: movdqa %xmm1, %xmm2 1182 ; X64-SSE-NEXT: pmulhw %xmm0, %xmm2 1183 ; X64-SSE-NEXT: pmullw %xmm0, %xmm1 1184 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] 1185 ; X64-SSE-NEXT: movq %xmm1, (%rax,%rdx,4) 1186 ; X64-SSE-NEXT: retq 1187 ; 1188 ; X64-AVX-LABEL: mul_2xi16_sext: 1189 ; X64-AVX: # %bb.0: # %entry 1190 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 1191 ; X64-AVX-NEXT: vpmovsxwq (%rdi,%rdx), %xmm0 1192 ; X64-AVX-NEXT: vpmovsxwq (%rsi,%rdx), %xmm1 1193 ; X64-AVX-NEXT: vpmulld %xmm0, %xmm1, %xmm0 1194 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1195 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rdx,4) 1196 ; X64-AVX-NEXT: retq 1197 entry: 1198 %pre = load i32*, i32** @c 1199 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 1200 %tmp7 = bitcast i8* %tmp6 to <2 x i16>* 1201 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 1202 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> 1203 %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index 1204 %tmp11 = bitcast i8* %tmp10 to <2 x i16>* 1205 %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1 1206 %tmp12 = sext <2 x i16> %wide.load17 to <2 x i32> 1207 %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8 1208 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 1209 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 1210 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 1211 ret void 1212 } 1213 1214 ; %val1 = load <2 x i16> 1215 ; %op1 = sext<2 x i32> %val1 1216 ; %val2 = load <2 x i16> 1217 ; %op2 = zext<2 x i32> %val2 1218 ; %rst = mul <2 x i32> %op1, %op2 1219 ; 1220 define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) { 1221 ; X86-SSE-LABEL: mul_2xi16_sext_zext: 1222 ; X86-SSE: # %bb.0: # %entry 1223 ; X86-SSE-NEXT: pushl %esi 
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X86-SSE-NEXT:    psrad $16, %xmm0
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pxor %xmm2, %xmm2
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    psrlq $32, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm0, %xmm2
; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
; X86-SSE-NEXT:    psrlq $32, %xmm3
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm3
; X86-SSE-NEXT:    paddq %xmm2, %xmm3
; X86-SSE-NEXT:    psllq $32, %xmm3
; X86-SSE-NEXT:    pmuludq %xmm0, %xmm1
; X86-SSE-NEXT:    paddq %xmm3, %xmm1
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X86-SSE-NEXT:    movq %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi16_sext_zext:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    pushl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX-NEXT:    .cfi_offset %esi, -8
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX-NEXT:    movl c, %esi
; X86-AVX-NEXT:    vpmovsxwq (%edx,%ecx), %xmm0
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X86-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%esi,%ecx,4)
; X86-AVX-NEXT:    popl %esi
; X86-AVX-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi16_sext_zext:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
; X64-SSE-NEXT:    psrad $16, %xmm0
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pxor %xmm2, %xmm2
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3]
; X64-SSE-NEXT:    movdqa %xmm1, %xmm2
; X64-SSE-NEXT:    psrlq $32, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm0, %xmm2
; X64-SSE-NEXT:    movdqa %xmm0, %xmm3
; X64-SSE-NEXT:    psrlq $32, %xmm3
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm3
; X64-SSE-NEXT:    paddq %xmm2, %xmm3
; X64-SSE-NEXT:    psllq $32, %xmm3
; X64-SSE-NEXT:    pmuludq %xmm0, %xmm1
; X64-SSE-NEXT:    paddq %xmm3, %xmm1
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi16_sext_zext:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovsxwq (%rdi,%rdx), %xmm0
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X64-AVX-NEXT:    vpmulld %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rdx,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <2 x i16>*
  %wide.load17 = load <2 x i16>, <2 x i16>* %tmp11, align 1
  %tmp12 = zext <2 x i16> %wide.load17 to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; NOTE(review): The CHECK lines in this file are autogenerated by
; utils/update_llc_test_checks.py (see the note at the top of the file).
; Do not edit them by hand; rerun the script after any IR or llc change.

; %val1 = load <16 x i16>
; %op1 = sext<16 x i32> %val1
; %val2 = load <16 x i16>
; %op2 = sext<16 x i32> %val2
; %rst = mul <16 x i32> %op1, %op2
;
define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 %index) {
; X86-SSE-LABEL: mul_16xi16_sext:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    pushl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 8
; X86-SSE-NEXT:    .cfi_offset %esi, -8
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-SSE-NEXT:    movl c, %esi
; X86-SSE-NEXT:    movdqu (%edx,%ecx), %xmm0
; X86-SSE-NEXT:    movdqu 16(%edx,%ecx), %xmm1
; X86-SSE-NEXT:    movdqu (%eax,%ecx), %xmm2
; X86-SSE-NEXT:    movdqu 16(%eax,%ecx), %xmm3
; X86-SSE-NEXT:    movdqa %xmm2, %xmm4
; X86-SSE-NEXT:    pmulhw %xmm0, %xmm4
; X86-SSE-NEXT:    pmullw %xmm0, %xmm2
; X86-SSE-NEXT:    movdqa %xmm2, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
; X86-SSE-NEXT:    pmulhw %xmm1, %xmm4
; X86-SSE-NEXT:    pmullw %xmm1, %xmm3
; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; X86-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X86-SSE-NEXT:    movdqu %xmm3, 48(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm1, 32(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm2, 16(%esi,%ecx,4)
; X86-SSE-NEXT:    movdqu %xmm0, (%esi,%ecx,4)
; X86-SSE-NEXT:    popl %esi
; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
; X86-SSE-NEXT:    retl
;
; X86-AVX1-LABEL: mul_16xi16_sext:
; X86-AVX1:       # %bb.0: # %entry
; X86-AVX1-NEXT:    pushl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX1-NEXT:    .cfi_offset %esi, -8
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX1-NEXT:    movl c, %esi
; X86-AVX1-NEXT:    vpmovsxwd 16(%edx,%ecx), %xmm0
; X86-AVX1-NEXT:    vpmovsxwd 24(%edx,%ecx), %xmm1
; X86-AVX1-NEXT:    vpmovsxwd (%edx,%ecx), %xmm2
; X86-AVX1-NEXT:    vpmovsxwd 8(%edx,%ecx), %xmm3
; X86-AVX1-NEXT:    vpmovsxwd 16(%eax,%ecx), %xmm4
; X86-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
; X86-AVX1-NEXT:    vpmovsxwd 24(%eax,%ecx), %xmm4
; X86-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
; X86-AVX1-NEXT:    vpmovsxwd (%eax,%ecx), %xmm4
; X86-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT:    vpmovsxwd 8(%eax,%ecx), %xmm4
; X86-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X86-AVX1-NEXT:    vmovups %ymm0, 32(%esi,%ecx,4)
; X86-AVX1-NEXT:    vmovups %ymm2, (%esi,%ecx,4)
; X86-AVX1-NEXT:    popl %esi
; X86-AVX1-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX1-NEXT:    vzeroupper
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: mul_16xi16_sext:
; X86-AVX2:       # %bb.0: # %entry
; X86-AVX2-NEXT:    pushl %esi
; X86-AVX2-NEXT:    .cfi_def_cfa_offset 8
; X86-AVX2-NEXT:    .cfi_offset %esi, -8
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX2-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-AVX2-NEXT:    movl c, %esi
; X86-AVX2-NEXT:    vpmovsxwd 16(%edx,%ecx), %ymm0
; X86-AVX2-NEXT:    vpmovsxwd (%edx,%ecx), %ymm1
; X86-AVX2-NEXT:    vpmovsxwd 16(%eax,%ecx), %ymm2
; X86-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
; X86-AVX2-NEXT:    vpmovsxwd (%eax,%ecx), %ymm2
; X86-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
; X86-AVX2-NEXT:    vmovdqu %ymm0, 32(%esi,%ecx,4)
; X86-AVX2-NEXT:    vmovdqu %ymm1, (%esi,%ecx,4)
; X86-AVX2-NEXT:    popl %esi
; X86-AVX2-NEXT:    .cfi_def_cfa_offset 4
; X86-AVX2-NEXT:    vzeroupper
; X86-AVX2-NEXT:    retl
;
; X64-SSE-LABEL: mul_16xi16_sext:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movdqu (%rdi,%rdx), %xmm0
; X64-SSE-NEXT:    movdqu 16(%rdi,%rdx), %xmm1
; X64-SSE-NEXT:    movdqu (%rsi,%rdx), %xmm2
; X64-SSE-NEXT:    movdqu 16(%rsi,%rdx), %xmm3
; X64-SSE-NEXT:    movdqa %xmm2, %xmm4
; X64-SSE-NEXT:    pmulhw %xmm0, %xmm4
; X64-SSE-NEXT:    pmullw %xmm0, %xmm2
; X64-SSE-NEXT:    movdqa %xmm2, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X64-SSE-NEXT:    movdqa %xmm3, %xmm4
; X64-SSE-NEXT:    pmulhw %xmm1, %xmm4
; X64-SSE-NEXT:    pmullw %xmm1, %xmm3
; X64-SSE-NEXT:    movdqa %xmm3, %xmm1
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
; X64-SSE-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X64-SSE-NEXT:    movdqu %xmm3, 48(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm1, 32(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm2, 16(%rax,%rdx,4)
; X64-SSE-NEXT:    movdqu %xmm0, (%rax,%rdx,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX1-LABEL: mul_16xi16_sext:
; X64-AVX1:       # %bb.0: # %entry
; X64-AVX1-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX1-NEXT:    vpmovsxwd 16(%rdi,%rdx), %xmm0
; X64-AVX1-NEXT:    vpmovsxwd 24(%rdi,%rdx), %xmm1
; X64-AVX1-NEXT:    vpmovsxwd (%rdi,%rdx), %xmm2
; X64-AVX1-NEXT:    vpmovsxwd 8(%rdi,%rdx), %xmm3
; X64-AVX1-NEXT:    vpmovsxwd 16(%rsi,%rdx), %xmm4
; X64-AVX1-NEXT:    vpmulld %xmm0, %xmm4, %xmm0
; X64-AVX1-NEXT:    vpmovsxwd 24(%rsi,%rdx), %xmm4
; X64-AVX1-NEXT:    vpmulld %xmm1, %xmm4, %xmm1
; X64-AVX1-NEXT:    vpmovsxwd (%rsi,%rdx), %xmm4
; X64-AVX1-NEXT:    vpmulld %xmm2, %xmm4, %xmm2
; X64-AVX1-NEXT:    vpmovsxwd 8(%rsi,%rdx), %xmm4
; X64-AVX1-NEXT:    vpmulld %xmm3, %xmm4, %xmm3
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
; X64-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; X64-AVX1-NEXT:    vmovups %ymm0, 32(%rax,%rdx,4)
; X64-AVX1-NEXT:    vmovups %ymm2, (%rax,%rdx,4)
; X64-AVX1-NEXT:    vzeroupper
; X64-AVX1-NEXT:    retq
;
; X64-AVX2-LABEL: mul_16xi16_sext:
; X64-AVX2:       # %bb.0: # %entry
; X64-AVX2-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX2-NEXT:    vpmovsxwd 16(%rdi,%rdx), %ymm0
; X64-AVX2-NEXT:    vpmovsxwd (%rdi,%rdx), %ymm1
; X64-AVX2-NEXT:    vpmovsxwd 16(%rsi,%rdx), %ymm2
; X64-AVX2-NEXT:    vpmulld %ymm0, %ymm2, %ymm0
; X64-AVX2-NEXT:    vpmovsxwd (%rsi,%rdx), %ymm2
; X64-AVX2-NEXT:    vpmulld %ymm1, %ymm2, %ymm1
; X64-AVX2-NEXT:    vmovdqu %ymm0, 32(%rax,%rdx,4)
; X64-AVX2-NEXT:    vmovdqu %ymm1, (%rax,%rdx,4)
; X64-AVX2-NEXT:    vzeroupper
; X64-AVX2-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i16>*
  %wide.load = load <16 x i16>, <16 x i16>* %tmp7, align 1
  %tmp8 = sext <16 x i16> %wide.load to <16 x i32>
  %tmp10 = getelementptr inbounds i8, i8* %b, i64 %index
  %tmp11 = bitcast i8* %tmp10 to <16 x i16>*
  %wide.load17 = load <16 x i16>, <16 x i16>* %tmp11, align 1
  %tmp12 = sext <16 x i16> %wide.load17 to <16 x i32>
  %tmp13 = mul nuw nsw <16 x i32> %tmp12, %tmp8
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <16 x i32>*
  store <16 x i32> %tmp13, <16 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst1(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst1:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT:    movd %ecx, %xmm0
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8_varconst1:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8_varconst1:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8_varconst1:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT:    movl $255, %ecx
; X64-AVX-NEXT:    vmovq %rcx, %xmm1
; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst2(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst2:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT:    movd %ecx, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    psraw $8, %xmm0
; X86-SSE-NEXT:    pmullw {{\.LCPI.*}}, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X86-SSE-NEXT:    psrad $16, %xmm0
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8_varconst2:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovsxbq (%ecx,%eax), %xmm0
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8_varconst2:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT:    psraw $8, %xmm0
; X64-SSE-NEXT:    pmullw {{.*}}(%rip), %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
; X64-SSE-NEXT:    psrad $16, %xmm0
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8_varconst2:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 256)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst3(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst3:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT:    movd %ecx, %xmm0
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8_varconst3:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT:    vpmaddwd {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8_varconst3:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,256,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8_varconst3:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT:    movl $256, %ecx # imm = 0x100
; X64-AVX-NEXT:    vmovq %rcx, %xmm1
; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 256>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-1 ~ 255)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst4(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst4:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT:    movd %ecx, %xmm0
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8_varconst4:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8_varconst4:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65535,255,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8_varconst4:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = zext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -1, i32 255>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-129 ~ 127)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst5(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst5:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT:    movd %ecx, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    psraw $8, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8_varconst5:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovsxbq (%ecx,%eax), %xmm0
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8_varconst5:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT:    psraw $8, %xmm0
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65407,127,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8_varconst5:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -129, i32 127>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i8>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-128 ~ 128)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi8_varconst6(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi8_varconst6:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movzwl (%ecx,%eax), %ecx
; X86-SSE-NEXT:    movd %ecx, %xmm0
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    psraw $8, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi8_varconst6:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovsxbq (%ecx,%eax), %xmm0
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi8_varconst6:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movzwl (%rdi,%rsi), %ecx
; X64-SSE-NEXT:    movd %ecx, %xmm0
; X64-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-SSE-NEXT:    psraw $8, %xmm0
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <65408,128,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi8_varconst6:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovsxbq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i8>*
  %wide.load = load <2 x i8>, <2 x i8>* %tmp7, align 1
  %tmp8 = sext <2 x i8> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -128, i32 128>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65535)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst1(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst1:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhuw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi16_varconst1:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi16_varconst1:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhuw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi16_varconst1:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-AVX-NEXT:    movl $65535, %ecx # imm = 0xFFFF
; X64-AVX-NEXT:    vmovq %rcx, %xmm1
; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = zext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65535>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = sext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (-32768 ~ 32767)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst2(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst2:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X86-SSE-NEXT:    pmullw %xmm1, %xmm0
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi16_varconst2:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vpmovsxwq (%ecx,%eax), %xmm0
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi16_varconst2:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    movdqa {{.*#+}} xmm1 = <32768,32767,u,u,u,u,u,u>
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmulhw %xmm1, %xmm2
; X64-SSE-NEXT:    pmullw %xmm1, %xmm0
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi16_varconst2:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vpmovsxwq (%rdi,%rsi), %xmm0
; X64-AVX-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0, (%rax,%rsi,4)
; X64-AVX-NEXT:    retq
entry:
  %pre = load i32*, i32** @c
  %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index
  %tmp7 = bitcast i8* %tmp6 to <2 x i16>*
  %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1
  %tmp8 = sext <2 x i16> %wide.load to <2 x i32>
  %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 -32768, i32 32767>
  %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index
  %tmp15 = bitcast i32* %tmp14 to <2 x i32>*
  store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4
  ret void
}

; %val = load <2 x i16>
; %op1 = zext<2 x i32> %val
; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 65536)
; %rst = mul <2 x i32> %op1, %op2
;
define void @mul_2xi16_varconst3(i8* nocapture readonly %a, i64 %index) {
; X86-SSE-LABEL: mul_2xi16_varconst3:
; X86-SSE:       # %bb.0: # %entry
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-SSE-NEXT:    movl c, %edx
; X86-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,0,65536,0]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X86-SSE-NEXT:    psrlq $32, %xmm0
; X86-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE-NEXT:    psllq $32, %xmm0
; X86-SSE-NEXT:    paddq %xmm2, %xmm0
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT:    movq %xmm0, (%edx,%eax,4)
; X86-SSE-NEXT:    retl
;
; X86-AVX-LABEL: mul_2xi16_varconst3:
; X86-AVX:       # %bb.0: # %entry
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-AVX-NEXT:    movl c, %edx
; X86-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X86-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X86-AVX-NEXT:    vpmulld {{\.LCPI.*}}, %xmm0, %xmm0
; X86-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-AVX-NEXT:    vmovq %xmm0, (%edx,%eax,4)
; X86-AVX-NEXT:    retl
;
; X64-SSE-LABEL: mul_2xi16_varconst3:
; X64-SSE:       # %bb.0: # %entry
; X64-SSE-NEXT:    movq {{.*}}(%rip), %rax
; X64-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-SSE-NEXT:    pxor %xmm1, %xmm1
; X64-SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
; X64-SSE-NEXT:    movl $65536, %ecx # imm = 0x10000
; X64-SSE-NEXT:    movq %rcx, %xmm1
; X64-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-SSE-NEXT:    movdqa %xmm0, %xmm2
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm2
; X64-SSE-NEXT:    psrlq $32, %xmm0
; X64-SSE-NEXT:    pmuludq %xmm1, %xmm0
; X64-SSE-NEXT:    psllq $32, %xmm0
; X64-SSE-NEXT:    paddq %xmm2, %xmm0
; X64-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-SSE-NEXT:    movq %xmm0, (%rax,%rsi,4)
; X64-SSE-NEXT:    retq
;
; X64-AVX-LABEL: mul_2xi16_varconst3:
; X64-AVX:       # %bb.0: # %entry
; X64-AVX-NEXT:    movq {{.*}}(%rip), %rax
; X64-AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X64-AVX-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; X64-AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; X64-AVX-NEXT:    movl $65536, %ecx # imm = 0x10000
; X64-AVX-NEXT:    vmovq %rcx, %xmm1
; X64-AVX-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
; X64-AVX-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
; X64-AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X64-AVX-NEXT:    vmovq %xmm0,
(%rax,%rsi,4) 2118 ; X64-AVX-NEXT: retq 2119 entry: 2120 %pre = load i32*, i32** @c 2121 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 2122 %tmp7 = bitcast i8* %tmp6 to <2 x i16>* 2123 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 2124 %tmp8 = zext <2 x i16> %wide.load to <2 x i32> 2125 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 65536> 2126 %tmp14 = getelementptr inbounds i32, i32* %pre, i64 %index 2127 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 2128 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 2129 ret void 2130 } 2131 2132 ; %val = load <2 x i16> 2133 ; %op1 = sext<2 x i32> %val 2134 ; %op2 = const <2 x i32> {c1, c2} // c1 and c2 are within (0 ~ 32768) 2135 ; %rst = mul <2 x i32> %op1, %op2 2136 ; 2137 define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) { 2138 ; X86-SSE-LABEL: mul_2xi16_varconst4: 2139 ; X86-SSE: # %bb.0: # %entry 2140 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax 2141 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx 2142 ; X86-SSE-NEXT: movl c, %edx 2143 ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2144 ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 2145 ; X86-SSE-NEXT: psrad $16, %xmm0 2146 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 2147 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] 2148 ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 2149 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 2150 ; X86-SSE-NEXT: psrlq $32, %xmm0 2151 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 2152 ; X86-SSE-NEXT: psllq $32, %xmm0 2153 ; X86-SSE-NEXT: paddq %xmm2, %xmm0 2154 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2155 ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) 2156 ; X86-SSE-NEXT: retl 2157 ; 2158 ; X86-AVX-LABEL: mul_2xi16_varconst4: 2159 ; X86-AVX: # %bb.0: # %entry 2160 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax 2161 ; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx 2162 ; X86-AVX-NEXT: movl c, %edx 2163 ; X86-AVX-NEXT: vpmovsxwq (%ecx,%eax), %xmm0 2164 ; X86-AVX-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, 
%xmm0 2165 ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2166 ; X86-AVX-NEXT: vmovq %xmm0, (%edx,%eax,4) 2167 ; X86-AVX-NEXT: retl 2168 ; 2169 ; X64-SSE-LABEL: mul_2xi16_varconst4: 2170 ; X64-SSE: # %bb.0: # %entry 2171 ; X64-SSE-NEXT: movq {{.*}}(%rip), %rax 2172 ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2173 ; X64-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] 2174 ; X64-SSE-NEXT: psrad $16, %xmm0 2175 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] 2176 ; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 2177 ; X64-SSE-NEXT: movq %rcx, %xmm1 2178 ; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] 2179 ; X64-SSE-NEXT: movdqa %xmm0, %xmm2 2180 ; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 2181 ; X64-SSE-NEXT: psrlq $32, %xmm0 2182 ; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 2183 ; X64-SSE-NEXT: psllq $32, %xmm0 2184 ; X64-SSE-NEXT: paddq %xmm2, %xmm0 2185 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2186 ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) 2187 ; X64-SSE-NEXT: retq 2188 ; 2189 ; X64-AVX-LABEL: mul_2xi16_varconst4: 2190 ; X64-AVX: # %bb.0: # %entry 2191 ; X64-AVX-NEXT: movq {{.*}}(%rip), %rax 2192 ; X64-AVX-NEXT: vpmovsxwq (%rdi,%rsi), %xmm0 2193 ; X64-AVX-NEXT: movl $32768, %ecx # imm = 0x8000 2194 ; X64-AVX-NEXT: vmovq %rcx, %xmm1 2195 ; X64-AVX-NEXT: vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] 2196 ; X64-AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0 2197 ; X64-AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2198 ; X64-AVX-NEXT: vmovq %xmm0, (%rax,%rsi,4) 2199 ; X64-AVX-NEXT: retq 2200 entry: 2201 %pre = load i32*, i32** @c 2202 %tmp6 = getelementptr inbounds i8, i8* %a, i64 %index 2203 %tmp7 = bitcast i8* %tmp6 to <2 x i16>* 2204 %wide.load = load <2 x i16>, <2 x i16>* %tmp7, align 1 2205 %tmp8 = sext <2 x i16> %wide.load to <2 x i32> 2206 %tmp13 = mul nuw nsw <2 x i32> %tmp8, <i32 0, i32 32768> 2207 %tmp14 = getelementptr inbounds i32, i32* 
%pre, i64 %index 2208 %tmp15 = bitcast i32* %tmp14 to <2 x i32>* 2209 store <2 x i32> %tmp13, <2 x i32>* %tmp15, align 4 2210 ret void 2211 } 2212 2213 ; 2214 ; Illegal Types 2215 ; 2216 2217 define void @PR34947() { 2218 ; X86-SSE-LABEL: PR34947: 2219 ; X86-SSE: # %bb.0: 2220 ; X86-SSE-NEXT: movdqa (%eax), %xmm0 2221 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 2222 ; X86-SSE-NEXT: movd %xmm1, %ecx 2223 ; X86-SSE-NEXT: xorl %eax, %eax 2224 ; X86-SSE-NEXT: xorl %edx, %edx 2225 ; X86-SSE-NEXT: divl %ecx 2226 ; X86-SSE-NEXT: movd %edx, %xmm1 2227 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 2228 ; X86-SSE-NEXT: movd %xmm2, %ecx 2229 ; X86-SSE-NEXT: xorl %eax, %eax 2230 ; X86-SSE-NEXT: xorl %edx, %edx 2231 ; X86-SSE-NEXT: divl %ecx 2232 ; X86-SSE-NEXT: movd %edx, %xmm2 2233 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2234 ; X86-SSE-NEXT: movd %xmm0, %ecx 2235 ; X86-SSE-NEXT: xorl %eax, %eax 2236 ; X86-SSE-NEXT: xorl %edx, %edx 2237 ; X86-SSE-NEXT: divl %ecx 2238 ; X86-SSE-NEXT: movd %edx, %xmm1 2239 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 2240 ; X86-SSE-NEXT: movd %xmm0, %ecx 2241 ; X86-SSE-NEXT: xorl %eax, %eax 2242 ; X86-SSE-NEXT: xorl %edx, %edx 2243 ; X86-SSE-NEXT: divl %ecx 2244 ; X86-SSE-NEXT: movd %edx, %xmm0 2245 ; X86-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2246 ; X86-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2247 ; X86-SSE-NEXT: xorl %eax, %eax 2248 ; X86-SSE-NEXT: xorl %edx, %edx 2249 ; X86-SSE-NEXT: divl (%eax) 2250 ; X86-SSE-NEXT: movd %edx, %xmm0 2251 ; X86-SSE-NEXT: pmaddwd {{\.LCPI.*}}, %xmm1 2252 ; X86-SSE-NEXT: movl $8199, %eax # imm = 0x2007 2253 ; X86-SSE-NEXT: movd %eax, %xmm2 2254 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 2255 ; X86-SSE-NEXT: movd %xmm2, (%eax) 2256 ; X86-SSE-NEXT: movdqa %xmm1, (%eax) 2257 ; X86-SSE-NEXT: retl 2258 ; 2259 ; X86-AVX1-LABEL: PR34947: 2260 ; X86-AVX1: # %bb.0: 2261 ; X86-AVX1-NEXT: pushl %ebp 2262 ; X86-AVX1-NEXT: 
.cfi_def_cfa_offset 8 2263 ; X86-AVX1-NEXT: pushl %ebx 2264 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 12 2265 ; X86-AVX1-NEXT: pushl %edi 2266 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 2267 ; X86-AVX1-NEXT: pushl %esi 2268 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 20 2269 ; X86-AVX1-NEXT: subl $16, %esp 2270 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 36 2271 ; X86-AVX1-NEXT: .cfi_offset %esi, -20 2272 ; X86-AVX1-NEXT: .cfi_offset %edi, -16 2273 ; X86-AVX1-NEXT: .cfi_offset %ebx, -12 2274 ; X86-AVX1-NEXT: .cfi_offset %ebp, -8 2275 ; X86-AVX1-NEXT: vmovdqa (%eax), %ymm0 2276 ; X86-AVX1-NEXT: xorl %eax, %eax 2277 ; X86-AVX1-NEXT: xorl %edx, %edx 2278 ; X86-AVX1-NEXT: divl (%eax) 2279 ; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill 2280 ; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx 2281 ; X86-AVX1-NEXT: xorl %eax, %eax 2282 ; X86-AVX1-NEXT: xorl %edx, %edx 2283 ; X86-AVX1-NEXT: divl %ecx 2284 ; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill 2285 ; X86-AVX1-NEXT: vpextrd $2, %xmm0, %ecx 2286 ; X86-AVX1-NEXT: xorl %eax, %eax 2287 ; X86-AVX1-NEXT: xorl %edx, %edx 2288 ; X86-AVX1-NEXT: divl %ecx 2289 ; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill 2290 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %ecx 2291 ; X86-AVX1-NEXT: xorl %eax, %eax 2292 ; X86-AVX1-NEXT: xorl %edx, %edx 2293 ; X86-AVX1-NEXT: divl %ecx 2294 ; X86-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill 2295 ; X86-AVX1-NEXT: vmovd %xmm0, %ecx 2296 ; X86-AVX1-NEXT: xorl %eax, %eax 2297 ; X86-AVX1-NEXT: xorl %edx, %edx 2298 ; X86-AVX1-NEXT: divl %ecx 2299 ; X86-AVX1-NEXT: movl %edx, %ebp 2300 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2301 ; X86-AVX1-NEXT: xorl %eax, %eax 2302 ; X86-AVX1-NEXT: xorl %edx, %edx 2303 ; X86-AVX1-NEXT: vpextrd $3, %xmm0, %ecx 2304 ; X86-AVX1-NEXT: divl %ecx 2305 ; X86-AVX1-NEXT: movl %edx, %ecx 2306 ; X86-AVX1-NEXT: xorl %eax, %eax 2307 ; X86-AVX1-NEXT: xorl %edx, %edx 2308 ; X86-AVX1-NEXT: vpextrd $2, %xmm0, %esi 2309 ; X86-AVX1-NEXT: divl %esi 2310 ; X86-AVX1-NEXT: movl %edx, 
%esi 2311 ; X86-AVX1-NEXT: xorl %eax, %eax 2312 ; X86-AVX1-NEXT: xorl %edx, %edx 2313 ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edi 2314 ; X86-AVX1-NEXT: divl %edi 2315 ; X86-AVX1-NEXT: movl %edx, %edi 2316 ; X86-AVX1-NEXT: xorl %eax, %eax 2317 ; X86-AVX1-NEXT: xorl %edx, %edx 2318 ; X86-AVX1-NEXT: vmovd %xmm0, %ebx 2319 ; X86-AVX1-NEXT: divl %ebx 2320 ; X86-AVX1-NEXT: vmovd %edx, %xmm0 2321 ; X86-AVX1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 2322 ; X86-AVX1-NEXT: vpinsrd $2, %esi, %xmm0, %xmm0 2323 ; X86-AVX1-NEXT: vpinsrd $3, %ecx, %xmm0, %xmm0 2324 ; X86-AVX1-NEXT: vmovd %ebp, %xmm1 2325 ; X86-AVX1-NEXT: vpinsrd $1, (%esp), %xmm1, %xmm1 # 4-byte Folded Reload 2326 ; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload 2327 ; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm1, %xmm1 # 4-byte Folded Reload 2328 ; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm2 # 4-byte Folded Reload 2329 ; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero 2330 ; X86-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 2331 ; X86-AVX1-NEXT: vmovd %eax, %xmm3 2332 ; X86-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8199,8199,8199,8199] 2333 ; X86-AVX1-NEXT: vpmaddwd %xmm4, %xmm0, %xmm0 2334 ; X86-AVX1-NEXT: vpmaddwd %xmm4, %xmm1, %xmm1 2335 ; X86-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2336 ; X86-AVX1-NEXT: vpmulld %xmm3, %xmm2, %xmm1 2337 ; X86-AVX1-NEXT: vmovd %xmm1, (%eax) 2338 ; X86-AVX1-NEXT: vmovaps %ymm0, (%eax) 2339 ; X86-AVX1-NEXT: addl $16, %esp 2340 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 20 2341 ; X86-AVX1-NEXT: popl %esi 2342 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 16 2343 ; X86-AVX1-NEXT: popl %edi 2344 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 12 2345 ; X86-AVX1-NEXT: popl %ebx 2346 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 2347 ; X86-AVX1-NEXT: popl %ebp 2348 ; X86-AVX1-NEXT: .cfi_def_cfa_offset 4 2349 ; X86-AVX1-NEXT: vzeroupper 2350 ; X86-AVX1-NEXT: retl 2351 ; 2352 ; X86-AVX2-LABEL: PR34947: 2353 ; X86-AVX2: # %bb.0: 2354 ; X86-AVX2-NEXT: pushl %esi 2355 ; X86-AVX2-NEXT: 
.cfi_def_cfa_offset 8 2356 ; X86-AVX2-NEXT: .cfi_offset %esi, -8 2357 ; X86-AVX2-NEXT: vmovdqa (%eax), %ymm0 2358 ; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2359 ; X86-AVX2-NEXT: vpextrd $1, %xmm1, %ecx 2360 ; X86-AVX2-NEXT: xorl %eax, %eax 2361 ; X86-AVX2-NEXT: xorl %edx, %edx 2362 ; X86-AVX2-NEXT: divl %ecx 2363 ; X86-AVX2-NEXT: movl %edx, %ecx 2364 ; X86-AVX2-NEXT: vmovd %xmm1, %esi 2365 ; X86-AVX2-NEXT: xorl %eax, %eax 2366 ; X86-AVX2-NEXT: xorl %edx, %edx 2367 ; X86-AVX2-NEXT: divl %esi 2368 ; X86-AVX2-NEXT: vmovd %edx, %xmm2 2369 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 2370 ; X86-AVX2-NEXT: vpextrd $2, %xmm1, %ecx 2371 ; X86-AVX2-NEXT: xorl %eax, %eax 2372 ; X86-AVX2-NEXT: xorl %edx, %edx 2373 ; X86-AVX2-NEXT: divl %ecx 2374 ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 2375 ; X86-AVX2-NEXT: vpextrd $3, %xmm1, %ecx 2376 ; X86-AVX2-NEXT: xorl %eax, %eax 2377 ; X86-AVX2-NEXT: xorl %edx, %edx 2378 ; X86-AVX2-NEXT: divl %ecx 2379 ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1 2380 ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %ecx 2381 ; X86-AVX2-NEXT: xorl %eax, %eax 2382 ; X86-AVX2-NEXT: xorl %edx, %edx 2383 ; X86-AVX2-NEXT: divl %ecx 2384 ; X86-AVX2-NEXT: movl %edx, %ecx 2385 ; X86-AVX2-NEXT: vmovd %xmm0, %esi 2386 ; X86-AVX2-NEXT: xorl %eax, %eax 2387 ; X86-AVX2-NEXT: xorl %edx, %edx 2388 ; X86-AVX2-NEXT: divl %esi 2389 ; X86-AVX2-NEXT: vmovd %edx, %xmm2 2390 ; X86-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 2391 ; X86-AVX2-NEXT: vpextrd $2, %xmm0, %ecx 2392 ; X86-AVX2-NEXT: xorl %eax, %eax 2393 ; X86-AVX2-NEXT: xorl %edx, %edx 2394 ; X86-AVX2-NEXT: divl %ecx 2395 ; X86-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 2396 ; X86-AVX2-NEXT: vpextrd $3, %xmm0, %ecx 2397 ; X86-AVX2-NEXT: xorl %eax, %eax 2398 ; X86-AVX2-NEXT: xorl %edx, %edx 2399 ; X86-AVX2-NEXT: divl %ecx 2400 ; X86-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 2401 ; X86-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2402 ; X86-AVX2-NEXT: xorl %eax, %eax 2403 ; X86-AVX2-NEXT: xorl %edx, %edx 2404 ; 
X86-AVX2-NEXT: divl (%eax) 2405 ; X86-AVX2-NEXT: vmovd %edx, %xmm1 2406 ; X86-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199] 2407 ; X86-AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 2408 ; X86-AVX2-NEXT: movl $8199, %eax # imm = 0x2007 2409 ; X86-AVX2-NEXT: vmovd %eax, %xmm2 2410 ; X86-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 2411 ; X86-AVX2-NEXT: vmovd %xmm1, (%eax) 2412 ; X86-AVX2-NEXT: vmovdqa %ymm0, (%eax) 2413 ; X86-AVX2-NEXT: popl %esi 2414 ; X86-AVX2-NEXT: .cfi_def_cfa_offset 4 2415 ; X86-AVX2-NEXT: vzeroupper 2416 ; X86-AVX2-NEXT: retl 2417 ; 2418 ; X64-SSE-LABEL: PR34947: 2419 ; X64-SSE: # %bb.0: 2420 ; X64-SSE-NEXT: movdqa (%rax), %xmm0 2421 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3] 2422 ; X64-SSE-NEXT: movd %xmm1, %ecx 2423 ; X64-SSE-NEXT: xorl %eax, %eax 2424 ; X64-SSE-NEXT: xorl %edx, %edx 2425 ; X64-SSE-NEXT: divl %ecx 2426 ; X64-SSE-NEXT: movd %edx, %xmm1 2427 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] 2428 ; X64-SSE-NEXT: movd %xmm2, %ecx 2429 ; X64-SSE-NEXT: xorl %eax, %eax 2430 ; X64-SSE-NEXT: xorl %edx, %edx 2431 ; X64-SSE-NEXT: divl %ecx 2432 ; X64-SSE-NEXT: movd %edx, %xmm2 2433 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2434 ; X64-SSE-NEXT: movd %xmm0, %ecx 2435 ; X64-SSE-NEXT: xorl %eax, %eax 2436 ; X64-SSE-NEXT: xorl %edx, %edx 2437 ; X64-SSE-NEXT: divl %ecx 2438 ; X64-SSE-NEXT: movd %edx, %xmm1 2439 ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] 2440 ; X64-SSE-NEXT: movd %xmm0, %ecx 2441 ; X64-SSE-NEXT: xorl %eax, %eax 2442 ; X64-SSE-NEXT: xorl %edx, %edx 2443 ; X64-SSE-NEXT: divl %ecx 2444 ; X64-SSE-NEXT: movd %edx, %xmm0 2445 ; X64-SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2446 ; X64-SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] 2447 ; X64-SSE-NEXT: xorl %eax, %eax 2448 ; X64-SSE-NEXT: xorl %edx, %edx 2449 ; X64-SSE-NEXT: divl (%rax) 2450 ; X64-SSE-NEXT: movd %edx, %xmm0 2451 ; X64-SSE-NEXT: pmaddwd {{.*}}(%rip), %xmm1 
2452 ; X64-SSE-NEXT: movl $8199, %eax # imm = 0x2007 2453 ; X64-SSE-NEXT: movd %eax, %xmm2 2454 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 2455 ; X64-SSE-NEXT: movd %xmm2, (%rax) 2456 ; X64-SSE-NEXT: movdqa %xmm1, (%rax) 2457 ; X64-SSE-NEXT: retq 2458 ; 2459 ; X64-AVX1-LABEL: PR34947: 2460 ; X64-AVX1: # %bb.0: 2461 ; X64-AVX1-NEXT: pushq %rbp 2462 ; X64-AVX1-NEXT: .cfi_def_cfa_offset 16 2463 ; X64-AVX1-NEXT: pushq %rbx 2464 ; X64-AVX1-NEXT: .cfi_def_cfa_offset 24 2465 ; X64-AVX1-NEXT: .cfi_offset %rbx, -24 2466 ; X64-AVX1-NEXT: .cfi_offset %rbp, -16 2467 ; X64-AVX1-NEXT: vmovdqa (%rax), %ymm0 2468 ; X64-AVX1-NEXT: xorl %eax, %eax 2469 ; X64-AVX1-NEXT: xorl %edx, %edx 2470 ; X64-AVX1-NEXT: divl (%rax) 2471 ; X64-AVX1-NEXT: movl %edx, %r8d 2472 ; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx 2473 ; X64-AVX1-NEXT: xorl %eax, %eax 2474 ; X64-AVX1-NEXT: xorl %edx, %edx 2475 ; X64-AVX1-NEXT: divl %ecx 2476 ; X64-AVX1-NEXT: movl %edx, %r9d 2477 ; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx 2478 ; X64-AVX1-NEXT: xorl %eax, %eax 2479 ; X64-AVX1-NEXT: xorl %edx, %edx 2480 ; X64-AVX1-NEXT: divl %ecx 2481 ; X64-AVX1-NEXT: movl %edx, %r10d 2482 ; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ecx 2483 ; X64-AVX1-NEXT: xorl %eax, %eax 2484 ; X64-AVX1-NEXT: xorl %edx, %edx 2485 ; X64-AVX1-NEXT: divl %ecx 2486 ; X64-AVX1-NEXT: movl %edx, %r11d 2487 ; X64-AVX1-NEXT: vmovd %xmm0, %ecx 2488 ; X64-AVX1-NEXT: xorl %eax, %eax 2489 ; X64-AVX1-NEXT: xorl %edx, %edx 2490 ; X64-AVX1-NEXT: divl %ecx 2491 ; X64-AVX1-NEXT: movl %edx, %esi 2492 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 2493 ; X64-AVX1-NEXT: vpextrd $3, %xmm0, %ecx 2494 ; X64-AVX1-NEXT: xorl %eax, %eax 2495 ; X64-AVX1-NEXT: xorl %edx, %edx 2496 ; X64-AVX1-NEXT: divl %ecx 2497 ; X64-AVX1-NEXT: movl %edx, %edi 2498 ; X64-AVX1-NEXT: vpextrd $2, %xmm0, %ecx 2499 ; X64-AVX1-NEXT: xorl %eax, %eax 2500 ; X64-AVX1-NEXT: xorl %edx, %edx 2501 ; X64-AVX1-NEXT: divl %ecx 2502 ; X64-AVX1-NEXT: movl %edx, %ecx 2503 ; X64-AVX1-NEXT: vpextrd $1, %xmm0, %ebx 2504 ; 
X64-AVX1-NEXT: xorl %eax, %eax 2505 ; X64-AVX1-NEXT: xorl %edx, %edx 2506 ; X64-AVX1-NEXT: divl %ebx 2507 ; X64-AVX1-NEXT: movl %edx, %ebx 2508 ; X64-AVX1-NEXT: vmovd %xmm0, %ebp 2509 ; X64-AVX1-NEXT: xorl %eax, %eax 2510 ; X64-AVX1-NEXT: xorl %edx, %edx 2511 ; X64-AVX1-NEXT: divl %ebp 2512 ; X64-AVX1-NEXT: vmovd %edx, %xmm0 2513 ; X64-AVX1-NEXT: vpinsrd $1, %ebx, %xmm0, %xmm0 2514 ; X64-AVX1-NEXT: vpinsrd $2, %ecx, %xmm0, %xmm0 2515 ; X64-AVX1-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 2516 ; X64-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8199,8199,8199,8199] 2517 ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 2518 ; X64-AVX1-NEXT: vmovd %esi, %xmm2 2519 ; X64-AVX1-NEXT: vpinsrd $1, %r11d, %xmm2, %xmm2 2520 ; X64-AVX1-NEXT: vpinsrd $2, %r10d, %xmm2, %xmm2 2521 ; X64-AVX1-NEXT: vpinsrd $3, %r9d, %xmm2, %xmm2 2522 ; X64-AVX1-NEXT: vpmaddwd %xmm1, %xmm2, %xmm1 2523 ; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 2524 ; X64-AVX1-NEXT: vmovd %r8d, %xmm1 2525 ; X64-AVX1-NEXT: movl $8199, %eax # imm = 0x2007 2526 ; X64-AVX1-NEXT: vmovd %eax, %xmm2 2527 ; X64-AVX1-NEXT: vpmulld %xmm2, %xmm1, %xmm1 2528 ; X64-AVX1-NEXT: vmovd %xmm1, (%rax) 2529 ; X64-AVX1-NEXT: vmovaps %ymm0, (%rax) 2530 ; X64-AVX1-NEXT: popq %rbx 2531 ; X64-AVX1-NEXT: .cfi_def_cfa_offset 16 2532 ; X64-AVX1-NEXT: popq %rbp 2533 ; X64-AVX1-NEXT: .cfi_def_cfa_offset 8 2534 ; X64-AVX1-NEXT: vzeroupper 2535 ; X64-AVX1-NEXT: retq 2536 ; 2537 ; X64-AVX2-LABEL: PR34947: 2538 ; X64-AVX2: # %bb.0: 2539 ; X64-AVX2-NEXT: vmovdqa (%rax), %ymm0 2540 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 2541 ; X64-AVX2-NEXT: vpextrd $1, %xmm1, %ecx 2542 ; X64-AVX2-NEXT: xorl %eax, %eax 2543 ; X64-AVX2-NEXT: xorl %edx, %edx 2544 ; X64-AVX2-NEXT: divl %ecx 2545 ; X64-AVX2-NEXT: movl %edx, %ecx 2546 ; X64-AVX2-NEXT: vmovd %xmm1, %esi 2547 ; X64-AVX2-NEXT: xorl %eax, %eax 2548 ; X64-AVX2-NEXT: xorl %edx, %edx 2549 ; X64-AVX2-NEXT: divl %esi 2550 ; X64-AVX2-NEXT: vmovd %edx, %xmm2 2551 ; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 2552 ; 
X64-AVX2-NEXT: vpextrd $2, %xmm1, %ecx 2553 ; X64-AVX2-NEXT: xorl %eax, %eax 2554 ; X64-AVX2-NEXT: xorl %edx, %edx 2555 ; X64-AVX2-NEXT: divl %ecx 2556 ; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 2557 ; X64-AVX2-NEXT: vpextrd $3, %xmm1, %ecx 2558 ; X64-AVX2-NEXT: xorl %eax, %eax 2559 ; X64-AVX2-NEXT: xorl %edx, %edx 2560 ; X64-AVX2-NEXT: divl %ecx 2561 ; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm1 2562 ; X64-AVX2-NEXT: vpextrd $1, %xmm0, %ecx 2563 ; X64-AVX2-NEXT: xorl %eax, %eax 2564 ; X64-AVX2-NEXT: xorl %edx, %edx 2565 ; X64-AVX2-NEXT: divl %ecx 2566 ; X64-AVX2-NEXT: movl %edx, %ecx 2567 ; X64-AVX2-NEXT: vmovd %xmm0, %esi 2568 ; X64-AVX2-NEXT: xorl %eax, %eax 2569 ; X64-AVX2-NEXT: xorl %edx, %edx 2570 ; X64-AVX2-NEXT: divl %esi 2571 ; X64-AVX2-NEXT: vmovd %edx, %xmm2 2572 ; X64-AVX2-NEXT: vpinsrd $1, %ecx, %xmm2, %xmm2 2573 ; X64-AVX2-NEXT: vpextrd $2, %xmm0, %ecx 2574 ; X64-AVX2-NEXT: xorl %eax, %eax 2575 ; X64-AVX2-NEXT: xorl %edx, %edx 2576 ; X64-AVX2-NEXT: divl %ecx 2577 ; X64-AVX2-NEXT: vpinsrd $2, %edx, %xmm2, %xmm2 2578 ; X64-AVX2-NEXT: vpextrd $3, %xmm0, %ecx 2579 ; X64-AVX2-NEXT: xorl %eax, %eax 2580 ; X64-AVX2-NEXT: xorl %edx, %edx 2581 ; X64-AVX2-NEXT: divl %ecx 2582 ; X64-AVX2-NEXT: vpinsrd $3, %edx, %xmm2, %xmm0 2583 ; X64-AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 2584 ; X64-AVX2-NEXT: xorl %eax, %eax 2585 ; X64-AVX2-NEXT: xorl %edx, %edx 2586 ; X64-AVX2-NEXT: divl (%rax) 2587 ; X64-AVX2-NEXT: vmovd %edx, %xmm1 2588 ; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [8199,8199,8199,8199,8199,8199,8199,8199] 2589 ; X64-AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0 2590 ; X64-AVX2-NEXT: movl $8199, %eax # imm = 0x2007 2591 ; X64-AVX2-NEXT: vmovd %eax, %xmm2 2592 ; X64-AVX2-NEXT: vpmulld %ymm2, %ymm1, %ymm1 2593 ; X64-AVX2-NEXT: vmovd %xmm1, (%rax) 2594 ; X64-AVX2-NEXT: vmovdqa %ymm0, (%rax) 2595 ; X64-AVX2-NEXT: vzeroupper 2596 ; X64-AVX2-NEXT: retq 2597 %tmp = load <9 x i32>, <9 x i32>* undef, align 64 2598 %rem = urem <9 x i32> zeroinitializer, %tmp 
2599 %mul = mul <9 x i32> <i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199, i32 8199>, %rem 2600 store <9 x i32> %mul, <9 x i32>* undef, align 64 2601 ret void 2602 } 2603